Mercurial > hg > mpdl-group
view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexFR.lex @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | 4a3641ae14d2 |
children |
line wrap: on
line source
/*
 * Normalization rules for French text
 * [this is a JFlex specification]
 *
 * Wolfgang Schmidle
 * version 2011-07-12
 *
 * Assumptions:
 * - the scanner is called word by word, with a newline at the end of the string
 * - line breaks within the word are marked as hyphen/soft hyphen plus space
 *
 * Each lexical state selects what is returned when the terminating newline
 * is reached:
 *   DISP         normalized text, or the original text if a problem was flagged
 *   DICT, CELEX  normalized text without in-word line breaks, or "" on a problem
 *   SEARCH       like DICT but lower-cased, or the original text on a problem
 */

package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang;

%%

%public
%class MpdlNormalizerLexFR
%type java.lang.String
%unicode

// French: fr

%states DISP, DICT, SEARCH
%state CELEX

%{
  /** Input consumed so far, exactly as it was matched. */
  private final StringBuilder original = new StringBuilder();

  /** Normalized counterpart of the input consumed so far. */
  private final StringBuilder normalized = new StringBuilder();

  /** Becomes true once a rule flags the current input as problematic. */
  private boolean problem = false;

  /** Regex for an in-word line break: hyphen or soft hyphen followed by one space. */
  private static final String LB = "[\u002d\u00ad] ";

  /**
   * Records the current match: the matched text is appended to the original
   * buffer and its replacement to the normalized buffer.
   *
   * @param norm normalized form of the text that was just matched
   */
  private void add(String norm) {
    original.append(yytext());
    normalized.append(norm);
  }

  /** Returns the normalized text with all in-word line break markers removed. */
  private String withoutLineBreaks() {
    return normalized.toString().replaceAll(LB, "");
  }
%}

hyphen = [-\u00ad]   // hyphen-minus and soft hyphen (U+00AD)
LB = {hyphen} \u0020
END = \n

Alphabet = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]

%%

// jump over empty xml elements (copied through unchanged)
"<"[^><]+"/>" { add(yytext()); }
"-<"[^><]+"/>" { add(yytext()); }
"<"[^><]+"></"[^><]+">" { add(yytext()); }
"-<"[^><]+"></"[^><]+">" { add(yytext()); }

<DISP, DICT, SEARCH, CELEX> {

ſ { add("s"); }
ß { add("ss"); }
æ { add("ae"); }

}

<CELEX> {

[œŒ] { add("oe"); }

[áàâ] { add("a"); }
[éèê] { add("e"); }
[íìî] { add("i"); }
[óòô] { add("o"); }
[úùû] { add("u"); }

’ { add(""); }

{Alphabet} { add(yytext()); }
. { problem = true; add(yytext()); }   // in particular "@"

}

// default

@ { problem = true; add(yytext()); }
{LB} { add(yytext()); }
. { add(yytext()); }

<DISP> {

{END} { return problem ? original.toString() : normalized.toString(); }

}

<DICT, CELEX> {

{END} { return problem ? "" : withoutLineBreaks(); }

}

<SEARCH> {

// Locale.ROOT keeps lower-casing independent of the JVM default locale.
{END} { return problem ? original.toString() : withoutLineBreaks().toLowerCase(java.util.Locale.ROOT); }

}

/*
TO DO:
FR: correct? complete?
*/