# HG changeset patch
# User Josef Willenborg language
to
* s
, without offset tracking.
@@ -52,8 +52,11 @@
}
}
if (useNormFunction()) {
- // normalize the string by string replace
- normStr = normalize(normStr, null);
+ // normalize the string by string replacements
+ if (normMode == MODE_4LEXICA)
+ normStr = normalize4Lexica(normStr, null);
+ else if (normMode == MODE_4HUMAN_READERS)
+ normStr = normalize4HumanReaders(normStr);
}
return normStr;
}
@@ -92,7 +95,7 @@
* @param offsets character offset table
* @return normalized string
*/
- public String normalize(String s, int[] offsets) {
+ private String normalize4Lexica(String s, int[] offsets) {
this.offsets = offsets;
if (language.equals("la") || language.equals("lat")) {
StringBuffer buf = new StringBuffer();
@@ -479,9 +482,11 @@
case '\u00e4': replace = "ae"; break;
case '\u00f6': replace = "oe"; break;
case '\u00fc': replace = "ue"; break;
+ case '\u00ad': break; // soft hyphen
case '\u00e9': replace = "e"; break;
- case '\u00ad': break; // soft hyphen
- case '-': break;
+ // new in MPDL project by J. Willenborg
+ case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S
+ // case '-': break;
default: replace += c; break;
}
buf.append(replace);
@@ -1007,16 +1012,126 @@
return s;
}
}
-
- /**
- * Returns the offset table.
- *
- * @return offset table
- */
- public int[] getOffsetTable() {
- return offsets;
+
+ /**
+  * Normalizes a string for human readers with the JFlex scanner MpdlNormalizerLexAll.
+  * Latin and Chinese have scanner states; other languages use normalize4Lexica.
+  *
+  * @param s string to normalize
+  * @return normalized string
+  */
+ private String normalize4HumanReaders(String s) {
+   // the scanner input is the string plus one trailing newline
+   MpdlNormalizerLexAll scanner = new MpdlNormalizerLexAll(new StringReader(s + "\n"));
+   if (Language.getInstance().isLatin(language)) {
+     scanner.yybegin(MpdlNormalizerLexAll.LA);
+   } else if (Language.getInstance().isChinese(language)) {
+     scanner.yybegin(MpdlNormalizerLexAll.ZH);
+   } else {
+     return normalize4Lexica(s, null); // TODO normalization for all languages; old function until then
+   }
+   StringBuilder normalized = new StringBuilder(); // avoids quadratic String concatenation
+   try {
+     String token;
+     while ((token = scanner.yylex()) != null) normalized.append(token); // null marks end of input
+   } catch (IOException ignored) {
+     // Not expected for a StringReader; leaving the loop returns what was scanned so far
+   }
+   return normalized.toString();
+ }
+ /*
+ // explicit words
+ normStr = normStr.replaceAll("aliàs", "alias");
+ normStr = normStr.replaceAll("hîc", "hic");
+ normStr = normStr.replaceAll("quòd", "quod");
+ normStr = normStr.replaceAll("Quòd", "Quod");
+ normStr = normStr.replaceAll("QVòd", "Quod");
+ normStr = normStr.replaceAll("Cùmque", "Cumque");
+ normStr = normStr.replaceAll("aër", "aer");
+ // ij
+ normStr = normStr.replaceAll("ij", "ii");
+ // qu/qv
+ normStr = normStr.replaceAll("qv", "qu");
+ // normStr = normStr.replaceAll("qV", "qU");
+ normStr = normStr.replaceAll("Qv", "Qu");
+ normStr = normStr.replaceAll("QV", "QU");
+ // u/v
+ String vowels = getVowels();
+ String consonants = getConsonants();
+ normStr = normStr.replaceAll("([" + vowels + "])([-]*)u([" + vowels +"])", "$1$2v$3"); // vowel + u + vowel --> vowel + v + vowel
+ normStr = normStr.replaceAll("([" + vowels + "])([-]*)U([" + vowels +"])", "$1$2V$3"); // vowel + U + vowel --> vowel + V + vowel
+ normStr = normStr.replaceAll("([" + consonants + "])([-]*)v([" + consonants +"])", "$1$2u$3"); // consonant + v + consonant --> consonant + u + consonant
+ normStr = normStr.replaceAll("([" + consonants + "])([-]*)V([" + consonants +"])", "$1$2U$3"); // consonant + V + consonant --> consonant + U + consonant
+ normStr = normStr.replaceAll("^v([" + consonants +"])", "u$1"); // v + consonant --> u + consonant
+ normStr = normStr.replaceAll("^V([" + consonants +"])", "U$1"); // V + consonant --> U + consonant
+ // end of word: diacritica
+ normStr = normStr.replaceAll("à$", "a");
+ normStr = normStr.replaceAll("è$", "e");
+ normStr = normStr.replaceAll("ò$", "o");
+ normStr = normStr.replaceAll("àm$", "am");
+ normStr = normStr.replaceAll("ùm$", "um");
+ String normStrTmp = normStr;
+ normStr = "";
+ for (int i = 0; i < normStrTmp.length(); i++) {
+ char c = normStrTmp.charAt(i);
+ String replace = "";
+ switch (c) {
+ case 'ſ': replace = "s"; break;
+ case 'ß': replace = "ss"; break;
+ case 'æ': replace = "ae"; break;
+ case 'Æ': replace = "AE"; break;
+ case 'ę': replace = "ae"; break;
+ case 'œ': replace = "oe"; break;
+ default: replace += c; break;
+ }
+ normStr = normStr + replace;
+ }
+
+
+ private String getVowels() {
+ String retStr = null;
+ if (Language.getInstance().isItalian(language)) {
+ retStr = "AEIOUaeiou" +
+ "\u00c6\u00e6" + // AE ligatures
+ "\u0152\u0153"; // OE ligatures
+ } else if (Language.getInstance().isLatin(language)) {
+ retStr = "AEIOUaeiouÆœęàèòù";
+ }
+ // TODO all languages
+ return retStr;
+ }
+
+ private String getConsonants() {
+ String retStr = null;
+ if (Language.getInstance().isItalian(language)) {
+ retStr = "BCDFGHKLMNPQRSTVWXZ" +
+ "bcdfghklmnpqrstvwxz" +
+ "ſß"; // long/sharp S
+ } else if (Language.getInstance().isLatin(language)) {
+ retStr = "BCDFGHKLMNPQRSTVWXZ" +
+ "bcdfghklmnpqrstvwxz" +
+ "ſß"; // long/sharp S
+ }
+ // TODO all languages
+ return retStr;
+ }
+
+
+
+
+
+ *
+ *
+ *
+ *
+ */
+
+
+
+
+
+
/**
* Returns a copy of an integer array with the element at
* index removed ("killed").
@@ -1024,7 +1139,7 @@
* @param array integer array
* @param index index of element to remove
*/
- static private int[] arrayKill(int[] array, int index) {
+ private int[] arrayKill(int[] array, int index) {
int[] newArray = new int[array.length - 1];
System.arraycopy(array, 0, newArray, 0, index);
System.arraycopy(array, index + 1, newArray, index, array.length - index - 1);
@@ -1040,7 +1155,7 @@
* @param value value to insert into new slots
* @param count number of new slots to insert
*/
- static private int[] arrayInsert(int[] array, int index, int value, int count) {
+ private int[] arrayInsert(int[] array, int index, int value, int count) {
int[] newArray = new int[array.length + count];
System.arraycopy(array, 0, newArray, 0, index);
for (int i = 0; i < count; i++) newArray[index + i] = value;
@@ -1048,31 +1163,4 @@
return newArray;
}
- /**
- * We provide main() so that our services will be available
- * outside Java (i.e., so we can run as a Un*x-style filter).
- */
- static public void main(String[] argv) throws ApplicationException {
- if (argv.length != 1) {
- System.err.println("You must specify a language.");
- System.exit(1);
- }
- String rec;
- StringBuffer buf = new StringBuffer();
- BufferedReader bin = null;
- try {
- bin = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
- while ((rec = bin.readLine()) != null)
- buf.append(rec + "\n");
- }
- catch (UnsupportedEncodingException e) {
- System.err.println(e);
- System.exit(1);
- } catch (IOException e) {
- System.err.println(e);
- System.exit(1);
- }
- MpdlNormalizer orth = new MpdlNormalizer(argv[0]);
- System.out.print(orth.normalize(buf.toString()));
- }
}
\ No newline at end of file
diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java Tue Feb 08 14:54:09 2011 +0100
@@ -11,11 +11,14 @@
public class MpdlTokenizer extends Tokenizer {
private static final int MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 1024;
- private String language; // TODO make the tokenizer language dependent
+ private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString();
+ private boolean regWithoutSemicolon = false; // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon
+ private boolean isInNotWordDelimMode = false;
private int offset = 0, bufferIndex = 0, dataLen = 0;
private char[] buffer = new char[MAX_WORD_LEN];
private char[] ioBuffer = new char[IO_BUFFER_SIZE];
private MpdlNormalizer normalizer;
+ private String language;
public MpdlTokenizer(Reader input, String language) {
super(input);
@@ -28,12 +31,22 @@
this.normalizer = normalizer;
}
+ public void setRegWithoutSemicolon(boolean regWithoutSemicolon) { // hack switch: when true, isTokenChar() accepts ';' as a token character
+ this.regWithoutSemicolon = regWithoutSemicolon;
+ }
+
+ public boolean isRegWithoutSemicolon() { // current value of the ';' hack switch; consulted by isTokenChar()
+ return regWithoutSemicolon;
+ }
+
/** Returns true iff a character should be included in a token. This
* tokenizer generates as tokens adjacent sequences of characters which
* satisfy this predicate. Characters for which this is false are used to
* define token boundaries and are not included in tokens. */
protected boolean isTokenChar(char c) {
boolean isTokenChar = true;
+ if (isRegWithoutSemicolon() && c == ';') // hack: special case for regularization and normalization; feel free to remove it later
+ return true;
switch (c) {
case ' ': isTokenChar = false; break;
case '.': isTokenChar = false; break;
@@ -51,12 +64,37 @@
case '&': isTokenChar = false; break;
case '+': isTokenChar = false; break;
case '"': isTokenChar = false; break;
+ case '„': isTokenChar = false; break;
+ case '“': isTokenChar = false; break;
+ case '«': isTokenChar = false; break;
+ case '»': isTokenChar = false; break;
case '\'': isTokenChar = false; break;
- // case '\t': isTokenChar = false; break;
- // case '\n': isTokenChar = false; break; // do not break words which are on another line
+ case '\t': isTokenChar = false; break; // a tab ends a token (next() keeps it inside a token only in not-word-delim mode)
+ case '\n': isTokenChar = false; break; // a newline ends a token (next() keeps it inside a token only in not-word-delim mode)
}
return isTokenChar;
}
+
+ /**
+  * Tells whether c is a blank, tab or newline that is kept inside the current
+  * token because the tokenizer is in "not word delim" mode.
+  */
+ protected boolean isTokenCharInNotWordDelimMode(char c) {
+   if (! isInNotWordDelimMode) {
+     return false; // outside that mode this predicate never accepts a character
+   }
+   boolean isWhitespace = c == ' ' || c == '\t' || c == '\n';
+   return isWhitespace;
+ }
+
+ /**
+  * Tells whether c is the special "not word delim" marker, i.e. the unicode
+  * character for newline (U+2424).
+  */
+ protected boolean isSpecialNotWordDelimSymbol(char c) {
+   return c == '\u2424';
+ }
+
/** Called on each token character to normalize it before it is added to the
* token. The default implementation does nothing. Subclasses may use this
@@ -67,6 +105,8 @@
/** Returns the next token in the stream, or null at EOS. */
public final Token next() throws IOException {
+ if (language != null && language.equals("zh"))
+ return nextChinese();
int length = 0;
int start = offset;
while (true) {
@@ -84,7 +124,13 @@
} else {
c = ioBuffer[bufferIndex++];
}
- if (isTokenChar(c)) { // if it's a token char
+ if (isInNotWordDelimMode && isTokenChar(c) && (! isSpecialNotWordDelimSymbol(c))) {
+ isInNotWordDelimMode = false;
+ }
+ if (isSpecialNotWordDelimSymbol(c)) {
+ isInNotWordDelimMode = true;
+ }
+ if (isTokenChar(c) || isTokenCharInNotWordDelimMode(c)) { // if it's a token char
if (length == 0) // start of token
start = offset - 1;
buffer[length++] = normalize(c); // buffer it, normalized
@@ -93,8 +139,10 @@
} else if (length > 0) // at non-Letter w/ chars
break; // return 'em
}
+ isInNotWordDelimMode = false;
Token newToken = new Token(start, start + length);
newToken.setTermBuffer(buffer, 0, length);
+ removeSpecialSymbols(newToken); // remove some special symbols in token (e.g. symbol for word delimiting xml elements)
if (normalizer != null) {
char[] termBuffer = newToken.termBuffer();
int termBufferLength = newToken.termLength();
@@ -110,4 +158,75 @@
}
return newToken;
}
+
+ /**
+  * Removes the marker symbol for word delimiting xml elements (U+2424) from the
+  * term text of the given token and returns that same token.
+  */
+ private Token removeSpecialSymbols(Token token) {
+   String tokenText = new String(token.termBuffer(), 0, token.termLength());
+   String cleanedText = tokenText.replace(SPECIAL_NOT_WORD_DELIM_SYMBOL, ""); // literal replace: the marker is plain text, not a regular expression
+   token.setTermBuffer(cleanedText.toCharArray(), 0, cleanedText.length());
+   return token;
+ }
+
+
+
+ /*
+ * chinese Tokenizer: taken from org.apache.lucene.analysis.cn.ChineseTokenizer
+ *
+ */
+ private int length;
+ private int start;
+
+ private final void push(char c) { // appends c, lower-cased, to the token buffer
+ if (length == 0) start = offset-1; // start of token: nextChinese() has already advanced offset past c
+ buffer[length++] = Character.toLowerCase(c); // buffer it
+ }
+
+ private final Token flush() {
+ if (length>0) {
+ return new Token(new String(buffer, 0, length), start, start+length);
+ }
+ else
+ return null;
+ }
+
+ public final Token nextChinese() throws IOException { // returns the next token, or null when the input is exhausted and nothing is buffered
+ length = 0;
+ start = offset;
+ while (true) {
+ final char c;
+ offset++;
+ if (bufferIndex >= dataLen) { // io buffer used up: read the next chunk
+ dataLen = input.read(ioBuffer);
+ bufferIndex = 0;
+ }
+ if (dataLen == -1) // end of input: hand out what is buffered (null if nothing)
+ return flush();
+ else
+ c = ioBuffer[bufferIndex++];
+ switch(Character.getType(c)) {
+ case Character.DECIMAL_DIGIT_NUMBER:
+ case Character.LOWERCASE_LETTER:
+ case Character.UPPERCASE_LETTER: // digits and cased letters are collected into one token
+ push(c);
+ if (length == MAX_WORD_LEN) // buffer is full: emit the token now
+ return flush();
+ break;
+ case Character.OTHER_LETTER: // letters of this category are emitted as single-character tokens
+ if (length>0) { // a token is pending: emit it first and re-read c on the next call
+ bufferIndex--;
+ offset--;
+ return flush();
+ }
+ push(c);
+ return flush();
+ default: // any other character ends a pending token and is otherwise skipped
+ if (length>0)
+ return flush();
+ break;
+ }
+ }
+ }
}
diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java Tue Feb 08 14:54:09 2011 +0100
@@ -16,6 +16,7 @@
public class MpdlTokenizerAnalyzer extends Analyzer {
protected String language = MpdlConstants.DEFAULT_LANGUAGE;
protected MpdlNormalizer normalizer = null;
+ private boolean regWithoutSemicolon = false; // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon
public MpdlTokenizerAnalyzer(String language) {
this.language = language;
@@ -27,8 +28,18 @@
this.normalizer = normalizer;
}
+ public void setRegWithoutSemicolon(boolean regWithoutSemicolon) { // hack switch that tokenStream() passes on to each MpdlTokenizer it creates
+ this.regWithoutSemicolon = regWithoutSemicolon;
+ }
+
+ public boolean isRegWithoutSemicolon() { // current value of the hack switch set by setRegWithoutSemicolon()
+ return regWithoutSemicolon;
+ }
+
public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new MpdlTokenizer(reader, language, normalizer);
+ MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer);
+ tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later
+ TokenStream result = (TokenStream) tmpTokenizer;
result = new MpdlFilter(result); // filter to remove the hyphen in a token etc.
result = new LowerCaseFilter(result);
return result;
@@ -38,7 +49,9 @@
ArrayListaState
+ */
+ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private static final String ZZ_ATTRIBUTE_PACKED_0 =
+ "\4\0\5\11\3\1\6\11\14\1\4\11\1\1\7\11"+
+ "\1\0\1\11\3\0\1\11\1\0\1\11\3\0\6\11"+
+ "\1\0\1\11\2\0\1\11\1\0\1\11\3\0\1\11"+
+ "\1\0\1\11\1\0\1\11\11\0\1\11\5\0";
+
+ private static int [] zzUnpackAttribute() { // builds the ZZ_ATTRIBUTE table from its packed string form
+ int [] result = new int[89]; // the unpacked table has 89 entries
+ int offset = 0;
+ offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); // returns the index after the last written entry
+ return result;
+ }
+
+ private static int zzUnpackAttribute(String packed, int offset, int [] result) { // run-length decodes packed into result, starting at offset
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) { // packed is read as consecutive (count, value) character pairs
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0); // do/while: value is written at least once
+ }
+ return j; // index after the last written entry
+ }
+
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the textposition at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /** denotes if the user-EOF-code has already been executed */
+ private boolean zzEOFDone;
+
+ /* user code: */
+ int cv = 0; // consonant = 1, vowel = 2, everything else = 0
+
+
+ /**
+ * Creates a new scanner
+ * There is also a java.io.InputStream version of this constructor.
+ *
+ * @param in the java.io.Reader to read input from.
+ */
+ public MpdlNormalizerLexAll(java.io.Reader in) { // only stores the reader; nothing is read yet
+ this.zzReader = in;
+ }
+
+ /**
+ * Creates a new scanner.
+ * There is also a java.io.Reader version of this constructor.
+ *
+ * @param in the java.io.InputStream to read input from.
+ */
+ public MpdlNormalizerLexAll(java.io.InputStream in) { // delegates to the Reader constructor
+ this(new java.io.InputStreamReader(in)); // NOTE(review): no charset is passed, so the platform default decoding applies
+ }
+
+ /**
+ * Unpacks the compressed character translation table.
+ *
+ * @param packed the packed character translation table
+ * @return the unpacked character translation table
+ */
+ private static char [] zzUnpackCMap(String packed) { // run-length decodes the character translation table
+ char [] map = new char[0x10000]; // one entry per char value (0x0000-0xFFFF)
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 172) { // hard-coded bound; presumably the length of the packed string - verify against ZZ_CMAP_PACKED
+ int count = packed.charAt(i++);
+ char value = packed.charAt(i++);
+ do map[j++] = value; while (--count > 0); // do/while: value is written at least once
+ }
+ return map;
+ }
+
+
+ /**
+ * Refills the input buffer.
+ *
+ * @return false, iff there was new input.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ private boolean zzRefill() throws java.io.IOException { // returns true at end of input, false when new characters were read
+
+ /* first: make room (if you can) */
+ if (zzStartRead > 0) { // shift the unread part of the buffer to index 0
+ System.arraycopy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead-zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead-= zzStartRead;
+ zzCurrentPos-= zzStartRead;
+ zzMarkedPos-= zzStartRead;
+ zzStartRead = 0;
+ }
+
+ /* is the buffer big enough? */
+ if (zzCurrentPos >= zzBuffer.length) {
+ /* if not: blow it up */
+ char newBuffer[] = new char[zzCurrentPos*2]; // new size is twice the current position
+ System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+ zzBuffer = newBuffer;
+ }
+
+ /* finally: fill the buffer with new input */
+ int numRead = zzReader.read(zzBuffer, zzEndRead,
+ zzBuffer.length-zzEndRead);
+
+ if (numRead > 0) {
+ zzEndRead+= numRead;
+ return false;
+ }
+ // unlikely but not impossible: read 0 characters, but not at end of stream
+ if (numRead == 0) {
+ int c = zzReader.read(); // single-character read to find out whether the stream has really ended
+ if (c == -1) {
+ return true;
+ } else {
+ zzBuffer[zzEndRead++] = (char) c;
+ return false;
+ }
+ }
+
+ // numRead < 0
+ return true;
+ }
+
+
+ /**
+ * Closes the input stream.
+ */
+ public final void yyclose() throws java.io.IOException { // marks the scanner as finished, then closes the reader if there is one
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader != null)
+ zzReader.close();
+ }
+
+
+ /**
+ * Resets the scanner to read from a new input stream.
+ * Does not close the old reader.
+ *
+ * All internal variables are reset, the old input stream
+ * cannot be reused (internal buffer is discarded and lost).
+ * Lexical state is set to ZZ_INITIAL.
+ *
+ * @param reader the new input stream
+ */
+ public final void yyreset(java.io.Reader reader) { // switches to a new reader and clears all scanning state
+ zzReader = reader;
+ zzAtBOL = true;
+ zzAtEOF = false;
+ zzEOFDone = false;
+ zzEndRead = zzStartRead = 0; // buffer positions back to 0: buffered input of the old reader is dropped
+ zzCurrentPos = zzMarkedPos = 0;
+ yyline = yychar = yycolumn = 0;
+ zzLexicalState = YYINITIAL; // a state set via yybegin() must be set again after a reset
+ }
+
+
+ /**
+ * Returns the current lexical state.
+ */
+ public final int yystate() { // the value last set by yybegin(), or YYINITIAL after construction/reset
+ return zzLexicalState;
+ }
+
+
+ /**
+ * Enters a new lexical state
+ *
+ * @param newState the new lexical state
+ */
+ public final void yybegin(int newState) { // newState is stored unchecked; yylex() uses it as an index into ZZ_LEXSTATE
+ zzLexicalState = newState;
+ }
+
+
+ /**
+ * Returns the text matched by the current regular expression.
+ */
+ public final String yytext() { // copies the buffer range [zzStartRead, zzMarkedPos) into a new String
+ return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+ }
+
+
+ /**
+ * Returns the character at position pos from the
+ * matched text.
+ *
+ * It is equivalent to yytext().charAt(pos), but faster
+ *
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) { // pos is not range-checked against yylength()
+ return zzBuffer[zzStartRead+pos];
+ }
+
+
+ /**
+ * Returns the length of the matched text region.
+ */
+ public final int yylength() { // distance between the start of the match and the marked position
+ return zzMarkedPos-zzStartRead;
+ }
+
+
+ /**
+ * Reports an error that occurred while scanning.
+ *
+ * In a wellformed scanner (no or only correct usage of
+ * yypushback(int) and a match-all fallback rule) this method
+ * will only be called with things that "Can't Possibly Happen".
+ * If this method is called, something is seriously wrong
+ * (e.g. a JFlex bug producing a faulty scanner etc.).
+ *
+ * Usual syntax/scanner level error handling should be done
+ * in error fallback rules.
+ *
+ * @param errorCode the code of the errormessage to display
+ */
+ private void zzScanError(int errorCode) { // always throws java.lang.Error; never returns normally
+ String message;
+ try {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (ArrayIndexOutOfBoundsException e) { // unknown error code: fall back to the generic message
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+ }
+
+ throw new Error(message);
+ }
+
+
+ /**
+ * Pushes the specified amount of characters back into the input stream.
+ *
+ * They will be read again by the next call of the scanning method
+ *
+ * @param number the number of characters to be read again.
+ * This number must not be greater than yylength()!
+ */
+ public void yypushback(int number) { // moves the marked position back by number characters
+ if ( number > yylength() )
+ zzScanError(ZZ_PUSHBACK_2BIG); // throws an Error, so the assignment below is not reached
+
+ zzMarkedPos -= number;
+ }
+
+
+ /**
+ * Resumes scanning until the next regular expression is matched,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * @return the next token
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ public java.lang.String yylex() throws java.io.IOException { // returns the replacement text for the next match, or null at end of input
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+ int [] zzTransL = ZZ_TRANS;
+ int [] zzRowMapL = ZZ_ROWMAP;
+ int [] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true) {
+ zzMarkedPosL = zzMarkedPos;
+
+ if (zzMarkedPosL > zzStartRead) { // derive zzAtBOL from the last character of the previous match
+ switch (zzBufferL[zzMarkedPosL-1]) {
+ case '\n':
+ case '\u000B':
+ case '\u000C':
+ case '\u0085':
+ case '\u2028':
+ case '\u2029':
+ zzAtBOL = true;
+ break;
+ case '\r': // after '\r' the scanner is at the beginning of a line unless a '\n' follows
+ if (zzMarkedPosL < zzEndReadL)
+ zzAtBOL = zzBufferL[zzMarkedPosL] != '\n';
+ else if (zzAtEOF)
+ zzAtBOL = false;
+ else {
+ boolean eof = zzRefill();
+ zzMarkedPosL = zzMarkedPos;
+ zzEndReadL = zzEndRead;
+ zzBufferL = zzBuffer;
+ if (eof)
+ zzAtBOL = false;
+ else
+ zzAtBOL = zzBufferL[zzMarkedPosL] != '\n';
+ }
+ break;
+ default:
+ zzAtBOL = false;
+ }
+ }
+ zzAction = -1;
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+ if (zzAtBOL) // ZZ_LEXSTATE holds two start states per lexical state; index+1 is used at the beginning of a line
+ zzState = ZZ_LEXSTATE[zzLexicalState+1];
+ else
+ zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+ zzForAction: {
+ while (true) {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ boolean eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+ if (zzNext == -1) break zzForAction; // no transition for this input: stop and use the last recorded match
+ zzState = zzNext;
+
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) { // bit 0 set: record this state and position as the latest match
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ( (zzAttributes & 8) == 8 ) break zzForAction; // bit 3 set: stop scanning right away
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { // rule number of the match; stays negative when nothing matched
+ case 25:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { return "o";
+ }
+ case 39: break;
+ case 22:
+ { cv = 2; return "ii";
+ }
+ case 40: break;
+ case 35:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 2;
+ { return "od";
+ }
+ case 41: break;
+ case 7:
+ { cv = 1; return "s";
+ }
+ case 42: break;
+ case 24:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { return "e";
+ }
+ case 43: break;
+ case 29:
+ { cv = 1; return "Qu";
+ }
+ case 44: break;
+ case 19:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { switch(cv) { // cv == 2 (previous was a vowel): "v"; otherwise "u", which itself counts as a vowel
+ case 2: return "v";
+ default: cv = 2; return "u";
+ }
+ }
+ case 45: break;
+ case 9:
+ { cv = 2; return "ae";
+ }
+ case 46: break;
+ case 15:
+ { return "精";
+ }
+ case 47: break;
+ case 3:
+ { cv = 0; return yytext();
+ }
+ case 48: break;
+ case 27:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { switch(cv) { // upper-case counterpart of the u/v decision in case 19
+ case 2: return "V";
+ default: cv = 2; return "U";
+ }
+ }
+ case 49: break;
+ case 2:
+ { return "";
+ }
+ case 50: break;
+ case 33:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 2;
+ { return "am";
+ }
+ case 51: break;
+ case 18:
+ { cv = 1; return "qu";
+ }
+ case 52: break;
+ case 14:
+ { return "歷";
+ }
+ case 53: break;
+ case 8:
+ { cv = 1; return "ss";
+ }
+ case 54: break;
+ case 4:
+ { cv = 2; return yytext();
+ }
+ case 55: break;
+ case 32:
+ { return "庶";
+ }
+ case 56: break;
+ case 6:
+ { cv = 0; return "";
+ }
+ case 57: break;
+ case 16:
+ { switch(cv) { // cv == 1 (previous was a consonant): every "v" in the match becomes "u"
+ case 1: return yytext().replace("v", "u");
+ default: cv = 1; return yytext();
+ }
+ }
+ case 58: break;
+ case 12:
+ { return "奇";
+ }
+ case 59: break;
+ case 38:
+ { return "hic";
+ }
+ case 60: break;
+ case 26:
+ { cv = 2; return "oi";
+ }
+ case 61: break;
+ case 36:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 2;
+ { return "um";
+ }
+ case 62: break;
+ case 17:
+ { switch(cv) { // upper-case counterpart of case 16
+ case 1: return yytext().replace("V", "U");
+ default: cv = 1; return yytext();
+ }
+ }
+ case 63: break;
+ case 21:
+ { cv = 2; return "uu";
+ }
+ case 64: break;
+ case 31:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { cv = 1; return "U";
+ }
+ case 65: break;
+ case 1:
+ { return yytext();
+ }
+ case 66: break;
+ case 34:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 2;
+ { return "as";
+ }
+ case 67: break;
+ case 23:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { return "a";
+ }
+ case 68: break;
+ case 13:
+ { return "時";
+ }
+ case 69: break;
+ case 10:
+ { cv = 2; return "AE";
+ }
+ case 70: break;
+ case 37:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 2;
+ { return "us";
+ }
+ case 71: break;
+ case 5:
+ { cv = 1; return yytext();
+ }
+ case 72: break;
+ case 28:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { cv = 1; return "u";
+ }
+ case 73: break;
+ case 30:
+ { cv = 1; return "QU";
+ }
+ case 74: break;
+ case 20:
+ { cv = 2; return "ui";
+ }
+ case 75: break;
+ case 11:
+ { cv = 2; return "oe";
+ }
+ case 76: break;
+ default: // nothing matched: either a clean end of input or a scanner error
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+ zzAtEOF = true;
+ return null;
+ }
+ else {
+ zzScanError(ZZ_NO_MATCH);
+ }
+ }
+ }
+ }
+
+
+}
diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAll.lex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAll.lex Tue Feb 08 14:54:09 2011 +0100
@@ -0,0 +1,143 @@
+/*
+ * Normalization rules for all languages
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * 2011-01-25
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexAll
+%type java.lang.String
+%unicode
+// %debug
+
+%states LA, ZH
+
+%{
+ int cv = 0; // consonant = 1, vowel = 2, everything else = 0
+%}
+
+VOWEL=[AEIOUaeiouÆæęàèòùœ]
+CONS=[BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß]
+LR=[lLrR]
+QUE=(que)?
+END=\n
+
+%%
+
+aState
+ */
+ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private static final String ZZ_ATTRIBUTE_PACKED_0 =
+ "\1\0\1\11\1\1\u010c\11\3\1\14\11\1\1\3\0"+
+ "\1\11\1\0\1\1\33\0\2\11\17\0\1\11";
+
+ private static int [] zzUnpackAttribute() { // expands ZZ_ATTRIBUTE_PACKED_0 into a 338-entry int table
+ int [] result = new int[338];
+ int offset = 0;
+ offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); // the returned end index is stored but not used afterwards
+ return result;
+ }
+
+ private static int zzUnpackAttribute(String packed, int offset, int [] result) { // run-length decodes (count, value) char pairs into result starting at offset; returns the index after the last write
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0); // do-while: value is written at least once, even when count is 0
+ }
+ return j;
+ }
+
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the textposition at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /** denotes if the user-EOF-code has already been executed */
+ private boolean zzEOFDone;
+
+
+ /**
+ * Creates a new scanner
+ * There is also a java.io.InputStream version of this constructor.
+ *
+ * @param in the java.io.Reader to read input from.
+ */
+ public Unicode2BetacodeLex(java.io.Reader in) { // only stores the reader; nothing is read from it here
+ this.zzReader = in;
+ }
+
+ /**
+ * Creates a new scanner.
+ * There is also a java.io.Reader version of this constructor.
+ *
+ * @param in the java.io.InputStream to read input from.
+ */
+ public Unicode2BetacodeLex(java.io.InputStream in) {
+ this(new java.io.InputStreamReader(in)); // no charset given: InputStreamReader falls back to the platform default charset
+ }
+
+ /**
+ * Unpacks the compressed character translation table.
+ *
+ * @param packed the packed character translation table
+ * @return the unpacked character translation table
+ */
+ private static char [] zzUnpackCMap(String packed) {
+ char [] map = new char[0x10000]; // one slot per 16-bit char value
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 724) { // hard-coded bound instead of packed.length() -- presumably the length of the packed CMAP string; TODO confirm
+ int count = packed.charAt(i++);
+ char value = packed.charAt(i++);
+ do map[j++] = value; while (--count > 0); // run-length expansion: value is written at least once per pair
+ }
+ return map;
+ }
+
+
+ /**
+ * Refills the input buffer.
+ *
+ * @return false, iff there was new input.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ private boolean zzRefill() throws java.io.IOException { // returns false when characters were added to the buffer, true when nothing more could be read
+
+ /* first: make room (if you can) */
+ if (zzStartRead > 0) { // shift the still-needed region [zzStartRead, zzEndRead) to the front of the buffer
+ System.arraycopy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead-zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead-= zzStartRead;
+ zzCurrentPos-= zzStartRead;
+ zzMarkedPos-= zzStartRead;
+ zzStartRead = 0;
+ }
+
+ /* is the buffer big enough? */
+ if (zzCurrentPos >= zzBuffer.length) {
+ /* if not: blow it up */
+ char newBuffer[] = new char[zzCurrentPos*2]; // new size is derived from zzCurrentPos, not from the old buffer length
+ System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+ zzBuffer = newBuffer;
+ }
+
+ /* finally: fill the buffer with new input */
+ int numRead = zzReader.read(zzBuffer, zzEndRead,
+ zzBuffer.length-zzEndRead);
+
+ if (numRead > 0) {
+ zzEndRead+= numRead;
+ return false;
+ }
+ // unlikely but not impossible: read 0 characters, but not at end of stream
+ if (numRead == 0) {
+ int c = zzReader.read(); // single-char read to tell "nothing available yet" apart from end of stream
+ if (c == -1) {
+ return true;
+ } else {
+ zzBuffer[zzEndRead++] = (char) c;
+ return false;
+ }
+ }
+
+ // numRead < 0
+ return true;
+ }
+
+
+ /**
+ * Closes the input stream.
+ */
+ public final void yyclose() throws java.io.IOException {
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader != null) // a null reader is tolerated: only the flags above are updated in that case
+ zzReader.close();
+ }
+
+
+ /**
+ * Resets the scanner to read from a new input stream.
+ * Does not close the old reader.
+ *
+ * All internal variables are reset, the old input stream
+ * cannot be reused (internal buffer is discarded and lost).
+ * Lexical state is set to YYINITIAL.
+ *
+ * @param reader the new input stream
+ */
+ public final void yyreset(java.io.Reader reader) {
+ zzReader = reader; // the previous reader is only replaced here, not closed
+ zzAtBOL = true;
+ zzAtEOF = false;
+ zzEOFDone = false;
+ zzEndRead = zzStartRead = 0; // all positions go back to 0; the buffer array itself is kept
+ zzCurrentPos = zzMarkedPos = 0;
+ yyline = yychar = yycolumn = 0;
+ zzLexicalState = YYINITIAL;
+ }
+
+
+ /**
+ * Returns the current lexical state.
+ */
+ public final int yystate() { // plain getter for zzLexicalState
+ return zzLexicalState;
+ }
+
+
+ /**
+ * Enters a new lexical state
+ *
+ * @param newState the new lexical state
+ */
+ public final void yybegin(int newState) {
+ zzLexicalState = newState; // stored without validation; yylex() later uses it as an index into ZZ_LEXSTATE
+ }
+
+
+ /**
+ * Returns the text matched by the current regular expression.
+ */
+ public final String yytext() {
+ return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); // copies the matched region [zzStartRead, zzMarkedPos) into a new String
+ }
+
+
+ /**
+ * Returns the character at position pos from the
+ * matched text.
+ *
+ * It is equivalent to yytext().charAt(pos), but faster
+ *
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) {
+ return zzBuffer[zzStartRead+pos]; // pos is not checked against yylength(); only the array bounds apply
+ }
+
+
+ /**
+ * Returns the length of the matched text region.
+ */
+ public final int yylength() {
+ return zzMarkedPos-zzStartRead; // same length that yytext() uses for the matched region
+ }
+
+
+ /**
+ * Reports an error that occurred while scanning.
+ *
+ * In a wellformed scanner (no or only correct usage of
+ * yypushback(int) and a match-all fallback rule) this method
+ * will only be called with things that "Can't Possibly Happen".
+ * If this method is called, something is seriously wrong
+ * (e.g. a JFlex bug producing a faulty scanner etc.).
+ *
+ * Usual syntax/scanner level error handling should be done
+ * in error fallback rules.
+ *
+ * @param errorCode the code of the errormessage to display
+ */
+ private void zzScanError(int errorCode) { // never returns normally: always ends in the throw below
+ String message;
+ try {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (ArrayIndexOutOfBoundsException e) { // unknown error code: fall back to the generic message
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+ }
+
+ throw new Error(message); // java.lang.Error, not an Exception: a catch (Exception e) in calling code will not intercept it
+ }
+
+
+ /**
+ * Pushes the specified amount of characters back into the input stream.
+ *
+ * They will be read again by the next call of the scanning method
+ *
+ * @param number the number of characters to be read again.
+ * This number must not be greater than yylength()!
+ */
+ public void yypushback(int number) {
+ if ( number > yylength() )
+ zzScanError(ZZ_PUSHBACK_2BIG); // throws, so the decrement below is not reached for oversized requests
+
+ zzMarkedPos -= number; // yylex() starts its next match at zzMarkedPos, so these characters are scanned again
+ }
+
+
+ /**
+ * Resumes scanning until the next regular expression is matched,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * @return the next token
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ public java.lang.String yylex() throws java.io.IOException { // returns the replacement string for the next match, or null once the end of input is reached
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+ int [] zzTransL = ZZ_TRANS;
+ int [] zzRowMapL = ZZ_ROWMAP;
+ int [] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true) {
+ zzMarkedPosL = zzMarkedPos;
+
+ zzAction = -1;
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; // the next match starts where the previous one ended
+
+ zzState = ZZ_LEXSTATE[zzLexicalState]; // DFA start state for the current lexical state
+
+
+ zzForAction: {
+ while (true) {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ boolean eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; // transition lookup: row of the current state plus character class of the input
+ if (zzNext == -1) break zzForAction; // no transition: stop and use the last recorded accepting state, if any
+ zzState = zzNext;
+
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) { // bit 1 set: accepting state, remember it and the match end
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ( (zzAttributes & 8) == 8 ) break zzForAction; // bit 8 set: stop extending the match at this state
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { // zzAction < 0 means no accepting state was reached and lands in default
+ case 266:
+ { return "p";
+ }
+ case 287: break;
+ case 102:
+ { return "*(w";
+ }
+ case 288: break;
+ case 20:
+ { return "*(\\a";
+ }
+ case 289: break;
+ case 21:
+ { return "*)/a";
+ }
+ case 290: break;
+ case 181:
+ { return "*a/";
+ }
+ case 291: break;
+ case 237:
+ { return "*a";
+ }
+ case 292: break;
+ case 260:
+ { return "n";
+ }
+ case 293: break;
+ case 89:
+ { return "*(u";
+ }
+ case 294: break;
+ case 16:
+ { return "a(=";
+ }
+ case 295: break;
+ case 30:
+ { return "e(/";
+ }
+ case 296: break;
+ case 195:
+ { return "i+\\";
+ }
+ case 297: break;
+ case 222:
+ { return "w=";
+ }
+ case 298: break;
+ case 210:
+ { return "u+=";
+ }
+ case 299: break;
+ case 99:
+ { return "w)=";
+ }
+ case 300: break;
+ case 256:
+ { return "l";
+ }
+ case 301: break;
+ case 205:
+ { return "u+\\";
+ }
+ case 302: break;
+ case 23:
+ { return "*)=a";
+ }
+ case 303: break;
+ case 225:
+ { return "*o/";
+ }
+ case 304: break;
+ case 44:
+ { return "h(=";
+ }
+ case 305: break;
+ case 3:
+ { return "j";
+ }
+ case 306: break;
+ case 103:
+ { return "*)\\w";
+ }
+ case 307: break;
+ case 152:
+ { return "*(/|h";
+ }
+ case 308: break;
+ case 165:
+ { return "*)\\|w";
+ }
+ case 309: break;
+ case 248:
+ { return "h";
+ }
+ case 310: break;
+ case 76:
+ { return "*(o";
+ }
+ case 311: break;
+ case 159:
+ { return "w)/|";
+ }
+ case 312: break;
+ case 178:
+ { return "*a^";
+ }
+ case 313: break;
+ case 141:
+ { return "h)\\|";
+ }
+ case 314: break;
+ case 106:
+ { return "*(/w";
+ }
+ case 315: break;
+ case 275:
+ { return "f";
+ }
+ case 316: break;
+ case 227:
+ { return "/";
+ }
+ case 317: break;
+ case 91:
+ { return "*(/u";
+ }
+ case 318: break;
+ case 242:
+ { return "d";
+ }
+ case 319: break;
+ case 161:
+ { return "w)=|";
+ }
+ case 320: break;
+ case 57:
+ { return "i)/";
+ }
+ case 321: break;
+ case 154:
+ { return "*(=|h";
+ }
+ case 322: break;
+ case 95:
+ { return "w)\\";
+ }
+ case 323: break;
+ case 108:
+ { return "*(=w";
+ }
+ case 324: break;
+ case 116:
+ { return "i/";
+ }
+ case 325: break;
+ case 238:
+ { return "b";
+ }
+ case 326: break;
+ case 207:
+ { return "r)";
+ }
+ case 327: break;
+ case 147:
+ { return "*)|h";
+ }
+ case 328: break;
+ case 62:
+ { return "*(i";
+ }
+ case 329: break;
+ case 230:
+ { return "+";
+ }
+ case 330: break;
+ case 77:
+ { return "*)\\o";
+ }
+ case 331: break;
+ case 166:
+ { return "*(\\|w";
+ }
+ case 332: break;
+ case 71:
+ { return "o)\\";
+ }
+ case 333: break;
+ case 92:
+ { return "*(=u";
+ }
+ case 334: break;
+ case 232:
+ { return ")";
+ }
+ case 335: break;
+ case 14:
+ { return "a(/";
+ }
+ case 336: break;
+ case 122:
+ { return "w/";
+ }
+ case 337: break;
+ case 206:
+ { return "u+/";
+ }
+ case 338: break;
+ case 80:
+ { return "*(/o";
+ }
+ case 339: break;
+ case 97:
+ { return "w)/";
+ }
+ case 340: break;
+ case 123:
+ { return "a)|";
+ }
+ case 341: break;
+ case 229:
+ { return "^";
+ }
+ case 342: break;
+ case 32:
+ { return "*(e";
+ }
+ case 343: break;
+ case 286:
+ { return "'";
+ }
+ case 344: break;
+ case 42:
+ { return "h(/";
+ }
+ case 345: break;
+ case 53:
+ { return "i)";
+ }
+ case 346: break;
+ case 174:
+ { return "a|";
+ }
+ case 347: break;
+ case 63:
+ { return "*)\\i";
+ }
+ case 348: break;
+ case 139:
+ { return "h)|";
+ }
+ case 349: break;
+ case 193:
+ { return "i^";
+ }
+ case 350: break;
+ case 18:
+ { return "*(a";
+ }
+ case 351: break;
+ case 74:
+ { return "o(/";
+ }
+ case 352: break;
+ case 93:
+ { return "w)";
+ }
+ case 353: break;
+ case 66:
+ { return "*(/i";
+ }
+ case 354: break;
+ case 101:
+ { return "*)w";
+ }
+ case 355: break;
+ case 7:
+ { return "!";
+ }
+ case 356: break;
+ case 33:
+ { return "*)\\e";
+ }
+ case 357: break;
+ case 15:
+ { return "a)=";
+ }
+ case 358: break;
+ case 29:
+ { return "e)/";
+ }
+ case 359: break;
+ case 68:
+ { return "*(=i";
+ }
+ case 360: break;
+ case 125:
+ { return "a)\\|";
+ }
+ case 361: break;
+ case 36:
+ { return "*(/e";
+ }
+ case 362: break;
+ case 115:
+ { return "i\\";
+ }
+ case 363: break;
+ case 201:
+ { return "*i\\";
+ }
+ case 364: break;
+ case 112:
+ { return "e/";
+ }
+ case 365: break;
+ case 218:
+ { return "w/|";
+ }
+ case 366: break;
+ case 176:
+ { return "a=";
+ }
+ case 367: break;
+ case 19:
+ { return "*)\\a";
+ }
+ case 368: break;
+ case 43:
+ { return "h)=";
+ }
+ case 369: break;
+ case 133:
+ { return "*)\\|a";
+ }
+ case 370: break;
+ case 270:
+ { return "s1";
+ }
+ case 371: break;
+ case 247:
+ { return "*z";
+ }
+ case 372: break;
+ case 204:
+ { return "u_";
+ }
+ case 373: break;
+ case 143:
+ { return "h)/|";
+ }
+ case 374: break;
+ case 22:
+ { return "*(/a";
+ }
+ case 375: break;
+ case 82:
+ { return "u(";
+ }
+ case 376: break;
+ case 75:
+ { return "*)o";
+ }
+ case 377: break;
+ case 223:
+ { return "w=|";
+ }
+ case 378: break;
+ case 278:
+ { return "*x";
+ }
+ case 379: break;
+ case 121:
+ { return "w\\";
+ }
+ case 380: break;
+ case 200:
+ { return "*i_";
+ }
+ case 381: break;
+ case 219:
+ { return "*w\\";
+ }
+ case 382: break;
+ case 25:
+ { return "e)";
+ }
+ case 383: break;
+ case 145:
+ { return "h)=|";
+ }
+ case 384: break;
+ case 151:
+ { return "*)/|h";
+ }
+ case 385: break;
+ case 24:
+ { return "*(=a";
+ }
+ case 386: break;
+ case 4:
+ { return "*v";
+ }
+ case 387: break;
+ case 192:
+ { return "*h|";
+ }
+ case 388: break;
+ case 39:
+ { return "h)\\";
+ }
+ case 389: break;
+ case 272:
+ { return "*t";
+ }
+ case 390: break;
+ case 134:
+ { return "*(\\|a";
+ }
+ case 391: break;
+ case 214:
+ { return "*u/";
+ }
+ case 392: break;
+ case 61:
+ { return "*)i";
+ }
+ case 393: break;
+ case 269:
+ { return "*r";
+ }
+ case 394: break;
+ case 160:
+ { return "w(/|";
+ }
+ case 395: break;
+ case 13:
+ { return "a)/";
+ }
+ case 396: break;
+ case 153:
+ { return "*)=|h";
+ }
+ case 397: break;
+ case 267:
+ { return "*p";
+ }
+ case 398: break;
+ case 111:
+ { return "e\\";
+ }
+ case 399: break;
+ case 88:
+ { return "u(=";
+ }
+ case 400: break;
+ case 31:
+ { return "*)e";
+ }
+ case 401: break;
+ case 188:
+ { return "*e\\";
+ }
+ case 402: break;
+ case 110:
+ { return "a/";
+ }
+ case 403: break;
+ case 162:
+ { return "w(=|";
+ }
+ case 404: break;
+ case 41:
+ { return "h)/";
+ }
+ case 405: break;
+ case 261:
+ { return "*n";
+ }
+ case 406: break;
+ case 226:
+ { return "\\";
+ }
+ case 407: break;
+ case 96:
+ { return "w(\\";
+ }
+ case 408: break;
+ case 148:
+ { return "*(|h";
+ }
+ case 409: break;
+ case 257:
+ { return "*l";
+ }
+ case 410: break;
+ case 211:
+ { return "*u^";
+ }
+ case 411: break;
+ case 198:
+ { return "i+=";
+ }
+ case 412: break;
+ case 279:
+ { return "y";
+ }
+ case 413: break;
+ case 17:
+ { return "*)a";
+ }
+ case 414: break;
+ case 73:
+ { return "o)/";
+ }
+ case 415: break;
+ case 72:
+ { return "o(\\";
+ }
+ case 416: break;
+ case 118:
+ { return "o/";
+ }
+ case 417: break;
+ case 168:
+ { return "*(/|w";
+ }
+ case 418: break;
+ case 2:
+ { return "*j";
+ }
+ case 419: break;
+ case 281:
+ { return "w";
+ }
+ case 420: break;
+ case 48:
+ { return "*(\\h";
+ }
+ case 421: break;
+ case 49:
+ { return "*)/h";
+ }
+ case 422: break;
+ case 9:
+ { return "a)";
+ }
+ case 423: break;
+ case 216:
+ { return "w\\|";
+ }
+ case 424: break;
+ case 249:
+ { return "*h";
+ }
+ case 425: break;
+ case 273:
+ { return "u";
+ }
+ case 426: break;
+ case 171:
+ { return "a^";
+ }
+ case 427: break;
+ case 175:
+ { return "a/|";
+ }
+ case 428: break;
+ case 285:
+ { return "<";
+ }
+ case 429: break;
+ case 276:
+ { return "*f";
+ }
+ case 430: break;
+ case 38:
+ { return "h(";
+ }
+ case 431: break;
+ case 283:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { return "s";
+ }
+ case 432: break;
+ case 51:
+ { return "*)=h";
+ }
+ case 433: break;
+ case 127:
+ { return "a)/|";
+ }
+ case 434: break;
+ case 170:
+ { return "*(=|w";
+ }
+ case 435: break;
+ case 69:
+ { return "o)";
+ }
+ case 436: break;
+ case 243:
+ { return "*d";
+ }
+ case 437: break;
+ case 185:
+ { return "h/|";
+ }
+ case 438: break;
+ case 250:
+ { return "q";
+ }
+ case 439: break;
+ case 163:
+ { return "*)|w";
+ }
+ case 440: break;
+ case 8:
+ { return ":";
+ }
+ case 441: break;
+ case 177:
+ { return "a=|";
+ }
+ case 442: break;
+ case 239:
+ { return "*b";
+ }
+ case 443: break;
+ case 158:
+ { return "w(\\|";
+ }
+ case 444: break;
+ case 109:
+ { return "a\\";
+ }
+ case 445: break;
+ case 264:
+ { return "o";
+ }
+ case 446: break;
+ case 129:
+ { return "a)=|";
+ }
+ case 447: break;
+ case 86:
+ { return "u(/";
+ }
+ case 448: break;
+ case 180:
+ { return "*a\\";
+ }
+ case 449: break;
+ case 11:
+ { return "a)\\";
+ }
+ case 450: break;
+ case 187:
+ { return "h=|";
+ }
+ case 451: break;
+ case 258:
+ { return "m";
+ }
+ case 452: break;
+ case 191:
+ { return "*h/";
+ }
+ case 453: break;
+ case 113:
+ { return "h\\";
+ }
+ case 454: break;
+ case 190:
+ { return "*h\\";
+ }
+ case 455: break;
+ case 196:
+ { return "i+/";
+ }
+ case 456: break;
+ case 254:
+ { return "k";
+ }
+ case 457: break;
+ case 215:
+ { return "*(r";
+ }
+ case 458: break;
+ case 27:
+ { return "e)\\";
+ }
+ case 459: break;
+ case 117:
+ { return "o\\";
+ }
+ case 460: break;
+ case 252:
+ { return "i";
+ }
+ case 461: break;
+ case 224:
+ { return "*o\\";
+ }
+ case 462: break;
+ case 144:
+ { return "h(/|";
+ }
+ case 463: break;
+ case 179:
+ { return "*a_";
+ }
+ case 464: break;
+ case 221:
+ { return "*w|";
+ }
+ case 465: break;
+ case 240:
+ { return "g";
+ }
+ case 466: break;
+ case 55:
+ { return "i)\\";
+ }
+ case 467: break;
+ case 209:
+ { return "u=";
+ }
+ case 468: break;
+ case 87:
+ { return "u)=";
+ }
+ case 469: break;
+ case 244:
+ { return "e";
+ }
+ case 470: break;
+ case 146:
+ { return "h(=|";
+ }
+ case 471: break;
+ case 83:
+ { return "u)\\";
+ }
+ case 472: break;
+ case 40:
+ { return "h(\\";
+ }
+ case 473: break;
+ case 262:
+ { return "c";
+ }
+ case 474: break;
+ case 136:
+ { return "*(/|a";
+ }
+ case 475: break;
+ case 236:
+ { return "a";
+ }
+ case 476: break;
+ case 208:
+ { return "r(";
+ }
+ case 477: break;
+ case 46:
+ { return "*(h";
+ }
+ case 478: break;
+ case 228:
+ { return "_";
+ }
+ case 479: break;
+ case 183:
+ { return "h\\|";
+ }
+ case 480: break;
+ case 233:
+ { return "(";
+ }
+ case 481: break;
+ case 138:
+ { return "*(=|a";
+ }
+ case 482: break;
+ case 194:
+ { return "i_";
+ }
+ case 483: break;
+ case 167:
+ { return "*)/|w";
+ }
+ case 484: break;
+ case 54:
+ { return "i(";
+ }
+ case 485: break;
+ case 131:
+ { return "*)|a";
+ }
+ case 486: break;
+ case 47:
+ { return "*)\\h";
+ }
+ case 487: break;
+ case 184:
+ { return "h|";
+ }
+ case 488: break;
+ case 149:
+ { return "*)\\|h";
+ }
+ case 489: break;
+ case 94:
+ { return "w(";
+ }
+ case 490: break;
+ case 50:
+ { return "*(/h";
+ }
+ case 491: break;
+ case 120:
+ { return "u/";
+ }
+ case 492: break;
+ case 85:
+ { return "u)/";
+ }
+ case 493: break;
+ case 169:
+ { return "*)=|w";
+ }
+ case 494: break;
+ case 156:
+ { return "w(|";
+ }
+ case 495: break;
+ case 202:
+ { return "*i/";
+ }
+ case 496: break;
+ case 52:
+ { return "*(=h";
+ }
+ case 497: break;
+ case 128:
+ { return "a(/|";
+ }
+ case 498: break;
+ case 157:
+ { return "w)\\|";
+ }
+ case 499: break;
+ case 60:
+ { return "i(=";
+ }
+ case 500: break;
+ case 164:
+ { return "*(|w";
+ }
+ case 501: break;
+ case 150:
+ { return "*(\\|h";
+ }
+ case 502: break;
+ case 220:
+ { return "*w/";
+ }
+ case 503: break;
+ case 186:
+ { return "h=";
+ }
+ case 504: break;
+ case 81:
+ { return "u)";
+ }
+ case 505: break;
+ case 130:
+ { return "a(=|";
+ }
+ case 506: break;
+ case 280:
+ { return "*y";
+ }
+ case 507: break;
+ case 203:
+ { return "u^";
+ }
+ case 508: break;
+ case 104:
+ { return "*(\\w";
+ }
+ case 509: break;
+ case 12:
+ { return "a(\\";
+ }
+ case 510: break;
+ case 105:
+ { return "*)/w";
+ }
+ case 511: break;
+ case 182:
+ { return "*a|";
+ }
+ case 512: break;
+ case 282:
+ { return "*w";
+ }
+ case 513: break;
+ case 199:
+ { return "*i^";
+ }
+ case 514: break;
+ case 100:
+ { return "w(=";
+ }
+ case 515: break;
+ case 90:
+ { return "*(\\u";
+ }
+ case 516: break;
+ case 26:
+ { return "e(";
+ }
+ case 517: break;
+ case 1:
+ { return yytext();
+ }
+ case 518: break;
+ case 142:
+ { return "h(\\|";
+ }
+ case 519: break;
+ case 274:
+ { return "*u";
+ }
+ case 520: break;
+ case 28:
+ { return "e(\\";
+ }
+ case 521: break;
+ case 107:
+ { return "*)=w";
+ }
+ case 522: break;
+ case 173:
+ { return "a\\|";
+ }
+ case 523: break;
+ case 6:
+ { return "*s";
+ }
+ case 524: break;
+ case 45:
+ { return "*)h";
+ }
+ case 525: break;
+ case 251:
+ { return "*q";
+ }
+ case 526: break;
+ case 119:
+ { return "u\\";
+ }
+ case 527: break;
+ case 56:
+ { return "i(\\";
+ }
+ case 528: break;
+ case 213:
+ { return "*u\\";
+ }
+ case 529: break;
+ case 284:
+ { return ">";
+ }
+ case 530: break;
+ case 78:
+ { return "*(\\o";
+ }
+ case 531: break;
+ case 189:
+ { return "*e/";
+ }
+ case 532: break;
+ case 79:
+ { return "*)/o";
+ }
+ case 533: break;
+ case 265:
+ { return "*o";
+ }
+ case 534: break;
+ case 135:
+ { return "*)/|a";
+ }
+ case 535: break;
+ case 84:
+ { return "u(\\";
+ }
+ case 536: break;
+ case 235:
+ { return "|";
+ }
+ case 537: break;
+ case 58:
+ { return "i(/";
+ }
+ case 538: break;
+ case 259:
+ { return "*m";
+ }
+ case 539: break;
+ case 212:
+ { return "*u_";
+ }
+ case 540: break;
+ case 114:
+ { return "h/";
+ }
+ case 541: break;
+ case 246:
+ { return "z";
+ }
+ case 542: break;
+ case 255:
+ { return "*k";
+ }
+ case 543: break;
+ case 277:
+ { return "x";
+ }
+ case 544: break;
+ case 64:
+ { return "*(\\i";
+ }
+ case 545: break;
+ case 65:
+ { return "*)/i";
+ }
+ case 546: break;
+ case 137:
+ { return "*)=|a";
+ }
+ case 547: break;
+ case 253:
+ { return "*i";
+ }
+ case 548: break;
+ case 98:
+ { return "w(/";
+ }
+ case 549: break;
+ case 5:
+ { return "v";
+ }
+ case 550: break;
+ case 124:
+ { return "a(|";
+ }
+ case 551: break;
+ case 234:
+ { return "?";
+ }
+ case 552: break;
+ case 172:
+ { return "a_";
+ }
+ case 553: break;
+ case 217:
+ { return "w|";
+ }
+ case 554: break;
+ case 10:
+ { return "a(";
+ }
+ case 555: break;
+ case 241:
+ { return "*g";
+ }
+ case 556: break;
+ case 155:
+ { return "w)|";
+ }
+ case 557: break;
+ case 37:
+ { return "h)";
+ }
+ case 558: break;
+ case 271:
+ { return "t";
+ }
+ case 559: break;
+ case 231:
+ { return "=";
+ }
+ case 560: break;
+ case 67:
+ { return "*)=i";
+ }
+ case 561: break;
+ case 34:
+ { return "*(\\e";
+ }
+ case 562: break;
+ case 35:
+ { return "*)/e";
+ }
+ case 563: break;
+ case 140:
+ { return "h(|";
+ }
+ case 564: break;
+ case 132:
+ { return "*(|a";
+ }
+ case 565: break;
+ case 245:
+ { return "*e";
+ }
+ case 566: break;
+ case 268:
+ { return "r";
+ }
+ case 567: break;
+ case 59:
+ { return "i)=";
+ }
+ case 568: break;
+ case 70:
+ { return "o(";
+ }
+ case 569: break;
+ case 126:
+ { return "a(\\|";
+ }
+ case 570: break;
+ case 263:
+ { return "*c";
+ }
+ case 571: break;
+ case 197:
+ { return "i=";
+ }
+ case 572: break;
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { // end of input with nothing left to match
+ zzAtEOF = true;
+ return null;
+ }
+ else {
+ zzScanError(ZZ_NO_MATCH); // zzScanError always throws, so control does not continue past this call
+ }
+ }
+ }
+ }
+
+
+}
diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2Buckwalter.lex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2Buckwalter.lex Tue Feb 08 14:54:09 2011 +0100
@@ -0,0 +1,121 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.general;
+
+%%
+%{
+ /*
+ * Unicode to Buckwalter conversion
+ */
+
+%}
+
+%class Unicode2BuckwalterLex
+%public
+%type java.lang.String
+%unicode
+%%
+
+
+"<"[^>]+">" { return yytext(); }
+
+"\u0621" { return "'"; } /* Hamza */
+"\u0622" { return "|"; } /* ALEF WITH MADDA ABOVE from AraMorph */
+"\u0623" { return ">"; } /* Alif + HamzaAbove */
+"\u0624" { return "&"; } /* Waw + HamzaAbove */
+"\u0625" { return "<"; } /* Alif + HamzaBelow */
+"\u0626" { return "}"; } /* Ya + HamzaAbove */
+"\u0627" { return "A"; } /* Alif */
+"\u0628" { return "b"; } /* Ba */
+"\u0629" { return "p"; } /* TaMarbuta */
+"\u062A" { return "t"; } /* Ta */
+"\u062B" { return "v"; } /* Tha */
+"\u062C" { return "j"; } /* Jeem */
+"\u062D" { return "H"; } /* HHa */
+"\u062E" { return "x"; } /* Kha */
+"\u062F" { return "d"; } /* Dal */
+"\u0630" { return "*"; } /* Thal */
+"\u0631" { return "r"; } /* Ra */
+"\u0632" { return "z"; } /* Zain */
+"\u0633" { return "s"; } /* Seen */
+"\u0634" { return "$"; } /* Sheen */
+"\u0635" { return "S"; } /* Sad */
+"\u0636" { return "D"; } /* DDad */
+"\u0637" { return "T"; } /* TTa */
+"\u0638" { return "Z"; } /* DTha */
+"\u0639" { return "E"; } /* Ain */
+"\u063A" { return "g"; } /* Ghain */
+
+"\u0640" { return "_"; } /* Tatweel */
+"\u0641" { return "f"; } /* Fa */
+"\u0642" { return "q"; } /* Qaf */
+"\u0643" { return "k"; } /* Kaf */
+"\u0644" { return "l"; } /* Lam */
+"\u0645" { return "m"; } /* Meem */
+"\u0646" { return "n"; } /* Noon */
+"\u0647" { return "h"; } /* Ha */
+"\u0648" { return "w"; } /* Waw */
+"\u0649" { return "Y"; } /* AlifMaksura */
+"\u064A" { return "y"; } /* Ya */
+"\u064B" { return "F"; } /* Fathatan */
+"\u064C" { return "N"; } /* Dammatan */
+"\u064D" { return "K"; } /* Kasratan */
+"\u064E" { return "a"; } /* Fatha */
+"\u064F" { return "u"; } /* Damma */
+"\u0650" { return "i"; } /* Kasra */
+"\u0651" { return "~"; } /* Shadda */
+"\u0652" { return "o"; } /* Sukun */
+"\u0653" { return "^"; } /* Maddah */
+"\u0654" { return "#"; } /* HamzaAbove */
+
+"\u0670" { return "`"; } /* AlifKhanjareeya */
+"\u0671" { return "{"; } /* Alif + HamzatWasl */
+
+"\u067E" { return "P"; } /* PEH from AraMorph */
+"\u0686" { return "J"; } /* TCHEH from AraMorph */
+"\u06A4" { return "V"; } /* VEH from AraMorph */
+"\u06AF" { return "G"; } /* GAF from AraMorph */
+"\u0698" { return "R"; } /* JEH from AraMorph */
+"\u061F" { return "?"; } /* QUESTION MARK from AraMorph */
+
+"\u06DC" { return ":"; } /* SmallHighSeen */
+"\u06DF" { return "@"; } /* SmallHighRoundedZero */
+
+"\u06E2" { return "["; } /* SmallHighMeemIsolatedForm */
+"\u06E3" { return ";"; } /* SmallLowSeen */
+"\u06E5" { return ","; } /* SmallWaw */
+"\u06E6" { return "."; } /* SmallYa */
+"\u06E8" { return "!"; } /* SmallHighNoon */
+"\u06EA" { return "-"; } /* EmptyCentreLowStop */
+"\u06EB" { return "+"; } /* EmptyCentreHighStop */
+"\u06EC" { return "%"; } /* RoundedHighStopWithFilledCentre */
+"\u06ED" { return "]"; } /* SmallLowMeem */
+
+[\&_]"vert;" { return "|"; }
+[\&_]"lpar;" { return "("; }
+[\&_]"rpar;" { return ")"; }
+[\_\&]"lt;" { return "<"; }
+[\_\&]"gt;" { return ">"; }
+"'" { return "'"; }
+
+"&"[a-zA-Z]+";" { return yytext(); }
+
+. { return yytext(); }
+\n { return yytext(); }
+
+/* causes problems (disabled) */
+/* "\u06E0" { return "\\""; } SmallHighUprightRectangularZero */
+
+
+/* double entries */
+/* "\u060C" { return ","; } COMMA from AraMorph */
+/* "\u061B" { return ";"; } SEMICOLON from AraMorph */
+
+/* not contained in Buckwalter */
+/* \u0679 : ARABIC LETTER TTEH */
+/* \u0688 : ARABIC LETTER DDAL */
+/* \u06A9 : ARABIC LETTER KEHEH */
+/* \u0691 : ARABIC LETTER RREH */
+/* \u06BA : ARABIC LETTER NOON GHUNNA */
+/* \u06BE : ARABIC LETTER HEH DOACHASHMEE */
+/* \u06C1 : ARABIC LETTER HEH GOAL */
+/* \u06D2 : ARABIC LETTER YEH BARREE */
+
diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2BuckwalterLex.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2BuckwalterLex.java Tue Feb 08 14:54:09 2011 +0100
@@ -0,0 +1,882 @@
+/* The following code was generated by JFlex 1.4.3 on 14.12.10 17:12 */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.general;
+
+
+/**
+ * This class is a scanner generated by
+ * JFlex 1.4.3
+ * on 14.12.10 17:12 from the specification file
+ * /Users/jwillenborg/test/jflex/Unicode2Buckwalter.lex
+ */
+public class Unicode2BuckwalterLex {
+
+ /** This character denotes the end of file */
+ public static final int YYEOF = -1;
+
+ /** initial size of the lookahead buffer */
+ private static final int ZZ_BUFFERSIZE = 16384;
+
+ /** lexical states */
+ public static final int YYINITIAL = 0;
+
+ /**
+ * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+ * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+ * at the beginning of a line
+ * l is of the form l = 2*k, k a non negative integer
+ */
+ private static final int ZZ_LEXSTATE[] = {
+ 0, 0
+ };
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final String ZZ_CMAP_PACKED =
+ "\12\0\1\0\30\0\1\120\2\0\1\117\11\0\1\121\2\0\1\122"+
+ "\5\0\1\123\1\0\1\112\1\1\1\0\1\2\2\0\32\124\4\0"+
+ "\1\105\1\0\1\115\3\124\1\107\1\124\1\116\4\124\1\113\3\124"+
+ "\1\114\1\124\1\110\1\124\1\111\1\124\1\106\4\124\u05a4\0\1\71"+
+ "\1\0\1\3\1\4\1\5\1\6\1\7\1\10\1\11\1\12\1\13"+
+ "\1\14\1\15\1\16\1\17\1\20\1\21\1\22\1\23\1\24\1\25"+
+ "\1\26\1\27\1\30\1\31\1\32\1\33\1\34\5\0\1\35\1\36"+
+ "\1\37\1\40\1\41\1\42\1\43\1\44\1\45\1\46\1\47\1\50"+
+ "\1\51\1\52\1\53\1\54\1\55\1\56\1\57\1\60\1\61\33\0"+
+ "\1\62\1\63\14\0\1\64\7\0\1\65\21\0\1\70\13\0\1\66"+
+ "\12\0\1\67\54\0\1\72\2\0\1\73\2\0\1\74\1\75\1\0"+
+ "\1\76\1\77\1\0\1\100\1\0\1\101\1\102\1\103\1\104\uf912\0";
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /**
+ * Translates DFA states to action switch labels.
+ */
+ private static final int [] ZZ_ACTION = zzUnpackAction();
+
+ private static final String ZZ_ACTION_PACKED_0 =
+ "\1\0\2\1\1\2\1\3\1\4\1\5\1\6\1\7"+
+ "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+
+ "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+
+ "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37"+
+ "\1\40\1\41\1\42\1\43\1\44\1\45\1\46\1\47"+
+ "\1\50\1\51\1\52\1\53\1\54\1\55\1\56\1\57"+
+ "\1\60\1\61\1\62\1\63\1\64\1\65\1\66\1\67"+
+ "\1\70\1\71\1\72\1\73\1\74\1\75\1\76\1\77"+
+ "\1\100\1\101\1\102\1\103\2\1\30\0\1\104\1\0"+
+ "\1\105\13\0\1\106\1\107";
+
+ private static int [] zzUnpackAction() { /* builds the ZZ_ACTION table from ZZ_ACTION_PACKED_0 */
+ int [] result = new int[111]; /* indexed by DFA state, see ZZ_ACTION[zzAction] in yylex() */
+ int offset = 0;
+ offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAction(String packed, int offset, int [] result) { /* run-length decoder; returns the next free index in result */
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++); /* first char of each pair: repeat count */
+ int value = packed.charAt(i++); /* second char of each pair: value to store */
+ do result[j++] = value; while (--count > 0); /* do-while: the value is written at least once */
+ }
+ return j;
+ }
+
+
+ /**
+ * Translates a state to a row index in the transition table
+ */
+ private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+ private static final String ZZ_ROWMAP_PACKED_0 =
+ "\0\0\0\125\0\252\0\125\0\125\0\125\0\125\0\125"+
+ "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+
+ "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+
+ "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+
+ "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+
+ "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+
+ "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+
+ "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+
+ "\0\125\0\125\0\125\0\125\0\125\0\377\0\u0154\0\u01a9"+
+ "\0\u01fe\0\u0253\0\u02a8\0\u02fd\0\u0352\0\u03a7\0\u03fc\0\u0451"+
+ "\0\u04a6\0\u04fb\0\u0550\0\u05a5\0\u05fa\0\u064f\0\u06a4\0\u06f9"+
+ "\0\u074e\0\u07a3\0\u07f8\0\u084d\0\u08a2\0\u08f7\0\u094c\0\125"+
+ "\0\u09a1\0\125\0\u09f6\0\u0a4b\0\u0aa0\0\u0af5\0\u0b4a\0\u0b9f"+
+ "\0\u0bf4\0\u0c49\0\u0c9e\0\u0cf3\0\u0d48\0\125\0\125";
+
+ private static int [] zzUnpackRowMap() { /* builds the ZZ_ROWMAP table from ZZ_ROWMAP_PACKED_0 */
+ int [] result = new int[111]; /* indexed by DFA state, see zzRowMapL[zzState] in yylex() */
+ int offset = 0;
+ offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackRowMap(String packed, int offset, int [] result) { /* decodes two chars per entry; returns the next free index in result */
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int high = packed.charAt(i++) << 16; /* first char: upper 16 bits of the entry */
+ result[j++] = high | packed.charAt(i++); /* second char: lower 16 bits of the entry */
+ }
+ return j;
+ }
+
+ /**
+ * The transition table of the DFA
+ */
+ private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+ private static final String ZZ_TRANS_PACKED_0 =
+ "\1\2\1\3\1\2\1\4\1\5\1\6\1\7\1\10"+
+ "\1\11\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+
+ "\1\21\1\22\1\23\1\24\1\25\1\26\1\27\1\30"+
+ "\1\31\1\32\1\33\1\34\1\35\1\36\1\37\1\40"+
+ "\1\41\1\42\1\43\1\44\1\45\1\46\1\47\1\50"+
+ "\1\51\1\52\1\53\1\54\1\55\1\56\1\57\1\60"+
+ "\1\61\1\62\1\63\1\64\1\65\1\66\1\67\1\70"+
+ "\1\71\1\72\1\73\1\74\1\75\1\76\1\77\1\100"+
+ "\1\101\1\102\1\103\1\104\1\105\1\106\11\2\1\107"+
+ "\5\2\125\0\2\110\1\0\122\110\106\0\1\111\1\0"+
+ "\1\112\2\0\1\113\2\0\1\114\114\0\1\115\1\116"+
+ "\1\117\1\116\1\0\1\120\2\116\1\121\1\0\1\122"+
+ "\3\0\1\116\2\110\1\2\122\110\107\0\1\123\131\0"+
+ "\1\124\121\0\1\125\2\0\1\126\121\0\1\127\121\0"+
+ "\1\116\1\130\2\116\1\2\4\116\5\0\1\116\106\0"+
+ "\4\116\1\2\4\116\5\0\1\116\106\0\4\116\1\2"+
+ "\1\116\1\131\2\116\5\0\1\116\106\0\3\116\1\132"+
+ "\1\2\1\116\1\133\2\116\5\0\1\116\106\0\3\116"+
+ "\1\134\1\2\4\116\5\0\1\116\121\0\1\135\113\0"+
+ "\1\136\131\0\1\137\121\0\1\140\127\0\1\141\121\0"+
+ "\1\142\120\0\2\116\1\143\1\116\1\2\4\116\5\0"+
+ "\1\116\106\0\4\116\1\2\2\116\1\144\1\116\5\0"+
+ "\1\116\106\0\4\116\1\140\4\116\5\0\1\116\106\0"+
+ "\4\116\1\2\2\116\1\145\1\116\5\0\1\116\106\0"+
+ "\4\116\1\142\4\116\5\0\1\116\122\0\1\146\113\0"+
+ "\1\147\123\0\1\150\124\0\1\151\122\0\3\116\1\152"+
+ "\1\2\4\116\5\0\1\116\106\0\2\116\1\153\1\116"+
+ "\1\2\4\116\5\0\1\116\106\0\2\116\1\154\1\116"+
+ "\1\2\4\116\5\0\1\116\123\0\1\155\113\0\1\5"+
+ "\124\0\1\156\124\0\1\157\120\0\4\116\1\5\4\116"+
+ "\5\0\1\116\106\0\4\116\1\156\4\116\5\0\1\116"+
+ "\106\0\4\116\1\157\4\116\5\0\1\116\112\0\1\4"+
+ "\12\0";
+
+ private static int [] zzUnpackTrans() { /* builds the ZZ_TRANS table from ZZ_TRANS_PACKED_0 */
+ int [] result = new int[3485]; /* addressed as ZZ_ROWMAP[state] + character class in yylex() */
+ int offset = 0;
+ offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackTrans(String packed, int offset, int [] result) { /* run-length decoder; returns the next free index in result */
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++); /* first char of each pair: repeat count */
+ int value = packed.charAt(i++); /* second char of each pair: target state plus one */
+ value--; /* packed 0 becomes -1, which yylex() treats as "no transition" */
+ do result[j++] = value; while (--count > 0); /* do-while: the value is written at least once */
+ }
+ return j;
+ }
+
+
+ /* error codes */
+ private static final int ZZ_UNKNOWN_ERROR = 0;
+ private static final int ZZ_NO_MATCH = 1;
+ private static final int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+ private static final String ZZ_ERROR_MSG[] = {
+ "Unkown internal scanner error",
+ "Error: could not match input",
+ "Error: pushback value was too large"
+ };
+
+ /**
+ * ZZ_ATTRIBUTE[aState] contains the attributes of state aState
+ */
+ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private static final String ZZ_ATTRIBUTE_PACKED_0 =
+ "\1\0\1\11\1\1\102\11\2\1\30\0\1\11\1\0"+
+ "\1\11\13\0\2\11";
+
+ private static int [] zzUnpackAttribute() { /* builds the ZZ_ATTRIBUTE table from ZZ_ATTRIBUTE_PACKED_0 */
+ int [] result = new int[111]; /* indexed by DFA state, see zzAttrL[zzState] in yylex() */
+ int offset = 0;
+ offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAttribute(String packed, int offset, int [] result) { /* run-length decoder; returns the next free index in result */
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++); /* first char of each pair: repeat count */
+ int value = packed.charAt(i++); /* second char of each pair: attribute bits (yylex() tests bits 1 and 8) */
+ do result[j++] = value; while (--count > 0); /* do-while: the value is written at least once */
+ }
+ return j;
+ }
+
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the textposition at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /** denotes if the user-EOF-code has already been executed */
+ private boolean zzEOFDone;
+
+ /* user code: */
+ /*
+ * Unicode to Buckwalter transliteration
+ */
+
+
+
+ /**
+ * Creates a new scanner that reads its input from the given reader.
+ * There is also a java.io.InputStream version of this constructor.
+ *
+ * @param in the java.io.Reader to read input from.
+ */
+ public Unicode2BuckwalterLex(java.io.Reader in) {
+ this.zzReader = in;
+ }
+
+ /**
+ * Creates a new scanner that reads from a byte stream, decoded with the platform default charset.
+ * Use the java.io.Reader version of this constructor when the encoding must be chosen explicitly.
+ *
+ * @param in the java.io.InputStream to read input from.
+ */
+ public Unicode2BuckwalterLex(java.io.InputStream in) {
+ this(new java.io.InputStreamReader(in)); // no charset argument: platform default charset is used
+ }
+
+ /**
+ * Unpacks the compressed character translation table.
+ *
+ * @param packed the packed character translation table, as (count, value) char pairs
+ * @return the unpacked character translation table with one entry per 16-bit char value
+ */
+ private static char [] zzUnpackCMap(String packed) {
+ char [] map = new char[0x10000]; /* one slot for every possible char */
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 240) { /* generator-emitted constant; presumably the length of ZZ_CMAP_PACKED - TODO confirm */
+ int count = packed.charAt(i++); /* first char of each pair: repeat count */
+ char value = packed.charAt(i++); /* second char of each pair: character class */
+ do map[j++] = value; while (--count > 0); /* do-while: the value is written at least once */
+ }
+ return map;
+ }
+
+
+ /**
+ * Refills the input buffer.
+ *
+ * @return <code>false</code>, iff there was new input; <code>true</code> when no more input is available.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ private boolean zzRefill() throws java.io.IOException {
+
+ /* first: make room (if you can) by moving the unconsumed text to the start of the buffer */
+ if (zzStartRead > 0) {
+ System.arraycopy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead-zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead-= zzStartRead;
+ zzCurrentPos-= zzStartRead;
+ zzMarkedPos-= zzStartRead;
+ zzStartRead = 0;
+ }
+
+ /* is the buffer big enough? */
+ if (zzCurrentPos >= zzBuffer.length) {
+ /* if not: allocate a buffer of twice the current position and copy the old content */
+ char newBuffer[] = new char[zzCurrentPos*2];
+ System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+ zzBuffer = newBuffer;
+ }
+
+ /* finally: fill the buffer with new input */
+ int numRead = zzReader.read(zzBuffer, zzEndRead,
+ zzBuffer.length-zzEndRead);
+
+ if (numRead > 0) {
+ zzEndRead+= numRead;
+ return false;
+ }
+ // unlikely but not impossible: read 0 characters, but not at end of stream
+ if (numRead == 0) {
+ int c = zzReader.read(); // single-character read to find out whether the stream has ended
+ if (c == -1) {
+ return true;
+ } else {
+ zzBuffer[zzEndRead++] = (char) c;
+ return false;
+ }
+ }
+
+ // numRead < 0: the reader reported end of stream
+ return true;
+ }
+
+
+ /**
+ * Marks the scanner as being at end of file and closes the underlying reader, if there is one.
+ */
+ public final void yyclose() throws java.io.IOException {
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader != null)
+ zzReader.close();
+ }
+
+
+ /**
+ * Resets the scanner to read from a new reader.
+ * Does not close the old reader.
+ *
+ * All position and EOF state is reset, so text still buffered from
+ * the old reader is no longer reachable (the buffer array itself is kept).
+ * Lexical state is set to YYINITIAL.
+ *
+ * @param reader the new reader to scan from
+ */
+ public final void yyreset(java.io.Reader reader) {
+ zzReader = reader;
+ zzAtBOL = true;
+ zzAtEOF = false;
+ zzEOFDone = false;
+ zzEndRead = zzStartRead = 0;
+ zzCurrentPos = zzMarkedPos = 0;
+ yyline = yychar = yycolumn = 0;
+ zzLexicalState = YYINITIAL;
+ }
+
+
+ /**
+ * Returns the current lexical state (YYINITIAL is the only state this scanner declares).
+ */
+ public final int yystate() {
+ return zzLexicalState;
+ }
+
+
+ /**
+ * Enters a new lexical state; the value is stored as given and is not validated.
+ *
+ * @param newState the new lexical state
+ */
+ public final void yybegin(int newState) {
+ zzLexicalState = newState;
+ }
+
+
+ /**
+ * Returns the text matched by the current regular expression as a newly created String.
+ */
+ public final String yytext() {
+ return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+ }
+
+
+ /**
+ * Returns the character at position pos from the
+ * matched text.
+ *
+ * It is equivalent to yytext().charAt(pos), but does not create a String.
+ *
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1; this range is not checked here.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) {
+ return zzBuffer[zzStartRead+pos];
+ }
+
+
+ /**
+ * Returns the length of the matched text region, i.e. the length of yytext().
+ */
+ public final int yylength() {
+ return zzMarkedPos-zzStartRead;
+ }
+
+
+ /**
+ * Reports an error that occurred while scanning by throwing a java.lang.Error.
+ *
+ * In a well-formed scanner (no or only correct usage of
+ * yypushback(int) and a match-all fallback rule) this method
+ * will only be called with things that "Can't Possibly Happen".
+ * If this method is called, something is seriously wrong
+ * (e.g. a JFlex bug producing a faulty scanner etc.).
+ *
+ * Usual syntax/scanner level error handling should be done
+ * in error fallback rules.
+ *
+ * @param errorCode the code of the error message to report (index into ZZ_ERROR_MSG)
+ */
+ private void zzScanError(int errorCode) {
+ String message;
+ try {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (ArrayIndexOutOfBoundsException e) { // unknown code: fall back to the generic message
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+ }
+
+ throw new Error(message);
+ }
+
+
+ /**
+ * Pushes the specified amount of characters back into the input stream.
+ *
+ * They will be read again by the next call of the scanning method.
+ *
+ * @param number the number of characters to be read again.
+ * This number must not be greater than yylength(), otherwise an Error is thrown.
+ */
+ public void yypushback(int number) {
+ if ( number > yylength() )
+ zzScanError(ZZ_PUSHBACK_2BIG); // always throws, so the position below stays untouched
+
+ zzMarkedPos -= number;
+ }
+
+
+ /**
+ * Resumes scanning until the next regular expression is matched,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * @return the string returned by the matched rule's action, or null at end of input
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ public java.lang.String yylex() throws java.io.IOException {
+ int zzInput;
+ int zzAction;
+
+ // cached fields (local copies of instance and static state used in the inner loop):
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+ int [] zzTransL = ZZ_TRANS;
+ int [] zzRowMapL = ZZ_ROWMAP;
+ int [] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true) {
+ zzMarkedPosL = zzMarkedPos;
+
+ zzAction = -1; // no matching state recorded yet for this token
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; // next token starts where the last match ended
+
+ zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+ zzForAction: {
+ while (true) {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ boolean eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+ if (zzNext == -1) break zzForAction; // no transition for this character class
+ zzState = zzNext;
+
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) { // bit 1 set: remember this state and position as the latest match
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ( (zzAttributes & 8) == 8 ) break zzForAction; // bit 8 set: stop scanning at this state
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+ case 23:
+ { return "D";
+ }
+ case 72: break;
+ case 17:
+ { return "*";
+ }
+ case 73: break;
+ case 46:
+ { return "o";
+ }
+ case 74: break;
+ case 60:
+ { return ";";
+ }
+ case 75: break;
+ case 63:
+ { return "!";
+ }
+ case 76: break;
+ case 29:
+ { return "f";
+ }
+ case 77: break;
+ case 36:
+ { return "w";
+ }
+ case 78: break;
+ case 67:
+ { return "]";
+ }
+ case 79: break;
+ case 70:
+ { return ")";
+ }
+ case 80: break;
+ case 69:
+ { return ">";
+ }
+ case 81: break;
+ case 34:
+ { return "n";
+ }
+ case 82: break;
+ case 24:
+ { return "T";
+ }
+ case 83: break;
+ case 57:
+ { return ":";
+ }
+ case 84: break;
+ case 41:
+ { return "K";
+ }
+ case 85: break;
+ case 12:
+ { return "v";
+ }
+ case 86: break;
+ case 71:
+ { return "(";
+ }
+ case 87: break;
+ case 33:
+ { return "m";
+ }
+ case 88: break;
+ case 22:
+ { return "S";
+ }
+ case 89: break;
+ case 45:
+ { return "~";
+ }
+ case 90: break;
+ case 16:
+ { return "d";
+ }
+ case 91: break;
+ case 52:
+ { return "J";
+ }
+ case 92: break;
+ case 43:
+ { return "u";
+ }
+ case 93: break;
+ case 59:
+ { return "[";
+ }
+ case 94: break;
+ case 8:
+ { return "A";
+ }
+ case 95: break;
+ case 2:
+ { return "'";
+ }
+ case 96: break;
+ case 32:
+ { return "l";
+ }
+ case 97: break;
+ case 55:
+ { return "R";
+ }
+ case 98: break;
+ case 7:
+ { return "}";
+ }
+ case 99: break;
+ case 11:
+ { return "t";
+ }
+ case 100: break;
+ case 25:
+ { return "Z";
+ }
+ case 101: break;
+ case 58:
+ { return "@";
+ }
+ case 102: break;
+ case 5:
+ { return "&";
+ }
+ case 103: break;
+ case 31:
+ { return "k";
+ }
+ case 104: break;
+ case 3:
+ { return "|";
+ }
+ case 105: break;
+ case 9:
+ { return "b";
+ }
+ case 106: break;
+ case 14:
+ { return "H";
+ }
+ case 107: break;
+ case 62:
+ { return ".";
+ }
+ case 108: break;
+ case 20:
+ { return "s";
+ }
+ case 109: break;
+ case 37:
+ { return "Y";
+ }
+ case 110: break;
+ case 56:
+ { return "?";
+ }
+ case 111: break;
+ case 66:
+ { return "%";
+ }
+ case 112: break;
+ case 13:
+ { return "j";
+ }
+ case 113: break;
+ case 51:
+ { return "P";
+ }
+ case 114: break;
+ case 50:
+ { return "{";
+ }
+ case 115: break;
+ case 1:
+ { return yytext();
+ }
+ case 116: break;
+ case 42:
+ { return "a";
+ }
+ case 117: break;
+ case 54:
+ { return "G";
+ }
+ case 118: break;
+ case 64:
+ { return "-";
+ }
+ case 119: break;
+ case 18:
+ { return "r";
+ }
+ case 120: break;
+ case 4:
+ { return ">";
+ }
+ case 121: break;
+ case 21:
+ { return "$";
+ }
+ case 122: break;
+ case 44:
+ { return "i";
+ }
+ case 123: break;
+ case 19:
+ { return "z";
+ }
+ case 124: break;
+ case 68:
+ { return "<";
+ }
+ case 125: break;
+ case 49:
+ { return "`";
+ }
+ case 126: break;
+ case 39:
+ { return "F";
+ }
+ case 127: break;
+ case 61:
+ { return ",";
+ }
+ case 128: break;
+ case 30:
+ { return "q";
+ }
+ case 129: break;
+ case 48:
+ { return "#";
+ }
+ case 130: break;
+ case 35:
+ { return "h";
+ }
+ case 131: break;
+ case 40:
+ { return "N";
+ }
+ case 132: break;
+ case 38:
+ { return "y";
+ }
+ case 133: break;
+ case 28:
+ { return "_";
+ }
+ case 134: break;
+ case 26:
+ { return "E";
+ }
+ case 135: break;
+ case 65:
+ { return "+";
+ }
+ case 136: break;
+ case 10:
+ { return "p";
+ }
+ case 137: break;
+ case 53:
+ { return "V";
+ }
+ case 138: break;
+ case 6:
+ { return "<";
+ }
+ case 139: break;
+ case 27:
+ { return "g";
+ }
+ case 140: break;
+ case 15:
+ { return "x";
+ }
+ case 141: break;
+ case 47:
+ { return "^";
+ }
+ case 142: break;
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { // end of input with nothing left to match
+ zzAtEOF = true;
+ return null;
+ }
+ else {
+ zzScanError(ZZ_NO_MATCH); // always throws an Error
+ }
+ }
+ }
+ }
+
+
+}
diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexHandler.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexHandler.java Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexHandler.java Tue Feb 08 14:54:09 2011 +0100
@@ -59,6 +59,8 @@
if (! hasLexEntry) {
hasLexEntry = hasLexEntryKey(lName, language);
}
+ if (language.equals("de") || language.equals("fr") || language.equals("nl")) // TODO Lexika für diese Sprachen in BerkeleyDB einbringen (für frund nl auch eine bessere Morph.) und dann diese Zeilen wieder löschen
+ lexEntryKeys.add(lName);
if (! lName.equals(formName) && hasLexEntry) {
lexEntryKeys.add(lName);
}
@@ -72,6 +74,8 @@
public boolean hasLexEntryKey(String formName, String language) throws ApplicationException {
boolean hasLexEntry = false;
+ if (language.equals("zh")) // jedes chin. einzelne Zeichen hat autom. immer einen Lexikoneintrag
+ return true;
ArrayListNode
to
+ * the supplied Writer
.
Node
to serialize.
+ * @param writer Writer
to write to.
+ * @param indentLevel current indentation.
+ */
+ private void serializeNode(Node node, Writer writer, String indentLevel) throws ApplicationException {
+ try {
+ // Determine action based on node type
+ switch (node.getNodeType()) {
+ case Node.DOCUMENT_NODE:
+ writer.write("");
+ writer.write("\n");
+ // recurse on each child
+ NodeList nodes = node.getChildNodes();
+ if (nodes != null) {
+ for (int i=0; i