# HG changeset patch # User Josef Willenborg # Date 1297173249 -3600 # Node ID 2396a569e446d14e378c1a6540df70d45b87e262 # Parent 94305c504178099f60850ba986ae19201b73b186 new functions: externalObjects, normalizer, Unicode2Betacode diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/client/DocumentHandler.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/client/DocumentHandler.java Tue Feb 08 14:36:38 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/client/DocumentHandler.java Tue Feb 08 14:54:09 2011 +0100 @@ -1,6 +1,5 @@ package de.mpg.mpiwg.berlin.mpdl.client; - import java.io.File; import java.io.FilenameFilter; import java.net.MalformedURLException; @@ -368,4 +367,4 @@ endOfOperation = new Date().getTime(); } -} +} \ No newline at end of file diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/TestESciDoc.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/TestESciDoc.java Tue Feb 08 14:36:38 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/TestESciDoc.java Tue Feb 08 14:54:09 2011 +0100 @@ -61,11 +61,11 @@ TestESciDoc test = new TestESciDoc(); test.init("jwillenborg"); // init eSciDoc-Session with cookie as user jwillenborg - // test.grant("aeisemann", "admin"); - String uid = test.getUserId("aeisemann"); + // test.grant("urte", "admin"); + String uid = test.getUserId("urte"); String users = test.getAllUsers(); - String grantAdmin = test.getGrantHrefByUserNameAndRoleName("aeisemann", "escidoc:role-system-administrator"); - String grants = test.getGrantsByUserName("aeisemann"); + String grantAdmin = test.getGrantHrefByUserNameAndRoleName("urte", "escidoc:role-system-administrator"); + String grants = test.getGrantsByUserName("urte"); String bla = ""; // test.testSchemaValidation(); diff -r 94305c504178 -r 2396a569e446 
software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtElement.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtElement.java Tue Feb 08 14:54:09 2011 +0100 @@ -0,0 +1,121 @@ +package de.mpg.mpiwg.berlin.mpdl.externalObjects.app; + +import java.util.Date; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil; + +public class ExtElement extends ExtObject { + private String pageNumber; + private String xmlNodeId; + private String before; + private String charPos; + private String xpath; + + public static ExtElement parseXmlStr(String xmlStr) throws ApplicationException { + XmlUtil xmlUtil = XmlUtil.getInstance(); + String uid = xmlUtil.evaluateToString(xmlStr, "/object/@uid", null); + String dateStr = xmlUtil.evaluateToString(xmlStr, "/object/@modificationDate", null); + String docId = xmlUtil.evaluateToString(xmlStr, "/object/@documentId", null); + String pageNumber = xmlUtil.evaluateToString(xmlStr, "/object/@pageNumber", null); + String xmlNodeId = xmlUtil.evaluateToString(xmlStr, "/object/@xmlNodeId", null); + String before = xmlUtil.evaluateToString(xmlStr, "/object/@before", null); + String charPos = xmlUtil.evaluateToString(xmlStr, "/object/@charPos", null); + String xpath = xmlUtil.evaluateToString(xmlStr, "/object/@xpath", null); + String content = xmlUtil.evaluateToXmlString(xmlStr, "/object/content/*", null); + Date modDate = xmlUtil.toDate(dateStr); + if (uid == null || docId == null || pageNumber == null) + throw new ApplicationException("one of the required fields could not be read in: " + xmlStr); + ExtElement e = new ExtElement(); + e.setUid(uid); + e.setModificationDate(modDate); + e.setDocumentId(docId); + e.setPageNumber(pageNumber); + e.setXmlNodeId(xmlNodeId); + e.setXpath(xpath); + e.setBefore(before); + e.setCharPos(charPos); + e.setContent(content); + return e; + } + + public 
String toString() { + return getXmlString(); + } + + public String getXmlString() { + String xmlString = ""; + if (content != null) { + // write the uid and modificationDate into the content node + if (! content.contains("uid")) { + int firstClose = content.indexOf(">"); + if (firstClose != -1) + content = content.substring(0, firstClose) + " uid=\"" + uid + "\" modificationDate=\"" + modificationDate + "\" " + content.substring(firstClose); + } + xmlString = xmlString + "" + content + ""; + } + xmlString = xmlString + ""; + return xmlString; + } + + public String getXpath() { + return xpath; + } + + public void setXpath(String xpath) { + this.xpath = xpath; + } + + public String getXmlNodeId() { + return xmlNodeId; + } + + public void setXmlNodeId(String xmlNodeId) { + this.xmlNodeId = xmlNodeId; + } + + public String getCharPos() { + return charPos; + } + + public void setCharPos(String charPos) { + this.charPos = charPos; + } + + public String getPageNumber() { + return pageNumber; + } + + public void setPageNumber(String pageNumber) { + this.pageNumber = pageNumber; + } + + public String getBefore() { + return before; + } + + public void setBefore(String before) { + this.before = before; + } + +} diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtObject.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtObject.java Tue Feb 08 14:54:09 2011 +0100 @@ -0,0 +1,43 @@ +package de.mpg.mpiwg.berlin.mpdl.externalObjects.app; + +import java.util.Date; + +public class ExtObject { + protected String uid; + protected Date modificationDate; + protected String documentId; + protected String content; + + public String getUid() { + return uid; + } + + public void setUid(String uid) { + this.uid = uid; + } + + public Date getModificationDate() { + return modificationDate; + } + + public void setModificationDate(Date 
modificationDate) { + this.modificationDate = modificationDate; + } + + public String getDocumentId() { + return documentId; + } + + public void setDocumentId(String documentId) { + this.documentId = documentId; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } + +} diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtQuery.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtQuery.java Tue Feb 08 14:54:09 2011 +0100 @@ -0,0 +1,83 @@ +package de.mpg.mpiwg.berlin.mpdl.externalObjects.app; + +import java.util.Date; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil; + +public class ExtQuery extends ExtObject { + private String queryType; // url, fulltext or fulltextMorph + private String queryName; // optional: name of the query + + public static ExtQuery parseXmlStr(String xmlStr) throws ApplicationException { + XmlUtil xmlUtil = XmlUtil.getInstance(); + String uid = xmlUtil.evaluateToString(xmlStr, "/object/@uid", null); + String dateStr = xmlUtil.evaluateToString(xmlStr, "/object/@modificationDate", null); + String docId = xmlUtil.evaluateToString(xmlStr, "/object/@documentId", null); + String queryType = xmlUtil.evaluateToString(xmlStr, "/object/@queryType", null); + String queryName = xmlUtil.evaluateToString(xmlStr, "/object/@queryName", null); + String content = xmlUtil.evaluateToXmlString(xmlStr, "/object/content/*", null); + Date modDate = xmlUtil.toDate(dateStr); + if (uid == null || docId == null || queryType == null || content == null) + throw new ApplicationException("one of the required fields could not be read in: " + xmlStr); + ExtQuery e = new ExtQuery(); + e.setUid(uid); + e.setModificationDate(modDate); + e.setDocumentId(docId); + e.setQueryType(queryType); + 
e.setQueryName(queryName); + e.setContent(content); + return e; + } + + public String toString() { + return getXmlString(); + } + + public String getXmlString() { + String xmlString = ""; + if (content != null) { + // write the uid and modificationDate into the content node + if (! content.contains("uid")) { + int firstClose = content.indexOf(">"); + if (firstClose != -1) + content = content.substring(0, firstClose) + " uid=\"" + uid + "\" modificationDate=\"" + modificationDate + "\" " + content.substring(firstClose); + } + xmlString = xmlString + "" + content + ""; + } + xmlString = xmlString + ""; + return xmlString; + } + + public String getQueryType() { + return queryType; + } + + public void setQueryType(String queryType) { + this.queryType = queryType; + } + + public String getQueryName() { + return queryName; + } + + public void setQueryName(String queryName) { + this.queryName = queryName; + } + +} diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExternalObjectsHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExternalObjectsHandler.java Tue Feb 08 14:54:09 2011 +0100 @@ -0,0 +1,184 @@ +package de.mpg.mpiwg.berlin.mpdl.externalObjects.app; + +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Date; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; + +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.externalObjects.db.DbEnvExternalObjects; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; + +public class ExternalObjectsHandler { + private static ExternalObjectsHandler instance; + private 
static String MPDL_DATA_DIR = MpdlConstants.MPDL_EXIST_DATA_DIR; + private static String DB_DIR_EXTERNAL_OBJECTS = MPDL_DATA_DIR + "/dataBerkeleyDB/externalObjects"; + private DbEnvExternalObjects dbEnvExternalObjects; + private Date beginOfOperation; + private Date endOfOperation; + + public static ExternalObjectsHandler getInstance() throws ApplicationException { + if (instance == null) { + instance = new ExternalObjectsHandler(); + instance.init(); + } + return instance; + } + + public ArrayList readExternalElements(String documentId, String pageNumber) throws ApplicationException { + return readDBExternalElements(documentId, pageNumber); + } + + public void writeExternalElement(ExtElement element) throws ApplicationException { + writeDBExternalElement(element); + } + + public void deleteExternalElement(ExtElement element) throws ApplicationException { + deleteDBExternalElement(element); + } + + private void writeDBExternalElement(ExtElement element) throws ApplicationException { + try { + String keyStr = element.getDocumentId() + "###" + element.getPageNumber(); + String valueStr = element.getXmlString(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + DatabaseEntry dbEntryValue = new DatabaseEntry(valueStr.getBytes("utf-8")); + Database elementDB = dbEnvExternalObjects.getElementDB(); + elementDB.put(null, dbEntryKey, dbEntryValue); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private void deleteDBExternalElement(ExtElement element) throws ApplicationException { + try { + String keyStr = element.getDocumentId() + "###" + element.getPageNumber(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + Database elementDB = dbEnvExternalObjects.getElementDB(); + elementDB.delete(null, dbEntryKey); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch 
(UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private ArrayList readDBExternalElements(String documentId, String pageNumber) throws ApplicationException { + ArrayList retElements = new ArrayList(); + String hashKey = documentId + "###" + pageNumber; + try { + Database elementDB = dbEnvExternalObjects.getElementDB(); + Cursor cursor = elementDB.openCursor(null, null); + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + byte[] foundValueBytes = foundValue.getData(); + String foundValueStr = new String(foundValueBytes, "utf-8"); + ExtElement e = ExtElement.parseXmlStr(foundValueStr); + retElements.add(e); + operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retElements; + } + + private void init() throws ApplicationException { + dbEnvExternalObjects = new DbEnvExternalObjects(); + dbEnvExternalObjects.setDataDir(DB_DIR_EXTERNAL_OBJECTS); + dbEnvExternalObjects.init(); + dbEnvExternalObjects.openDatabases(); + } + + public static void main(String[] args) throws ApplicationException { + getInstance(); + instance.beginOperation(); + System.out.print("Start ..."); + // instance.deleteSampleData(); + // instance.writeSampleData(); + instance.readSampleData(); + instance.end(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + System.out.println("Needed time: " + elapsedTime + " seconds"); + } + + private void deleteSampleData() throws 
ApplicationException { + ExtElement e = new ExtElement(); + e.setUid("joe"); + e.setDocumentId("/archimedes/it/l223.xml"); + e.setPageNumber("17"); + deleteExternalElement(e); + } + + private void writeSampleData() throws ApplicationException { + Date now = new Date(); + + String sId = "1.2.2.2.2.5"; + ExtElement e = new ExtElement(); + e.setUid("joe"); + e.setModificationDate(now); + e.setDocumentId("/archimedes/it/l223.xml"); + e.setPageNumber("17"); + e.setXmlNodeId(sId); + e.setContent("This is a test note to sentence " + sId + ""); + writeExternalElement(e); + + ExtElement e2 = new ExtElement(); + String sId2 = "1.2.2.2.2.7"; + e2.setUid("michael"); + e2.setModificationDate(now); + e2.setDocumentId("/archimedes/it/l223.xml"); + e2.setPageNumber("17"); + e2.setXmlNodeId(sId2); + e2.setCharPos("18"); + e2.setContent("This is a test note to sentence " + sId2 + ""); + writeExternalElement(e2); + + /* + String sId3 = "1.2.2.2.2.8.15.3.3"; + e3.setUid("joe"); + e3.setModificationDate(now); + e3.setDocumentId("/archimedes/it/l223.xml"); + e3.setPageNumber("17"); + e3.setXmlNodeId(sId3); + e2.setContent("This is an external test note to sentence " + sId3 + ""); + writeExternalElement(e3); + */ + + } + + private void readSampleData() throws ApplicationException { + ArrayList elements = readExternalElements("/archimedes/it/l223.xml", "17"); + System.out.println(elements); + } + + private void end() throws ApplicationException { + dbEnvExternalObjects.close(); + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } + +} \ No newline at end of file diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/db/DbEnvExternalObjects.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/db/DbEnvExternalObjects.java Tue Feb 08 14:54:09 2011 +0100 @@ -0,0 +1,104 @@ 
+package de.mpg.mpiwg.berlin.mpdl.externalObjects.db; + +import java.io.File; + +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseConfig; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class DbEnvExternalObjects { + private String dataDir; + private File envPath; + private Environment env; + private EnvironmentConfig envConfig; + private DatabaseConfig dbConfig; + private Database elementDB; + private Database objectDB; + + public DbEnvExternalObjects() { + } + + public void setDataDir(String dataDir) { + this.dataDir = dataDir; + } + + public void init() throws ApplicationException { + try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envConfig.setReadOnly(false); + dbConfig.setReadOnly(false); + envConfig.setAllowCreate(true); + dbConfig.setAllowCreate(true); + envConfig.setTransactional(true); + dbConfig.setTransactional(true); + // allow duplicates for keys + dbConfig.setSortedDuplicates(true); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void openDatabases() throws ApplicationException { + try { + // open databases (and create them if they do not exist) + elementDB = env.openDatabase(null, "ElementDB", dbConfig); + objectDB = env.openDatabase(null, "ObjectDB", dbConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void removeDatabases() throws ApplicationException { + try { + if (objectDB != null) + objectDB.close(); + if (elementDB != null) + elementDB.close(); + env.removeDatabase(null, "ElementDB"); + env.removeDatabase(null, "ObjectDB"); + objectDB = null; + elementDB = null; + /* + env.truncateDatabase(null, "ElementDB", bla); + env.truncateDatabase(null, "ObjectDB", bla); + */ + } catch 
(DatabaseException e) { + throw new ApplicationException(e); + } + } + + public Environment getEnv() { + return env; + } + + public Database getElementDB() { + return elementDB; + } + + public Database getObjectDB() { + return objectDB; + } + + public void close() throws ApplicationException { + if (env != null) { + try { + if (objectDB != null) + objectDB.close(); + if (elementDB != null) + elementDB.close(); + if (env != null) + env.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + } +} + diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/general/MpdlConstants.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/general/MpdlConstants.java Tue Feb 08 14:36:38 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/general/MpdlConstants.java Tue Feb 08 14:54:09 2011 +0100 @@ -13,7 +13,7 @@ public static String MPDL_PROJECT_NAME = "mpdl"; public static String TYPE_STATIC = "static"; public static String TYPE_DYNAMIC = "dynamic"; - + // eXist settings: data public static String MPDL_EXIST_DATA_DIR = EXIST_HOME + "/webapp/WEB-INF/dataMpdl"; // other call would be: ConfigurationHelper.getExistHome() diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java Tue Feb 08 14:36:38 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java Tue Feb 08 14:54:09 2011 +0100 @@ -1,23 +1,19 @@ package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; -import java.io.BufferedReader; import java.io.IOException; -import java.io.InputStreamReader; -import java.io.UnsupportedEncodingException; +import java.io.StringReader; import java.util.ArrayList; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import 
de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexAll; import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.Regularization; import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.RegularizationManager; import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; public class MpdlNormalizer { - static final private String IT_VOWELS = "AEIOUaeiou" + - "\u00c6\u00e6" + // AE ligatures - "\u0152\u0153"; // OE ligatures - static final private String IT_CONS = "BCDFGHKLMNPQRSTVWXZ" + - "bcdfghklmnpqrstvwxz" + - "\u017f\u00df"; // long/sharp S + public static int MODE_4LEXICA = 1; // normalization for lexica etc. which have sometimes only ascii in it + public static int MODE_4HUMAN_READERS = 2; // normalization for human readers + private int normMode = MODE_4LEXICA; // Default private String[] normFunctionsToUse = {"reg", "norm"}; // default is to use all of these normalization functions private String language; private int[] offsets; @@ -32,6 +28,10 @@ this.language = language; } + public void setNormMode(int normMode) { + this.normMode = normMode; + } + /** * Applies the normalization rules in language to * s, without offset tracking. 
@@ -52,8 +52,11 @@ } } if (useNormFunction()) { - // normalize the string by string replace - normStr = normalize(normStr, null); + // normalize the string by string replacements + if (normMode == MODE_4LEXICA) + normStr = normalize4Lexica(normStr, null); + else if (normMode == MODE_4HUMAN_READERS) + normStr = normalize4HumanReaders(normStr); } return normStr; } @@ -92,7 +95,7 @@ * @param offsets character offset table * @return normalized string */ - public String normalize(String s, int[] offsets) { + private String normalize4Lexica(String s, int[] offsets) { this.offsets = offsets; if (language.equals("la") || language.equals("lat")) { StringBuffer buf = new StringBuffer(); @@ -479,9 +482,11 @@ case '\u00e4': replace = "ae"; break; case '\u00f6': replace = "oe"; break; case '\u00fc': replace = "ue"; break; + case '\u00ad': break; // soft hyphen case '\u00e9': replace = "e"; break; - case '\u00ad': break; // soft hyphen - case '-': break; + // new in MPDL project by J. Willenborg + case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S + // case '-': break; default: replace += c; break; } buf.append(replace); @@ -1007,16 +1012,126 @@ return s; } } - - /** - * Returns the offset table. 
- * - * @return offset table - */ - public int[] getOffsetTable() { - return offsets; + + private String normalize4HumanReaders(String s) { + String normStr = s; + StringReader strReader = new StringReader(normStr + "\n"); + MpdlNormalizerLexAll mpdlNormalizerLexAll = new MpdlNormalizerLexAll(strReader); + if (Language.getInstance().isLatin(language)) { + mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.LA); + } else if (Language.getInstance().isChinese(language)) { + mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.ZH); + } else { + // TODO normalization for all languages + return normalize4Lexica(s, null); // old function + } + String retStr = ""; + String token = ""; + while (token != null) { + try { + token = mpdlNormalizerLexAll.yylex(); + if (token != null) + retStr += token; + } catch (IOException e ) { + // nothing cause IOException is not needed for a StringReader + } + } + normStr = retStr; + return normStr; } + /* + // explicit words + normStr = normStr.replaceAll("aliàs", "alias"); + normStr = normStr.replaceAll("hîc", "hic"); + normStr = normStr.replaceAll("quòd", "quod"); + normStr = normStr.replaceAll("Quòd", "Quod"); + normStr = normStr.replaceAll("QVòd", "Quod"); + normStr = normStr.replaceAll("Cùmque", "Cumque"); + normStr = normStr.replaceAll("aër", "aer"); + // ij + normStr = normStr.replaceAll("ij", "ii"); + // qu/qv + normStr = normStr.replaceAll("qv", "qu"); + // normStr = normStr.replaceAll("qV", "qU"); + normStr = normStr.replaceAll("Qv", "Qu"); + normStr = normStr.replaceAll("QV", "QU"); + // u/v + String vowels = getVowels(); + String consonants = getConsonants(); + normStr = normStr.replaceAll("([" + vowels + "])([-]*)u([" + vowels +"])", "$1$2v$3"); // vowel + u + vowel --> vowel + v + vowel + normStr = normStr.replaceAll("([" + vowels + "])([-]*)U([" + vowels +"])", "$1$2V$3"); // vowel + U + vowel --> vowel + V + vowel + normStr = normStr.replaceAll("([" + consonants + "])([-]*)v([" + consonants +"])", "$1$2u$3"); // consonant + v + 
consonant --> consonant + u + consonant + normStr = normStr.replaceAll("([" + consonants + "])([-]*)V([" + consonants +"])", "$1$2U$3"); // consonant + V + consonant --> consonant + U + consonant + normStr = normStr.replaceAll("^v([" + consonants +"])", "u$1"); // v + consonant --> u + consonant + normStr = normStr.replaceAll("^V([" + consonants +"])", "U$1"); // V + consonant --> U + consonant + // end of word: diacritica + normStr = normStr.replaceAll("à$", "a"); + normStr = normStr.replaceAll("è$", "e"); + normStr = normStr.replaceAll("ò$", "o"); + normStr = normStr.replaceAll("àm$", "am"); + normStr = normStr.replaceAll("ùm$", "um"); + String normStrTmp = normStr; + normStr = ""; + for (int i = 0; i < normStrTmp.length(); i++) { + char c = normStrTmp.charAt(i); + String replace = ""; + switch (c) { + case 'ſ': replace = "s"; break; + case 'ß': replace = "ss"; break; + case 'æ': replace = "ae"; break; + case 'Æ': replace = "AE"; break; + case 'ę': replace = "ae"; break; + case 'œ': replace = "oe"; break; + default: replace += c; break; + } + normStr = normStr + replace; + } + + + private String getVowels() { + String retStr = null; + if (Language.getInstance().isItalian(language)) { + retStr = "AEIOUaeiou" + + "\u00c6\u00e6" + // AE ligatures + "\u0152\u0153"; // OE ligatures + } else if (Language.getInstance().isLatin(language)) { + retStr = "AEIOUaeiouÆœęàèòù"; + } + // TODO all languages + return retStr; + } + + private String getConsonants() { + String retStr = null; + if (Language.getInstance().isItalian(language)) { + retStr = "BCDFGHKLMNPQRSTVWXZ" + + "bcdfghklmnpqrstvwxz" + + "ſß"; // long/sharp S + } else if (Language.getInstance().isLatin(language)) { + retStr = "BCDFGHKLMNPQRSTVWXZ" + + "bcdfghklmnpqrstvwxz" + + "ſß"; // long/sharp S + } + // TODO all languages + return retStr; + } + + + + + + * + * + * + * + */ + + + + + + /** * Returns a copy of an integer array with the element at * index removed ("killed"). 
@@ -1024,7 +1139,7 @@ * @param array integer array * @param index index of element to remove */ - static private int[] arrayKill(int[] array, int index) { + private int[] arrayKill(int[] array, int index) { int[] newArray = new int[array.length - 1]; System.arraycopy(array, 0, newArray, 0, index); System.arraycopy(array, index + 1, newArray, index, array.length - index - 1); @@ -1040,7 +1155,7 @@ * @param value value to insert into new slots * @param count number of new slots to insert */ - static private int[] arrayInsert(int[] array, int index, int value, int count) { + private int[] arrayInsert(int[] array, int index, int value, int count) { int[] newArray = new int[array.length + count]; System.arraycopy(array, 0, newArray, 0, index); for (int i = 0; i < count; i++) newArray[index + i] = value; @@ -1048,31 +1163,4 @@ return newArray; } - /** - * We provide main() so that our services will be available - * outside Java (i.e., so we can run as a Un*x-style filter). - */ - static public void main(String[] argv) throws ApplicationException { - if (argv.length != 1) { - System.err.println("You must specify a language."); - System.exit(1); - } - String rec; - StringBuffer buf = new StringBuffer(); - BufferedReader bin = null; - try { - bin = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); - while ((rec = bin.readLine()) != null) - buf.append(rec + "\n"); - } - catch (UnsupportedEncodingException e) { - System.err.println(e); - System.exit(1); - } catch (IOException e) { - System.err.println(e); - System.exit(1); - } - MpdlNormalizer orth = new MpdlNormalizer(argv[0]); - System.out.print(orth.normalize(buf.toString())); - } } \ No newline at end of file diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java Tue Feb 08 14:36:38 2011 +0100 +++ 
b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java Tue Feb 08 14:54:09 2011 +0100 @@ -11,11 +11,14 @@ public class MpdlTokenizer extends Tokenizer { private static final int MAX_WORD_LEN = 255; private static final int IO_BUFFER_SIZE = 1024; - private String language; // TODO make the tokenizer language dependent + private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString(); + private boolean regWithoutSemicolon = false; // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon + private boolean isInNotWordDelimMode = false; private int offset = 0, bufferIndex = 0, dataLen = 0; private char[] buffer = new char[MAX_WORD_LEN]; private char[] ioBuffer = new char[IO_BUFFER_SIZE]; private MpdlNormalizer normalizer; + private String language; public MpdlTokenizer(Reader input, String language) { super(input); @@ -28,12 +31,22 @@ this.normalizer = normalizer; } + public void setRegWithoutSemicolon(boolean regWithoutSemicolon) { + this.regWithoutSemicolon = regWithoutSemicolon; + } + + public boolean isRegWithoutSemicolon() { + return regWithoutSemicolon; + } + /** Returns true iff a character should be included in a token. This * tokenizer generates as tokens adjacent sequences of characters which * satisfy this predicate. Characters for which this is false are used to * define token boundaries and are not included in tokens. 
*/ protected boolean isTokenChar(char c) { boolean isTokenChar = true; + if (isRegWithoutSemicolon() && c == ';') // hack: special case for regularization and normalization; feel free to remove it later + return true; switch (c) { case ' ': isTokenChar = false; break; case '.': isTokenChar = false; break; @@ -51,12 +64,37 @@ case '&': isTokenChar = false; break; case '+': isTokenChar = false; break; case '"': isTokenChar = false; break; + case '„': isTokenChar = false; break; + case '“': isTokenChar = false; break; + case '«': isTokenChar = false; break; + case '»': isTokenChar = false; break; case '\'': isTokenChar = false; break; - // case '\t': isTokenChar = false; break; - // case '\n': isTokenChar = false; break; // do not break words which are on another line + case '\t': isTokenChar = false; break; // do not break words which have tabs in it + case '\n': isTokenChar = false; break; // do not break words which are on another line } return isTokenChar; } + + protected boolean isTokenCharInNotWordDelimMode(char c) { + boolean isTokenCharInNotWordDelimMode = false; + if (isInNotWordDelimMode) { + switch (c) { + case ' ': isTokenCharInNotWordDelimMode = true; break; + case '\t': isTokenCharInNotWordDelimMode = true; break; + case '\n': isTokenCharInNotWordDelimMode = true; break; + } + } + return isTokenCharInNotWordDelimMode; + } + + protected boolean isSpecialNotWordDelimSymbol(char c) { + boolean isSpecialNotWordDelimSymbol = false; + switch (c) { + case '\u2424': isSpecialNotWordDelimSymbol = true; break; // unicode character for newline + } + return isSpecialNotWordDelimSymbol; + } + /** Called on each token character to normalize it before it is added to the * token. The default implementation does nothing. Subclasses may use this @@ -67,6 +105,8 @@ /** Returns the next token in the stream, or null at EOS. 
*/ public final Token next() throws IOException { + if (language != null && language.equals("zh")) + return nextChinese(); int length = 0; int start = offset; while (true) { @@ -84,7 +124,13 @@ } else { c = ioBuffer[bufferIndex++]; } - if (isTokenChar(c)) { // if it's a token char + if (isInNotWordDelimMode && isTokenChar(c) && (! isSpecialNotWordDelimSymbol(c))) { + isInNotWordDelimMode = false; + } + if (isSpecialNotWordDelimSymbol(c)) { + isInNotWordDelimMode = true; + } + if (isTokenChar(c) || isTokenCharInNotWordDelimMode(c)) { // if it's a token char if (length == 0) // start of token start = offset - 1; buffer[length++] = normalize(c); // buffer it, normalized @@ -93,8 +139,10 @@ } else if (length > 0) // at non-Letter w/ chars break; // return 'em } + isInNotWordDelimMode = false; Token newToken = new Token(start, start + length); newToken.setTermBuffer(buffer, 0, length); + removeSpecialSymbols(newToken); // remove some special symbols in token (e.g. symbol for word delimiting xml elements) if (normalizer != null) { char[] termBuffer = newToken.termBuffer(); int termBufferLength = newToken.termLength(); @@ -110,4 +158,75 @@ } return newToken; } + + private Token removeSpecialSymbols(Token token) { + char[] termBuffer = token.termBuffer(); + int termBufferLength = token.termLength(); + String tokenText = new String(termBuffer, 0, termBufferLength); + String newTokenText = tokenText.replaceAll(SPECIAL_NOT_WORD_DELIM_SYMBOL, ""); // a symbol which marks word delimiting xml elements + int newTokenTextLength = newTokenText.length(); + char[] newTokenTextBuffer = newTokenText.toCharArray(); + token.setTermBuffer(newTokenTextBuffer, 0, newTokenTextLength); + return token; + } + + + + /* + * chinese Tokenizer: taken from org.apache.lucene.analysis.cn.ChineseTokenizer + * + */ + private int length; + private int start; + + private final void push(char c) { + if (length == 0) start = offset-1; // start of token + buffer[length++] = Character.toLowerCase(c); // 
buffer it + } + + private final Token flush() { + if (length>0) { + return new Token(new String(buffer, 0, length), start, start+length); + } + else + return null; + } + + public final Token nextChinese() throws IOException { + length = 0; + start = offset; + while (true) { + final char c; + offset++; + if (bufferIndex >= dataLen) { + dataLen = input.read(ioBuffer); + bufferIndex = 0; + } + if (dataLen == -1) + return flush(); + else + c = ioBuffer[bufferIndex++]; + switch(Character.getType(c)) { + case Character.DECIMAL_DIGIT_NUMBER: + case Character.LOWERCASE_LETTER: + case Character.UPPERCASE_LETTER: + push(c); + if (length == MAX_WORD_LEN) + return flush(); + break; + case Character.OTHER_LETTER: + if (length>0) { + bufferIndex--; + offset--; + return flush(); + } + push(c); + return flush(); + default: + if (length>0) + return flush(); + break; + } + } + } } diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java Tue Feb 08 14:36:38 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java Tue Feb 08 14:54:09 2011 +0100 @@ -16,6 +16,7 @@ public class MpdlTokenizerAnalyzer extends Analyzer { protected String language = MpdlConstants.DEFAULT_LANGUAGE; protected MpdlNormalizer normalizer = null; + private boolean regWithoutSemicolon = false; // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon public MpdlTokenizerAnalyzer(String language) { this.language = language; @@ -27,8 +28,18 @@ this.normalizer = normalizer; } + public void setRegWithoutSemicolon(boolean regWithoutSemicolon) { + this.regWithoutSemicolon = regWithoutSemicolon; + } + + public boolean isRegWithoutSemicolon() { + return regWithoutSemicolon; + } + public TokenStream tokenStream(String fieldName, Reader reader) { 
- TokenStream result = new MpdlTokenizer(reader, language, normalizer); + MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer); + tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later + TokenStream result = (TokenStream) tmpTokenizer; result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. result = new LowerCaseFilter(result); return result; @@ -38,7 +49,9 @@ ArrayList token = new ArrayList(); try { Reader reader = new StringReader(inputString); - TokenStream result = new MpdlTokenizer(reader, language, normalizer); + MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer); + tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later + TokenStream result = (TokenStream) tmpTokenizer; result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. result = new LowerCaseFilter(result); Token t = result.next(); diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAll.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAll.java Tue Feb 08 14:54:09 2011 +0100 @@ -0,0 +1,820 @@ +/* The following code was generated by JFlex 1.4.3 on 27.01.11 13:29 */ + +/* + * Normalization rules for all languages + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * 2011-01-25 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 27.01.11 13:29 from the specification file + * MpdlNormalizerLexAll.lex + */ +public class MpdlNormalizerLexAll { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static 
final int YYINITIAL = 0; + public static final int ZH = 4; + public static final int LA = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 2, 3, 3 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\7\66\0\1\1\3\2\1\1\3\2\1\1\1\0\1\2"+ + "\1\3\2\2\1\1\1\2\1\40\1\3\2\2\1\36\1\41\2\2"+ + "\1\0\1\2\6\0\1\31\1\2\1\21\1\27\1\6\2\2\1\17"+ + "\1\15\1\16\1\2\1\3\1\23\1\2\1\33\1\2\1\4\1\3"+ + "\1\24\1\2\1\5\1\37\2\2\1\0\1\2\113\0\1\13\30\0"+ + "\1\11\1\22\5\0\1\12\1\0\1\25\2\0\1\32\2\0\1\20"+ + "\1\34\2\0\1\26\6\0\1\30\2\0\1\35\34\0\1\12\71\0"+ + "\1\14\53\0\1\10\u6479\0\1\43\u057a\0\1\44\u0f5d\0\1\42\u5dab\0"+ + "\1\46\u040e\0\1\47\u1d8f\0\1\45\u05e2\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. 
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\4\0\1\1\1\2\1\3\1\4\3\5\1\4\1\6"+ + "\1\7\1\10\1\11\1\12\1\13\10\4\4\5\1\14"+ + "\1\15\1\16\1\17\1\1\1\20\1\21\1\22\1\23"+ + "\1\24\1\25\1\26\1\0\1\27\3\0\1\30\1\0"+ + "\1\31\3\0\1\32\1\33\1\34\1\35\1\36\1\37"+ + "\1\0\1\40\2\0\1\41\1\0\1\42\3\0\1\43"+ + "\1\0\1\44\1\0\1\45\11\0\1\46\5\0"; + + private static int [] zzUnpackAction() { + int [] result = new int[89]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\50\0\120\0\170\0\240\0\240\0\240\0\240"+ + "\0\240\0\310\0\360\0\u0118\0\240\0\240\0\240\0\240"+ + "\0\240\0\240\0\u0140\0\u0168\0\u0190\0\u01b8\0\u01e0\0\u0208"+ + "\0\u0230\0\u0258\0\u0280\0\u02a8\0\u02d0\0\u02f8\0\240\0\240"+ + "\0\240\0\240\0\u0320\0\240\0\240\0\240\0\240\0\240"+ + "\0\240\0\240\0\u0348\0\240\0\u0370\0\u0398\0\u03c0\0\240"+ + "\0\u03e8\0\240\0\u0410\0\u0438\0\u0460\0\240\0\240\0\240"+ + "\0\240\0\240\0\240\0\u0488\0\240\0\u04b0\0\u04d8\0\240"+ + "\0\u0500\0\240\0\u0528\0\u0550\0\u0578\0\240\0\u05a0\0\240"+ + "\0\u05c8\0\240\0\u05f0\0\u0618\0\u0640\0\u0668\0\u0690\0\u06b8"+ + "\0\u06e0\0\u0708\0\u0730\0\240\0\u0758\0\u0780\0\u07a8\0\u07d0"+ + "\0\u07f8"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[89]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; 
+ } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\7\5\1\6\40\5\1\7\1\10\1\11\1\12\1\13"+ + "\1\14\1\10\1\15\1\16\1\17\1\20\1\21\1\22"+ + "\1\23\1\7\1\11\1\7\1\11\1\24\2\11\1\25"+ + "\1\26\1\11\1\27\1\30\1\7\1\31\2\7\1\32"+ + "\1\33\1\34\1\35\7\7\1\10\1\11\1\12\1\13"+ + "\1\14\1\10\1\15\1\16\1\17\1\20\1\21\1\22"+ + "\1\23\1\7\1\36\1\7\1\11\1\24\2\11\1\25"+ + "\1\26\1\11\1\27\1\30\1\7\1\31\2\7\1\32"+ + "\1\33\1\34\1\35\6\7\7\5\1\6\32\5\1\37"+ + "\1\40\1\41\1\42\1\43\1\5\107\0\1\44\1\0"+ + "\1\45\45\0\1\46\11\0\1\47\3\0\2\47\3\0"+ + "\4\47\4\0\1\47\2\0\2\47\1\0\2\47\1\0"+ + "\1\47\1\50\1\51\1\47\27\0\1\52\35\0\1\53"+ + "\2\0\1\54\13\0\1\55\1\56\27\0\1\57\2\0"+ + "\1\60\44\0\1\61\2\0\1\62\17\0\1\63\43\0"+ + "\1\64\1\65\55\0\1\20\47\0\1\22\1\0\1\66"+ + "\14\0\1\67\3\0\2\67\3\0\4\67\4\0\1\67"+ + "\2\0\2\67\1\0\2\67\1\0\1\67\2\0\1\67"+ + "\13\0\3\70\3\0\2\70\5\0\1\70\1\0\1\70"+ + "\1\0\2\70\2\0\1\70\7\0\3\70\45\0\1\71"+ + "\1\0\1\72\10\0\3\73\3\0\2\73\5\0\1\73"+ + "\1\0\1\73\1\0\2\73\2\0\1\73\7\0\3\73"+ + "\26\0\1\74\76\0\1\75\5\0\1\76\46\0\1\77"+ + "\2\0\1\100\44\0\1\101\2\0\1\102\45\0\1\103"+ + "\47\0\1\104\46\0\1\105\2\0\1\106\44\0\1\107"+ + "\2\0\1\110\44\0\1\111\2\0\1\112\61\0\1\113"+ + "\34\0\1\114\46\0\1\115\47\0\1\116\50\0\1\117"+ + "\47\0\1\120\46\0\1\121\47\0\1\122\47\0\1\123"+ + "\51\0\1\124\47\0\1\54\46\0\1\125\47\0\1\126"+ + "\50\0\1\60\47\0\1\62\46\0\1\127\47\0\1\130"+ + "\47\0\1\131\50\0\1\100\47\0\1\102\47\0\1\106"+ + "\47\0\1\110\47\0\1\112\40\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[2080]; + 
int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\4\0\5\11\3\1\6\11\14\1\4\11\1\1\7\11"+ + "\1\0\1\11\3\0\1\11\1\0\1\11\3\0\6\11"+ + "\1\0\1\11\2\0\1\11\1\0\1\11\3\0\1\11"+ + "\1\0\1\11\1\0\1\11\11\0\1\11\5\0"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[89]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be 
matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + int cv = 0; // consonant = 1, vowel = 2, everything else = 0 + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexAll(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexAll(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. 
+ * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 172) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. 
+ * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. 
+ * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + if (zzMarkedPosL > zzStartRead) { + switch (zzBufferL[zzMarkedPosL-1]) { + case '\n': + case '\u000B': + case '\u000C': + case '\u0085': + case '\u2028': + case '\u2029': + zzAtBOL = true; + break; + case '\r': + if (zzMarkedPosL < zzEndReadL) + zzAtBOL = zzBufferL[zzMarkedPosL] != '\n'; + else if (zzAtEOF) + zzAtBOL = false; + else { + boolean eof = zzRefill(); + zzMarkedPosL = zzMarkedPos; + zzEndReadL = zzEndRead; + zzBufferL = zzBuffer; + if (eof) + zzAtBOL = false; + else + zzAtBOL = zzBufferL[zzMarkedPosL] != '\n'; + } + break; + default: + zzAtBOL = false; + } + } + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = 
zzMarkedPosL; + + if (zzAtBOL) + zzState = ZZ_LEXSTATE[zzLexicalState+1]; + else + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 25: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { return "o"; + } + case 39: break; + case 22: + { cv = 2; return "ii"; + } + case 40: break; + case 35: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { return "od"; + } + case 41: break; + case 7: + { cv = 1; return "s"; + } + case 42: break; + case 24: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { return "e"; + } + case 43: break; + case 29: + { cv = 1; return "Qu"; + } + case 44: break; + case 19: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { switch(cv) { + case 2: return "v"; + default: cv = 2; return "u"; + } + } + case 45: break; + case 9: + { cv = 2; return "ae"; + } + case 46: break; + case 15: + { return "精"; + } + case 47: break; + case 3: + { cv = 0; return yytext(); + } + case 48: break; + case 27: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { switch(cv) { + case 2: return "V"; + default: cv = 2; return "U"; + } + } + case 49: break; + case 2: + { return ""; + } + case 50: break; + case 33: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { return "am"; + } + case 51: break; + case 18: + { cv = 1; return "qu"; + } + case 52: break; + case 14: + { return "歷"; + } + case 53: break; + case 8: + { cv = 1; return "ss"; + } + case 54: break; + case 4: + { cv = 2; return yytext(); + } + case 55: break; + case 32: + { return "庶"; + } + case 56: break; + case 6: + { cv = 0; return ""; + } + case 57: break; + case 16: + { switch(cv) { + case 1: return yytext().replace("v", "u"); + default: cv = 1; return yytext(); + } + } + case 58: break; + case 12: + { return "奇"; + } + case 59: break; + case 38: + { return "hic"; + } + case 60: break; + case 26: + { cv = 2; return "oi"; + } + case 61: break; + case 36: + // lookahead expression with fixed base 
length + zzMarkedPos = zzStartRead + 2; + { return "um"; + } + case 62: break; + case 17: + { switch(cv) { + case 1: return yytext().replace("V", "U"); + default: cv = 1; return yytext(); + } + } + case 63: break; + case 21: + { cv = 2; return "uu"; + } + case 64: break; + case 31: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = 1; return "U"; + } + case 65: break; + case 1: + { return yytext(); + } + case 66: break; + case 34: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { return "as"; + } + case 67: break; + case 23: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { return "a"; + } + case 68: break; + case 13: + { return "時"; + } + case 69: break; + case 10: + { cv = 2; return "AE"; + } + case 70: break; + case 37: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { return "us"; + } + case 71: break; + case 5: + { cv = 1; return yytext(); + } + case 72: break; + case 28: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = 1; return "u"; + } + case 73: break; + case 30: + { cv = 1; return "QU"; + } + case 74: break; + case 20: + { cv = 2; return "ui"; + } + case 75: break; + case 11: + { cv = 2; return "oe"; + } + case 76: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAll.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAll.lex Tue Feb 08 14:54:09 2011 +0100 @@ -0,0 +1,143 @@ +/* + * Normalization rules for all languages + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * 2011-01-25 + * + */ + +package 
de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexAll +%type java.lang.String +%unicode +// %debug + +%states LA, ZH + +%{ + int cv = 0; // consonant = 1, vowel = 2, everything else = 0 +%} + +VOWEL=[AEIOUaeiouÆæęàèòùœ] +CONS=[BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß] +LR=[lLrR] +QUE=(que)? +END=\n + +%% + + { + +// 1. simple replacements + +// 1.1 single characters +ſ { cv = 1; return "s"; } +ß { cv = 1; return "ss"; } +[æę] { cv = 2; return "ae"; } +Æ { cv = 2; return "AE"; } +œ { cv = 2; return "oe"; } +// 1.2 character combinations +ij { cv = 2; return "ii"; } + +// 2. diacritics + +// 2.1 superfluous diacritics in single words +^ hîc {END} { return "hic"; } + +// 2.2 superfluous diacritics at the end of a word +// 2.2.1 common cases +à / {QUE} {END} { return "a"; } +àm / {QUE} {END} { return "am"; } +às / {QUE} {END} { return "as"; } // (-àsque will likely never occur) +// à / [ms]? {QUE} {END} { return "a"; } +è / {QUE} {END} { return "e"; } +ò / {QUE} {END} { return "o"; } +òd / {QUE} {END} { return "od"; } +ùm / {QUE} {END} { return "um"; } +ùs / {QUE} {END} { return "us"; } + +// 2.3 superfluous diacritics within a word +// 2.3.1 common cases +aë { cv = 2; return "ae"; } +oë { cv = 2; return "oe"; } +// 2.3.2 rare cases +oï { cv = 2; return "oi"; } +uï { cv = 2; return "ui"; } +// 2.3.3 extremely rare cases +uü { cv = 2; return "uu"; } + +// 3. 
rules for u and v + +// 3.1 rules for u + +u/{VOWEL} { + switch(cv) { + case 2: return "v"; + default: cv = 2; return "u"; + } + } +U/{VOWEL} { + switch(cv) { + case 2: return "V"; + default: cv = 2; return "U"; + } + } + +// 3.2 rules for v + +qv { cv = 1; return "qu"; } // the replaced v still counts as consonant +Qv { cv = 1; return "Qu"; } +QV { cv = 1; return "QU"; } + +{LR}v { + switch(cv) { + case 1: return yytext().replace("v", "u"); + default: cv = 1; return yytext(); + } + } +{LR}V { + switch(cv) { + case 1: return yytext().replace("V", "U"); + default: cv = 1; return yytext(); + } + } + +v/{CONS} { cv = 1; return "u"; } +V/{CONS} { cv = 1; return "U"; } + + +// default + +{VOWEL} { cv = 2; return yytext(); } +{CONS} { cv = 1; return yytext(); } +\n { cv = 0; return ""; } +. { cv = 0; return yytext(); } + +} + + { + +// Codepoint < FFFF + +竒 { return "奇"; } // 7AD2 --> 5947 +旹 { return "時"; } // 65F9 --> 6642 +歴 { return "歷"; } // 6B74 --> 6B77 +精 { return "精"; } // FA1D --> 7CBE (FA1D is a compatibility ideograph) + +// Codepoint > FFFF + +庶 { return "庶"; } // 2F88D --> 5EB6 (2F88D is a compatibility ideograph) + + +} + + +// default (can be overridden by individual languages) + +\n { return ""; } +. 
{ return yytext(); } diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java Tue Feb 08 14:36:38 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java Tue Feb 08 14:54:09 2011 +0100 @@ -15,6 +15,7 @@ private static int MARK_SIZE = MARK.length(); private static int ELEMENT_TYPE_CHARACTERS = 1; private static int ELEMENT_TYPE_COMPLEX = 2; + private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString(); private String xmlnsString = ""; private String language; private String outputXmlFragment = ""; @@ -176,6 +177,17 @@ return isComplex; } + /** + * feel free to add/remove some element names + * @return true if element is a word delimiter element else false + */ + private boolean isWordDelimiterElement() { + boolean isWordDelimiterElement = true; + if (name.equals("lb") || name.equals("cb") || name.equals("gap") || name.equals("figure") || name.equals("image") || name.equals("note") || name.equals("handwritten") || name.equals("anchor")) + isWordDelimiterElement = false; + return isWordDelimiterElement; + } + private String toXmlString() throws SAXException { String retString = ""; String elemLanguage = language; // default value for the document/page @@ -200,15 +212,20 @@ if (! composite.isComplex()) { if (composite.value != null && ! composite.value.equals("")) { String compositeValueStr = composite.value; - compositesChars += compositeValueStr; - compositesCharsWithMarks += compositeValueStr; + compositesChars = compositesChars + compositeValueStr; + compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr; } } else { + if (! 
composite.isWordDelimiterElement()) { + compositesChars = compositesChars + SPECIAL_NOT_WORD_DELIM_SYMBOL; // add a special symbol at the position of the "not word delimiter element" (e.g. line break) + } complexElements.add(composite); compositesCharsWithMarks += MARK; } } String compositesCharsDictionarized = characters2DictWords(compositesChars, elemLanguage); + compositesChars = compositesChars.replaceAll(SPECIAL_NOT_WORD_DELIM_SYMBOL, ""); + compositesCharsDictionarized = compositesCharsDictionarized.replaceAll(SPECIAL_NOT_WORD_DELIM_SYMBOL, ""); if (complexElements.size() > 0) { for (int i=0; i lexEntryKeys = lexHandler.getLexEntryKeys(wordTokenText, language, false); + String wordTokenTextWithoutSpecialSymbols = removeSpecialSymbols(wordTokenText); + ArrayList lexEntryKeys = lexHandler.getLexEntryKeys(wordTokenTextWithoutSpecialSymbols, language, false); if (lexEntryKeys != null) { String lexForms = ""; for (int j=0; j" + wordStrDeresolved + ""; + retStr = retStr + beforeStrDeresolved + "" + wordStrDeresolved + ""; } else { retStr = retStr + beforeStrDeresolved + wordStrDeresolved; } @@ -272,5 +290,12 @@ } return retStr; } + + private String removeSpecialSymbols(String inputStr) { + String retStr = inputStr.replaceAll(" ", ""); + retStr = retStr.replaceAll("\n", ""); + retStr = retStr.replaceAll("-", ""); + return retStr; + } } } diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java Tue Feb 08 14:36:38 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java Tue Feb 08 14:54:09 2011 +0100 @@ -102,6 +102,7 @@ String retStr = ""; try { MpdlTokenizerAnalyzer tokenizerAnalyzer = new MpdlTokenizerAnalyzer(language); + tokenizerAnalyzer.setRegWithoutSemicolon(true); // hack: feel free to remove it later ArrayList wordTokens = 
tokenizerAnalyzer.getToken(charactersStr); int endPos = 0; for (int i=0; i < wordTokens.size(); i++) { @@ -111,10 +112,9 @@ String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr); endPos = wordToken.endOffset(); String wordStr = charactersStr.substring(startPos, endPos); - MpdlNormalizer mpdlNormalizer = new MpdlNormalizer(normalizeFunctions, language); + mpdlNormalizer.setNormMode(MpdlNormalizer.MODE_4HUMAN_READERS); String normalizedWordStr = mpdlNormalizer.normalize(wordStr); - String normalizedWordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(normalizedWordStr); // String wordTokenText = wordToken.termText(); retStr = retStr + beforeStrDeresolved + normalizedWordStrDeresolved; diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/regularization/RegularizationManager.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/regularization/RegularizationManager.java Tue Feb 08 14:36:38 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/regularization/RegularizationManager.java Tue Feb 08 14:54:09 2011 +0100 @@ -55,7 +55,9 @@ // instance.writeAllRegs(); ArrayList regs = instance.findRegsByNorm("la", "Illiusque"); + ArrayList regs2 = instance.findRegsByNorm("la", "Itaque"); Regularization bla = regs.get(0); + Regularization bla2 = regs2.get(0); instance.end(); instance.endOperation(); diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java Tue Feb 08 14:36:38 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java Tue Feb 08 14:54:09 2011 +0100 @@ -26,6 +26,8 @@ languageIds.put("lat", "la"); languageIds.put("nl", "nl"); languageIds.put("zh", "zh"); + languageIds.put("zho", "zh"); + languageIds.put("zho-Hant", "zh"); } public String 
getLanguageId(String language) { @@ -35,4 +37,25 @@ retLanguageId = languageIds.get(language); return retLanguageId; } + + /** Returns true iff getLanguageId(language) is "la"; a language without a mapped id yields false instead of a NullPointerException. */ public boolean isLatin(String language) { + if ("la".equals(getLanguageId(language))) /* constant first: getLanguageId returns languageIds.get(language), which can be null for an unmapped key */ + return true; + else + return false; + } + + /** Returns true iff getLanguageId(language) is "it"; a language without a mapped id yields false instead of a NullPointerException. */ public boolean isItalian(String language) { + if ("it".equals(getLanguageId(language))) /* constant first: null-safe comparison */ + return true; + else + return false; + } + + /** Returns true iff getLanguageId(language) is "zh"; a language without a mapped id yields false instead of a NullPointerException. */ public boolean isChinese(String language) { + if ("zh".equals(getLanguageId(language))) /* constant first: null-safe comparison */ + return true; + else + return false; + } } \ No newline at end of file diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Transcoder.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Transcoder.java Tue Feb 08 14:36:38 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Transcoder.java Tue Feb 08 14:54:09 2011 +0100 @@ -62,7 +62,42 @@ return encodedUnicodeStr; */ } + + /** Runs inputStr through Unicode2BetacodeLex and returns the concatenation of all tokens yylex() produces until it returns null; an IOException from the lexer is rethrown wrapped in ApplicationException. */ public String transcodeFromUnicode2BetaCode(String inputStr) throws ApplicationException { + StringReader strReader = new StringReader(inputStr); + Unicode2BetacodeLex unicode2BetacodeLex = new Unicode2BetacodeLex(strReader); + StringBuilder retStr = new StringBuilder(); /* avoids re-copying the whole result for every token */ + String token = ""; + while (token != null) { + try { + token = unicode2BetacodeLex.yylex(); + if (token != null) + retStr.append(token); + } catch (IOException e ) { + throw new ApplicationException(e); + } + } + return retStr.toString(); + } + + /** Runs inputStr through Unicode2BuckwalterLex and returns the concatenation of all tokens yylex() produces until it returns null; an IOException from the lexer is rethrown wrapped in ApplicationException. */ public String transcodeFromUnicode2Buckwalter(String inputStr) throws ApplicationException { + StringReader strReader = new StringReader(inputStr); + Unicode2BuckwalterLex unicode2BuckwalterLex = new Unicode2BuckwalterLex(strReader); + StringBuilder retStr = new StringBuilder(); /* avoids re-copying the whole result for every token */ + String token = ""; + while (token != null) { + try { + token = unicode2BuckwalterLex.yylex(); + if (token != null) + retStr.append(token); + } catch (IOException e ) { + throw new ApplicationException(e); + } + } + return retStr.toString(); + } + + public String transcodeFromBuckwalter2Unicode(String inputStr) 
throws ApplicationException { StringReader strReader = new StringReader(inputStr); Buckwalter2UnicodeLex buckwalter2UnicodeLex = new Buckwalter2UnicodeLex(strReader); diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2Betacode.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2Betacode.lex Tue Feb 08 14:54:09 2011 +0100 @@ -0,0 +1,319 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +%% + +%class Unicode2BetacodeLex +%public +%type java.lang.String +%unicode +%% + + +"<"[^>]\u+">" { return yytext(); } + +"H" { return "*j"; } +"h" { return "j"; } +"F" { return "*v"; } +"f" { return "v"; } +"\u03a3" { return "*s"; } + +"." { return "!"; } +"\u00B7" { return ":"; } /* MPDL update */ + +"\u1F00" { return "a)"; } +"\u1F01" { return "a("; } +"\u1F02" { return "a)\\"; } +"\u1F03" { return "a(\\"; } +"\u1F04" { return "a)/"; } +"\u1F05" { return "a(/"; } +"\u1F06" { return "a)="; } +"\u1F07" { return "a(="; } +"\u1F08" { return "*)a"; } +"\u1F09" { return "*(a"; } +"\u1F0A" { return "*)\\a"; } +"\u1F0B" { return "*(\\a"; } +"\u1F0C" { return "*)/a"; } +"\u1F0D" { return "*(/a"; } +"\u1F0E" { return "*)=a"; } +"\u1F0F" { return "*(=a"; } +"\u1F10" { return "e)"; } +"\u1F11" { return "e("; } +"\u1F12" { return "e)\\"; } +"\u1F13" { return "e(\\"; } +"\u1F14" { return "e)/"; } +"\u1F15" { return "e(/"; } +"\u1F18" { return "*)e"; } +"\u1F19" { return "*(e"; } +"\u1F1A" { return "*)\\e"; } +"\u1F1B" { return "*(\\e"; } +"\u1F1C" { return "*)/e"; } +"\u1F1D" { return "*(/e"; } +"\u1F20" { return "h)"; } +"\u1F21" { return "h("; } +"\u1F22" { return "h)\\"; } +"\u1F23" { return "h(\\"; } +"\u1F24" { return "h)/"; } +"\u1F25" { return "h(/"; } +"\u1F26" { return "h)="; } +"\u1F27" { return "h(="; } +"\u1F28" { return "*)h"; } +"\u1F29" { return "*(h"; } +"\u1F2A" { return "*)\\h"; } +"\u1F2B" { return "*(\\h"; } +"\u1F2C" { return "*)/h"; 
} +"\u1F2D" { return "*(/h"; } +"\u1F2E" { return "*)=h"; } +"\u1F2F" { return "*(=h"; } +"\u1F30" { return "i)"; } +"\u1F31" { return "i("; } +"\u1F32" { return "i)\\"; } +"\u1F33" { return "i(\\"; } +"\u1F34" { return "i)/"; } +"\u1F35" { return "i(/"; } +"\u1F36" { return "i)="; } +"\u1F37" { return "i(="; } +"\u1F38" { return "*)i"; } +"\u1F39" { return "*(i"; } +"\u1F3A" { return "*)\\i"; } +"\u1F3B" { return "*(\\i"; } +"\u1F3C" { return "*)/i"; } +"\u1F3D" { return "*(/i"; } +"\u1F3E" { return "*)=i"; } +"\u1F3F" { return "*(=i"; } +"\u1F40" { return "o)"; } +"\u1F41" { return "o("; } +"\u1F42" { return "o)\\"; } +"\u1F43" { return "o(\\"; } +"\u1F44" { return "o)/"; } +"\u1F45" { return "o(/"; } +"\u1F48" { return "*)o"; } +"\u1F49" { return "*(o"; } +"\u1F4A" { return "*)\\o"; } +"\u1F4B" { return "*(\\o"; } +"\u1F4C" { return "*)/o"; } +"\u1F4D" { return "*(/o"; } +"\u1F50" { return "u)"; } +"\u1F51" { return "u("; } +"\u1F52" { return "u)\\"; } +"\u1F53" { return "u(\\"; } +"\u1F54" { return "u)/"; } +"\u1F55" { return "u(/"; } +"\u1F56" { return "u)="; } +"\u1F57" { return "u(="; } +"\u1F59" { return "*(u"; } +"\u1F5B" { return "*(\\u"; } +"\u1F5D" { return "*(/u"; } +"\u1F5F" { return "*(=u"; } +"\u1F60" { return "w)"; } +"\u1F61" { return "w("; } +"\u1F62" { return "w)\\"; } +"\u1F63" { return "w(\\"; } +"\u1F64" { return "w)/"; } +"\u1F65" { return "w(/"; } +"\u1F66" { return "w)="; } +"\u1F67" { return "w(="; } +"\u1F68" { return "*)w"; } +"\u1F69" { return "*(w"; } +"\u1F6A" { return "*)\\w"; } +"\u1F6B" { return "*(\\w"; } +"\u1F6C" { return "*)/w"; } +"\u1F6D" { return "*(/w"; } +"\u1F6E" { return "*)=w"; } +"\u1F6F" { return "*(=w"; } +"\u1F70" { return "a\\"; } +"\u1F71" { return "a/"; } +"\u1F72" { return "e\\"; } +"\u1F73" { return "e/"; } +"\u1F74" { return "h\\"; } +"\u1F75" { return "h/"; } +"\u1F76" { return "i\\"; } +"\u1F77" { return "i/"; } +"\u1F78" { return "o\\"; } +"\u1F79" { return "o/"; } +"\u1F7A" { return "u\\"; } +"\u1F7B" { 
return "u/"; } +"\u1F7C" { return "w\\"; } +"\u1F7D" { return "w/"; } +"\u1F80" { return "a)|"; } +"\u1F81" { return "a(|"; } +"\u1F82" { return "a)\\|"; } +"\u1F83" { return "a(\\|"; } +"\u1F84" { return "a)/|"; } +"\u1F85" { return "a(/|"; } +"\u1F86" { return "a)=|"; } +"\u1F87" { return "a(=|"; } +"\u1F88" { return "*)|a"; } +"\u1F89" { return "*(|a"; } +"\u1F8A" { return "*)\\|a"; } +"\u1F8B" { return "*(\\|a"; } +"\u1F8C" { return "*)/|a"; } +"\u1F8D" { return "*(/|a"; } +"\u1F8E" { return "*)=|a"; } +"\u1F8F" { return "*(=|a"; } +"\u1F90" { return "h)|"; } +"\u1F91" { return "h(|"; } +"\u1F92" { return "h)\\|"; } +"\u1F93" { return "h(\\|"; } +"\u1F94" { return "h)/|"; } +"\u1F95" { return "h(/|"; } +"\u1F96" { return "h)=|"; } +"\u1F97" { return "h(=|"; } +"\u1F98" { return "*)|h"; } +"\u1F99" { return "*(|h"; } +"\u1F9A" { return "*)\\|h"; } +"\u1F9B" { return "*(\\|h"; } +"\u1F9C" { return "*)/|h"; } +"\u1F9D" { return "*(/|h"; } +"\u1F9E" { return "*)=|h"; } +"\u1F9F" { return "*(=|h"; } +"\u1FA0" { return "w)|"; } +"\u1FA1" { return "w(|"; } +"\u1FA2" { return "w)\\|"; } +"\u1FA3" { return "w(\\|"; } +"\u1FA4" { return "w)/|"; } +"\u1FA5" { return "w(/|"; } +"\u1FA6" { return "w)=|"; } +"\u1FA7" { return "w(=|"; } +"\u1FA8" { return "*)|w"; } +"\u1FA9" { return "*(|w"; } +"\u1FAA" { return "*)\\|w"; } +"\u1FAB" { return "*(\\|w"; } +"\u1FAC" { return "*)/|w"; } +"\u1FAD" { return "*(/|w"; } +"\u1FAE" { return "*)=|w"; } +"\u1FAF" { return "*(=|w"; } +"\u1FB0" { return "a^"; } +"\u1FB1" { return "a_"; } +"\u1FB2" { return "a\\|"; } +"\u1FB3" { return "a|"; } +"\u1FB4" { return "a/|"; } +"\u1FB6" { return "a="; } +"\u1FB7" { return "a=|"; } +"\u1FB8" { return "*a^"; } +"\u1FB9" { return "*a_"; } +"\u1FBA" { return "*a\\"; } +"\u1FBB" { return "*a/"; } +"\u1FBC" { return "*a|"; } +"\u1FC2" { return "h\\|"; } +"\u1FC3" { return "h|"; } +"\u1FC4" { return "h/|"; } +"\u1FC6" { return "h="; } +"\u1FC7" { return "h=|"; } +"\u1FC8" { return "*e\\"; } +"\u1FC9" { 
return "*e/"; } +"\u1FCA" { return "*h\\"; } +"\u1FCB" { return "*h/"; } +"\u1FCC" { return "*h|"; } +"\u1FD0" { return "i^"; } +"\u1FD1" { return "i_"; } +"\u1FD2" { return "i+\\"; } +"\u1FD3" { return "i+/"; } +"\u1FD6" { return "i="; } +"\u1FD7" { return "i+="; } +"\u1FD8" { return "*i^"; } +"\u1FD9" { return "*i_"; } +"\u1FDA" { return "*i\\"; } +"\u1FDB" { return "*i/"; } +"\u1FE0" { return "u^"; } +"\u1FE1" { return "u_"; } +"\u1FE2" { return "u+\\"; } +"\u1FE3" { return "u+/"; } +"\u1FE4" { return "r)"; } +"\u1FE5" { return "r("; } +"\u1FE6" { return "u="; } +"\u1FE7" { return "u+="; } +"\u1FE8" { return "*u^"; } +"\u1FE9" { return "*u_"; } +"\u1FEA" { return "*u\\"; } +"\u1FEB" { return "*u/"; } +"\u1FEC" { return "*(r"; } +"\u1FF2" { return "w\\|"; } +"\u1FF3" { return "w|"; } +"\u1FF4" { return "w/|"; } +"\u1FFA" { return "*w\\"; } +"\u1FFB" { return "*w/"; } +"\u1FFC" { return "*w|"; } +"\u1FF6" { return "w="; } +"\u1FF7" { return "w=|"; } +"\u1FF8" { return "*o\\"; } +"\u1FF9" { return "*o/"; } + +"\u0300" { return "\\"; } +"\u0301" { return "/"; } +"\u0304" { return "_"; } +"\u0306" { return "^"; } +"\u0308" { return "+"; } +"\u0302" { return "="; } +"\u0313" { return ")"; } +"\u0314" { return "("; } +"\u0323" { return "?"; } +"\u0345" { return "|"; } + +"\u03b1" { return "a"; } /* MPDL update */ +"\u0391" { return "*a"; } /* MPDL update */ +"\u03b2" { return "b"; } /* MPDL update */ +"\u0392" { return "*b"; } /* MPDL update */ +"\u03b3" { return "g"; } /* MPDL update */ +"\u0393" { return "*g"; } /* MPDL update */ +"\u03b4" { return "d"; } /* MPDL update */ +"\u0394" { return "*d"; } /* MPDL update */ +"\u03b5" { return "e"; } /* MPDL update */ +"\u0395" { return "*e"; } /* MPDL update */ +"\u03b6" { return "z"; } /* MPDL update */ +"\u0396" { return "*z"; } /* MPDL update */ +"\u03b7" { return "h"; } /* MPDL update */ +"\u0397" { return "*h"; } /* MPDL update */ +"\u03b8" { return "q"; } /* MPDL update */ +"\u0398" { return "*q"; } /* MPDL update */ 
+"\u03b9" { return "i"; } /* MPDL update */ +"\u0399" { return "*i"; } /* MPDL update */ +"\u03ba" { return "k"; } /* MPDL update */ +"\u039a" { return "*k"; } /* MPDL update */ +"\u03bb" { return "l"; } /* MPDL update */ +"\u039b" { return "*l"; } /* MPDL update */ +"\u03bc" { return "m"; } /* MPDL update */ +"\u039c" { return "*m"; } /* MPDL update */ +"\u03bd" { return "n"; } /* MPDL update */ +"\u039d" { return "*n"; } /* MPDL update */ +"\u03be" { return "c"; } /* MPDL update */ +"\u039e" { return "*c"; } /* MPDL update */ +"\u03bf" { return "o"; } /* MPDL update */ +"\u039f" { return "*o"; } /* MPDL update */ +"\u03c0" { return "p"; } /* MPDL update */ +"\u03a0" { return "*p"; } /* MPDL update */ +"\u03c1" { return "r"; } /* MPDL update */ +"\u03a1" { return "*r"; } /* MPDL update */ + +"\u03a3" { return "*s"; } /* MPDL update */ +"\u03c3" { return "s1"; } /* mdh 2002-01-07 */ +"\u03c2"/\-\- { return "s"; } +"\u03c3"/\> }[a-z\?\!0-9*=\/()\'\-] { return "s"; } /* MPDL update */ +"\u03c2"/\< { return "s"; } /* MPDL update */ +"\u03c3"/[\[\]][a-z\?\!0-9*=\/()\'\-] { return "s"; } /* MPDL update */ +"\u03c2"/\??[^a-z0-9*=\/()\'\-\[\?] { return "s"; } +"\u03c3" { return "s"; } /* MPDL update */ + +"\u03c4" { return "t"; } /* MPDL update */ +"\u03a4" { return "*t"; } /* MPDL update */ +"\u03c5" { return "u"; } /* MPDL update */ +"\u03a5" { return "*u"; } /* MPDL update */ +"\u03c6" { return "f"; } /* MPDL update */ +"\u03a6" { return "*f"; } /* MPDL update */ +"\u03c7" { return "x"; } /* MPDL update */ +"\u03a7" { return "*x"; } /* MPDL update */ +"\u03c8" { return "y"; } /* MPDL update */ +"\u03a8" { return "*y"; } /* MPDL update */ +"\u03c9" { return "w"; } /* MPDL update */ +"\u03a9" { return "*w"; } /* MPDL update */ + +[\&_]"vert;" { return "|"; } +[\&_]"lpar;" { return "("; } +[\&_]"rpar;" { return ")"; } +[\_\&]"lt;" { return "<"; } +[\_\&]"gt;" { return ">"; } +"'" { return "'"; } /* MPDL update */ + +"&"[a-zA-Z]+";" { return yytext(); } + +. 
{ return yytext(); } +\n { return yytext(); } \ No newline at end of file diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2BetacodeLex.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2BetacodeLex.java Tue Feb 08 14:54:09 2011 +0100 @@ -0,0 +1,1866 @@ +/* The following code was generated by JFlex 1.4.3 on 14.12.10 15:03 */ + +package de.mpg.mpiwg.berlin.mpdl.lt.general; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 14.12.10 15:03 from the specification file + * /Users/jwillenborg/test/jflex/Unicode2Betacode.lex + */ +public class Unicode2BetacodeLex { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int YYINITIAL = 0; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\0\26\0\1\u0118\1\0\1\u0130\2\0\1\u0113\4\u011c\2\0"+ + "\1\u0112\1\11\1\u011c\1\u0131\2\u011c\1\u0132\5\u011c\1\u0133\1\0\1\u0116"+ + "\1\1\1\u011c\1\2\1\u011b\1\0\5\u0134\1\6\1\u0134\1\4\22\u0134"+ + "\1\u011d\1\0\1\u011a\1\0\1\u012a\1\0\1\u012f\3\u0135\1\u012c\1\7"+ + "\1\u0114\1\5\3\u0135\1\u0119\3\u0135\1\u012e\1\u0135\1\u012d\1\u0135\1\u0115"+ + "\1\3\1\u012b\4\u0135\2\0\1\u0117\71\0\1\12\u0248\0\1\344\1\345"+ + "\1\351\1\0\1\346\1\0\1\347\1\0\1\350\12\0\1\352\1\353"+ + "\16\0\1\354\41\0\1\355\113\0\1\357\1\361\1\363\1\365\1\367"+ + 
"\1\371\1\373\1\375\1\377\1\u0101\1\u0103\1\u0105\1\u0107\1\u0109\1\u010b"+ + "\1\u010d\1\u010f\1\0\1\10\1\u011f\1\u0121\1\u0123\1\u0125\1\u0127\1\u0129"+ + "\7\0\1\356\1\360\1\362\1\364\1\366\1\370\1\372\1\374\1\376"+ + "\1\u0100\1\u0102\1\u0104\1\u0106\1\u0108\1\u010a\1\u010c\1\u010e\1\u0111\1\u0110"+ + "\1\u011e\1\u0120\1\u0122\1\u0124\1\u0126\1\u0128\u1b36\0\1\13\1\14\1\15"+ + "\1\16\1\17\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+ + "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37\1\40\2\0"+ + "\1\41\1\42\1\43\1\44\1\45\1\46\2\0\1\47\1\50\1\51"+ + "\1\52\1\53\1\54\1\55\1\56\1\57\1\60\1\61\1\62\1\63"+ + "\1\64\1\65\1\66\1\67\1\70\1\71\1\72\1\73\1\74\1\75"+ + "\1\76\1\77\1\100\1\101\1\102\1\103\1\104\1\105\1\106\1\107"+ + "\1\110\1\111\1\112\1\113\1\114\2\0\1\115\1\116\1\117\1\120"+ + "\1\121\1\122\2\0\1\123\1\124\1\125\1\126\1\127\1\130\1\131"+ + "\1\132\1\0\1\133\1\0\1\134\1\0\1\135\1\0\1\136\1\137"+ + "\1\140\1\141\1\142\1\143\1\144\1\145\1\146\1\147\1\150\1\151"+ + "\1\152\1\153\1\154\1\155\1\156\1\157\1\160\1\161\1\162\1\163"+ + "\1\164\1\165\1\166\1\167\1\170\1\171\1\172\1\173\1\174\2\0"+ + "\1\175\1\176\1\177\1\200\1\201\1\202\1\203\1\204\1\205\1\206"+ + "\1\207\1\210\1\211\1\212\1\213\1\214\1\215\1\216\1\217\1\220"+ + "\1\221\1\222\1\223\1\224\1\225\1\226\1\227\1\230\1\231\1\232"+ + "\1\233\1\234\1\235\1\236\1\237\1\240\1\241\1\242\1\243\1\244"+ + "\1\245\1\246\1\247\1\250\1\251\1\252\1\253\1\254\1\255\1\256"+ + "\1\257\1\260\1\261\1\0\1\262\1\263\1\264\1\265\1\266\1\267"+ + "\1\270\5\0\1\271\1\272\1\273\1\0\1\274\1\275\1\276\1\277"+ + "\1\300\1\301\1\302\3\0\1\303\1\304\1\305\1\306\2\0\1\307"+ + "\1\310\1\311\1\312\1\313\1\314\4\0\1\315\1\316\1\317\1\320"+ + "\1\321\1\322\1\323\1\324\1\325\1\326\1\327\1\330\1\331\5\0"+ + "\1\332\1\333\1\334\1\0\1\340\1\341\1\342\1\343\1\335\1\336"+ + "\1\337\ue003\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA 
states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\1\0\2\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+ + "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+ + "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37"+ + "\1\40\1\41\1\42\1\43\1\44\1\45\1\46\1\47"+ + "\1\50\1\51\1\52\1\53\1\54\1\55\1\56\1\57"+ + "\1\60\1\61\1\62\1\63\1\64\1\65\1\66\1\67"+ + "\1\70\1\71\1\72\1\73\1\74\1\75\1\76\1\77"+ + "\1\100\1\101\1\102\1\103\1\104\1\105\1\106\1\107"+ + "\1\110\1\111\1\112\1\113\1\114\1\115\1\116\1\117"+ + "\1\120\1\121\1\122\1\123\1\124\1\125\1\126\1\127"+ + "\1\130\1\131\1\132\1\133\1\134\1\135\1\136\1\137"+ + "\1\140\1\141\1\142\1\143\1\144\1\145\1\146\1\147"+ + "\1\150\1\151\1\152\1\153\1\154\1\155\1\156\1\157"+ + "\1\160\1\161\1\162\1\163\1\164\1\165\1\166\1\167"+ + "\1\170\1\171\1\172\1\173\1\174\1\175\1\176\1\177"+ + "\1\200\1\201\1\202\1\203\1\204\1\205\1\206\1\207"+ + "\1\210\1\211\1\212\1\213\1\214\1\215\1\216\1\217"+ + "\1\220\1\221\1\222\1\223\1\224\1\225\1\226\1\227"+ + "\1\230\1\231\1\232\1\233\1\234\1\235\1\236\1\237"+ + "\1\240\1\241\1\242\1\243\1\244\1\245\1\246\1\247"+ + "\1\250\1\251\1\252\1\253\1\254\1\255\1\256\1\257"+ + "\1\260\1\261\1\262\1\263\1\264\1\265\1\266\1\267"+ + "\1\270\1\271\1\272\1\273\1\274\1\275\1\276\1\277"+ + "\1\300\1\301\1\302\1\303\1\304\1\305\1\306\1\307"+ + "\1\310\1\311\1\312\1\313\1\314\1\315\1\316\1\317"+ + "\1\320\1\321\1\322\1\323\1\324\1\325\1\326\1\327"+ + "\1\330\1\331\1\332\1\333\1\334\1\335\1\336\1\337"+ + "\1\340\1\341\1\342\1\343\1\344\1\345\1\346\1\347"+ + "\1\350\1\351\1\352\1\353\1\354\1\355\1\356\1\357"+ + "\1\360\1\361\1\362\1\363\1\364\1\365\1\366\1\367"+ + "\1\370\1\371\1\372\1\373\1\374\1\375\1\376\1\377"+ + "\1\u0100\1\u0101\1\u0102\1\u0103\1\u0104\1\u0105\1\u0106\1\u0107"+ + "\1\u0108\1\u0109\1\u010a\1\u010b\1\u010c\1\u010d\1\u010e\2\1"+ + 
"\1\u010f\1\u0110\1\u0111\1\u0112\1\u0113\1\u0114\1\u0115\1\u0116"+ + "\1\u0117\1\u0118\1\u0119\1\u011a\1\1\3\0\1\u011b\1\0"+ + "\1\u011b\33\0\1\u011c\1\u011d\17\0\1\u011e"; + + private static int [] zzUnpackAction() { + int [] result = new int[338]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\u0136\0\u026c\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + 
"\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u03a2"+ + "\0\u04d8\0\u060e\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0744\0\u087a"+ + "\0\u09b0\0\u0ae6\0\u0136\0\u0c1c\0\u0d52\0\u0e88\0\u0fbe\0\u10f4"+ + "\0\u122a\0\u1360\0\u1496\0\u15cc\0\u1702\0\u1838\0\u196e\0\u1aa4"+ + "\0\u1bda\0\u1d10\0\u1e46\0\u1f7c\0\u20b2\0\u21e8\0\u231e\0\u2454"+ + "\0\u258a\0\u26c0\0\u27f6\0\u292c\0\u2a62\0\u2b98\0\u2cce\0\u2e04"+ + "\0\u0136\0\u0136\0\u2f3a\0\u3070\0\u31a6\0\u32dc\0\u3412\0\u3548"+ + "\0\u367e\0\u37b4\0\u38ea\0\u3a20\0\u3b56\0\u3c8c\0\u3dc2\0\u3ef8"+ + "\0\u402e\0\u0136"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[338]; + int offset 
= 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\2\1\3\2\2\1\4\1\5\1\6\1\7\1\10"+ + "\1\11\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+ + "\1\21\1\22\1\23\1\24\1\25\1\26\1\27\1\30"+ + "\1\31\1\32\1\33\1\34\1\35\1\36\1\37\1\40"+ + "\1\41\1\42\1\43\1\44\1\45\1\46\1\47\1\50"+ + "\1\51\1\52\1\53\1\54\1\55\1\56\1\57\1\60"+ + "\1\61\1\62\1\63\1\64\1\65\1\66\1\67\1\70"+ + "\1\71\1\72\1\73\1\74\1\75\1\76\1\77\1\100"+ + "\1\101\1\102\1\103\1\104\1\105\1\106\1\107\1\110"+ + "\1\111\1\112\1\113\1\114\1\115\1\116\1\117\1\120"+ + "\1\121\1\122\1\123\1\124\1\125\1\126\1\127\1\130"+ + "\1\131\1\132\1\133\1\134\1\135\1\136\1\137\1\140"+ + "\1\141\1\142\1\143\1\144\1\145\1\146\1\147\1\150"+ + "\1\151\1\152\1\153\1\154\1\155\1\156\1\157\1\160"+ + "\1\161\1\162\1\163\1\164\1\165\1\166\1\167\1\170"+ + "\1\171\1\172\1\173\1\174\1\175\1\176\1\177\1\200"+ + "\1\201\1\202\1\203\1\204\1\205\1\206\1\207\1\210"+ + "\1\211\1\212\1\213\1\214\1\215\1\216\1\217\1\220"+ + "\1\221\1\222\1\223\1\224\1\225\1\226\1\227\1\230"+ + "\1\231\1\232\1\233\1\234\1\235\1\236\1\237\1\240"+ + "\1\241\1\242\1\243\1\244\1\245\1\246\1\247\1\250"+ + "\1\251\1\252\1\253\1\254\1\255\1\256\1\257\1\260"+ + "\1\261\1\262\1\263\1\264\1\265\1\266\1\267\1\270"+ + "\1\271\1\272\1\273\1\274\1\275\1\276\1\277\1\300"+ + "\1\301\1\302\1\303\1\304\1\305\1\306\1\307\1\310"+ + "\1\311\1\312\1\313\1\314\1\315\1\316\1\317\1\320"+ + "\1\321\1\322\1\323\1\324\1\325\1\326\1\327\1\330"+ + 
"\1\331\1\332\1\333\1\334\1\335\1\336\1\337\1\340"+ + "\1\341\1\342\1\343\1\344\1\345\1\346\1\347\1\350"+ + "\1\351\1\352\1\353\1\354\1\355\1\356\1\357\1\360"+ + "\1\361\1\362\1\363\1\364\1\365\1\366\1\367\1\370"+ + "\1\371\1\372\1\373\1\374\1\375\1\376\1\377\1\u0100"+ + "\1\u0101\1\u0102\1\u0103\1\u0104\1\u0105\1\u0106\1\u0107\1\u0108"+ + "\1\u0109\1\u010a\1\u010b\1\u010c\1\u010d\1\u010e\1\u010f\1\u0110"+ + "\1\u0111\1\2\1\u0112\12\2\1\u0113\1\u0114\1\u0115\1\u0116"+ + "\1\u0117\1\u0118\1\u0119\1\u011a\1\u011b\1\u011c\1\u011d\1\u011e"+ + "\1\u011f\13\2\u0136\0\2\u0120\1\0\u0133\u0120\u0113\0\1\u0121"+ + "\6\0\1\u0122\2\0\1\u0122\30\0\3\u0123\1\0\1\u0123"+ + "\1\0\1\u0123\1\0\u010a\u0123\1\u0124\1\u0125\2\0\3\u0123"+ + "\1\0\1\u0123\1\u0126\2\0\15\u0123\5\0\1\u0123\3\0"+ + "\1\u0123\4\0\5\u0127\u010c\0\1\u0128\1\u0127\3\0\1\u0129"+ + "\21\0\1\u012a\1\u0127\1\u012b\2\u0127\1\u012c\3\0\2\u0127"+ + "\u0114\0\1\u012d\4\0\1\u012e\21\0\1\u012f\1\0\1\u0130"+ + "\13\0\1\u0131\u0246\0\1\u0132\44\0\1\u0123\1\0\1\u0123"+ + "\1\0\1\u0123\u010a\0\1\u0123\1\0\2\u0123\2\0\2\u0123"+ + "\1\0\2\u0123\16\0\5\u0123\1\0\3\u0123\1\0\1\u0123"+ + "\u0112\0\1\u0123\u013c\0\1\u0133\34\0\3\u0123\1\0\1\u0123"+ + "\1\0\1\u0123\1\0\u010a\u0123\1\0\1\u0123\2\0\3\u0123"+ + "\1\0\1\u0123\3\0\15\u0123\5\0\1\u0123\3\0\1\u0123"+ + "\4\0\5\u0127\u010c\0\2\u0127\1\2\2\0\1\u0127\21\0"+ + "\5\u0127\4\0\2\u0127\3\0\5\u0127\u010c\0\1\u0127\1\u0134"+ + "\1\2\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127\3\0"+ + "\5\u0127\u010c\0\1\u0127\1\u0135\1\2\2\0\1\u0127\21\0"+ + "\3\u0127\1\u0136\1\u0127\4\0\2\u0127\3\0\5\u0127\u010c\0"+ + "\2\u0127\1\2\2\0\1\u0127\21\0\1\u0127\1\u0137\3\u0127"+ + "\4\0\2\u0127\3\0\5\u0127\u010c\0\2\u0127\1\2\2\0"+ + "\1\u0127\21\0\3\u0127\1\u0138\1\u0127\4\0\2\u0127\u0131\0"+ + "\1\u0139\u0119\0\1\u013a\u0135\0\1\u013b\30\0\1\u013c\u0133\0"+ + "\1\u013d\u0137\0\1\u013e\11\0\1\2\1\u0131\u0247\0\1\u013f"+ + "\u0135\0\1\u0140\43\0\5\u0127\u010c\0\2\u0127\1\u0141\2\0"+ + 
"\1\u0127\21\0\5\u0127\4\0\2\u0127\3\0\5\u0127\u010c\0"+ + "\2\u0127\1\u0142\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127"+ + "\3\0\5\u0127\u010c\0\2\u0127\1\2\2\0\1\u0127\21\0"+ + "\4\u0127\1\u0143\4\0\2\u0127\3\0\5\u0127\u010c\0\2\u0127"+ + "\1\2\2\0\1\u0127\21\0\2\u0127\1\u0144\2\u0127\4\0"+ + "\2\u0127\3\0\5\u0127\u010c\0\2\u0127\1\2\2\0\1\u0127"+ + "\21\0\4\u0127\1\u0145\4\0\2\u0127\u0132\0\1\u0146\u0119\0"+ + "\1\u0141\u0135\0\1\u0142\u014e\0\1\u0147\u0133\0\1\u0148\u0137\0"+ + "\1\u0149\u011c\0\1\u014a\u0135\0\1\u0123\42\0\5\u0127\u010c\0"+ + "\2\u0127\1\2\2\0\1\u0127\21\0\2\u0127\1\u014b\2\u0127"+ + "\4\0\2\u0127\3\0\5\u0127\u010c\0\1\u0127\1\u014c\1\2"+ + "\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127\3\0\5\u0127"+ + "\u010c\0\2\u0127\1\2\2\0\1\u0127\21\0\2\u0127\1\u014d"+ + "\2\u0127\4\0\2\u0127\u0133\0\1\u014e\u012f\0\1\u014f\u011d\0"+ + "\1\u0150\u014d\0\1\u0151\u011f\0\1\u0122\41\0\5\u0127\u010c\0"+ + "\2\u0127\1\353\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127"+ + "\3\0\5\u0127\u010c\0\2\u0127\1\355\2\0\1\u0127\21\0"+ + "\5\u0127\4\0\2\u0127\3\0\5\u0127\u010c\0\2\u0127\1\352"+ + "\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127\u0116\0\1\u0152"+ + "\u0135\0\1\353\u0135\0\1\355\u0135\0\1\352\37\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[16740]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + 
"Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\1\0\1\11\1\1\u010c\11\3\1\14\11\1\1\3\0"+ + "\1\11\1\0\1\1\33\0\2\11\17\0\1\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[338]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int 
yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public Unicode2BetacodeLex(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public Unicode2BetacodeLex(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 724) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? 
*/ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. 
+ */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster. + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occurred while scanning. + * + * In a well-formed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the error message to report; the message is thrown as a java.lang.Error + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by the next call of the scanning method. + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs.
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 266: + { return "p"; + } + case 287: break; + case 102: + { return "*(w"; + } + case 288: break; + case 20: + { return "*(\\a"; + } + case 289: break; + case 21: + { return "*)/a"; + } + case 290: break; + case 181: + { return "*a/"; + } + case 291: break; + case 237: + { return "*a"; + } + case 292: break; + case 260: + { return "n"; + } + case 293: break; + case 89: + { return "*(u"; + } + case 294: break; + case 16: + { return "a(="; + } + case 295: break; + case 30: + { return "e(/"; + } + case 296: break; + case 195: + { return "i+\\"; + } + case 297: break; + case 222: + { return "w="; + } + case 298: break; + case 210: + { return "u+="; + } + case 299: break; + case 99: + { return "w)="; + } + case 300: break; + case 256: + { return "l"; + } + case 301: break; + case 205: + { return "u+\\"; + } + case 302: break; + case 23: + { return "*)=a"; + } + case 303: break; + case 225: + { return "*o/"; + } + case 304: break; + case 44: + { return "h(="; + } + case 305: break; + case 3: + { return "j"; + } + case 306: break; + case 103: + { return "*)\\w"; + } + case 307: break; + case 152: + { return "*(/|h"; + } + case 308: break; + case 165: + { return "*)\\|w"; + } + case 309: break; + case 248: + { return "h"; + } + case 310: break; + case 76: + { return "*(o"; + } + case 311: break; + case 159: + { return "w)/|"; + } + case 312: break; + case 178: + { return "*a^"; + } + case 313: break; + case 141: + { return "h)\\|"; + } + case 314: break; + case 106: + { return "*(/w"; + } + case 315: break; + case 275: + { return "f"; + } + case 316: break; + case 227: + { return "/"; + } + case 317: break; + case 91: + { return "*(/u"; + } + case 318: break; + case 242: + { return "d"; + } + case 319: break; + case 161: + { return "w)=|"; + } + case 320: break; + case 57: + { return "i)/"; + } + case 321: break; + case 154: + { return "*(=|h"; + } + case 322: break; + case 95: + { return "w)\\"; + } + case 323: break; + case 108: + 
{ return "*(=w"; + } + case 324: break; + case 116: + { return "i/"; + } + case 325: break; + case 238: + { return "b"; + } + case 326: break; + case 207: + { return "r)"; + } + case 327: break; + case 147: + { return "*)|h"; + } + case 328: break; + case 62: + { return "*(i"; + } + case 329: break; + case 230: + { return "+"; + } + case 330: break; + case 77: + { return "*)\\o"; + } + case 331: break; + case 166: + { return "*(\\|w"; + } + case 332: break; + case 71: + { return "o)\\"; + } + case 333: break; + case 92: + { return "*(=u"; + } + case 334: break; + case 232: + { return ")"; + } + case 335: break; + case 14: + { return "a(/"; + } + case 336: break; + case 122: + { return "w/"; + } + case 337: break; + case 206: + { return "u+/"; + } + case 338: break; + case 80: + { return "*(/o"; + } + case 339: break; + case 97: + { return "w)/"; + } + case 340: break; + case 123: + { return "a)|"; + } + case 341: break; + case 229: + { return "^"; + } + case 342: break; + case 32: + { return "*(e"; + } + case 343: break; + case 286: + { return "'"; + } + case 344: break; + case 42: + { return "h(/"; + } + case 345: break; + case 53: + { return "i)"; + } + case 346: break; + case 174: + { return "a|"; + } + case 347: break; + case 63: + { return "*)\\i"; + } + case 348: break; + case 139: + { return "h)|"; + } + case 349: break; + case 193: + { return "i^"; + } + case 350: break; + case 18: + { return "*(a"; + } + case 351: break; + case 74: + { return "o(/"; + } + case 352: break; + case 93: + { return "w)"; + } + case 353: break; + case 66: + { return "*(/i"; + } + case 354: break; + case 101: + { return "*)w"; + } + case 355: break; + case 7: + { return "!"; + } + case 356: break; + case 33: + { return "*)\\e"; + } + case 357: break; + case 15: + { return "a)="; + } + case 358: break; + case 29: + { return "e)/"; + } + case 359: break; + case 68: + { return "*(=i"; + } + case 360: break; + case 125: + { return "a)\\|"; + } + case 361: break; + case 36: + { return 
"*(/e"; + } + case 362: break; + case 115: + { return "i\\"; + } + case 363: break; + case 201: + { return "*i\\"; + } + case 364: break; + case 112: + { return "e/"; + } + case 365: break; + case 218: + { return "w/|"; + } + case 366: break; + case 176: + { return "a="; + } + case 367: break; + case 19: + { return "*)\\a"; + } + case 368: break; + case 43: + { return "h)="; + } + case 369: break; + case 133: + { return "*)\\|a"; + } + case 370: break; + case 270: + { return "s1"; + } + case 371: break; + case 247: + { return "*z"; + } + case 372: break; + case 204: + { return "u_"; + } + case 373: break; + case 143: + { return "h)/|"; + } + case 374: break; + case 22: + { return "*(/a"; + } + case 375: break; + case 82: + { return "u("; + } + case 376: break; + case 75: + { return "*)o"; + } + case 377: break; + case 223: + { return "w=|"; + } + case 378: break; + case 278: + { return "*x"; + } + case 379: break; + case 121: + { return "w\\"; + } + case 380: break; + case 200: + { return "*i_"; + } + case 381: break; + case 219: + { return "*w\\"; + } + case 382: break; + case 25: + { return "e)"; + } + case 383: break; + case 145: + { return "h)=|"; + } + case 384: break; + case 151: + { return "*)/|h"; + } + case 385: break; + case 24: + { return "*(=a"; + } + case 386: break; + case 4: + { return "*v"; + } + case 387: break; + case 192: + { return "*h|"; + } + case 388: break; + case 39: + { return "h)\\"; + } + case 389: break; + case 272: + { return "*t"; + } + case 390: break; + case 134: + { return "*(\\|a"; + } + case 391: break; + case 214: + { return "*u/"; + } + case 392: break; + case 61: + { return "*)i"; + } + case 393: break; + case 269: + { return "*r"; + } + case 394: break; + case 160: + { return "w(/|"; + } + case 395: break; + case 13: + { return "a)/"; + } + case 396: break; + case 153: + { return "*)=|h"; + } + case 397: break; + case 267: + { return "*p"; + } + case 398: break; + case 111: + { return "e\\"; + } + case 399: break; + case 88: 
+ { return "u(="; + } + case 400: break; + case 31: + { return "*)e"; + } + case 401: break; + case 188: + { return "*e\\"; + } + case 402: break; + case 110: + { return "a/"; + } + case 403: break; + case 162: + { return "w(=|"; + } + case 404: break; + case 41: + { return "h)/"; + } + case 405: break; + case 261: + { return "*n"; + } + case 406: break; + case 226: + { return "\\"; + } + case 407: break; + case 96: + { return "w(\\"; + } + case 408: break; + case 148: + { return "*(|h"; + } + case 409: break; + case 257: + { return "*l"; + } + case 410: break; + case 211: + { return "*u^"; + } + case 411: break; + case 198: + { return "i+="; + } + case 412: break; + case 279: + { return "y"; + } + case 413: break; + case 17: + { return "*)a"; + } + case 414: break; + case 73: + { return "o)/"; + } + case 415: break; + case 72: + { return "o(\\"; + } + case 416: break; + case 118: + { return "o/"; + } + case 417: break; + case 168: + { return "*(/|w"; + } + case 418: break; + case 2: + { return "*j"; + } + case 419: break; + case 281: + { return "w"; + } + case 420: break; + case 48: + { return "*(\\h"; + } + case 421: break; + case 49: + { return "*)/h"; + } + case 422: break; + case 9: + { return "a)"; + } + case 423: break; + case 216: + { return "w\\|"; + } + case 424: break; + case 249: + { return "*h"; + } + case 425: break; + case 273: + { return "u"; + } + case 426: break; + case 171: + { return "a^"; + } + case 427: break; + case 175: + { return "a/|"; + } + case 428: break; + case 285: + { return "<"; + } + case 429: break; + case 276: + { return "*f"; + } + case 430: break; + case 38: + { return "h("; + } + case 431: break; + case 283: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { return "s"; + } + case 432: break; + case 51: + { return "*)=h"; + } + case 433: break; + case 127: + { return "a)/|"; + } + case 434: break; + case 170: + { return "*(=|w"; + } + case 435: break; + case 69: + { return "o)"; + } + case 
436: break; + case 243: + { return "*d"; + } + case 437: break; + case 185: + { return "h/|"; + } + case 438: break; + case 250: + { return "q"; + } + case 439: break; + case 163: + { return "*)|w"; + } + case 440: break; + case 8: + { return ":"; + } + case 441: break; + case 177: + { return "a=|"; + } + case 442: break; + case 239: + { return "*b"; + } + case 443: break; + case 158: + { return "w(\\|"; + } + case 444: break; + case 109: + { return "a\\"; + } + case 445: break; + case 264: + { return "o"; + } + case 446: break; + case 129: + { return "a)=|"; + } + case 447: break; + case 86: + { return "u(/"; + } + case 448: break; + case 180: + { return "*a\\"; + } + case 449: break; + case 11: + { return "a)\\"; + } + case 450: break; + case 187: + { return "h=|"; + } + case 451: break; + case 258: + { return "m"; + } + case 452: break; + case 191: + { return "*h/"; + } + case 453: break; + case 113: + { return "h\\"; + } + case 454: break; + case 190: + { return "*h\\"; + } + case 455: break; + case 196: + { return "i+/"; + } + case 456: break; + case 254: + { return "k"; + } + case 457: break; + case 215: + { return "*(r"; + } + case 458: break; + case 27: + { return "e)\\"; + } + case 459: break; + case 117: + { return "o\\"; + } + case 460: break; + case 252: + { return "i"; + } + case 461: break; + case 224: + { return "*o\\"; + } + case 462: break; + case 144: + { return "h(/|"; + } + case 463: break; + case 179: + { return "*a_"; + } + case 464: break; + case 221: + { return "*w|"; + } + case 465: break; + case 240: + { return "g"; + } + case 466: break; + case 55: + { return "i)\\"; + } + case 467: break; + case 209: + { return "u="; + } + case 468: break; + case 87: + { return "u)="; + } + case 469: break; + case 244: + { return "e"; + } + case 470: break; + case 146: + { return "h(=|"; + } + case 471: break; + case 83: + { return "u)\\"; + } + case 472: break; + case 40: + { return "h(\\"; + } + case 473: break; + case 262: + { return "c"; + } + case 
474: break; + case 136: + { return "*(/|a"; + } + case 475: break; + case 236: + { return "a"; + } + case 476: break; + case 208: + { return "r("; + } + case 477: break; + case 46: + { return "*(h"; + } + case 478: break; + case 228: + { return "_"; + } + case 479: break; + case 183: + { return "h\\|"; + } + case 480: break; + case 233: + { return "("; + } + case 481: break; + case 138: + { return "*(=|a"; + } + case 482: break; + case 194: + { return "i_"; + } + case 483: break; + case 167: + { return "*)/|w"; + } + case 484: break; + case 54: + { return "i("; + } + case 485: break; + case 131: + { return "*)|a"; + } + case 486: break; + case 47: + { return "*)\\h"; + } + case 487: break; + case 184: + { return "h|"; + } + case 488: break; + case 149: + { return "*)\\|h"; + } + case 489: break; + case 94: + { return "w("; + } + case 490: break; + case 50: + { return "*(/h"; + } + case 491: break; + case 120: + { return "u/"; + } + case 492: break; + case 85: + { return "u)/"; + } + case 493: break; + case 169: + { return "*)=|w"; + } + case 494: break; + case 156: + { return "w(|"; + } + case 495: break; + case 202: + { return "*i/"; + } + case 496: break; + case 52: + { return "*(=h"; + } + case 497: break; + case 128: + { return "a(/|"; + } + case 498: break; + case 157: + { return "w)\\|"; + } + case 499: break; + case 60: + { return "i(="; + } + case 500: break; + case 164: + { return "*(|w"; + } + case 501: break; + case 150: + { return "*(\\|h"; + } + case 502: break; + case 220: + { return "*w/"; + } + case 503: break; + case 186: + { return "h="; + } + case 504: break; + case 81: + { return "u)"; + } + case 505: break; + case 130: + { return "a(=|"; + } + case 506: break; + case 280: + { return "*y"; + } + case 507: break; + case 203: + { return "u^"; + } + case 508: break; + case 104: + { return "*(\\w"; + } + case 509: break; + case 12: + { return "a(\\"; + } + case 510: break; + case 105: + { return "*)/w"; + } + case 511: break; + case 182: + { return 
"*a|"; + } + case 512: break; + case 282: + { return "*w"; + } + case 513: break; + case 199: + { return "*i^"; + } + case 514: break; + case 100: + { return "w(="; + } + case 515: break; + case 90: + { return "*(\\u"; + } + case 516: break; + case 26: + { return "e("; + } + case 517: break; + case 1: + { return yytext(); + } + case 518: break; + case 142: + { return "h(\\|"; + } + case 519: break; + case 274: + { return "*u"; + } + case 520: break; + case 28: + { return "e(\\"; + } + case 521: break; + case 107: + { return "*)=w"; + } + case 522: break; + case 173: + { return "a\\|"; + } + case 523: break; + case 6: + { return "*s"; + } + case 524: break; + case 45: + { return "*)h"; + } + case 525: break; + case 251: + { return "*q"; + } + case 526: break; + case 119: + { return "u\\"; + } + case 527: break; + case 56: + { return "i(\\"; + } + case 528: break; + case 213: + { return "*u\\"; + } + case 529: break; + case 284: + { return ">"; + } + case 530: break; + case 78: + { return "*(\\o"; + } + case 531: break; + case 189: + { return "*e/"; + } + case 532: break; + case 79: + { return "*)/o"; + } + case 533: break; + case 265: + { return "*o"; + } + case 534: break; + case 135: + { return "*)/|a"; + } + case 535: break; + case 84: + { return "u(\\"; + } + case 536: break; + case 235: + { return "|"; + } + case 537: break; + case 58: + { return "i(/"; + } + case 538: break; + case 259: + { return "*m"; + } + case 539: break; + case 212: + { return "*u_"; + } + case 540: break; + case 114: + { return "h/"; + } + case 541: break; + case 246: + { return "z"; + } + case 542: break; + case 255: + { return "*k"; + } + case 543: break; + case 277: + { return "x"; + } + case 544: break; + case 64: + { return "*(\\i"; + } + case 545: break; + case 65: + { return "*)/i"; + } + case 546: break; + case 137: + { return "*)=|a"; + } + case 547: break; + case 253: + { return "*i"; + } + case 548: break; + case 98: + { return "w(/"; + } + case 549: break; + case 5: + { 
return "v"; + } + case 550: break; + case 124: + { return "a(|"; + } + case 551: break; + case 234: + { return "?"; + } + case 552: break; + case 172: + { return "a_"; + } + case 553: break; + case 217: + { return "w|"; + } + case 554: break; + case 10: + { return "a("; + } + case 555: break; + case 241: + { return "*g"; + } + case 556: break; + case 155: + { return "w)|"; + } + case 557: break; + case 37: + { return "h)"; + } + case 558: break; + case 271: + { return "t"; + } + case 559: break; + case 231: + { return "="; + } + case 560: break; + case 67: + { return "*)=i"; + } + case 561: break; + case 34: + { return "*(\\e"; + } + case 562: break; + case 35: + { return "*)/e"; + } + case 563: break; + case 140: + { return "h(|"; + } + case 564: break; + case 132: + { return "*(|a"; + } + case 565: break; + case 245: + { return "*e"; + } + case 566: break; + case 268: + { return "r"; + } + case 567: break; + case 59: + { return "i)="; + } + case 568: break; + case 70: + { return "o("; + } + case 569: break; + case 126: + { return "a(\\|"; + } + case 570: break; + case 263: + { return "*c"; + } + case 571: break; + case 197: + { return "i="; + } + case 572: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2Buckwalter.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2Buckwalter.lex Tue Feb 08 14:54:09 2011 +0100 @@ -0,0 +1,121 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +%% +%{ + /* + * Unicode (Arabic) to Buckwalter transliteration conversion + */ + +%} + +%class Unicode2BuckwalterLex +%public +%type java.lang.String +%unicode +%% + + +"<"[^>]+">" { return yytext(); } + +"\u0621" { return "'"; } /* Hamza */ +"\u0622" { return "|"; } /* ALEF WITH MADDA ABOVE from AraMorph */
+"\u0623" { return ">"; } /* Alif + HamzaAbove */ +"\u0624" { return "&"; } /* Waw + HamzaAbove */ +"\u0625" { return "<"; } /* Alif + HamzaBelow */ +"\u0626" { return "}"; } /* Ya + HamzaAbove */ +"\u0627" { return "A"; } /* Alif */ +"\u0628" { return "b"; } /* Ba */ +"\u0629" { return "p"; } /* TaMarbuta */ +"\u062A" { return "t"; } /* Ta */ +"\u062B" { return "v"; } /* Tha */ +"\u062C" { return "j"; } /* Jeem */ +"\u062D" { return "H"; } /* HHa */ +"\u062E" { return "x"; } /* Kha */ +"\u062F" { return "d"; } /* Dal */ +"\u0630" { return "*"; } /* Thal */ +"\u0631" { return "r"; } /* Ra */ +"\u0632" { return "z"; } /* Zain */ +"\u0633" { return "s"; } /* Seen */ +"\u0634" { return "$"; } /* Sheen */ +"\u0635" { return "S"; } /* Sad */ +"\u0636" { return "D"; } /* DDad */ +"\u0637" { return "T"; } /* TTa */ +"\u0638" { return "Z"; } /* DTha */ +"\u0639" { return "E"; } /* Ain */ +"\u063A" { return "g"; } /* Ghain */ + +"\u0640" { return "_"; } /* Tatweel */ +"\u0641" { return "f"; } /* Fa */ +"\u0642" { return "q"; } /* Qaf */ +"\u0643" { return "k"; } /* Kaf */ +"\u0644" { return "l"; } /* Lam */ +"\u0645" { return "m"; } /* Meem */ +"\u0646" { return "n"; } /* Noon */ +"\u0647" { return "h"; } /* Ha */ +"\u0648" { return "w"; } /* Waw */ +"\u0649" { return "Y"; } /* AlifMaksura */ +"\u064A" { return "y"; } /* Ya */ +"\u064B" { return "F"; } /* Fathatan */ +"\u064C" { return "N"; } /* Dammatan */ +"\u064D" { return "K"; } /* Kasratan */ +"\u064E" { return "a"; } /* Fatha */ +"\u064F" { return "u"; } /* Damma */ +"\u0650" { return "i"; } /* Kasra */ +"\u0651" { return "~"; } /* Shadda */ +"\u0652" { return "o"; } /* Sukun */ +"\u0653" { return "^"; } /* Maddah */ +"\u0654" { return "#"; } /* HamzaAbove */ + +"\u0670" { return "`"; } /* AlifKhanjareeya */ +"\u0671" { return "{"; } /* Alif + HamzatWasl */ + +"\u067E" { return "P"; } /* PEH from AraMorph */ +"\u0686" { return "J"; } /* TCHEH from AraMorph */ +"\u06A4" { return "V"; } /* VEH from AraMorph */ +"\u06AF" { return "G"; } /* GAF
from AraMorph */ +"\u0698" { return "R"; } /* JEH from AraMorph */ +"\u061F" { return "?"; } /* QUESTION MARK from AraMorph */ + +"\u06DC" { return ":"; } /* SmallHighSeen */ +"\u06DF" { return "@"; } /* SmallHighRoundedZero */ + +"\u06E2" { return "["; } /* SmallHighMeemIsolatedForm */ +"\u06E3" { return ";"; } /* SmallLowSeen */ +"\u06E5" { return ","; } /* SmallWaw */ +"\u06E6" { return "."; } /* SmallYa */ +"\u06E8" { return "!"; } /* SmallHighNoon */ +"\u06EA" { return "-"; } /* EmptyCentreLowStop */ +"\u06EB" { return "+"; } /* EmptyCentreHighStop */ +"\u06EC" { return "%"; } /* RoundedHighStopWithFilledCentre */ +"\u06ED" { return "]"; } /* SmallLowMeem */ + +[\&_]"vert;" { return "|"; } +[\&_]"lpar;" { return "("; } +[\&_]"rpar;" { return ")"; } +[\_\&]"lt;" { return "<"; } +[\_\&]"gt;" { return ">"; } +"'" { return "'"; } + +"&"[a-zA-Z]+";" { return yytext(); } + +. { return yytext(); } +\n { return yytext(); } + +/* disabled: causes problems */ +/* "\u06E0" { return "\\""; } SmallHighUprightRectangularZero */ + + +/* disabled: duplicate entries (their output symbols are already produced by rules above) */ +/* "\u060C" { return ","; } COMMA from AraMorph */ +/* "\u061B" { return ";"; } SEMICOLON from AraMorph */ + +/* not contained in Buckwalter */ +/* \u0679 : ARABIC LETTER TTEH */ +/* \u0688 : ARABIC LETTER DDAL */ +/* \u06A9 : ARABIC LETTER KEHEH */ +/* \u0691 : ARABIC LETTER RREH */ +/* \u06BA : ARABIC LETTER NOON GHUNNA */ +/* \u06BE : ARABIC LETTER HEH DOACHASHMEE */ +/* \u06C1 : ARABIC LETTER HEH GOAL */ +/* \u06D2 : ARABIC LETTER YEH BARREE */ + diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2BuckwalterLex.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2BuckwalterLex.java Tue Feb 08 14:54:09 2011 +0100 @@ -0,0 +1,882 @@ +/* The following code was generated by JFlex 1.4.3 on 14.12.10 17:12 */ + +package de.mpg.mpiwg.berlin.mpdl.lt.general; + + +/** + * This class is a scanner
generated by + * JFlex 1.4.3 + * on 14.12.10 17:12 from the specification file + * /Users/jwillenborg/test/jflex/Unicode2Buckwalter.lex + */ +public class Unicode2BuckwalterLex { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int YYINITIAL = 0; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\0\30\0\1\120\2\0\1\117\11\0\1\121\2\0\1\122"+ + "\5\0\1\123\1\0\1\112\1\1\1\0\1\2\2\0\32\124\4\0"+ + "\1\105\1\0\1\115\3\124\1\107\1\124\1\116\4\124\1\113\3\124"+ + "\1\114\1\124\1\110\1\124\1\111\1\124\1\106\4\124\u05a4\0\1\71"+ + "\1\0\1\3\1\4\1\5\1\6\1\7\1\10\1\11\1\12\1\13"+ + "\1\14\1\15\1\16\1\17\1\20\1\21\1\22\1\23\1\24\1\25"+ + "\1\26\1\27\1\30\1\31\1\32\1\33\1\34\5\0\1\35\1\36"+ + "\1\37\1\40\1\41\1\42\1\43\1\44\1\45\1\46\1\47\1\50"+ + "\1\51\1\52\1\53\1\54\1\55\1\56\1\57\1\60\1\61\33\0"+ + "\1\62\1\63\14\0\1\64\7\0\1\65\21\0\1\70\13\0\1\66"+ + "\12\0\1\67\54\0\1\72\2\0\1\73\2\0\1\74\1\75\1\0"+ + "\1\76\1\77\1\0\1\100\1\0\1\101\1\102\1\103\1\104\uf912\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. 
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\1\0\2\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+ + "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+ + "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37"+ + "\1\40\1\41\1\42\1\43\1\44\1\45\1\46\1\47"+ + "\1\50\1\51\1\52\1\53\1\54\1\55\1\56\1\57"+ + "\1\60\1\61\1\62\1\63\1\64\1\65\1\66\1\67"+ + "\1\70\1\71\1\72\1\73\1\74\1\75\1\76\1\77"+ + "\1\100\1\101\1\102\1\103\2\1\30\0\1\104\1\0"+ + "\1\105\13\0\1\106\1\107"; + + private static int [] zzUnpackAction() { + int [] result = new int[111]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\125\0\252\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\377\0\u0154\0\u01a9"+ + "\0\u01fe\0\u0253\0\u02a8\0\u02fd\0\u0352\0\u03a7\0\u03fc\0\u0451"+ + "\0\u04a6\0\u04fb\0\u0550\0\u05a5\0\u05fa\0\u064f\0\u06a4\0\u06f9"+ + "\0\u074e\0\u07a3\0\u07f8\0\u084d\0\u08a2\0\u08f7\0\u094c\0\125"+ + 
"\0\u09a1\0\125\0\u09f6\0\u0a4b\0\u0aa0\0\u0af5\0\u0b4a\0\u0b9f"+ + "\0\u0bf4\0\u0c49\0\u0c9e\0\u0cf3\0\u0d48\0\125\0\125"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[111]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\2\1\3\1\2\1\4\1\5\1\6\1\7\1\10"+ + "\1\11\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+ + "\1\21\1\22\1\23\1\24\1\25\1\26\1\27\1\30"+ + "\1\31\1\32\1\33\1\34\1\35\1\36\1\37\1\40"+ + "\1\41\1\42\1\43\1\44\1\45\1\46\1\47\1\50"+ + "\1\51\1\52\1\53\1\54\1\55\1\56\1\57\1\60"+ + "\1\61\1\62\1\63\1\64\1\65\1\66\1\67\1\70"+ + "\1\71\1\72\1\73\1\74\1\75\1\76\1\77\1\100"+ + "\1\101\1\102\1\103\1\104\1\105\1\106\11\2\1\107"+ + "\5\2\125\0\2\110\1\0\122\110\106\0\1\111\1\0"+ + "\1\112\2\0\1\113\2\0\1\114\114\0\1\115\1\116"+ + "\1\117\1\116\1\0\1\120\2\116\1\121\1\0\1\122"+ + "\3\0\1\116\2\110\1\2\122\110\107\0\1\123\131\0"+ + "\1\124\121\0\1\125\2\0\1\126\121\0\1\127\121\0"+ + "\1\116\1\130\2\116\1\2\4\116\5\0\1\116\106\0"+ + "\4\116\1\2\4\116\5\0\1\116\106\0\4\116\1\2"+ + "\1\116\1\131\2\116\5\0\1\116\106\0\3\116\1\132"+ + "\1\2\1\116\1\133\2\116\5\0\1\116\106\0\3\116"+ + "\1\134\1\2\4\116\5\0\1\116\121\0\1\135\113\0"+ + "\1\136\131\0\1\137\121\0\1\140\127\0\1\141\121\0"+ + "\1\142\120\0\2\116\1\143\1\116\1\2\4\116\5\0"+ + "\1\116\106\0\4\116\1\2\2\116\1\144\1\116\5\0"+ + "\1\116\106\0\4\116\1\140\4\116\5\0\1\116\106\0"+ + "\4\116\1\2\2\116\1\145\1\116\5\0\1\116\106\0"+ + 
"\4\116\1\142\4\116\5\0\1\116\122\0\1\146\113\0"+ + "\1\147\123\0\1\150\124\0\1\151\122\0\3\116\1\152"+ + "\1\2\4\116\5\0\1\116\106\0\2\116\1\153\1\116"+ + "\1\2\4\116\5\0\1\116\106\0\2\116\1\154\1\116"+ + "\1\2\4\116\5\0\1\116\123\0\1\155\113\0\1\5"+ + "\124\0\1\156\124\0\1\157\120\0\4\116\1\5\4\116"+ + "\5\0\1\116\106\0\4\116\1\156\4\116\5\0\1\116"+ + "\106\0\4\116\1\157\4\116\5\0\1\116\112\0\1\4"+ + "\12\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[3485]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\1\0\1\11\1\1\102\11\2\1\30\0\1\11\1\0"+ + "\1\11\13\0\2\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[111]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) 
{ + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the text position at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + /* + * Unicode (Arabic) to Buckwalter transliteration conversion + */ + + + + /** + * Creates a new scanner. + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public Unicode2BuckwalterLex(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also a java.io.Reader version of this constructor. + * + * @param in the java.io.InputStream to read input from; its bytes are decoded with the platform default charset.
+ */ + public Unicode2BuckwalterLex(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 240) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. 
+ */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. 
+ * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. 
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 23: + { return "D"; + } + case 72: break; + case 17: + { return "*"; + } + case 73: break; + case 46: + { return "o"; + } + case 74: break; + case 60: + { return ";"; + } + case 75: break; + case 63: + { return "!"; + } + case 76: break; + case 29: + { return "f"; + } + case 77: break; + case 36: + { return "w"; + } + case 78: break; + case 67: + { return "]"; + } + case 79: break; + case 70: + { return ")"; + } + case 80: break; + case 69: + { return ">"; + } + case 81: break; + case 34: + { return "n"; + } + case 82: break; + case 24: + { return "T"; + } + case 83: break; + case 57: + { return ":"; + } + case 84: break; + case 41: + { return "K"; + } + case 85: break; + case 12: + { return "v"; + } + case 86: break; + case 71: + { return "("; + } + case 87: break; + case 33: + { return "m"; + } + case 88: break; + case 22: + { return "S"; + } + case 89: break; + case 45: + { return "~"; + } + case 90: break; + case 16: + { return "d"; + } + case 91: break; + case 52: + { return "J"; + } + case 92: break; + case 43: + { return "u"; + } + case 93: break; + case 59: + { return "["; + } + case 94: break; + case 8: + { return "A"; + } + case 95: break; + case 2: + { return "'"; + } + case 96: break; + case 32: + { return "l"; + } + case 97: break; + case 55: + { return "R"; + } + case 98: break; + case 7: + { return "}"; + } + case 99: break; + case 11: + { return "t"; + } + case 100: break; + case 25: + { return "Z"; + } + case 101: break; + case 58: + { return "@"; + } + case 102: break; + case 5: + { return "&"; + } + case 103: break; + case 31: + { return "k"; + } + case 104: break; + case 3: + { return "|"; + } + case 105: break; + case 9: + { return "b"; + } + case 106: break; + case 14: + { return "H"; + } + case 107: break; + case 62: + { return "."; + } + case 108: break; + case 20: + { return "s"; + } + case 109: break; + case 37: + { return "Y"; + } + case 110: break; + case 56: + { return "?"; + } + case 111: break; 
+ case 66: + { return "%"; + } + case 112: break; + case 13: + { return "j"; + } + case 113: break; + case 51: + { return "P"; + } + case 114: break; + case 50: + { return "{"; + } + case 115: break; + case 1: + { return yytext(); + } + case 116: break; + case 42: + { return "a"; + } + case 117: break; + case 54: + { return "G"; + } + case 118: break; + case 64: + { return "-"; + } + case 119: break; + case 18: + { return "r"; + } + case 120: break; + case 4: + { return ">"; + } + case 121: break; + case 21: + { return "$"; + } + case 122: break; + case 44: + { return "i"; + } + case 123: break; + case 19: + { return "z"; + } + case 124: break; + case 68: + { return "<"; + } + case 125: break; + case 49: + { return "`"; + } + case 126: break; + case 39: + { return "F"; + } + case 127: break; + case 61: + { return ","; + } + case 128: break; + case 30: + { return "q"; + } + case 129: break; + case 48: + { return "#"; + } + case 130: break; + case 35: + { return "h"; + } + case 131: break; + case 40: + { return "N"; + } + case 132: break; + case 38: + { return "y"; + } + case 133: break; + case 28: + { return "_"; + } + case 134: break; + case 26: + { return "E"; + } + case 135: break; + case 65: + { return "+"; + } + case 136: break; + case 10: + { return "p"; + } + case 137: break; + case 53: + { return "V"; + } + case 138: break; + case 6: + { return "<"; + } + case 139: break; + case 27: + { return "g"; + } + case 140: break; + case 15: + { return "x"; + } + case 141: break; + case 47: + { return "^"; + } + case 142: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexHandler.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexHandler.java Tue Feb 08 14:36:38 2011 +0100 +++ 
b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexHandler.java Tue Feb 08 14:54:09 2011 +0100 @@ -59,6 +59,8 @@ if (! hasLexEntry) { hasLexEntry = hasLexEntryKey(lName, language); } + if (language.equals("de") || language.equals("fr") || language.equals("nl")) // TODO Lexika für diese Sprachen in BerkeleyDB einbringen (für frund nl auch eine bessere Morph.) und dann diese Zeilen wieder löschen + lexEntryKeys.add(lName); if (! lName.equals(formName) && hasLexEntry) { lexEntryKeys.add(lName); } @@ -72,6 +74,8 @@ public boolean hasLexEntryKey(String formName, String language) throws ApplicationException { boolean hasLexEntry = false; + if (language.equals("zh")) // jedes chin. einzelne Zeichen hat autom. immer einen Lexikoneintrag + return true; ArrayList statLexicons = Lexica.getInstance().getLexicons(language); if (statLexicons != null) { for (int i=0; i 0) { + for (int i=0; i This will serialize a DOM Node to + * the supplied Writer.

+ * + * @param node DOM Node to serialize. + * @param writer Writer to write to. + * @param indentLevel current indentation. + */ + private void serializeNode(Node node, Writer writer, String indentLevel) throws ApplicationException { + try { + // Determine action based on node type + switch (node.getNodeType()) { + case Node.DOCUMENT_NODE: + writer.write(""); + writer.write("\n"); + // recurse on each child + NodeList nodes = node.getChildNodes(); + if (nodes != null) { + for (int i=0; i"); + // recurse on each child + NodeList children = node.getChildNodes(); + if (children != null) { + if ((children.item(0) != null) && (children.item(0).getNodeType() == Node.ELEMENT_NODE)) { + writer.write("\n"); + } + for (int i=0; i"); + writer.write("\n"); + break; + case Node.TEXT_NODE: + writer.write(node.getNodeValue()); + break; + case Node.CDATA_SECTION_NODE: + writer.write(""); + break; + case Node.COMMENT_NODE: + writer.write(indentLevel + ""); + writer.write("\n"); + break; + case Node.PROCESSING_INSTRUCTION_NODE: + writer.write(""); + writer.write("\n"); + break; + case Node.ENTITY_REFERENCE_NODE: + writer.write("&" + node.getNodeName() + ";"); + break; + case Node.DOCUMENT_TYPE_NODE: + DocumentType docType = (DocumentType)node; + writer.write(""); + writer.write("\n"); + break; + } + } catch (IOException e) { + throw new ApplicationException(e); + } + } + // TODO not used yet, test it public Node doc(File xmlFile, File schemaFile) throws ApplicationException { Node root = null; diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/ExternalObject.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/ExternalObject.java Tue Feb 08 14:54:09 2011 +0100 @@ -0,0 +1,113 @@ +/* + * eXist Open Source Native XML Database: Extension module + * Copyright (C) 2008 Josef Willenborg + * jwillenborg@mpiwg-berlin.mpg.de + * http://www.mpiwg-berlin.mpg.de + * + * This 
program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * $Id: TextModule.java $ + */ +package org.exist.xquery.modules.mpdltext; + +import java.util.ArrayList; +import java.util.Date; + +import org.exist.dom.QName; +import org.exist.xquery.BasicFunction; +import org.exist.xquery.Cardinality; +import org.exist.xquery.FunctionSignature; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.value.Sequence; +import org.exist.xquery.value.SequenceType; +import org.exist.xquery.value.StringValue; +import org.exist.xquery.value.Type; +import org.exist.xquery.value.ValueSequence; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.externalObjects.app.ExtElement; +import de.mpg.mpiwg.berlin.mpdl.externalObjects.app.ExternalObjectsHandler; + +/** + * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de) + */ +public class ExternalObject extends BasicFunction { + + public final static FunctionSignature signature = + new FunctionSignature( + new QName("externalObject", MPDLTextModule.NAMESPACE_URI, MPDLTextModule.PREFIX), + "A function which add, update, delete or read external elements", + new SequenceType[] { new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE), + new SequenceType(Type.STRING, 
Cardinality.EXACTLY_ONE), + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE) }, + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE)); + + public ExternalObject(XQueryContext context) { + super(context, signature); + } + + public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException { + Sequence operation = args[0]; // create, read, update or delete + Sequence type = args[1]; + Sequence object = args[2]; + if (operation.isEmpty() || type.isEmpty()) + return Sequence.EMPTY_SEQUENCE; + String operationStr = operation.getStringValue(); + String typeStr = type.getStringValue(); + String objectStr = object.getStringValue(); + + ValueSequence result = null; + String resultStr = ""; + try { + if (typeStr.equals("element")) { + ExtElement e = ExtElement.parseXmlStr(objectStr); + /* NOTE(review): "operation" is the Sequence, not its string value, so these equals() calls compare a Sequence with a String and presumably never match; operationStr looks like what was intended - confirm before relying on the modification date being refreshed */ if (operation.equals("create") || operation.equals("update")) { + Date now = new Date(); + e.setModificationDate(now); + } + String documentId = e.getDocumentId(); + String pageNumber = e.getPageNumber(); + if (operationStr.equals("read")) { + ExternalObjectsHandler externalObjectsHandler = ExternalObjectsHandler.getInstance(); + ArrayList elems = externalObjectsHandler.readExternalElements(documentId, pageNumber); + if (elems != null && elems.size() > 0) { + resultStr = ""; + for (int i=0; i"; + } + } else if (operationStr.equals("create")) { + // TODO + } else if (operationStr.equals("update")) { + // TODO + } else if (operationStr.equals("delete")) { + // TODO + } + } else if (typeStr.equals("query")) { + // TODO + } + result = new ValueSequence(); + result.add(new StringValue(resultStr)); + } catch (ApplicationException e) { + throw new XPathException(e); + } + return result; + } + +} diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/InsertAtCharPos.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/InsertAtCharPos.java Tue Feb 08 14:54:09 2011
+0100
@@ -0,0 +1,144 @@
+/*
+ * eXist Open Source Native XML Database: Extension module
+ * Copyright (C) 2008 Josef Willenborg
+ * jwillenborg@mpiwg-berlin.mpg.de
+ * http://www.mpiwg-berlin.mpg.de
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * $Id: TextModule.java $
+ */
+package org.exist.xquery.modules.mpdltext;
+
+import org.exist.dom.QName;
+import org.exist.xquery.BasicFunction;
+import org.exist.xquery.Cardinality;
+import org.exist.xquery.FunctionSignature;
+import org.exist.xquery.XPathException;
+import org.exist.xquery.XQueryContext;
+import org.exist.xquery.value.Sequence;
+import org.exist.xquery.value.SequenceType;
+import org.exist.xquery.value.StringValue;
+import org.exist.xquery.value.Type;
+import org.exist.xquery.value.ValueSequence;
+
+/**
+ * XQuery extension function "insertAtCharPos": inserts one XML string into another
+ * at a position counted in text characters. Markup between '<' and '>' is not
+ * counted, and an entity reference ("&...;") counts as a single character.
+ */
+public class InsertAtCharPos extends BasicFunction {
+
+  public final static FunctionSignature signature =
+    new FunctionSignature(
+      new QName("insertAtCharPos", MPDLTextModule.NAMESPACE_URI, MPDLTextModule.PREFIX),
+      "A function which inserts in the xml element node string (first parameter) the given xml element " +
+      "node string (second parameter) at the given char position (third parameter).",
+      new SequenceType[] { new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE),
+                           new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE),
+                           new SequenceType(Type.INT, Cardinality.EXACTLY_ONE)
+                         },
+      new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE));
+
+  public InsertAtCharPos(XQueryContext context) {
+    super(context, signature);
+  }
+
+  /**
+   * Inserts the second argument into the first one directly after the text
+   * character with the given number; position 0 inserts directly before the
+   * first text character (at index 0 if the string has no text character).
+   *
+   * @param args element string, string to insert, character position
+   * @param contextSequence not used
+   * @return a sequence holding the resulting string as its only item
+   * @throws XPathException if reading an argument value fails (presumably raised by getStringValue())
+   */
+  public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException {
+    String elementInputStr = args[0].getStringValue();
+    String insertElementStr = args[1].getStringValue();
+    int charPos = Integer.parseInt(args[2].getStringValue());
+    // getIndex() only yields positions after a character, so position 0 has its own scan;
+    // that scan never returns a negative index, which substring() would reject
+    int strCharIndex = (charPos == 0) ? getFirstCharIndex(elementInputStr) : getIndex(elementInputStr, charPos);
+    String resultStr = elementInputStr.substring(0, strCharIndex) + insertElementStr + elementInputStr.substring(strCharIndex);
+    ValueSequence result = new ValueSequence();
+    result.add(new StringValue(resultStr));
+    return result;
+  }
+
+  /**
+   * Returns the string index directly after the charPos-th text character
+   * (charPos counted from 1). If the string has fewer text characters, the
+   * index after the last text character is returned (0 if there is none).
+   */
+  private int getIndex(String xmlString, int charPos) {
+    int size = xmlString.length();
+    int counter = 0;
+    int charCounter = 0;
+    int counterLastChar = -1;
+    boolean isEntity = false;
+    boolean isElement = false;
+    while (counter < size) {
+      char c = xmlString.charAt(counter);
+      switch (c) {
+        case '<': isElement = true; break;
+        case '>': isElement = false; break;
+        case '&': isEntity = true; break;
+        case ';': isEntity = false; break;
+      }
+      // count all chars which are not inside elements and entities
+      // if element closing char ">" is found it should not be counted as a char
+      // if an entity closing char ";" is found it should be counted cause the entity itself is one char long
+      if (! isEntity && ! isElement && !(c == '>')) {
+        charCounter++;
+        counterLastChar = counter;
+      }
+      if (charCounter == charPos) {
+        break;
+      }
+      counter++;
+    }
+    // input charPos was bigger than available chars: return the last available charPos
+    if (counter == size)
+      return counterLastChar + 1;
+    return counter + 1;
+  }
+
+  /**
+   * Returns the string index of the first text character, i.e. the insert
+   * position for character position 0. For an entity reference this is the
+   * index of its '&'; 0 is returned if the string has no text character.
+   */
+  private int getFirstCharIndex(String xmlString) {
+    boolean isEntity = false;
+    boolean isElement = false;
+    int entityStart = -1;
+    for (int i = 0; i < xmlString.length(); i++) {
+      char c = xmlString.charAt(i);
+      int closedEntityStart = -1;
+      switch (c) {
+        case '<': isElement = true; break;
+        case '>': isElement = false; break;
+        case '&': if (! isEntity) entityStart = i; isEntity = true; break;
+        case ';': closedEntityStart = entityStart; entityStart = -1; isEntity = false; break;
+      }
+      // same counting rule as in getIndex(); an entity is counted at its ';' but starts at its '&'
+      if (! isEntity && ! isElement && c != '>')
+        return (closedEntityStart >= 0) ? closedEntityStart : i;
+    }
+    return 0;
+  }
+}
diff -r 94305c504178 -r 2396a569e446 software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/MPDLTextModule.java --- a/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/MPDLTextModule.java Tue Feb 08 14:36:38 2011 +0100 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/MPDLTextModule.java Tue Feb 08 14:54:09 2011 +0100 @@ -50,7 +50,9 @@ new FunctionDef(GetBig5EncodedTerms.signature, GetBig5EncodedTerms.class), new FunctionDef(EncodeBig5.signature, EncodeBig5.class), new FunctionDef(LuceneQueryParser.signature, LuceneQueryParser.class), - new FunctionDef(ToCLevelGenerator.signature, ToCLevelGenerator.class) + new FunctionDef(ExternalObject.signature, ExternalObject.class), + new FunctionDef(InsertAtCharPos.signature, InsertAtCharPos.class), + new FunctionDef(ToCLevelGenerator.signature, ToCLevelGenerator.class) }; public MPDLTextModule() {