# HG changeset patch # User Josef Willenborg # Date 1298387025 -3600 # Node ID 1ec29fdd0db8234bb8d1cb2d8d218ad51548a326 # Parent d2a1c14fde311b393e6c16c3d20bb1f2171af7e1 neue .lex Dateien f?r Normalisierung / externe Objekte update diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtElement.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtElement.java Thu Feb 10 14:02:05 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtElement.java Tue Feb 22 16:03:45 2011 +0100 @@ -8,12 +8,17 @@ public class ExtElement extends ExtObject { private String pageNumber; private String xmlNodeId; - private String before; + private String before = "false"; private String charPos; private String xpath; + public ExtElement() { + this.type = "element"; + } + public static ExtElement parseXmlStr(String xmlStr) throws ApplicationException { XmlUtil xmlUtil = XmlUtil.getInstance(); + xmlUtil.setNsContext("general"); String uid = xmlUtil.evaluateToString(xmlStr, "/object/@uid", null); String dateStr = xmlUtil.evaluateToString(xmlStr, "/object/@modificationDate", null); String docId = xmlUtil.evaluateToString(xmlStr, "/object/@documentId", null); @@ -24,8 +29,6 @@ String xpath = xmlUtil.evaluateToString(xmlStr, "/object/@xpath", null); String content = xmlUtil.evaluateToXmlString(xmlStr, "/object/content/*", null); Date modDate = xmlUtil.toDate(dateStr); - if (uid == null || docId == null || pageNumber == null) - throw new ApplicationException("one of the required fields could not be read in: " + xmlStr); ExtElement e = new ExtElement(); e.setUid(uid); e.setModificationDate(modDate); @@ -44,7 +47,13 @@ } public String getXmlString() { - String xmlString = ""; if (content != null) { // write the uid and modificationDate into the content node @@ -110,6 +114,13 @@ this.pageNumber = pageNumber; } + public boolean isBefore() { + if (before != null 
&& before.equals("true")) + return true; + else + return false; + } + public String getBefore() { return before; } diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtObject.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtObject.java Thu Feb 10 14:02:05 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtObject.java Tue Feb 22 16:03:45 2011 +0100 @@ -2,12 +2,27 @@ import java.util.Date; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + public class ExtObject { + protected String type; // is set by subclass: element, query, ... protected String uid; protected Date modificationDate; protected String documentId; protected String content; + public String getXmlString() { + return null; // always handled in subclass + } + + public ExtObject getInstance(String xmlStr) throws ApplicationException { + return null; // always handled in subclass + } + + public String getType() { + return type; + } + public String getUid() { return uid; } diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtQuery.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtQuery.java Thu Feb 10 14:02:05 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtQuery.java Tue Feb 22 16:03:45 2011 +0100 @@ -9,8 +9,13 @@ private String queryType; // url, fulltext or fulltextMorph private String queryName; // optional: name of the query + public ExtQuery() { + this.type = "query"; + } + public static ExtQuery parseXmlStr(String xmlStr) throws ApplicationException { XmlUtil xmlUtil = XmlUtil.getInstance(); + xmlUtil.setNsContext("general"); String uid = xmlUtil.evaluateToString(xmlStr, "/object/@uid", null); String dateStr = xmlUtil.evaluateToString(xmlStr, "/object/@modificationDate", null); String 
docId = xmlUtil.evaluateToString(xmlStr, "/object/@documentId", null); @@ -18,46 +23,43 @@ String queryName = xmlUtil.evaluateToString(xmlStr, "/object/@queryName", null); String content = xmlUtil.evaluateToXmlString(xmlStr, "/object/content/*", null); Date modDate = xmlUtil.toDate(dateStr); - if (uid == null || docId == null || queryType == null || content == null) - throw new ApplicationException("one of the required fields could not be read in: " + xmlStr); ExtQuery e = new ExtQuery(); e.setUid(uid); e.setModificationDate(modDate); e.setDocumentId(docId); e.setQueryType(queryType); e.setQueryName(queryName); - e.setContent(content); + if (content != null && ! content.isEmpty()) + e.setContent(content); return e; } + public ExtQuery getInstance(String xmlStr) throws ApplicationException { + return parseXmlStr(xmlStr); + } + public String toString() { return getXmlString(); } public String getXmlString() { String xmlString = ""; if (content != null) { - // write the uid and modificationDate into the content node - if (! 
content.contains("uid")) { - int firstClose = content.indexOf(">"); - if (firstClose != -1) - content = content.substring(0, firstClose) + " uid=\"" + uid + "\" modificationDate=\"" + modificationDate + "\" " + content.substring(firstClose); - } xmlString = xmlString + "" + content + ""; } xmlString = xmlString + ""; diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExternalObjectsHandler.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExternalObjectsHandler.java Thu Feb 10 14:02:05 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExternalObjectsHandler.java Tue Feb 22 16:03:45 2011 +0100 @@ -10,7 +10,9 @@ import com.sleepycat.je.DatabaseException; import com.sleepycat.je.LockMode; import com.sleepycat.je.OperationStatus; +import com.sleepycat.je.Transaction; +import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars; import de.mpg.mpiwg.berlin.mpdl.util.Util; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.externalObjects.db.DbEnvExternalObjects; @@ -32,22 +34,50 @@ return instance; } - public ArrayList readExternalElements(String documentId, String pageNumber) throws ApplicationException { - return readDBExternalElements(documentId, pageNumber); + public void createExternalElement(ExtElement element) throws ApplicationException { + createDBExternalElement(element); } - public void writeExternalElement(ExtElement element) throws ApplicationException { - writeDBExternalElement(element); + public void updateExternalElement(ExtElement element) throws ApplicationException { + updateDBExternalElement(element); } public void deleteExternalElement(ExtElement element) throws ApplicationException { deleteDBExternalElement(element); } - private void writeDBExternalElement(ExtElement element) throws ApplicationException { + public ArrayList readExternalElements(ExtElement element) 
throws ApplicationException { + return readDBExternalElements(element); + } + + public void createExternalObject(ExtObject object) throws ApplicationException { + createDBExternalObject(object); + } + + public void updateExternalObject(ExtObject object) throws ApplicationException { + updateDBExternalObject(object); + } + + public void deleteExternalObject(ExtObject object) throws ApplicationException { + deleteDBExternalObject(object); + } + + public ArrayList readExternalObjects(ExtObject object) throws ApplicationException { + return readDBExternalObjects(object); + } + + private void createDBExternalElement(ExtElement element) throws ApplicationException { try { - String keyStr = element.getDocumentId() + "###" + element.getPageNumber(); + test(element); + String content = element.getContent(); String valueStr = element.getXmlString(); + if (content == null) + throw new ApplicationException("External object: no content element specified in: " + valueStr); + Date now = new Date(); + element.setModificationDate(now); + String docId = element.getDocumentId(); + String pageNumber = element.getPageNumber(); + String keyStr = docId + "###" + pageNumber; DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); DatabaseEntry dbEntryValue = new DatabaseEntry(valueStr.getBytes("utf-8")); Database elementDB = dbEnvExternalObjects.getElementDB(); @@ -59,21 +89,92 @@ } } - private void deleteDBExternalElement(ExtElement element) throws ApplicationException { + private void updateDBExternalElement(ExtElement element) throws ApplicationException { + test(element); + String content = element.getContent(); + String elementXmlStr = element.getXmlString(); + if (content == null) + throw new ApplicationException("External object: no content element specified in: " + elementXmlStr); + Date now = new Date(); + element.setModificationDate(now); + String docId = element.getDocumentId(); + String pageNumber = element.getPageNumber(); + String uid = element.getUid(); + 
String xmlNodeId = element.getXmlNodeId(); + String hashKey = docId + "###" + pageNumber; + boolean updated = false; try { - String keyStr = element.getDocumentId() + "###" + element.getPageNumber(); - DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); Database elementDB = dbEnvExternalObjects.getElementDB(); - elementDB.delete(null, dbEntryKey); + Transaction t = dbEnvExternalObjects.getEnv().beginTransaction(null, null); + Cursor cursor = elementDB.openCursor(t, null); + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS && ! updated) { + byte[] foundValueBytes = foundValue.getData(); + String foundValueStr = new String(foundValueBytes, "utf-8"); + ExtElement elem = ExtElement.parseXmlStr(foundValueStr); + String elemUid = elem.getUid(); + String elemXmlNodeId = elem.getXmlNodeId(); + if (uid.equals(elemUid) && xmlNodeId.equals(elemXmlNodeId)) { + cursor.delete(); + byte[] elementXmlStrBytes = elementXmlStr.getBytes("utf-8"); + DatabaseEntry dbEntryValue = new DatabaseEntry(elementXmlStrBytes); + cursor.put(dbEntryKey, dbEntryValue); + updated = true; + break; + } + operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT); + } + cursor.close(); + t.commit(); } catch (DatabaseException e) { throw new ApplicationException(e); } catch (UnsupportedEncodingException e) { throw new ApplicationException(e); } - } - - private ArrayList readDBExternalElements(String documentId, String pageNumber) throws ApplicationException { + } + + private void deleteDBExternalElement(ExtElement element) throws ApplicationException { + test(element); + String docId = element.getDocumentId(); + String pageNumber = element.getPageNumber(); + String uid = element.getUid(); + String xmlNodeId = 
element.getXmlNodeId(); + String hashKey = docId + "###" + pageNumber; + try { + Database elementDB = dbEnvExternalObjects.getElementDB(); + Transaction t = dbEnvExternalObjects.getEnv().beginTransaction(null, null); + Cursor cursor = elementDB.openCursor(t, null); + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + byte[] foundValueBytes = foundValue.getData(); + String foundValueStr = new String(foundValueBytes, "utf-8"); + ExtElement elem = ExtElement.parseXmlStr(foundValueStr); + String elemUid = elem.getUid(); + String elemXmlNodeId = elem.getXmlNodeId(); + if (uid.equals(elemUid) && xmlNodeId.equals(elemXmlNodeId)) { + cursor.delete(); + } + operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT); + } + cursor.close(); + t.commit(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private ArrayList readDBExternalElements(ExtElement element) throws ApplicationException { ArrayList retElements = new ArrayList(); + String documentId = element.getDocumentId(); + String pageNumber = element.getPageNumber(); String hashKey = documentId + "###" + pageNumber; try { Database elementDB = dbEnvExternalObjects.getElementDB(); @@ -98,6 +199,165 @@ return retElements; } + private void test(ExtElement element) throws ApplicationException { + String uid = element.getUid(); + String docId = element.getDocumentId(); + String xmlNodeId = element.getXmlNodeId(); + String pageNumber = element.getPageNumber(); + String xmlStr = element.getXmlString(); + if (uid == null) + throw new ApplicationException("External object: no attribute \"uid\" specified in: " + xmlStr); + if (docId == null) + 
throw new ApplicationException("External object: no attribute \"documentId\" specified in: " + xmlStr); + if (xmlNodeId == null) + throw new ApplicationException("External object: no attribute \"xmlNodeId\" specified in: " + xmlStr); + if (pageNumber == null) + throw new ApplicationException("External object: no attribute \"pageNumber\" specified in: " + xmlStr); + } + + private void createDBExternalObject(ExtObject extObject) throws ApplicationException { + try { + test(extObject); + Date now = new Date(); + extObject.setModificationDate(now); + String type = extObject.getType(); + String uid = extObject.getUid(); + String docId = extObject.getDocumentId(); + if (docId == null || docId.isEmpty()) + docId = "-1"; + String keyStr = type + "###" + uid + "###" + docId; + String valueStr = extObject.getXmlString(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + DatabaseEntry dbEntryValue = new DatabaseEntry(valueStr.getBytes("utf-8")); + Database objectDB = dbEnvExternalObjects.getObjectDB(); + objectDB.put(null, dbEntryKey, dbEntryValue); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private void updateDBExternalObject(ExtObject object) throws ApplicationException { + test(object); + String content = object.getContent(); + String elementXmlStr = object.getXmlString(); + if (content == null) + throw new ApplicationException("External object: no content element specified in: " + elementXmlStr); + Date modificationDate = object.getModificationDate(); + Date now = new Date(); + object.setModificationDate(now); + String type = object.getType(); + String uid = object.getUid(); + String docId = object.getDocumentId(); + String hashKey = type + "###" + uid + "###" + docId; + boolean updated = false; + try { + Database objectDB = dbEnvExternalObjects.getObjectDB(); + Transaction t = 
dbEnvExternalObjects.getEnv().beginTransaction(null, null); + Cursor cursor = objectDB.openCursor(t, null); + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS && ! updated) { + byte[] foundValueBytes = foundValue.getData(); + String foundValueStr = new String(foundValueBytes, "utf-8"); + ExtObject obj = object.getInstance(foundValueStr); + Date objModificationDate = obj.getModificationDate(); + if (modificationDate.equals(objModificationDate)) { + cursor.delete(); + byte[] elementXmlStrBytes = elementXmlStr.getBytes("utf-8"); + DatabaseEntry dbEntryValue = new DatabaseEntry(elementXmlStrBytes); + cursor.put(dbEntryKey, dbEntryValue); + updated = true; + break; + } + operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT); + } + cursor.close(); + t.commit(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private void deleteDBExternalObject(ExtObject object) throws ApplicationException { + test(object); + Date modificationDate = object.getModificationDate(); + Date now = new Date(); + object.setModificationDate(now); + String type = object.getType(); + String uid = object.getUid(); + String docId = object.getDocumentId(); + String hashKey = type + "###" + uid + "###" + docId; + try { + Database objectDB = dbEnvExternalObjects.getObjectDB(); + Transaction t = dbEnvExternalObjects.getEnv().beginTransaction(null, null); + Cursor cursor = objectDB.openCursor(t, null); + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = 
cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + byte[] foundValueBytes = foundValue.getData(); + String foundValueStr = new String(foundValueBytes, "utf-8"); + ExtObject obj = object.getInstance(foundValueStr); + Date objModificationDate = obj.getModificationDate(); + if (modificationDate == null || modificationDate.equals(objModificationDate)) { + cursor.delete(); + } + operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT); + } + cursor.close(); + t.commit(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private ArrayList readDBExternalObjects(ExtObject object) throws ApplicationException { + ArrayList retElements = new ArrayList(); + String type = object.getType(); + String uid = object.getUid(); + String docId = object.getDocumentId(); + String hashKey = type + "###" + uid + "###" + docId; + try { + Database objectDB = dbEnvExternalObjects.getObjectDB(); + Cursor cursor = objectDB.openCursor(null, null); + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + byte[] foundValueBytes = foundValue.getData(); + String foundValueStr = new String(foundValueBytes, "utf-8"); + ExtObject obj = object.getInstance(foundValueStr); + retElements.add(obj); + operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retElements; + } + + private void test(ExtObject object) throws ApplicationException { + String 
uid = object.getUid(); + String xmlStr = object.getXmlString(); + if (uid == null) + throw new ApplicationException("External object: no attribute \"uid\" specified in: " + xmlStr); + } + + + private void init() throws ApplicationException { dbEnvExternalObjects = new DbEnvExternalObjects(); dbEnvExternalObjects.setDataDir(DB_DIR_EXTERNAL_OBJECTS); @@ -109,8 +369,9 @@ getInstance(); instance.beginOperation(); System.out.print("Start ..."); - // instance.deleteSampleData(); - // instance.writeSampleData(); + instance.deleteSampleData(); + instance.createSampleData(); + // instance.updateSampleData(); instance.readSampleData(); instance.end(); instance.endOperation(); @@ -120,53 +381,125 @@ } private void deleteSampleData() throws ApplicationException { - ExtElement e = new ExtElement(); - e.setUid("joe"); - e.setDocumentId("/archimedes/it/l223.xml"); - e.setPageNumber("17"); - deleteExternalElement(e); + String xmlNodeId1 = "/archimedes[1]/text[1]/body[1]/chap[1]/p[1]/s[2]"; + String objectXmlStr1 = + "" + + ""; + ExtElement e1 = ExtElement.parseXmlStr(objectXmlStr1); + String xmlNodeId2 = "/archimedes[1]/text[1]/body[1]/chap[1]/p[1]/s[4]"; + String objectXmlStr2 = + "" + + ""; + ExtElement e2 = ExtElement.parseXmlStr(objectXmlStr2); + deleteExternalElement(e1); + deleteExternalElement(e2); + + ExtQuery q = new ExtQuery(); + q.setUid("joe"); + q.setDocumentId("/archimedes/it/l223.xml"); + ArrayList objects = readExternalObjects(q); + for (int i=0; iThis is a test note to sentence " + sId + ""); - writeExternalElement(e); + e.setContent("This is a test note to element " + sId + " with this external link" + ""); + createExternalElement(e); ExtElement e2 = new ExtElement(); - String sId2 = "1.2.2.2.2.7"; + String sId2 = "/archimedes[1]/text[1]/body[1]/chap[1]/p[1]/s[4]"; e2.setUid("michael"); e2.setModificationDate(now); e2.setDocumentId("/archimedes/it/l223.xml"); e2.setPageNumber("17"); e2.setXmlNodeId(sId2); e2.setCharPos("18"); - e2.setContent("This is a test note 
to sentence " + sId2 + ""); - writeExternalElement(e2); + e2.setContent("This is a test note to element " + sId2 + ""); + createExternalElement(e2); - /* - String sId3 = "1.2.2.2.2.8.15.3.3"; - e3.setUid("joe"); - e3.setModificationDate(now); - e3.setDocumentId("/archimedes/it/l223.xml"); - e3.setPageNumber("17"); - e3.setXmlNodeId(sId3); - e2.setContent("This is an external test note to sentence " + sId3 + ""); - writeExternalElement(e3); - */ + ExtQuery q1 = new ExtQuery(); + q1.setUid("joe"); + q1.setDocumentId("/archimedes/it/l223.xml"); + q1.setQueryType("fulltext"); + q1.setQueryName("seminario"); + createExternalObject(q1); + + ExtQuery q2 = new ExtQuery(); + q2.setUid("michael"); + q2.setDocumentId("/archimedes/it/l223.xml"); + q2.setQueryType("url"); + String url = "http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/archimedes/it/l223.xml&pn=17&mode=text&query-type=fulltextMorph&query=seminario&query-result-pn=1"; + String urlDeresolved = StringUtilEscapeChars.deresolveXmlEntities(url); + q2.setQueryName(urlDeresolved); + createExternalObject(q2); } + private void updateSampleData() throws ApplicationException { + Date now = new Date(); + String xmlNodeId = "/archimedes[1]/text[1]/body[1]/chap[1]/p[1]/s[2]"; + String objectXmlStr = + "" + + "" + "This is a test note to element " + xmlNodeId + " with this external link" + "" + "" + + ""; + ExtElement e = ExtElement.parseXmlStr(objectXmlStr); + e.setModificationDate(now); + updateExternalElement(e); + } + private void readSampleData() throws ApplicationException { - ArrayList elements = readExternalElements("/archimedes/it/l223.xml", "17"); + ExtElement elem = new ExtElement(); + elem.setDocumentId("/archimedes/it/l223.xml"); + elem.setPageNumber("17"); + ArrayList elements = readExternalElements(elem); System.out.println(elements); + + ExtQuery q1 = new ExtQuery(); + q1.setUid("joe"); + q1.setDocumentId("/archimedes/it/l223.xml"); + ArrayList objects = readExternalObjects(q1); + 
System.out.println(objects); + + ExtQuery q2 = new ExtQuery(); + q2.setUid("michael"); + q2.setDocumentId("/archimedes/it/l223.xml"); + objects = readExternalObjects(q2); + System.out.println(objects); } private void end() throws ApplicationException { diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java Thu Feb 10 14:02:05 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java Tue Feb 22 16:03:45 2011 +0100 @@ -5,7 +5,15 @@ import java.util.ArrayList; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; -import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexAll; +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexAR; +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexDE; +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexEL; +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexEN; +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexFR; +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexIT; +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexLA; +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexNL; +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexZH; import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.Regularization; import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.RegularizationManager; import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; @@ -1014,30 +1022,89 @@ } private String normalize4HumanReaders(String s) { - String normStr = s; - StringReader strReader = new StringReader(normStr + "\n"); - MpdlNormalizerLexAll mpdlNormalizerLexAll = new MpdlNormalizerLexAll(strReader); - if (Language.getInstance().isLatin(language)) { - mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.LA); 
- } else if (Language.getInstance().isChinese(language)) { - mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.ZH); - } else { - // TODO normalization for all languages - return normalize4Lexica(s, null); // old function - } + StringReader strReader = new StringReader(s + "\n"); String retStr = ""; String token = ""; - while (token != null) { - try { - token = mpdlNormalizerLexAll.yylex(); - if (token != null) - retStr += token; - } catch (IOException e ) { - // nothing cause IOException is not needed for a StringReader + try { + if (Language.getInstance().isLatin(language)) { + MpdlNormalizerLexLA mpdlNormalizerLex = new MpdlNormalizerLexLA(strReader); + mpdlNormalizerLex.yybegin(MpdlNormalizerLexLA.DISP); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isArabic(language)) { + MpdlNormalizerLexAR mpdlNormalizerLex = new MpdlNormalizerLexAR(strReader); + mpdlNormalizerLex.yybegin(MpdlNormalizerLexAR.DISP); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isGerman(language)) { + MpdlNormalizerLexDE mpdlNormalizerLex = new MpdlNormalizerLexDE(strReader); + mpdlNormalizerLex.yybegin(MpdlNormalizerLexDE.DISP); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isGreek(language)) { + MpdlNormalizerLexEL mpdlNormalizerLex = new MpdlNormalizerLexEL(strReader); + mpdlNormalizerLex.yybegin(MpdlNormalizerLexEL.DISP); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isEnglish(language)) { + MpdlNormalizerLexEN mpdlNormalizerLex = new MpdlNormalizerLexEN(strReader); + mpdlNormalizerLex.yybegin(MpdlNormalizerLexEN.DISP); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + 
retStr += token; + } + } else if (Language.getInstance().isFrench(language)) { + MpdlNormalizerLexFR mpdlNormalizerLex = new MpdlNormalizerLexFR(strReader); + mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DISP); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isItalian(language)) { + MpdlNormalizerLexIT mpdlNormalizerLex = new MpdlNormalizerLexIT(strReader); + mpdlNormalizerLex.yybegin(MpdlNormalizerLexIT.DISP); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isDutch(language)) { + MpdlNormalizerLexNL mpdlNormalizerLex = new MpdlNormalizerLexNL(strReader); + mpdlNormalizerLex.yybegin(MpdlNormalizerLexNL.DISP); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isChinese(language)) { + MpdlNormalizerLexZH mpdlNormalizerLex = new MpdlNormalizerLexZH(strReader); + mpdlNormalizerLex.yybegin(MpdlNormalizerLexZH.DISP); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else { + return normalize4Lexica(s, null); // old function } + } catch (IOException e ) { + // nothing cause IOException is not needed for a StringReader } - normStr = retStr; - return normStr; + return retStr; } /* diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAR.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAR.java Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,572 @@ +/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:02 */ + +/* + * Normalization rules for Arabic text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 0.96 + * 2011-02-21 + * + */ + 
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 22.02.11 12:02 from the specification file + * MpdlNormalizerLexAR.lex + */ +public class MpdlNormalizerLexAR { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 1, 1 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\1\65\0\1\2\uffbf\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. 
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\3\0\1\1\1\2\1\3\1\4"; + + private static int [] zzUnpackAction() { + int [] result = new int[7]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\3\0\6\0\11\0\11\0\11\0\11"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[7]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\4\1\0\1\5\1\4\1\6\1\5\1\4\1\7"+ + "\1\5\3\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[12]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { 
+ int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\3\0\4\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[7]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + 
private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexAR(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexAR(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 10) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. 
+ * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. 
+ * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. 
+ * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break 
zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 4: + { switch (problem) { + case 1: return ""; + default: return normalized; + } + } + case 5: break; + case 2: + { problem = 1; add(yytext()); + } + case 6: break; + case 3: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 7: break; + case 1: + { add(yytext()); + } + case 8: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAR.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAR.lex Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,74 @@ +/* + * Normalization rules for Arabic text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 0.96 + * 2011-02-21 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexAR +%type java.lang.String +%unicode + +// Arabic: ar + +%states DISP, DICT, SEARCH + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } +%} + +END = \n + +%% + +@ { problem = 1; add(yytext()); } +. 
{ add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized; + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt + +TO DO: + +AR: fehlt noch + +*/ diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexDE.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexDE.java Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,629 @@ +/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:03 */ + +/* + * Normalization rules for German text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 0.96 + * 2011-02-21 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 22.02.11 12:03 from the specification file + * MpdlNormalizerLexDE.lex + */ +public class MpdlNormalizerLexDE { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int CELEX = 8; + public static final int DISP = 2; + public static final int GRIMM = 10; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 1, 1, 3, 3, 4, 4 + }; + + /** + * 
Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\1\65\0\1\15\32\2\6\0\1\6\15\2\1\10\5\2"+ + "\1\4\5\2\111\0\1\11\21\0\1\12\5\0\1\13\2\0\1\14"+ + "\4\0\1\11\21\0\1\12\5\0\1\13\202\0\1\3\u01e4\0\1\7"+ + "\1\0\1\5\ufc99\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\5\0\1\1\1\2\1\3\1\4\3\1\1\5\3\1"+ + "\1\6\1\7\1\10\1\11\1\12\1\13\1\14\1\15"+ + "\1\16"; + + private static int [] zzUnpackAction() { + int [] result = new int[25]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\16\0\34\0\52\0\70\0\106\0\106\0\106"+ + "\0\106\0\124\0\142\0\160\0\106\0\176\0\214\0\232"+ + "\0\106\0\106\0\106\0\106\0\106\0\106\0\106\0\106"+ + "\0\106"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[25]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 
16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\6\1\0\1\6\1\7\11\6\1\10\1\6\1\11"+ + "\1\6\1\7\1\12\1\6\1\13\1\6\1\14\4\6"+ + "\1\10\1\6\1\15\1\6\1\7\1\12\1\6\1\13"+ + "\1\6\1\14\4\6\2\10\1\15\1\6\1\7\1\16"+ + "\1\10\1\17\1\10\1\20\1\21\1\22\1\23\1\24"+ + "\1\10\1\6\1\15\1\6\1\7\1\12\1\6\1\13"+ + "\1\6\1\14\3\6\1\25\1\10\23\0\1\26\1\0"+ + "\1\27\15\0\1\30\15\0\1\31\13\0\1\26\1\0"+ + "\1\23\15\0\1\21\15\0\1\22\6\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[168]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\5\0\4\11\3\1\1\11\3\1\11\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[25]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int 
offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. 
+ * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexDE(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexDE(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 66) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? 
*/ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. 
+ */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. 
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 10: + { add("sz"); + } + case 15: break; + case 3: + { problem = 1; add(yytext()); + } + case 16: break; + case 6: + { add("ae"); + } + case 17: break; + case 2: + { add("s"); + } + case 18: break; + case 4: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 19: break; + case 12: + { add("ü"); + } + case 20: break; + case 8: + { add("ue"); + } + case 21: break; + case 11: + { add("u"); + } + case 22: break; + case 13: + { add("ä"); + } + case 23: break; + case 1: + { add(yytext()); + } + case 24: break; + case 9: + { add("ss"); + } + case 25: break; + case 7: + { add("oe"); + } + case 26: break; + case 14: + { add("ö"); + } + case 27: break; + case 5: + { switch (problem) { + case 1: return ""; + default: return normalized; + } + } + case 28: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexDE.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexDE.lex Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,117 @@ +/* + * Normalization rules for German text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 0.96 + * 2011-02-21 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexDE +%type java.lang.String +%unicode + +// German: de, deu, ger + +%states DISP, DICT, SEARCH +%state CELEX, GRIMM + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } +%} + +END = \n + +Alphabet = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ] + +%% + +ſ { add("s"); } + 
+// Fraktur + + { + +uͦ {add("u"); } +aͤ {add("ä"); } +oͤ {add("ö"); } +uͤ {add("ü"); } + +} + + { + +// normalize ä ö ü ß only for Celex! + +ä | Ä | aͤ { add("ae"); } +ö | Ö | oͤ { add("oe"); } +ü | Ü | uͤ { add("ue"); } +uͦ {add("u"); } +ß { add("ss"); } + +{Alphabet} { add(yytext()); } + +. { problem = 1; add(yytext()); } + +} + + { + +ß { add("sz"); } + +} + + +// default + +@ { problem = 1; add(yytext()); } +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized; + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt + +TO DO: + +DE: Trennung von Deutsch und Fraktur? +DE: Celex: hyphens weg? + +*/ diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEL.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEL.java Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,687 @@ +/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:03 */ + +/* + * Normalization rules for Greek text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 0.96 + * 2011-02-21 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 22.02.11 12:03 from the specification file + * MpdlNormalizerLexEL.lex + */ +public class MpdlNormalizerLexEL { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int 
YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 3, 3 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\1\65\0\1\3\32\3\6\0\32\3\u0331\0\1\4\1\5"+ + "\1\6\1\7\15\0\1\2\3\0\2\2\11\0\1\10\1\11\1\12"+ + "\u1ba1\0\1\13\1\0\1\15\1\0\1\16\1\0\1\20\1\0\1\21"+ + "\1\0\1\22\1\0\1\23\65\0\1\14\17\0\1\17\57\0\1\24"+ + "\ue00d\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\4\0\1\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\12\1\1\14\1\0\1\15"+ + "\1\0\1\16\1\0\1\17\1\0\1\20\1\0\1\21"+ + "\1\0\1\22\1\0\1\23\1\0\1\24\1\0\1\25"+ + "\1\0"; + + private static int [] zzUnpackAction() { + int [] result = new int[45]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\25\0\52\0\77\0\124\0\124\0\124\0\124"+ + 
"\0\124\0\124\0\124\0\124\0\124\0\124\0\124\0\151"+ + "\0\176\0\223\0\250\0\275\0\322\0\347\0\374\0\u0111"+ + "\0\u0126\0\124\0\u013b\0\124\0\u0150\0\124\0\u0165\0\124"+ + "\0\u017a\0\124\0\u018f\0\124\0\u01a4\0\124\0\u01b9\0\124"+ + "\0\u01ce\0\124\0\u01e3\0\124\0\u01f8"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[45]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\5\1\0\24\5\1\6\1\5\1\7\1\10\1\11"+ + "\1\12\1\13\1\14\1\15\1\16\13\5\1\17\1\5"+ + "\1\7\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+ + "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+ + "\1\30\1\31\1\5\1\6\1\5\1\7\1\10\1\11"+ + "\1\12\1\13\1\14\1\15\1\16\1\20\1\21\1\22"+ + "\1\23\1\24\1\25\1\26\1\27\1\30\1\31\26\0"+ + "\1\32\1\33\23\0\1\34\1\35\23\0\1\36\1\37"+ + "\23\0\1\40\1\41\23\0\1\42\1\43\23\0\1\44"+ + "\1\45\23\0\1\46\1\47\23\0\1\50\1\51\23\0"+ + "\1\52\1\53\23\0\1\54\1\55\23\0\1\32\24\0"+ + "\1\34\24\0\1\36\24\0\1\40\24\0\1\42\24\0"+ + "\1\44\24\0\1\46\24\0\1\50\24\0\1\52\24\0"+ + "\1\54\23\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[525]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + 
value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\4\0\13\11\12\1\1\11\1\0\1\11\1\0\1\11"+ + "\1\0\1\11\1\0\1\11\1\0\1\11\1\0\1\11"+ + "\1\0\1\11\1\0\1\11\1\0\1\11\1\0"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[45]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the 
buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexEL(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexEL(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 82) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. 
+ * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. 
+ * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. 
+ * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break 
zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 21: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ῴ"); + } + case 22: break; + case 6: + { add("ή"); + } + case 23: break; + case 15: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ή"); + } + case 24: break; + case 7: + { add("ί"); + } + case 25: break; + case 1: + { add(yytext()); + } + case 26: break; + case 20: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ώ"); + } + case 27: break; + case 17: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ί"); + } + case 28: break; + case 13: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ᾴ"); + } + case 29: break; + case 8: + { add("ό"); + } + case 30: break; + case 12: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ά"); + } + case 31: break; + case 9: + { add("ύ"); + } + case 32: break; + case 3: + { problem = 1; add(yytext()); + } + case 33: break; + case 18: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ό"); + } + case 34: break; + case 4: + { add("ά"); + } + case 35: break; + case 2: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 36: break; + case 10: + { add("ώ"); + } + case 37: break; + case 14: + // lookahead expression with fixed base length + 
zzMarkedPos = zzStartRead + 1; + { add("έ"); + } + case 38: break; + case 16: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ῄ"); + } + case 39: break; + case 5: + { add("έ"); + } + case 40: break; + case 11: + { switch (problem) { + case 1: return ""; + default: return normalized; + } + } + case 41: break; + case 19: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ύ"); + } + case 42: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEL.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEL.lex Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,123 @@ +/* + * Normalization rules for Greek text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 0.96 + * 2011-02-21 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexEL +%type java.lang.String +%unicode + +// Greek: el, grc + +%states DISP, DICT, SEARCH + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } +%} + +END = \n + +wordend = [νρς]? 
{END} + +Latin = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ] + + +%% + + { + +// replace tonos by oxia +// (although this should really be corrected in the text rather than normalized) +ά { add("ά"); } +έ { add("έ"); } +ή { add("ή"); } +ί { add("ί"); } +ό { add("ό"); } +ύ { add("ύ"); } +ώ { add("ώ"); } + +} + + { + +ὰ / {wordend} { add("ά"); } +ᾲ / {wordend} { add("ᾴ"); } +ὲ / {wordend} { add("έ"); } +ὴ / {wordend} { add("ή"); } +ῂ / {wordend} { add("ῄ"); } +ὶ / {wordend} { add("ί"); } +ὸ / {wordend} { add("ό"); } +ὺ / {wordend} { add("ύ"); } +ὼ / {wordend} { add("ώ"); } +ῲ / {wordend} { add("ῴ"); } + +// other candidates: Ὰ Ὲ Ὴ Ὶ Ὺ Ὸ Ὼ + +} + + { + +@ { problem = 1; add(yytext()); } +{Latin} { problem = 1; add(yytext()); } + +} + + +// default + +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized; + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt + +TO DO: + +EL: tonos --> oxia wieder rausnehmen, weil es im Text geändert werden muss? +EL: gibt es noch weitere Fälle, wo legitimerweise ein Gravis vorkommen kann? +EL: kommen Großbuchstaben mit Gravis bei uns jemals vor, und sollen sie normalisiert werden? +EL: neuer State BETACODE ? 
+EL: nicht falsche Zeichen definieren, sondern erlaubte Zeichen + +*/ diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEN.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEN.java Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,576 @@ +/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:03 */ + +/* + * Normalization rules for English text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 0.96 + * 2011-02-21 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 22.02.11 12:03 from the specification file + * MpdlNormalizerLexEN.lex + */ +public class MpdlNormalizerLexEN { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 1, 1 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\1\65\0\1\3\u013e\0\1\2\ufe80\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. 
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\3\0\1\1\1\2\1\3\1\4\1\5"; + + private static int [] zzUnpackAction() { + int [] result = new int[8]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\4\0\10\0\14\0\14\0\14\0\14\0\14"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[8]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\4\1\0\1\4\1\5\1\4\1\6\1\7\1\5"+ + "\1\4\1\10\1\7\1\5\4\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[16]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = 
packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\3\0\5\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[8]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, 
that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexEN(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexEN(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 14) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. 
+ * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. 
+ * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. 
+ * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break 
zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 5: + { switch (problem) { + case 1: return ""; + default: return normalized; + } + } + case 6: break; + case 2: + { problem = 1; add(yytext()); + } + case 7: break; + case 4: + { add("s"); + } + case 8: break; + case 3: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 9: break; + case 1: + { add(yytext()); + } + case 10: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEN.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEN.lex Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,83 @@ +/* + * Normalization rules for English text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 0.96 + * 2011-02-21 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexEN +%type java.lang.String +%unicode + +// 1.5 English: en + +%states DISP, DICT, SEARCH + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } +%} + +END = \n + +%% + + { + +ſ { add("s"); } + +} + + +// default + +@ { 
problem = 1; add(yytext()); } +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized; + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt + +TO DO: + +EN: vollständig? + +*/ diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexFR.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexFR.java Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,621 @@ +/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:03 */ + +/* + * Normalization rules for French text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 0.96 + * 2011-02-21 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 22.02.11 12:03 from the specification file + * MpdlNormalizerLexFR.lex + */ +public class MpdlNormalizerLexFR { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int CELEX = 8; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 1, 1, 3, 3 + }; + + /** + * Translates 
characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\1\65\0\1\15\32\2\6\0\32\2\144\0\1\4\3\7"+ + "\3\0\1\5\1\0\3\10\1\0\3\11\3\0\3\12\4\0\3\13"+ + "\126\0\2\6\53\0\1\3\u1e99\0\1\14\udfe6\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\4\0\1\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16"; + + private static int [] zzUnpackAction() { + int [] result = new int[18]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\16\0\34\0\52\0\70\0\70\0\70\0\70"+ + "\0\70\0\70\0\70\0\70\0\70\0\70\0\70\0\70"+ + "\0\70\0\70"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[18]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + 
private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\5\1\0\13\5\1\6\1\5\1\7\1\5\1\10"+ + "\1\11\1\12\7\5\1\6\1\5\1\13\1\5\1\10"+ + "\1\11\1\12\7\5\2\6\1\13\1\5\1\10\1\11"+ + "\1\12\1\14\1\15\1\16\1\17\1\20\1\21\1\22"+ + "\1\6\16\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[70]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string, which is read as (count, value) char pairs */ + int j = offset; /* index in unpacked array; the final value is returned to the caller */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; /* values are stored one too high, so a packed 0 decodes to -1 */ + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes (used as indices into ZZ_ERROR_MSG) */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above, in the same order */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attribute bits of state aState (yylex tests the masks 1 and 8) + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\4\0\16\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[18]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string, which is read as (count, value) char pairs */ + int j = offset; /* index in unpacked array; the final value is returned to the caller */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ +
private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the text position at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code (state shared by the scanner actions in yylex): */ + private String original = ""; /* every matched text seen by add(), concatenated unchanged */ + private String normalized = ""; /* every replacement string passed to add(), concatenated */ + private int problem = 0; /* set to 1 by some scanner actions; the END actions switch on it to pick the return value */ + + private void add (String norm) { /* records the current match (yytext()) together with its replacement norm */ + original += yytext(); + normalized += norm; + } + + + /** + * Creates a new scanner that reads from the given java.io.Reader. + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexFR(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. 
+ */ + public MpdlNormalizerLexFR(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); /* NOTE(review): no charset is given, so the platform default charset decodes the stream */ + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table, read as (count, value) char pairs + * @return the unpacked character translation table (0x10000 entries, one per char value) + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 54) { /* 54 is the number of chars in ZZ_CMAP_PACKED */ + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false iff at least one new character was read; true if the reader reported end of stream. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) by moving the unconsumed text to the start of the buffer */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: replace it with one of twice the current position in size */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. 
+ */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * The scanner-internal position, line/column and EOF variables are reset (the user-code fields original, normalized and problem are not touched); the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to YYINITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression, as a new String copied from the buffer. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but reads the buffer directly instead of creating a String. + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1 (the value is not range-checked here). + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. 
+ * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { /* code outside the message table: use the generic message */ + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); /* always throws; this method never returns normally */ + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by the next call of the scanning method. + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength(); otherwise zzScanError(ZZ_PUSHBACK_2BIG) is called. + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. 
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 2: + { problem = 1; add(yytext()); + } + case 15: break; + case 6: + { add("ae"); + } + case 16: break; + case 4: + { add("s"); + } + case 17: break; + case 12: + { add("o"); + } + case 18: break; + case 3: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 19: break; + case 13: + { add("u"); + } + case 20: break; + case 1: + { add(yytext()); + } + case 21: break; + case 11: + { add("i"); + } + case 22: break; + case 14: + { add(""); + } + case 23: break; + case 10: + { add("e"); + } + case 24: break; + case 9: + { add("a"); + } + case 25: break; + case 5: + { add("ss"); + } + case 26: break; + case 8: + { add("oe"); + } + case 27: break; + case 7: + { switch (problem) { + case 1: return ""; + default: return normalized; + } + } + case 28: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexFR.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexFR.lex Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,104 @@ +/* + * Normalization rules for French text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 0.96 + * 2011-02-21 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexFR +%type java.lang.String +%unicode + +// French: fr + +%states DISP, DICT, SEARCH +%state CELEX + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } +%} + +END = \n + +Alphabet = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ] + +%% + + { + +ſ { add("s"); } +ß { add("ss"); 
} +æ { add("ae"); } + +} + + { + +[œŒ] { add("oe"); } +[áàâ] { add("a"); } +[éèê] { add("e"); } +[íìî] { add("i"); } +[óòô] { add("o"); } +[úùû] { add("u"); } +’ { add(""); } + +{Alphabet} { add(yytext()); } + +. { problem = 1; add(yytext()); } // in particular "@" + +} + +// default + +@ { problem = 1; add(yytext()); } +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized; + } + } +} + + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt + +TO DO: + +FR: richtig? vollständig? + +*/ diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexIT.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexIT.java Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,874 @@ +/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:03 */ + +/* + * Normalization rules for Italian text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 0.96 + * 2011-02-21 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 22.02.11 12:03 from the specification file + * MpdlNormalizerLexIT.lex + */ +public class MpdlNormalizerLexIT { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + 
* ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 2, 3, 4, 5, 6 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\5\42\0\1\4\22\0\1\51\1\1\3\2\1\1\3\2"+ + "\1\40\1\0\1\2\1\3\2\2\1\41\1\2\1\47\1\3\1\2"+ + "\1\37\1\44\1\50\2\2\1\0\1\2\6\0\1\43\3\2\1\11"+ + "\2\2\1\42\1\6\1\35\1\2\1\3\1\2\1\7\1\36\1\13"+ + "\1\45\1\12\1\2\1\10\1\15\1\46\2\2\1\0\1\2\62\0"+ + "\1\4\22\0\1\16\5\0\1\32\1\0\1\17\3\0\1\20\5\0"+ + "\1\21\6\0\1\22\5\0\1\30\1\23\5\0\1\31\1\0\1\24"+ + "\3\0\1\25\5\0\1\26\6\0\1\27\37\0\1\1\70\0\1\34"+ + "\1\33\53\0\1\14\ufe80\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\11\0\1\1\1\2\2\3\1\4\1\5\1\2\1\3"+ + "\1\6\1\2\1\7\1\10\1\11\1\12\1\13\5\3"+ + "\1\14\1\2\1\3\1\6\1\2\1\15\1\16\1\17"+ + "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+ + "\1\30\4\0\1\31\1\32\1\0\1\33\1\0\1\34"+ + "\1\35\1\0\1\36\1\37\1\40\4\0\1\41\5\0"+ + "\1\42\1\43\2\0\1\44\1\0\1\45\5\0\1\44"+ + "\1\46\3\0\1\47"; + + private static int [] zzUnpackAction() { + int [] result = new int[89]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition 
table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\52\0\124\0\176\0\250\0\322\0\374\0\u0126"+ + "\0\u0150\0\0\0\0\0\0\0\u017a\0\0\0\0\0\u01a4"+ + "\0\u01ce\0\0\0\u01f8\0\0\0\0\0\0\0\0\0\0"+ + "\0\u0222\0\u024c\0\u0276\0\u02a0\0\u02ca\0\0\0\u02f4\0\u031e"+ + "\0\u0348\0\u0372\0\u039c\0\0\0\0\0\0\0\0\0\0"+ + "\0\0\0\0\0\0\0\0\0\0\0\0\0\u03c6\0\u03f0"+ + "\0\u041a\0\0\0\0\0\0\0\u0444\0\0\0\u046e\0\0"+ + "\0\0\0\u0498\0\0\0\0\0\0\0\u04c2\0\u04ec\0\u0516"+ + "\0\u0540\0\0\0\u056a\0\u0594\0\u05be\0\u05e8\0\u0612\0\0"+ + "\0\0\0\u063c\0\u031e\0\u0666\0\u0690\0\0\0\u06ba\0\u06e4"+ + "\0\u070e\0\0\0\u0738\0\0\0\0\0\u0762\0\u078c\0\u07b6"+ + "\0\0"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[89]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\52\0\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+ + "\1\14\1\21\1\13\1\15\1\14\1\22\1\23\5\12"+ + "\2\13\1\12\2\13\1\24\1\25\1\26\1\27\1\30"+ + "\1\12\1\13\1\31\2\13\1\14\1\13\1\23\1\32"+ + "\1\33\1\34\1\35\1\36\1\12\1\13\1\14\1\15"+ + "\1\16\1\17\1\37\1\14\1\21\1\13\1\15\1\40"+ + "\1\41\1\42\5\12\2\13\1\12\2\13\1\24\1\25"+ + "\1\26\1\27\1\30\1\12\1\13\1\31\2\13\1\43"+ + "\1\13\1\42\1\32\1\33\1\34\1\35\1\36\1\12"+ + "\1\13\1\14\1\15\1\16\1\44\1\20\1\14\1\21"+ + "\1\13\1\15\1\14\1\22\1\23\1\45\1\46\1\47"+ + "\1\50\1\51\1\52\1\53\1\54\1\55\1\56\1\24"+ + "\1\25\1\26\1\27\1\30\1\12\1\13\1\31\2\13"+ + 
"\1\14\1\13\1\23\1\32\1\33\1\34\1\35\1\36"+ + "\1\12\1\13\1\14\1\15\1\16\1\44\1\37\1\14"+ + "\1\21\1\13\1\15\1\40\1\41\1\42\1\45\1\46"+ + "\1\47\1\50\1\51\1\52\1\53\1\54\1\55\1\56"+ + "\1\24\1\25\1\26\1\27\1\30\1\12\1\13\1\31"+ + "\2\13\1\43\1\13\1\42\1\32\1\33\1\34\1\35"+ + "\1\36\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+ + "\1\14\1\21\1\13\1\15\1\14\1\22\1\23\1\45"+ + "\1\46\1\47\1\50\1\51\1\52\1\53\1\54\1\55"+ + "\1\56\1\24\1\25\1\26\1\27\1\30\1\12\1\13"+ + "\1\31\2\13\1\14\1\13\1\23\1\32\1\33\1\34"+ + "\1\35\1\36\1\12\1\13\1\14\1\15\1\16\1\17"+ + "\1\37\1\14\1\21\1\13\1\15\1\40\1\41\1\42"+ + "\1\45\1\46\1\47\1\50\1\51\1\52\1\53\1\54"+ + "\1\55\1\56\1\24\1\25\1\26\1\27\1\30\1\12"+ + "\1\13\1\31\2\13\1\43\1\13\1\42\1\32\1\33"+ + "\1\34\1\35\1\36\6\0\1\57\4\0\1\60\1\61"+ + "\41\0\1\62\113\0\1\63\1\0\1\63\36\0\1\64"+ + "\22\0\1\65\44\0\1\66\4\0\1\66\2\0\1\66"+ + "\3\0\1\66\5\0\2\66\1\0\2\66\1\0\3\66"+ + "\2\0\1\66\1\0\2\66\1\0\2\66\45\0\1\67"+ + "\57\0\1\70\5\0\2\71\1\72\2\0\2\71\1\0"+ + "\3\71\13\0\1\71\6\0\1\71\2\0\1\71\2\0"+ + "\4\71\47\0\1\73\1\0\1\74\3\0\2\75\1\76"+ + "\2\0\2\75\1\0\3\75\13\0\1\75\6\0\1\75"+ + "\2\0\1\75\2\0\4\75\10\0\1\77\25\0\1\64"+ + "\25\0\1\100\51\0\1\100\3\0\1\101\35\0\1\102"+ + "\4\0\1\102\2\0\1\102\3\0\1\102\5\0\2\102"+ + "\1\0\2\102\1\0\3\102\2\0\1\102\1\0\2\102"+ + "\1\0\2\102\43\0\1\103\4\0\1\104\15\0\1\105"+ + "\53\0\1\106\51\0\1\106\3\0\1\107\72\0\1\110"+ + "\54\0\1\111\12\0\2\71\3\0\2\71\1\0\3\71"+ + "\13\0\1\71\6\0\1\71\2\0\1\71\2\0\4\71"+ + "\3\0\2\75\3\0\2\75\1\0\3\75\13\0\1\75"+ + "\6\0\1\75\2\0\1\75\2\0\4\75\5\0\1\112"+ + "\3\0\1\113\53\0\1\114\43\0\1\115\6\0\1\113"+ + "\43\0\1\116\51\0\1\116\1\117\1\120\46\0\1\121"+ + "\3\0\1\60\53\0\1\122\43\0\1\123\6\0\1\60"+ + "\46\0\1\113\45\0\1\124\60\0\1\113\43\0\1\125"+ + "\50\0\1\126\2\0\1\127\52\0\1\60\54\0\1\60"+ + "\45\0\1\127\100\0\1\130\20\0\1\131\44\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[2016]; + int offset = 0; + offset = 
zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\1\10\7\0\1\1\3\11\1\1\2\11\2\1\1\11"+ + "\1\1\5\11\5\1\1\11\5\1\13\11\3\0\3\11"+ + "\1\0\1\11\1\0\2\11\1\0\3\11\4\0\1\11"+ + "\5\0\2\11\2\0\1\1\1\0\1\11\3\0\1\11"+ + "\1\0\2\11\3\0\1\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[89]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be 
matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /** For the backwards DFA of general lookahead statements */ + private boolean [] zzFin = new boolean [ZZ_BUFFERSIZE+1]; + + /* user code: */ + private static final int CONS = 1; + private static final int VOWEL = 2; + private int cv = 0; // consonant = 1, vowel = 2, everything else = 0 + + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexIT(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. 
+ */ + public MpdlNormalizerLexIT(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 168) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. 
+ */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. 
+ * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. 
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + if (zzMarkedPosL > zzStartRead) { + switch (zzBufferL[zzMarkedPosL-1]) { + case '\n': + case '\u000B': + case '\u000C': + case '\u0085': + case '\u2028': + case '\u2029': + zzAtBOL = true; + break; + case '\r': + if (zzMarkedPosL < zzEndReadL) + zzAtBOL = zzBufferL[zzMarkedPosL] != '\n'; + else if (zzAtEOF) + zzAtBOL = false; + else { + boolean eof = zzRefill(); + zzMarkedPosL = zzMarkedPos; + zzEndReadL = zzEndRead; + zzBufferL = zzBuffer; + if (eof) + zzAtBOL = false; + else + zzAtBOL = zzBufferL[zzMarkedPosL] != '\n'; + } + break; + default: + zzAtBOL = false; + } + } + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + if (zzAtBOL) + zzState = ZZ_LEXSTATE[zzLexicalState+1]; + else + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = 
zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 32: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = CONS; add("U"); + } + case 40: break; + case 15: + { add("Á"); + } + case 41: break; + case 39: + // lookahead expression with fixed lookahead length + yypushback(1); + { add(yytext()); + } + case 42: break; + case 38: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 3; + { add(yytext()); + } + case 43: break; + case 37: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add(yytext()); + } + case 44: break; + case 4: + { add(yytext()); + } + case 45: break; + case 22: + { add("í"); + } + case 46: break; + case 9: + { cv = VOWEL; add("AE"); + } + case 47: break; + case 5: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 48: break; + case 29: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = CONS; add("u"); + } + case 49: break; + case 20: + { add("á"); + } + case 50: break; + case 1: + { cv = 0; add(yytext()); + } + case 51: break; + case 33: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); + } + case 52: break; + case 34: + { cv = VOWEL; add("zio"); + } + case 53: break; + case 11: + { cv = VOWEL; add("OE"); + } + case 54: break; + case 19: + { add("Ú"); + } + case 55: break; + case 36: + // general lookahead, find correct zzMarkedPos + { int zzFState = 7; + int zzFPos = zzStartRead; + if (zzFin.length <= zzBufferL.length) { zzFin = new boolean[zzBufferL.length+1]; } + boolean zzFinL[] = zzFin; + while (zzFState != -1 
&& zzFPos < zzMarkedPos) { + if ((zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; } + zzInput = zzBufferL[zzFPos++]; + zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ]; + } + if (zzFState != -1 && (zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; } + + zzFState = 8; + zzFPos = zzMarkedPos; + while (!zzFinL[zzFPos] || (zzAttrL[zzFState] & 1) != 1) { + zzInput = zzBufferL[--zzFPos]; + zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ]; + }; + zzMarkedPos = zzFPos; + } + { cv = VOWEL; add(yytext().replace("ſ", "s")); + } + case 56: break; + case 3: + { cv = CONS; add(yytext()); + } + case 57: break; + case 31: + { cv = CONS; add("QU"); + } + case 58: break; + case 16: + { add("É"); + } + case 59: break; + case 27: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { switch(cv) { + case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break; + default: cv = VOWEL; add(yytext()); break; + } + } + case 60: break; + case 7: + { cv = CONS; add("ss"); + } + case 61: break; + case 6: + { cv = CONS; add("s"); + } + case 62: break; + case 35: + { cv = VOWEL; add("ZIO"); + } + case 63: break; + case 2: + { cv = VOWEL; add(yytext()); + } + case 64: break; + case 18: + { add("Ó"); + } + case 65: break; + case 24: + { add("ú"); + } + case 66: break; + case 30: + { cv = CONS; add("Qu"); + } + case 67: break; + case 21: + { add("é"); + } + case 68: break; + case 8: + { cv = VOWEL; add("ae"); + } + case 69: break; + case 14: + { switch (problem) { + case 1: return ""; + default: return normalized; + } + } + case 70: break; + case 13: + { add(""); + } + case 71: break; + case 23: + { add("ó"); + } + case 72: break; + case 10: + { cv = VOWEL; add("oe"); + } + case 73: break; + case 28: + { cv = CONS; add("qu"); + } + case 74: break; + case 12: + { problem = 1; add(yytext()); + } + case 75: break; + case 25: + { switch(cv) { + case CONS: add(yytext().replace("v", "u").replace("V", "U")); break; + default: cv = 
CONS; add(yytext()); break; + } + } + case 76: break; + case 26: + { cv = VOWEL; add("ii"); + } + case 77: break; + case 17: + { add("Í"); + } + case 78: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexIT.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexIT.lex Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,171 @@ +/* + * Normalization rules for Italian text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 0.96 + * 2011-02-21 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexIT +%type java.lang.String +%unicode + +// Italian: it, ita + +%states DISP, DICT, SEARCH + +%{ + private static final int CONS = 1; + private static final int VOWEL = 2; + private int cv = 0; // consonant = 1, vowel = 2, everything else = 0 + + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } +%} + +Vowel = [AEIOUaeiouÆæęàèòùœ] +Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß] +LR = [lLrR] + + +hyphen = [\u002d\u00ad] // hyphen and soft hyphen +X = {hyphen}? 
+ +END = \n + +prefixCons = (in{X}ter | per | ſu{X}per | ſer) + +%% + + { + +À { add("Á"); } +È { add("É"); } +Ì { add("Í"); } +Ò { add("Ó"); } +Ù { add("Ú"); } +à { add("á"); } +è { add("é"); } +ì { add("í"); } +ò { add("ó"); } +ù { add("ú"); } + +} + + { + +ſ { cv = CONS; add("s"); } +ß { cv = CONS; add("ss"); } +æ { cv = VOWEL; add("ae"); } +Æ { cv = VOWEL; add("AE"); } +œ { cv = VOWEL; add("oe"); } +Œ { cv = VOWEL; add("OE"); } + +ij { cv = VOWEL; add("ii"); } + +tio { cv = VOWEL; add("zio"); } +TIO { cv = VOWEL; add("ZIO"); } + +// h-Regeln aus Arboreal: +^ ha / {END} { add(yytext()); } +^ hai / {END} { add(yytext()); } +^ han{X}no / {END} { add(yytext()); } +^ ho / {END} { add(yytext()); } +^ h { add(""); } + + +// u/v rules are taken from MpdlNormalizerLexLA.lex + +// 1. rules for u --> v + +^ {prefixCons} / {X} { cv = VOWEL; add(yytext().replace("ſ", "s")); } + +^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); } + + +[uU] / {Vowel} { + switch(cv) { + case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break; + default: cv = VOWEL; add(yytext()); break; + } + } + +// 2. rules for v --> u + +qv { cv = CONS; add("qu"); } // the replaced v still counts as consonant +Qv { cv = CONS; add("Qu"); } +QV { cv = CONS; add("QU"); } + +{LR} [vV] { + switch(cv) { + case CONS: add(yytext().replace("v", "u").replace("V", "U")); break; + default: cv = CONS; add(yytext()); break; + } + } + +v / {X} {Cons} { cv = CONS; add("u"); } +V / {X} {Cons} { cv = CONS; add("U"); } + +// 3. override default rule for . + +{Vowel} { cv = VOWEL; add(yytext()); } +{Cons} { cv = CONS; add(yytext()); } +{hyphen} { add(yytext()); } +@ { problem = 1; add(yytext()); } +. 
{ cv = 0; add(yytext()); } + +} + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized; + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt + +TO DO: + +IT: all these rules are taken from Arboreal; do we need them all? +IT: richtig? vollständig? +IT: Sind die u/v-Regeln wirklich genau wie in LA ? insbesondere: gleiche Vokal-Klasse? +IT: Änderungen in den lateinischen u/v-Regeln übernehmen? +IT: italienische Beispielwörter für die u/v-Regeln angeben +IT: Brauchen wir die Gravis-Regeln aus Arboreal in DICT wirklich? +IT: wenn ja: gehört À --> Á etc. in die Wörterbuch-Schicht? Und einschränken auf letzte Silbe? +IT: ist prefixCons = (inter | per | ſuper | ſer) auch für Italienisch gültig? + +*/ diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.java Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,990 @@ +/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:04 */ + +/* + * Normalization rules for Latin text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 0.96 + * 2011-02-21 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 22.02.11 12:04 from the specification file + * MpdlNormalizerLexLA.lex + */ +public class MpdlNormalizerLexLA { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int 
RENAISSANCE_DICT = 10; + public static final int RENAISSANCE_DISP = 8; + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int RENAISSANCE_SEARCH = 12; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 2, 3, 4, 1, 2, 1, 2, 3, 4, 1, 2 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\5\42\0\1\4\23\0\1\1\3\2\1\1\2\2\1\52"+ + "\1\1\1\0\1\2\1\3\2\2\1\1\1\2\1\45\1\3\2\2"+ + "\1\63\1\64\2\2\1\0\1\2\6\0\1\56\1\2\1\46\1\42"+ + "\1\10\2\2\1\50\1\13\1\26\1\2\1\47\1\37\1\12\1\60"+ + "\1\16\1\6\1\15\1\31\1\14\1\7\1\11\2\2\1\0\1\2"+ + "\62\0\1\4\30\0\1\24\30\0\1\22\1\36\1\30\1\54\3\0"+ + "\1\23\1\0\1\40\1\32\1\0\1\57\1\44\1\33\1\51\1\61"+ + "\2\0\1\41\1\34\1\53\4\0\1\43\1\35\1\55\1\62\34\0"+ + "\1\23\71\0\1\25\53\0\1\17\u0181\0\1\27\ud4fe\0\1\20\u0590\0"+ + "\1\21\u226e\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. 
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\10\0\1\1\1\2\2\3\1\4\1\5\1\3\1\2"+ + "\1\3\1\2\1\6\1\1\1\7\1\10\1\11\1\12"+ + "\11\1\1\3\2\1\3\2\2\3\2\2\1\3\1\6"+ + "\3\3\1\1\1\2\1\13\4\0\1\14\1\15\1\16"+ + "\1\0\1\17\1\20\1\21\1\22\1\0\1\23\20\0"+ + "\1\24\3\0\1\25\3\0\1\26\1\0\1\27\3\0"+ + "\1\30\1\31\1\32\1\0\1\33\1\34\2\0\1\35"+ + "\16\0\1\36\1\0\1\37\1\0\1\40\1\0\1\41"+ + "\1\42\1\43\1\44\1\0\1\45\1\0\1\46\1\0"+ + "\1\47\1\0\1\50\3\0\1\51\10\0\1\52\6\0"+ + "\1\53\1\51\1\54\1\55\1\56\1\57\5\0"; + + private static int [] zzUnpackAction() { + int [] result = new int[166]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\65\0\152\0\237\0\324\0\u0109\0\u013e\0\u0173"+ + "\0\u01a8\0\u01a8\0\u01a8\0\u01dd\0\u01a8\0\u01a8\0\u0212\0\u0247"+ + "\0\u027c\0\u02b1\0\u01a8\0\u0173\0\u01a8\0\u01a8\0\u01a8\0\u01a8"+ + "\0\u02e6\0\u031b\0\u0350\0\u0385\0\u03ba\0\u03ef\0\u0424\0\u0459"+ + "\0\u048e\0\u04c3\0\u04f8\0\u052d\0\u0562\0\u0597\0\u05cc\0\u0601"+ + "\0\u0636\0\u066b\0\u06a0\0\u06d5\0\u070a\0\u073f\0\u0774\0\u07a9"+ + "\0\u07de\0\u0813\0\u01a8\0\u0848\0\u087d\0\u08b2\0\u01a8\0\u01a8"+ + "\0\u01a8\0\u01a8\0\u08e7\0\u01a8\0\u01a8\0\u01a8\0\u01a8\0\u091c"+ + "\0\u01a8\0\u0951\0\u0986\0\u09bb\0\u09f0\0\u0a25\0\u0a5a\0\u0a8f"+ + "\0\u0ac4\0\u0af9\0\u0b2e\0\u0b63\0\u0b98\0\u0bcd\0\u0c02\0\u0c37"+ + 
"\0\u0c6c\0\u01a8\0\u0ca1\0\u0cd6\0\u0d0b\0\u01a8\0\u0d40\0\u0d75"+ + "\0\u0daa\0\u01a8\0\u0ddf\0\u01a8\0\u0e14\0\u0e49\0\u0e7e\0\u01a8"+ + "\0\u01a8\0\u01a8\0\u0eb3\0\u01a8\0\u01a8\0\u0ee8\0\u0f1d\0\u01a8"+ + "\0\u0f52\0\u0f87\0\u0fbc\0\u0ff1\0\u1026\0\u105b\0\u1090\0\u10c5"+ + "\0\u10fa\0\u112f\0\u1164\0\u1199\0\u11ce\0\u07de\0\u01a8\0\u1203"+ + "\0\u01a8\0\u1238\0\u01a8\0\u126d\0\u01a8\0\u01a8\0\u01a8\0\u01a8"+ + "\0\u12a2\0\u01a8\0\u12d7\0\u01a8\0\u130c\0\u01a8\0\u1341\0\u01a8"+ + "\0\u1376\0\u13ab\0\u06d5\0\u13e0\0\u1415\0\u144a\0\u147f\0\u14b4"+ + "\0\u14e9\0\u01a8\0\u151e\0\u1553\0\u01a8\0\u1588\0\u15bd\0\u15f2"+ + "\0\u1627\0\u165c\0\u1691\0\u01a8\0\u01a8\0\u01a8\0\u01a8\0\u01a8"+ + "\0\u01a8\0\u16c6\0\u16fb\0\u1730\0\u1765\0\u179a"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[166]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\20\0\1\10\44\0\1\11\1\12\1\13\1\14\1\15"+ + "\1\16\1\17\1\20\1\12\1\21\1\13\1\22\1\13"+ + "\1\14\1\13\1\23\1\24\1\11\1\25\1\26\1\27"+ + "\1\30\2\11\1\31\1\13\1\32\1\33\1\34\1\35"+ + "\1\36\1\13\1\37\1\40\1\13\1\41\1\11\1\42"+ + "\1\13\1\14\1\13\1\11\1\13\1\11\1\43\1\44"+ + "\1\45\1\11\1\46\2\11\1\47\1\50\1\11\1\12"+ + "\1\13\1\14\1\15\1\16\1\51\1\52\1\12\1\21"+ + "\1\13\1\53\1\13\1\14\1\54\1\55\1\24\1\11"+ + "\1\25\1\26\1\27\1\30\2\11\1\31\1\13\1\32"+ + "\1\33\1\34\1\35\1\36\1\13\1\37\1\40\1\13"+ + "\1\41\1\11\1\56\1\13\1\14\1\57\1\11\1\60"+ + 
"\1\61\1\43\1\44\1\45\1\11\1\46\2\11\1\62"+ + "\1\50\1\11\1\12\1\13\1\14\1\15\1\63\1\17"+ + "\1\20\1\12\1\21\1\13\1\22\1\13\1\14\1\13"+ + "\1\23\1\24\1\11\1\25\1\26\1\27\1\30\2\11"+ + "\1\31\1\13\1\32\1\33\1\34\1\35\1\36\1\13"+ + "\1\37\1\40\1\13\1\41\1\11\1\42\1\13\1\14"+ + "\1\13\1\11\1\13\1\11\1\43\1\44\1\45\1\11"+ + "\1\46\2\11\1\47\1\50\1\11\1\12\1\13\1\14"+ + "\1\15\1\63\1\51\1\52\1\12\1\21\1\13\1\53"+ + "\1\13\1\14\1\54\1\55\1\24\1\11\1\25\1\26"+ + "\1\27\1\30\2\11\1\31\1\13\1\32\1\33\1\34"+ + "\1\35\1\36\1\13\1\37\1\40\1\13\1\41\1\11"+ + "\1\56\1\13\1\14\1\57\1\11\1\60\1\61\1\43"+ + "\1\44\1\45\1\11\1\46\2\11\1\62\1\50\13\0"+ + "\1\64\2\0\1\65\1\66\51\0\1\67\101\0\1\70"+ + "\141\0\1\71\52\0\1\71\11\0\1\72\15\0\1\73"+ + "\36\0\1\74\5\0\2\74\2\0\1\74\42\0\1\74"+ + "\1\0\1\74\1\75\1\76\1\74\3\0\2\77\1\100"+ + "\1\0\1\77\2\0\2\77\1\0\4\77\2\0\1\77"+ + "\6\0\1\77\5\0\1\77\2\0\1\77\2\0\4\77"+ + "\1\0\1\77\11\0\1\77\26\0\1\101\44\0\1\102"+ + "\2\0\2\103\1\0\2\104\13\0\1\104\5\0\1\104"+ + "\33\0\1\105\2\0\2\106\1\0\2\107\13\0\1\107"+ + "\5\0\1\107\33\0\1\110\2\0\2\111\1\0\2\112"+ + "\13\0\1\112\5\0\1\112\33\0\1\113\2\0\2\114"+ + "\1\0\2\115\13\0\1\115\5\0\1\115\33\0\1\116"+ + "\1\0\1\117\2\120\1\0\2\121\13\0\1\121\5\0"+ + "\1\121\32\0\1\122\1\102\22\0\1\123\5\0\1\124"+ + "\6\0\1\125\23\0\1\126\1\105\5\0\1\127\1\130"+ + "\13\0\1\131\40\0\1\132\1\113\33\0\1\133\27\0"+ + "\1\134\23\0\1\135\5\0\1\136\7\0\1\137\26\0"+ + "\1\140\52\0\1\141\5\0\1\122\1\102\6\0\1\142"+ + "\100\0\1\143\112\0\1\26\64\0\1\30\1\0\1\144"+ + "\4\0\1\74\5\0\2\74\2\0\1\74\42\0\1\74"+ + "\1\0\1\74\2\0\1\74\3\0\2\145\1\146\1\0"+ + "\1\145\2\0\2\145\1\0\4\145\2\0\1\145\6\0"+ + "\1\145\5\0\1\145\2\0\1\145\2\0\4\145\1\0"+ + "\1\145\11\0\1\145\7\0\1\147\1\0\1\72\15\0"+ + "\1\73\36\0\1\150\5\0\2\150\2\0\1\150\42\0"+ + "\1\150\1\0\1\150\1\75\1\76\1\150\13\0\1\151"+ + "\13\0\1\101\46\0\1\152\63\0\1\153\1\152\63\0"+ + "\1\154\1\0\1\140\52\0\1\141\51\0\1\155\64\0"+ + 
"\1\156\20\0\1\132\60\0\1\150\5\0\2\150\2\0"+ + "\1\150\42\0\1\150\1\0\1\150\2\0\1\150\13\0"+ + "\1\157\62\0\1\160\63\0\1\161\1\160\63\0\1\162"+ + "\57\0\2\77\2\0\1\77\2\0\2\77\1\0\4\77"+ + "\2\0\1\77\6\0\1\77\5\0\1\77\2\0\1\77"+ + "\2\0\4\77\1\0\1\77\11\0\1\77\7\0\1\103"+ + "\65\0\1\163\62\0\1\102\2\0\2\103\61\0\1\106"+ + "\65\0\1\164\62\0\1\105\2\0\2\106\61\0\1\111"+ + "\65\0\1\165\62\0\1\110\2\0\2\111\61\0\1\114"+ + "\65\0\1\166\62\0\1\113\2\0\2\114\61\0\1\120"+ + "\62\0\1\167\67\0\1\170\62\0\1\116\2\0\2\120"+ + "\57\0\1\171\1\172\63\0\1\173\1\174\63\0\1\175"+ + "\64\0\1\176\64\0\1\177\64\0\1\200\1\201\63\0"+ + "\1\202\1\203\63\0\1\204\1\205\63\0\1\206\1\207"+ + "\63\0\1\210\64\0\1\204\61\0\2\145\2\0\1\145"+ + "\2\0\2\145\1\0\4\145\2\0\1\145\6\0\1\145"+ + "\5\0\1\145\2\0\1\145\2\0\4\145\1\0\1\145"+ + "\11\0\1\145\44\0\1\211\24\0\1\212\7\0\1\213"+ + "\65\0\1\214\53\0\1\215\11\0\1\213\112\0\1\216"+ + "\66\0\1\217\64\0\1\220\22\0\1\221\7\0\1\65"+ + "\65\0\1\222\53\0\1\223\11\0\1\65\56\0\1\224"+ + "\61\0\1\122\64\0\1\126\64\0\1\225\64\0\1\134"+ + "\66\0\1\226\64\0\1\227\64\0\1\230\64\0\1\231"+ + "\64\0\1\232\64\0\1\233\62\0\1\234\73\0\1\213"+ + "\54\0\1\235\76\0\1\213\53\0\1\236\64\0\1\237"+ + "\64\0\1\240\73\0\1\65\66\0\1\65\53\0\1\241"+ + "\67\0\1\242\64\0\1\243\64\0\1\244\64\0\1\245"+ + "\64\0\1\143\64\0\1\246\61\0\1\171\64\0\1\173"+ + "\64\0\1\200\64\0\1\202\64\0\1\206\57\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[6095]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int 
ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\6\0\1\1\1\0\3\11\1\1\2\11\4\1\1\11"+ + "\1\1\4\11\32\1\1\11\3\0\4\11\1\0\4\11"+ + "\1\0\1\11\20\0\1\11\3\0\1\11\3\0\1\11"+ + "\1\0\1\11\3\0\3\11\1\0\2\11\2\0\1\11"+ + "\16\0\1\11\1\0\1\11\1\0\1\11\1\0\4\11"+ + "\1\0\1\11\1\0\1\11\1\0\1\11\1\0\1\11"+ + "\3\0\1\1\5\0\1\11\2\0\1\11\6\0\6\11"+ + "\5\0"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[166]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int 
zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /** For the backwards DFA of general lookahead statements */ + private boolean [] zzFin = new boolean [ZZ_BUFFERSIZE+1]; + + /* user code: */ + private static final int CONS = 1; + private static final int VOWEL = 2; + private int cv = 0; // consonant = 1, vowel = 2, everything else = 0 + + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexLA(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexLA(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. 
+ * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 184) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. 
+ * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to YYINITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occurred while scanning. + * + * In a well-formed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. 
+ * + * @param errorCode the code of the error message to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by the next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + if (zzMarkedPosL > zzStartRead) { + switch (zzBufferL[zzMarkedPosL-1]) { + case '\n': + case '\u000B': + case '\u000C': + case '\u0085': + case '\u2028': + case '\u2029': + zzAtBOL = true; + break; + case '\r': + if (zzMarkedPosL < zzEndReadL) + zzAtBOL = zzBufferL[zzMarkedPosL] != '\n'; + else if (zzAtEOF) + zzAtBOL = false; + else { + boolean eof = zzRefill(); + zzMarkedPosL = zzMarkedPos; + zzEndReadL = zzEndRead; + zzBufferL = zzBuffer; + if (eof) + zzAtBOL = false; + else + zzAtBOL = zzBufferL[zzMarkedPosL] != '\n'; + } + break; + default: + zzAtBOL = false; + } + } + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = 
zzMarkedPosL; + + if (zzAtBOL) + zzState = ZZ_LEXSTATE[zzLexicalState+1]; + else + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 39: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("um"); + } + case 48: break; + case 28: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = CONS; add("U"); + } + case 49: break; + case 4: + { add(yytext()); + } + case 50: break; + case 46: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 3; + { add("Hic"); + } + case 51: break; + case 9: + { cv = VOWEL; add("AE"); + } + case 52: break; + case 1: + { problem = 1; cv = 0; add(yytext()); + } + case 53: break; + case 5: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 54: break; + case 18: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = CONS; add("u"); + } + case 55: break; + case 21: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("e"); + } + case 56: break; + case 29: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); + } + case 57: break; + case 34: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("et"); + } + case 58: break; + case 41: + // general lookahead, find correct zzMarkedPos + { int zzFState = 5; + int zzFPos = zzStartRead; + if (zzFin.length <= zzBufferL.length) { zzFin = new boolean[zzBufferL.length+1]; } + boolean zzFinL[] = zzFin; + while (zzFState != -1 && zzFPos < zzMarkedPos) { + if ((zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; } + zzInput = zzBufferL[zzFPos++]; + zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ]; + } + if (zzFState != -1 && (zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; } + + zzFState = 6; + zzFPos = zzMarkedPos; + while (!zzFinL[zzFPos] || (zzAttrL[zzFState] & 1) != 1) { + zzInput = zzBufferL[--zzFPos]; + zzFState = zzTransL[ 
zzRowMapL[zzFState] + zzCMapL[zzInput] ]; + }; + zzMarkedPos = zzFPos; + } + { cv = VOWEL; add(yytext().replace("ſ", "s")); + } + case 59: break; + case 3: + { cv = CONS; add(yytext()); + } + case 60: break; + case 27: + { cv = VOWEL; add("oi"); + } + case 61: break; + case 25: + { cv = CONS; add("QU"); + } + case 62: break; + case 15: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { switch(cv) { + case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break; + default: cv = VOWEL; add(yytext()); break; + } + } + case 63: break; + case 7: + { cv = CONS; add("ss"); + } + case 64: break; + case 6: + { cv = CONS; add("s"); + } + case 65: break; + case 22: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("o"); + } + case 66: break; + case 33: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("ac"); + } + case 67: break; + case 2: + { cv = VOWEL; add(yytext()); + } + case 68: break; + case 43: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 3; + { add("qui"); + } + case 69: break; + case 35: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("er"); + } + case 70: break; + case 24: + { cv = CONS; add("Qu"); + } + case 71: break; + case 30: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("ve"); + } + case 72: break; + case 38: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("us"); + } + case 73: break; + case 32: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("am"); + } + case 74: break; + case 8: + { cv = VOWEL; add("ae"); + } + case 75: break; + case 11: + { switch (problem) { + case 1: return ""; + default: return normalized; + } + } + case 76: break; + case 26: + { add("ar"); + } + case 77: break; + case 45: + // lookahead expression with fixed base 
length + zzMarkedPos = zzStartRead + 3; + { add("hic"); + } + case 78: break; + case 17: + { cv = VOWEL; add("uu"); + } + case 79: break; + case 40: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("ul"); + } + case 80: break; + case 20: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("a"); + } + case 81: break; + case 10: + { cv = VOWEL; add("oe"); + } + case 82: break; + case 16: + { cv = VOWEL; add("ui"); + } + case 83: break; + case 14: + { cv = CONS; add("qu"); + } + case 84: break; + case 47: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 4; + { add("que"); + } + case 85: break; + case 23: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("u"); + } + case 86: break; + case 36: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("es"); + } + case 87: break; + case 44: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 3; + { add("Qui"); + } + case 88: break; + case 42: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("i"); + } + case 89: break; + case 12: + { add("X"); + } + case 90: break; + case 13: + { switch(cv) { + case CONS: add(yytext().replace("v", "u").replace("V", "U")); break; + default: cv = CONS; add(yytext()); break; + } + } + case 91: break; + case 19: + { cv = VOWEL; add("ii"); + } + case 92: break; + case 31: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("as"); + } + case 93: break; + case 37: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("od"); + } + case 94: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r d2a1c14fde31 -r 1ec29fdd0db8 
software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.lex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.lex Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,214 @@
+/*
+ * Normalization rules for Latin text
+ * [this is a JFlex specification]
+ * Every rule hands its match and the replacement text to add(); the {END} rules return the accumulated result.
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexLA
+%type java.lang.String
+%unicode
+
+// Latin: la, lat
+
+%states DISP, DICT, SEARCH
+%states RENAISSANCE_DISP, RENAISSANCE_DICT, RENAISSANCE_SEARCH
+
+%{
+  private static final int CONS = 1;
+  private static final int VOWEL = 2;
+  private int cv = 0; // consonant = 1, vowel = 2, everything else = 0; last class assigned by a rule action (not every rule updates it), read by the u/v rules in section 3
+
+  private String original = ""; // concatenation of every yytext() that went through add()
+  private String normalized = ""; // concatenation of the replacement strings passed to add()
+  private int problem = 0; // set to 1 by the catch-all "." rule; selects the return value in the {END} rules
+
+  private void add (String norm) { // records the current match (yytext()) together with its normalized form norm
+    original += yytext();
+    normalized += norm;
+  }
+%}
+
+Vowel = [AEIOUaeiou] // without Ææęàèòùœ
+Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß]
+LR = [lLrR] // the letters that rule 3.2 tests in front of v/V
+
+hyphen = [\u002d\u00ad] // hyphen and soft hyphen
+X = {hyphen}? // optional hyphen
+
+END = \n // the newline that terminates the input (see the assumptions in the comment at the end of this file)
+
+que = (que)? // optional -que
+enclitic = (que | ve | ne)
+prefixCons = (in{X}ter | per | ſu{X}per | ſer) // "ſer" for forms of ſervare
+
+%%
+
+
+// TEST, see Benedetti page 444
+𐆑 { add("X"); } // (U+10191; D800+DD91)
+
+
+ { // NOTE(review): the lexical-state list that normally precedes this brace is not visible in this copy of the patch -- confirm which of the %states above this group applies to
+
+// 1. simple replacements
+
+// 1.1 single characters
+ſ { cv = CONS; add("s"); }
+ß { cv = CONS; add("ss"); }
+[æę] { cv = VOWEL; add("ae"); }
+Æ { cv = VOWEL; add("AE"); }
+œ { cv = VOWEL; add("oe"); }
+
+// 1.2 character combinations
+ij { cv = VOWEL; add("ii"); }
+
+// 2. superfluous diacritics
+// In the rules below "a / b" is JFlex lookahead: a is matched only when b follows, and b is not part of yytext().
+// 2.1 acute accent (NOTE(review): the trailing G / W / ?? tags are not explained in this file -- presumably source or confidence markers; confirm with the author)
+q́ue / {END} { add("que"); } // G
+á / [mrst]? {enclitic} {END} { add("a"); } // G
+é / [mrst]? {enclitic} {END} { add("e"); } // G
+í / [mrst]? {enclitic} {END} { add("i"); } // G
+ó / [mrst]? {enclitic} {END} { add("o"); } // G
+ú / [mrst]? {enclitic} {END} { add("u"); } // G
+
+úe / {END} { add("ve"); } // W ??
+
+// 2.2 grave accent
+à / {que} {END} { add("a"); } // W G
+àm / {que} {END} { add("am"); } // W (G)
+às / {que} {END} { add("as"); } // W (G) (-àsque will likely never occur)
+è / {que} {END} { add("e"); } // W G
+ò / {que} {END} { add("o"); } // W G
+òd / {que} {END} { add("od"); } // W (G)
+ùm / {que} {END} { add("um"); } // W (G)
+ùs / {que} {END} { add("us"); } // W G
+
+ès / {que} {END} { add("es"); } // (G)
+^ quì / {END} { add("qui"); } // W ??
+^ Quì / {END} { add("Qui"); } // W ??
+àc / {END} { add("ac"); } // W ??
+èr / {END} { add("er"); } // W ??
+èt / {END} { add("et"); } // W ??
+ù / {END} { add("u"); } // W ??
+ùl / {END} { add("ul"); } // W ??
+
+// 2.3 circumflex accent
+^ hîc / {END} { add("hic"); } // W G
+^ Hîc / {END} { add("Hic"); } // W G
+^ ô / {END} { add("o"); } // G
+â / {que} {END} { add("a"); } // W G
+ûs / {END} { add("us"); } // W G
+âr { add("ar"); } // W (G) --> this is only a rough approximation!
+
+// 2.4 trema
+// 2.4.1 common cases
+aë { cv = VOWEL; add("ae"); }
+oë { cv = VOWEL; add("oe"); }
+// 2.4.2 rare cases
+oï { cv = VOWEL; add("oi"); }
+uï { cv = VOWEL; add("ui"); }
+// 2.4.3 extremely rare cases
+uü { cv = VOWEL; add("uu"); }
+
+
+// 3. rules for u and v
+
+// 3.1 rules for u --> v
+
+// peruenias --> pervenias, interuallum --> intervallum
+^ {prefixCons} / {X} { cv = VOWEL; add(yytext().replace("ſ", "s")); } // not cv = CONS !
+
+// uellet --> vellet
+^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); } // NOTE(review): replaceAll() takes a regex; with these literal arguments it gives the same result as the replace() used by the other rules
+
+// diuidatur --> dividatur
+// ut, volui: unchanged
+// no rule for veruina because we cannot distinguish it from volui
+[uU] / {Vowel} {
+    switch(cv) {
+      case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break; // cv was VOWEL and a vowel follows: u becomes v; cv is left unchanged
+      default: cv = VOWEL; add(yytext()); break; // otherwise the u is kept and recorded as a vowel
+    }
+  }
+
+// 3.2 rules for v --> u
+
+// qvam --> quam
+qv { cv = CONS; add("qu"); } // the replaced v still counts as consonant
+Qv { cv = CONS; add("Qu"); }
+QV { cv = CONS; add("QU"); }
+
+// febrvarius --> februarius
+// curva: unchanged
+{LR} [vV] {
+    switch(cv) {
+      case CONS: add(yytext().replace("v", "u").replace("V", "U")); break; // cv was CONS before the l/r: v becomes u; cv is left unchanged
+      default: cv = CONS; add(yytext()); break; // otherwise the text is kept and cv becomes CONS
+    }
+  }
+
+// februarivs --> februarius
+v / {X} {Cons} { cv = CONS; add("u"); }
+V / {X} {Cons} { cv = CONS; add("U"); }
+
+// 3.3 override default rule for .
+
+{Vowel} { cv = VOWEL; add(yytext()); }
+{Cons} { cv = CONS; add(yytext()); }
+{hyphen} { add(yytext()); }
+
+. { problem = 1; cv = 0; add(yytext()); } // in particular "@", and from Arboreal: "〈" (2329), "〉" (232A), Ç, ç
+
+}
+
+
+ { // NOTE(review): lexical-state list not visible in this copy -- confirm which states use this {END} rule
+
+{END} {
+    switch (problem) {
+      case 1: return original; // a problem was flagged: give back the input exactly as it was read
+      default: return normalized;
+    }
+  }
+}
+
+ { // NOTE(review): lexical-state list not visible in this copy -- confirm which states use this {END} rule
+
+{END} {
+    switch (problem) {
+      case 1: return ""; // a problem was flagged: give back the empty string
+      default: return normalized;
+    }
+  }
+}
+
+
+/*
+
+Assumptions:
+- the routine is called word by word, with a \n at the end of the string
+- words that were split across line breaks have already been rejoined
+
+
+TO DO:
+
+LA: Reconsider whether Ææęàèòùœ can be left out of the vowel class. They do no harm either, though. (Or do they !?) Distinguish vowel classes before and after the u ?
+LA: go through the diacritics with Paul once more
+LA: the disambiguations via the diacritics are still missing.
+LA: is J really a problem case?
+LA: are there words like super-rv... or super-lv... in lower or upper case?
+ +*/ diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexNL.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexNL.java Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,576 @@ +/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:04 */ + +/* + * Normalization rules for Dutch text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 0.96 + * 2011-02-21 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 22.02.11 12:04 from the specification file + * MpdlNormalizerLexNL.lex + */ +public class MpdlNormalizerLexNL { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 1, 1 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\1\65\0\1\3\u013e\0\1\2\ufe80\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. 
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\3\0\1\1\1\2\1\3\1\4\1\5"; + + private static int [] zzUnpackAction() { + int [] result = new int[8]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\4\0\10\0\14\0\14\0\14\0\14\0\14"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[8]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\4\1\0\1\4\1\5\1\4\1\6\1\7\1\5"+ + "\1\4\1\10\1\7\1\5\4\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[16]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = 
packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\3\0\5\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[8]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, 
that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexNL(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexNL(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 14) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. 
+ * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. 
+ * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. 
+ * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break 
zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 5: + { switch (problem) { + case 1: return ""; + default: return normalized; + } + } + case 6: break; + case 2: + { problem = 1; add(yytext()); + } + case 7: break; + case 4: + { add("s"); + } + case 8: break; + case 3: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 9: break; + case 1: + { add(yytext()); + } + case 10: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexNL.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexNL.lex Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,83 @@ +/* + * Normalization rules for Dutch text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 0.96 + * 2011-02-21 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexNL +%type java.lang.String +%unicode + +// Dutch: nl + +%states DISP, DICT, SEARCH + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } +%} + +END = \n + +%% + + { + +ſ { add("s"); } + +} + + +// default + +@ { problem = 1; 
add(yytext()); } +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized; + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt + +TO DO: + +NL: vollständig? + +*/ diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexZH.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexZH.java Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,638 @@ +/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:04 */ + +/* + * Normalization rules for Chinese text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 0.96 + * 2011-02-21 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 22.02.11 12:04 from the specification file + * MpdlNormalizerLexZH.lex + */ +public class MpdlNormalizerLexZH { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 3, 3 + }; + + /** + * Translates characters to character classes + */ + private static final 
String ZZ_CMAP_PACKED = + "\12\0\1\2\45\0\1\1\1\0\1\1\15\0\1\20\41\0\1\1"+ + "\22\0\1\1\5\0\1\1\1\0\1\1\u4f84\0\1\3\176\0\1\4"+ + "\u035a\0\1\4\u0a9a\0\1\6\u0781\0\1\10\u057a\0\1\11\u06bd\0\1\12"+ + "\15\0\1\7\u0891\0\1\5\u1baf\0\1\13\340\0\1\14\u411a\0\1\16"+ + "\u040e\0\1\17\u1d8f\0\1\15\u05e2\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\4\0\1\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\1"+ + "\1\17\1\20\1\21"; + + private static int [] zzUnpackAction() { + int [] result = new int[22]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\21\0\42\0\63\0\104\0\104\0\104\0\104"+ + "\0\104\0\104\0\104\0\104\0\104\0\104\0\104\0\104"+ + "\0\104\0\104\0\125\0\104\0\104\0\104"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[22]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = 
packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\2\5\1\0\15\5\1\6\2\5\1\7\1\10\1\11"+ + "\1\12\1\13\1\14\1\15\1\16\1\17\1\20\1\21"+ + "\1\22\1\23\1\5\1\6\1\5\1\24\1\25\1\10"+ + "\1\11\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+ + "\1\21\1\22\1\23\1\5\1\6\1\5\1\24\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+ + "\1\20\1\21\1\22\1\23\1\5\1\6\40\0\1\26"+ + "\1\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[102]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\4\0\16\11\1\1\3\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[22]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in 
packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. 
+ */ + public MpdlNormalizerLexZH(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexZH(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 90) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? 
*/ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. 
+ */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. 
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 17: + { add("庶"); + } + case 18: break; + case 9: + { add("時"); + } + case 19: break; + case 2: + { problem = 1; add(yytext()); + } + case 20: break; + case 3: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 21: break; + case 10: + { add("歷"); + } + case 22: break; + case 13: + { add("面"); + } + case 23: break; + case 14: + { add("精"); + } + case 24: break; + case 12: + { add("陰"); + } + case 25: break; + case 8: + { add("床"); + } + case 26: break; + case 1: + { add(yytext()); + } + case 27: break; + case 15: + { add(""); + } + case 28: break; + case 7: + { add("并"); + } + case 29: break; + case 4: + { add("併"); + } + case 30: break; + case 11: + { add("為"); + } + case 31: break; + case 6: + { add("奇"); + } + case 32: break; + case 5: + { add("叟"); + } + case 33: break; + case 16: + { switch (problem) { + case 1: return ""; + default: return normalized; + } + } + case 34: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexZH.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexZH.lex Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,119 @@ +/* + * Normalization rules for Chinese text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 0.96 + * 2011-02-21 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexZH +%type java.lang.String +%unicode + +// classical Chinese: zh, zho, zho-Hant + +%states DISP, DICT, SEARCH + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + 
normalized += norm; + } +%} + +ZWS = [\u{200b}] + +END = \n + +%% + +// Normalization in Chinese means that character variants will be replaced by their standard characters +// if there is no doubt about what the standard character is. + +// The input is supposed to be a single Chinese character, but strings of characters are also handled correctly. + + { + +// Codepoint < FFFF + +倂 { add("併"); } // 5002 --> 4F75 +傁 | 叜 { add("叟"); } // 5081, 53DC --> 53DF +竒 { add("奇"); } // 7AD2 --> 5947 +幷 { add("并"); } // 5E77 --> 5E76 +牀 { add("床"); } // 7240 --> 5E8A +旹 { add("時"); } // 65F9 --> 6642 +歴 { add("歷"); } // 6B74 --> 6B77 +爲 { add("為"); } // 7232 --> 70BA +隂 { add("陰"); } // 9682 --> 9670 +靣 { add("面"); } // 9763 --> 9762 +精 { add("精"); } // FA1D --> 7CBE (FA1D is a compatibility ideograph) + +// Codepoint > FFFF + +// note that [ABC] is not equivalent to A | B | C for codepoints above FFFF due to their internal encoding: +// for example, 庶 (U+2F88D) is represented as a sequence of two codepoints: D87E DC8D +// i.e. never use [ABC] but A | B | C + +庶 { add("庶"); } // 2F88D --> 5EB6 (2F88D is a compatibility ideograph) + +} + + { + +// remove Zero Width Space (if there is any in the the input string) + +{ZWS} { add(""); } + +} + +// default + +@ { problem = 1; add(yytext()); } +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized; + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt + +TO DO: + +ZH: Liste ergänzen +ZH: was ist, wenn man wirklich die Variante, die im Text steht, nachschlagen will? Dann muss man das Zeichen wohl selbst rauskopieren. +ZH: sollen lateinische Buchstaben bewirken, dass problem = 1 ist? 
+ +*/ diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java Thu Feb 10 14:02:05 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java Tue Feb 22 16:03:45 2011 +0100 @@ -45,6 +45,48 @@ return false; } + public boolean isGerman(String language) { + if (getLanguageId(language).equals("de")) + return true; + else + return false; + } + + public boolean isFrench(String language) { + if (getLanguageId(language).equals("fr")) + return true; + else + return false; + } + + public boolean isEnglish(String language) { + if (getLanguageId(language).equals("en")) + return true; + else + return false; + } + + public boolean isDutch(String language) { + if (getLanguageId(language).equals("nl")) + return true; + else + return false; + } + + public boolean isGreek(String language) { + if (getLanguageId(language).equals("el")) + return true; + else + return false; + } + + public boolean isArabic(String language) { + if (getLanguageId(language).equals("ar")) + return true; + else + return false; + } + public boolean isItalian(String language) { if (getLanguageId(language).equals("it")) return true; diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/Util.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/Util.java Thu Feb 10 14:02:05 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/Util.java Tue Feb 22 16:03:45 2011 +0100 @@ -19,6 +19,10 @@ return props; } + public String test(String inputStr) { + return "BlaBla"; + } + public String toYearStr(String inputStr) { String retYearStr = inputStr.trim(); int index = inputStr.indexOf("-"); diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil.java --- 
a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil.java Thu Feb 10 14:02:05 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil.java Tue Feb 22 16:03:45 2011 +0100 @@ -12,6 +12,7 @@ import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; +import java.util.Iterator; import javax.xml.XMLConstants; import javax.xml.namespace.NamespaceContext; @@ -34,6 +35,8 @@ import javax.xml.xpath.XPathFactory; import net.sf.saxon.om.NodeInfo; +import net.sf.saxon.query.QueryResult; +import net.sf.saxon.trans.XPathException; import org.w3c.dom.Document; import org.w3c.dom.DocumentType; @@ -50,10 +53,52 @@ static String JAXP_SCHEMA_SOURCE = "http://java.sun.com/xml/jaxp/properties/schemaSource"; static String W3C_XML_SCHEMA = XMLConstants.W3C_XML_SCHEMA_NS_URI; + private NamespaceContext namespaceContext; + public static XmlUtil getInstance() { return new XmlUtil(); } + public void setNsContext(String nsName) { + if (nsName.equals("general")) + namespaceContext = getNsContextGeneral(); + } + + public NamespaceContext getNsContextGeneral() { + NamespaceContext nsContext = new NamespaceContext() { + public String getNamespaceURI(String prefix) { + String uri; + if (prefix.equals("xlink")) + uri = "http://www.w3.org/1999/xlink"; + else if (prefix.equals("xml")) + uri = "http://www.w3.org/XML/1998/namespace"; + else if (prefix.equals("dc")) + uri = "http://purl.org/dc/elements/1.1/"; + else if (prefix.equals("mpiwg")) + uri = "http://www.mpiwg-berlin.mpg.de/ns/mpiwg"; + else + uri = null; + return uri; + } + public String getPrefix(String uri) { + if (uri.equals("http://www.w3.org/1999/xlink")) + return "xlink"; + else if (uri.equals("http://www.w3.org/XML/1998/namespace")) + return "xml"; + else if (uri.equals("http://purl.org/dc/elements/1.1/")) + return "dc"; + else if (uri.equals("http://www.mpiwg-berlin.mpg.de/ns/mpiwg")) + return "mpiwg"; + else + return null; + } + public Iterator 
getPrefixes(String namespace) { + return null; + } + }; + return nsContext; + } + public Node doc(String url) throws ApplicationException { Node root = null; try { @@ -205,6 +250,8 @@ ArrayList retStrArray = null; try { XPath xpath = XPathFactory.newInstance().newXPath(); + if (namespaceContext != null) + xpath.setNamespaceContext(namespaceContext); if (nsContext != null) xpath.setNamespaceContext(nsContext); Object resultObjects = xpath.evaluate(xpathExpression, inputSource, XPathConstants.NODESET); @@ -221,6 +268,8 @@ ArrayList retArray = null; try { XPath xpath = XPathFactory.newInstance().newXPath(); + if (namespaceContext != null) + xpath.setNamespaceContext(namespaceContext); if (nsContext != null) xpath.setNamespaceContext(nsContext); Object resultObjects = xpath.evaluate(xpathExpression, inputSource, XPathConstants.NODESET); @@ -297,7 +346,7 @@ * javax XPath evaluation: returns a NodeList * Saxon's XPath evaluation: returns an ArrayList of TinyTextImpl (which could be casted to NodeInfo which could be handled as if it was a dom node) */ - private ArrayList nodesetToNodeArray(Object nodesetObjects) { + private ArrayList nodesetToNodeArray(Object nodesetObjects) throws ApplicationException { ArrayList retArray = null; if (nodesetObjects instanceof NodeList) { NodeList resultNodeList = (NodeList) nodesetObjects; @@ -319,7 +368,16 @@ retArray.add(n); } else if (arrayListNode instanceof NodeInfo) { NodeInfo n = (NodeInfo) arrayListNode; - // TODO provide clean return value + String xmlStr = ""; + try { + xmlStr = QueryResult.serialize(n); + DocumentBuilderFactory dbfac = DocumentBuilderFactory.newInstance(); + DocumentBuilder docBuilder = dbfac.newDocumentBuilder(); + Node domNode = docBuilder.parse(new InputSource(new StringReader(xmlStr))).getDocumentElement(); + retArray.add(domNode); + } catch (Exception e) { + throw new ApplicationException(e); + } } } } @@ -353,11 +411,14 @@ serializeNode(n, writer, ""); } else if (arrayListNode instanceof NodeInfo) { 
NodeInfo n = (NodeInfo) arrayListNode; - writer.write(n.getStringValue()); // TODO if that really happens + String xmlStr = QueryResult.serialize(n); + writer.write(xmlStr); } } } writer.flush(); + } catch (XPathException e) { + throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } @@ -367,7 +428,10 @@ public String evaluateToXmlString(String xmlString, String xpathExpression, NamespaceContext nsContext) throws ApplicationException { String resultStr = null; try { - XPath xpath = XPathFactory.newInstance().newXPath(); + XPathFactory xpathFactory = net.sf.saxon.xpath.XPathFactoryImpl.newInstance(); + XPath xpath = xpathFactory.newXPath(); + if (namespaceContext != null) + xpath.setNamespaceContext(namespaceContext); if (nsContext != null) xpath.setNamespaceContext(nsContext); Reader stringReader = new StringReader(xmlString); @@ -382,6 +446,70 @@ return resultStr; } + public ArrayList evaluateToNodeArray(String xmlString, String xpathExpression, NamespaceContext nsContext) throws ApplicationException { + ArrayList result = null; + try { + XPathFactory xpathFactory = net.sf.saxon.xpath.XPathFactoryImpl.newInstance(); + XPath xpath = xpathFactory.newXPath(); + if (namespaceContext != null) + xpath.setNamespaceContext(namespaceContext); + if (nsContext != null) + xpath.setNamespaceContext(nsContext); + Reader stringReader = new StringReader(xmlString); + InputSource inputSource = new InputSource(stringReader); + Object resultObjects = xpath.evaluate(xpathExpression, inputSource, XPathConstants.NODESET); + if (resultObjects != null) { + result = nodesetToNodeArray(resultObjects); + } + } catch (Exception e) { + throw new ApplicationException(e); + } + return result; + } + + public String insertAtCharPos(String xmlFragment, String charPosStr, String newXmlNodeStr) { + Integer charPos = new Integer(charPosStr); + int strCharIndex = getCharIndex(xmlFragment, charPos); + if (charPos == 0) + strCharIndex = getCharIndex(xmlFragment, 
charPos + 1) - 1; + String resultStr = xmlFragment.substring(0, strCharIndex) + newXmlNodeStr + xmlFragment.substring(strCharIndex); + return resultStr; + } + + private int getCharIndex(String xmlFragment, int charPos) { + int size = xmlFragment.length(); + int counter = 0; + int charCounter = 0; + int counterLastChar = -1; + boolean isEntity = false; + boolean isElement = false; + while (counter < size) { + char c = xmlFragment.charAt(counter); + switch (c) { + case '<': isElement = true; break; + case '>': isElement = false; break; + case '&': isEntity = true; break; + case ';': isEntity = false; break; + } + // count all chars which are not inside elements and entities + // if element closing char ">" is found it should not be counted as a char + // if an entity closing char ";" is found it should be counted cause the entity itself is one char long + if (! isEntity && ! isElement && !(c == '>')) { + charCounter++; + counterLastChar = counter; + } + if (charCounter == charPos) { + break; + } + counter++; + } + // input charPos was bigger than available chars: return the last available charPos + if (counter == size) + return counterLastChar + 1; + return counter + 1; + } + + /** *

This will serialize a DOM Node to * the supplied Writer.

diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/ExternalObject.java --- a/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/ExternalObject.java Thu Feb 10 14:02:05 2011 +0100 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/ExternalObject.java Tue Feb 22 16:03:45 2011 +0100 @@ -23,7 +23,6 @@ package org.exist.xquery.modules.mpdltext; import java.util.ArrayList; -import java.util.Date; import org.exist.dom.QName; import org.exist.xquery.BasicFunction; @@ -39,6 +38,8 @@ import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.externalObjects.app.ExtElement; +import de.mpg.mpiwg.berlin.mpdl.externalObjects.app.ExtObject; +import de.mpg.mpiwg.berlin.mpdl.externalObjects.app.ExtQuery; import de.mpg.mpiwg.berlin.mpdl.externalObjects.app.ExternalObjectsHandler; /** @@ -61,9 +62,9 @@ public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException { Sequence operation = args[0]; // read, update or delete - Sequence type = args[1]; - Sequence object = args[2]; - if (operation.isEmpty() || type.isEmpty()) + Sequence type = args[1]; // element, query + Sequence object = args[2]; // the object as an xml string + if (operation.isEmpty() || type.isEmpty() || object.isEmpty()) return Sequence.EMPTY_SEQUENCE; String operationStr = operation.getStringValue(); String typeStr = type.getStringValue(); @@ -72,17 +73,11 @@ ValueSequence result = null; String resultStr = ""; try { + ExternalObjectsHandler externalObjectsHandler = ExternalObjectsHandler.getInstance(); if (typeStr.equals("element")) { ExtElement e = ExtElement.parseXmlStr(objectStr); - if (operation.equals("create") || operation.equals("update")) { - Date now = new Date(); - e.setModificationDate(now); - } - String documentId = e.getDocumentId(); - String pageNumber = e.getPageNumber(); if (operationStr.equals("read")) { - ExternalObjectsHandler externalObjectsHandler = 
ExternalObjectsHandler.getInstance(); - ArrayList elems = externalObjectsHandler.readExternalElements(documentId, pageNumber); + ArrayList elems = externalObjectsHandler.readExternalElements(e); if (elems != null && elems.size() > 0) { resultStr = ""; for (int i=0; i"; } } else if (operationStr.equals("create")) { - // TODO + externalObjectsHandler.createExternalElement(e); } else if (operationStr.equals("update")) { - // TODO + externalObjectsHandler.updateExternalElement(e); } else if (operationStr.equals("delete")) { - // TODO + externalObjectsHandler.deleteExternalElement(e); } } else if (typeStr.equals("query")) { - // TODO + ExtQuery q = ExtQuery.parseXmlStr(objectStr); + if (operationStr.equals("read")) { + ArrayList objects = externalObjectsHandler.readExternalObjects(q); + if (objects != null && objects.size() > 0) { + resultStr = ""; + for (int i=0; i"; + } + } else if (operationStr.equals("create")) { + externalObjectsHandler.createExternalObject(q); + } else if (operationStr.equals("update")) { + externalObjectsHandler.updateExternalObject(q); + } else if (operationStr.equals("delete")) { + externalObjectsHandler.deleteExternalObject(q); + } } result = new ValueSequence(); result.add(new StringValue(resultStr)); diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/MPDLTextModule.java --- a/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/MPDLTextModule.java Thu Feb 10 14:02:05 2011 +0100 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/MPDLTextModule.java Tue Feb 22 16:03:45 2011 +0100 @@ -51,7 +51,6 @@ new FunctionDef(EncodeBig5.signature, EncodeBig5.class), new FunctionDef(LuceneQueryParser.signature, LuceneQueryParser.class), new FunctionDef(ExternalObject.signature, ExternalObject.class), - new FunctionDef(InsertAtCharPos.signature, InsertAtCharPos.class), new FunctionDef(ToCLevelGenerator.signature, ToCLevelGenerator.class) }; diff -r d2a1c14fde31 -r 
1ec29fdd0db8 software/eXist/webapp/mpdl/_stuff/futureDev/insert.xql --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/webapp/mpdl/_stuff/futureDev/insert.xql Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,67 @@ +xquery version "1.0"; + +module namespace mpdl-text = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/text"; + +declare function mpdl-text:insert($fragment as element(), $externalObjects as element()*) { + let $firstObject := $externalObjects[1] + let $xmlNodeId := $firstObject/@xmlNodeId + let $posNode := $fragment//*[@xmlNodeId = $xmlNodeId] + let $before := $firstObject/@before + let $boolBefore := + if ($before = "true") + then true() + else false() + let $charPosStr := $firstObject/@charPos + let $charPos := + if($charPosStr != "" and not(empty($charPosStr))) + then number($charPosStr) + else -1 + let $newNode := $firstObject/content + let $size := count($externalObjects) + let $otherObjects := + if ($size > 1) + then subsequence($externalObjects, 2, $size) + else () + let $insertedFragment := mpdl-text:insert($fragment, $posNode, $boolBefore, $charPos, $newNode) + let $result := + if ($size >= 1) + then + mpdl-text:insert($insertedFragment, $otherObjects) + else + $fragment + return $result +}; + +declare function mpdl-text:insert($element as element(), $node, $before, $charPos, $newNode) { + if ($element = $node and $before and $charPos = -1) + then + ($newNode, + element {node-name($node)} + {$node/@*, + for $child in $node/node() + return if ($child instance of element()) + then mpdl-text:insert($child, $node, $before, $charPos, $newNode) + else $child + }) + else if ($element = $node and not($before) and $charPos = -1) + then + (element {node-name($node)} + {$node/@*, + for $child in $node/node() + return if ($child instance of element()) + then mpdl-text:insert($child, $node, $before, $charPos, $newNode) + else $child + }, $newNode) + else if ($element = $node and $charPos >= 0) + then + 
util:parse(mpdltext:insertAtCharPos(util:serialize($node, ()), util:serialize($newNode, ()), $charPos)) + else + element {node-name($element)} + {$element/@*, + for $child in $element/node() + return if ($child instance of element()) + then mpdl-text:insert($child, $node, $before, $charPos, $newNode) + else $child + } +}; + diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/webapp/mpdl/interface/page-fragment.xql --- a/software/eXist/webapp/mpdl/interface/page-fragment.xql Thu Feb 10 14:02:05 2011 +0100 +++ b/software/eXist/webapp/mpdl/interface/page-fragment.xql Tue Feb 22 16:03:45 2011 +0100 @@ -110,6 +110,7 @@ else () let $pageHeader := string($pb1/@rhead) let $pageNumberOrig := string($pb1/@o) +let $pageNumberOrigNorm := string($pb1/@o-norm) let $documentIdentifier := if ($docbase = 'archimedes') @@ -227,16 +228,16 @@ if(not(empty($externalElements))) then true() else false() -let $returnPageFragmentTmpp := +let $returnPageFragmentWithExtObjects := + + {$externalElements} + {$returnPageFragmentTmp} + +let $returnPageFragment := if (contains($options, "withXmlNodeId") or $containsExternalElements) - then mpdl-text:insertNodeIdAttribute($returnPageFragmentTmp/*[1]) + then mpdl-text:transform($returnPageFragmentWithExtObjects, concat($presentationPath, "/insertExternalElements.xsl")) else $returnPageFragmentTmp -let $returnPageFragment := - if($containsExternalElements) - then mpdl-text:insert($returnPageFragmentTmpp/*[1], $externalElements) - else $returnPageFragmentTmpp - let $pageFigureAnchors := $returnPageFragment//anchor[@type = 'figure'] let $pageFigures := for $pageFigureAnchor in $pageFigureAnchors @@ -337,6 +338,7 @@ {$pn}
{$pageHeader}
{$pageNumberOrig} + {$pageNumberOrigNorm} {$sn} {$digilibAvailable} {$imageIsAvailable} diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/webapp/mpdl/page-query-result.xql --- a/software/eXist/webapp/mpdl/page-query-result.xql Thu Feb 10 14:02:05 2011 +0100 +++ b/software/eXist/webapp/mpdl/page-query-result.xql Tue Feb 22 16:03:45 2011 +0100 @@ -270,16 +270,16 @@ if(not(empty($externalElements))) then true() else false() -let $returnPageFragmentTmpp := +let $returnPageFragmentWithExtObjects := + + {$externalElements} + {$returnPageFragmentTmp} + +let $returnPageFragment := if (contains($options, "withXmlNodeId") or $containsExternalElements) - then mpdl-text:insertNodeIdAttribute($returnPageFragmentTmp/*[1]) + then mpdl-text:transform($returnPageFragmentWithExtObjects, concat($presentationPath, "/insertExternalElements.xsl")) else $returnPageFragmentTmp -let $returnPageFragment := - if($containsExternalElements) - then mpdl-text:insert($returnPageFragmentTmpp/*[1], $externalElements) - else $returnPageFragmentTmpp - let $pageFigureAnchors := $returnPageFragment//anchor[@type = 'figure'] let $pageFigures := for $pageFigureAnchor in $pageFigureAnchors diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/webapp/mpdl/presentation/insertExternalElements.xsl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/webapp/mpdl/presentation/insertExternalElements.xsl Tue Feb 22 16:03:45 2011 +0100 @@ -0,0 +1,91 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/webapp/mpdl/presentation/pageFragmentHtml.xsl --- a/software/eXist/webapp/mpdl/presentation/pageFragmentHtml.xsl Thu Feb 10 14:02:05 2011 +0100 +++ b/software/eXist/webapp/mpdl/presentation/pageFragmentHtml.xsl Tue Feb 22 16:03:45 2011 +0100 @@ -43,6 +43,7 @@ + @@ -58,6 +59,10 @@
+ + +
+
@@ -282,6 +287,21 @@

+ + + + + + + + + + + + + + + @@ -401,8 +421,19 @@ + - + + + + + + + + + + + @@ -510,7 +541,7 @@ - Anchor of type: , href: + @@ -661,13 +692,13 @@ - + - + diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/webapp/mpdl/presentation/pageHtml.xsl --- a/software/eXist/webapp/mpdl/presentation/pageHtml.xsl Thu Feb 10 14:02:05 2011 +0100 +++ b/software/eXist/webapp/mpdl/presentation/pageHtml.xsl Tue Feb 22 16:03:45 2011 +0100 @@ -813,6 +813,21 @@

+ + + + + + + + + + + + + + + @@ -932,8 +947,19 @@ + - + + + + + + + + + + + @@ -1041,7 +1067,7 @@ - Anchor of type: , href: + @@ -1191,13 +1217,13 @@ - + - + diff -r d2a1c14fde31 -r 1ec29fdd0db8 software/eXist/webapp/mpdl/text/all.xql --- a/software/eXist/webapp/mpdl/text/all.xql Thu Feb 10 14:02:05 2011 +0100 +++ b/software/eXist/webapp/mpdl/text/all.xql Tue Feb 22 16:03:45 2011 +0100 @@ -35,69 +35,6 @@ } }; -declare function mpdl-text:insert($fragment as element(), $externalObjects as element()*) { - let $firstObject := $externalObjects[1] - let $xmlNodeId := $firstObject/@xmlNodeId - let $posNode := $fragment//*[@xmlNodeId = $xmlNodeId] - let $before := $firstObject/@before - let $boolBefore := - if ($before = "true") - then true() - else false() - let $charPosStr := $firstObject/@charPos - let $charPos := - if($charPosStr != "" and not(empty($charPosStr))) - then number($charPosStr) - else -1 - let $newNode := $firstObject/content - let $size := count($externalObjects) - let $otherObjects := - if ($size > 1) - then subsequence($externalObjects, 2, $size) - else () - let $insertedFragment := mpdl-text:insert($fragment, $posNode, $boolBefore, $charPos, $newNode) - let $result := - if ($size >= 1) - then - mpdl-text:insert($insertedFragment, $otherObjects) - else - $fragment - return $result -}; - -declare function mpdl-text:insert($element as element(), $node, $before, $charPos, $newNode) { - if ($element = $node and $before and $charPos = -1) - then - ($newNode, - element {node-name($node)} - {$node/@*, - for $child in $node/node() - return if ($child instance of element()) - then mpdl-text:insert($child, $node, $before, $charPos, $newNode) - else $child - }) - else if ($element = $node and not($before) and $charPos = -1) - then - (element {node-name($node)} - {$node/@*, - for $child in $node/node() - return if ($child instance of element()) - then mpdl-text:insert($child, $node, $before, $charPos, $newNode) - else $child - }, $newNode) - else if ($element = 
$node and $charPos >= 0) - then - util:parse(mpdltext:insertAtCharPos(util:serialize($node, ()), util:serialize($newNode, ()), $charPos)) - else - element {node-name($element)} - {$element/@*, - for $child in $element/node() - return if ($child instance of element()) - then mpdl-text:insert($child, $node, $before, $charPos, $newNode) - else $child - } -}; - declare function mpdl-text:indexTerms($mpdlCollectionName, $language, $document, $indexTermsStartStr, $pn as xs:int, $pageSize as xs:int) as node()* { let $index := if ($mpdlCollectionName = 'archimedes')