# HG changeset patch # User Josef Willenborg # Date 1320849125 -3600 # Node ID 4a3641ae14d2107bda519c28257b915a00189a1c # Parent dc5e9fcb3fdcd3461235a09f4b0dbf7aa6b74558 Erstellung diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.classpath --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/.classpath Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.externalToolBuilders/Ant-Build.launch --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/.externalToolBuilders/Ant-Build.launch Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.project --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/.project Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,46 @@ + + + mpiwg-mpdl-lt-web + + + + + + org.eclipse.wst.jsdt.core.javascriptValidator + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.wst.common.project.facet.core.builder + + + + + org.eclipse.wst.validation.validationbuilder + + + + + org.eclipse.ui.externaltools.ExternalToolBuilder + full,incremental, + + + LaunchConfigHandle + <project>/.externalToolBuilders/Ant-Build.launch + + + + + + org.eclipse.jem.workbench.JavaEMFNature + org.eclipse.wst.common.modulecore.ModuleCoreNature + org.eclipse.wst.common.project.facet.core.nature + org.eclipse.jdt.core.javanature + org.eclipse.wst.jsdt.core.jsNature + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.settings/.jsdtscope --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/software/mpdl-services/mpiwg-mpdl-lt-web/.settings/.jsdtscope Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.jdt.core.prefs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.jdt.core.prefs Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,12 @@ +#Mon Sep 12 15:41:45 CEST 2011 +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 +org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve +org.eclipse.jdt.core.compiler.compliance=1.6 +org.eclipse.jdt.core.compiler.debug.lineNumber=generate +org.eclipse.jdt.core.compiler.debug.localVariable=generate +org.eclipse.jdt.core.compiler.debug.sourceFile=generate +org.eclipse.jdt.core.compiler.problem.assertIdentifier=error +org.eclipse.jdt.core.compiler.problem.enumIdentifier=error +org.eclipse.jdt.core.compiler.source=1.6 diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.common.component --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.common.component Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,10 @@ + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.common.project.facet.core.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.common.project.facet.core.xml Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,10 @@ + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.jsdt.ui.superType.container --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.jsdt.ui.superType.container Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +org.eclipse.wst.jsdt.launching.baseBrowserLibrary \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.jsdt.ui.superType.name --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.jsdt.ui.superType.name Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +Window \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.ws.service.policy.prefs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.ws.service.policy.prefs Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,3 @@ +#Mon Sep 12 15:41:45 CEST 2011 +eclipse.preferences.version=1 +org.eclipse.wst.ws.service.policy.projectEnabled=false diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/META-INF/MANIFEST.MF --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/META-INF/MANIFEST.MF Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,3 @@ +Manifest-Version: 1.0 +Class-Path: + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/classes/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/classes/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 
software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/classes/constants.properties --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/classes/constants.properties Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +dataDir=/usr/local/tomcat-mpdl/mpdl-data/lt \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/berkeley-db-3.3.82.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/berkeley-db-3.3.82.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/commons-io-2.0.1.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/commons-io-2.0.1.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/commons-lang3-3.0.1.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/commons-lang3-3.0.1.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/lucene-core-3.4.0.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/lucene-core-3.4.0.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/mpiwg-mpdl-lt-web.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/mpiwg-mpdl-lt-web.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/mpiwg-mpdl-lt.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/mpiwg-mpdl-lt.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 
software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/mpiwg-mpdl-xml.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/mpiwg-mpdl-xml.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/saxon9-s9api.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/saxon9-s9api.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/saxon9.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/saxon9.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/transcoder11.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/transcoder11.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/web.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/web.xml Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,70 @@ + + + mpiwg-mpdl-xml-web + + index.html + + + GetDictionaryEntries + GetDictionaryEntries + GetDictionaryEntries + de.mpg.mpiwg.berlin.mpdl.servlets.lt.GetDictionaryEntries + + + GetDictionaryEntries + /lt/GetDictionaryEntries + + + GetLemmas + GetLemmas + GetLemmas + de.mpg.mpiwg.berlin.mpdl.servlets.lt.GetLemmas + + + GetLemmas + /lt/GetLemmas + + + GetForms + GetForms + GetForms + de.mpg.mpiwg.berlin.mpdl.servlets.lt.GetForms + + + GetForms + /lt/GetForms + + + Tokenize + Tokenize + Tokenize + de.mpg.mpiwg.berlin.mpdl.servlets.lt.Tokenize + + + Tokenize + /text/Tokenize + + + Normalize + Normalize + Normalize + de.mpg.mpiwg.berlin.mpdl.servlets.lt.Normalize + + + Normalize + /text/Normalize + + + Transcode + Transcode + Transcode + de.mpg.mpiwg.berlin.mpdl.servlets.lt.Transcode + + + Transcode + /text/Transcode + + + 
de.mpg.mpiwg.berlin.mpdl.servlets.lt.MpiwgMpdlLtWebServletContextListener + + \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/2downarrow.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/2downarrow.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/2leftarrow.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/2leftarrow.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/2rightarrow.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/2rightarrow.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/2uparrow.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/2uparrow.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/book-pointer.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/book-pointer.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/book.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/book.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/camera.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/camera.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/copyleft.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/copyleft.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 
software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/dictionary.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/dictionary.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/dictionaryMorph.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/dictionaryMorph.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/dot.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/dot.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/download.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/download.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/echo.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/echo.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/figures.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/figures.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/help.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/help.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/image.jpg Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/image.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/imageU.jpg Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/imageU.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/info.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/info.png has changed diff -r 
dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/left.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/left.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/link.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/link.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/linkback.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/linkback.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/linkext.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/linkext.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/linkto.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/linkto.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/malcolm.jpg Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/malcolm.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/malcolm.tif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/malcolm.tif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/pirate-joey.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/pirate-joey.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/right.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/right.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/search.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/search.gif 
has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/searchMorph.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/searchMorph.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/searchStructural.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/searchStructural.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/searchXPath.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/searchXPath.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/slime_logo.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/slime_logo.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/text.jpg Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/text.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/textPollux.jpg Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/textPollux.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/textPolluxU.jpg Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/textPolluxU.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/textU.jpg Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/textU.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/toc.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/toc.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/xml.jpg Binary file 
software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/xml.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/xmlU.jpg Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/xmlU.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/index.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/index.html Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,370 @@ + + + + +Max Planck Institute for the History of Science - Mpdl: Language technology services + + + + + + +
+ [This software is dedicated to Dr. Malcolm Hyman] + Info
+ [It is based on Donatus and Pollux] + Info +
+

Max Planck Institute for the History of Science - Mpdl: Language technology services

+ + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/build/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/build/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/build/build.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/build/build.xml Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,57 @@ + + + mpiwg-mpdl-lt-web + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/conf/constants-mpdl-system.properties --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/conf/constants-mpdl-system.properties Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +dataDir=/usr/local/tomcat-mpdl/mpdl-data/lt \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/conf/constants.properties --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/conf/constants.properties Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +dataDir=/Users/jwillenborg/mpdl/data/lt \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/dist-remote/mpiwg-mpdl-lt-web.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/dist-remote/mpiwg-mpdl-lt-web.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/dist-remote/mpiwg-mpdl-lt-web.war Binary file software/mpdl-services/mpiwg-mpdl-lt-web/dist-remote/mpiwg-mpdl-lt-web.war has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/dist/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/dist/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 
software/mpdl-services/mpiwg-mpdl-lt-web/dist/mpiwg-mpdl-lt-web.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/dist/mpiwg-mpdl-lt-web.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/dist/mpiwg-mpdl-lt-web.war Binary file software/mpdl-services/mpiwg-mpdl-lt-web/dist/mpiwg-mpdl-lt-web.war has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/src/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/src/de/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/exception/ApplicationException.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/exception/ApplicationException.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,14 @@ +package de.mpg.mpiwg.berlin.mpdl.exception; + +public class ApplicationException extends Exception { + + public 
ApplicationException(Exception e) { + super(e); + } + + public ApplicationException(String str) { + super(str); + } + +} + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/GetDictionaryEntries.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/GetDictionaryEntries.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,324 @@ +package de.mpg.mpiwg.berlin.mpdl.servlets.lt; + +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; + +import javax.servlet.ServletConfig; +import javax.servlet.ServletException; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.apache.commons.lang3.StringEscapeUtils; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.servlets.util.ServletUtil; + +public class GetDictionaryEntries extends HttpServlet { + private static final long serialVersionUID = 1L; + private LexHandler lexHandler; + + public GetDictionaryEntries() { + super(); + } + + public void init(ServletConfig config) throws ServletException { + super.init(config); + try { + lexHandler = LexHandler.getInstance(); + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + Date begin = new Date(); + request.setCharacterEncoding("utf-8"); + 
response.setCharacterEncoding("utf-8"); + String query = request.getParameter("query"); + String language = request.getParameter("language"); + String inputType = request.getParameter("inputType"); + String outputFormat = request.getParameter("outputFormat"); + String outputType = request.getParameter("outputType"); + String dictionary = request.getParameter("dictionary"); + String normalization = request.getParameter("normalization"); + String resultPage = request.getParameter("resultPage"); + if (query == null) + query = "a*"; + if (language == null) + language = "eng"; + if (inputType == null || ! (inputType.equals("form") || inputType.equals("lemma"))) + inputType = "form"; + if (outputFormat == null || ! (outputFormat.equals("xml") || outputFormat.equals("html"))) + outputFormat = "xml"; + if (outputType == null || ! (outputType.equals("compact") || outputType.equals("full"))) + outputType = "compact"; + if (normalization == null || ! (normalization.equals("none") || normalization.equals("reg") || normalization.equals("reg norm"))) + normalization = "norm"; + String xmlDict = "all"; + if (dictionary != null) + xmlDict = dictionary; + int pn = 1; + if (resultPage != null) + pn = new Integer(resultPage); + boolean isRangeQuery = false; + if (query.endsWith("*")) + isRangeQuery = true; + String xmlQueryString = "" + query + "" + "" + language + "" + "" + inputType + "" + + "" + outputFormat + "" + "" + outputType + "" + "" + xmlDict + "" + + "" + normalization + "" + ""; + try { + if (outputFormat.equals("xml")) + response.setContentType("text/xml"); + else if (outputFormat.equals("html")) + response.setContentType("text/html"); + else + response.setContentType("text/xml"); + PrintWriter out = response.getWriter(); + if (query == null || query.isEmpty()) { + out.print("request parameter query is empty. 
Please specify a query."); + out.close(); + return; + } + ArrayList lemmas = null; + ArrayList dictionaries = null; + if (isRangeQuery) { + String queryTmp = query.substring(0, query.length() - 1); // without last star + if (dictionary != null) + dictionaries = lexHandler.getLexEntriesByLexiconBeginningWith(dictionary, queryTmp, pn); + else + dictionaries = lexHandler.getLexEntriesBeginningWith(language, queryTmp, pn); + } else { + lemmas = lexHandler.getLemmas(query, inputType, language, normalization); + dictionaries = lexHandler.getLexEntries(lemmas, language, dictionary); + } + String baseUrl = ServletUtil.getInstance().getBaseUrl(request); + Date end = new Date(); + String elapsedTime = String.valueOf(end.getTime() - begin.getTime()); + String result = ""; + if (outputFormat == null || outputFormat.equals("xml")) + result = createXmlOutputString(query, lemmas, dictionaries, outputType, baseUrl, xmlQueryString, elapsedTime); + else if (outputFormat.equals("html")) + result = createHtmlOutputString(query, lemmas, dictionaries, outputType, elapsedTime); + else + result = createXmlOutputString(query, lemmas, dictionaries, outputType, baseUrl, xmlQueryString, elapsedTime); + out.print(result); + out.close(); + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + + } + + private String createXmlOutputString(String query, ArrayList lemmas, ArrayList lexicons, String outputType, String baseUrl, String xmlQueryString, String elapsedTime) { + String result = ""; + result = result + "" + "MPIWG MPDL language technology service (see: " + "" + baseUrl + "), Max Planck Institute for the History of Science, Berlin." + ""; + result = result + xmlQueryString; + result = result + "" + elapsedTime + ""; + if (lemmas != null && ! 
lemmas.isEmpty()) { + result = result + ""; + for (int i=0; i"; + result = result + "" + lemmaName + ""; + if (outputType != null && outputType.equals("full")) { + String lemmaProvider = lemma.getProvider(); + result = result + "" + lemmaProvider + ""; + result = result + "" + language + ""; + } + if (Language.getInstance().isArabic(language) || Language.getInstance().isLatin(language)) { + String remoteUrl = "http://www.perseus.tufts.edu/hopper/morph?l=" + lemmaName + "&la=" + language; + result = result + "" + remoteUrl + ""; + } else if (Language.getInstance().isGreek(language)) { + String remoteUrl = "http://www.perseus.tufts.edu/hopper/morph?l=" + lemmaName + "&la=" + "greek"; + result = result + "" + remoteUrl + ""; + } + if (outputType != null && outputType.equals("full")) { + ArrayList
forms = lemma.getFormsList(); + Collections.sort(forms); + if (forms != null && ! forms.isEmpty()) { + result = result + ""; + for (int j=0; j"; + Form f = forms.get(j); + String formName = f.getFormName(); + String formProvider = f.getProvider(); + result = result + "" + formProvider + ""; + result = result + "" + language + ""; + result = result + "" + formName + ""; + result = result + ""; + } + result = result + "
"; + } + } + result = result + "
"; + } + result = result + ""; + } + if (lexicons != null) { + result = result + ""; + for (int i=0; i"; + } + if (outputType != null && outputType.equals("full") && lemmas != null && ! lemmas.isEmpty()) { + result = result + ""; + for (int i=0; i"; + result = result + "" + lemmaName + ""; + String wikiHrefExact = "http://" + language + ".wikipedia.org/wiki/" + lemmaName; + String wikiHrefSearch = "http://" + language + ".wikipedia.org/wiki/index.php?search=" + lemmaName; + result = result + "" + wikiHrefExact + ""; + result = result + "" + wikiHrefSearch + ""; + result = result + ""; + } + result = result + ""; + } + result = result + ""; + return result; + } + + private String createHtmlOutputString(String query, ArrayList lemmas, ArrayList lexicons, String outputType, String elapsedTime) { + String result = ""; + result = result + ""; + result = result + ""; + result = result + "Word information for: \"" + query + "\""; + result = result + ""; + result = result + ""; + result = result + ""; + result = result + ""; + result = result + "
[This is a MPIWG MPDL language technology service] \"MPIWG
"; + result = result + "

"; + result = result + "

Word information for: \"" + query + "\"

"; + if (lemmas != null && ! lemmas.isEmpty()) { + result = result + "

Morphology

"; + result = result + "
    "; + result = result + "

    "; + for (int i=0; i"; + result = result + lemmaName; + if (outputType != null && outputType.equals("full")) { + String lemmaProvider = lemma.getProvider(); + result = result + " (data provider: " + lemmaProvider + ")"; + } + if (Language.getInstance().isArabic(language) || Language.getInstance().isLatin(language)) + result = result + " (external link: " + lemmaName + ")"; + else if (Language.getInstance().isGreek(language)) + result = result + " (external link: " + lemmaName + ")"; + if (outputType != null && outputType.equals("full")) { + ArrayList

    forms = lemma.getFormsList(); + Collections.sort(forms); + if (forms != null && ! forms.isEmpty()) { + result = result + "
      "; + for (int j=0; j"; + } + } + result = result + ""; + } + result = result + "
    "; + } + if (lexicons != null && ! lexicons.isEmpty()) { + result = result + "

    Dictionary

    "; + result = result + "
      "; + result = result + "

      "; + for (int i=0; i"; + result = result + "" + lexicon.getDescription() + ""; + result = result + "

        "; + ArrayList entries = lexicon.getEntries(); + for (int j=0; j", ""); + repairedEntry = repairedEntry.replaceAll("", ""); + entryContent = entryContent + repairedEntry; // valid unicode content of the original entry + } else { + entryContent = entryContent + "[Remark: this dictionary entry has no valid XML/HTML content in database so a text version of this entry is shown.]:
        "; + String originalEntry = entry.getOriginalEntry(); + originalEntry = originalEntry.replaceAll("", ""); + originalEntry = originalEntry.replaceAll("", ""); + originalEntry = StringEscapeUtils.escapeXml(originalEntry); // create text version of the invalid xml content + entryContent = entryContent + originalEntry; + } + if (entry.getRemoteUrl() != null) { + entryContent = entryContent + ""; + } + } else { + if (entry.getRemoteUrl() != null) { + entryContent = entryContent + "external link: " + entry.getFormName() + ""; + } + } + String formName = entry.getFormName(); + String dictName = lexicon.getName(); + if (outputType != null && outputType.equals("full")) { + result = result + "
      • " + "" + formName + "
        • " + entryContent + "
      • "; + } else if (outputType != null && outputType.equals("compact")) { + result = result + "
      • " + "" + formName + "
      • "; + } + } + result = result + "
      "; + result = result + ""; // lexicon entry + } + result = result + "
    "; + result = result + "

    "; + } + if (outputType != null && outputType.equals("full") && lemmas != null && ! lemmas.isEmpty()) { + result = result + "

    Wikipedia

    "; + result = result + "
      "; + result = result + "

      "; + for (int i=0; i"; + String wikiHrefExact = "http://" + language + ".wikipedia.org/wiki/" + lemmaName; + String wikiHrefSearch = "http://" + language + ".wikipedia.org/wiki/index.php?search=" + lemmaName; + result = result + "Article: External link: " + lemmaName + " (or search for " + lemmaName + ")"; + result = result + ""; + } + result = result + "

    "; + } + result = result + "[* external links may not function]"; + result = result + "
    "; + result = result + "

    "; + result = result + "Elapsed time: " + elapsedTime + " ms, see the service description of this page, if you find a bug let us know"; + result = result + ""; + result = result + ""; + return result; + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/GetForms.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/GetForms.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,210 @@ +package de.mpg.mpiwg.berlin.mpdl.servlets.lt; + +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.Hashtable; + +import javax.servlet.ServletConfig; +import javax.servlet.ServletException; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; + +public class GetForms extends HttpServlet { + private static final long serialVersionUID = 1L; + private LexHandler lexHandler; + + public GetForms() { + super(); + } + + public void init(ServletConfig config) throws ServletException { + super.init(config); + try { + lexHandler = LexHandler.getInstance(); + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + Date begin = new Date(); + request.setCharacterEncoding("utf-8"); + response.setCharacterEncoding("utf-8"); + String query = request.getParameter("query"); + String language = request.getParameter("language"); + String outputFormat = request.getParameter("outputFormat"); + String 
outputType = request.getParameter("outputType"); + String normalization = request.getParameter("normalization"); + if (language == null) + language = "eng"; + if (outputFormat == null || ! (outputFormat.equals("xml") || outputFormat.equals("html") || outputFormat.equals("string"))) + outputFormat = "xml"; + if (outputType == null || ! (outputType.equals("compact") || outputType.equals("full"))) + outputType = "compact"; + if (normalization == null || ! (normalization.equals("none") || normalization.equals("reg") || normalization.equals("reg norm"))) + normalization = "norm"; + + String xmlQueryString = "" + query + "" + "" + language + "" + + "" + outputFormat + "" + "" + outputType + "" + "" + normalization + "" + ""; + try { + if (outputFormat.equals("xml")) + response.setContentType("text/xml"); + else if (outputFormat.equals("html") || outputFormat.equals("string")) + response.setContentType("text/html"); + else + response.setContentType("text/xml"); + PrintWriter out = response.getWriter(); + if (query == null || query.isEmpty()) { + out.print("request parameter query is empty. Please specify a query."); + out.close(); + return; + } + ArrayList lemmas = lexHandler.getLemmas(query, "lemma", language, normalization); + Hashtable formsHashtable = new Hashtable(); + ArrayList forms = new ArrayList(); + if (lemmas != null && ! 
lemmas.isEmpty()) { + for (int i=0; i lemmaForms = lemma.getFormsList(); + for (int j=0; j < lemmaForms.size(); j++) { + Form form = lemmaForms.get(j); + formsHashtable.put(form.getFormName(), form); + } + } + } + forms.addAll(formsHashtable.values()); + Collections.sort(forms); + String baseUrl = getBaseUrl(request); + Date end = new Date(); + String elapsedTime = String.valueOf(end.getTime() - begin.getTime()); + String result = ""; + if (outputFormat == null || outputFormat.equals("xml")) + result = createXmlOutputString(query, forms, outputType, baseUrl, xmlQueryString, elapsedTime); + else if (outputFormat.equals("html")) + result = createHtmlOutputString(query, forms, outputType, elapsedTime); + else if (outputFormat.equals("string")) + result = createStringOutputString(forms); + else + result = createXmlOutputString(query, forms, outputType, baseUrl, xmlQueryString, elapsedTime); + out.print(result); + out.close(); + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + + } + + private String getBaseUrl( HttpServletRequest request ) { + if (request.getServerPort() == 80 || request.getServerPort() == 443) + return request.getScheme() + "://" + request.getServerName() + request.getContextPath(); + else + return request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort() + request.getContextPath(); + } + + + private String createXmlOutputString(String query, ArrayList forms, String outputType, String baseUrl, String xmlQueryString, String elapsedTime) { + String result = ""; + result = result + "" + "MPIWG MPDL language technology service (see: " + "" + baseUrl + "), Max Planck Institute for the History of Science, Berlin." + ""; + result = result + xmlQueryString; + result = result + "" + elapsedTime + ""; + if (forms != null && ! 
forms.isEmpty()) { + result = result + ""; + result = result + ""; + for (int i=0; i"; + Form f = forms.get(i); + String formName = f.getFormName(); + String language = f.getLanguage(); + String formProvider = f.getProvider(); + String lemmaName = f.getLemmaName(); + result = result + "" + formProvider + ""; + result = result + "" + language + ""; + result = result + "" + lemmaName + ""; + result = result + "" + formName + ""; + result = result + ""; + } + result = result + ""; + result = result + ""; + } + result = result + ""; + return result; + } + + private String createHtmlOutputString(String query, ArrayList

    forms, String outputType, String elapsedTime) { + String result = ""; + result = result + ""; + result = result + ""; + result = result + "Lemmas for: \"" + query + "\""; + result = result + ""; + result = result + ""; + result = result + ""; + result = result + ""; + result = result + "
    [This is a MPIWG MPDL language technology service] \"MPIWG
    "; + result = result + "

    "; + result = result + "

    Forms for: \"" + query + "\"

    "; + if (forms != null && ! forms.isEmpty()) { + result = result + "

    Morphology

    "; + result = result + "
      "; + result = result + "

      "; + if (outputType != null && outputType.equals("full")) { + for (int i=0; i"; + Form f = forms.get(i); + String formName = f.getFormName(); + String formProvider = f.getProvider(); + String language = f.getLanguage(); + String lemmaName = f.getLemmaName(); + result = result + formName + " (data provider: " + formProvider + ", language: " + language + ", lemmaName: " + lemmaName + ")"; + result = result + ""; + } + } else if (outputType == null || outputType.equals("compact")) { + result = result + "

    • "; + for (int i=0; i"; + } else if (outputType.equals("string")) { + for (int i=0; i"; + result = result + "

      "; + result = result + "Elapsed time: " + elapsedTime + " ms, see the service description of this page, if you find a bug let us know"; + result = result + ""; + result = result + ""; + return result; + } + + private String createStringOutputString(ArrayList forms) { + String result = ""; + for (int i=0; i" + "" + language + "" + "" + inputType + "" + + "" + outputFormat + "" + "" + outputType + "" + "" + normalization + "" + ""; + try { + if (outputFormat.equals("xml")) + response.setContentType("text/xml"); + else if (outputFormat.equals("html") || outputFormat.equals("string")) + response.setContentType("text/html"); + else + response.setContentType("text/xml"); + PrintWriter out = response.getWriter(); + if (query == null || query.isEmpty()) { + out.print("request parameter query is empty. Please specify a query."); + out.close(); + return; + } + ArrayList lemmas = lexHandler.getLemmas(query, inputType, language, normalization); + String baseUrl = getBaseUrl(request); + Date end = new Date(); + String elapsedTime = String.valueOf(end.getTime() - begin.getTime()); + String result = ""; + if (outputFormat == null || outputFormat.equals("xml")) + result = createXmlOutputString(query, lemmas, outputType, baseUrl, xmlQueryString, elapsedTime); + else if (outputFormat.equals("html")) + result = createHtmlOutputString(query, lemmas, outputType, elapsedTime); + else if (outputFormat.equals("string")) + result = createStringOutputString(lemmas); + else + result = createXmlOutputString(query, lemmas, outputType, baseUrl, xmlQueryString, elapsedTime); + out.print(result); + out.close(); + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + + } + + private String getBaseUrl( HttpServletRequest request ) { + if (request.getServerPort() == 80 || request.getServerPort() == 443) + return request.getScheme() + "://" + 
request.getServerName() + request.getContextPath(); + else + return request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort() + request.getContextPath(); + } + + + private String createXmlOutputString(String query, ArrayList lemmas, String outputType, String baseUrl, String xmlQueryString, String elapsedTime) { + String result = ""; + result = result + "" + "MPIWG MPDL language technology service (see: " + "" + baseUrl + "), Max Planck Institute for the History of Science, Berlin." + ""; + result = result + xmlQueryString; + result = result + "" + elapsedTime + ""; + if (lemmas != null && ! lemmas.isEmpty()) { + result = result + ""; + for (int i=0; i"; + result = result + "" + lemmaName + ""; + if (outputType != null && outputType.equals("full")) { + String lemmaProvider = lemma.getProvider(); + result = result + "" + lemmaProvider + ""; + result = result + "" + language + ""; + } + if (Language.getInstance().isArabic(language) || Language.getInstance().isLatin(language)) { + String remoteUrl = "http://www.perseus.tufts.edu/hopper/morph?l=" + lemmaName + "&la=" + language; + result = result + "" + remoteUrl + ""; + } else if (Language.getInstance().isGreek(language)) { + String remoteUrl = "http://www.perseus.tufts.edu/hopper/morph?l=" + lemmaName + "&la=" + "greek"; + result = result + "" + remoteUrl + ""; + } + if (outputType != null && outputType.equals("full")) { + ArrayList forms = lemma.getFormsList(); + Collections.sort(forms); + if (forms != null && ! 
forms.isEmpty()) { + result = result + ""; + for (int j=0; j"; + Form f = forms.get(j); + String formName = f.getFormName(); + String formProvider = f.getProvider(); + result = result + "" + formProvider + ""; + result = result + "" + language + ""; + result = result + "" + formName + ""; + result = result + ""; + } + result = result + ""; + } + } + result = result + ""; + } + result = result + ""; + } + result = result + ""; + return result; + } + + private String createHtmlOutputString(String query, ArrayList lemmas, String outputType, String elapsedTime) { + String result = ""; + result = result + ""; + result = result + ""; + result = result + "Lemmas for: \"" + query + "\""; + result = result + ""; + result = result + ""; + result = result + ""; + result = result + ""; + result = result + "
      [This is a MPIWG MPDL language technology service] \"MPIWG
      "; + result = result + "

      "; + result = result + "

      Lemmas for: \"" + query + "\"

      "; + if (lemmas != null && ! lemmas.isEmpty()) { + result = result + "

      Morphology

      "; + result = result + "
        "; + result = result + "

        "; + for (int i=0; i"; + result = result + lemmaName; + if (outputType != null && outputType.equals("full")) { + String lemmaProvider = lemma.getProvider(); + result = result + " (data provider: " + lemmaProvider + ")"; + } + if (Language.getInstance().isArabic(language) || Language.getInstance().isLatin(language)) + result = result + " (external link: " + lemmaName + ")"; + else if (Language.getInstance().isGreek(language)) + result = result + " (external link: " + lemmaName + ")"; + if (outputType != null && outputType.equals("full")) { + ArrayList

        forms = lemma.getFormsList(); + Collections.sort(forms); + if (forms != null && ! forms.isEmpty()) { + result = result + "
          "; + for (int j=0; j"; + } + } + result = result + ""; + } + result = result + "
        "; + } + result = result + "[* external links may not function]"; + result = result + "
        "; + result = result + "

        "; + result = result + "Elapsed time: " + elapsedTime + " ms, see the service description of this page, if you find a bug let us know"; + result = result + ""; + result = result + ""; + return result; + } + + private String createStringOutputString(ArrayList lemmas) { + String result = ""; + for (int i=0; i")) // TODO check properly for xml type of the inputText + inputTextIsXml = true; + if (! inputTextIsXml) { + ArrayList tokens = getToken(inputText, language, normFunctions); + Hashtable> tokensDictionaries = null; + if (dictionary.equals("yes")) { + tokensDictionaries = new Hashtable>(); + LexHandler lexHandler = LexHandler.getInstance(); + for (int i = 0; i < tokens.size(); i++) { + String token = tokens.get(i); + ArrayList lemmas = lexHandler.getLemmas(token, "form", language, "none"); + ArrayList dictionaries = lexHandler.getLexEntries(lemmas, language, null); + tokensDictionaries.put(token, dictionaries); + } + } + String baseUrl = ServletUtil.getInstance().getBaseUrl(request); + Date end = new Date(); + String elapsedTime = String.valueOf(end.getTime() - begin.getTime()); + if (outputFormat.equals("xml")) + result = createXmlOutputString(tokens, tokensDictionaries, baseUrl, elapsedTime); + else if (outputFormat.equals("string")) + result = createStringOutputString(tokens); + else + result = "outputFormat: \"" + outputFormat + "\" is not supported"; + } else { + StringReader xmlInputStringReader = new StringReader(inputText); + XmlTokenizer xmlTokenizer = new XmlTokenizer(xmlInputStringReader); + xmlTokenizer.setLanguage(language); + xmlTokenizer.setNormFunctions(normFunctions); + xmlTokenizer.setOutputOptions(outputOptions); + if (stopElementsArray != null) + xmlTokenizer.setStopElements(stopElementsArray); + result = xmlTokenizer.tokenize(); + } + if (result != null) + out.print(result); + out.close(); + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + private ArrayList getToken(String inputString, String language, 
String[] normFunctions) throws ApplicationException { + ArrayList retTokens = null; + try { + StringReader reader = new StringReader(inputString); + Tokenizer tokenizer = new Tokenizer(reader); + tokenizer.setLanguage(language); + tokenizer.setNormFunctions(normFunctions); + ArrayList tokens = tokenizer.getTokens(); + if (tokens != null) { + retTokens = new ArrayList(); + for (int i=0; i tokens, Hashtable> tokensDictionaries, String baseUrl, String elapsedTime) { + String result = ""; + result = result + "" + "MPIWG MPDL language technology service (see: " + "" + baseUrl + "), Max Planck Institute for the History of Science, Berlin." + ""; + result = result + "" + elapsedTime + ""; + if (tokens != null && ! tokens.isEmpty()) { + result = result + ""; + for (int i=0; i"; + result = result + "" + token + ""; + if (tokensDictionaries != null && ! tokensDictionaries.isEmpty()) { + ArrayList tokenDictionaries = tokensDictionaries.get(token); + if (tokenDictionaries != null) { + result = result + ""; + for (int j=0; j"; + } + } + result = result + ""; + } + result = result + ""; + } + result = result + ""; + return result; + } + + private String createStringOutputString(ArrayList tokens) { + String result = ""; + if (tokens != null && ! tokens.isEmpty()) { + for (int i=0; i getTokenOld(String inputString, String language, String[] normFunctions) throws ApplicationException { + ArrayList tokens = new ArrayList(); + try { + StringReader reader = new StringReader(inputString); + Tokenizer tokenizer = new Tokenizer(reader); + tokenizer.setLanguage(language); + tokenizer.setNormFunctions(normFunctions); + // tokenizer.reset(); + /* + result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. 
+ result = new LowerCaseFilter(result); + result = new StopFilter(result, stopSet); + */ + CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class); + // Token token = tokenizer.getAttribute(Token.class); + while (tokenizer.incrementToken()) { + // String tokenStr = token.toString(); + String term = charTermAttribute.toString(); + tokens.add(term); + } + tokenizer.end(); + tokenizer.close(); + } catch (IOException e) { + throw new ApplicationException(e); + } + return tokens; + } + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/Transcode.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/Transcode.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,70 @@ +package de.mpg.mpiwg.berlin.mpdl.servlets.lt; + +import java.io.IOException; +import java.io.PrintWriter; + +import javax.servlet.ServletConfig; +import javax.servlet.ServletException; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.text.transcode.Transcoder; + +public class Transcode extends HttpServlet { + private static final long serialVersionUID = 1L; + private Transcoder transcoder; + + public Transcode() { + super(); + } + + public void init(ServletConfig config) throws ServletException { + super.init(config); + transcoder = Transcoder.getInstance(); + } + + protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + request.setCharacterEncoding("utf-8"); + response.setCharacterEncoding("utf-8"); + String inputString = request.getParameter("inputString"); + String srcEncoding = request.getParameter("srcEncoding"); + String destEncoding = request.getParameter("destEncoding"); + if 
(destEncoding == null) + destEncoding = "unicode"; + String result = null; + try { + response.setContentType("text/html"); + PrintWriter out = response.getWriter(); + if (inputString == null || inputString.isEmpty()) { + out.print("request parameter \"inputString\" is empty. Please specify \"inputString\""); + out.close(); + return; + } + if (srcEncoding == null || srcEncoding.isEmpty()) { + out.print("request parameter \"srcEncoding\" is empty. Please specify \"srcEncoding\""); + out.close(); + return; + } + if (srcEncoding.equals("buckwalter") && destEncoding.equals("unicode")) { + result = transcoder.transcodeFromBuckwalter2Unicode(inputString); + } else if (srcEncoding.equals("betacode") && destEncoding.equals("unicode")) { + result = transcoder.transcodeFromBetaCode2Unicode(inputString); + } else if (srcEncoding.equals("unicode") && destEncoding.equals("betacode")) { + result = transcoder.transcodeFromUnicode2BetaCode(inputString); + } else if (srcEncoding.equals("unicode") && destEncoding.equals("buckwalter")) { + result = transcoder.transcodeFromUnicode2Buckwalter(inputString); + } + if (result != null) + out.print(result); + out.close(); + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + + } +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/util/ServletUtil.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/util/ServletUtil.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,27 @@ +package de.mpg.mpiwg.berlin.mpdl.servlets.util; + +import javax.servlet.http.HttpServletRequest; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class ServletUtil { + private static ServletUtil instance; + + public static ServletUtil getInstance() throws 
ApplicationException { + if (instance == null) { + instance = new ServletUtil(); + } + return instance; + } + + public ServletUtil() { + } + + public String getBaseUrl( HttpServletRequest request ) { + if (request.getServerPort() == 80 || request.getServerPort() == 443) + return request.getScheme() + "://" + request.getServerName() + request.getContextPath(); + else + return request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort() + request.getContextPath(); + } + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/.classpath --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/.classpath Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/.externalToolBuilders/Ant_Build.launch --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/.externalToolBuilders/Ant_Build.launch Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/.project --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/.project Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,27 @@ + + + mpiwg-mpdl-lt + + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.ui.externaltools.ExternalToolBuilder + full,incremental, + + + LaunchConfigHandle + <project>/.externalToolBuilders/Ant_Build.launch + + + + + + org.eclipse.jdt.core.javanature + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/.settings/org.eclipse.core.resources.prefs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/.settings/org.eclipse.core.resources.prefs Wed Nov 09 
15:32:05 2011 +0100 @@ -0,0 +1,3 @@ +#Fri Oct 07 18:44:17 CEST 2011 +eclipse.preferences.version=1 +encoding//src/de/mpg/mpiwg/berlin/mpdl/test/TestLocal.java=UTF-8 diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt/build/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/build.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/build/build.xml Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,44 @@ + + + mpiwg-mpdl-lt + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/constants.properties --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/build/classes/constants.properties Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +dataDir=/Users/jwillenborg/mpdl/data/lt \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/exception/ApplicationException.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/exception/ApplicationException.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/dict/app/Lexica.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/dict/app/Lexica.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/dict/app/Lexicon.class Binary file 
software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/dict/app/Lexicon.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/dict/app/LexiconEntry.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/dict/app/LexiconEntry.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/DBLexWriter.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/DBLexWriter.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/DbEnvLex.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/DbEnvLex.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexEntryContentHandler.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexEntryContentHandler.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexEntryErrorHandler.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexEntryErrorHandler.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/general/Betacode2UnicodeLex.class Binary file 
software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/general/Betacode2UnicodeLex.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/general/Buckwalter2UnicodeLex.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/general/Buckwalter2UnicodeLex.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/general/Constants.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/general/Constants.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/general/Transcoder.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/general/Transcoder.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2BetacodeLex.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2BetacodeLex.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2BuckwalterLex.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2BuckwalterLex.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/Form.class Binary file 
software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/Form.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/Lemma.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/Lemma.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphFileReaderContentHandler$Element.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphFileReaderContentHandler$Element.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphFileReaderContentHandler.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphFileReaderContentHandler.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/SimpleMorphContentHandler$Element.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/SimpleMorphContentHandler$Element.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/SimpleMorphContentHandler.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/SimpleMorphContentHandler.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 
software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphHandler.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphHandler.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphSupWriter.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphSupWriter.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriter.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriter.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriterContentHandler$Element.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriterContentHandler$Element.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriterContentHandler.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriterContentHandler.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DbEnvMorph.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DbEnvMorph.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DbEnvMorphSup.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DbEnvMorphSup.class has changed diff -r dc5e9fcb3fdc -r 
4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/MpdlNormalizer.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/MpdlNormalizer.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/Normalizer.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/Normalizer.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAR.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAR.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEL.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEL.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEN.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEN.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexFR.class Binary file 
software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexFR.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexIT.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexIT.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexNL.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexNL.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexZH.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexZH.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/DBRegularizationHandler.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/DBRegularizationHandler.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/DbEnvRegularization.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/DbEnvRegularization.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 
software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/Regularization.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/Regularization.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/RegularizationManager.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/RegularizationManager.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/ChineseTokenizer.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/ChineseTokenizer.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/Token.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/Token.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/Tokenizer.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/Tokenizer.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler$1.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler$1.class has changed diff -r 
dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler$Element.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler$Element.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Betacode2UnicodeLex.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Betacode2UnicodeLex.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Buckwalter2UnicodeLex.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Buckwalter2UnicodeLex.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Transcoder.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Transcoder.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2BetacodeLex.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2BetacodeLex.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2BuckwalterLex.class Binary file 
software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2BuckwalterLex.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lucene/util/LuceneUtil.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/lucene/util/LuceneUtil.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/test/TestLocal.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/test/TestLocal.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/util/FileUtil.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/util/FileUtil.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/util/LuceneUtil.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/util/LuceneUtil.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/util/StringUtilEscapeChars.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/util/StringUtilEscapeChars.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/util/StringUtils.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/util/StringUtils.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/util/Util.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/util/Util.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 
software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil$1.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil$1.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil.class Binary file software/mpdl-services/mpiwg-mpdl-lt/build/classes/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil.class has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/dist/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt/dist/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/dist/mpiwg-mpdl-lt.jar Binary file software/mpdl-services/mpiwg-mpdl-lt/dist/mpiwg-mpdl-lt.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/lib/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt/lib/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/lib/README-transcoder --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/lib/README-transcoder Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,31 @@ +******************************************************************************** +* TransCoder README * +******************************************************************************** +What it is: + A library of Java classes designed to translate Ancient Greek from one encoding to another. + +License: + This software is copyright Hugh A. Cayless. It is licensed under the terms of the GNU LGPL. + See http://www.gnu.org/licenses/lgpl.html for details. + +Supported encodings: + At the moment, there are classes for reading Beta Code, GreekKeys, and Unicode and for outputting Beta Code, precomposed Unicode (form C) and Unicode with combining diacriticals (form D). 
+ +How to use it: + The classes that do the work implement the Parser and Converter interfaces. These may be loaded and accessed by the TransCoder class. The following code snippet creates a TransCoder and uses it to transform a Greek text file written in GreekKeys to a Unicode string. The transcoder.jar file now includes a GUI form which can be used for testing the conversion of various font encodings. The .jar file is executable, so it should be possible to run it by double clicking on the icon or executing it from the command line (java -jar transcoder.jar). + + + TransCoder tc = new TransCoder("GreekKeys", "UnicodeC"); + String result = tc.getString(new File("C:/temp/test.txt")); + + + or + + + String source = "A)/NDRA MOI E)/NNEPE, MOU=SA"; + TransCoder tc = new TransCoder(); + tc.setParser("BetaCode"); + tc.setConverter("UnicodeC"); + String result = tc.getString(source); + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/lib/berkeley-db-3.3.82.jar Binary file software/mpdl-services/mpiwg-mpdl-lt/lib/berkeley-db-3.3.82.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/lib/commons-io-2.0.1.jar Binary file software/mpdl-services/mpiwg-mpdl-lt/lib/commons-io-2.0.1.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/lib/commons-lang3-3.0.1.jar Binary file software/mpdl-services/mpiwg-mpdl-lt/lib/commons-lang3-3.0.1.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/lib/lucene-core-3.4.0.jar Binary file software/mpdl-services/mpiwg-mpdl-lt/lib/lucene-core-3.4.0.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/lib/mpiwg-mpdl-xml.jar Binary file software/mpdl-services/mpiwg-mpdl-lt/lib/mpiwg-mpdl-xml.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/lib/saxon9-s9api.jar Binary file software/mpdl-services/mpiwg-mpdl-lt/lib/saxon9-s9api.jar has 
changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/lib/saxon9.jar Binary file software/mpdl-services/mpiwg-mpdl-lt/lib/saxon9.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/lib/transcoder11.jar Binary file software/mpdl-services/mpiwg-mpdl-lt/lib/transcoder11.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt/src/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/constants.properties --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/constants.properties Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +dataDir=/Users/jwillenborg/mpdl/data/lt \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt/src/de/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/exception/ApplicationException.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/exception/ApplicationException.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,27 @@
+package de.mpg.mpiwg.berlin.mpdl.exception;
+
+/**
+ * General checked exception of the MPDL code; it either wraps a causing exception or carries a plain message.
+ */
+public class ApplicationException extends Exception {
+
+  /**
+   * Wraps another exception, which becomes the cause of this one.
+   *
+   * @param e the exception to wrap
+   */
+  public ApplicationException(Exception e) {
+    super(e);
+  }
+
+  /**
+   * Creates an exception with a detail message and no cause.
+   *
+   * @param str the detail message
+   */
+  public ApplicationException(String str) {
+    super(str);
+  }
+
+}
+
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/.DS_Store
Binary file software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/.DS_Store has changed
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/app/Lexica.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/app/Lexica.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,286 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.dict.app;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
+
+/*
+florio: 70091 records (6 of them are not xml valid)
+bonitz: 14648 records (46 of them are not xml valid)
+webster: 111733 records (3 of them are not xml valid)
+ls: 53500 records (14 of them are not xml valid)
+autenrieth: 10158 records (468 of them are not xml valid)
+cooper: 33124 records (116 of them are not xml valid)
+baretti: 53555 records (0 of them are not xml valid)
+salmone: 6360 records (11 of them are not xml valid)
+lsj: 112631 records (26922 of them are not xml valid)
+ */
+/**
+ * Registry of the dictionaries (lexica) known to the application.
+ * Lexica in the local map keep the default Lexicon type; lexica in the remote
+ * map are marked with setType("remote") and always carry a query URL.
+ * The single shared instance is obtained through getInstance().
+ */
+public class Lexica {
+  private static Lexica instance;
+  private static HashMap<String, Lexicon> localLexica = new HashMap<String, Lexicon>();
+  private static HashMap<String, Lexicon> remoteLexica = new HashMap<String, Lexicon>();
+
+  /**
+   * Returns the shared instance, creating and filling the registry on first use.
+   * Synchronized so that concurrent first callers cannot run init() twice on the
+   * shared, unsynchronized maps.
+   *
+   * @return the singleton registry
+   */
+  public static synchronized Lexica getInstance() {
+    if (instance == null) {
+      instance = new Lexica();
+      instance.init();
+    }
+    return instance;
+  }
+
+  /** Creates every known lexicon and registers it by name in the local or remote map. */
+  private void init() {
+    Lexicon autenrieth = new Lexicon("autenrieth", "el");
+    autenrieth.setDescription("Autenrieth, a Homeric lexicon");
+    autenrieth.setQueryUrl("http://www.perseus.tufts.edu/hopper/text?doc=Perseus:text:1999.04.0073:entry=");
+    Lexicon baretti = new Lexicon("baretti", "it");
+    baretti.setDescription("Baretti, a dictionary of the English and Italian languages");
+    Lexicon bonitz = new Lexicon("bonitz", "el");
+    bonitz.setDescription("Bonitz, index Aristotelicus");
+    Lexicon cooper = new Lexicon("cooper", "la");
+    cooper.setDescription("Cooper, Thesaurus Linguae Romanae et Britannicae");
+    Lexicon florio = new Lexicon("florio", "it");
+    florio.setDescription("Florio, a worlde of wordes, or most copious, dictionarie in Italian and English");
+    Lexicon ls = new Lexicon("ls", "la");
+    ls.setDescription("Lewis and Short, Latin dictionary");
+    ls.setQueryUrl("http://www.perseus.tufts.edu/hopper/text?doc=Perseus:text:1999.04.0059:entry=");
+    Lexicon lsj = new Lexicon("lsj", "el");
+    lsj.setDescription("Liddell-Scott-Jones, a Greek-English lexicon");
+    lsj.setQueryUrl("http://www.perseus.tufts.edu/hopper/text?doc=Perseus:text:1999.04.0057:entry=");
+    Lexicon salmone = new Lexicon("salmone", "ar");
+    salmone.setDescription("Salmone, an advanced learner's Arabic-English dictionary");
+    salmone.setQueryUrl("http://www.perseus.tufts.edu/hopper/text?doc=Perseus:text:2002.02.0005:entry=");
+    // NOTE(review): salmoneUnicode is configured but never put into localLexica - confirm whether that is intended
+    Lexicon salmoneUnicode = new Lexicon("salmoneUnicode", "ar");
+    salmoneUnicode.setDescription("Salmone, an advanced learner's Arabic-English dictionary");
+    Lexicon webster = new Lexicon("webster", "en");
+    webster.setDescription("Webster's revised unabridged dictionary (1913)");
+    localLexica.put("autenrieth", autenrieth);
+    localLexica.put("baretti", baretti);
+    localLexica.put("bonitz", bonitz);
+    localLexica.put("cooper", cooper);
+    localLexica.put("florio", florio);
+    localLexica.put("ls", ls);
+    localLexica.put("lsj", lsj);
+    localLexica.put("salmone", salmone);
+    localLexica.put("webster", webster);
+    Lexicon dwds = new Lexicon("dwds", "de");
+    // the umlaut is written as a Unicode escape so that it survives re-encoding of this source file
+    dwds.setDescription("Deutsches W\u00f6rterbuch der deutschen Sprache");
+    dwds.setQueryUrl("http://www.dwds.de/search/?qu=");
+    dwds.setType("remote");
+    Lexicon slater = new Lexicon("slater", "el");
+    slater.setDescription("William J. Slater, Lexicon to Pindar");
+    slater.setQueryUrl("http://www.perseus.tufts.edu/hopper/text?doc=Perseus:text:1999.04.0072:entry=");
+    slater.setType("remote");
+    // NOTE(review): the two ARTFL query URLs look swapped relative to their descriptions ("frengdict" for artfl-fr, the "dicos" lookup for artfl-fr-en) - verify against the services
+    Lexicon artflFr = new Lexicon("artfl-fr", "fr");
+    artflFr.setDescription("The ARTFL project: Dictionnaires d'autrefois: French dictionaries of the 17th, 18th, 19th and 20th centuries");
+    artflFr.setQueryUrl("http://machaut.uchicago.edu/?resource=frengdict&action=search&french=");
+    artflFr.setType("remote");
+    Lexicon artflFrEn = new Lexicon("artfl-fr-en", "fr");
+    artflFrEn.setDescription("The ARTFL project: French - English dictionary");
+    artflFrEn.setQueryUrl("http://artflx.uchicago.edu/cgi-bin/dicos/pubdico1look.pl?strippedhw=");
+    artflFrEn.setType("remote");
+    Lexicon lewis = new Lexicon("lewis", "la");
+    lewis.setDescription("Charlton T. Lewis, an Elementary Latin Dictionary");
+    lewis.setQueryUrl("http://www.perseus.tufts.edu/hopper/text?doc=Perseus:text:1999.04.0060:entry=");
+    lewis.setType("remote");
+    Lexicon wikiwoordenboek = new Lexicon("wikiwoordenboek", "nl");
+    wikiwoordenboek.setDescription("Wiktionary: WikiWoordenboek");
+    wikiwoordenboek.setQueryUrl("http://nl.wiktionary.org/wiki/");
+    wikiwoordenboek.setType("remote");
+    Lexicon ctp = new Lexicon("ctp", "zh");
+    ctp.setDescription("Chinese Text Project");
+    ctp.setQueryUrl("http://ctext.org/dictionary.pl?if=en&char=");
+    ctp.setType("remote");
+    Lexicon linyutan = new Lexicon("linyutan", "zh");
+    linyutan.setDescription("Lin Yutang");
+    linyutan.setQueryUrl("http://humanum.arts.cuhk.edu.hk/cgi-bin/agrep-lindict?query=");
+    linyutan.setType("remote");
+    Lexicon chineseUnicode = new Lexicon("chinese-unicode", "zh");
+    chineseUnicode.setDescription("Unicode");
+    chineseUnicode.setQueryUrl("http://www.unicode.org/cgi-bin/GetUnihanData.pl?codepoint=");
+    chineseUnicode.setType("remote");
+    Lexicon chineseWiktionary = new Lexicon("chinese-wiktionary", "zh");
+    chineseWiktionary.setDescription("Wiktionary");
+    chineseWiktionary.setQueryUrl("http://en.wiktionary.org/wiki/");
+    chineseWiktionary.setType("remote");
+    remoteLexica.put("dwds", dwds);
+    remoteLexica.put("slater", slater);
+    remoteLexica.put("artfl-fr", artflFr);
+    remoteLexica.put("artfl-fr-en", artflFrEn);
+    remoteLexica.put("lewis", lewis);
+    remoteLexica.put("wikiwoordenboek", wikiwoordenboek);
+    remoteLexica.put("ctp", ctp);
+    remoteLexica.put("linyutan", linyutan);
+    remoteLexica.put("chinese-unicode", chineseUnicode);
+    remoteLexica.put("chinese-wiktionary", chineseWiktionary);
+  }
+
+  /**
+   * Looks a lexicon up by name, local lexica first, then remote ones.
+   *
+   * @param name the registered lexicon name, e.g. "lsj"
+   * @return the lexicon, or null if no lexicon of that name is registered
+   */
+  public Lexicon getLexicon(String name) {
+    Lexicon lexicon = localLexica.get(name);
+    if (lexicon == null)
+      lexicon = remoteLexica.get(name);
+    return lexicon;
+  }
+
+  /**
+   * Returns the local lexica whose source language matches the given language.
+   *
+   * @param lang the language, resolved through Language.getLanguageId
+   * @return the matching lexica, or null if there is none
+   */
+  public ArrayList<Lexicon> getLocalLexicons(String lang) {
+    return selectByLanguage(localLexica, lang);
+  }
+
+  /**
+   * Returns the remote lexica whose source language matches the given language.
+   *
+   * @param lang the language, resolved through Language.getLanguageId
+   * @return the matching lexica, or null if there is none
+   */
+  public ArrayList<Lexicon> getRemoteLexicons(String lang) {
+    return selectByLanguage(remoteLexica, lang);
+  }
+
+  /**
+   * Returns all local and remote lexica for the given language, local ones first.
+   *
+   * @param lang the language, resolved through Language.getLanguageId
+   * @return the matching lexica; empty (never null) if there is none
+   */
+  public ArrayList<Lexicon> getLexicons(String lang) {
+    ArrayList<Lexicon> retLexicons = new ArrayList<Lexicon>();
+    ArrayList<Lexicon> localLexicons = getLocalLexicons(lang);
+    if (localLexicons != null) {
+      retLexicons.addAll(localLexicons);
+    }
+    ArrayList<Lexicon> remoteLexicons = getRemoteLexicons(lang);
+    if (remoteLexicons != null) {
+      retLexicons.addAll(remoteLexicons);
+    }
+    return retLexicons;
+  }
+
+  /**
+   * Returns every local lexicon.
+   *
+   * @return all local lexica, or null if none is registered
+   */
+  public ArrayList<Lexicon> getLocalLexicons() {
+    if (localLexica.isEmpty())
+      return null;
+    return new ArrayList<Lexicon>(localLexica.values());
+  }
+
+  /**
+   * Returns the local lexica registered as autenrieth, bonitz and lsj.
+   *
+   * @return those three lexica, in that order
+   */
+  public ArrayList<Lexicon> getLocalBetacodeLexicons() {
+    ArrayList<Lexicon> retLexicons = new ArrayList<Lexicon>();
+    retLexicons.add(localLexica.get("autenrieth"));
+    retLexicons.add(localLexica.get("bonitz"));
+    retLexicons.add(localLexica.get("lsj"));
+    return retLexicons;
+  }
+
+  /**
+   * Returns the local lexicon registered as salmone.
+   *
+   * @return a list holding only the salmone lexicon
+   */
+  public ArrayList<Lexicon> getLocalBuckwalterLexicons() {
+    ArrayList<Lexicon> retLexicons = new ArrayList<Lexicon>();
+    retLexicons.add(localLexica.get("salmone"));
+    return retLexicons;
+  }
+
+  /**
+   * Collects the lexica of one map whose source language equals the language id of lang.
+   * Returns null rather than an empty list because getLexicons tests the result for null.
+   */
+  private static ArrayList<Lexicon> selectByLanguage(HashMap<String, Lexicon> lexica, String lang) {
+    String language = Language.getInstance().getLanguageId(lang);
+    ArrayList<Lexicon> retLexicons = null;
+    for (Lexicon lexicon : lexica.values()) {
+      String sourceLanguage = lexicon.getSourceLanguage();
+      if (sourceLanguage != null && sourceLanguage.equals(language)) {
+        if (retLexicons == null)
+          retLexicons = new ArrayList<Lexicon>();
+        retLexicons.add(lexicon);
+      }
+    }
+    return retLexicons;
+  }
+
+}
+
+/* TODO
+
+
+
+
+
+ else if (dictname == "dwds") lang="de";
+ else if (dictname == "grimm") lang="de";
+ else if (dictname == "artfl") lang="fr";
+ else if (dictname == "epsd") lang="sux";
+
+DWDS:
+
+Link: http://www.dwds.de/?woerterbuch=1&qu=auto
+Logo: http://www.dwds.de/images/dwds_logo.gif
+Copyright: Copyright © by Berlin-Brandenburgische Akademie der Wissenschaften, Wörterbuch der deutschen Gegenwartssprache, all rights reserved.
+
+Grimm:
+
+Link: http://germa63.uni-trier.de:8080/Projects/WBB/woerterbuecher/dwb/report_lemma?wb=G&word=auto
+View: http://germa63.uni-trier.de:8080/Projects/WBB/woerterbuecher/dwb/selectarticles?lemid=
+Output:
+
+
+Deutsches Wörterbuch von Jacob und Wilhelm Grimm
+
+
+
+
+
+
+ARTFL:
+
+Name: Dictionnaire de l'Académie française, 4e éd.
+Preprocessing of the word yourWord: $word =~ s/%([0-9A-F]{2})/pack("H2", $1)/ge;
+Link: http://colet.uchicago.edu/cgi-bin/dico1look.pl?dicoid=ACAD1762&strippedhw=yourWord
+
+EPSD:
+
+Name: ePSD (Pennsylvania Sumerian Dictionary)
+Link: http://psd.museum.upenn.edu/cgi-bin/epsd.plx?x=epsd&q=yourWord
+
+
+
+ */
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/app/Lexicon.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/app/Lexicon.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,198 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.dict.app;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.Hashtable;
+
+import org.apache.commons.lang3.StringEscapeUtils;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
+import de.mpg.mpiwg.berlin.mpdl.lt.text.transcode.Transcoder;
+
+public
class Lexicon implements Comparable { + private String name; + private String sourceLang; + private String description; + private String queryUrl; + private String type; // local or remote + private Hashtable entries; + + public Lexicon(String name, String sourceLanguage) { + this.name = name; + this.sourceLang = sourceLanguage; + this.type = "local"; // default is local + this.entries = new Hashtable(); + } + + public int compareTo(Lexicon l) { + return name.compareTo(l.name); + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public String getSourceLanguage() { + return sourceLang; + } + + public String getDescription() { + return description; + } + + public void setDescription(String description) { + this.description = description; + } + + public String getQueryUrl() { + return queryUrl; + } + + public void setQueryUrl(String queryUrl) { + this.queryUrl = queryUrl; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public boolean isLocalLexicon() { + boolean isLocal = false; + if (type != null && type.equals("local")) + isLocal = true; + return isLocal; + } + + public boolean isBetacodeLexicon() { + boolean isBetacode = false; + if (name.equals("autenrieth") || name.equals("bonitz") || name.equals("lsj")) + isBetacode = true; + return isBetacode; + } + + public boolean isBuckwalterLexicon() { + boolean isBuckwalter = false; + if (name.equals("salmone")) + isBuckwalter = true; + return isBuckwalter; + } + + public ArrayList getEntries() { + ArrayList result = new ArrayList(); + if (entries != null) { + Enumeration entryKeys = entries.keys(); + while(entryKeys.hasMoreElements()) { + String entryKey = entryKeys.nextElement(); + LexiconEntry le = entries.get(entryKey); + result.add(le); + } + } + Collections.sort(result); + if (result.isEmpty()) + return null; + else + return result; + } + + public LexiconEntry 
getDynamicEntry(String formName) throws ApplicationException { + LexiconEntry lexEntry = new LexiconEntry(name, formName, null); + String linkForm = formName; + if (Language.getInstance().isGreek(sourceLang)) { + linkForm = Transcoder.getInstance().transcodeFromUnicode2BetaCode(formName); + } + if (name.equals("linyutan")) { + linkForm = Transcoder.getInstance().encodeBig5(formName); + } + String remoteUrl = queryUrl + linkForm; + lexEntry.setRemoteUrl(remoteUrl); + return lexEntry; + } + + public boolean isEmpty() { + if (entries == null || entries.isEmpty()) + return true; + else + return false; + } + + public void addEntry(LexiconEntry newEntry) { + if (entries == null) + this.entries = new Hashtable(); + entries.put(newEntry.getFormName(), newEntry); + } + + public void addEntries(ArrayList newEntries) { + if (entries == null) + this.entries = new Hashtable(); + for (int i=0; i(); + lex.queryUrl = queryUrl; + lex.type = type; + return lex; + } + + public String toXmlString() { + String result = ""; + result = result + ""; + result = result + "" + name + ""; + result = result + "" + description + ""; + result = result + ""; + for (int i=0; i"; + LexiconEntry entry = getEntries().get(i); + result = result + "" + entry.getFormName() + ""; + if (isLocalLexicon()) { + result = result + ""; + String xmlValid = "false"; + if (entry.isXmlValid()) + xmlValid = "true"; + result = result + xmlValid; + result = result + ""; + result = result + ""; + if (entry.isXmlValid()) { + String repairedEntry = entry.getRepairedEntry(); + repairedEntry = repairedEntry.replaceAll("", ""); + repairedEntry = repairedEntry.replaceAll("", ""); + result = result + repairedEntry; // unicode content of the original entry + } else { + result = result + "This dictionary entry has no valid XML/HTML content in database so a text version of this entry is shown"; + String originalEntry = entry.getOriginalEntry(); // original content: not valid and e.g. 
in Betacode + originalEntry = originalEntry.replaceAll("", ""); + originalEntry = originalEntry.replaceAll("", ""); + originalEntry = StringEscapeUtils.escapeXml(originalEntry); // create text version of the invalid xml content + result = result + originalEntry; + } + result = result + ""; + } + if (entry.getRemoteUrl() != null) + result = result + "" + entry.getRemoteUrl() + ""; + result = result + ""; + } + result = result + ""; + result = result + ""; + return result; + } + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/app/LexiconEntry.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/app/LexiconEntry.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,130 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.dict.app; + +public class LexiconEntry implements Comparable { + private String lexiconName; + private String formName; + private String content; + private String remoteUrl; + private boolean xmlValid = false; + private boolean xmlMadeValid = false; + private String validationCode; + private String validationFailElementName; + + public LexiconEntry(String lexiconName, String formName, String content) { + this.lexiconName = lexiconName; + this.formName = formName; + this.content = content; + if (content != null) { + int begin = content.indexOf(""); + int end = content.indexOf(""); + if (begin != -1 && end != -1) { + String xmlValid = content.substring(begin + 11, end); + if (xmlValid != null) { + if (xmlValid.equals("true")) + this.xmlValid = true; + else if (xmlValid.equals("false")) + this.xmlValid = false; + } + } + } + } + + public String getLexiconName() { + return lexiconName; + } + + public String getFormName() { + return formName; + } + + public void setFormName(String formName) { + this.formName = formName; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + 
this.content = content; + } + + public String getRemoteUrl() { + return remoteUrl; + } + + public void setRemoteUrl(String remoteUrl) { + this.remoteUrl = remoteUrl; + } + + public boolean isXmlValid() { + return xmlValid; + } + + public void setXmlValid(boolean xmlValid) { + this.xmlValid = xmlValid; + } + + public String getValidationCode() { + return validationCode; + } + + public void setValidationCode(String validationCode) { + this.validationCode = validationCode; + } + + public String getValidationFailElementName() { + return validationFailElementName; + } + + public void setValidationFailElementName(String validationFailElementName) { + this.validationFailElementName = validationFailElementName; + } + + public boolean isXmlMadeValid() { + return xmlMadeValid; + } + + public void setXmlMadeValid(boolean xmlMadeValid) { + this.xmlMadeValid = xmlMadeValid; + } + + public String getRepairedEntry() { + String retStr = null; + if (content != null) { + int begin = content.indexOf(""); + int end = content.indexOf(""); + if (begin != -1 && end != -1) { + retStr = content.substring(begin, end) + ""; + } + } + return retStr; + } + + public String getOriginalEntry() { + String retStr = null; + if (content != null) { + int begin = content.indexOf(""); + int end = content.indexOf(""); + if (begin != -1 && end != -1) { + retStr = content.substring(begin, end) + ""; + } + } + return retStr; + } + + public int compareTo(LexiconEntry l) { + if (l.formName == null && this.formName == null) { + return 0; + } + if (this.formName == null) { + return 1; + } + if (l.formName == null) { + return -1; + } + return this.formName.compareTo(l.formName); + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/DBLexWriter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/DBLexWriter.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,629 @@ 
+package de.mpg.mpiwg.berlin.mpdl.lt.dict.db; + +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; +import com.sleepycat.je.util.DbLoad; +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexica; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; +import de.mpg.mpiwg.berlin.mpdl.lt.text.transcode.Transcoder; + +public class DBLexWriter { + private static DBLexWriter instance; + private static String DATA_DIR = Constants.getInstance().getDataDir(); + private static String DATA_FILES_DIR_LEXICA = DATA_DIR + "/dataFiles/pollux"; + private static String DB_DIR_LEXICA = DATA_DIR + "/dataBerkeleyDB/pollux"; + private DbEnvLex dbEnvLexica; + private Date beginOfOperation; + private Date endOfOperation; + + public static DBLexWriter getInstance() throws ApplicationException { + if (instance == null) { + instance = new DBLexWriter(); + } + return instance; + } + + public static void main(String[] args) 
throws ApplicationException { + getInstance(); + instance.beginOperation(); + System.out.print("Start ..."); + // instance.initReadOnly(); + instance.initReadWrite(); + // instance.readSampleData(); + // instance.testTranscoder(); + // instance.printSizeOfAllLexicons(); + instance.writeLexiconsToFiles(); + // instance.loadPolluxDbDumpsToDb(); + // instance.copyAndRepairAndTranscodeDumps(); + instance.end(); + instance.endOperation(); + // Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + // System.out.println("Needed time: " + elapsedTime + " seconds"); + } + + private void initReadWrite() throws ApplicationException { + dbEnvLexica = new DbEnvLex(); + dbEnvLexica.setDataDir(DB_DIR_LEXICA); + dbEnvLexica.initReadWrite(); + } + + private void initReadOnly() throws ApplicationException { + dbEnvLexica = new DbEnvLex(); + dbEnvLexica.setDataDir(DB_DIR_LEXICA); + dbEnvLexica.initReadOnly(); + ArrayList lexicons = Lexica.getInstance().getLocalLexicons(); + for (int i=0; i lexicons = Lexica.getInstance().getLocalLexicons(); + for (int i=0; i dbNames = dbEnvLexica.getEnv().getDatabaseNames(); + String l1 = readEntry("autenrieth", "au)to/s"); + String l2 = readEntry("ls", "laudabilis"); + String l3 = readEntry("lsjUnicode", "ἄδρεπτος"); + String l4 = readEntry("salmoneUnicode", "ءرش"); + System.out.println("Autenrieth: autos: " + l1); + System.out.println("Lewis & Short: Laudabilis: " + l2); + System.out.println("LSJ: ἄδρεπτος: " + l3); + System.out.println("Salmone: طب: " + l4); + printSampleEntries("salmoneUnicode", 10); + printSampleEntries("lsjUnicode", 1000); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + private void end() throws ApplicationException { + ArrayList lexicons = Lexica.getInstance().getLocalLexicons(); + for (int i=0; i lexicons = Lexica.getInstance().getLocalLexicons(); + for (int i=0; i"); + int end = 
dbEntryValueStr.indexOf(""); + dbEntryValueStr = dbEntryValueStr.substring(begin, end) + ""; + LexiconEntry dbLexEntry = new LexiconEntry(lexiconName, dbEntryKeyStr, dbEntryValueStr); + LexiconEntry xmlLexiconEntry = xmlParse(dbLexEntry); + if (! xmlLexiconEntry.isXmlValid()) { + sizeXmlNotValidEntries ++; + } + size++; + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + int[] sizes = new int[2]; + sizes[0] = size; + sizes[1] = sizeXmlNotValidEntries; + return sizes; + } + + private void copyAndRepairAndTranscodeDumps() throws ApplicationException { + try { + ArrayList lexicons = Lexica.getInstance().getLocalLexicons(); + for (int i=0; i lexDumpHashMap = getWholeLexiconHashMap(lexiconName + "Dump"); + dbEnvLexica.openDatabase(lexiconName); + Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); + Iterator lexDumpIter = lexDumpHashMap.keySet().iterator(); + while (lexDumpIter.hasNext()) { + String lexDumpKeyStr = lexDumpIter.next(); + DatabaseEntry lexDumpValue = lexDumpHashMap.get(lexDumpKeyStr); + byte[] lexDumpValueBytes = lexDumpValue.getData(); + String lexDumpValueStr = new String(lexDumpValueBytes, "utf-8"); + String newLexValueStr = new String(lexDumpValueBytes, "utf-8"); + // repair lsj + if (lexiconName.equals("lsj")) { + newLexValueStr = newLexValueStr.replaceAll("
        ", "
        "); + newLexValueStr = newLexValueStr.replaceAll("

        ", "

        "); + String elementNameGreek = "G"; + newLexValueStr = deleteNestedTags(elementNameGreek, newLexValueStr); // delete tags and inside + newLexValueStr = newLexValueStr.replaceAll("lang=greek", "lang=\"greek\""); + boolean senseContained = newLexValueStr.matches(".*.*"); + boolean endSenseContained = newLexValueStr.matches(".*.*"); + if (senseContained && ! endSenseContained) + newLexValueStr = newLexValueStr.replaceAll("", ""); + else if (!senseContained && endSenseContained) + newLexValueStr = newLexValueStr.replaceAll("", ""); + boolean refContained = newLexValueStr.matches(".*.*"); + boolean endRefContained = newLexValueStr.matches(".*.*"); + if (refContained && ! endRefContained) + newLexValueStr = newLexValueStr.replaceAll("", ""); + else if (!refContained && endRefContained) + newLexValueStr = newLexValueStr.replaceAll("", ""); + /* + boolean itypeContained = newLexValueStr.matches(".*.*"); + boolean endItypeContained = newLexValueStr.matches(".*.*"); + if (itypeContained && ! endItypeContained) + newLexValueStr = newLexValueStr.replaceAll("", ""); + else if (!itypeContained && endItypeContained) + newLexValueStr = newLexValueStr.replaceAll("", ""); + */ + } + // repair cooper + if (lexiconName.equals("cooper")) { + newLexValueStr = newLexValueStr.replaceAll("", ""); // TODO hack + newLexValueStr = newLexValueStr.replaceAll("

        ", "

        "); // TODO hack + } + // repair baretti + if (lexiconName.equals("baretti")) { + newLexValueStr = newLexValueStr.replaceAll("

      • ", "
      • "); // TODO hack + } + // repair for all lexicons + newLexValueStr = newLexValueStr.replaceAll("type=style", "type=\"style\""); + newLexValueStr = newLexValueStr.replaceAll("type=dom", "type=\"dom\""); + newLexValueStr = newLexValueStr.replaceAll("<\\*>", ""); + newLexValueStr = newLexValueStr.replaceAll("

        ", "

        "); + LexiconEntry newLexEntryTemp = new LexiconEntry(lexiconName, lexDumpKeyStr, newLexValueStr); // lexDumpKeyStr is not transcoded yet but it will not be used in further in the code + LexiconEntry newLexEntry = xmlParseAndRepair(newLexEntryTemp); + String xmlValidString = "true"; + if (! newLexEntry.isXmlValid()) { + xmlValidString = "false"; + } + newLexValueStr = newLexEntry.getContent(); + // transcode the Betacode lexicon entries to Unicode (key and value) + if (lexicon.isBetacodeLexicon()) { + Transcoder transcoder = Transcoder.getInstance(); + lexDumpKeyStr = transcoder.transcodeFromBetaCode2Unicode(lexDumpKeyStr); + String elementName = "G"; + if (newLexEntry.isXmlValid()) { + newLexValueStr = transcodeByElementName("fromBetacode2Unicode", elementName, newLexValueStr); + } + } + // transcode the Buckwalter entries to Unicode (key and value) + if (lexicon.isBuckwalterLexicon()) { + Transcoder transcoder = Transcoder.getInstance(); + lexDumpKeyStr = transcoder.transcodeFromBuckwalter2Unicode(lexDumpKeyStr); + String elementName = "AR"; + if (newLexEntry.isXmlValid()) { + newLexValueStr = transcodeByElementName("fromBuckwalter2Unicode", elementName, newLexValueStr); + } + } + // put the entry into database + newLexValueStr = "" + xmlValidString + "" + lexDumpValueStr + "" + "" + newLexValueStr + "" + ""; + DatabaseEntry newLexDumpKey = new DatabaseEntry(lexDumpKeyStr.getBytes("utf-8")); + DatabaseEntry newLexValue = new DatabaseEntry(newLexValueStr.getBytes("utf-8")); + lexDB.put(null, newLexDumpKey, newLexValue); + } + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private void printSampleEntries(String lexiconName, int count) throws ApplicationException { + try { + int counter = 0; + dbEnvLexica.openDatabase(lexiconName); + Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); + Cursor cursor = lexDB.openCursor(null, null); + 
DatabaseEntry dbEntryKey = new DatabaseEntry(); + DatabaseEntry dbEntryValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS && counter < count) { + int size = dbEntryKey.getSize(); + if (size > 0) { + byte[] dbEntryKeyBytes = dbEntryKey.getData(); + String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8"); + System.out.println(lexiconName + ": key: " + dbEntryKeyStr + " value size: " + dbEntryValue.getSize()); + } + operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + counter++; + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private void testTranscoder() throws ApplicationException { + String testStr = "hfhf fdfdei)mi/ (sum), Aeol. e)/mmi hfhfh Sapph.2.15, Theoc.20.32; Cret. h)mi/ GDI 4959a; 2sg. ei)=, Ep. and Ion. ei)s Od.17.388, al., Aeol. e)/ssi, Ep. and Dor. e)ssi/ Il.1.176, Pi."; + String testStr2 = "aaaaa 1111a 2222a 3333a 1111a aaaaa bbbbb 1111b 2222b 3333b 1111b bbbbb "; + String testStr3 = "e)pano/rqwsin e)/xein, opp a)ni/aton ei)=nai *hi3. 1165 b18. --e)panorqw/seis kai boh/qeiai *rb5. 1383 a20."; + String testStr4 = "suni^hmi Ar.Av.946 (s. v.l.), Strato Com.1.3: with variation of quantity, plei=ston ou)=lon i(/ei [i^], i)/oulon i(/ei [i_] Carm.Pop. 1.]:—" + + ";
        release, let go, h(=ka ..po/das kai\\ xei=re fe/resqai Od.12.442; h(=ke fe/resqai let him float" + + "off, Il.21.120; let fall, ka\\d de\\ ka/rhtos h(=ke ko/mas made his locks flow down from his head, Od.<" + + "/author>6.231; [e)qei/ras] i(/ei lo/fon a)mfi/ .... ggg"; + String testStr5 = "plei=ston ou)=lon i(/ei "; + String testStr6 = "*a as< as as: *)a *s ss "; + Transcoder t = Transcoder.getInstance(); + String transcoded = t.transcodeFromBetaCode2Unicode(testStr4); + transcoded = t.transcodeFromBetaCode2Unicode(testStr5); + transcoded = t.transcodeFromBetaCode2Unicode(testStr6); + + String arabTestStr1 = "^nutaf"; + String arabTestStr2 = "min"; + String arabTestStr3 = "Aal-Hiyal (^qAla ^>arisTwTAlys) yataEaj~aba Aal-nAs minhA <im~A fy Aal->a$yA' Aal~aty taEriDu TabEAF fa-mim~A lA yuElamu Eil~atuhu wa-<im~A fy Aal->a$yA' Aal-muxAlifap li-l-TabE fa-mim~A yuEmalu bi-Aal-SinAEap li-manfaEap Aal-nAs li->an~a Aal-TabyEap tulzimu >abadAF jihap wAHidap wa->am~A manAfiE Aal-nAs fa-<in~ahA taxtalifu <ixtilAfAF kavyrAF."; + transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr1); + transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr2); + transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr3); + + // String deletedNestedTags = deleteNestedTags("G", testStr4); + // String regExpr = "(.*?)(.*?)(.*?)(.*?)(.*?)"; + String regExpr = "(.*?)(.*)(){1,}(.*?)"; + // String regExpr = "(.*?)(.*?)(.*?)(.*?)(.*?)"; + String replaceStr = testStr2.replaceAll(regExpr, "$1$2$4"); + // String replaceStr2 = testStr2.replaceAll("(.*)(.*)(.*)(.*)(.*)", "$2$3$4$5"); + regExpr = ".*?(.*?){1,}.*?"; + regExpr = "(.*?)(.*?)(.*?){1,}(.*?)"; + // String regExpr = "[a-zA-Z0-9]+?\\[.+?\\]/" + "|" + "[a-zA-Z0-9]+?/" + "|" + "[a-zA-Z0-9]+?\\[.+\\]$" + "|" + "[a-zA-Z0-9]+?$"; // pathName example: "/archimedes[@xmlns:xlink eq "http://www.w3.org/1999/xlink"]/text/body/chap/p[@type eq "main"]/s/foreign[@lang eq "en"]" + Pattern p = Pattern.compile(regExpr, Pattern.CASE_INSENSITIVE | 
Pattern.MULTILINE); // both flags enabled + Matcher m = p.matcher(testStr2); + while (m.find()) { + int msBeginPos = m.start(); + int msEndPos = m.end(); + String matchStr = testStr2.substring(msBeginPos, msEndPos); + String bla = ""; + } + + String retStr = transcodeByElementName("fromBetacode2Unicode", "G", testStr); + retStr = transcodeByElementName("fromBetacode2Unicode", "G", "bla"); + retStr = transcodeByElementName("fromBetacode2Unicode", "G", ""); + } + + private String transcodeByElementName(String transcodeDirection, String elementName, String inputStr) throws ApplicationException { + if (inputStr == null || elementName == null) + return null; + String elemBeginTag = "<" + elementName + ">"; + String elemEndTag = ""; + Transcoder transcoder = Transcoder.getInstance(); + String outputStr = ""; + int begin = inputStr.indexOf(elemBeginTag); + int end = inputStr.indexOf(elemEndTag); + while (begin != -1 && end != -1 && begin < end) { + String before = inputStr.substring(0, begin); + String origStr = inputStr.substring(begin + elemBeginTag.length(), end); + origStr = StringUtils.deleteSpecialXmlEntities(origStr); + String transcodedStr = origStr; + if (transcodeDirection.equals("fromBetacode2Unicode")) + transcodedStr = transcoder.transcodeFromBetaCode2Unicode(origStr); + else if (transcodeDirection.equals("fromBuckwalter2Unicode")) + transcodedStr = transcoder.transcodeFromBuckwalter2Unicode(origStr); + outputStr = outputStr + before + new String(elemBeginTag); + outputStr = outputStr + transcodedStr; + outputStr = outputStr + new String(elemEndTag); + inputStr = inputStr.substring(end + elemEndTag.length()); + begin = inputStr.indexOf(elemBeginTag); + end = inputStr.indexOf(elemEndTag); + } + outputStr = outputStr + inputStr; + return outputStr; + } + + private String deleteNestedTags(String elementName, String inputStr) { + String inputStrTmp = new String(inputStr); + String elemBeginTag = "<" + elementName + ">"; + String elemEndTag = ""; + String 
outputStr = ""; + int begin = inputStrTmp.indexOf(elemBeginTag); + int end = inputStrTmp.indexOf(elemEndTag); + while (begin != -1 && end != -1) { + end = getIndexClosedTag(begin, elementName, inputStrTmp); + String before = inputStrTmp.substring(0, begin); + String origStr = null; + if (end == -1) // if no end tag could be found + origStr = inputStrTmp.substring(begin + elemBeginTag.length(), inputStrTmp.length()); + else + origStr = inputStrTmp.substring(begin + elemBeginTag.length(), end); + origStr = origStr.replaceAll(elemBeginTag, ""); + origStr = origStr.replaceAll(elemEndTag, ""); + outputStr = outputStr + before + new String(elemBeginTag); + outputStr = outputStr + origStr; + outputStr = outputStr + new String(elemEndTag); + inputStrTmp = inputStrTmp.substring(end + elemEndTag.length()); + begin = inputStrTmp.indexOf(elemBeginTag); + } + outputStr = outputStr + inputStrTmp; + return outputStr; + } + + private int getIndexClosedTag(int begin, String elementName, String inputStr) { + int beginTmp = begin; + int retIndex = -1; + String elemBeginTag = "<" + elementName + ">"; + String elemEndTag = ""; + int indexEndTag = inputStr.indexOf(elemEndTag); + while (indexEndTag != -1) { + String betweenTmpStr = inputStr.substring(beginTmp + elemBeginTag.length(), indexEndTag); + int indexBeginTag = betweenTmpStr.indexOf(elemBeginTag); + if (indexBeginTag != -1) { + beginTmp = indexEndTag; + } else { + return indexEndTag; + } + indexEndTag = inputStr.indexOf(elemEndTag, indexEndTag + elemEndTag.length()); + } + return retIndex; + } + + private HashMap getWholeLexiconHashMap(String lexiconName) throws ApplicationException { + HashMap lexHashMap = new HashMap(); + try { + dbEnvLexica.openDatabase(lexiconName); + Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); + Cursor cursor = lexDB.openCursor(null, null); + DatabaseEntry dbEntryKey = new DatabaseEntry(); + DatabaseEntry dbEntryValue = new DatabaseEntry(); + OperationStatus operationStatus = 
cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + int size = dbEntryKey.getSize(); + if (size > 0) { + byte[] dbEntryKeyBytes = dbEntryKey.getData(); + String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8"); + DatabaseEntry newDbEntryValue = new DatabaseEntry(dbEntryValue.getData()); + lexHashMap.put(dbEntryKeyStr, newDbEntryValue); + } + operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return lexHashMap; + } + + private LexiconEntry xmlParseAndRepair(LexiconEntry lexEntry) throws ApplicationException { + String origLexEntryContent = lexEntry.getContent(); + String lexEntryContent = new String(origLexEntryContent); + lexEntry.setContent(lexEntryContent); + // parse and repair: try to repair it 3 times through parsing + LexiconEntry retLexiconEntry = xmParseAndRepairLocal(lexEntry); + retLexiconEntry = xmParseAndRepairLocal(retLexiconEntry); + retLexiconEntry = xmParseAndRepairLocal(retLexiconEntry); + // if it could not be repaired the original content (which is not XML valid) is delivered + if (! retLexiconEntry.isXmlValid()) + retLexiconEntry.setContent(origLexEntryContent); + return retLexiconEntry; + } + + private LexiconEntry xmParseAndRepairLocal(LexiconEntry lexEntry) throws ApplicationException { + if (! lexEntry.isXmlValid()) { + lexEntry = xmlParse(lexEntry); + } + if (! 
lexEntry.isXmlValid() && lexEntry.getValidationCode() != null && lexEntry.getValidationCode().equals("elementNotClosed")) { + String elementName = lexEntry.getValidationFailElementName(); + String lexiconEntryContent = lexEntry.getContent(); + lexiconEntryContent = lexiconEntryContent.replaceAll("<" + elementName + " .*?>", ""); + lexiconEntryContent = lexiconEntryContent.replaceAll("", ""); + lexEntry.setContent(lexiconEntryContent); + lexEntry.setXmlMadeValid(true); + } + return lexEntry; + } + + private LexiconEntry xmlParse(LexiconEntry lexEntry) throws ApplicationException { + String lexEntryContent = "" + lexEntry.getContent() + ""; + LexEntryContentHandler lexEntryContentHandler = new LexEntryContentHandler(); + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(lexEntryContentHandler); + LexEntryErrorHandler lexEntryErrorHandler = new LexEntryErrorHandler(); + xmlParser.setErrorHandler(lexEntryErrorHandler); + try { + Reader reader = new StringReader(lexEntryContent); + InputSource input = new InputSource(reader); + xmlParser.parse(input); + lexEntry.setXmlValid(true); + } catch (SAXException e) { + // nothing but following + lexEntry.setXmlValid(false); + String exceptionMessage = e.getMessage(); + if (exceptionMessage.matches("The element type .* must be terminated by the matching end-tag .*")) { + int begin = exceptionMessage.indexOf("\""); + if (begin != -1) { + String subStr = exceptionMessage.substring(begin + 1); + int end = subStr.indexOf("\""); + if (end != -1) { + String elementName = exceptionMessage.substring(begin + 1, begin + 1 + end); + lexEntry.setValidationCode("elementNotClosed"); + lexEntry.setValidationFailElementName(elementName); + } + } + } + } catch (IOException e) { + throw new ApplicationException(e); + } + return lexEntry; + } + + private void writeLexiconsToFiles() throws ApplicationException { + BufferedReader in = null; + BufferedOutputStream out = null; + try { + ArrayList lexicons = 
Lexica.getInstance().getLocalLexicons(); + for (int i=0; i lexHashMap = getWholeLexiconHashMap(lexiconName); + Iterator lexDumpIter = lexHashMap.keySet().iterator(); + File outputFile = new File(DATA_FILES_DIR_LEXICA + "/" + lexiconName + ".xml"); + out = new BufferedOutputStream(new FileOutputStream(outputFile)); + write("\n", out); + write("" + lexiconName + "\n", out); + write("" + lexicon.getDescription() + "\n", out); + write("\n", out); + while (lexDumpIter.hasNext()) { + write("\n", out); + String lexKeyStr = lexDumpIter.next(); + write("

        " + lexKeyStr + "
        \n", out); + DatabaseEntry lexValue = lexHashMap.get(lexKeyStr); + byte[] lexValueBytes = lexValue.getData(); + write(lexValueBytes, out); + write("\n", out); + } + write("\n", out); + write("\n", out); + } + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } finally { + // always close the stream + if (in != null) try { in.close(); } catch (Exception e) { } + if (out != null) try { out.close(); } catch (Exception e) { } + } + } + + private void write(byte[] inputBytes, BufferedOutputStream out) throws ApplicationException { + try { + out.write(inputBytes, 0, inputBytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void write(String outStr, BufferedOutputStream out) throws ApplicationException { + try { + byte[] bytes = outStr.getBytes("utf-8"); + out.write(bytes, 0, bytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/DbEnvLex.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/DbEnvLex.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,101 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.dict.db; + +import java.io.File; +import java.util.HashMap; + +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseConfig; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class DbEnvLex { + private String dataDir; + private File envPath; + private Environment env; + private EnvironmentConfig envConfig; + private 
DatabaseConfig dbConfig; + private HashMap lexiconDBs = new HashMap(); + + public DbEnvLex() { + } + + public void setDataDir(String dataDir) { + this.dataDir = dataDir; + } + + public void initReadOnly() throws ApplicationException { + try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void initReadWrite() throws ApplicationException { + try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envConfig.setReadOnly(false); + dbConfig.setReadOnly(false); + envConfig.setAllowCreate(true); + dbConfig.setAllowCreate(true); + envConfig.setTransactional(true); + dbConfig.setTransactional(true); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void openDatabase(String lexiconName) throws ApplicationException { + try { + Database lexDB = lexiconDBs.get(lexiconName); + if (lexDB == null) { + Database lexiconDB = env.openDatabase(null, lexiconName + ".db", dbConfig); + lexiconDBs.put(lexiconName, lexiconDB); + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void closeDatabase(String lexiconName) throws ApplicationException { + try { + if (lexiconDBs != null) { + Database lexiconDB = lexiconDBs.get(lexiconName); + if (lexiconDB != null) + lexiconDB.close(); + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public Environment getEnv() { + return env; + } + + public Database getLexiconDB(String lexiconName) { + Database lexiconDB = lexiconDBs.get(lexiconName); + return lexiconDB; + } + + public void close() throws ApplicationException { + if (env != null) { + try { + if (env != null) + env.close(); + } catch (DatabaseException e) { + throw new 
ApplicationException(e); + } + } + } +} + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexEntryContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexEntryContentHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,43 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.dict.db; + +import org.xml.sax.*; + +public class LexEntryContentHandler implements ContentHandler { + + public LexEntryContentHandler() { + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + } + + public void characters(char[] c, int start, int length) throws SAXException { + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(org.xml.sax.Locator arg1) { + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + } + + public void endElement(String uri, String localName, String name) throws SAXException { + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexEntryErrorHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexEntryErrorHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,12 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.dict.db; + +import org.xml.sax.*; + +public class LexEntryErrorHandler implements ErrorHandler { + public void warning(SAXParseException exception) 
throws SAXException { + } + public void error(SAXParseException exception) throws SAXException { + } + public void fatalError(SAXParseException exception) throws SAXException { + } +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,353 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.dict.db; + +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.logging.Logger; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; + +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexica; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache; +import de.mpg.mpiwg.berlin.mpdl.lt.text.transcode.Transcoder; + +public class LexHandler { + private static LexHandler instance; + private static Logger LOGGER = Logger.getLogger(LexHandler.class.getName()); + private static String DATA_DIR = Constants.getInstance().getDataDir(); + private static String DB_DIR_LEXICA = DATA_DIR + "/dataBerkeleyDB/pollux"; + private DbEnvLex dbEnvLexica; + private Date beginOfOperation; + private Date endOfOperation; + + public static LexHandler getInstance() throws 
ApplicationException { + if (instance == null) { + instance = new LexHandler(); + instance.initReadOnly(); + } + return instance; + } + + public void end() throws ApplicationException { + ArrayList lexicons = Lexica.getInstance().getLocalLexicons(); + for (int i=0; i getLemmas(String query, String type, String language, String normalization) throws ApplicationException { + ArrayList lexLemmas = new ArrayList(); + // get lemmas of all forms in query + MorphologyCache morphologyCache = MorphologyCache.getInstance(); + String[] queryForms = query.split(" "); + for (int k=0; k lemmas = null; + if (type.equals("form")) { + if (normalization.equals("norm")) + lemmas = morphologyCache.getLemmasByFormName(language, queryForm, true); + else if (normalization.equals("none")) + lemmas = morphologyCache.getLemmasByFormName(language, queryForm, false); + else + lemmas = morphologyCache.getLemmasByFormName(language, queryForm, true); // TODO reg and reg+norm + } else if (type.equals("lemma")) { + lemmas = new ArrayList(); + Lemma l = null; + if (normalization.equals("norm")) + l = morphologyCache.getLemma(language, queryForm, true); + else if (normalization.equals("none")) + l = morphologyCache.getLemma(language, queryForm, false); + else + l = morphologyCache.getLemma(language, queryForm, true); + if (l != null) + lemmas.add(l); + } + if (lemmas != null && ! 
lemmas.isEmpty()) { + lexLemmas.addAll(lemmas); + } else { + Lemma l = new Lemma("created dynamically cause no lemma is available", language, queryForm); // at least the word form is added for finding it in the lexicon + lexLemmas.add(l); + } + } + Collections.sort(lexLemmas); + if (lexLemmas.isEmpty()) + return null; + else + return lexLemmas; + } + + public ArrayList getLexEntries(ArrayList lexLemmas, String language, String lexiconName) throws ApplicationException { + ArrayList retLexicons = new ArrayList(); + ArrayList lexicons = Lexica.getInstance().getLexicons(language); + if (lexiconName != null) { + lexicons = new ArrayList(); + Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName); + if (lexicon != null) + lexicons.add(lexicon); + } + if (lexicons != null) { + for (int i=0; i getLexEntryKeys(String formName, String language, boolean normalize) throws ApplicationException { + ArrayList lexEntryKeys = new ArrayList(); + MorphologyCache morphologyCache = MorphologyCache.getInstance(); + ArrayList formLemmas = morphologyCache.getLemmasByFormName(language, formName, normalize); + boolean hasLexEntry = false; + hasLexEntry = hasLexEntryKey(formName, language); + if (hasLexEntry) + lexEntryKeys.add(formName); + if (formLemmas != null) { + for (int j=0; j statLexicons = Lexica.getInstance().getLocalLexicons(language); + if (statLexicons != null) { + for (int i=0; i getLexEntriesBeginningWith(String language, String formPrefix, int pageNumber) throws ApplicationException { + int pageSize = 50; + int from = (pageNumber * pageSize) - pageSize + 1; + int to = pageNumber * pageSize; + ArrayList statLexicons = Lexica.getInstance().getLocalLexicons(language); + ArrayList retLexicons = null; + if (statLexicons != null) { + for (int i=0; i lexEntries = readEntriesBeginningWith(lexiconName, formPrefix, from, to); + // TODO merge the entries and remove duplicates + if (lexEntries != null) { + lexicon.addEntries(lexEntries); + if (retLexicons == null) + retLexicons = 
new ArrayList(); + retLexicons.add(lexicon); + } + } + } + return retLexicons; + } + + public ArrayList getLexEntriesByLexiconBeginningWith(String lexiconName, String formPrefix, int pageNumber) throws ApplicationException { + int pageSize = 50; + int from = (pageNumber * pageSize) - pageSize + 1; + int to = pageNumber * pageSize; + Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName).clone(); + ArrayList retLexicons = null; + if (lexicon != null) { + ArrayList lexEntries = readEntriesBeginningWith(lexiconName, formPrefix, from, to); + if (lexEntries != null) { + lexicon.addEntries(lexEntries); + retLexicons = new ArrayList(); + retLexicons.add(lexicon); + } + } + return retLexicons; + } + + private LexiconEntry getEntry(Lexicon lexicon, String formName) throws ApplicationException { + LexiconEntry lexEntry = null; + if (lexicon.isLocalLexicon()) { + lexEntry = readEntry(lexicon.getName(), formName); + String lexiconQueryUrl = lexicon.getQueryUrl(); + if (lexEntry != null && lexicon.getQueryUrl() != null) { + String language = lexicon.getSourceLanguage(); + if (Language.getInstance().isGreek(language)) { + formName = Transcoder.getInstance().transcodeFromUnicode2BetaCode(formName); + } else if (Language.getInstance().isArabic(language)) { + formName = Transcoder.getInstance().transcodeFromUnicode2Buckwalter(formName); + } + lexEntry.setRemoteUrl(lexiconQueryUrl + formName); + } + } else { + lexEntry = lexicon.getDynamicEntry(formName); + } + return lexEntry; + } + + private LexiconEntry readEntry(String lexiconName, String formName) throws ApplicationException { + LexiconEntry retLexEntry = null; + try { + String dbFoundValueStr = null; + String keyStr = formName; + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); + Cursor cursor = lexDB.openCursor(null, null); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = 
cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); + if (operationStatus == OperationStatus.SUCCESS) { + byte[] foundValueBytes = foundValue.getData(); + dbFoundValueStr = new String(foundValueBytes, "utf-8"); + } + cursor.close(); + if (dbFoundValueStr != null) { + retLexEntry = new LexiconEntry(lexiconName, formName, dbFoundValueStr); + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retLexEntry; + } + + private ArrayList readEntriesBeginningWith(String lexiconName, String formPrefix, int from, int to) throws ApplicationException { + ArrayList retLexEntries = new ArrayList();; + try { + String dbFoundValueStr = null; + String keyStr = formPrefix; + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); + Cursor cursor = lexDB.openCursor(null, null); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKeyRange(dbEntryKey, foundValue, LockMode.DEFAULT); + int counter = 1; + while (operationStatus == OperationStatus.SUCCESS && counter <= to) { + if (counter >= from) { + byte[] foundValueBytes = foundValue.getData(); + dbFoundValueStr = new String(foundValueBytes, "utf-8"); + byte[] foundKeyBytes = dbEntryKey.getData(); + String dbFoundKeyStr = new String(foundKeyBytes, "utf-8"); + LexiconEntry lexEntry = new LexiconEntry(lexiconName, dbFoundKeyStr, dbFoundValueStr); + retLexEntries.add(lexEntry); + } + operationStatus = cursor.getNext(dbEntryKey, foundValue, LockMode.DEFAULT); + counter++; + } + cursor.close(); + if (retLexEntries.isEmpty()) { + return null; + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retLexEntries; + } + + public static void main(String[] args) throws ApplicationException 
{ + getInstance(); + instance.beginOperation(); + System.out.print("Start ..."); + instance.readSampleData(); + instance.end(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + System.out.println("Needed time: " + elapsedTime + " seconds"); + } + + private void initReadOnly() throws ApplicationException { + dbEnvLexica = new DbEnvLex(); + dbEnvLexica.setDataDir(DB_DIR_LEXICA); + dbEnvLexica.initReadOnly(); + ArrayList lexicons = Lexica.getInstance().getLocalLexicons(); + for (int i=0; i dbNames = dbEnvLexica.getEnv().getDatabaseNames(); + String l1 = readEntry("autenrieth", "au)to/s").getContent(); // greek: see also bonitz and lsj + String l2 = readEntry("ls", "laudabilis").getContent(); // latin + System.out.println("Autenrieth: autos: " + l1); + System.out.println("Lewis & Short: Laudabilis: " + l2); + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Constants.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Constants.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,36 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +import java.net.URL; +import java.util.Properties; + +import de.mpg.mpiwg.berlin.mpdl.util.Util; + +public class Constants { + public static String DEFAULT_LANGUAGE = "en"; + public static int MORPHOLOGY_CACHE_SIZE = 1000000; + private static Constants instance; + private Properties properties; + + public static Constants getInstance() { + if (instance == null) { + instance = new Constants(); + instance.init(); + } + return instance; + } + + private void init() { + URL url = 
Constants.class.getClassLoader().getResource("constants.properties"); + if (url != null) { + String propertiesFileName = url.toString().substring(5); + properties = (new Util()).getProperties(propertiesFileName); + } + } + + public String getDataDir() { + if (properties != null) + return properties.getProperty("dataDir"); + else + return "no properties file"; + } +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,172 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +import java.util.HashMap; + +/** + * + * Language codes from ISO 639-3 + * + */ +public class Language { + private static Language instance; + private static HashMap languageIds = new HashMap(); + private static HashMap iso639Codes = new HashMap(); + + public static Language getInstance() { + if (instance == null) { + instance = new Language(); + instance.init(); + } + return instance; + } + + private void init() { + languageIds.put("ar", "ar"); + languageIds.put("ara", "ar"); + languageIds.put("de", "de"); + languageIds.put("ger", "de"); + languageIds.put("deu", "de"); + languageIds.put("el", "el"); + languageIds.put("grc", "el"); + languageIds.put("en", "en"); + languageIds.put("eng", "en"); + languageIds.put("fr", "fr"); + languageIds.put("fra", "fr"); + languageIds.put("it", "it"); + languageIds.put("ita", "it"); + languageIds.put("la", "la"); + languageIds.put("lat", "la"); + languageIds.put("nl", "nl"); + languageIds.put("nld", "nl"); + languageIds.put("zh", "zh"); + languageIds.put("zho", "zh"); + languageIds.put("zho-Hant", "zh"); + + iso639Codes.put("ar", "ara"); + iso639Codes.put("ara", "ara"); + iso639Codes.put("de", "ger"); + iso639Codes.put("ger", "ger"); + iso639Codes.put("deu", "ger"); + iso639Codes.put("el", "grc"); + 
iso639Codes.put("grc", "grc"); + iso639Codes.put("en", "eng"); + iso639Codes.put("eng", "eng"); + iso639Codes.put("fr", "fra"); + iso639Codes.put("fra", "fra"); + iso639Codes.put("it", "ita"); + iso639Codes.put("ita", "ita"); + iso639Codes.put("la", "lat"); + iso639Codes.put("lat", "lat"); + iso639Codes.put("nl", "nld"); + iso639Codes.put("nld", "nld"); + iso639Codes.put("zh", "zho"); + iso639Codes.put("zho", "zho"); + iso639Codes.put("zho-Hant", "zho"); + } + + public String getISO639Code(String language) { + if (language == null) + return null; + String retISO639Code = null; + retISO639Code = iso639Codes.get(language); + return retISO639Code; + } + + public String getLanguageId(String language) { + if (language == null) + return null; + String retLanguageId = null; + retLanguageId = languageIds.get(language); + return retLanguageId; + } + + public boolean isLatin(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("la")) + return true; + else + return false; + } + + public boolean isGerman(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("de")) + return true; + else + return false; + } + + public boolean isFrench(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("fr")) + return true; + else + return false; + } + + public boolean isEnglish(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("en")) + return true; + else + return false; + } + + public boolean isDutch(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("nl")) + return true; + else + return false; + } + + public boolean isGreek(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("el")) + return 
true; + else + return false; + } + + public boolean isArabic(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("ar")) + return true; + else + return false; + } + + public boolean isItalian(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("it")) + return true; + else + return false; + } + + public boolean isChinese(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("zh")) + return true; + else + return false; + } +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/Form.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/Form.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,337 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; + +public class Form implements Comparable
        { + private String provider; + private String language; + private String formName; + private String lemmaName; + private String pos; + private String tense; + private String voice; + private String casus; + private String number; + private String mood; + private String person; + private String gender; + private String definite; + + public Form() { + } + + public Form(String provider, String language, String formName) { + this.provider = provider; + this.language = language; + this.formName = formName; + } + + public int compareTo(Form f) { + return formName.compareTo(f.formName); + } + + public void normalize() { + // lower case of form and lemma + formName = formName.toLowerCase(); + lemmaName = lemmaName.toLowerCase(); + // XML: special symbols + formName = formName.replaceAll("&", "&"); + formName = formName.replaceAll("'", "'"); + formName = formName.replaceAll("<", "<"); + formName = formName.replaceAll(">", ">"); + formName = formName.replaceAll("\"", """); + lemmaName = lemmaName.replaceAll("&", "&"); + lemmaName = lemmaName.replaceAll("'", "'"); + lemmaName = lemmaName.replaceAll("<", "<"); + lemmaName = lemmaName.replaceAll(">", ">"); + lemmaName = lemmaName.replaceAll("\"", """); + // unification of lemma names (homographs) TODO do not unificate the homographs + lemmaName = lemmaName.replaceAll("#[0-9]", ""); + if (isArabic()) { + if (lemmaName != null) { + int length = lemmaName.length(); + char lastChar = lemmaName.charAt(length - 1); + boolean isDigit = Character.isDigit(lastChar); + if (isDigit) + lemmaName = lemmaName.substring(0, length - 1); + } + } + // unification of forms and lemmas with hyphens: remove the hyphen + formName = formName.replaceAll("-", ""); + lemmaName = lemmaName.replaceAll("-", ""); + // unification of forms and lemmas with blanks (sequence of words): remove the blanks + formName = formName.replaceAll(" ", ""); + lemmaName = lemmaName.replaceAll(" ", ""); + // unification of forms and lemmas with plus symbols: remove 
the plus symbol + formName = formName.replaceAll("\\+", ""); + lemmaName = lemmaName.replaceAll("\\+", ""); + // TODO call MpdlMorphDataNormalizer (handle Umlauts in german, accents in french, character classes (longs, s, ...) ...) + + } + + public boolean isOk() { + boolean ret = true; + if (formName == null || lemmaName == null) + ret = false; + else if (formName.length() == 0 || lemmaName.length() == 0 || formName.length() == 1 || lemmaName.length() == 1) + ret = false; + return ret; + } + + public boolean isGreek() { + boolean ret = false; + if (language != null && language.equals("el")) + ret = true; + return ret; + } + + public boolean isArabic() { + boolean ret = false; + if (language != null && language.equals("ar")) + ret = true; + return ret; + } + + public boolean isRicherThan(Form otherForm) { + boolean richer = false; + if (! isOk()) + return false; + else if (! otherForm.isOk()) + return true; + String otherFormPos = otherForm.getPos(); + if (pos != null && pos.length() > 0 && (otherFormPos == null || otherFormPos.length() == 0)) + return true; + // TODO all other cases + return richer; + } + + public String getXmlString() { + String xmlString = "\n"; + if (provider != null) + xmlString += " " + provider + "\n"; + if (language != null) + xmlString += " " + language + "\n"; + if (formName != null) + xmlString += " " + formName + "\n"; + if (lemmaName != null) + xmlString += " " + lemmaName + "\n"; + if (pos != null) + xmlString += " " + pos + "\n"; + if (tense != null) + xmlString += " " + tense + "\n"; + if (voice != null) + xmlString += " " + voice + "\n"; + if (casus != null) + xmlString += " " + casus + "\n"; + if (number != null) + xmlString += " " + number + "\n"; + if (mood != null) + xmlString += " " + mood + "\n"; + if (person != null) + xmlString += " " + person + "\n"; + if (gender != null) + xmlString += " " + gender + "\n"; + if (definite != null) + xmlString += " " + definite + "\n"; + xmlString += "
        \n"; + return xmlString; + } + + public String toString() { + return getXmlString(); + } + + public String getTense() { + return tense; + } + + public void setTense(String tense) { + this.tense = tense; + } + + public void addTense(String newTense) { + if (tense == null) + this.tense = newTense; + else + tense += newTense; + } + + public String getVoice() { + return voice; + } + + public void setVoice(String voice) { + this.voice = voice; + } + + public void addVoice(String newVoice) { + if (voice == null) + this.voice = newVoice; + else + voice += newVoice; + } + + public String getCasus() { + return casus; + } + + public void setCasus(String casus) { + this.casus = casus; + } + + public void addCasus(String newCasus) { + if (casus == null) + this.casus = newCasus; + else + casus += newCasus; + } + + public String getNumber() { + return number; + } + + public void setNumber(String number) { + this.number = number; + } + + public void addNumber(String newNumber) { + if (number == null) + this.number = newNumber; + else + number += newNumber; + } + + public String getMood() { + return mood; + } + + public void setMood(String mood) { + this.mood = mood; + } + + public void addMood(String newMood) { + if (mood == null) + this.mood = newMood; + else + mood += newMood; + } + + public String getPerson() { + return person; + } + + public void setPerson(String person) { + this.person = person; + } + + public void addPerson(String newPerson) { + if (person == null) + this.person = newPerson; + else + person += newPerson; + } + + public String getGender() { + return gender; + } + + public void setGender(String gender) { + this.gender = gender; + } + + public void addGender(String newGender) { + if (gender == null) + this.gender = newGender; + else + gender += newGender; + } + + public String getDefinite() { + return definite; + } + + public void setDefinite(String definite) { + this.definite = definite; + } + + public void addDefinite(String newDefinite) { + if 
(definite == null) + this.definite = newDefinite; + else + definite += newDefinite; + } + + public String getLemmaName() { + return lemmaName; + } + + public String getPos() { + return pos; + } + + public String getProvider() { + return provider; + } + + public void setProvider(String provider) { + this.provider = provider; + } + + public void addProvider(String newProvider) { + if (provider == null) + this.provider = newProvider; + else + provider += newProvider; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public void addLanguage(String newLanguage) { + if (language == null) + this.language = newLanguage; + else + language += newLanguage; + } + + public String getFormName() { + return formName; + } + + public void setFormName(String formName) { + this.formName = formName; + } + + public void addFormName(String newFormName) { + if (formName == null) + this.formName = newFormName; + else + formName += newFormName; + } + + public void setLemmaName(String lemmaName) { + this.lemmaName = lemmaName; + } + + public void addLemmaName(String newLemmaName) { + if (lemmaName == null) + this.lemmaName = newLemmaName; + else + lemmaName += newLemmaName; + } + + public void setPos(String pos) { + this.pos = pos; + } + + public void addPos(String newPos) { + if (pos == null) + this.pos = newPos; + else + pos += newPos; + } + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/Lemma.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/Lemma.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,152 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; + +import java.util.ArrayList; +import java.util.Enumeration; +import java.util.Hashtable; + + +public class Lemma implements Comparable { + private String provider; + private String language; + 
private String lemmaName; + private Hashtable forms; + + public Lemma() { + } + + public Lemma(String provider, String language, String lemmaName) { + this.provider = provider; + this.language = language; + this.lemmaName = lemmaName; + this.forms = new Hashtable(); + // always contains the form with the same lemma name + Form form = new Form(provider, language, lemmaName); + addForm(form); + } + + public String getProvider() { + return provider; + } + + public void setProvider(String provider) { + this.provider = provider; + } + + public void addProvider(String newProvider) { + if (provider == null) + this.provider = newProvider; + else + provider += newProvider; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public void addLanguage(String newLanguage) { + if (language == null) + this.language = newLanguage; + else + language += newLanguage; + } + + public String getLemmaName() { + return lemmaName; + } + + public void setLemmaName(String lemmaName) { + this.lemmaName = lemmaName; + } + + public void addLemmaName(String newLemmaName) { + if (lemmaName == null) + this.lemmaName = newLemmaName; + else + lemmaName += newLemmaName; + } + + public Hashtable getForms() { + return forms; + } + + public ArrayList
        getForms(String provider) { + ArrayList result = new ArrayList(); + Enumeration keys = forms.keys(); + while (keys.hasMoreElements()) { + String key = keys.nextElement(); + Form form = forms.get(key); + String prov = form.getProvider(); + if (prov.equals(provider)) + result.add(form); + } + return result; + } + + public ArrayList getFormsList() { + ArrayList result = new ArrayList(); + if(forms != null) { + Enumeration keys = forms.keys(); + while (keys.hasMoreElements()) { + String key = keys.nextElement(); + Form form = forms.get(key); + result.add(form); + } + } + return result; + } + + public void setForms(ArrayList forms) { + for (int i=0; i(); + Form f = forms.get(formKey); + if (f == null) { + forms.put(formKey, newForm); + } else { + if(newForm.isRicherThan(f)) + forms.put(formKey, newForm); + } + } + + public Form getForm(String formKey) { + return forms.get(formKey); + } + + public String getXmlString() { + String xmlString = "\n"; + xmlString += " " + provider + "\n"; + xmlString += " " + language + "\n"; + xmlString += " " + lemmaName + "\n"; + xmlString += ""; + return xmlString; + } + + public String toString() { + return getXmlString(); + } + + public int compareTo(Lemma l) { + if (l.getLemmaName() == null && this.getLemmaName() == null) { + return 0; + } + if (this.getLemmaName() == null) { + return 1; + } + if (l.getLemmaName() == null) { + return -1; + } + return this.getLemmaName().compareTo(l.getLemmaName()); + } + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphFileReaderContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphFileReaderContentHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,127 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; + +import java.util.Hashtable; + +import org.xml.sax.*; + +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; 
+import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; + +public class MorphFileReaderContentHandler implements ContentHandler { + private Hashtable forms; + private Hashtable lemmas; + private Element currentElement; + private Form currentForm; + + public MorphFileReaderContentHandler(Hashtable forms, Hashtable lemmas) { + this.forms = forms; + this.lemmas = lemmas; + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + } + + public void characters(char[] c, int start, int length) throws SAXException { + if (currentElement != null) { + String elemName = currentElement.name; + if (currentForm != null) { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (charactersStr != null && ! (charactersStr.trim().equals(""))) { + if (elemName.equals("provider")) { + currentForm.setProvider(charactersStr); + } else if (elemName.equals("language")) { + currentForm.setLanguage(charactersStr); + } else if (elemName.equals("form-name")) { + currentForm.setFormName(charactersStr); + } else if (elemName.equals("lemma-name")) { + currentForm.setLemmaName(charactersStr); + } else if (elemName.equals("pos")) { + currentForm.setPos(charactersStr); + } else if (elemName.equals("tense")) { + currentForm.setTense(charactersStr); + } else if (elemName.equals("voice")) { + currentForm.setVoice(charactersStr); + } else if (elemName.equals("casus")) { + currentForm.setCasus(charactersStr); + } else if (elemName.equals("number")) { + currentForm.setNumber(charactersStr); + } else if (elemName.equals("mood")) { + currentForm.setMood(charactersStr); + } else if (elemName.equals("person")) { + currentForm.setPerson(charactersStr); + } else if (elemName.equals("gender")) { + currentForm.setGender(charactersStr); + } else if (elemName.equals("definite")) { + currentForm.setDefinite(charactersStr); + } + } + } + } + } + + public void 
ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(org.xml.sax.Locator arg1) { + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void endElement(String uri, String localName, String name) throws SAXException { + currentElement = null; + if (name.equals("form")) { + String provider = currentForm.getProvider(); + String language = currentForm.getLanguage(); + String formName = currentForm.getFormName(); + String lemmaName = currentForm.getLemmaName(); + String formKey = language + "###" + formName; + forms.put(formKey, currentForm); + String lemmaKey = language + "###" + lemmaName; + Lemma lemma = lemmas.get(lemmaKey); + if(lemma == null) { + Lemma l = new Lemma(provider, language, lemmaName); + l.addForm(currentForm); + lemmas.put(lemmaKey, l); + } else { + lemma.addForm(currentForm); + } + currentForm = null; + } + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + currentElement = new Element(name); + if (name.equals("form")) { + currentForm = new Form(); + } + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + + private class Element { + String name; + String value; + + Element(String name) { + this.name = name; + } + + Element(String name, String value) { + this.name = name; + this.value = value; + } + } +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,295 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; + +import 
java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.Enumeration; +import java.util.Hashtable; + +import java.util.logging.Logger; + +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.db.DBMorphHandler; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; +import de.mpg.mpiwg.berlin.mpdl.lucene.util.LuceneUtil; +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class MorphologyCache { + private static MorphologyCache instance; + private static Logger LOGGER = Logger.getLogger(MorphologyCache.class.getName()); + private static String DATA_DIR = Constants.getInstance().getDataDir(); + private static String DB_DIR_DONATUS = DATA_DIR + "/dataBerkeleyDB/donatus"; + public static int QUERY_MODE = 0; + public static int DOCUMENT_MODE = 1; + private static int MAX_HASHTABLE_SIZE = Constants.MORPHOLOGY_CACHE_SIZE; + protected int mode = QUERY_MODE; + private Hashtable> forms = new Hashtable>(); // cache of forms: hashKey is formName + private Hashtable lemmas = new Hashtable(); // cache of lemmas: hashKey is lemmaName + private DBMorphHandler dbMorphHandlerStatic; // handles static morph data (BerkeleyDB) + private Date beginOfOperation; + private Date endOfOperation; + + public static MorphologyCache getInstance() throws ApplicationException { + if (instance == null) { + instance = new MorphologyCache(); + instance.init(); + } + return instance; + } + + private void init() throws ApplicationException { + instance.beginOperation(); + dbMorphHandlerStatic = new DBMorphHandler(DB_DIR_DONATUS); + dbMorphHandlerStatic.start(); + dbMorphHandlerStatic.openDatabases(); + instance.endOperation(); + Double elapsedTime = new 
Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + LOGGER.info("Morphology db cache: opened (needed " + elapsedTime + " seconds)"); + } + + public int getMode() { + return mode; + } + + public void setMode(int newMode) { + this.mode = newMode; + } + + public void end() throws ApplicationException { + dbMorphHandlerStatic.closeDatabases(); + LOGGER.info("Morphology db cache: closed"); + } + + public ArrayList getLemmasByFormName(String lang, String formNameArg, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList retFormLemmas = null; + String formName = formNameArg; + if (normalize) { + Normalizer normalizer = new Normalizer(language); + formName = normalizer.normalize(formNameArg); + } + // first look in local cache + String key = language + "###" + formName; + Hashtable formLemmasHashtable = forms.get(key); + if (formLemmasHashtable == null) { + ArrayList dbFormLemmas = readLemmasByFormName(language, formName); + // put lemmas into local cache + int localHashTableSize = forms.size(); + if (localHashTableSize >= MAX_HASHTABLE_SIZE) { + clearCache(); + } + if (dbFormLemmas != null && ! 
dbFormLemmas.isEmpty()) { + formLemmasHashtable = new Hashtable(); + for (int i=0; i lemmaForms = readFormsByLemmaName(language, lemmaName); + lemma.setForms(lemmaForms); + lemmas.put(lemmaKey, lemma); + } else { + lemma = localLemma; + } + formLemmasHashtable.put(lemmaKey, lemma); + } + forms.put(key, formLemmasHashtable); + } + } + retFormLemmas = new ArrayList(); + if (formLemmasHashtable != null) { + Enumeration formLemmasKeys = formLemmasHashtable.keys(); + while(formLemmasKeys.hasMoreElements()) { + String lemmaKey = formLemmasKeys.nextElement(); + Lemma l = formLemmasHashtable.get(lemmaKey); + retFormLemmas.add(l); + } + } + Collections.sort(retFormLemmas); + return retFormLemmas; + } + + public Lemma getLemma(String lang, String lemmaNameArg, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + String lemmaName = lemmaNameArg; + if (normalize) { + Normalizer normalizer = new Normalizer(language); + lemmaName = normalizer.normalize(lemmaNameArg); + } + // first look in local cache + String key = language + "###" + lemmaName; + Lemma lemma = lemmas.get(key); + if (lemma == null) { + ArrayList dbLemmaForms = readFormsByLemmaName(language, lemmaName); + if (dbLemmaForms != null && dbLemmaForms.size() > 0) { + lemma = new Lemma(); + lemma.setLemmaName(lemmaName); + lemma.setLanguage(language); + lemma.setProvider(dbLemmaForms.get(0).getProvider()); + lemma.setForms(dbLemmaForms); + lemmas.put(lemmaName, lemma); + } + } + return lemma; + } + + public ArrayList getFormsByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList result = new ArrayList(); + luceneQueryString = luceneQueryString.toLowerCase(); + ArrayList formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); + if (! 
(formsFromQuery == null || formsFromQuery.isEmpty())) { + for (int i=0; i formLemmas = null; + // lemma mode: if formName contains "lemmalemma" then the lemma itself is fetched + if (formStr.startsWith("lemmalemma")) { + formLemmas = new ArrayList(); + String lemmaName = formStr.substring(10); + Lemma lemma = getLemma(language, lemmaName, false); + formLemmas.add(lemma); + } else { + formLemmas = getLemmasByFormName(language, formStr, false); + } + if (formLemmas != null && ! formLemmas.isEmpty()) { + for (int j=0; j lemmaForms = l.getFormsList(); + result.addAll(lemmaForms); + } + } + } + } + return result; + } + + public ArrayList getLemmasByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + Hashtable lemmas = new Hashtable(); + luceneQueryString = luceneQueryString.toLowerCase(); + ArrayList formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); + if (! (formsFromQuery == null || formsFromQuery.isEmpty())) { + for (int i=0; i formLemmas = null; + // lemma mode: if formName starts with "lemmalemma" then the lemma itself is fetched + if (formStr.startsWith("lemmalemma")) { + formLemmas = new ArrayList(); + String lemmaName = formStr.substring(10); + Lemma lemma = getLemma(language, lemmaName, false); + formLemmas.add(lemma); + } else { + formLemmas = getLemmasByFormName(language, formStr, false); + } + if (formLemmas != null) { + for (int j=0; j result = new ArrayList(); + if (lemmas != null) { + Enumeration formLemmasKeys = lemmas.keys(); + while(formLemmasKeys.hasMoreElements()) { + String lemmaKey = formLemmasKeys.nextElement(); + Lemma l = lemmas.get(lemmaKey); + result.add(l); + } + } + Collections.sort(result); + if (result.isEmpty()) + return null; + else + return result; + } + + public ArrayList getIndexKeysByLemmaNames(String lang, ArrayList lemmaNames) throws ApplicationException { + String language = 
Language.getInstance().getLanguageId(lang); + Hashtable indexKeys = new Hashtable(); + for (int j=0; j lemmaForms = lemma.getFormsList(); + for (int k=0; k fLemmas = getLemmasByFormName(language, form.getFormName(), false); + if (fLemmas != null) { + String indexKey = ""; + if (fLemmas.size() == 1) { + indexKey = fLemmas.get(0).getLemmaName(); + } else { + for (int l=0; l result = new ArrayList(); + if (indexKeys != null) { + Enumeration indexKeysKeys = indexKeys.keys(); + while(indexKeysKeys.hasMoreElements()) { + String indexKey = indexKeysKeys.nextElement(); + result.add(indexKey); + } + } + Collections.sort(result); + if (result.isEmpty()) + return null; + else + return result; + } + + private void clearCache() { + forms = null; + lemmas = null; + forms = new Hashtable>(); + lemmas = new Hashtable(); + } + + private ArrayList readLemmasByFormName(String lang, String formName) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList lemmasStatic = dbMorphHandlerStatic.readLemmas(language, formName); + return lemmasStatic; + } + + private ArrayList readFormsByLemmaName(String lang, String lemmaName) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList formsStatic = dbMorphHandlerStatic.readForms(language, lemmaName); + return formsStatic; + } + + private ArrayList getVariantsFromLuceneQuery(String queryString) { + LuceneUtil luceneUtil = LuceneUtil.getInstance(); + ArrayList variants = luceneUtil.getVariantsFromLuceneQuery(queryString); + return variants; + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/SimpleMorphContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/SimpleMorphContentHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,119 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; + +import org.xml.sax.*; + + +public class SimpleMorphContentHandler implements ContentHandler { + private Element currentElement; + private Lemma lemma; + private Form form; + + public SimpleMorphContentHandler() { + } + + public Form getForm() { + return form; + } + + public Lemma getLemma() { + return lemma; + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + } + + public void characters(char[] c, int start, int length) throws SAXException { + if (currentElement != null) { + String elemName = currentElement.name; + if (form != null) { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (elemName.equals("provider")) + form.setProvider(charactersStr); + else if (elemName.equals("language")) + form.setLanguage(charactersStr); + else if (elemName.equals("form-name")) + form.setFormName(charactersStr); + else if (elemName.equals("lemma-name")) + form.setLemmaName(charactersStr); + else if (elemName.equals("pos")) + form.setPos(charactersStr); + else if (elemName.equals("tense")) + form.setTense(charactersStr); + else if (elemName.equals("voice")) + form.setVoice(charactersStr); + else if (elemName.equals("casus")) + form.setCasus(charactersStr); + else if (elemName.equals("number")) + form.setNumber(charactersStr); + else if (elemName.equals("mood")) + form.setMood(charactersStr); + else if (elemName.equals("person")) + form.setPerson(charactersStr); + else if (elemName.equals("gender")) + form.setGender(charactersStr); + else if (elemName.equals("definite")) + form.setDefinite(charactersStr); + } else if (lemma != null) { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String 
charactersStr = String.valueOf(cCopy); + if (elemName.equals("provider")) + lemma.setProvider(charactersStr); + else if (elemName.equals("language")) + lemma.setLanguage(charactersStr); + else if (elemName.equals("lemma-name")) + lemma.setLemmaName(charactersStr); + } + } + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(org.xml.sax.Locator arg1) { + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void endElement(String uri, String localName, String name) throws SAXException { + currentElement = null; + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + currentElement = new Element(name); + if (name.equals("form")) { + form = new Form(); + } else if (name.equals("lemma")) { + lemma = new Lemma(); + } + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + + private class Element { + String name; + String value; + + Element(String name) { + this.name = name; + } + + Element(String name, String value) { + this.name = name; + this.value = value; + } + } +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,242 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Hashtable; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; 
+import org.xml.sax.XMLReader; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.SimpleMorphContentHandler; + +public class DBMorphHandler { + private String dbDirectory; + private DbEnvMorph morphDbEnv; + + public DBMorphHandler(String dbDir) { + this.dbDirectory = dbDir; + } + + public void start() throws ApplicationException { + morphDbEnv = new DbEnvMorph(); + morphDbEnv.setDataDir(dbDirectory); + morphDbEnv.init(); // open databases in read/write mode + } + + public void openDatabases() throws ApplicationException { + morphDbEnv.openDatabases(); + } + + public void closeDatabases() throws ApplicationException { + morphDbEnv.close(); + } + + public void deleteMorphData() throws ApplicationException { + morphDbEnv.removeDatabases(); + } + + public long getSize() throws ApplicationException { + long size = 0; + try { + Database formDB = morphDbEnv.getFormDB(); + size = formDB.count(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + return size; + } + + + public void writeFormLemma(Form form, Lemma lemma) throws ApplicationException { + try { + String lang = Language.getInstance().getLanguageId(form.getLanguage()); + String keyStr = lang + "###" + form.getFormName(); + String valueStr = lemma.getXmlString(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + DatabaseEntry dbEntryValue = new DatabaseEntry(valueStr.getBytes("utf-8")); + Database formDB = morphDbEnv.getFormDB(); + formDB.put(null, dbEntryKey, 
dbEntryValue); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + public void writeLemmaForm(Lemma lemma, Form form) throws ApplicationException { + try { + String lang = Language.getInstance().getLanguageId(lemma.getLanguage()); + String keyStr = lang + "###" + lemma.getLemmaName(); + String valueStr = form.getXmlString(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + DatabaseEntry dbEntryValue = new DatabaseEntry(valueStr.getBytes("utf-8")); + Database lemmaDB = morphDbEnv.getLemmaDB(); + lemmaDB.put(null, dbEntryKey, dbEntryValue); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + public void deleteLemma(Lemma lemma) throws ApplicationException { + try { + String lang = Language.getInstance().getLanguageId(lemma.getLanguage()); + String keyStr = lang + "###" + lemma.getLemmaName(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + Database lemmaDB = morphDbEnv.getLemmaDB(); + lemmaDB.delete(null, dbEntryKey); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + public void deleteForm(Form form) throws ApplicationException { + try { + String lang = Language.getInstance().getLanguageId(form.getLanguage()); + String keyStr = lang + "###" + form.getFormName(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + Database formDB = morphDbEnv.getFormDB(); + formDB.delete(null, dbEntryKey); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + public ArrayList readForms(String language, String lemmaName) throws ApplicationException { + ArrayList 
retForms = new ArrayList(); + String lang = Language.getInstance().getLanguageId(language); + String hashKey = lang + "###" + lemmaName; + try { + Database lemmaDB = morphDbEnv.getLemmaDB(); + Cursor cursor = lemmaDB.openCursor(null, null); + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundFormValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundFormValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + byte[] foundFormValueBytes = foundFormValue.getData(); + String foundFormValueStr = new String(foundFormValueBytes, "utf-8"); + Form f = parseXmlFormString(foundFormValueStr); + retForms.add(f); + operationStatus = cursor.getNextDup(dbEntryKey, foundFormValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retForms; + } + + // TODO diese Methode wird nicht verwendet bis jetzt + public Hashtable readForms() throws ApplicationException { + Hashtable retForms = new Hashtable(); + try { + Database lemmaDB = morphDbEnv.getLemmaDB(); + Cursor cursor = lemmaDB.openCursor(null, null); + DatabaseEntry dbEntryKey = new DatabaseEntry(); + DatabaseEntry foundFormValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getFirst(dbEntryKey, foundFormValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + byte[] foundFormValueBytes = foundFormValue.getData(); + String foundFormValueStr = new String(foundFormValueBytes, "utf-8"); + Form f = parseXmlFormString(foundFormValueStr); + String formHashKey = f.getLanguage() + "###" + f.getFormName(); + retForms.put(formHashKey, f); + operationStatus = cursor.getNext(dbEntryKey, foundFormValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new 
ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retForms; + } + + public ArrayList readLemmas(String language, String formName) throws ApplicationException { + ArrayList retForms = new ArrayList(); + String lang = Language.getInstance().getLanguageId(language); + String hashKey = lang + "###" + formName; + try { + Database formDB = morphDbEnv.getFormDB(); + Cursor cursor = formDB.openCursor(null, null); + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundLemmaValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundLemmaValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + byte[] foundLemmaValueBytes = foundLemmaValue.getData(); + String foundLemmaValueStr = new String(foundLemmaValueBytes, "utf-8"); + Lemma l = parseXmlLemmaString(foundLemmaValueStr); + retForms.add(l); + operationStatus = cursor.getNextDup(dbEntryKey, foundLemmaValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retForms; + } + + private Form parseXmlFormString(String xmlString) throws ApplicationException { + Form form = null; + try { + XMLReader xmlParser = new SAXParser(); + SimpleMorphContentHandler morphContentHandler = new SimpleMorphContentHandler(); + xmlParser.setContentHandler(morphContentHandler); + Reader reader = new StringReader(xmlString); + InputSource input = new InputSource(reader); + xmlParser.parse(input); + form = morphContentHandler.getForm(); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return form; + } + + private Lemma parseXmlLemmaString(String xmlString) throws ApplicationException { + Lemma lemma = null; 
+ try { + XMLReader xmlParser = new SAXParser(); + SimpleMorphContentHandler morphContentHandler = new SimpleMorphContentHandler(); + xmlParser.setContentHandler(morphContentHandler); + Reader reader = new StringReader(xmlString); + InputSource input = new InputSource(reader); + xmlParser.parse(input); + lemma = morphContentHandler.getLemma(); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return lemma; + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphSupWriter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphSupWriter.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,265 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.Date; +import java.util.HashMap; +import java.util.Iterator; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; +import com.sleepycat.je.util.DbLoad; + +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; +import de.mpg.mpiwg.berlin.mpdl.lt.text.transcode.Transcoder; + +public class DBMorphSupWriter { + private static DBMorphSupWriter instance; + private static String DATA_DIR = Constants.getInstance().getDataDir(); + private static String DATA_FILES_DIR_DONATUS_ADD_SUP = DATA_DIR + 
"/dataFiles/donatusAdditionalSup"; + private static String DB_DIR_DONATUS_ADD_SUP = DATA_DIR + "/dataFiles/donatusAdditionalSup/db"; + private static String[] DONATUS_SUP_DUMPS = {"cache-la", "cache-el", "cache-it"}; + private DbEnvMorphSup dbEnvMorphSup; + private Date beginOfOperation; + private Date endOfOperation; + + public static DBMorphSupWriter getInstance() throws ApplicationException { + if (instance == null) { + instance = new DBMorphSupWriter(); + } + return instance; + } + + public static void main(String[] args) throws ApplicationException { + getInstance(); + instance.beginOperation(); + System.out.print("Start ..."); + instance.initReadWrite(); + // instance.loadDonatusSupDbDumpsToDb(); + instance.printSizeOfAllMorphSupDBs(); + // instance.writeDonatusSupsToFiles(); + instance.end(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + System.out.println("Needed time: " + elapsedTime + " seconds"); + } + + private void initReadWrite() throws ApplicationException { + dbEnvMorphSup = new DbEnvMorphSup(); + dbEnvMorphSup.setDataDir(DB_DIR_DONATUS_ADD_SUP); + dbEnvMorphSup.initReadWrite(); + } + + private void loadDonatusSupDbDumpsToDb() throws ApplicationException { + for (int i=0; i getWholeMorphHashMap(String donatusSupName) throws ApplicationException { + HashMap morphHashMap = new HashMap(); + try { + dbEnvMorphSup.openDatabase(donatusSupName + "Dump"); + Database morphDB = dbEnvMorphSup.getMorphSupDB(donatusSupName + "Dump"); + Cursor cursor = morphDB.openCursor(null, null); + DatabaseEntry dbEntryKey = new DatabaseEntry(); + DatabaseEntry dbEntryValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + int size = dbEntryKey.getSize(); + if (size > 0) { + byte[] dbEntryKeyBytes = dbEntryKey.getData(); 
+ String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8"); + DatabaseEntry newDbEntryValue = new DatabaseEntry(dbEntryValue.getData()); + morphHashMap.put(dbEntryKeyStr, newDbEntryValue); + } + operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return morphHashMap; + } + + private void writeDonatusSupsToFiles() throws ApplicationException { + BufferedReader in = null; + BufferedOutputStream out = null; + try { + for (int i=0; i morphHashMap = getWholeMorphHashMap(donatusSupName); + Iterator morphDumpIter = morphHashMap.keySet().iterator(); + File outputFile = new File(DATA_FILES_DIR_DONATUS_ADD_SUP + "/donatus-sup-" + donatusSupName + ".xml"); + out = new BufferedOutputStream(new FileOutputStream(outputFile)); + write("\n", out); + while (morphDumpIter.hasNext()) { + write("\n", out); + write("" + "donatus-sup" + "\n", out); + String language = "unknown"; + if (donatusSupName.startsWith("cache-")) + language = donatusSupName.substring(6); + write("" + language + "\n", out); + String morphKeyStr = morphDumpIter.next(); + String formStr = morphKeyStr; + if (language.equals("el")) + formStr = transcodeFromBetaCode2Unicode(formStr); + formStr = formStr.toLowerCase(); + write("" + formStr + "\n", out); + DatabaseEntry morphValue = morphHashMap.get(morphKeyStr); + byte[] morphValueBytes = morphValue.getData(); + String wholeLemmaStr = new String(morphValueBytes, "utf-8"); + // only first lemma is recognized TODO recognize all lemmas for the form + char splitSymbol = '\u0009'; + int firstIndexOfSplitSymbol = wholeLemmaStr.indexOf(splitSymbol); + String lemmaForm = wholeLemmaStr; + if (firstIndexOfSplitSymbol != -1) + lemmaForm = wholeLemmaStr.substring(0, firstIndexOfSplitSymbol); + else + lemmaForm = lemmaForm + "XXXXXX"; + char splitSymbol2 = '\u000B'; + int 
firstIndexOfSplitSymbol2 = lemmaForm.indexOf(splitSymbol2); + if (firstIndexOfSplitSymbol2 != -1) + lemmaForm = lemmaForm.substring(0, firstIndexOfSplitSymbol2); + if (language.equals("el")) + lemmaForm = transcodeFromBetaCode2Unicode(lemmaForm); + lemmaForm = lemmaForm.replaceAll("#\\d", ""); + lemmaForm = lemmaForm.toLowerCase(); + write("" + lemmaForm + "\n", out); + write("\n", out); + } + write("\n", out); + } + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } finally { + // always close the stream + if (in != null) try { in.close(); } catch (Exception e) { } + if (out != null) try { out.close(); } catch (Exception e) { } + } + } + + private void write(byte[] inputBytes, BufferedOutputStream out) throws ApplicationException { + try { + out.write(inputBytes, 0, inputBytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void write(String outStr, BufferedOutputStream out) throws ApplicationException { + try { + byte[] bytes = outStr.getBytes("utf-8"); + out.write(bytes, 0, bytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private String transcodeFromBetaCode2Unicode(String inputStr) throws ApplicationException { + Transcoder transcoder = Transcoder.getInstance(); + String encodedUnicodeForm = transcoder.transcodeFromBetaCode2Unicode(inputStr); + return encodedUnicodeForm; + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriter.java Wed Nov 09 
15:32:05 2011 +0100 @@ -0,0 +1,168 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Date; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.db.DBMorphHandler; +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class DBMorphWriter { /* Command-line maintenance tool: feeds form/lemma XML files through a SAX handler into the store managed by DBMorphHandler. */ + private static DBMorphWriter instance; + private static String DATA_DIR = Constants.getInstance().getDataDir(); + private static String DB_DIR_DONATUS = DATA_DIR + "/dataBerkeleyDB/donatus"; + private static String DATA_FILES_DIR = DATA_DIR + "/dataFiles"; + private DBMorphHandler dbMorphHandler; + private Date beginOfOperation; + private Date endOfOperation; + + public static DBMorphWriter getInstance() throws ApplicationException { /* Creates the singleton on first use and initialises it. */ + if (instance == null) { + instance = new DBMorphWriter(); + instance.init(); + } + return instance; + } + + /** + * Entry point: opens the morphology databases, prints the size reported by the handler and the elapsed time. + */ + public static void main(String[] args) throws ApplicationException { + getInstance(); + instance.beginOperation(); + System.out.println("Start ..."); + // no second init() here: getInstance() has already created and started the handler + instance.openMorphData(); + // instance.deleteMorphData(); + long size = instance.getSize(); + System.out.println("Count forms: " + size); + // instance.writeMorphData(); + // instance.readSampleData(); + instance.end(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + System.out.println("Needed time: " +
elapsedTime + " seconds"); + } + + private void init() throws ApplicationException { /* Creates the handler for DB_DIR_DONATUS and starts it. */ + dbMorphHandler = new DBMorphHandler(DB_DIR_DONATUS); + dbMorphHandler.start(); + } + + private void openMorphData() throws ApplicationException { + dbMorphHandler.openDatabases(); + } + + private void deleteMorphData() throws ApplicationException { + dbMorphHandler.deleteMorphData(); + } + + private void writeMorphData() throws ApplicationException { /* Imports every known forms file; files that do not exist are reported and skipped by write(). */ + String inputFileNameLatin = DATA_FILES_DIR + "/" + "perseus-latin-forms.xml"; + instance.write(inputFileNameLatin); + String inputFileNameGreek = DATA_FILES_DIR + "/" + "perseus-greek-forms.xml"; + instance.write(inputFileNameGreek); + String inputFileNameArabic = DATA_FILES_DIR + "/" + "perseus-arabic-forms.xml"; + instance.write(inputFileNameArabic); + String inputFileNameDutch = DATA_FILES_DIR + "/" + "celex-dutch-forms.xml"; + instance.write(inputFileNameDutch); + String inputFileNameGerman = DATA_FILES_DIR + "/" + "celex-german-forms.xml"; + instance.write(inputFileNameGerman); + String inputFileNameEnglish = DATA_FILES_DIR + "/" + "celex-english-forms.xml"; + instance.write(inputFileNameEnglish); + String inputFileNameFrench = DATA_FILES_DIR + "/" + "lexique-french-forms.xml"; + instance.write(inputFileNameFrench); + String inputFileNameItalian = DATA_FILES_DIR + "/" + "donatus-italian-forms.xml"; + instance.write(inputFileNameItalian); + String[] languages = {"ar", "de", "en", "el", "fr", "it", "la"}; + for (int i = 0; i < languages.length; i++) { + String language = languages[i]; + String inputFileNameDonatusSup = DATA_FILES_DIR + "/" + "donatus-sup-" + language + "-forms.xml"; + instance.write(inputFileNameDonatusSup); + } + String[] donatusAdditionalSups = {"cache-la", "cache-el", "cache-it"}; + for (int i = 0; i < donatusAdditionalSups.length; i++) { + String donatusAdditionalSupName = donatusAdditionalSups[i]; + String inputFileNameDonatusAddSup = DATA_FILES_DIR + "/donatusAdditionalSup/" + "donatus-sup-" +
donatusAdditionalSupName + ".xml"; + instance.write(inputFileNameDonatusAddSup); + } + } + + private void write(String inputFileName) throws ApplicationException { /* Parses one forms XML file with DBMorphWriterContentHandler; a missing file is reported on stdout and skipped. */ + File inputFile = new File(inputFileName); + if (! inputFile.exists()) { + System.out.println("Input file: " + inputFile.getAbsolutePath() + " does not exist."); + return; + } + DBMorphWriterContentHandler morphContentHandler = new DBMorphWriterContentHandler(dbMorphHandler); + try { + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(morphContentHandler); + // the stream is closed in the finally clause so it is released even when parsing fails + InputStream bufferedInputStream = new BufferedInputStream(new FileInputStream(inputFile)); + try { + xmlParser.parse(new InputSource(bufferedInputStream)); + } finally { bufferedInputStream.close(); } + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private long getSize() throws ApplicationException { + long size = dbMorphHandler.getSize(); + return size; + } + + private void addSampleData() throws ApplicationException { + Lemma l1 = new Lemma("perseus", "la", "abrogo"); + Form f1 = new Form("perseus", "la", "abrogare"); + Form f2 = new Form("perseus", "la", "abroges"); + dbMorphHandler.writeFormLemma(f1, l1); + dbMorphHandler.writeLemmaForm(l1, f1); + dbMorphHandler.writeLemmaForm(l1, f2); + } + + private void readSampleData() throws ApplicationException { + ArrayList<Form>
        forms = dbMorphHandler.readForms("la", "abrogo"); + System.out.println("Forms: " + forms); + } + + private void deleteSampleData() throws ApplicationException { + Lemma l1 = new Lemma("perseus", "la", "abrogo"); + Form f1 = new Form("perseus", "la", "abrogare"); + Form f2 = new Form("perseus", "la", "abroges"); + dbMorphHandler.deleteLemma(l1); + dbMorphHandler.deleteForm(f1); + dbMorphHandler.deleteForm(f2); + } + + private void end() throws ApplicationException { + dbMorphHandler.closeDatabases(); + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriterContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriterContentHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,133 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.util.Hashtable; + +import org.xml.sax.*; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; + +public class DBMorphWriterContentHandler implements ContentHandler { /* SAX handler that builds one Form/Lemma pair per "form" element and writes it through DBMorphHandler. */ + private DBMorphHandler dbMorphHandler; + private Element currentElement; + private Form form; + private Lemma lemma; + private Hashtable<String, Form> forms; + + public DBMorphWriterContentHandler(DBMorphHandler dbMorphHandler) { + this.dbMorphHandler = dbMorphHandler; + } + + public void startDocument() throws SAXException { + forms = new Hashtable<String, Form>(); + } + + public void endDocument() throws SAXException { + forms = null; + } + + // TODO replace setPos etc. with addPos etc.
+ public void characters(char[] c, int start, int length) throws SAXException { /* Routes non-blank text of the current element to the matching add* method of the form (and, for shared fields, the lemma) being built; text outside a form is ignored. */ + if (currentElement != null) { + String elemName = currentElement.name; + if (form != null) { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (charactersStr != null && ! (charactersStr.trim().equals(""))) { + if (elemName.equals("provider")) { + form.addProvider(charactersStr); + lemma.addProvider(charactersStr); + } else if (elemName.equals("language")) { + form.addLanguage(charactersStr); + lemma.addLanguage(charactersStr); + } else if (elemName.equals("form-name")) { + form.addFormName(charactersStr); + } else if (elemName.equals("lemma-name")) { + form.addLemmaName(charactersStr); + lemma.addLemmaName(charactersStr); + } else if (elemName.equals("pos")) { + form.addPos(charactersStr); + } else if (elemName.equals("tense")) { + form.addTense(charactersStr); + } else if (elemName.equals("voice")) { + form.addVoice(charactersStr); + } else if (elemName.equals("casus")) { + form.addCasus(charactersStr); + } else if (elemName.equals("number")) { + form.addNumber(charactersStr); + } else if (elemName.equals("mood")) { + form.addMood(charactersStr); + } else if (elemName.equals("person")) { + form.addPerson(charactersStr); + } else if (elemName.equals("gender")) { + form.addGender(charactersStr); + } else if (elemName.equals("definite")) { + form.addDefinite(charactersStr); + } + } + } + } + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(org.xml.sax.Locator arg1) { + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { /* Remembers the element being read; a "form" element starts a fresh Form/Lemma pair. */ +
currentElement = new Element(name, ""); + if (localName.equals("form")) { + form = new Form(); + lemma = new Lemma(); + } + } + + public void endElement(String uri, String localName, String name) throws SAXException { /* At the end of a "form" element: records the form under its form name, writes the pair to the database and resets the builders. */ + currentElement = null; + if (localName.equals("form")) { + String keyStr = form.getFormName(); + forms.put(keyStr, form); + write(form, lemma); + form = null; + lemma = null; + } + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + + private void write(Form form, Lemma lemma) throws SAXException { /* Calls writeFormLemma and writeLemmaForm on the handler; an ApplicationException is rethrown wrapped in a SAXException. */ + try { + dbMorphHandler.writeFormLemma(form, lemma); + dbMorphHandler.writeLemmaForm(lemma, form); + } catch (ApplicationException e) { + throw new SAXException(e); + } + } + + private class Element { /* Holder for the name of the element currently being parsed; value is assigned but never read in this class. */ + String name; + String value; + + Element(String name) { + this.name = name; + } + + Element(String name, String value) { + this.name = name; + this.value = value; + } + } +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DbEnvMorph.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DbEnvMorph.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,105 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.io.File; + +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseConfig; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class DbEnvMorph { /* Wraps a Berkeley DB Java Edition environment holding the two morphology databases "LemmaDB" and "FormDB". */ + private String dataDir; + private File envPath; + private Environment env; + private EnvironmentConfig envConfig; + private DatabaseConfig dbConfig; + private Database lemmaDB; + private Database formDB; + + public DbEnvMorph() { + } + + public void setDataDir(String dataDir) { + this.dataDir = dataDir; + } + + public void init() throws ApplicationException { /* Builds writable, create-if-missing, transactional configs (database config with sorted duplicates) and opens the environment in dataDir. */ +
try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envConfig.setReadOnly(false); + dbConfig.setReadOnly(false); + envConfig.setAllowCreate(true); + dbConfig.setAllowCreate(true); + envConfig.setTransactional(true); + dbConfig.setTransactional(true); + // allow duplicates for keys + dbConfig.setSortedDuplicates(true); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void openDatabases() throws ApplicationException { /* Opens "LemmaDB" and "FormDB" in the environment with the shared database config; requires init() to have run. */ + try { + // open databases (and create them if they do not exist) + lemmaDB = env.openDatabase(null, "LemmaDB", dbConfig); + formDB = env.openDatabase(null, "FormDB", dbConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void removeDatabases() throws ApplicationException { /* Closes both handles if they are open, then removes both databases from the environment and clears the fields. */ + try { + if (formDB != null) + formDB.close(); + if (lemmaDB != null) + lemmaDB.close(); + env.removeDatabase(null, "LemmaDB"); + env.removeDatabase(null, "FormDB"); + formDB = null; + lemmaDB = null; + /* + boolean bla = true; + env.truncateDatabase(null, "LemmaDB", bla); + env.truncateDatabase(null, "FormDB", bla); + */ + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public Environment getEnv() { + return env; + } + + public Database getLemmaDB() { + return lemmaDB; + } + + public Database getFormDB() { + return formDB; + } + + public void close() throws ApplicationException { /* Closes the database handles first and the environment last; does nothing while env is still null. */ + if (env != null) { + try { + if (formDB != null) + formDB.close(); + if (lemmaDB != null) + lemmaDB.close(); + if (env != null) + env.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + } +} + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DbEnvMorphSup.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++
b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DbEnvMorphSup.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,101 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.io.File; +import java.util.HashMap; + +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseConfig; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class DbEnvMorphSup { /* Wraps a Berkeley DB Java Edition environment plus the supplementary morphology databases opened in it, cached by name. */ + private String dataDir; + private File envPath; + private Environment env; + private EnvironmentConfig envConfig; + private DatabaseConfig dbConfig; + private HashMap<String, Database> morphSupDBs = new HashMap<String, Database>(); + + public DbEnvMorphSup() { + } + + public void setDataDir(String dataDir) { + this.dataDir = dataDir; + } + + public void initReadOnly() throws ApplicationException { /* Opens the environment in dataDir with default configuration objects. NOTE(review): nothing here calls setReadOnly(true), so "read only" relies on the JE defaults - confirm this is intended. */ + try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void initReadWrite() throws ApplicationException { /* Opens (creating it if missing) a writable, transactional environment in dataDir. */ + try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envConfig.setReadOnly(false); + dbConfig.setReadOnly(false); + envConfig.setAllowCreate(true); + dbConfig.setAllowCreate(true); + envConfig.setTransactional(true); + dbConfig.setTransactional(true); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void openDatabase(String morphSupName) throws ApplicationException { /* Opens the database named morphSupName + ".db" unless a handle is already cached, and caches the new handle under morphSupName. */ + try { + Database lexDB = morphSupDBs.get(morphSupName); + if (lexDB == null) { + Database morphSupDB = env.openDatabase(null, morphSupName + ".db", dbConfig); + morphSupDBs.put(morphSupName, morphSupDB); + } + } catch (DatabaseException e) { + throw new
ApplicationException(e); + } + } + + public void closeDatabase(String morphSupName) throws ApplicationException { /* Closes the named database and drops it from the cache, so that a later openDatabase() really reopens it instead of keeping a closed handle. */ + try { + if (morphSupDBs != null) { + Database morphSupDB = morphSupDBs.remove(morphSupName); + if (morphSupDB != null) + morphSupDB.close(); + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public Environment getEnv() { + return env; + } + + public Database getMorphSupDB(String morphSupName) { /* Returns the cached handle, or null when the database is not currently open. */ + Database morphSupDB = morphSupDBs.get(morphSupName); + return morphSupDB; + } + + public void close() throws ApplicationException { /* Closes every database that is still open and then the environment; does nothing while env is still null. */ + if (env == null) return; + try { + for (Database openDB : morphSupDBs.values()) { openDB.close(); } // JE expects every database handle to be closed before its environment + morphSupDBs.clear(); + env.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } +} + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/Normalizer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/Normalizer.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1208 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexAR; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexDE; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexEL; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexEN; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexFR; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexIT; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexLA; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexNL; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexZH; +import
de.mpg.mpiwg.berlin.mpdl.lt.text.reg.Regularization; +import de.mpg.mpiwg.berlin.mpdl.lt.text.reg.RegularizationManager; + +public class Normalizer { + public static int DISPLAY = 1; // normalization in DISPLAY mode + public static int DICTIONARY = 2; // normalization in DICTIONARY mode + public static int SEARCH = 3; // normalization in SEARCH mode; never used so far in indexing because it does not support the morph. lexicons such as CELEX (e.g. eingeschränkt would not be stemmed to eingeschraenkt) + private int normMode = DISPLAY; // Default e.g. for indexing and querying + private String[] normFunctions = {"norm"}; // default is to use the norm function + private String language; + private int[] offsets; + + public Normalizer(String[] normFunctions, String lang) { + this.normFunctions = normFunctions; + String language = Language.getInstance().getLanguageId(lang); + this.language = language; + } + + public Normalizer(String language) { + this.language = language; + } + + public String getLanguage() { + return language; + } + + public void setNormMode(int normMode) { + this.normMode = normMode; + } + + /** + * Applies the normalization rules in language to + * s, without offset tracking. 
+ * + * @param s source string + * @return normalized string + */ + public String normalize(String s) throws ApplicationException { + String normStr = s; + if (useSpecialNormFunction()) + normStr = removeSpecialNWDMarks(normStr); + if (useRegFunction()) { + // try to regularize the string to the norm form over predefined regularizations + RegularizationManager regManager = RegularizationManager.getInstance(); + ArrayList regs = regManager.findRegsByOrig(language, s); + if (regs != null && regs.size() > 0) { + Regularization reg = regs.get(0); // only one: the first one + String regNormStr = reg.getNorm(); + normStr = regNormStr; + } + } + if (useNormFunction()) { + // normalize the string by string replacements + if (normMode == DICTIONARY) { + normStr = normalize(normStr, DICTIONARY); + } else if (normMode == DISPLAY) { + normStr = normalize(normStr, DISPLAY); + } else if (normMode == SEARCH) { + normStr = normalize(normStr, SEARCH); + } + } + if (useSpecialNormFunction()) + normStr = insertSpecialNWDMarks(normStr); + return normStr; + } + + private boolean useRegFunction() { + boolean useReg = false; + for (int i=0; i< normFunctions.length; i++) { + String function = normFunctions[i]; + if (function.equals("reg")) + return true; + } + return useReg; + } + + private boolean useNormFunction() { + boolean useNorm = false; + for (int i=0; i< normFunctions.length; i++) { + String function = normFunctions[i]; + if (function.equals("norm") || function.equals("specialNorm")) + return true; + } + return useNorm; + } + + private boolean useSpecialNormFunction() { + boolean useNorm = false; + for (int i=0; i< normFunctions.length; i++) { + String function = normFunctions[i]; + if (function.equals("specialNorm")) + return true; + } + return useNorm; + } + + private String normalize(String s, int mode) { + String inputStr = s; + StringReader strReader = new StringReader(inputStr + "\n"); + String retStr = ""; + String token = ""; + try { + if 
(Language.getInstance().isLatin(language)) { + MpdlNormalizerLexLA mpdlNormalizerLex = new MpdlNormalizerLexLA(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexLA.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexLA.DICT); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexLA.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isArabic(language)) { + MpdlNormalizerLexAR mpdlNormalizerLex = new MpdlNormalizerLexAR(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexAR.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexAR.DICT); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexAR.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isGerman(language)) { + MpdlNormalizerLexDE mpdlNormalizerLex = new MpdlNormalizerLexDE(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexDE.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexDE.CELEX); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexDE.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isGreek(language)) { + MpdlNormalizerLexEL mpdlNormalizerLex = new MpdlNormalizerLexEL(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexEL.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexEL.SIGMA); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexEL.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if 
(Language.getInstance().isEnglish(language)) { + MpdlNormalizerLexEN mpdlNormalizerLex = new MpdlNormalizerLexEN(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexEN.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexEN.DICT); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexEN.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isFrench(language)) { + MpdlNormalizerLexFR mpdlNormalizerLex = new MpdlNormalizerLexFR(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DICT_ASCII); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isItalian(language)) { + MpdlNormalizerLexIT mpdlNormalizerLex = new MpdlNormalizerLexIT(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexIT.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexIT.DICT); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexIT.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isDutch(language)) { + MpdlNormalizerLexNL mpdlNormalizerLex = new MpdlNormalizerLexNL(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexNL.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexNL.DICT); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexNL.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if 
(Language.getInstance().isChinese(language)) { + MpdlNormalizerLexZH mpdlNormalizerLex = new MpdlNormalizerLexZH(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexZH.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexZH.DICT); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexZH.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else { + retStr = s; // return the string unchanged + } + } catch (IOException e ) { + // nothing cause IOException is not needed for a StringReader + } + return retStr; + } + + + // used only in XmlTokenizerContentHandler // TODO make it better + private String removeSpecialNWDMarks(String inputString) { + String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element + String cleanedWord = inputString; + boolean startsWithNWDMark = cleanedWord.startsWith(COMPLEX_ELEMENT_NWD_MARK); + if (startsWithNWDMark) + cleanedWord = cleanedWord.substring(1); + int countNWDMarks = cleanedWord.length() - cleanedWord.replaceAll(COMPLEX_ELEMENT_NWD_MARK, "").length(); + if (countNWDMarks > 1) + cleanedWord = cleanedWord.replaceAll(COMPLEX_ELEMENT_NWD_MARK + "+", COMPLEX_ELEMENT_NWD_MARK); + // boolean notHyphenPlusNWD = cleanedWord.matches(".*[^-]+" + COMPLEX_ELEMENT_NWD_MARK + "+.*"); // e.g. "praebi ta" + // if (notHyphenPlusNWD) + // cleanedWord = cleanedWord.replaceAll("([^-]+)" + COMPLEX_ELEMENT_NWD_MARK + "+", "$1-" + COMPLEX_ELEMENT_NWD_MARK); // e.g. 
"praebi ta" is replaced by "praebi- ta" + cleanedWord = cleanedWord.replaceAll(COMPLEX_ELEMENT_NWD_MARK, " "); + return cleanedWord; + } + + private String insertSpecialNWDMarks(String inputString) { + String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element + String retStr = inputString; + boolean startsWithNWDMark = retStr.startsWith(COMPLEX_ELEMENT_NWD_MARK); + int countNWDMarks = retStr.length() - retStr.replaceAll(COMPLEX_ELEMENT_NWD_MARK, "").length(); + retStr = retStr.replaceAll(" ", COMPLEX_ELEMENT_NWD_MARK); + // if (notHyphenPlusNWD) + // normalizedWordStr = normalizedWordStr.replaceAll("-" + COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_NWD_MARK); // e.g. "praebi- ta" is replaced by "praebi ta" + if (countNWDMarks > 1) { + String nwdStr = ""; + for (int i=0; ilanguage to + * s, with offset tracking.

        + * + * WARNING: + * Arboreal will not work properly if a normalization substitution + * replaces a source character with more than two target characters! + * This is simply a BUG, and should be fixed. Fortunately, however, + * one does not often need such a replacement.

        + * + * @param s source string + * @param offsets character offset table + * @return normalized string + */ + private String normalize4Lexica(String s, int[] offsets) { + this.offsets = offsets; + if (language.equals("la") || language.equals("lat")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case 'j': replace = "i"; break; + case 'v': replace = "u"; break; + /* + * Linguistic note: /u/ and /v/ are rarely phonemic + * in Latin, as in alui 's/he nourished' vs. + * alvi 'of a belly', volui 's/he wished' or 'it rolled' + * vs. volvi 'to be rolled', (in)seruit 's/he joined + * together' vs. (in)servit 's/he serves'. + */ + case 'q': + if ((i < s.length() - 1) && (s.charAt(i + 1) == ';')) + replace = "qu"; + else + replace = "q"; + break; + case ';': + if ((i > 0) && (s.charAt(i - 1) == 'q')) + replace = "e"; + else if ((i == 0) || ! Character.isLetter(s.charAt(i - 1))) + replace = ";"; + else + replace = ""; + break; + case '\u0300': replace = ""; break; // COMBINING GRAVE ACCENT + case '\u0301': replace = ""; break; // COMBINING ACCUTE ACCENT + case '\u0302': replace = ""; break; // COMBINING CIRCUMFLEX ACCENT + + case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE + case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE + case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX + case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS + case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E + case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA + case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE + case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE + case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX + case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS + case 
'\u00cc': replace = "I"; break; // LATIN CAPITAL LETTER I GRAVE; + case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE + case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX + case '\u00cf': replace = "I"; break; // LATIN CAPITAL LETTER I DIAERESIS + case '\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE + case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE + case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX + case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS + case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE + case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE + case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX + case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS + case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE + case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE + case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX + case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS + case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E + case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA + case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE + case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE + case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX + case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS + case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE + case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE + case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX + case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS + case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE + case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O 
ACUTE + case '\u00f4': replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX + case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS + case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE + case '\u00fa': replace = "u"; break; // LATIN SMALL LETTER U ACUTE + case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX + case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS + case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON + case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON + case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE + case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE + case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON + case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON + case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE + case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE + case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK + case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK + case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON + case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON + case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE + case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE + case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON + case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON + case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE + case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE + case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E + case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E + case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON + case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON + case 
'\u016c': replace = "U"; break; // LATIN CAPITAL LETTER U BREVE + case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE + case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S + case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S + case '\u00ad': break; // soft hyphen + // new in MPDL project by J. Willenborg + case '\u1e14': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e15': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e16': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e17': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e18': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e19': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e1a': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e1b': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e1c': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e1d': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1eb8': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1eb9': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1eba': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ebb': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ebc': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ebd': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ebe': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ebf': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec0': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec1': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec2': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec3': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec4': replace = "e"; break; // LATIN ... LETTER E WITH ... 
+ case '\u1ec5': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec6': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec7': replace = "e"; break; // LATIN ... LETTER E WITH ... + // by Malcolm + case '\u2329': break; // BRA + case '\u232a': break; // KET + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("it")) { + // new Mpdl code: added by J. Willenborg: some of Malcolms code did not work without errors so it has to be taken away, also all latin stuff is imported + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE + case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE + case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX + case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS + case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E + case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA + case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE + case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE + case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX + case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS + case '\u00cc': replace = "I"; break; // LATIN CAPITAL LETTER I GRAVE; + case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE + case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX + case '\u00cf': replace 
= "I"; break; // LATIN CAPITAL LETTER I DIAERESIS + case '\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE + case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE + case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX + case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS + case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE + case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE + case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX + case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS + case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE + case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE + case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX + case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS + case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E + case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA + case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE + case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE + case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX + case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS + case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE + case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE + case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX + case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS + case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE + case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O ACUTE + case '\u00f4': replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX + case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS + case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE + case '\u00fa': 
replace = "u"; break; // LATIN SMALL LETTER U ACUTE + case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX + case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS + case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON + case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON + case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE + case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE + case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON + case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON + case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE + case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE + case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK + case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK + case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON + case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON + case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE + case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE + case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON + case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON + case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE + case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE + case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E + case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E + case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON + case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON + case '\u016c': replace = "U"; break; // LATIN CAPITAL LETTER U BREVE + case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE + case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S + case '\u00df': replace = "ss"; break; // 
LATIN SMALL LETTER SHARP S + // new in MPDL project by J. Willenborg + case '\u1e8d': replace = "e"; break; // LATIN SMALL LETTER E WITH TILDE + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + // new Mpdl code: added by J. Willenborg: most of the latin replacements also in english + } else if (language.equals("en")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u0300': replace = ""; break; // COMBINING GRAVE ACCENT + case '\u0301': replace = ""; break; // COMBINING ACCUTE ACCENT + case '\u0302': replace = ""; break; // COMBINING CIRCUMFLEX ACCENT + + case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE + case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE + case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX + case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS + case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E + case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA + case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE + case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE + case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX + case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS + case '\u00cc': replace = "I"; break; // LATIN CAPITAL LETTER I GRAVE; + case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE + case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX + case '\u00cf': replace = "I"; 
break; // LATIN CAPITAL LETTER I DIAERESIS + case '\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE + case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE + case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX + case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS + case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE + case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE + case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX + case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS + case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE + case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE + case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX + case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS + case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E + case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA + case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE + case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE + case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX + case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS + case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE + case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE + case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX + case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS + case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE + case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O ACUTE + case '\u00f4': replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX + case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS + case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE + case '\u00fa': replace 
= "u"; break; // LATIN SMALL LETTER U ACUTE + case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX + case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS + case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON + case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON + case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE + case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE + case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON + case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON + case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE + case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE + case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK + case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK + case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON + case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON + case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE + case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE + case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON + case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON + case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE + case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE + case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E + case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E + case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON + case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON + case '\u016c': replace = "U"; break; // LATIN CAPITAL LETTER U BREVE + case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE + case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S + case '\u00df': replace = "ss"; break; // LATIN 
SMALL LETTER SHARP S + // new in MPDL project by J. Willenborg + case '\u1e8d': replace = "e"; break; // LATIN SMALL LETTER E WITH TILDE + // by Malcolm + case '\u00ad': break; // soft hyphen + case '\u2329': break; // BRA + case '\u232a': break; // KET + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("fr")) { + // new Mpdl code: added by J. Willenborg: some of Malcolms code did not work without errors so it has to be taken away + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E + case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S + case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S + case '\u00ad': break; // soft hyphen + case '-': break; + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("de")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u00c4': replace = "Ae"; break; + case '\u00d6': replace = "Oe"; break; + case '\u00dc': replace = "Ue"; break; + case '\u00df': replace = "ss"; break; + case '\u00e4': replace = 
"ae"; break; + case '\u00f6': replace = "oe"; break; + case '\u00fc': replace = "ue"; break; + case '\u00ad': break; // soft hyphen + case '\u00e9': replace = "e"; break; + // new in MPDL project by J. Willenborg + case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S + // case '-': break; + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("zh")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u00b9': replace = "1"; break; + case '\u00b2': replace = "2"; break; + case '\u00b3': replace = "3"; break; + case '\u2074': replace = "4"; break; + case '\u2075': replace = "5"; break; + // original by Malcolm Hyman: with the following replacements + // case '\u3000': replace = " "; break; + // case '\u3001': replace = ","; break; + // case '\u3002': replace = "."; break; + // case '\u200b': break; // BREAKS EVERYTHING! 
+ default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("akk") || + language.equals("qam") || + language.equals("qpc") || + language.equals("elx") || + language.equals("sux") || + language.equals("hit") || + language.equals("qhu") || + language.equals("peo") || + language.equals("uga") || + language.equals("ura") || + language.equals("qcu")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + char last = '\u0000'; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + c = Character.toLowerCase(c); + String replace = new String(); + switch (c) { + case '{': replace += "-"; break; + case '}': replace += "-"; break; + // These are from PSD::ATF::Unicode by Steve Tinney + case '\u0161': replace += "sz"; break; + case '\u1e63': replace += "s,"; break; + case '\u1e6d': replace += "t,"; break; + case '\u014b': replace += "j"; break; + case '\u015b': replace += "s'"; break; + case '\u2080': replace += "0"; break; + case '\u2081': replace += "1"; break; + case '\u2082': replace += "2"; break; + case '\u2083': replace += "3"; break; + case '\u2084': replace += "4"; break; + case '\u2085': replace += "5"; break; + case '\u2086': replace += "6"; break; + case '\u2087': replace += "7"; break; + case '\u2088': replace += "8"; break; + case '\u2089': replace += "9"; break; + + case 'c': // shin (except where used as modifier) + if ((i > 0) && ((last == '~') || (last == '@'))) + replace += "c"; + else replace += "sz"; + break; + default: replace += c; break; + } + // suppress grapheme boundary before or after word boundary + if (replace.equals("-")) { + if ((i + 1 == s.length()) || (s.charAt(i + 1) == ' ') || (i == 
0) || (s.charAt(i - 1) == ' ')) + replace = ""; + } + last = c; + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("el") || language.equals("grc")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u03c2': replace = "\u03c3"; break; // GREEK SMALL LETTER FINAL SIGMA + case '<': break; + case '>': break; + case '[': break; + case ']': break; + case '1': break; + case '2': break; + case '\u03ac': replace = "\u1f71"; break; + case '\u03ad': replace = "\u1f73"; break; + case '\u03ae': replace = "\u1f75"; break; + case '\u03af': replace = "\u1f77"; break; + case '\u03cc': replace = "\u1f79"; break; + case '\u03cd': replace = "\u1f7b"; break; + case '\u03ce': replace = "\u1f7d"; break; + case '-': break; // same treatment as soft hyphen + case '\u00ad': break; // soft hyphen + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("el_atonic")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u03c2': replace = "\u03c3"; break; // GREEK SMALL LETTER FINAL SIGMA + // map characters with diacritics to their plain equivalent + // cf. 
BetaCode.java + case '\u03aa': replace = "\u0399"; break; + case '\u03ab': replace = "\u03a5"; break; + case '\u03ac': replace = "\u0381"; break; + case '\u03ad': replace = "\u0385"; break; + case '\u03ae': replace = "\u0387"; break; + case '\u03af': replace = "\u0389"; break; + case '\u03ca': replace = "\u03b9"; break; + case '\u03cb': replace = "\u03c5"; break; + case '\u03cc': replace = "\u03bf"; break; + case '\u03cd': replace = "\u03c5"; break; + case '\u03ce': replace = "\u03c9"; break; + case '\u1f00': replace = "\u03b1"; break; + case '\u1f01': replace = "\u03b1"; break; + case '\u1f02': replace = "\u03b1"; break; + case '\u1f03': replace = "\u03b1"; break; + case '\u1f04': replace = "\u03b1"; break; + case '\u1f05': replace = "\u03b1"; break; + case '\u1f06': replace = "\u03b1"; break; + case '\u1f07': replace = "\u03b1"; break; + case '\u1f08': replace = "\u0391"; break; + case '\u1f09': replace = "\u0391"; break; + case '\u1f0a': replace = "\u0391"; break; + case '\u1f0b': replace = "\u0391"; break; + case '\u1f0c': replace = "\u0391"; break; + case '\u1f0d': replace = "\u0391"; break; + case '\u1f0e': replace = "\u0391"; break; + case '\u1f0f': replace = "\u0391"; break; + case '\u1f10': replace = "\u03b5"; break; + case '\u1f11': replace = "\u03b5"; break; + case '\u1f12': replace = "\u03b5"; break; + case '\u1f13': replace = "\u03b5"; break; + case '\u1f14': replace = "\u03b5"; break; + case '\u1f15': replace = "\u03b5"; break; + case '\u1f18': replace = "\u0395"; break; + case '\u1f19': replace = "\u0395"; break; + case '\u1f1a': replace = "\u0395"; break; + case '\u1f1b': replace = "\u0395"; break; + case '\u1f1c': replace = "\u0395"; break; + case '\u1f1d': replace = "\u0395"; break; + case '\u1f20': replace = "\u03b7"; break; + case '\u1f21': replace = "\u03b7"; break; + case '\u1f22': replace = "\u03b7"; break; + case '\u1f23': replace = "\u03b7"; break; + case '\u1f24': replace = "\u03b7"; break; + case '\u1f25': replace = "\u03b7"; break; + 
case '\u1f26': replace = "\u03b7"; break; + case '\u1f27': replace = "\u03b7"; break; + case '\u1f28': replace = "\u0397"; break; + case '\u1f29': replace = "\u0397"; break; + case '\u1f2a': replace = "\u0397"; break; + case '\u1f2b': replace = "\u0397"; break; + case '\u1f2c': replace = "\u0397"; break; + case '\u1f2d': replace = "\u0397"; break; + case '\u1f2e': replace = "\u0397"; break; + case '\u1f2f': replace = "\u0397"; break; + case '\u1f30': replace = "\u03b9"; break; + case '\u1f31': replace = "\u03b9"; break; + case '\u1f32': replace = "\u03b9"; break; + case '\u1f33': replace = "\u03b9"; break; + case '\u1f34': replace = "\u03b9"; break; + case '\u1f35': replace = "\u03b9"; break; + case '\u1f36': replace = "\u03b9"; break; + case '\u1f37': replace = "\u03b9"; break; + case '\u1f38': replace = "\u0399"; break; + case '\u1f39': replace = "\u0399"; break; + case '\u1f3a': replace = "\u0399"; break; + case '\u1f3b': replace = "\u0399"; break; + case '\u1f3c': replace = "\u0399"; break; + case '\u1f3d': replace = "\u0399"; break; + case '\u1f3e': replace = "\u0399"; break; + case '\u1f3f': replace = "\u0399"; break; + case '\u1f40': replace = "\u03bf"; break; + case '\u1f41': replace = "\u03bf"; break; + case '\u1f42': replace = "\u03bf"; break; + case '\u1f43': replace = "\u03bf"; break; + case '\u1f44': replace = "\u03bf"; break; + case '\u1f45': replace = "\u03bf"; break; + case '\u1f48': replace = "\u039f"; break; + case '\u1f49': replace = "\u039f"; break; + case '\u1f4a': replace = "\u039f"; break; + case '\u1f4b': replace = "\u039f"; break; + case '\u1f4c': replace = "\u039f"; break; + case '\u1f4d': replace = "\u039f"; break; + case '\u1f50': replace = "\u03c5"; break; + case '\u1f51': replace = "\u03c5"; break; + case '\u1f52': replace = "\u03c5"; break; + case '\u1f53': replace = "\u03c5"; break; + case '\u1f54': replace = "\u03c5"; break; + case '\u1f55': replace = "\u03c5"; break; + case '\u1f56': replace = "\u03c5"; break; + case '\u1f57': 
replace = "\u03c5"; break; + case '\u1f58': replace = "\u03a5"; break; + case '\u1f59': replace = "\u03a5"; break; + case '\u1f5a': replace = "\u03a5"; break; + case '\u1f5b': replace = "\u03a5"; break; + case '\u1f5c': replace = "\u03a5"; break; + case '\u1f5d': replace = "\u03a5"; break; + case '\u1f5e': replace = "\u03a5"; break; + case '\u1f5f': replace = "\u03a5"; break; + case '\u1f60': replace = "\u03c9"; break; + case '\u1f61': replace = "\u03c9"; break; + case '\u1f62': replace = "\u03c9"; break; + case '\u1f63': replace = "\u03c9"; break; + case '\u1f64': replace = "\u03c9"; break; + case '\u1f65': replace = "\u03c9"; break; + case '\u1f66': replace = "\u03c9"; break; + case '\u1f67': replace = "\u03c9"; break; + case '\u1f68': replace = "\u03a9"; break; + case '\u1f69': replace = "\u03a9"; break; + case '\u1f6a': replace = "\u03a9"; break; + case '\u1f6b': replace = "\u03a9"; break; + case '\u1f6c': replace = "\u03a9"; break; + case '\u1f6d': replace = "\u03a9"; break; + case '\u1f6e': replace = "\u03a9"; break; + case '\u1f6f': replace = "\u03a9"; break; + case '\u1f70': replace = "\u03b1"; break; + case '\u1f71': replace = "\u03b1"; break; + case '\u1f72': replace = "\u03b5"; break; + case '\u1f73': replace = "\u03b5"; break; + case '\u1f74': replace = "\u03b7"; break; + case '\u1f75': replace = "\u03b7"; break; + case '\u1f76': replace = "\u03b9"; break; + case '\u1f77': replace = "\u03b9"; break; + case '\u1f78': replace = "\u03bf"; break; + case '\u1f79': replace = "\u03bf"; break; + case '\u1f7a': replace = "\u03c5"; break; + case '\u1f7b': replace = "\u03c5"; break; + case '\u1f7c': replace = "\u03c9"; break; + case '\u1f7d': replace = "\u03c9"; break; + case '\u1f80': replace = "\u03b1"; break; + case '\u1f81': replace = "\u03b1"; break; + case '\u1f82': replace = "\u03b1"; break; + case '\u1f83': replace = "\u03b1"; break; + case '\u1f84': replace = "\u03b1"; break; + case '\u1f85': replace = "\u03b1"; break; + case '\u1f86': replace = "\u03b1"; 
break; + case '\u1f87': replace = "\u03b1"; break; + case '\u1f88': replace = "\u0391"; break; + case '\u1f89': replace = "\u0391"; break; + case '\u1f8a': replace = "\u0391"; break; + case '\u1f8b': replace = "\u0391"; break; + case '\u1f8c': replace = "\u0391"; break; + case '\u1f8d': replace = "\u0391"; break; + case '\u1f8e': replace = "\u0391"; break; + case '\u1f8f': replace = "\u0391"; break; + case '\u1f90': replace = "\u03b7"; break; + case '\u1f91': replace = "\u03b7"; break; + case '\u1f92': replace = "\u03b7"; break; + case '\u1f93': replace = "\u03b7"; break; + case '\u1f94': replace = "\u03b7"; break; + case '\u1f95': replace = "\u03b7"; break; + case '\u1f96': replace = "\u03b7"; break; + case '\u1f97': replace = "\u03b7"; break; + case '\u1f98': replace = "\u0397"; break; + case '\u1f99': replace = "\u0397"; break; + case '\u1f9a': replace = "\u0397"; break; + case '\u1f9b': replace = "\u0397"; break; + case '\u1f9c': replace = "\u0397"; break; + case '\u1f9d': replace = "\u0397"; break; + case '\u1f9e': replace = "\u0397"; break; + case '\u1f9f': replace = "\u0397"; break; + case '\u1fa0': replace = "\u03c9"; break; + case '\u1fa1': replace = "\u03c9"; break; + case '\u1fa2': replace = "\u03c9"; break; + case '\u1fa3': replace = "\u03c9"; break; + case '\u1fa4': replace = "\u03c9"; break; + case '\u1fa5': replace = "\u03c9"; break; + case '\u1fa6': replace = "\u03c9"; break; + case '\u1fa7': replace = "\u03c9"; break; + case '\u1fa8': replace = "\u03a9"; break; + case '\u1fa9': replace = "\u03a9"; break; + case '\u1faa': replace = "\u03a9"; break; + case '\u1fab': replace = "\u03a9"; break; + case '\u1fac': replace = "\u03a9"; break; + case '\u1fad': replace = "\u03a9"; break; + case '\u1fae': replace = "\u03a9"; break; + case '\u1faf': replace = "\u03a9"; break; + case '\u1fb2': replace = "\u03b1"; break; + case '\u1fb3': replace = "\u03b1"; break; + case '\u1fb4': replace = "\u03b1"; break; + case '\u1fb6': replace = "\u03b1"; break; + case 
'\u1fb7': replace = "\u03b1"; break; + case '\u1fba': replace = "\u0391"; break; + case '\u1fbb': replace = "\u0391"; break; + case '\u1fbc': replace = "\u0391"; break; + case '\u1fc2': replace = "\u03b7"; break; + case '\u1fc3': replace = "\u03b7"; break; + case '\u1fc4': replace = "\u03b7"; break; + case '\u1fc6': replace = "\u03b7"; break; + case '\u1fc7': replace = "\u03b7"; break; + case '\u1fca': replace = "\u0397"; break; + case '\u1fcb': replace = "\u0397"; break; + case '\u1fcc': replace = "\u0397"; break; + case '\u1fd2': replace = "\u03b9"; break; + case '\u1fd3': replace = "\u03b9"; break; + case '\u1fd6': replace = "\u03b9"; break; + case '\u1fd7': replace = "\u03b9"; break; + case '\u1fda': replace = "\u0399"; break; + case '\u1fdb': replace = "\u039f"; break; + case '\u1fe2': replace = "\u03c5"; break; + case '\u1fe3': replace = "\u03c5"; break; + case '\u1fe4': replace = "\u03c1"; break; + case '\u1fe5': replace = "\u03c1"; break; + case '\u1fe6': replace = "\u03c5"; break; + case '\u1fe7': replace = "\u03c5"; break; + case '\u1fea': replace = "\u03a5"; break; + case '\u1feb': replace = "\u03a5"; break; + case '\u1fec': replace = "\u03a1"; break; + case '\u1ff2': replace = "\u03c9"; break; + case '\u1ff3': replace = "\u03c9"; break; + case '\u1ff4': replace = "\u03c9"; break; + case '\u1ff6': replace = "\u03c9"; break; + case '\u1ff7': replace = "\u03c9"; break; + case '\u1ff8': replace = "\u039f"; break; + case '\u1ff9': replace = "\u039f"; break; + case '\u1ffa': replace = "\u03a9"; break; + case '\u1ffb': replace = "\u03a9"; break; + case '\u1ffc': replace = "\u03a9"; break; + + case '<': break; + case '>': break; + case '-': break; // same treatment as soft hyphen + case '\u00ad': break; // soft hyphen + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 
2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else { // unknown or no language + return s; + } + } + + /* + // explicit words + normStr = normStr.replaceAll("aliàs", "alias"); + normStr = normStr.replaceAll("hîc", "hic"); + normStr = normStr.replaceAll("quòd", "quod"); + normStr = normStr.replaceAll("Quòd", "Quod"); + normStr = normStr.replaceAll("QVòd", "Quod"); + normStr = normStr.replaceAll("Cùmque", "Cumque"); + normStr = normStr.replaceAll("aër", "aer"); + // ij + normStr = normStr.replaceAll("ij", "ii"); + // qu/qv + normStr = normStr.replaceAll("qv", "qu"); + // normStr = normStr.replaceAll("qV", "qU"); + normStr = normStr.replaceAll("Qv", "Qu"); + normStr = normStr.replaceAll("QV", "QU"); + // u/v + String vowels = getVowels(); + String consonants = getConsonants(); + normStr = normStr.replaceAll("([" + vowels + "])([-]*)u([" + vowels +"])", "$1$2v$3"); // vowel + u + vowel --> vowel + v + vowel + normStr = normStr.replaceAll("([" + vowels + "])([-]*)U([" + vowels +"])", "$1$2V$3"); // vowel + U + vowel --> vowel + V + vowel + normStr = normStr.replaceAll("([" + consonants + "])([-]*)v([" + consonants +"])", "$1$2u$3"); // consonant + v + consonant --> consonant + u + consonant + normStr = normStr.replaceAll("([" + consonants + "])([-]*)V([" + consonants +"])", "$1$2U$3"); // consonant + V + consonant --> consonant + U + consonant + normStr = normStr.replaceAll("^v([" + consonants +"])", "u$1"); // v + consonant --> u + consonant + normStr = normStr.replaceAll("^V([" + consonants +"])", "U$1"); // V + consonant --> U + consonant + // end of word: diacritica + normStr = normStr.replaceAll("à$", "a"); + normStr = normStr.replaceAll("è$", "e"); + normStr = normStr.replaceAll("ò$", "o"); + normStr = normStr.replaceAll("àm$", "am"); + normStr = normStr.replaceAll("ùm$", "um"); + String normStrTmp = normStr; + normStr = ""; + for (int i = 0; i < normStrTmp.length(); i++) { 
+ char c = normStrTmp.charAt(i); + String replace = ""; + switch (c) { + case 'ſ': replace = "s"; break; + case 'ß': replace = "ss"; break; + case 'æ': replace = "ae"; break; + case 'Æ': replace = "AE"; break; + case 'ę': replace = "ae"; break; + case 'œ': replace = "oe"; break; + default: replace += c; break; + } + normStr = normStr + replace; + } + + + private String getVowels() { + String retStr = null; + if (Language.getInstance().isItalian(language)) { + retStr = "AEIOUaeiou" + + "\u00c6\u00e6" + // AE ligatures + "\u0152\u0153"; // OE ligatures + } else if (Language.getInstance().isLatin(language)) { + retStr = "AEIOUaeiouÆœęàèòù"; + } + return retStr; + } + + private String getConsonants() { + String retStr = null; + if (Language.getInstance().isItalian(language)) { + retStr = "BCDFGHKLMNPQRSTVWXZ" + + "bcdfghklmnpqrstvwxz" + + "ſß"; // long/sharp S + } else if (Language.getInstance().isLatin(language)) { + retStr = "BCDFGHKLMNPQRSTVWXZ" + + "bcdfghklmnpqrstvwxz" + + "ſß"; // long/sharp S + } + return retStr; + } + + + + + + * + * + * + * + */ + + + + + + + /** + * Returns a copy of an integer array with the element at + * index removed ("killed"). + * + * @param array integer array + * @param index index of element to remove + */ + private int[] arrayKill(int[] array, int index) { + int[] newArray = new int[array.length - 1]; + System.arraycopy(array, 0, newArray, 0, index); + System.arraycopy(array, index + 1, newArray, index, array.length - index - 1); + return newArray; + } + + /** + * Returns a copy of an integer array with count elements + * inserted at index. 
+ * + * @param array integer array + * @param index index to insert new elements + * @param value value to insert into new slots + * @param count number of new slots to insert + */ + private int[] arrayInsert(int[] array, int index, int value, int count) { + int[] newArray = new int[array.length + count]; + System.arraycopy(array, 0, newArray, 0, index); + for (int i = 0; i < count; i++) newArray[index + i] = value; + System.arraycopy(array, index, newArray, index + count, array.length - index); + return newArray; + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAR.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAR.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,584 @@ +/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */ + +/* + * Normalization rules for Arabic text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-02-28 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 21.07.11 11:22 from the specification file + * MpdlNormalizerLexAR.lex + */ +public class MpdlNormalizerLexAR { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int 
ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 3, 3 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\4"+ + "\40\0\1\1\2\0\1\1\20\0\1\1\5\0\1\1\1\0\1\1"+ + "\uff82\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\4\0\2\1\1\2\1\3\1\4\1\5"; + + private static int [] zzUnpackAction() { + int [] result = new int[10]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\5\0\12\0\17\0\24\0\31\0\24\0\24"+ + "\0\24\0\24"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[10]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + 
private static final String ZZ_TRANS_PACKED_0 = + "\1\5\1\6\1\5\1\0\1\7\1\5\1\6\1\5"+ + "\1\10\1\7\1\5\1\6\1\5\1\11\1\7\1\5"+ + "\1\6\1\5\1\12\1\7\7\0\1\5\2\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[30]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\4\0\1\11\1\1\4\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[10]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state 
*/ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexAR(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. 
+ */ + public MpdlNormalizerLexAR(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 42) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. 
+ */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. 
+ * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. 
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 5: + { switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, ""); + } + } + case 6: break; + case 4: + { switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } + case 7: break; + case 2: + { problem = 1; add(yytext()); + } + case 8: break; + case 3: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 9: break; + case 1: + { add(yytext()); + } + case 10: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAR.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAR.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,90 @@ +/* + * Normalization rules for Arabic text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-02-28 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexAR +%type java.lang.String +%unicode + +// Arabic: ar + +%states DISP, DICT, SEARCH + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +hyphen = [-\u{00ad}] // hyphen and soft hyphen +LB = {hyphen} \u0020 +// lb = ({hyphen} \u0020)? + +END = \n + +%% + +@ { problem = 1; add(yytext()); } +{LB} { add(yytext()); } +. 
{ add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } +} + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, ""); + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert + +TO DO: + +AR: fehlt noch + +*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAll.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAll.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,143 @@ +/* + * Normalization rules for all languages + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * 2011-01-25 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexAll +%type java.lang.String +%unicode +// %debug + +%states LA, ZH + +%{ + int cv = 0; // consonant = 1, vowel = 2, everything else = 0 +%} + +VOWEL=[AEIOUaeiouÆæęàèòùœ] +CONS=[BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß] +LR=[lLrR] +QUE=(que)? +END=\n + +%% + + { + +// 1. simple replacements + +// 1.1 single characters +ſ { cv = 1; return "s"; } +ß { cv = 1; return "ss"; } +[æę] { cv = 2; return "ae"; } +Æ { cv = 2; return "AE"; } +œ { cv = 2; return "oe"; } +// 1.2 character combinations +ij { cv = 2; return "ii"; } + +// 2. 
diacritics + +// 2.1 superfluous diacritics in single words +^ hîc {END} { return "hic"; } + +// 2.2 superfluous diacritics at the end of a word +// 2.2.1 common cases +à / {QUE} {END} { return "a"; } +àm / {QUE} {END} { return "am"; } +às / {QUE} {END} { return "as"; } // (-àsque will likely never occur) +// à / [ms]? {QUE} {END} { return "a"; } +è / {QUE} {END} { return "e"; } +ò / {QUE} {END} { return "o"; } +òd / {QUE} {END} { return "od"; } +ùm / {QUE} {END} { return "um"; } +ùs / {QUE} {END} { return "us"; } + +// 2.3 superfluous diacritics within a word +// 2.3.1 common cases +aë { cv = 2; return "ae"; } +oë { cv = 2; return "oe"; } +// 2.3.2 rare cases +oï { cv = 2; return "oi"; } +uï { cv = 2; return "ui"; } +// 2.3.3 extremely rare cases +uü { cv = 2; return "uu"; } + +// 3. rules for u and v + +// 3.1 rules for u + +u/{VOWEL} { + switch(cv) { + case 2: return "v"; + default: cv = 2; return "u"; + } + } +U/{VOWEL} { + switch(cv) { + case 2: return "V"; + default: cv = 2; return "U"; + } + } + +// 3.2 rules for v + +qv { cv = 1; return "qu"; } // the replaced v still counts as consonant +Qv { cv = 1; return "Qu"; } +QV { cv = 1; return "QU"; } + +{LR}v { + switch(cv) { + case 1: return yytext().replace("v", "u"); + default: cv = 1; return yytext(); + } + } +{LR}V { + switch(cv) { + case 1: return yytext().replace("V", "U"); + default: cv = 1; return yytext(); + } + } + +v/{CONS} { cv = 1; return "u"; } +V/{CONS} { cv = 1; return "U"; } + + +// default + +{VOWEL} { cv = 2; return yytext(); } +{CONS} { cv = 1; return yytext(); } +\n { cv = 0; return ""; } +. 
{ cv = 0; return yytext(); } + +} + + { + +// Codepoint < FFFF + +竒 { return "奇"; } // 7AD2 --> 5947 +旹 { return "時"; } // 65F9 --> 6642 +歴 { return "歷"; } // 6B74 --> 6B77 +精 { return "精"; } // FA1D --> 7CBE (FA1D is a compatibility ideograph) + +// Codepoint > FFFF + +庶 { return "庶"; } // 2F88D --> 5EB6 (2F88D is a compatibility ideograph) + + +} + + +// default (can be overridden by individual languages) + +\n { return ""; } +. { return yytext(); } diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,648 @@ +/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:34 */ + +/* + * Normalization rules for German text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-08-10 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 05.09.11 10:34 from the specification file + * MpdlNormalizerLexDE.lex + */ +public class MpdlNormalizerLexDE { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 10; + public static final int DICT_ASCII = 6; + public static final int SEARCH_ASCII = 12; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + public static final int GRIMM = 8; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private 
static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\20"+ + "\32\4\6\0\1\11\2\4\1\5\12\4\1\13\5\4\1\7\5\4"+ + "\1\1\1\0\1\1\106\0\1\14\21\0\1\15\5\0\1\16\2\0"+ + "\1\17\4\0\1\14\21\0\1\15\5\0\1\16\202\0\1\6\u01e4\0"+ + "\1\12\1\0\1\10\ufc99\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\7\0\2\1\1\2\1\3\1\4\3\1\1\5\1\3"+ + "\3\1\1\6\1\7\1\10\1\11\1\12\1\13\1\14"+ + "\1\15\1\16\1\17"; + + private static int [] zzUnpackAction() { + int [] result = new int[30]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\21\0\42\0\63\0\104\0\125\0\146\0\167"+ + "\0\210\0\167\0\167\0\167\0\231\0\252\0\273\0\167"+ + "\0\210\0\314\0\335\0\356\0\167\0\167\0\167\0\167"+ + "\0\167\0\167\0\167\0\167\0\167\0\167"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[30]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, 
int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\10\1\11\1\10\1\0\1\10\1\11\1\12\1\11"+ + "\1\10\1\11\6\10\1\13\1\10\1\11\1\10\1\14"+ + "\1\10\1\11\1\12\1\15\1\10\1\16\1\10\1\17"+ + "\4\10\1\13\1\10\1\11\1\10\1\20\1\10\1\11"+ + "\1\12\1\15\1\10\1\16\1\10\1\17\4\10\2\13"+ + "\1\21\1\13\1\20\1\10\1\11\1\12\1\22\1\13"+ + "\1\23\1\13\1\24\1\25\1\26\1\27\1\30\1\13"+ + "\1\10\1\11\1\10\1\20\1\10\1\11\1\12\1\15"+ + "\1\10\1\16\1\10\1\17\3\10\1\31\1\13\1\10"+ + "\1\11\1\10\1\32\1\10\1\11\1\12\1\15\1\10"+ + "\1\16\1\10\1\17\4\10\2\13\1\21\1\13\1\32"+ + "\1\10\1\11\1\12\1\22\1\13\1\23\1\13\1\24"+ + "\1\25\1\26\1\27\1\30\1\13\23\0\1\10\20\0"+ + "\1\10\5\0\1\33\1\0\1\34\10\0\1\10\7\0"+ + "\1\35\20\0\1\36\10\0\1\10\5\0\1\33\1\0"+ + "\1\27\10\0\1\10\7\0\1\25\20\0\1\26\6\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[255]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match 
input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\7\0\1\11\1\1\3\11\3\1\1\11\4\1\12\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[30]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + 
private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + public static final int CELEX = DICT_ASCII; + + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexDE(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexDE(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 88) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. 
+ * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. 
+ * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. 
+ * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break 
zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 10: + { add("sz"); + } + case 16: break; + case 3: + { problem = 1; add(yytext()); + } + case 17: break; + case 6: + { add("ae"); + } + case 18: break; + case 2: + { add("s"); + } + case 19: break; + case 4: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 20: break; + case 13: + { add("ü"); + } + case 21: break; + case 8: + { add("ue"); + } + case 22: break; + case 11: + { switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } + case 23: break; + case 12: + { add("u"); + } + case 24: break; + case 14: + { add("ä"); + } + case 25: break; + case 1: + { add(yytext()); + } + case 26: break; + case 9: + { add("ss"); + } + case 27: break; + case 7: + { add("oe"); + } + case 28: break; + case 15: + { add("ö"); + } + case 29: break; + case 5: + { switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } + case 30: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.lex Wed Nov 09 15:32:05 2011 +0100 
@@ -0,0 +1,134 @@ +/* + * Normalization rules for German text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexDE +%type java.lang.String +%unicode + +// German: de, deu, ger + +%states DISP, DICT, SEARCH +%state CELEX, GRIMM + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +hyphen = [-\u{00ad}] // hyphen and soft hyphen +LB = {hyphen} \u0020 +// lb = ({hyphen} \u0020)? + +END = \n + +Alphabet = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ] + +%% + +ſ { add("s"); } + +// Fraktur + + { + +uͦ {add("u"); } +aͤ {add("ä"); } +oͤ {add("ö"); } +uͤ {add("ü"); } + +} + + { + +// normalize ä ö ü ß only for Celex! + +ä | Ä | aͤ { add("ae"); } +ö | Ö | oͤ { add("oe"); } +ü | Ü | uͤ { add("ue"); } +uͦ {add("u"); } +ß { add("ss"); } + +{Alphabet} { add(yytext()); } + +. { problem = 1; add(yytext()); } + +} + + { + +ß { add("sz"); } + +} + + +// default + +@ { problem = 1; add(yytext()); } +{LB} { add(yytext()); } +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } +} + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } +} + + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert + +TO DO: + +DE: Trennung von Deutsch und Fraktur? +DE: Celex: hyphens weg? 
+ +*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEL.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEL.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,711 @@ +/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:35 */ + +/* + * Normalization rules for Greek text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-08-03 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 05.09.11 10:35 from the specification file + * MpdlNormalizerLexEL.lex + */ +public class MpdlNormalizerLexEL { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int SIGMA = 8; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 3, 3, 4, 4 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\5"+ + "\32\5\6\0\1\6\2\5\1\6\20\5\1\6\5\5\1\1\1\0"+ + "\1\1\u032e\0\1\7\1\10\1\11\1\12\15\0\1\4\3\0\1\4"+ + "\1\30\11\0\1\13\1\14\1\15\u1ba1\0\1\16\1\0\1\20\1\0"+ + "\1\21\1\0\1\23\1\0\1\24\1\0\1\25\1\0\1\26\65\0"+ + "\1\17\17\0\1\22\57\0\1\27\ue00d\0"; + + /** + * Translates characters to character classes + */ + 
private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\5\0\2\1\2\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\12\1\1\14\1\15\1\16"+ + "\1\0\1\17\1\0\1\20\1\0\1\21\1\0\1\22"+ + "\1\0\1\23\1\0\1\24\1\0\1\25\1\0\1\26"+ + "\1\0\1\27\1\0"; + + private static int [] zzUnpackAction() { + int [] result = new int[50]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\31\0\62\0\113\0\144\0\175\0\226\0\175"+ + "\0\226\0\175\0\175\0\175\0\175\0\175\0\175\0\175"+ + "\0\175\0\175\0\257\0\310\0\341\0\372\0\u0113\0\u012c"+ + "\0\u0145\0\u015e\0\u0177\0\u0190\0\175\0\175\0\175\0\u01a9"+ + "\0\175\0\u01c2\0\175\0\u01db\0\175\0\u01f4\0\175\0\u020d"+ + "\0\175\0\u0226\0\175\0\u023f\0\175\0\u0258\0\175\0\u0271"+ + "\0\175\0\u028a"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[50]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); 
+ } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\6\1\7\1\6\1\0\1\6\1\10\1\11\1\12"+ + "\1\13\1\14\1\15\1\16\1\17\1\20\14\6\1\7"+ + "\1\6\1\21\1\6\1\10\1\11\1\12\1\13\1\14"+ + "\1\15\1\16\1\17\1\20\14\6\1\7\1\6\1\22"+ + "\1\6\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+ + "\1\17\1\20\1\23\1\24\1\25\1\26\1\27\1\30"+ + "\1\31\1\32\1\33\1\34\2\6\1\7\1\6\1\35"+ + "\1\6\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+ + "\1\17\1\20\1\23\1\24\1\25\1\26\1\27\1\30"+ + "\1\31\1\32\1\33\1\34\2\6\1\7\1\6\1\22"+ + "\1\6\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+ + "\1\17\1\20\1\23\1\24\1\25\1\26\1\27\1\30"+ + "\1\31\1\32\1\33\1\34\1\36\33\0\1\6\31\0"+ + "\1\37\1\40\23\0\1\40\3\0\1\41\1\42\23\0"+ + "\1\42\3\0\1\43\1\44\23\0\1\44\3\0\1\45"+ + "\1\46\23\0\1\46\3\0\1\47\1\50\23\0\1\50"+ + "\3\0\1\51\1\52\23\0\1\52\3\0\1\53\1\54"+ + "\23\0\1\54\3\0\1\55\1\56\23\0\1\56\3\0"+ + "\1\57\1\60\23\0\1\60\3\0\1\61\1\62\23\0"+ + "\1\62\3\0\1\37\30\0\1\41\30\0\1\43\30\0"+ + "\1\45\30\0\1\47\30\0\1\51\30\0\1\53\30\0"+ + "\1\55\30\0\1\57\30\0\1\61\25\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[675]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could 
not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\5\0\1\11\1\1\1\11\1\1\11\11\12\1\3\11"+ + "\1\0\1\11\1\0\1\11\1\0\1\11\1\0\1\11"+ + "\1\0\1\11\1\0\1\11\1\0\1\11\1\0\1\11"+ + "\1\0\1\11\1\0"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[50]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + 
private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexEL(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexEL(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 112) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. 
+ * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. 
+ * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. 
+ * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break 
zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 23: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ῴ"); + } + case 24: break; + case 5: + { add("ή"); + } + case 25: break; + case 17: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ή"); + } + case 26: break; + case 13: + { add("σ"); + } + case 27: break; + case 6: + { add("ί"); + } + case 28: break; + case 1: + { add(yytext()); + } + case 29: break; + case 22: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ώ"); + } + case 30: break; + case 11: + { switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } + case 31: break; + case 19: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ί"); + } + case 32: break; + case 15: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ᾴ"); + } + case 33: break; + case 7: + { add("ό"); + } + case 34: break; + case 14: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ά"); + } + case 35: break; + case 12: + { switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } + case 36: break; + case 8: + { add("ύ"); + } + case 37: break; + case 2: + { problem = 1; add(yytext()); + } + case 38: break; + case 20: + // lookahead expression with fixed base length + zzMarkedPos = 
zzStartRead + 1; + { add("ό"); + } + case 39: break; + case 3: + { add("ά"); + } + case 40: break; + case 10: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 41: break; + case 9: + { add("ώ"); + } + case 42: break; + case 16: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("έ"); + } + case 43: break; + case 18: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ῄ"); + } + case 44: break; + case 4: + { add("έ"); + } + case 45: break; + case 21: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ύ"); + } + case 46: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEL.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEL.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,139 @@ +/* + * Normalization rules for Greek text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-08-03 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexEL +%type java.lang.String +%unicode + +// Greek: el, grc + +%states DISP, DICT, SEARCH +%state SIGMA + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +hyphen = [-\u{00ad}] // hyphen and soft hyphen +LB = {hyphen} \u0020 +// lb = ({hyphen} \u0020)? + +END = \n + +wordend = [νρς]? 
{END} + +Latin = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ] + + +%% + + +// always replace tonos by oxia +// (although this should really be corrected in the text rather than normalized) +ά { add("ά"); } +έ { add("έ"); } +ή { add("ή"); } +ί { add("ί"); } +ό { add("ό"); } +ύ { add("ύ"); } +ώ { add("ώ"); } + + + { + +ὰ / {wordend} { add("ά"); } +ᾲ / {wordend} { add("ᾴ"); } +ὲ / {wordend} { add("έ"); } +ὴ / {wordend} { add("ή"); } +ῂ / {wordend} { add("ῄ"); } +ὶ / {wordend} { add("ί"); } +ὸ / {wordend} { add("ό"); } +ὺ / {wordend} { add("ύ"); } +ὼ / {wordend} { add("ώ"); } +ῲ / {wordend} { add("ῴ"); } + +// other candidates: Ὰ Ὲ Ὴ Ὶ Ὺ Ὸ Ὼ + +} + + { + +ς { add("σ"); } + +} + +// default + +@ { problem = 1; add(yytext()); } +{Latin} { problem = 1; add(yytext()); } + +{LB} { add(yytext()); } +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } +} + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert + +TO DO: + +EL: tonos --> oxia wieder rausnehmen, weil es im Text geändert werden muss? +EL: gibt es noch weitere Fälle, wo legitimerweise ein Gravis vorkommen kann? +EL: kommen Großbuchstaben mit Gravis bei uns jemals vor, und sollen sie normalisiert werden? +EL: neuer State BETACODE ? 
+EL: nicht falsche Zeichen definieren, sondern erlaubte Zeichen + +*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEN.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEN.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,589 @@ +/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */ + +/* + * Normalization rules for English text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 21.07.11 11:22 from the specification file + * MpdlNormalizerLexEN.lex + */ +public class MpdlNormalizerLexEN { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 3, 3 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\5"+ + "\40\0\1\1\2\0\1\1\20\0\1\1\5\0\1\1\1\0\1\1"+ + "\u0101\0\1\4\ufe80\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. 
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\4\0\2\1\1\2\1\3\1\4\1\5\1\6"; + + private static int [] zzUnpackAction() { + int [] result = new int[11]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\6\0\14\0\22\0\30\0\36\0\30\0\30"+ + "\0\30\0\30\0\30"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[11]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\5\1\6\1\5\1\0\1\5\1\7\1\5\1\6"+ + "\1\5\1\10\1\11\1\7\1\5\1\6\1\5\1\12"+ + "\1\11\1\7\1\5\1\6\1\5\1\13\1\11\1\7"+ + "\10\0\1\5\3\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[36]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 
0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\4\0\1\11\1\1\5\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[11]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string 
in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexEN(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexEN(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. 
+ * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 46) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. 
+ * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. 
+ * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break 
zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 5: + { switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } + case 7: break; + case 2: + { problem = 1; add(yytext()); + } + case 8: break; + case 4: + { add("s"); + } + case 9: break; + case 3: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 10: break; + case 6: + { switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } + case 11: break; + case 1: + { add(yytext()); + } + case 12: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEN.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEN.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,99 @@ +/* + * Normalization rules for English text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexEN +%type java.lang.String +%unicode + +// 1.5 English: en + +%states DISP, DICT, SEARCH + +%{ + private String original = ""; + private String normalized = ""; + private int 
problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +hyphen = [-\u{00ad}] // hyphen and soft hyphen +LB = {hyphen} \u0020 +// lb = ({hyphen} \u0020)? + +END = \n + +%% + + { + +ſ { add("s"); } + +} + + +// default + +@ { problem = 1; add(yytext()); } +{LB} { add(yytext()); } +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } +} + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert + +TO DO: + +EN: vollständig? + +*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexFR.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexFR.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,635 @@ +/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:35 */ + +/* + * Normalization rules for French text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-08-10 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 05.09.11 10:35 from the specification file + * MpdlNormalizerLexFR.lex + */ +public class MpdlNormalizerLexFR { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** 
lexical states */ + public static final int DICT_ASCII = 8; + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 3, 3, 4, 4 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\20"+ + "\32\4\6\0\1\5\2\4\1\5\20\4\1\5\5\4\1\1\1\0"+ + "\1\1\141\0\1\7\3\12\3\0\1\10\1\0\3\13\1\0\3\14"+ + "\3\0\3\15\4\0\3\16\126\0\2\11\53\0\1\6\u1e99\0\1\17"+ + "\udfe6\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. 
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\5\0\2\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\2\1\11\1\12\1\13\1\14\1\15\1\16"+ + "\1\17"; + + private static int [] zzUnpackAction() { + int [] result = new int[22]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\21\0\42\0\63\0\104\0\125\0\146\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\146\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[22]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\6\1\7\1\6\1\0\1\6\1\7\12\6\1\10"+ + "\1\6\1\7\1\6\1\11\1\6\1\7\1\12\1\13"+ + "\1\14\7\6\1\10\1\6\1\7\1\6\1\15\1\6"+ + "\1\7\1\12\1\13\1\14\7\6\1\10\1\6\1\7"+ + "\1\6\1\16\1\6\1\7\1\12\1\13\1\14\7\6"+ + "\2\10\1\17\1\10\1\15\1\6\1\7\1\12\1\13"+ + 
"\1\14\1\20\1\21\1\22\1\23\1\24\1\25\1\26"+ + "\1\10\23\0\1\6\16\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[119]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\5\0\1\11\1\1\7\11\1\1\7\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[22]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the 
current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexFR(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexFR(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. 
+ * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 82) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. 
+ * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. 
+ * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break 
zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 2: + { problem = 1; add(yytext()); + } + case 16: break; + case 6: + { add("ae"); + } + case 17: break; + case 4: + { add("s"); + } + case 18: break; + case 13: + { add("o"); + } + case 19: break; + case 3: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 20: break; + case 8: + { switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } + case 21: break; + case 14: + { add("u"); + } + case 22: break; + case 1: + { add(yytext()); + } + case 23: break; + case 12: + { add("i"); + } + case 24: break; + case 15: + { add(""); + } + case 25: break; + case 11: + { add("e"); + } + case 26: break; + case 10: + { add("a"); + } + case 27: break; + case 9: + { add("oe"); + } + case 28: break; + case 5: + { add("ss"); + } + case 29: break; + case 7: + { switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } + case 30: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexFR.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexFR.lex Wed Nov 09 15:32:05 2011 +0100 @@ 
-0,0 +1,119 @@ +/* + * Normalization rules for French text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexFR +%type java.lang.String +%unicode + +// French: fr + +%states DISP, DICT, SEARCH +%state CELEX + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +hyphen = [-\u{00ad}] // hyphen and soft hyphen +LB = {hyphen} \u0020 +// lb = ({hyphen} \u0020)? + +END = \n + +Alphabet = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ] + +%% + + { + +ſ { add("s"); } +ß { add("ss"); } +æ { add("ae"); } + +} + + { + +[œŒ] { add("oe"); } +[áàâ] { add("a"); } +[éèê] { add("e"); } +[íìî] { add("i"); } +[óòô] { add("o"); } +[úùû] { add("u"); } +’ { add(""); } + +{Alphabet} { add(yytext()); } + +. { problem = 1; add(yytext()); } // in particular "@" + +} + +// default + +@ { problem = 1; add(yytext()); } +{LB} { add(yytext()); } +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } +} + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert + +TO DO: + +FR: richtig? vollständig? 
+ +*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexIT.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexIT.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,887 @@ +/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */ + +/* + * Normalization rules for Italian text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 21.07.11 11:22 from the specification file + * MpdlNormalizerLexIT.lex + */ +public class MpdlNormalizerLexIT { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 2, 3, 4, 5, 6 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\6\25\0\1\5\14\0\1\4\22\0\1\52\1\1\3\2"+ + "\1\1\3\2\1\41\1\0\1\2\1\3\2\2\1\42\1\2\1\50"+ + "\1\3\1\2\1\40\1\45\1\51\2\2\1\0\1\2\6\0\1\44"+ + "\3\2\1\12\2\2\1\43\1\7\1\36\1\2\1\3\1\2\1\10"+ + "\1\37\1\14\1\46\1\13\1\2\1\11\1\16\1\47\2\2\1\0"+ + "\1\2\62\0\1\4\22\0\1\17\5\0\1\33\1\0\1\20\3\0"+ + "\1\21\5\0\1\22\6\0\1\23\5\0\1\31\1\24\5\0\1\32"+ + 
"\1\0\1\25\3\0\1\26\5\0\1\27\6\0\1\30\37\0\1\1"+ + "\70\0\1\35\1\34\53\0\1\15\ufe80\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\11\0\1\1\1\2\2\3\1\1\1\4\1\2\1\3"+ + "\1\5\1\2\1\6\1\7\1\10\1\11\1\12\5\3"+ + "\1\13\1\2\1\3\1\5\1\2\1\14\1\15\1\16"+ + "\1\17\1\20\1\21\1\22\1\23\1\24\1\25\1\26"+ + "\1\27\1\30\4\0\1\31\1\32\1\33\1\0\1\34"+ + "\1\0\1\35\1\36\1\0\1\37\1\40\1\41\4\0"+ + "\1\42\6\0\1\43\1\44\4\0\1\45\1\0\1\46"+ + "\10\0\1\47\4\0\1\45\2\0\1\50"; + + private static int [] zzUnpackAction() { + int [] result = new int[100]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\53\0\126\0\201\0\254\0\327\0\u0102\0\u012d"+ + "\0\u0158\0\0\0\0\0\0\0\u0183\0\u01ae\0\0\0\u01d9"+ + "\0\u0204\0\0\0\u022f\0\0\0\0\0\0\0\0\0\0"+ + "\0\u025a\0\u0285\0\u02b0\0\u02db\0\u0306\0\0\0\u0331\0\u035c"+ + "\0\u0387\0\u03b2\0\u03dd\0\0\0\0\0\0\0\0\0\0"+ + "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\u0408"+ + "\0\u0433\0\u045e\0\u0489\0\0\0\0\0\0\0\u04b4\0\0"+ + "\0\u04df\0\0\0\0\0\u050a\0\0\0\0\0\0\0\u0535"+ + "\0\u0560\0\u058b\0\u05b6\0\0\0\u05e1\0\u060c\0\u0637\0\u0662"+ + "\0\u068d\0\0\0\0\0\0\0\u06b8\0\u06e3\0\u070e\0\u035c"+ + 
"\0\u0739\0\u0764\0\0\0\u078f\0\u07ba\0\u07e5\0\0\0\u0810"+ + "\0\u083b\0\u0866\0\u0891\0\0\0\u08bc\0\u08e7\0\u0912\0\u093d"+ + "\0\0\0\u0968\0\u0993\0\0"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[100]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\53\0\1\12\1\13\1\14\1\15\1\16\1\12\1\17"+ + "\1\20\1\14\1\21\1\13\1\15\1\14\1\22\1\23"+ + "\5\12\2\13\1\12\2\13\1\24\1\25\1\26\1\27"+ + "\1\30\1\12\1\13\1\31\2\13\1\14\1\13\1\23"+ + "\1\32\1\33\1\34\1\35\1\36\1\12\1\13\1\14"+ + "\1\15\1\16\1\12\1\17\1\37\1\14\1\21\1\13"+ + "\1\15\1\40\1\41\1\42\5\12\2\13\1\12\2\13"+ + "\1\24\1\25\1\26\1\27\1\30\1\12\1\13\1\31"+ + "\2\13\1\43\1\13\1\42\1\32\1\33\1\34\1\35"+ + "\1\36\1\12\1\13\1\14\1\15\1\16\1\12\1\44"+ + "\1\20\1\14\1\21\1\13\1\15\1\14\1\22\1\23"+ + "\1\45\1\46\1\47\1\50\1\51\1\52\1\53\1\54"+ + "\1\55\1\56\1\24\1\25\1\26\1\27\1\30\1\12"+ + "\1\13\1\31\2\13\1\14\1\13\1\23\1\32\1\33"+ + "\1\34\1\35\1\36\1\12\1\13\1\14\1\15\1\16"+ + "\1\12\1\44\1\37\1\14\1\21\1\13\1\15\1\40"+ + "\1\41\1\42\1\45\1\46\1\47\1\50\1\51\1\52"+ + "\1\53\1\54\1\55\1\56\1\24\1\25\1\26\1\27"+ + "\1\30\1\12\1\13\1\31\2\13\1\43\1\13\1\42"+ + "\1\32\1\33\1\34\1\35\1\36\1\12\1\13\1\14"+ + "\1\15\1\16\1\12\1\57\1\20\1\14\1\21\1\13"+ + "\1\15\1\14\1\22\1\23\1\45\1\46\1\47\1\50"+ + "\1\51\1\52\1\53\1\54\1\55\1\56\1\24\1\25"+ + "\1\26\1\27\1\30\1\12\1\13\1\31\2\13\1\14"+ + "\1\13\1\23\1\32\1\33\1\34\1\35\1\36\1\12"+ + 
"\1\13\1\14\1\15\1\16\1\12\1\57\1\37\1\14"+ + "\1\21\1\13\1\15\1\40\1\41\1\42\1\45\1\46"+ + "\1\47\1\50\1\51\1\52\1\53\1\54\1\55\1\56"+ + "\1\24\1\25\1\26\1\27\1\30\1\12\1\13\1\31"+ + "\2\13\1\43\1\13\1\42\1\32\1\33\1\34\1\35"+ + "\1\36\7\0\1\60\4\0\1\61\1\62\42\0\1\63"+ + "\114\0\1\64\1\0\1\64\6\0\1\65\103\0\1\66"+ + "\23\0\1\67\44\0\1\70\5\0\1\70\2\0\1\70"+ + "\3\0\1\70\5\0\2\70\1\0\2\70\1\0\3\70"+ + "\2\0\1\70\1\0\2\70\1\0\2\70\46\0\1\71"+ + "\60\0\1\72\5\0\2\73\1\74\3\0\2\73\1\0"+ + "\3\73\13\0\1\73\6\0\1\73\2\0\1\73\2\0"+ + "\4\73\50\0\1\75\1\0\1\76\3\0\2\77\1\100"+ + "\3\0\2\77\1\0\3\77\13\0\1\77\6\0\1\77"+ + "\2\0\1\77\2\0\4\77\11\0\1\101\25\0\1\66"+ + "\26\0\1\102\52\0\1\102\3\0\1\103\35\0\1\104"+ + "\5\0\1\104\2\0\1\104\3\0\1\104\5\0\2\104"+ + "\1\0\2\104\1\0\3\104\2\0\1\104\1\0\2\104"+ + "\1\0\2\104\44\0\1\105\4\0\1\106\16\0\1\107"+ + "\54\0\1\110\52\0\1\110\3\0\1\111\40\0\1\112"+ + "\105\0\1\113\55\0\1\114\15\0\1\115\52\0\1\116"+ + "\51\0\1\117\4\0\1\120\54\0\1\121\43\0\1\122"+ + "\7\0\1\120\44\0\1\123\52\0\1\123\1\124\1\125"+ + "\46\0\1\126\4\0\1\61\54\0\1\127\43\0\1\130"+ + "\7\0\1\61\40\0\2\73\4\0\2\73\1\0\3\73"+ + "\13\0\1\73\6\0\1\73\2\0\1\73\2\0\4\73"+ + "\3\0\2\77\4\0\2\77\1\0\3\77\13\0\1\77"+ + "\6\0\1\77\2\0\1\77\2\0\4\77\6\0\1\131"+ + "\51\0\1\132\53\0\1\133\53\0\1\134\50\0\1\135"+ + "\3\0\1\136\47\0\1\137\52\0\1\140\56\0\1\120"+ + "\46\0\1\141\61\0\1\120\43\0\1\142\104\0\1\143"+ + "\24\0\1\61\55\0\1\61\46\0\1\136\50\0\1\144"+ + "\44\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[2494]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 
0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\1\10\7\0\1\1\3\11\2\1\1\11\2\1\1\11"+ + "\1\1\5\11\5\1\1\11\5\1\14\11\4\0\3\11"+ + "\1\0\1\11\1\0\2\11\1\0\3\11\4\0\1\11"+ + "\5\0\3\11\4\0\1\1\1\0\1\11\3\0\1\11"+ + "\4\0\1\11\4\0\1\11\2\0\1\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[100]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the 
last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /** For the backwards DFA of general lookahead statements */ + private boolean [] zzFin = new boolean [ZZ_BUFFERSIZE+1]; + + /* user code: */ + private static final int CONS = 1; + private static final int VOWEL = 2; + private int cv = 0; // consonant = 1, vowel = 2, everything else = 0 + + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexIT(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexIT(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. 
+ * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 172) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. 
+ * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. 
+ * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + if (zzMarkedPosL > zzStartRead) { + switch (zzBufferL[zzMarkedPosL-1]) { + case '\n': + case '\u000B': + case '\u000C': + case '\u0085': + case '\u2028': + case '\u2029': + zzAtBOL = true; + break; + case '\r': + if (zzMarkedPosL < zzEndReadL) + zzAtBOL = zzBufferL[zzMarkedPosL] != '\n'; + else if (zzAtEOF) + zzAtBOL = false; + else { + boolean eof = zzRefill(); + zzMarkedPosL = zzMarkedPos; + zzEndReadL = zzEndRead; + zzBufferL = zzBuffer; + if (eof) + zzAtBOL = false; + else + zzAtBOL = zzBufferL[zzMarkedPosL] != '\n'; + } + break; + default: + zzAtBOL = false; + } + } + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = 
zzMarkedPosL; + + if (zzAtBOL) + zzState = ZZ_LEXSTATE[zzLexicalState+1]; + else + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 33: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = CONS; add("U"); + } + case 41: break; + case 14: + { add("Á"); + } + case 42: break; + case 40: + // lookahead expression with fixed lookahead length + yypushback(1); + { add(yytext()); + } + case 43: break; + case 39: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 3; + { add(yytext()); + } + case 44: break; + case 38: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add(yytext()); + } + case 45: break; + case 26: + { add(yytext()); + } + case 46: break; + case 21: + { add("í"); + } + case 47: break; + case 8: + { cv = VOWEL; add("AE"); + } + case 48: break; + case 11: + { problem = 1; cv = 0; add(yytext()); + } + case 49: break; + case 4: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 50: break; + case 30: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = CONS; add("u"); + } + case 51: break; + case 19: + { add("á"); + } + case 52: break; + case 1: + { cv = 0; add(yytext()); + } + case 53: break; + case 24: + { switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } + case 54: break; + case 34: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); + } + case 55: break; + case 35: + { cv = VOWEL; add("zio"); + } + case 56: break; + case 10: + { cv = VOWEL; add("OE"); + } + case 57: break; + case 18: + { add("Ú"); + } + case 58: break; + case 37: + // general lookahead, find correct zzMarkedPos + { int zzFState = 7; + int zzFPos = zzStartRead; + if (zzFin.length <= zzBufferL.length) { zzFin = new boolean[zzBufferL.length+1]; } + boolean zzFinL[] = zzFin; + while (zzFState != -1 && zzFPos < zzMarkedPos) { + if 
((zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; } + zzInput = zzBufferL[zzFPos++]; + zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ]; + } + if (zzFState != -1 && (zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; } + + zzFState = 8; + zzFPos = zzMarkedPos; + while (!zzFinL[zzFPos] || (zzAttrL[zzFState] & 1) != 1) { + zzInput = zzBufferL[--zzFPos]; + zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ]; + }; + zzMarkedPos = zzFPos; + } + { cv = VOWEL; add(yytext().replace("ſ", "s")); + } + case 59: break; + case 3: + { cv = CONS; add(yytext()); + } + case 60: break; + case 32: + { cv = CONS; add("QU"); + } + case 61: break; + case 15: + { add("É"); + } + case 62: break; + case 28: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { switch(cv) { + case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break; + default: cv = VOWEL; add(yytext()); break; + } + } + case 63: break; + case 6: + { cv = CONS; add("ss"); + } + case 64: break; + case 5: + { cv = CONS; add("s"); + } + case 65: break; + case 13: + { switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } + case 66: break; + case 36: + { cv = VOWEL; add("ZIO"); + } + case 67: break; + case 2: + { cv = VOWEL; add(yytext()); + } + case 68: break; + case 17: + { add("Ó"); + } + case 69: break; + case 23: + { add("ú"); + } + case 70: break; + case 31: + { cv = CONS; add("Qu"); + } + case 71: break; + case 20: + { add("é"); + } + case 72: break; + case 7: + { cv = VOWEL; add("ae"); + } + case 73: break; + case 12: + { add(""); + } + case 74: break; + case 22: + { add("ó"); + } + case 75: break; + case 9: + { cv = VOWEL; add("oe"); + } + case 76: break; + case 29: + { cv = CONS; add("qu"); + } + case 77: break; + case 25: + { switch(cv) { + case CONS: add(yytext().replace("v", "u").replace("V", "U")); break; + default: cv = CONS; add(yytext()); break; + } + } + case 78: break; + case 27: + { cv = 
VOWEL; add("ii"); + } + case 79: break; + case 16: + { add("Í"); + } + case 80: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexIT.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexIT.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,183 @@ +/* + * Normalization rules for Italian text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexIT +%type java.lang.String +%unicode + +// Italian: it, ita + +%states DISP, DICT, SEARCH + +%{ + private static final int CONS = 1; + private static final int VOWEL = 2; + private int cv = 0; // consonant = 1, vowel = 2, everything else = 0 + + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +Vowel = [AEIOUaeiouÆæęàèòùœ] +Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß] +LR = [lLrR] + + +hyphen = [\u002d\u00ad] // hyphen and soft hyphen +LB = {hyphen} \u0020 +lb = ({hyphen} \u0020)? 
+
+END = \n
+
+prefixCons = (in{lb}ter | per | ſu{lb}per | ſer)
+
+%%
+// NOTE(review): no "<STATE, ...>" list precedes the group-opening braces below in this copy - restore them from the repository before regenerating
+ {
+// grave-accented vowels are emitted with an acute accent
+À { add("Á"); }
+È { add("É"); }
+Ì { add("Í"); }
+Ò { add("Ó"); }
+Ù { add("Ú"); }
+à { add("á"); }
+è { add("é"); }
+ì { add("í"); }
+ò { add("ó"); }
+ù { add("ú"); }
+
+}
+
+ {
+// cv records whether the unit just emitted counts as a consonant (CONS) or a vowel (VOWEL); the u/v rules below read it
+ſ { cv = CONS; add("s"); }
+ß { cv = CONS; add("ss"); }
+æ { cv = VOWEL; add("ae"); }
+Æ { cv = VOWEL; add("AE"); }
+œ { cv = VOWEL; add("oe"); }
+Œ { cv = VOWEL; add("OE"); }
+
+ij { cv = VOWEL; add("ii"); }
+
+tio { cv = VOWEL; add("zio"); }
+TIO { cv = VOWEL; add("ZIO"); }
+
+// h rules from Arboreal: ha, hai, hanno, ho directly followed by END are kept as they are
+^ ha / {END} { add(yytext()); }
+^ hai / {END} { add(yytext()); }
+^ han{lb}no / {END} { add(yytext()); }
+^ ho / {END} { add(yytext()); }
+^ h { add(""); }  // an h matched by this rule is dropped from the normalized form
+
+
+// u/v rules are taken from MpdlNormalizerLexLA.lex
+
+// 1. rules for u --> v
+
+^ {prefixCons} / {lb} { cv = VOWEL; add(yytext().replace("ſ", "s")); }
+
+^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replace("u", "v").replace("U", "V")); }
+
+
+[uU] / {Vowel} {
+    switch(cv) {
+      case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break;  // cv is VOWEL and a vowel follows: u becomes v
+      default: cv = VOWEL; add(yytext()); break;
+    }
+  }
+
+// 2. rules for v --> u
+
+qv { cv = CONS; add("qu"); }  // the replaced v still counts as consonant
+Qv { cv = CONS; add("Qu"); }
+QV { cv = CONS; add("QU"); }
+
+{LR} [vV] {
+    switch(cv) {
+      case CONS: add(yytext().replace("v", "u").replace("V", "U")); break;  // cv is CONS before l/r + v: v becomes u
+      default: cv = CONS; add(yytext()); break;
+    }
+  }
+
+v / {lb} {Cons} { cv = CONS; add("u"); }
+V / {lb} {Cons} { cv = CONS; add("U"); }
+
+// 3. override default rule for .
+
+{Vowel} { cv = VOWEL; add(yytext()); }
+{Cons} { cv = CONS; add(yytext()); }
+@ { problem = 1; cv = 0; add(yytext()); }
+{LB} { add(yytext()); }
+. { cv = 0; add(yytext()); }
+
+}
+
+
+ {
+
+{END} {
+    switch (problem) {
+      case 1: return original;    // a character was flagged: hand back the input unchanged
+      default: return normalized;
+    }
+  }
+}
+
+ {
+
+{END} {
+    switch (problem) {
+      case 1: return "";          // a character was flagged: return the empty string
+      default: return normalized.replaceAll(LB, "");  // remove in-word line breaks
+    }
+  }
+}
+
+ {
+
+{END} {
+    switch (problem) {
+      case 1: return original;    // a character was flagged: hand back the input unchanged
+      default: return normalized.replaceAll(LB, "").toLowerCase();  // remove in-word line breaks, then lower-case
+    }
+  }
+}
+
+
+/*
+
+Assumptions:
+- the routine is called word by word, with a \n at the end of the string
+- line breaks inside the word are marked as hyphen/soft hyphen plus space
+
+TO DO:
+
+IT: all these rules are taken from Arboreal; do we need them all?
+IT: correct? complete?
+IT: are the u/v rules really exactly as in LA? in particular: same vowel class?
+IT: carry over changes made to the Latin u/v rules?
+IT: give Italian example words for the u/v rules
+IT: do we really need the grave-accent rules from Arboreal in DICT?
+IT: if so: does À --> Á etc. belong in the dictionary layer? And restrict it to the last syllable?
+IT: is prefixCons = (inter | per | ſuper | ſer) also valid for Italian?
+ +*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1024 @@ +/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:35 */ + +/* + * Normalization rules for Latin text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 05.09.11 10:35 from the specification file + * MpdlNormalizerLexLA.lex + */ +public class MpdlNormalizerLexLA { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int RENAISSANCE_DICT = 8; + public static final int SEARCH = 10; + public static final int RENAISSANCE_DISP = 4; + public static final int DICT = 6; + public static final int YYINITIAL = 0; + public static final int RENAISSANCE_SEARCH = 12; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 2, 1, 2, 3, 4, 3, 4, 5, 6, 5, 6 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\6\25\0\1\5\14\0\1\4\22\0\1\0\1\1\3\2"+ + "\1\1\2\2\1\53\1\1\1\0\1\2\1\3\2\2\1\1\1\2"+ + "\1\46\1\3\2\2\1\64\1\65\2\2\1\66\1\2\6\0\1\57"+ + "\1\2\1\47\1\43\1\11\2\2\1\51\1\14\1\27\1\2\1\50"+ + 
"\1\40\1\13\1\61\1\17\1\7\1\16\1\32\1\15\1\10\1\12"+ + "\2\2\1\66\1\2\62\0\1\4\30\0\1\25\30\0\1\23\1\37"+ + "\1\31\1\55\3\0\1\24\1\0\1\41\1\33\1\0\1\60\1\45"+ + "\1\34\1\52\1\62\2\0\1\42\1\35\1\54\4\0\1\44\1\36"+ + "\1\56\1\63\34\0\1\24\71\0\1\26\53\0\1\20\u0181\0\1\30"+ + "\ud4fe\0\1\21\u0590\0\1\22\u226e\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\12\0\1\1\1\2\2\3\1\1\1\4\1\3\1\2"+ + "\1\3\1\2\1\5\1\1\1\6\1\7\1\10\1\11"+ + "\11\1\1\3\2\1\3\2\1\3\1\12\1\3\2\2"+ + "\1\3\1\5\3\3\1\1\1\2\1\13\1\14\4\0"+ + "\1\15\1\16\1\17\1\20\1\0\1\21\1\22\1\23"+ + "\1\24\1\0\1\25\20\0\1\26\3\0\1\27\3\0"+ + "\1\30\1\0\1\31\3\0\1\32\1\33\1\34\1\0"+ + "\1\35\1\36\2\0\1\37\20\0\1\40\1\0\1\41"+ + "\1\0\1\42\1\0\1\43\1\44\1\45\1\46\1\0"+ + "\1\47\1\0\1\50\1\0\1\51\1\0\1\52\4\0"+ + "\1\53\10\0\1\54\6\0\1\55\3\0\1\56\1\57"+ + "\1\60\2\0\1\61\5\0\1\53"; + + private static int [] zzUnpackAction() { + int [] result = new int[179]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\67\0\156\0\245\0\334\0\u0113\0\u014a\0\u0181"+ + "\0\u01b8\0\u01ef\0\u0226\0\u0226\0\u0226\0\u025d\0\u0294\0\u0226"+ + 
"\0\u02cb\0\u0302\0\u0339\0\u0370\0\u0226\0\u01ef\0\u0226\0\u0226"+ + "\0\u0226\0\u0226\0\u03a7\0\u03de\0\u0415\0\u044c\0\u0483\0\u04ba"+ + "\0\u04f1\0\u0528\0\u055f\0\u0596\0\u05cd\0\u0604\0\u063b\0\u0672"+ + "\0\u06a9\0\u06e0\0\u0226\0\u0717\0\u074e\0\u0785\0\u07bc\0\u07f3"+ + "\0\u082a\0\u0861\0\u0898\0\u08cf\0\u0906\0\u0226\0\u0226\0\u093d"+ + "\0\u0974\0\u09ab\0\u09e2\0\u0226\0\u0226\0\u0226\0\u0226\0\u0a19"+ + "\0\u0226\0\u0226\0\u0226\0\u0226\0\u0a50\0\u0226\0\u0a87\0\u0abe"+ + "\0\u0af5\0\u0b2c\0\u0b63\0\u0b9a\0\u0bd1\0\u0c08\0\u0c3f\0\u0c76"+ + "\0\u0cad\0\u0ce4\0\u0d1b\0\u0d52\0\u0d89\0\u0dc0\0\u0226\0\u0df7"+ + "\0\u0e2e\0\u0e65\0\u0226\0\u0e9c\0\u0ed3\0\u0f0a\0\u0226\0\u0f41"+ + "\0\u0226\0\u0f78\0\u0faf\0\u0fe6\0\u0226\0\u0226\0\u0226\0\u101d"+ + "\0\u0226\0\u0226\0\u1054\0\u108b\0\u0226\0\u10c2\0\u10f9\0\u1130"+ + "\0\u1167\0\u119e\0\u11d5\0\u120c\0\u1243\0\u127a\0\u0226\0\u12b1"+ + "\0\u12e8\0\u131f\0\u1356\0\u138d\0\u08cf\0\u0226\0\u13c4\0\u0226"+ + "\0\u13fb\0\u0226\0\u1432\0\u0226\0\u0226\0\u0226\0\u0226\0\u1469"+ + "\0\u0226\0\u14a0\0\u0226\0\u14d7\0\u0226\0\u150e\0\u0226\0\u1545"+ + "\0\u157c\0\u15b3\0\u07bc\0\u15ea\0\u1621\0\u1658\0\u168f\0\u16c6"+ + "\0\u16fd\0\u0226\0\u1734\0\u176b\0\u0226\0\u17a2\0\u17d9\0\u1810"+ + "\0\u1847\0\u187e\0\u18b5\0\u0226\0\u18ec\0\u1923\0\u195a\0\u0226"+ + "\0\u0226\0\u0226\0\u1991\0\u19c8\0\u0226\0\u19ff\0\u1a36\0\u1a6d"+ + "\0\u1aa4\0\u1adb\0\u0226"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[179]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final 
int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\21\0\1\12\45\0\1\13\1\14\1\15\1\16\1\17"+ + "\1\13\1\20\1\21\1\22\1\14\1\23\1\15\1\24"+ + "\1\15\1\16\1\15\1\25\1\26\1\13\1\27\1\30"+ + "\1\31\1\32\2\13\1\33\1\15\1\34\1\35\1\36"+ + "\1\37\1\40\1\15\1\41\1\42\1\15\1\43\1\13"+ + "\1\44\1\15\1\16\1\15\1\13\1\15\1\13\1\45"+ + "\1\46\1\47\1\13\1\50\2\13\1\51\1\52\1\53"+ + "\1\13\1\14\1\15\1\16\1\17\1\13\1\20\1\54"+ + "\1\55\1\14\1\23\1\15\1\56\1\15\1\16\1\57"+ + "\1\60\1\26\1\13\1\27\1\30\1\31\1\32\2\13"+ + "\1\33\1\15\1\34\1\35\1\36\1\37\1\40\1\15"+ + "\1\41\1\42\1\15\1\43\1\13\1\61\1\15\1\16"+ + "\1\62\1\13\1\63\1\64\1\45\1\46\1\47\1\13"+ + "\1\50\2\13\1\65\1\52\1\53\1\13\1\14\1\15"+ + "\1\16\1\17\1\13\1\66\1\21\1\22\1\14\1\23"+ + "\1\15\1\24\1\15\1\16\1\15\1\25\1\26\1\13"+ + "\1\27\1\30\1\31\1\32\2\13\1\33\1\15\1\34"+ + "\1\35\1\36\1\37\1\40\1\15\1\41\1\42\1\15"+ + "\1\43\1\13\1\44\1\15\1\16\1\15\1\13\1\15"+ + "\1\13\1\45\1\46\1\47\1\13\1\50\2\13\1\51"+ + "\1\52\1\53\1\13\1\14\1\15\1\16\1\17\1\13"+ + "\1\66\1\54\1\55\1\14\1\23\1\15\1\56\1\15"+ + "\1\16\1\57\1\60\1\26\1\13\1\27\1\30\1\31"+ + "\1\32\2\13\1\33\1\15\1\34\1\35\1\36\1\37"+ + "\1\40\1\15\1\41\1\42\1\15\1\43\1\13\1\61"+ + "\1\15\1\16\1\62\1\13\1\63\1\64\1\45\1\46"+ + "\1\47\1\13\1\50\2\13\1\65\1\52\1\53\1\13"+ + "\1\14\1\15\1\16\1\17\1\13\1\67\1\21\1\22"+ + "\1\14\1\23\1\15\1\24\1\15\1\16\1\15\1\25"+ + "\1\26\1\13\1\27\1\30\1\31\1\32\2\13\1\33"+ + "\1\15\1\34\1\35\1\36\1\37\1\40\1\15\1\41"+ + "\1\42\1\15\1\43\1\13\1\44\1\15\1\16\1\15"+ + "\1\13\1\15\1\13\1\45\1\46\1\47\1\13\1\50"+ + "\2\13\1\51\1\52\1\53\1\13\1\14\1\15\1\16"+ + "\1\17\1\13\1\67\1\54\1\55\1\14\1\23\1\15"+ + "\1\56\1\15\1\16\1\57\1\60\1\26\1\13\1\27"+ + "\1\30\1\31\1\32\2\13\1\33\1\15\1\34\1\35"+ + "\1\36\1\37\1\40\1\15\1\41\1\42\1\15\1\43"+ + "\1\13\1\61\1\15\1\16\1\62\1\13\1\63\1\64"+ + "\1\45\1\46\1\47\1\13\1\50\2\13\1\65\1\52"+ + "\1\53\14\0\1\70\2\0\1\71\1\72\53\0\1\73"+ + 
"\103\0\1\74\145\0\1\75\52\0\1\75\6\0\1\76"+ + "\73\0\1\77\15\0\1\100\37\0\1\101\6\0\2\101"+ + "\2\0\1\101\7\0\3\101\30\0\1\101\1\0\1\101"+ + "\1\102\1\103\1\101\4\0\2\104\1\105\2\0\1\104"+ + "\2\0\2\104\1\0\4\104\2\0\1\104\6\0\1\104"+ + "\5\0\1\104\2\0\1\104\2\0\4\104\1\0\1\104"+ + "\11\0\1\104\30\0\1\106\46\0\1\107\2\0\2\110"+ + "\1\0\2\111\13\0\1\111\5\0\1\111\35\0\1\112"+ + "\2\0\2\113\1\0\2\114\13\0\1\114\5\0\1\114"+ + "\35\0\1\115\2\0\2\116\1\0\2\117\13\0\1\117"+ + "\5\0\1\117\35\0\1\120\2\0\2\121\1\0\2\122"+ + "\13\0\1\122\5\0\1\122\35\0\1\123\1\0\1\124"+ + "\2\125\1\0\2\126\13\0\1\126\5\0\1\126\34\0"+ + "\1\127\1\107\22\0\1\130\5\0\1\131\6\0\1\132"+ + "\25\0\1\133\1\112\5\0\1\134\1\135\13\0\1\136"+ + "\42\0\1\137\1\120\33\0\1\140\31\0\1\141\23\0"+ + "\1\142\5\0\1\143\7\0\1\144\30\0\1\145\52\0"+ + "\1\146\7\0\1\127\1\107\6\0\1\147\102\0\1\150"+ + "\114\0\1\30\66\0\1\32\1\0\1\151\5\0\1\101"+ + "\6\0\2\101\2\0\1\101\7\0\3\101\30\0\1\101"+ + "\1\0\1\101\2\0\1\101\4\0\2\152\1\153\2\0"+ + "\1\152\2\0\2\152\1\0\4\152\2\0\1\152\6\0"+ + "\1\152\5\0\1\152\2\0\1\152\2\0\4\152\1\0"+ + "\1\152\11\0\1\152\11\0\1\154\1\0\1\77\15\0"+ + "\1\100\37\0\1\155\6\0\2\155\2\0\1\155\7\0"+ + "\3\155\30\0\1\155\1\0\1\155\1\102\1\103\1\155"+ + "\15\0\1\156\13\0\1\106\50\0\1\157\65\0\1\160"+ + "\1\157\65\0\1\161\1\0\1\145\52\0\1\146\53\0"+ + "\1\162\66\0\1\163\22\0\1\137\61\0\1\155\6\0"+ + "\2\155\2\0\1\155\7\0\3\155\30\0\1\155\1\0"+ + "\1\155\2\0\1\155\15\0\1\164\64\0\1\165\65\0"+ + "\1\166\1\165\61\0\1\167\72\0\1\170\63\0\1\171"+ + "\71\0\1\110\67\0\1\172\64\0\1\107\2\0\2\110"+ + "\63\0\1\113\67\0\1\173\64\0\1\112\2\0\2\113"+ + "\63\0\1\116\67\0\1\174\64\0\1\115\2\0\2\116"+ + "\63\0\1\121\67\0\1\175\64\0\1\120\2\0\2\121"+ + "\63\0\1\125\64\0\1\176\71\0\1\177\64\0\1\123"+ + "\2\0\2\125\61\0\1\200\1\201\65\0\1\202\1\203"+ + "\65\0\1\204\66\0\1\205\66\0\1\206\66\0\1\207"+ + "\1\210\65\0\1\211\1\212\65\0\1\213\1\214\65\0"+ + "\1\215\1\216\65\0\1\217\66\0\1\213\65\0\1\220"+ + 
"\126\0\1\221\25\0\1\222\10\0\1\223\67\0\1\224"+ + "\54\0\1\225\12\0\1\223\114\0\1\226\70\0\1\227"+ + "\66\0\1\230\23\0\1\231\10\0\1\71\67\0\1\232"+ + "\54\0\1\233\12\0\1\71\60\0\1\234\57\0\2\104"+ + "\3\0\1\104\2\0\2\104\1\0\4\104\2\0\1\104"+ + "\6\0\1\104\5\0\1\104\2\0\1\104\2\0\4\104"+ + "\1\0\1\104\11\0\1\104\7\0\1\127\66\0\1\133"+ + "\66\0\1\235\66\0\1\141\70\0\1\236\66\0\1\237"+ + "\66\0\1\240\66\0\1\241\66\0\1\242\66\0\1\243"+ + "\60\0\2\152\3\0\1\152\2\0\2\152\1\0\4\152"+ + "\2\0\1\152\6\0\1\152\5\0\1\152\2\0\1\152"+ + "\2\0\4\152\1\0\1\152\11\0\1\152\7\0\1\244"+ + "\65\0\1\245\65\0\1\246\67\0\1\247\67\0\1\250"+ + "\66\0\1\251\66\0\1\252\65\0\1\253\66\0\1\254"+ + "\67\0\1\255\71\0\1\256\66\0\1\257\66\0\1\260"+ + "\66\0\1\261\66\0\1\150\66\0\1\262\72\0\1\223"+ + "\56\0\1\263\100\0\1\223\64\0\1\71\70\0\1\71"+ + "\55\0\1\200\66\0\1\202\66\0\1\207\66\0\1\211"+ + "\66\0\1\215\60\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[6930]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String 
ZZ_ATTRIBUTE_PACKED_0 = + "\10\0\1\1\1\0\3\11\2\1\1\11\4\1\1\11"+ + "\1\1\4\11\20\1\1\11\12\1\2\11\4\0\4\11"+ + "\1\0\4\11\1\0\1\11\20\0\1\11\3\0\1\11"+ + "\3\0\1\11\1\0\1\11\3\0\3\11\1\0\2\11"+ + "\2\0\1\11\11\0\1\11\6\0\1\11\1\0\1\11"+ + "\1\0\1\11\1\0\4\11\1\0\1\11\1\0\1\11"+ + "\1\0\1\11\1\0\1\11\4\0\1\1\5\0\1\11"+ + "\2\0\1\11\6\0\1\11\3\0\3\11\2\0\1\11"+ + "\5\0\1\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[179]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + 
* zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /** For the backwards DFA of general lookahead statements */ + private boolean [] zzFin = new boolean [ZZ_BUFFERSIZE+1]; + + /* user code: */ + private static final int CONS = 1; + private static final int VOWEL = 2; + private int cv = 0; // class of the previous character: CONS = 1, VOWEL = 2, everything else = 0 + + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; + + + /** + * Creates a new scanner. + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexLA(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also a java.io.Reader version of this constructor. + * + * @param in the java.io.InputStream to read input from (decoded with the platform default charset). + */ + public MpdlNormalizerLexLA(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 190) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return <code>false</code> iff new input was read, <code>true</code> at end of input.
+ * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: grow it to twice the current position */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0: end of stream + return true; + } + + + /** + * Closes the input reader and marks the scanner as being at end of file. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to YYINITIAL.
+ * + * @param reader the new input reader + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state. + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster. + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occurred while scanning. + * + * In a well-formed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules.
+ * + * @param errorCode the code of the error message to report (thrown as java.lang.Error) + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by the next call of the scanning method. + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the string returned by the matched rule's action, or <code>null</code> at end of input + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + if (zzMarkedPosL > zzStartRead) { + switch (zzBufferL[zzMarkedPosL-1]) { + case '\n': + case '\u000B': + case '\u000C': + case '\u0085': + case '\u2028': + case '\u2029': + zzAtBOL = true; + break; + case '\r': + if (zzMarkedPosL < zzEndReadL) + zzAtBOL = zzBufferL[zzMarkedPosL] != '\n'; + else if (zzAtEOF) + zzAtBOL = false; + else { + boolean eof = zzRefill(); + zzMarkedPosL = zzMarkedPos; + zzEndReadL = zzEndRead; + zzBufferL = zzBuffer; + if (eof) + zzAtBOL = false; + else + zzAtBOL = zzBufferL[zzMarkedPosL] != '\n'; + } + break; + default: + zzAtBOL = false; + } + } + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = 
zzMarkedPosL; + + if (zzAtBOL) + zzState = ZZ_LEXSTATE[zzLexicalState+1]; + else + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 41: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("um"); + } + case 50: break; + case 30: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = CONS; add("U"); + } + case 51: break; + case 15: + { add(yytext()); + } + case 52: break; + case 48: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 3; + { add("Hic"); + } + case 53: break; + case 8: + { cv = VOWEL; add("AE"); + } + case 54: break; + case 1: + { problem = 1; cv = 0; add(yytext()); + } + case 55: break; + case 4: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 56: break; + case 20: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = CONS; add("u"); + } + case 57: break; + case 10: + { cv = 0; add(yytext()); + } + case 58: break; + case 12: + { switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } + case 59: break; + case 36: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("et"); + } + case 60: break; + case 23: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("e"); + } + case 61: break; + case 31: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); + } + case 62: break; + case 43: + // general lookahead, find correct zzMarkedPos + { int zzFState = 7; + int zzFPos = zzStartRead; + if (zzFin.length <= zzBufferL.length) { zzFin = new boolean[zzBufferL.length+1]; } + boolean zzFinL[] = zzFin; + while (zzFState != -1 && zzFPos < zzMarkedPos) { + if ((zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; } + zzInput = zzBufferL[zzFPos++]; + zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ]; + } + if (zzFState != -1 && 
(zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; } + + zzFState = 8; + zzFPos = zzMarkedPos; + while (!zzFinL[zzFPos] || (zzAttrL[zzFState] & 1) != 1) { + zzInput = zzBufferL[--zzFPos]; + zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ]; + }; + zzMarkedPos = zzFPos; + } + { cv = VOWEL; add(yytext().replace("ſ", "s")); + } + case 63: break; + case 3: + { cv = CONS; add(yytext()); + } + case 64: break; + case 29: + { cv = VOWEL; add("oi"); + } + case 65: break; + case 27: + { cv = CONS; add("QU"); + } + case 66: break; + case 17: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { switch(cv) { + case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break; + default: cv = VOWEL; add(yytext()); break; + } + } + case 67: break; + case 6: + { cv = CONS; add("ss"); + } + case 68: break; + case 5: + { cv = CONS; add("s"); + } + case 69: break; + case 11: + { switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } + case 70: break; + case 24: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("o"); + } + case 71: break; + case 35: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("ac"); + } + case 72: break; + case 2: + { cv = VOWEL; add(yytext()); + } + case 73: break; + case 45: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 3; + { add("qui"); + } + case 74: break; + case 37: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("er"); + } + case 75: break; + case 26: + { cv = CONS; add("Qu"); + } + case 76: break; + case 32: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("ve"); + } + case 77: break; + case 40: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("us"); + } + case 78: break; + case 34: + // lookahead expression with fixed base 
length + zzMarkedPos = zzStartRead + 2; + { add("am"); + } + case 79: break; + case 7: + { cv = VOWEL; add("ae"); + } + case 80: break; + case 28: + { add("ar"); + } + case 81: break; + case 47: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 3; + { add("hic"); + } + case 82: break; + case 19: + { cv = VOWEL; add("uu"); + } + case 83: break; + case 42: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("ul"); + } + case 84: break; + case 22: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("a"); + } + case 85: break; + case 9: + { cv = VOWEL; add("oe"); + } + case 86: break; + case 18: + { cv = VOWEL; add("ui"); + } + case 87: break; + case 16: + { cv = CONS; add("qu"); + } + case 88: break; + case 49: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 4; + { add("que"); + } + case 89: break; + case 25: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("u"); + } + case 90: break; + case 38: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("es"); + } + case 91: break; + case 46: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 3; + { add("Qui"); + } + case 92: break; + case 44: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("i"); + } + case 93: break; + case 13: + { add("X"); + } + case 94: break; + case 14: + { switch(cv) { + case CONS: add(yytext().replace("v", "u").replace("V", "U")); break; + default: cv = CONS; add(yytext()); break; + } + } + case 95: break; + case 21: + { cv = VOWEL; add("ii"); + } + case 96: break; + case 33: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("as"); + } + case 97: break; + case 39: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("od"); + } + case 98: break; + 
default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,228 @@ +/* + * Normalization rules for Latin text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexLA +%type java.lang.String +%unicode + +// Latin: la, lat + +%states DISP, DICT, SEARCH +%states RENAISSANCE_DISP, RENAISSANCE_DICT, RENAISSANCE_SEARCH + +%{ + private static final int CONS = 1; + private static final int VOWEL = 2; + private int cv = 0; // consonant = 1, vowel = 2, everything else = 0 + + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +Vowel = [AEIOUaeiouÆæęœ] // without àèòù etc. +Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß] +// y counts neither as Vowel nor as Cons, see the default rule below: [yY] { cv = 0; add(yytext()); } + +LR = [lLrR] + +hyphen = [\u002d\u00ad] // hyphen and soft hyphen +LB = {hyphen} \u0020 +lb = ({hyphen} \u0020)? + +END = \n + +que = (que)? // optional -que +enclitic = (que | ve | ne) +prefixCons = (in{lb}ter | per | ſu{lb}per | ſer) // "ſer" for forms of ſervare + +%% + + +// TEST, see Benedetti, page 444 +𐆑 { add("X"); } // (U+10191; D800+DD91) + + + { + +// 1. 
simple replacements + +// 1.1 single characters +ſ { cv = CONS; add("s"); } +ß { cv = CONS; add("ss"); } +[æę] { cv = VOWEL; add("ae"); } +Æ { cv = VOWEL; add("AE"); } +œ { cv = VOWEL; add("oe"); } + +// 1.2 character combinations +ij { cv = VOWEL; add("ii"); } + +// 2. superfluous diacritics + +// 2.1 acute accent +q́ue / {END} { add("que"); } // G +á / [mrst]? {enclitic} {END} { add("a"); } // G +é / [mrst]? {enclitic} {END} { add("e"); } // G +í / [mrst]? {enclitic} {END} { add("i"); } // G +ó / [mrst]? {enclitic} {END} { add("o"); } // G +ú / [mrst]? {enclitic} {END} { add("u"); } // G + +úe / {END} { add("ve"); } // W ?? + +// 2.2 grave accent +à / {que} {END} { add("a"); } // W G +àm / {que} {END} { add("am"); } // W (G) +às / {que} {END} { add("as"); } // W (G) (-àsque will likely never occur) +è / {que} {END} { add("e"); } // W G +ò / {que} {END} { add("o"); } // W G +òd / {que} {END} { add("od"); } // W (G) +ùm / {que} {END} { add("um"); } // W (G) +ùs / {que} {END} { add("us"); } // W G + +ès / {que} {END} { add("es"); } // (G) +^ quì / {END} { add("qui"); } // W ?? +^ Quì / {END} { add("Qui"); } // W ?? +àc / {END} { add("ac"); } // W ?? +èr / {END} { add("er"); } // W ?? +èt / {END} { add("et"); } // W ?? +ù / {END} { add("u"); } // W ?? +ùl / {END} { add("ul"); } // W ?? + +// 2.3 circumflex accent +^ hîc / {END} { add("hic"); } // W G +^ Hîc / {END} { add("Hic"); } // W G +^ ô / {END} { add("o"); } // G +â / {que} {END} { add("a"); } // W G +ûs / {END} { add("us"); } // W G +âr { add("ar"); } // W (G) --> this is only a rough approximation! + +// 2.4 trema +// 2.4.1 common cases +aë { cv = VOWEL; add("ae"); } +oë { cv = VOWEL; add("oe"); } +// 2.4.2 rare cases +oï { cv = VOWEL; add("oi"); } +uï { cv = VOWEL; add("ui"); } +// 2.4.3 extremely rare cases +uü { cv = VOWEL; add("uu"); } + + +// 3. 
rules for u and v + +// 3.1 rules for u --> v + +// peruenias --> pervenias, interuallum --> intervallum +^ {prefixCons} / {lb} { cv = VOWEL; add(yytext().replace("ſ", "s")); } // not cv = CONS ! + +// uellet --> vellet +^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); } + +// diuidatur --> dividatur +// ut, volui: unchanged +// no rule for veruina because we cannot distinguish it from volui +[uU] / {Vowel} { + switch(cv) { + case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break; + default: cv = VOWEL; add(yytext()); break; + } + } + +// 3.2 rules for v --> u + +// qvam --> quam +qv { cv = CONS; add("qu"); } // the replaced v still counts as consonant +Qv { cv = CONS; add("Qu"); } +QV { cv = CONS; add("QU"); } + +// febrvarius --> februarius +// curva: unchanged +{LR} [vV] { + switch(cv) { + case CONS: add(yytext().replace("v", "u").replace("V", "U")); break; + default: cv = CONS; add(yytext()); break; + } + } + +// februarivs --> februarius +v / {lb} {Cons} { cv = CONS; add("u"); } +V / {lb} {Cons} { cv = CONS; add("U"); } + +// 3.3 override default rule for . + +{Vowel} { cv = VOWEL; add(yytext()); } +{Cons} { cv = CONS; add(yytext()); } +[yY] { cv = 0; add(yytext()); } + +@ { problem = 1; cv = 0; add(yytext()); } +{LB} { add(yytext()); } +. 
{ problem = 1; cv = 0; add(yytext()); } // in particular from Arboreal: "〈" (2329), "〉" (232A), Ç, ç + +} + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } +} + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } +} + + +/* + +Assumptions: +- the routine is called word by word, with a \n at the end of the string +- line breaks within the word are marked as hyphen/soft hyphen plus space + + +TO DO: + +LA: Reconsider whether Ææęàèòùœ can be left out of the vowel class. They do no harm, though. (Or do they !?) Distinguish vowel classes before and after the u ? +LA: Go through the diacritics with Paul once more +LA: The disambiguations provided by the diacritics are still missing. +LA: is J really a problem case? +LA: are there words like super-rv... or super-lv... in lower or upper case? 
+ +*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexNL.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexNL.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,589 @@ +/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */ + +/* + * Normalization rules for Dutch text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 21.07.11 11:22 from the specification file + * MpdlNormalizerLexNL.lex + */ +public class MpdlNormalizerLexNL { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 3, 3 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\5"+ + "\40\0\1\1\2\0\1\1\20\0\1\1\5\0\1\1\1\0\1\1"+ + "\u0101\0\1\4\ufe80\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. 
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\4\0\2\1\1\2\1\3\1\4\1\5\1\6"; + + private static int [] zzUnpackAction() { + int [] result = new int[11]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\6\0\14\0\22\0\30\0\36\0\30\0\30"+ + "\0\30\0\30\0\30"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[11]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\5\1\6\1\5\1\0\1\5\1\7\1\5\1\6"+ + "\1\5\1\10\1\11\1\7\1\5\1\6\1\5\1\12"+ + "\1\11\1\7\1\5\1\6\1\5\1\13\1\11\1\7"+ + "\10\0\1\5\3\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[36]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 
0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\4\0\1\11\1\1\5\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[11]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string 
in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexNL(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexNL(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. 
+ * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 46) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. 
+ * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. 
+ * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break 
zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 5: + { switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } + case 7: break; + case 2: + { problem = 1; add(yytext()); + } + case 8: break; + case 4: + { add("s"); + } + case 9: break; + case 3: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 10: break; + case 6: + { switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } + case 11: break; + case 1: + { add(yytext()); + } + case 12: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexNL.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexNL.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,99 @@ +/* + * Normalization rules for Dutch text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexNL +%type java.lang.String +%unicode + +// Dutch: nl + +%states DISP, DICT, SEARCH + +%{ + private String original = ""; + private String normalized = ""; + private int problem 
= 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +hyphen = [-\u{00ad}] // hyphen and soft hyphen +LB = {hyphen} \u0020 +// lb = ({hyphen} \u0020)? + +END = \n + +%% + + { + +ſ { add("s"); } + +} + + +// default + +@ { problem = 1; add(yytext()); } +{LB} { add(yytext()); } +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } +} + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert + +TO DO: + +NL: vollständig? + +*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexTemplate.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexTemplate.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,89 @@ +/* + * Template for normalization rules + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexTemplate +%type java.lang.String +%unicode + +// Language: list of ISO codes + +%states DISP, DICT, SEARCH + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +hyphen = [-\u{00ad}] // hyphen and soft hyphen +LB = {hyphen} 
\u0020 +// lb = ({hyphen} \u0020)? + +END = \n + +%% + + { + +ſ { add("s"); } // sample rule + +} + + +// default rules + +@ { problem = 1; add(yytext()); } +{LB} { add(yytext()); } +. { add(yytext()); } + + +// at the end, determine which string to return + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } +} + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } +} + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexZH.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexZH.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,637 @@ +/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */ + +/* + * Normalization rules for Chinese text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-02-28 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 21.07.11 11:22 from the specification file + * MpdlNormalizerLexZH.lex + */ +public class MpdlNormalizerLexZH { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, 
k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 3, 3 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\2\45\0\1\1\1\0\1\1\15\0\1\20\41\0\1\1"+ + "\22\0\1\1\5\0\1\1\1\0\1\1\u4f84\0\1\3\176\0\1\4"+ + "\u035a\0\1\4\u0a9a\0\1\6\u0781\0\1\10\u057a\0\1\11\u06bd\0\1\12"+ + "\15\0\1\7\u0891\0\1\5\u1baf\0\1\13\340\0\1\14\u411a\0\1\16"+ + "\u040e\0\1\17\u1d8f\0\1\15\u05e2\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\4\0\1\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\1"+ + "\1\17\1\20\1\21"; + + private static int [] zzUnpackAction() { + int [] result = new int[22]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\21\0\42\0\63\0\104\0\104\0\104\0\104"+ + "\0\104\0\104\0\104\0\104\0\104\0\104\0\104\0\104"+ + "\0\104\0\104\0\125\0\104\0\104\0\104"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[22]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, 
int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\2\5\1\0\15\5\1\6\2\5\1\7\1\10\1\11"+ + "\1\12\1\13\1\14\1\15\1\16\1\17\1\20\1\21"+ + "\1\22\1\23\1\5\1\6\1\5\1\24\1\25\1\10"+ + "\1\11\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+ + "\1\21\1\22\1\23\1\5\1\6\1\5\1\24\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+ + "\1\20\1\21\1\22\1\23\1\5\1\6\40\0\1\26"+ + "\1\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[102]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\4\0\16\11\1\1\3\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[22]; + int offset = 0; + offset = 
zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += 
norm; + } + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexZH(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexZH(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 90) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? 
*/ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. 
+ */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. 
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 17: + { add("庶"); + } + case 18: break; + case 9: + { add("時"); + } + case 19: break; + case 2: + { problem = 1; add(yytext()); + } + case 20: break; + case 3: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 21: break; + case 10: + { add("歷"); + } + case 22: break; + case 13: + { add("面"); + } + case 23: break; + case 14: + { add("精"); + } + case 24: break; + case 12: + { add("陰"); + } + case 25: break; + case 8: + { add("床"); + } + case 26: break; + case 1: + { add(yytext()); + } + case 27: break; + case 15: + { add(""); + } + case 28: break; + case 7: + { add("并"); + } + case 29: break; + case 4: + { add("併"); + } + case 30: break; + case 11: + { add("為"); + } + case 31: break; + case 6: + { add("奇"); + } + case 32: break; + case 5: + { add("叟"); + } + case 33: break; + case 16: + { switch (problem) { + case 1: return ""; + default: return normalized; + } + } + case 34: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexZH.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexZH.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,120 @@ +/* + * Normalization rules for Chinese text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-02-28 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexZH +%type java.lang.String +%unicode + +// classical Chinese: zh, zho, zho-Hant + +%states DISP, DICT, SEARCH + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += 
yytext(); + normalized += norm; + } +%} + +ZWS = [\u{200b}] + +END = \n + +%% + +// Normalization in Chinese means that character variants will be replaced by their standard characters +// if there is no doubt about what the standard character is. + +// The input is supposed to be a single Chinese character, but strings of characters are also handled correctly. + + { + +// Codepoint < FFFF + +倂 { add("併"); } // 5002 --> 4F75 +傁 | 叜 { add("叟"); } // 5081, 53DC --> 53DF +竒 { add("奇"); } // 7AD2 --> 5947 +幷 { add("并"); } // 5E77 --> 5E76 +牀 { add("床"); } // 7240 --> 5E8A +旹 { add("時"); } // 65F9 --> 6642 +歴 { add("歷"); } // 6B74 --> 6B77 +爲 { add("為"); } // 7232 --> 70BA +隂 { add("陰"); } // 9682 --> 9670 +靣 { add("面"); } // 9763 --> 9762 +精 { add("精"); } // FA1D --> 7CBE (FA1D is a compatibility ideograph) + +// Codepoint > FFFF + +// note that [ABC] is not equivalent to A | B | C for codepoints above FFFF due to their internal encoding: +// for example, 庶 (U+2F88D) is represented as a sequence of two codepoints: D87E DC8D +// i.e. never use [ABC] but A | B | C + +庶 { add("庶"); } // 2F88D --> 5EB6 (2F88D is a compatibility ideograph) + +} + + { + +// remove Zero Width Space (if there is any in the the input string) + +{ZWS} { add(""); } + +} + +// default + +@ { problem = 1; add(yytext()); } +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized; + } + } +} + + +/* + +Annahmen: +- die Routine wird zeichenweise (oder mit mehr als einem Zeichen) aufgerufen, mit einem \n am Ende des Strings +- es gibt keine Zeilenumbrüche + +TO DO: + +ZH: Liste ergänzen +ZH: was ist, wenn man wirklich die Variante, die im Text steht, nachschlagen will? Dann muss man das Zeichen wohl selbst rauskopieren. +ZH: sollen lateinische Buchstaben bewirken, dass problem = 1 ist? 
+ZH: sollen Zeilenumbrüche rausgenommen werden, auch wenn sie in korrekt markiertem Text nicht vorkommen? +ZH: was ist, wenn beijing übergeben wird und einen Zeilenumbruch enthält? Verlässt sich der Wrapper darauf, dass die Zeichenzahl gleich bleibt, oder macht er ein hyphen rein? was macht oder ? + +*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/DBRegularizationHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/DBRegularizationHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,146 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.reg; + +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; + +public class DBRegularizationHandler { + private String dbDirectory; + private DbEnvRegularization regDbEnv; + + public DBRegularizationHandler(String dbDir) { + this.dbDirectory = dbDir; + } + + public void start() throws ApplicationException { + regDbEnv = new DbEnvRegularization(); + regDbEnv.setDataDir(dbDirectory); + regDbEnv.init(); // open databases in read/write mode + } + + public void openDatabases() throws ApplicationException { + regDbEnv.openDatabases(); + } + + public void closeDatabases() throws ApplicationException { + regDbEnv.close(); + } + + public void deleteData() throws ApplicationException { + regDbEnv.removeDatabases(); + } + + public void writeOrigReg(Regularization reg) throws ApplicationException { + try { + String language = Language.getInstance().getLanguageId(reg.getLanguage()); + String keyStr = language + "###" + 
reg.getOrig(); + String valueStr = reg.getXmlString(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + DatabaseEntry dbEntryValue = new DatabaseEntry(valueStr.getBytes("utf-8")); + Database origDB = regDbEnv.getOrigDB(); + origDB.put(null, dbEntryKey, dbEntryValue); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + public void writeNormReg(Regularization reg) throws ApplicationException { /* stores reg.getXmlString() in the norm database under the key "languageId###norm"; database and encoding failures are rethrown as ApplicationException */ + try { + String language = Language.getInstance().getLanguageId(reg.getLanguage()); + String keyStr = language + "###" + reg.getNorm(); + String valueStr = reg.getXmlString(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + DatabaseEntry dbEntryValue = new DatabaseEntry(valueStr.getBytes("utf-8")); + Database normDB = regDbEnv.getNormDB(); + normDB.put(null, dbEntryKey, dbEntryValue); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + public void deleteReg(Regularization reg) throws ApplicationException { /* deletes by key from both databases: "languageId###orig" in the orig database and "languageId###norm" in the norm database; database and encoding failures are rethrown as ApplicationException */ + try { + String language = Language.getInstance().getLanguageId(reg.getLanguage()); + String keyStrOrig = language + "###" + reg.getOrig(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStrOrig.getBytes("utf-8")); + Database origDB = regDbEnv.getOrigDB(); + origDB.delete(null, dbEntryKey); + String keyStrNorm = language + "###" + reg.getNorm(); /* must use the same language id that writeNormReg puts into the key; the raw reg.getLanguage() misses the entry whenever getLanguageId() maps it to a different id */ + dbEntryKey = new DatabaseEntry(keyStrNorm.getBytes("utf-8")); + Database normDB = regDbEnv.getNormDB(); + normDB.delete(null, dbEntryKey); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + public ArrayList readRegsByOrig(String lang, String orig) throws ApplicationException { + String language =
Language.getInstance().getLanguageId(lang); /* the lookup key is "languageId###orig", the same shape writeOrigReg stores */ + ArrayList retRegs = new ArrayList(); + String hashKey = language + "###" + orig; + try { + Database origDB = regDbEnv.getOrigDB(); + Cursor cursor = origDB.openCursor(null, null); + try { /* everything that uses the cursor runs inside this block so the finally below always closes it */ + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { /* one iteration per duplicate record stored under the key */ + byte[] foundValueBytes = foundValue.getData(); + String foundValueStr = new String(foundValueBytes, "utf-8"); + Regularization reg = Regularization.getInstance(foundValueStr); + retRegs.add(reg); + operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT); + } + } finally { + cursor.close(); /* previously skipped when a cursor operation or Regularization.getInstance threw, leaving the cursor open */ + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retRegs; + } + + public ArrayList readRegsByNorm(String lang, String norm) throws ApplicationException { /* returns every regularization stored in the norm database under "languageId###norm" (an empty list when there is none); database and encoding failures are rethrown as ApplicationException */ + String language = Language.getInstance().getLanguageId(lang); + ArrayList retRegs = new ArrayList(); + String hashKey = language + "###" + norm; + try { + Database normDB = regDbEnv.getNormDB(); + Cursor cursor = normDB.openCursor(null, null); + try { /* everything that uses the cursor runs inside this block so the finally below always closes it */ + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { /* one iteration per duplicate record stored under the key */ + byte[] foundValueBytes = foundValue.getData(); + String foundValueStr = new String(foundValueBytes, "utf-8"); + Regularization reg = Regularization.getInstance(foundValueStr); + retRegs.add(reg); + operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT); + } + } finally { + cursor.close(); /* previously skipped when a cursor operation or Regularization.getInstance threw, leaving the cursor open */ + } + } catch (DatabaseException e) { +
throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retRegs; + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/DbEnvRegularization.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/DbEnvRegularization.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,100 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.reg; + +import java.io.File; + +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseConfig; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class DbEnvRegularization { + private String dataDir; + private File envPath; + private Environment env; + private EnvironmentConfig envConfig; + private DatabaseConfig dbConfig; + private Database origDB; + private Database normDB; + + public DbEnvRegularization() { + } + + public void setDataDir(String dataDir) { + this.dataDir = dataDir; + } + + public void init() throws ApplicationException { + try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envConfig.setReadOnly(false); + dbConfig.setReadOnly(false); + envConfig.setAllowCreate(true); + dbConfig.setAllowCreate(true); + envConfig.setTransactional(true); + dbConfig.setTransactional(true); + // allow duplicates for keys + dbConfig.setSortedDuplicates(true); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void openDatabases() throws ApplicationException { + try { + // open databases (and create them if they do not exist) + origDB = env.openDatabase(null, "OrigDB", dbConfig); + normDB = env.openDatabase(null, 
"NormDB", dbConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void removeDatabases() throws ApplicationException { + try { + if (origDB != null) + origDB.close(); + if (normDB != null) + normDB.close(); + env.removeDatabase(null, "OrigDB"); + env.removeDatabase(null, "NormDB"); + origDB = null; + normDB = null; + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public Environment getEnv() { + return env; + } + + public Database getNormDB() { + return normDB; + } + + public Database getOrigDB() { + return origDB; + } + + public void close() throws ApplicationException { + if (env != null) { + try { + if (origDB != null) + origDB.close(); + if (normDB != null) + normDB.close(); + if (env != null) + env.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + } +} + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/Regularization.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/Regularization.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,89 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.reg; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; +import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator; + +public class Regularization { + private String language; + private String orig; + private String norm; + private String source; + private int sourcePosition; + + public Regularization(String language, String orig, String norm, String source) { + this.language = language; + this.orig = orig; + this.norm = norm; + this.source = source; + } + + public static Regularization getInstance(String xmlStr) throws ApplicationException { + XQueryEvaluator xQueryEvaluator = new XQueryEvaluator(); + String language = 
xQueryEvaluator.evaluateAsStringValue(xmlStr, "//language"); + String orig = xQueryEvaluator.evaluateAsStringValue(xmlStr, "//orig"); + String norm = xQueryEvaluator.evaluateAsStringValue(xmlStr, "//norm"); + String source = xQueryEvaluator.evaluateAsStringValue(xmlStr, "//source"); + String sourcePosStr = xQueryEvaluator.evaluateAsStringValue(xmlStr, "//source/@position"); + int sourcePos = new Integer(sourcePosStr); + Regularization reg = new Regularization(language, orig, norm, source); + reg.setSourcePosition(sourcePos); + return reg; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public String getOrig() { + return orig; + } + + public void setOrig(String orig) { + this.orig = orig; + } + + public String getNorm() { + return norm; + } + + public void setNorm(String norm) { + this.norm = norm; + } + + public String getSource() { + return source; + } + + public void setSource(String source) { + this.source = source; + } + + public int getSourcePosition() { + return sourcePosition; + } + + public void setSourcePosition(int sourcePosition) { + this.sourcePosition = sourcePosition; + } + + public String getXmlString() { + String xmlString = "\n"; + if (language != null) + xmlString += " " + language + "\n"; + if (orig != null) + xmlString += " " + StringUtils.deresolveXmlEntities(orig) + "\n"; + if (norm != null) + xmlString += " " + StringUtils.deresolveXmlEntities(norm) + "\n"; + if (source != null) + xmlString += " " + StringUtils.deresolveXmlEntities(source) + "\n"; + xmlString += "\n"; + return xmlString; + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/RegularizationManager.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/RegularizationManager.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,118 @@ +package 
de.mpg.mpiwg.berlin.mpdl.lt.text.reg; + +import java.util.ArrayList; +import java.util.Date; +import java.util.Hashtable; +import java.util.logging.Logger; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; +import de.mpg.mpiwg.berlin.mpdl.lucene.util.LuceneUtil; +import de.mpg.mpiwg.berlin.mpdl.util.Util; + +public class RegularizationManager { + private static RegularizationManager instance; + private static String DATA_DIR = Constants.getInstance().getDataDir(); + private static String REGULARIZATION_DB_DIR = DATA_DIR + "/dataBerkeleyDB/regularization"; + private static Logger LOGGER = Logger.getLogger(RegularizationManager.class.getName()); + private DBRegularizationHandler dbRegHandler; + private Hashtable> regsOrig; + private Hashtable> regsNorm; + private Date beginOfOperation; + private Date endOfOperation; + + public static RegularizationManager getInstance() throws ApplicationException { + if (instance == null) { + instance = new RegularizationManager(); + instance.init(); + } + return instance; + } + + public static void main(String[] args) throws ApplicationException { + getInstance(); + instance.beginOperation(); + System.out.print("Start ..."); + ArrayList regs = instance.findRegsByNorm("la", "Illiusque"); + ArrayList regs2 = instance.findRegsByNorm("la", "Itaque"); + Regularization bla = regs.get(0); + Regularization bla2 = regs2.get(0); + + instance.end(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + System.out.println("Needed time: " + elapsedTime + " seconds"); + } + + private void init() throws ApplicationException { + regsOrig = new Hashtable>(); + regsNorm = new Hashtable>(); + dbRegHandler = new DBRegularizationHandler(REGULARIZATION_DB_DIR); + dbRegHandler.start(); + dbRegHandler.openDatabases(); + LOGGER.info("Regularization db cache: 
opened"); + } + + public ArrayList findRegsByOrig(String language, String orig) throws ApplicationException { + orig = orig.toLowerCase(); + String hashKey = language + "###" + orig; + ArrayList regs = regsOrig.get(hashKey); + if (regs == null) { + regs = dbRegHandler.readRegsByOrig(language, orig); + if (regs == null || regs.isEmpty()) + regsOrig.put(hashKey, new ArrayList()); + else + regsOrig.put(hashKey, regs); + } + return regs; + } + + public ArrayList findRegsByNorm(String language, String norm) throws ApplicationException { /* cached lookup of the regularizations for a lower-cased normalized form; the cache key is "language###norm" and the database is read only on a cache miss, the same scheme findRegsByOrig uses */ + norm = norm.toLowerCase(); + String hashKey = language + "###" + norm; + ArrayList regs = regsNorm.get(hashKey); + if (regs == null) { /* only a missing cache entry is a miss; the former extra isEmpty() test re-read the database on every call for keys without regularizations */ + regs = dbRegHandler.readRegsByNorm(language, norm); + if (regs == null || regs.isEmpty()) /* remember "nothing found" as an empty list so the next call is answered from the cache */ + regsNorm.put(hashKey, new ArrayList()); + else + regsNorm.put(hashKey, regs); + } + return regs; + } + + public ArrayList getRegOrigsByNormLuceneQueryString(String language, String luceneQueryString) throws ApplicationException { + ArrayList regForms = new ArrayList(); + LuceneUtil luceneUtil = LuceneUtil.getInstance(); + ArrayList variants = luceneUtil.getVariantsFromLuceneQuery(luceneQueryString); + if (variants != null) { + for (int i=0; i regs = findRegsByNorm(language, variant); + if (regs != null) { + for (int j=0; j getTokens() throws ApplicationException { + if (Language.getInstance().isChinese(language)) { + return getTokensByChineseTokenizer(input, normFunctions); + } + ArrayList tokens = new ArrayList(); + try { + reset(input); + CharTermAttribute charTermAttribute = getAttribute(CharTermAttribute.class); + OffsetAttribute offsetAttribute = getAttribute(OffsetAttribute.class); + while (incrementToken()) { + String term = charTermAttribute.toString(); + int start = offsetAttribute.startOffset(); + int end = offsetAttribute.endOffset(); + String normedTerm = normalizer.normalize(term); + Token token = new Token(start, end, normedTerm); + tokens.add(token); + } + end(); // TODO needed ?
+ close(); // TODO needed ? + } catch (IOException e) { + throw new ApplicationException(e); + } + return tokens; + } + + /** Returns true iff a character should be included in a token. */ + protected boolean isTokenChar(int codepoint) { + boolean isTokenChar = true; + char c = (char) codepoint; + switch (c) { + case ' ': isTokenChar = false; break; + case '.': isTokenChar = false; break; + case ',': isTokenChar = false; break; + case '!': isTokenChar = false; break; + case '?': isTokenChar = false; break; + case ';': isTokenChar = false; break; + case ':': isTokenChar = false; break; + case '(': isTokenChar = false; break; + case ')': isTokenChar = false; break; + case '[': isTokenChar = false; break; + case ']': isTokenChar = false; break; + case '{': isTokenChar = false; break; + case '}': isTokenChar = false; break; + case '<': isTokenChar = false; break; + case '>': isTokenChar = false; break; + case '/': isTokenChar = false; break; + case '=': isTokenChar = false; break; + case '&': isTokenChar = false; break; + case '+': isTokenChar = false; break; + case '#': isTokenChar = false; break; + case '"': isTokenChar = false; break; + case '': isTokenChar = false; break; + case '': isTokenChar = false; break; + case '': isTokenChar = false; break; + case '': isTokenChar = false; break; + case '\'': isTokenChar = false; break; + case '\t': isTokenChar = false; break; // do not break words which have tabs in it + case '\n': isTokenChar = false; break; // do not break words which are on another line + case '\u2425': isTokenChar = false; break; // special char for marking xml elements + } + return isTokenChar; + } + + /** Called on each token character to normalize it before it is added to the + * token. The default implementation does nothing. Subclasses may use this + * to, e.g., lowercase tokens. */ + protected char normalize(char c) { + return c; + } + protected int normalize(int c) { + return c; + } + + /* + * Code is copied from Lucene 3.4. 
CharTokenizer.incrementToken() + * @see org.apache.lucene.analysis.TokenStream#incrementToken() + */ + public boolean incrementToken() throws IOException { + clearAttributes(); + int length = 0; + int start = -1; // this variable is always initialized + char[] buffer = termAtt.buffer(); + while (true) { + if (bufferIndex >= dataLen) { + offset += dataLen; + if(! charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils + dataLen = 0; // so next offset += dataLen won't decrement offset + if (length > 0) { + break; + } else { + finalOffset = correctOffset(offset); + return false; + } + } + dataLen = ioBuffer.getLength(); + bufferIndex = 0; + } + // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone + int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex); + bufferIndex += Character.charCount(c); + if (isTokenChar(c)) { // if it's a token char + if (length == 0) { // start of token + start = offset + bufferIndex - 1; + } else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds + buffer = termAtt.resizeBuffer(2 + length); // make sure a supplementary fits in the buffer + } + length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized + if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test + break; + } else if (length > 0) // at non-Letter w/ chars + break; // return 'em + } + termAtt.setLength(length); + offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start + length)); + return true; + } + + /* + * Code is copied from Lucene 3.4. CharTokenizer.end() + * @see org.apache.lucene.analysis.TokenStream#end() + */ + @Override + public final void end() { + // set final offset + offsetAtt.setOffset(finalOffset, finalOffset); + } + + /* + * Code is copied from Lucene 3.4. 
CharTokenizer.reset() + * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader) + */ + @Override + public void reset(Reader input) throws IOException { + super.reset(input); + bufferIndex = 0; + offset = 0; + dataLen = 0; + finalOffset = 0; + ioBuffer.reset(); // make sure to reset the IO buffer!! + this.normalizer = new Normalizer(normFunctions, language); + } + + private ArrayList getTokensByChineseTokenizer(Reader input, String[] normFunctions) throws ApplicationException { + StandardTokenizer chineseTokenizer = new StandardTokenizer(Version.LUCENE_34, input); // is recommended instead of ChineseTokenizer which is deprecated + ArrayList tokens = new ArrayList(); + try { + reset(input); + chineseTokenizer.reset(input); + CharTermAttribute charTermAttribute = chineseTokenizer.getAttribute(CharTermAttribute.class); + OffsetAttribute offsetAttribute = chineseTokenizer.getAttribute(OffsetAttribute.class); + while (chineseTokenizer.incrementToken()) { + String term = charTermAttribute.toString(); + String normedTerm = normalizer.normalize(term); + int start = offsetAttribute.startOffset(); + int end = offsetAttribute.endOffset(); + Token token = new Token(start, end, normedTerm); + tokens.add(token); + } + chineseTokenizer.end(); // TODO needed ? + chineseTokenizer.close(); // TODO needed ? + end(); // TODO needed ? + close(); // TODO needed ? 
+ } catch (IOException e) { + throw new ApplicationException(e); + } + return tokens; + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,68 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; + +import java.io.IOException; +import java.io.Reader; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; + +public class XmlTokenizer { + private Reader input; + private String language = "eng"; // default: english + private String[] normFunctions = {"specialNorm"}; // default: use special norm function + private String[] nwbElements = {"lb", "br", "cb", "figure", "image", "handwritten", "anchor", "emph", "note"}; // non word breaking elements, default: these elements + private String[] stopElements = {}; // default: no stop elements + private String[] outputOptions = {}; + + public XmlTokenizer(Reader input) { + this.input = input; + } + + public void setLanguage(String lang) { + String language = Language.getInstance().getLanguageId(lang); + this.language = language; + } + + public void setNormFunctions(String[] normFunctions) { + this.normFunctions = normFunctions; + } + + public void setNWBElements(String[] nwbElements) { + this.nwbElements = nwbElements; + } + + public void setStopElements(String[] stopElements) { + this.stopElements = stopElements; + } + + public void setOutputOptions(String[] outputOptions) { + this.outputOptions = outputOptions; + } + + public String tokenize() throws ApplicationException { + String 
retString = null; + try { + XmlTokenizerContentHandler dictContentHandler = new XmlTokenizerContentHandler(normFunctions, language); + dictContentHandler.setStopElements(stopElements); + dictContentHandler.setNWBElements(nwbElements); + dictContentHandler.setOutputOptions(outputOptions); + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(dictContentHandler); + InputSource inputSource = new InputSource(input); + xmlParser.parse(inputSource); + retString = dictContentHandler.getXmlFragment(); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return retString; + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,426 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; + +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Hashtable; + +import org.xml.sax.*; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; + +public class XmlTokenizerContentHandler implements ContentHandler { + private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString(); // word delimiting element + private static String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element + private static int COMPLEX_ELEMENT_MARK_SIZE = COMPLEX_ELEMENT_MARK.length(); + private 
static int ELEMENT_TYPE_CHARACTERS = 1; + private static int ELEMENT_TYPE_COMPLEX = 2; + private String[] normalizeFunctions = {}; // default: without normalize functions + private String[] nwbElements = {}; // non word breaking elements, default: these elements + private String[] stopElements = {}; // default: no stop elements + private String[] outputOptions = {}; + private String xmlnsString = ""; + private String language; + private String outputXmlFragment = ""; + private Element rootElement; + private Element currentElement; + private ArrayList elementQueue; + + public XmlTokenizerContentHandler(String[] normalizeFunctions, String language) throws ApplicationException { + if (normalizeFunctions == null) { + String[] emptyFunctions = {}; + this.normalizeFunctions = emptyFunctions; + } else { + this.normalizeFunctions = normalizeFunctions; + } + this.language = language; + } + + public void setNWBElements(String[] nwbElements) { + this.nwbElements = nwbElements; + } + + public void setStopElements(String[] stopElements) { + this.stopElements = stopElements; + } + + public void setOutputOptions(String[] outputOptions) { + this.outputOptions = outputOptions; + } + + public String getXmlFragment() { + return outputXmlFragment; + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + try { + String rootElemToStr = rootElement.toXmlString(); + write(rootElemToStr); + write("\n"); + } catch (NullPointerException e) { + throw new SAXException(e); + } + } + + public void characters(char[] c, int start, int length) throws SAXException { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (charactersStr != null && ! 
charactersStr.equals("")) { + if (currentElement != null) { + Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS); + charElement.value = StringUtils.deresolveXmlEntities(charactersStr); + if (currentElement.composites == null) + currentElement.composites = new ArrayList(); + currentElement.composites.add(charElement); + } + } + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(Locator locator) { + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; + if (prefix != null && prefix.equals("")) + xmlnsString = "xmlns" + prefix + "=\"" + uri + "\" "; + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + if (elementQueue == null) + elementQueue = new ArrayList(); + Element newElement = new Element(name); // element of type: complex + if (currentElement != null) { + if (currentElement.composites == null) + currentElement.composites = new ArrayList(); + if (currentElement.lang != null) + newElement.lang = currentElement.lang; // language is inherited to childs + currentElement.composites.add(newElement); + } + currentElement = newElement; + int attrSize = attrs.getLength(); + String attrString = ""; + for (int i=0; i 0) { + int lastIndex = elementQueue.size() - 1; + elementQueue.remove(lastIndex); + } + if (elementQueue != null && elementQueue.size() > 0) { + int lastIndex = elementQueue.size() - 1; + currentElement = elementQueue.get(lastIndex); + } else { + currentElement = null; + } + } + + private boolean withForms() { + boolean result = false; + for (int i=0; i< outputOptions.length; 
i++) { + String function = outputOptions[i]; + if (function.equals("withForms")) + return true; + } + return result; + } + + private boolean withLemmas() { + boolean result = false; + for (int i=0; i< outputOptions.length; i++) { + String function = outputOptions[i]; + if (function.equals("withLemmas")) + return true; + } + return result; + } + + private void write(String outStr) throws SAXException { + outputXmlFragment += outStr; + } + + private class Element { + private int type; + private String name; + private String xmlnsString; + private String attrString; + private String value; + private String lang; // normally value of attribute xml:lang or the inherited xml:lang value of the father node + private ArrayList composites; + + private Element(String name) { + this.type = ELEMENT_TYPE_COMPLEX; + this.name = name; + } + + private Element(String name, int type) { + this.type = type; + this.name = name; + } + + private boolean isComplex() { + boolean isComplex = false; + if (type == ELEMENT_TYPE_COMPLEX) + isComplex = true; + return isComplex; + } + + private boolean isWordDelimiterElement() { + boolean isWordDelimiterElement = true; + for (int i=0; i"; + } else { + retString = retString + "<" + name + " " + xmlNsString + attrString + ">"; + } + if (composites != null) { + String compositesCharsWithMarks = ""; + ArrayList complexElements = new ArrayList(); + for (int i=0; i) + } else { + compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_MARK; // add a special mark symbol at the position of the "word delimiter element" (e.g. ) + } + complexElements.add(composite); + } + } + // compositesCharsWithMarks = compositesCharsWithMarks.replaceAll(COMPLEX_ELEMENT_NWD_MARK + " +", COMPLEX_ELEMENT_NWD_MARK); // remove Blanks after the non word breaking mark (e.g. 
"praebi ta" is changed to "praebita") + String compositesCharsWithMarksWithWordTags = insertWordTags(compositesCharsWithMarks, elemLanguage); + compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.replaceAll(COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_MARK); // mark symbols are unified to COMPLEX_ELEMENT_MARK to replace them by the element values + if (complexElements.size() > 0) { + for (int i=0; i 0) { + firstPiece = compositesCharsWithMarksWithWordTags.substring(0, indexComplexElemCompositesCharsWithMarks); + compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(indexComplexElemCompositesCharsWithMarks); + } + retString = retString + firstPiece + complexElementStr; + compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(COMPLEX_ELEMENT_MARK_SIZE); + } + retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added + } else { + retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added + } + } + retString = retString + ""; + } + return retString; + } + + private String insertWordTags(String charactersStrDeresolved, String language) throws SAXException { + String charactersStr = StringUtils.resolveXmlEntities(charactersStrDeresolved); + String retStr = ""; + try { + Tokenizer tokenizer = new Tokenizer(new StringReader(charactersStr)); + tokenizer.setLanguage(language); + tokenizer.setNormFunctions(normalizeFunctions); + ArrayList tokens = tokenizer.getTokens(); + int endPos = 0; + for (int i=0; i < tokens.size(); i++) { + Token token = tokens.get(i); + String wordForm = token.getContent(); + int startPos = token.getStart(); + String beforeStr = charactersStr.substring(endPos, startPos); + endPos = token.getEnd(); + String beforeStrDeresolved = StringUtils.deresolveXmlEntities(beforeStr); + String origWordForm = charactersStr.substring(startPos, endPos); + String wordTag = insertWordTags(wordForm, language, 
origWordForm); + retStr = retStr + beforeStrDeresolved + wordTag; + } + String lastAfterStr = charactersStr.substring(endPos); + String lastAfterStrDeresolved = StringUtils.deresolveXmlEntities(lastAfterStr); + retStr = retStr + lastAfterStrDeresolved; + } catch (ApplicationException e) { + throw new SAXException(e); + } + return retStr; + } + + private String insertWordTags(String wordForm, String language, String origWordForm) throws ApplicationException { + String wordTag = null; + if (origWordForm != null && origWordForm.equals(COMPLEX_ELEMENT_NWD_MARK)) + return origWordForm; + if (isStopElement()) + return origWordForm; + wordForm = removeSpecialSymbols(wordForm); + wordForm = wordForm.toLowerCase(); + String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm); + ArrayList lemmas = null; + if (withForms() || withLemmas()) { + LexHandler lexHandler = LexHandler.getInstance(); + lemmas = lexHandler.getLemmas(wordForm, "form", language, "none"); + } + wordTag = insertWordTags(origWordFormDeresolved, wordForm, language, null, lemmas); + return wordTag; + } + + /** + * + * @param origWordToken could contain nwd marks + * @param wordForm contains no nwd marks + * @param language + * @param origWordFormNormalized + * @param lemmas + * @return for each substring between nwd marks create a word tag + */ + private String insertWordTags(String origWordToken, String wordForm, String language, String origWordFormNormalized, ArrayList lemmas) { + if (origWordToken.isEmpty()) + return origWordToken; + if (origWordToken.equals(COMPLEX_ELEMENT_NWD_MARK)) + return COMPLEX_ELEMENT_NWD_MARK; + String retWordTags = ""; + String origWordTokenTmp = origWordToken; + while (! 
origWordTokenTmp.isEmpty()) { + if (origWordTokenTmp.startsWith(COMPLEX_ELEMENT_NWD_MARK)) { // single nwd mark + origWordTokenTmp = origWordTokenTmp.substring(1); + retWordTags = retWordTags + COMPLEX_ELEMENT_NWD_MARK; + } else { + int indexUpToNWD = origWordTokenTmp.indexOf(COMPLEX_ELEMENT_NWD_MARK); + if (indexUpToNWD != -1) { // not end of string reached + String origWordTokenFragment = origWordTokenTmp.substring(0, indexUpToNWD); + String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas); + retWordTags = retWordTags + origWordTokenFragmentWithTags + COMPLEX_ELEMENT_NWD_MARK; + origWordTokenTmp = origWordTokenTmp.substring(indexUpToNWD + 1); + } else { // end of string reached + String origWordTokenFragment = origWordTokenTmp.substring(0, origWordTokenTmp.length()); + String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas); + retWordTags = retWordTags + origWordTokenFragmentWithTags; + origWordTokenTmp = ""; // finente + } + } + } + return retWordTags; + } + + private String getWordTag(String origWordForm, String wordForm, String language, String origWordFormNormalized, ArrayList lemmas) { + if (origWordForm == null || origWordForm.isEmpty()) + return ""; + String langISOCode = Language.getInstance().getISO639Code(language); + String retStr = " formsHashtable = new Hashtable(); + for (int i=0; i < lemmas.size(); i++) { + Lemma lemma = lemmas.get(i); + ArrayList lemmaForms = lemma.getFormsList(); + for (int j=0; j < lemmaForms.size(); j++) { + Form form = lemmaForms.get(j); + formsHashtable.put(form.getFormName(), form); + } + String lemmaName = lemma.getLemmaName(); + lemmasStr = lemmasStr + lemmaName + " "; + } + ArrayList forms = new ArrayList(); + forms.addAll(formsHashtable.values()); + Collections.sort(forms); + for (int i=0; i < forms.size(); i++) { + Form form = forms.get(i); + String formName = form.getFormName(); + 
formName = StringUtils.forXML(formName); + formsStr = formsStr + formName + " "; + } + if (formsStr.endsWith(" ")) + formsStr = formsStr.substring(0, formsStr.length() - 1); + if (lemmasStr.endsWith(" ")) + lemmasStr = lemmasStr.substring(0, lemmasStr.length() - 1); + if (withForms()) + retStr = retStr + " forms=\"" + formsStr + "\""; + if (withLemmas()) + retStr = retStr + " lemmas=\"" + lemmasStr + "\""; + } + retStr = retStr + ">" + origWordForm + ""; + return retStr; + } + + private String removeSpecialSymbols(String inputStr) { + String retStr = inputStr.replaceAll(" |\n|\t|-|\u2424|\u2425", ""); + return retStr; + } + + } +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Betacode2Unicode.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Betacode2Unicode.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,332 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +%% +%{ + /* + * Betacode to Unicode conversion + */ + + private String toUnicode(int in) { + char c = (char) in; + Character ch = new Character(c); + String retString = ch.toString(); + return retString; + } + +%} + +%class Betacode2UnicodeLex +%public +%type java.lang.String +%unicode +%% + + +"<"[^>]+">" { return yytext(); } + +"*j" { return "H"; } +"j" { return "h"; } +"*v" { return "F"; } +"v" { return "f"; } +"*s" { return toUnicode(0x03a3); } + +"!" 
{ return "."; } +":" { return toUnicode(0x00B7); } /* MPDL update */ + +"a)" { return toUnicode(0x1F00); } +"a(" { return toUnicode(0x1F01); } +"a)\\" { return toUnicode(0x1F02); } +"a(\\" { return toUnicode(0x1F03); } +"a)/" { return toUnicode(0x1F04); } +"a(/" { return toUnicode(0x1F05); } +"a)=" { return toUnicode(0x1F06); } +"a(=" { return toUnicode(0x1F07); } +"*)a" { return toUnicode(0x1F08); } +"*(a" { return toUnicode(0x1F09); } +"*)\\a" { return toUnicode(0x1F0A); } +"*(\\a" { return toUnicode(0x1F0B); } +"*)/a" { return toUnicode(0x1F0C); } +"*(/a" { return toUnicode(0x1F0D); } +"*)=a" { return toUnicode(0x1F0E); } +"*(=a" { return toUnicode(0x1F0F); } +"e)" { return toUnicode(0x1F10); } +"e(" { return toUnicode(0x1F11); } +"e)\\" { return toUnicode(0x1F12); } +"e(\\" { return toUnicode(0x1F13); } +"e)/" { return toUnicode(0x1F14); } +"e(/" { return toUnicode(0x1F15); } +"*)e" { return toUnicode(0x1F18); } +"*(e" { return toUnicode(0x1F19); } +"*)\\e" { return toUnicode(0x1F1A); } +"*(\\e" { return toUnicode(0x1F1B); } +"*)/e" { return toUnicode(0x1F1C); } +"*(/e" { return toUnicode(0x1F1D); } +"h)" { return toUnicode(0x1F20); } +"h(" { return toUnicode(0x1F21); } +"h)\\" { return toUnicode(0x1F22); } +"h(\\" { return toUnicode(0x1F23); } +"h)/" { return toUnicode(0x1F24); } +"h(/" { return toUnicode(0x1F25); } +"h)=" { return toUnicode(0x1F26); } +"h(=" { return toUnicode(0x1F27); } +"*)h" { return toUnicode(0x1F28); } +"*(h" { return toUnicode(0x1F29); } +"*)\\h" { return toUnicode(0x1F2A); } +"*(\\h" { return toUnicode(0x1F2B); } +"*)/h" { return toUnicode(0x1F2C); } +"*(/h" { return toUnicode(0x1F2D); } +"*)=h" { return toUnicode(0x1F2E); } +"*(=h" { return toUnicode(0x1F2F); } +"i)" { return toUnicode(0x1F30); } +"i(" { return toUnicode(0x1F31); } +"i)\\" { return toUnicode(0x1F32); } +"i(\\" { return toUnicode(0x1F33); } +"i)/" { return toUnicode(0x1F34); } +"i(/" { return toUnicode(0x1F35); } +"i)=" { return toUnicode(0x1F36); } +"i(=" { return 
toUnicode(0x1F37); } +"*)i" { return toUnicode(0x1F38); } +"*(i" { return toUnicode(0x1F39); } +"*)\\i" { return toUnicode(0x1F3A); } +"*(\\i" { return toUnicode(0x1F3B); } +"*)/i" { return toUnicode(0x1F3C); } +"*(/i" { return toUnicode(0x1F3D); } +"*)=i" { return toUnicode(0x1F3E); } +"*(=i" { return toUnicode(0x1F3F); } +"o)" { return toUnicode(0x1F40); } +"o(" { return toUnicode(0x1F41); } +"o)\\" { return toUnicode(0x1F42); } +"o(\\" { return toUnicode(0x1F43); } +"o)/" { return toUnicode(0x1F44); } +"o(/" { return toUnicode(0x1F45); } +"*)o" { return toUnicode(0x1F48); } +"*(o" { return toUnicode(0x1F49); } +"*)\\o" { return toUnicode(0x1F4A); } +"*(\\o" { return toUnicode(0x1F4B); } +"*)/o" { return toUnicode(0x1F4C); } +"*(/o" { return toUnicode(0x1F4D); } +"u)" { return toUnicode(0x1F50); } +"u(" { return toUnicode(0x1F51); } +"u)\\" { return toUnicode(0x1F52); } +"u(\\" { return toUnicode(0x1F53); } +"u)/" { return toUnicode(0x1F54); } +"u(/" { return toUnicode(0x1F55); } +"u)=" { return toUnicode(0x1F56); } +"u(=" { return toUnicode(0x1F57); } +"*(u" { return toUnicode(0x1F59); } +"*(\\u" { return toUnicode(0x1F5B); } +"*(/u" { return toUnicode(0x1F5D); } +"*(=u" { return toUnicode(0x1F5F); } +"w)" { return toUnicode(0x1F60); } +"w(" { return toUnicode(0x1F61); } +"w)\\" { return toUnicode(0x1F62); } +"w(\\" { return toUnicode(0x1F63); } +"w)/" { return toUnicode(0x1F64); } +"w(/" { return toUnicode(0x1F65); } +"w)=" { return toUnicode(0x1F66); } +"w(=" { return toUnicode(0x1F67); } +"*)w" { return toUnicode(0x1F68); } +"*(w" { return toUnicode(0x1F69); } +"*)\\w" { return toUnicode(0x1F6A); } +"*(\\w" { return toUnicode(0x1F6B); } +"*)/w" { return toUnicode(0x1F6C); } +"*(/w" { return toUnicode(0x1F6D); } +"*)=w" { return toUnicode(0x1F6E); } +"*(=w" { return toUnicode(0x1F6F); } +"a\\" { return toUnicode(0x1F70); } +"a/" { return toUnicode(0x1F71); } +"e\\" { return toUnicode(0x1F72); } +"e/" { return toUnicode(0x1F73); } +"h\\" { return 
toUnicode(0x1F74); } +"h/" { return toUnicode(0x1F75); } +"i\\" { return toUnicode(0x1F76); } +"i/" { return toUnicode(0x1F77); } +"o\\" { return toUnicode(0x1F78); } +"o/" { return toUnicode(0x1F79); } +"u\\" { return toUnicode(0x1F7A); } +"u/" { return toUnicode(0x1F7B); } +"w\\" { return toUnicode(0x1F7C); } +"w/" { return toUnicode(0x1F7D); } +"a)|" { return toUnicode(0x1F80); } +"a(|" { return toUnicode(0x1F81); } +"a)\\|" { return toUnicode(0x1F82); } +"a(\\|" { return toUnicode(0x1F83); } +"a)/|" { return toUnicode(0x1F84); } +"a(/|" { return toUnicode(0x1F85); } +"a)=|" { return toUnicode(0x1F86); } +"a(=|" { return toUnicode(0x1F87); } +"*)|a" { return toUnicode(0x1F88); } +"*(|a" { return toUnicode(0x1F89); } +"*)\\|a" { return toUnicode(0x1F8A); } +"*(\\|a" { return toUnicode(0x1F8B); } +"*)/|a" { return toUnicode(0x1F8C); } +"*(/|a" { return toUnicode(0x1F8D); } +"*)=|a" { return toUnicode(0x1F8E); } +"*(=|a" { return toUnicode(0x1F8F); } +"h)|" { return toUnicode(0x1F90); } +"h(|" { return toUnicode(0x1F91); } +"h)\\|" { return toUnicode(0x1F92); } +"h(\\|" { return toUnicode(0x1F93); } +"h)/|" { return toUnicode(0x1F94); } +"h(/|" { return toUnicode(0x1F95); } +"h)=|" { return toUnicode(0x1F96); } +"h(=|" { return toUnicode(0x1F97); } +"*)|h" { return toUnicode(0x1F98); } +"*(|h" { return toUnicode(0x1F99); } +"*)\\|h" { return toUnicode(0x1F9A); } +"*(\\|h" { return toUnicode(0x1F9B); } +"*)/|h" { return toUnicode(0x1F9C); } +"*(/|h" { return toUnicode(0x1F9D); } +"*)=|h" { return toUnicode(0x1F9E); } +"*(=|h" { return toUnicode(0x1F9F); } +"w)|" { return toUnicode(0x1FA0); } +"w(|" { return toUnicode(0x1FA1); } +"w)\\|" { return toUnicode(0x1FA2); } +"w(\\|" { return toUnicode(0x1FA3); } +"w)/|" { return toUnicode(0x1FA4); } +"w(/|" { return toUnicode(0x1FA5); } +"w)=|" { return toUnicode(0x1FA6); } +"w(=|" { return toUnicode(0x1FA7); } +"*)|w" { return toUnicode(0x1FA8); } +"*(|w" { return toUnicode(0x1FA9); } +"*)\\|w" { return toUnicode(0x1FAA); 
} +"*(\\|w" { return toUnicode(0x1FAB); } +"*)/|w" { return toUnicode(0x1FAC); } +"*(/|w" { return toUnicode(0x1FAD); } +"*)=|w" { return toUnicode(0x1FAE); } +"*(=|w" { return toUnicode(0x1FAF); } +"a^" { return toUnicode(0x1FB0); } +"a_" { return toUnicode(0x1FB1); } +"a\\|" { return toUnicode(0x1FB2); } +"a|" { return toUnicode(0x1FB3); } +"a/|" { return toUnicode(0x1FB4); } +"a=" { return toUnicode(0x1FB6); } +"a=|" { return toUnicode(0x1FB7); } +"*a^" { return toUnicode(0x1FB8); } +"*a_" { return toUnicode(0x1FB9); } +"*a\\" { return toUnicode(0x1FBA); } +"*a/" { return toUnicode(0x1FBB); } +"*a|" { return toUnicode(0x1FBC); } +"h\\|" { return toUnicode(0x1FC2); } +"h|" { return toUnicode(0x1FC3); } +"h/|" { return toUnicode(0x1FC4); } +"h=" { return toUnicode(0x1FC6); } +"h=|" { return toUnicode(0x1FC7); } +"*e\\" { return toUnicode(0x1FC8); } +"*e/" { return toUnicode(0x1FC9); } +"*h\\" { return toUnicode(0x1FCA); } +"*h/" { return toUnicode(0x1FCB); } +"*h|" { return toUnicode(0x1FCC); } +"i^" { return toUnicode(0x1FD0); } +"i_" { return toUnicode(0x1FD1); } +"i+\\" { return toUnicode(0x1FD2); } +"i+/" { return toUnicode(0x1FD3); } +"i=" { return toUnicode(0x1FD6); } +"i+=" { return toUnicode(0x1FD7); } +"*i^" { return toUnicode(0x1FD8); } +"*i_" { return toUnicode(0x1FD9); } +"*i\\" { return toUnicode(0x1FDA); } +"*i/" { return toUnicode(0x1FDB); } +"u^" { return toUnicode(0x1FE0); } +"u_" { return toUnicode(0x1FE1); } +"u+\\" { return toUnicode(0x1FE2); } +"u+/" { return toUnicode(0x1FE3); } +"r)" { return toUnicode(0x1FE4); } +"r(" { return toUnicode(0x1FE5); } +"u=" { return toUnicode(0x1FE6); } +"u+=" { return toUnicode(0x1FE7); } +"*u^" { return toUnicode(0x1FE8); } +"*u_" { return toUnicode(0x1FE9); } +"*u\\" { return toUnicode(0x1FEA); } +"*u/" { return toUnicode(0x1FEB); } +"*(r" { return toUnicode(0x1FEC); } +"w\\|" { return toUnicode(0x1FF2); } +"w|" { return toUnicode(0x1FF3); } +"w/|" { return toUnicode(0x1FF4); } +"*w\\" { return 
toUnicode(0x1FFA); } +"*w/" { return toUnicode(0x1FFB); } +"*w|" { return toUnicode(0x1FFC); } +"w=" { return toUnicode(0x1FF6); } +"w=|" { return toUnicode(0x1FF7); } +"*o\\" { return toUnicode(0x1FF8); } +"*o/" { return toUnicode(0x1FF9); } + +"\\" { return toUnicode(0x0300); } +"/" { return toUnicode(0x0301); } +"_" { return toUnicode(0x0304); } +"^" { return toUnicode(0x0306); } +"+" { return toUnicode(0x0308); } +"=" { return toUnicode(0x0302); } +")" { return toUnicode(0x0313); } +"(" { return toUnicode(0x0314); } +"?" { return toUnicode(0x0323); } +"|" { return toUnicode(0x0345); } + +"a" { return toUnicode(0x03b1); } /* MPDL update */ +"*a" { return toUnicode(0x0391); } /* MPDL update */ +"b" { return toUnicode(0x03b2); } /* MPDL update */ +"*b" { return toUnicode(0x0392); } /* MPDL update */ +"g" { return toUnicode(0x03b3); } /* MPDL update */ +"*g" { return toUnicode(0x0393); } /* MPDL update */ +"d" { return toUnicode(0x03b4); } /* MPDL update */ +"*d" { return toUnicode(0x0394); } /* MPDL update */ +"e" { return toUnicode(0x03b5); } /* MPDL update */ +"*e" { return toUnicode(0x0395); } /* MPDL update */ +"z" { return toUnicode(0x03b6); } /* MPDL update */ +"*z" { return toUnicode(0x0396); } /* MPDL update */ +"h" { return toUnicode(0x03b7); } /* MPDL update */ +"*h" { return toUnicode(0x0397); } /* MPDL update */ +"q" { return toUnicode(0x03b8); } /* MPDL update */ +"*q" { return toUnicode(0x0398); } /* MPDL update */ +"i" { return toUnicode(0x03b9); } /* MPDL update */ +"*i" { return toUnicode(0x0399); } /* MPDL update */ +"k" { return toUnicode(0x03ba); } /* MPDL update */ +"*k" { return toUnicode(0x039a); } /* MPDL update */ +"l" { return toUnicode(0x03bb); } /* MPDL update */ +"*l" { return toUnicode(0x039b); } /* MPDL update */ +"m" { return toUnicode(0x03bc); } /* MPDL update */ +"*m" { return toUnicode(0x039c); } /* MPDL update */ +"n" { return toUnicode(0x03bd); } /* MPDL update */ +"*n" { return toUnicode(0x039d); } /* MPDL update */ +"c" { 
return toUnicode(0x03be); } /* MPDL update */ +"*c" { return toUnicode(0x039e); } /* MPDL update */ +"o" { return toUnicode(0x03bf); } /* MPDL update */ +"*o" { return toUnicode(0x039f); } /* MPDL update */ +"p" { return toUnicode(0x03c0); } /* MPDL update */ +"*p" { return toUnicode(0x03a0); } /* MPDL update */ +"r" { return toUnicode(0x03c1); } /* MPDL update */ +"*r" { return toUnicode(0x03a1); } /* MPDL update */ + +"*s" { return toUnicode(0x03a3); } /* MPDL update */ +"s1" { return toUnicode(0x03c3); } /* mdh 2002-01-07 */ +"s"/\-\- { return toUnicode(0x03c2); } +"s"/\> }[a-z\?\!0-9*=\/()\'\-] { return toUnicode(0x03c3); } /* MPDL update */ +"s"/\< { return toUnicode(0x03c2); } /* MPDL update */ +"s"/[\[\]][a-z\?\!0-9*=\/()\'\-] { return toUnicode(0x03c3); } /* MPDL update */ +"s"/\??[^a-z0-9*=\/()\'\-\[\?] { return toUnicode(0x03c2); } +"s" { return toUnicode(0x03c3); } /* MPDL update */ + +"t" { return toUnicode(0x03c4); } /* MPDL update */ +"*t" { return toUnicode(0x03a4); } /* MPDL update */ +"u" { return toUnicode(0x03c5); } /* MPDL update */ +"*u" { return toUnicode(0x03a5); } /* MPDL update */ +"f" { return toUnicode(0x03c6); } /* MPDL update */ +"*f" { return toUnicode(0x03a6); } /* MPDL update */ +"x" { return toUnicode(0x03c7); } /* MPDL update */ +"*x" { return toUnicode(0x03a7); } /* MPDL update */ +"y" { return toUnicode(0x03c8); } /* MPDL update */ +"*y" { return toUnicode(0x03a8); } /* MPDL update */ +"w" { return toUnicode(0x03c9); } /* MPDL update */ +"*w" { return toUnicode(0x03a9); } /* MPDL update */ + +[\&_]"vert;" { return "|"; } +[\&_]"lpar;" { return "("; } +[\&_]"rpar;" { return ")"; } +[\_\&]"lt;" { return "<"; } +[\_\&]"gt;" { return ">"; } +"'" { return "'"; } /* MPDL update */ + +"&"[a-zA-Z]+";" { return yytext(); } + +. 
{ return yytext(); } +\n { return yytext(); } \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Betacode2Unicode.lex.old --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Betacode2Unicode.lex.old Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,318 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +%% +%{ + /* + * Betacode to Unicode conversion + */ + + private int isUpper = 0; + + private String toUnicodeGreek(int in) { + String retStr = toUnicode(in - (isUpper * 0x0020)); + isUpper = 0; + return retStr; + } + + private String toUnicode(int in) { + char c = (char) in; + Character ch = new Character(c); + String retString = ch.toString(); + return retString; + } + +%} + +%class Betacode2UnicodeLex +%public +%type java.lang.String +%unicode +%% + + +"<"[^>]+">" { return yytext(); } + +"*j" { return "H"; } +"j" { return "h"; } +"*v" { return "F"; } +"v" { return "f"; } +"*s" { return toUnicode(0x03a3); } + +"!" 
{ return "."; } +":" { return toUnicode(0x00B7); } /* MPDL update */ + +"a)" { return toUnicode(0x1F00); } +"a(" { return toUnicode(0x1F01); } +"a)\\" { return toUnicode(0x1F02); } +"a(\\" { return toUnicode(0x1F03); } +"a)/" { return toUnicode(0x1F04); } +"a(/" { return toUnicode(0x1F05); } +"a)=" { return toUnicode(0x1F06); } +"a(=" { return toUnicode(0x1F07); } +"*)a" { return toUnicode(0x1F08); } +"*(a" { return toUnicode(0x1F09); } +"*)\\a" { return toUnicode(0x1F0A); } +"*(\\a" { return toUnicode(0x1F0B); } +"*)/a" { return toUnicode(0x1F0C); } +"*(/a" { return toUnicode(0x1F0D); } +"*)=a" { return toUnicode(0x1F0E); } +"*(=a" { return toUnicode(0x1F0F); } +"e)" { return toUnicode(0x1F10); } +"e(" { return toUnicode(0x1F11); } +"e)\\" { return toUnicode(0x1F12); } +"e(\\" { return toUnicode(0x1F13); } +"e)/" { return toUnicode(0x1F14); } +"e(/" { return toUnicode(0x1F15); } +"*)e" { return toUnicode(0x1F18); } +"*(e" { return toUnicode(0x1F19); } +"*)\\e" { return toUnicode(0x1F1A); } +"*(\\e" { return toUnicode(0x1F1B); } +"*)/e" { return toUnicode(0x1F1C); } +"*(/e" { return toUnicode(0x1F1D); } +"h)" { return toUnicode(0x1F20); } +"h(" { return toUnicode(0x1F21); } +"h)\\" { return toUnicode(0x1F22); } +"h(\\" { return toUnicode(0x1F23); } +"h)/" { return toUnicode(0x1F24); } +"h(/" { return toUnicode(0x1F25); } +"h)=" { return toUnicode(0x1F26); } +"h(=" { return toUnicode(0x1F27); } +"*)h" { return toUnicode(0x1F28); } +"*(h" { return toUnicode(0x1F29); } +"*)\\h" { return toUnicode(0x1F2A); } +"*(\\h" { return toUnicode(0x1F2B); } +"*)/h" { return toUnicode(0x1F2C); } +"*(/h" { return toUnicode(0x1F2D); } +"*)=h" { return toUnicode(0x1F2E); } +"*(=h" { return toUnicode(0x1F2F); } +"i)" { return toUnicode(0x1F30); } +"i(" { return toUnicode(0x1F31); } +"i)\\" { return toUnicode(0x1F32); } +"i(\\" { return toUnicode(0x1F33); } +"i)/" { return toUnicode(0x1F34); } +"i(/" { return toUnicode(0x1F35); } +"i)=" { return toUnicode(0x1F36); } +"i(=" { return 
toUnicode(0x1F37); } +"*)i" { return toUnicode(0x1F38); } +"*(i" { return toUnicode(0x1F39); } +"*)\\i" { return toUnicode(0x1F3A); } +"*(\\i" { return toUnicode(0x1F3B); } +"*)/i" { return toUnicode(0x1F3C); } +"*(/i" { return toUnicode(0x1F3D); } +"*)=i" { return toUnicode(0x1F3E); } +"*(=i" { return toUnicode(0x1F3F); } +"o)" { return toUnicode(0x1F40); } +"o(" { return toUnicode(0x1F41); } +"o)\\" { return toUnicode(0x1F42); } +"o(\\" { return toUnicode(0x1F43); } +"o)/" { return toUnicode(0x1F44); } +"o(/" { return toUnicode(0x1F45); } +"*)o" { return toUnicode(0x1F48); } +"*(o" { return toUnicode(0x1F49); } +"*)\\o" { return toUnicode(0x1F4A); } +"*(\\o" { return toUnicode(0x1F4B); } +"*)/o" { return toUnicode(0x1F4C); } +"*(/o" { return toUnicode(0x1F4D); } +"u)" { return toUnicode(0x1F50); } +"u(" { return toUnicode(0x1F51); } +"u)\\" { return toUnicode(0x1F52); } +"u(\\" { return toUnicode(0x1F53); } +"u)/" { return toUnicode(0x1F54); } +"u(/" { return toUnicode(0x1F55); } +"u)=" { return toUnicode(0x1F56); } +"u(=" { return toUnicode(0x1F57); } +"*(u" { return toUnicode(0x1F59); } +"*(\\u" { return toUnicode(0x1F5B); } +"*(/u" { return toUnicode(0x1F5D); } +"*(=u" { return toUnicode(0x1F5F); } +"w)" { return toUnicode(0x1F60); } +"w(" { return toUnicode(0x1F61); } +"w)\\" { return toUnicode(0x1F62); } +"w(\\" { return toUnicode(0x1F63); } +"w)/" { return toUnicode(0x1F64); } +"w(/" { return toUnicode(0x1F65); } +"w)=" { return toUnicode(0x1F66); } +"w(=" { return toUnicode(0x1F67); } +"*)w" { return toUnicode(0x1F68); } +"*(w" { return toUnicode(0x1F69); } +"*)\\w" { return toUnicode(0x1F6A); } +"*(\\w" { return toUnicode(0x1F6B); } +"*)/w" { return toUnicode(0x1F6C); } +"*(/w" { return toUnicode(0x1F6D); } +"*)=w" { return toUnicode(0x1F6E); } +"*(=w" { return toUnicode(0x1F6F); } +"a\\" { return toUnicode(0x1F70); } +"a/" { return toUnicode(0x1F71); } +"e\\" { return toUnicode(0x1F72); } +"e/" { return toUnicode(0x1F73); } +"h\\" { return 
toUnicode(0x1F74); } +"h/" { return toUnicode(0x1F75); } +"i\\" { return toUnicode(0x1F76); } +"i/" { return toUnicode(0x1F77); } +"o\\" { return toUnicode(0x1F78); } +"o/" { return toUnicode(0x1F79); } +"u\\" { return toUnicode(0x1F7A); } +"u/" { return toUnicode(0x1F7B); } +"w\\" { return toUnicode(0x1F7C); } +"w/" { return toUnicode(0x1F7D); } +"a)|" { return toUnicode(0x1F80); } +"a(|" { return toUnicode(0x1F81); } +"a)\\|" { return toUnicode(0x1F82); } +"a(\\|" { return toUnicode(0x1F83); } +"a)/|" { return toUnicode(0x1F84); } +"a(/|" { return toUnicode(0x1F85); } +"a)=|" { return toUnicode(0x1F86); } +"a(=|" { return toUnicode(0x1F87); } +"*)|a" { return toUnicode(0x1F88); } +"*(|a" { return toUnicode(0x1F89); } +"*)\\|a" { return toUnicode(0x1F8A); } +"*(\\|a" { return toUnicode(0x1F8B); } +"*)/|a" { return toUnicode(0x1F8C); } +"*(/|a" { return toUnicode(0x1F8D); } +"*)=|a" { return toUnicode(0x1F8E); } +"*(=|a" { return toUnicode(0x1F8F); } +"h)|" { return toUnicode(0x1F90); } +"h(|" { return toUnicode(0x1F91); } +"h)\\|" { return toUnicode(0x1F92); } +"h(\\|" { return toUnicode(0x1F93); } +"h)/|" { return toUnicode(0x1F94); } +"h(/|" { return toUnicode(0x1F95); } +"h)=|" { return toUnicode(0x1F96); } +"h(=|" { return toUnicode(0x1F97); } +"*)|h" { return toUnicode(0x1F98); } +"*(|h" { return toUnicode(0x1F99); } +"*)\\|h" { return toUnicode(0x1F9A); } +"*(\\|h" { return toUnicode(0x1F9B); } +"*)/|h" { return toUnicode(0x1F9C); } +"*(/|h" { return toUnicode(0x1F9D); } +"*)=|h" { return toUnicode(0x1F9E); } +"*(=|h" { return toUnicode(0x1F9F); } +"w)|" { return toUnicode(0x1FA0); } +"w(|" { return toUnicode(0x1FA1); } +"w)\\|" { return toUnicode(0x1FA2); } +"w(\\|" { return toUnicode(0x1FA3); } +"w)/|" { return toUnicode(0x1FA4); } +"w(/|" { return toUnicode(0x1FA5); } +"w)=|" { return toUnicode(0x1FA6); } +"w(=|" { return toUnicode(0x1FA7); } +"*)|w" { return toUnicode(0x1FA8); } +"*(|w" { return toUnicode(0x1FA9); } +"*)\\|w" { return toUnicode(0x1FAA); 
} +"*(\\|w" { return toUnicode(0x1FAB); } +"*)/|w" { return toUnicode(0x1FAC); } +"*(/|w" { return toUnicode(0x1FAD); } +"*)=|w" { return toUnicode(0x1FAE); } +"*(=|w" { return toUnicode(0x1FAF); } +"a^" { return toUnicode(0x1FB0); } +"a_" { return toUnicode(0x1FB1); } +"a\\|" { return toUnicode(0x1FB2); } +"a|" { return toUnicode(0x1FB3); } +"a/|" { return toUnicode(0x1FB4); } +"a=" { return toUnicode(0x1FB6); } +"a=|" { return toUnicode(0x1FB7); } +"*a^" { return toUnicode(0x1FB8); } +"*a_" { return toUnicode(0x1FB9); } +"*a\\" { return toUnicode(0x1FBA); } +"*a/" { return toUnicode(0x1FBB); } +"*a|" { return toUnicode(0x1FBC); } +"h\\|" { return toUnicode(0x1FC2); } +"h|" { return toUnicode(0x1FC3); } +"h/|" { return toUnicode(0x1FC4); } +"h=" { return toUnicode(0x1FC6); } +"h=|" { return toUnicode(0x1FC7); } +"*e\\" { return toUnicode(0x1FC8); } +"*e/" { return toUnicode(0x1FC9); } +"*h\\" { return toUnicode(0x1FCA); } +"*h/" { return toUnicode(0x1FCB); } +"*h|" { return toUnicode(0x1FCC); } +"i^" { return toUnicode(0x1FD0); } +"i_" { return toUnicode(0x1FD1); } +"i+\\" { return toUnicode(0x1FD2); } +"i+/" { return toUnicode(0x1FD3); } +"i=" { return toUnicode(0x1FD6); } +"i+=" { return toUnicode(0x1FD7); } +"*i^" { return toUnicode(0x1FD8); } +"*i_" { return toUnicode(0x1FD9); } +"*i\\" { return toUnicode(0x1FDA); } +"*i/" { return toUnicode(0x1FDB); } +"u^" { return toUnicode(0x1FE0); } +"u_" { return toUnicode(0x1FE1); } +"u+\\" { return toUnicode(0x1FE2); } +"u+/" { return toUnicode(0x1FE3); } +"r)" { return toUnicode(0x1FE4); } +"r(" { return toUnicode(0x1FE5); } +"u=" { return toUnicode(0x1FE6); } +"u+=" { return toUnicode(0x1FE7); } +"*u^" { return toUnicode(0x1FE8); } +"*u_" { return toUnicode(0x1FE9); } +"*u\\" { return toUnicode(0x1FEA); } +"*u/" { return toUnicode(0x1FEB); } +"*(r" { return toUnicode(0x1FEC); } +"w\\|" { return toUnicode(0x1FF2); } +"w|" { return toUnicode(0x1FF3); } +"w/|" { return toUnicode(0x1FF4); } +"*w\\" { return 
toUnicode(0x1FFA); } +"*w/" { return toUnicode(0x1FFB); } +"*w|" { return toUnicode(0x1FFC); } +"w=" { return toUnicode(0x1FF6); } +"w=|" { return toUnicode(0x1FF7); } +"*o\\" { return toUnicode(0x1FF8); } +"*o/" { return toUnicode(0x1FF9); } + +"*" isUpper = 1; + +"\\" { return toUnicode(0x0300); } +"/" { return toUnicode(0x0301); } +"_" { return toUnicode(0x0304); } +"^" { return toUnicode(0x0306); } +"+" { return toUnicode(0x0308); } +"=" { return toUnicode(0x0302); } +")" { return toUnicode(0x0313); } +"(" { return toUnicode(0x0314); } +"?" { return toUnicode(0x0323); } +"|" { return toUnicode(0x0345); } + +"a" { return toUnicodeGreek(0x03b1); } +"b" { return toUnicodeGreek(0x03b2); } +"g" { return toUnicodeGreek(0x03b3); } +"d" { return toUnicodeGreek(0x03b4); } +"e" { return toUnicodeGreek(0x03b5); } +"z" { return toUnicodeGreek(0x03b6); } +"h" { return toUnicodeGreek(0x03b7); } +"q" { return toUnicodeGreek(0x03b8); } +"i" { return toUnicodeGreek(0x03b9); } +"k" { return toUnicodeGreek(0x03ba); } +"l" { return toUnicodeGreek(0x03bb); } +"m" { return toUnicodeGreek(0x03bc); } +"n" { return toUnicodeGreek(0x03bd); } +"c" { return toUnicodeGreek(0x03be); } +"o" { return toUnicodeGreek(0x03bf); } +"p" { return toUnicodeGreek(0x03c0); } +"r" { return toUnicodeGreek(0x03c1); } + +"s1" { return toUnicode(0x03c3); } /* mdh 2002-01-07 */ +"s"/\-\- { return toUnicode(0x03c2); } +"s"/\> }[a-z\?\!0-9*=\/()\'\-] { return toUnicodeGreek(0x03c3); } +"s"/\< { return toUnicodeGreek(0x03c2); } /* MPDL update */ +"s"/[\[\]][a-z\?\!0-9*=\/()\'\-] { return toUnicodeGreek(0x03c3); } +"s"/\??[^a-z0-9*=\/()\'\-\[\?] 
{ return toUnicode(0x03c2); } +"s" { return toUnicodeGreek(0x03c3); } + +"t" { return toUnicodeGreek(0x03c4); } +"u" { return toUnicodeGreek(0x03c5); } +"f" { return toUnicodeGreek(0x03c6); } +"x" { return toUnicodeGreek(0x03c7); } +"y" { return toUnicodeGreek(0x03c8); } +"w" { return toUnicodeGreek(0x03c9); } + +[\&_]"vert;" { return "|"; } +[\&_]"lpar;" { return "("; } +[\&_]"rpar;" { return ")"; } +[\_\&]"lt;" { return "<"; } +[\_\&]"gt;" { return ">"; } +"'" { return "'"; } /* MPDL update */ + +"&"[a-zA-Z]+";" { return yytext(); } + +. { return yytext(); } +\n { return yytext(); } \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Betacode2UnicodeLex.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Betacode2UnicodeLex.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1908 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.transcode; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 19.11.09 20:01 from the specification file + * /Users/jwillenborg/java/existDevMai2009/mpdl/extensions/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Betacode2Unicode.lex + */ +public class Betacode2UnicodeLex { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int YYINITIAL = 0; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + 
"\12\0\1\0\26\0\1\7\1\0\1\62\2\0\1\50\1\54\1\13"+ + "\1\12\1\3\1\30\1\0\1\47\1\0\1\15\1\63\1\46\1\54"+ + "\1\64\5\54\1\65\1\10\1\52\1\1\1\16\1\2\1\32\1\0"+ + "\32\66\1\56\1\14\1\55\1\26\1\27\1\0\1\11\1\33\1\44"+ + "\1\35\1\17\1\57\1\34\1\20\1\21\1\4\1\40\1\41\1\42"+ + "\1\43\1\22\1\45\1\37\1\31\1\6\1\51\1\23\1\5\1\24"+ + "\1\60\1\61\1\36\1\0\1\25\1\53\uff82\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\1\0\3\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+ + "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+ + "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37"+ + "\1\40\1\41\1\42\1\43\1\1\1\44\1\45\1\46"+ + "\1\47\1\0\1\50\1\51\1\52\1\53\2\0\1\54"+ + "\1\55\1\56\1\57\1\60\1\61\1\62\1\63\1\64"+ + "\1\65\1\66\1\67\1\70\1\71\1\72\1\73\1\74"+ + "\1\75\1\76\1\77\1\100\1\101\1\102\1\0\1\4"+ + "\1\0\2\102\1\0\1\103\1\104\1\105\1\106\1\107"+ + "\1\110\1\111\1\112\1\113\1\114\1\115\1\116\1\117"+ + "\1\120\1\121\1\122\1\123\1\124\1\125\1\126\1\127"+ + "\1\130\1\131\1\132\1\133\1\0\1\134\1\135\1\136"+ + "\1\137\1\140\1\141\1\142\1\143\1\144\1\145\1\146"+ + "\1\0\1\147\1\150\1\151\1\152\1\153\1\154\4\0"+ + "\1\155\1\156\6\0\1\157\1\160\1\161\1\162\1\163"+ + "\1\164\3\0\1\165\1\166\1\167\1\170\1\171\1\0"+ + "\1\172\3\0\1\173\1\174\1\175\1\176\1\177\1\200"+ + "\1\0\1\201\1\202\1\203\1\204\1\205\1\206\1\207"+ + "\1\210\1\211\1\212\1\213\1\214\1\215\1\216\1\217"+ + "\1\220\1\221\1\222\1\223\2\0\1\224\1\225\1\226"+ + "\1\227\1\230\1\231\1\232\1\233\1\234\1\235\1\236"+ + "\1\237\1\240\1\241\1\242\1\243\1\244\1\245\1\246"+ + "\1\247\1\250\1\251\1\252\1\253\1\254\1\255\1\256"+ + "\1\257\1\260\1\261\1\262\1\263\1\264\1\265\1\266"+ + "\1\267\1\270\1\271\1\272\1\273\1\274\1\275\1\276"+ + 
"\1\277\1\300\1\301\1\302\1\303\1\304\1\305\1\306"+ + "\1\307\1\310\1\311\1\312\1\313\1\314\1\315\1\316"+ + "\1\317\13\0\1\320\1\321\1\322\1\323\1\324\1\325"+ + "\1\0\1\326\1\327\1\330\1\331\1\332\1\333\1\0"+ + "\1\334\1\335\1\336\1\337\1\0\1\340\1\341\1\342"+ + "\1\343\1\344\1\345\1\346\1\347\1\350\1\351\1\0"+ + "\1\352\1\353\1\354\1\355\1\356\1\357\1\360\1\0"+ + "\1\361\1\362\1\363\1\364\1\365\1\0\1\366\1\367"+ + "\1\370\2\0\1\371\1\372\1\373\1\374\1\375\1\376"+ + "\1\377\1\u0100\1\u0101\1\u0102\1\u0103\1\u0104\1\u0105\1\u0106"+ + "\1\u0107\1\u0108\1\u0109\1\u010a\2\0\1\u010b\1\0\1\u010c"+ + "\4\0\1\u010d\1\u010e\1\u010f\1\u0110\1\u0111\1\u0112\1\u0113"+ + "\1\u0114\1\u0115\1\u0116\1\u0117\1\u0118\1\u0119\1\u011a\1\u011b"+ + "\1\u011c\1\u011d\1\u011e\10\0\1\u011f\1\u0120\1\u0121\1\u0122"; + + private static int [] zzUnpackAction() { + int [] result = new int[359]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\67\0\156\0\245\0\67\0\67\0\334\0\67"+ + "\0\67\0\u0113\0\67\0\67\0\67\0\67\0\67\0\u014a"+ + "\0\u0181\0\u01b8\0\u01ef\0\u0226\0\u025d\0\67\0\67\0\u0294"+ + "\0\67\0\u02cb\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\u0302\0\67"+ + "\0\67\0\67\0\67\0\u0339\0\67\0\67\0\67\0\u0370"+ + "\0\u03a7\0\u03de\0\u0415\0\u044c\0\u0483\0\u04ba\0\u04f1\0\u0528"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ 
+ "\0\67\0\u055f\0\67\0\u0596\0\u05cd\0\u0604\0\u0604\0\u063b"+ + "\0\u0672\0\u06a9\0\u06e0\0\u0717\0\67\0\67\0\67\0\u074e"+ + "\0\u0785\0\67\0\67\0\u07bc\0\u07f3\0\u082a\0\u0861\0\u0898"+ + "\0\67\0\u08cf\0\u0906\0\67\0\67\0\67\0\67\0\67"+ + "\0\u093d\0\u0974\0\u09ab\0\67\0\67\0\u09e2\0\u0a19\0\67"+ + "\0\67\0\67\0\67\0\67\0\u0a50\0\u0a87\0\u0abe\0\u0af5"+ + "\0\u0b2c\0\u0b63\0\67\0\u0b9a\0\u0bd1\0\u0c08\0\u0c3f\0\67"+ + "\0\67\0\u0c76\0\u0cad\0\u0ce4\0\u0d1b\0\u0d52\0\u0d89\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\u0dc0\0\u0df7\0\u0e2e"+ + "\0\67\0\67\0\67\0\67\0\67\0\u0e65\0\67\0\u0e9c"+ + "\0\u0ed3\0\u0f0a\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\u0f41\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\u0f78\0\u0faf\0\67\0\u0fe6"+ + "\0\u101d\0\u1054\0\67\0\u108b\0\u10c2\0\u10f9\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\u1130\0\u1167"+ + "\0\u119e\0\67\0\u11d5\0\u120c\0\u1243\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\u127a"+ + "\0\u12b1\0\u12e8\0\67\0\u131f\0\u1356\0\u138d\0\67\0\67"+ + "\0\67\0\67\0\u13c4\0\u13fb\0\u1432\0\u1469\0\u14a0\0\u14d7"+ + "\0\u150e\0\u1545\0\u157c\0\u15b3\0\u15ea\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\u1621\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\u1658\0\67\0\67\0\67\0\67\0\u168f"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\u16c6\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\u16fd\0\67\0\67\0\67\0\67\0\67"+ + "\0\u1734\0\67\0\67\0\67\0\u176b\0\u17a2\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\u17d9\0\u1810\0\67\0\u1847\0\67\0\u187e\0\u18b5\0\u18ec"+ + "\0\u1923\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\u195a\0\u1991\0\u19c8\0\u19ff\0\u1a36"+ + "\0\u1a6d\0\u1aa4\0\u1adb\0\67\0\67\0\67\0\67"; + + private static int [] zzUnpackRowMap() 
{ + int [] result = new int[359]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\2\1\3\1\2\1\4\1\5\1\6\1\7\1\10"+ + "\1\11\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+ + "\1\21\1\22\1\23\1\24\1\25\1\26\1\27\1\30"+ + "\1\31\1\32\1\33\1\34\1\35\1\36\1\37\1\40"+ + "\1\41\1\42\1\43\1\44\1\45\1\46\2\2\1\47"+ + "\1\50\5\2\1\51\1\52\1\53\5\2\67\0\2\54"+ + "\1\0\64\54\4\0\1\55\1\56\1\57\2\0\1\60"+ + "\1\61\1\62\3\0\1\63\1\64\1\65\1\66\1\67"+ + "\1\70\4\0\1\71\1\0\1\72\1\73\1\74\1\75"+ + "\1\76\1\77\1\100\1\101\1\102\1\103\1\104\3\0"+ + "\1\105\5\0\1\106\1\107\1\110\5\0\3\111\4\0"+ + "\2\111\3\0\1\111\10\0\4\111\1\0\1\112\13\0"+ + "\1\113\1\114\1\115\1\0\2\111\1\0\1\116\1\117"+ + "\3\0\1\111\3\0\1\111\12\0\1\120\1\121\1\122"+ + "\1\123\1\124\6\0\1\125\1\126\1\127\51\0\1\130"+ + "\1\131\1\132\1\133\63\0\1\134\1\135\1\136\1\137"+ + "\1\140\6\0\1\141\53\0\1\142\1\143\1\144\1\145"+ + "\1\146\7\0\1\147\1\150\1\151\50\0\1\152\1\153"+ + "\1\154\1\155\63\0\1\156\1\157\1\160\1\161\1\162"+ + "\7\0\1\163\1\164\1\165\50\0\1\166\1\167\1\170"+ + "\1\171\1\172\6\0\1\173\46\0\1\174\23\0\1\175"+ + "\2\0\1\176\4\0\1\177\37\0\1\200\1\201\57\0"+ + "\1\202\1\203\1\202\2\0\1\202\5\0\6\202\4\0"+ + "\1\204\1\0\1\202\1\205\4\202\1\206\4\202\3\0"+ + "\1\202\5\0\3\202\1\207\3\0\1\202\2\54\1\2"+ + "\64\54\14\0\1\210\1\211\7\0\1\212\1\213\1\214"+ + "\50\0\1\215\2\0\1\216\1\217\1\220\1\221\1\222"+ + "\1\223\1\224\1\0\1\225\1\226\52\0\1\227\2\0"+ + 
"\1\230\1\231\1\232\1\233\1\234\1\235\1\236\1\237"+ + "\1\240\1\241\3\0\1\242\51\0\1\243\1\244\65\0"+ + "\1\245\1\246\7\0\1\247\55\0\1\250\1\251\10\0"+ + "\1\252\1\253\53\0\1\254\1\255\65\0\1\256\1\257"+ + "\10\0\1\260\1\261\53\0\1\262\1\263\7\0\1\264"+ + "\41\0\3\111\4\0\2\111\3\0\1\111\10\0\4\111"+ + "\17\0\1\111\1\0\2\111\1\0\1\111\4\0\1\111"+ + "\3\0\1\111\47\0\1\111\53\0\1\265\4\0\1\266"+ + "\30\0\5\267\1\0\3\267\1\0\10\267\4\0\17\267"+ + "\1\0\1\267\2\0\1\267\2\0\3\267\1\0\3\267"+ + "\15\0\1\270\1\271\1\272\6\0\1\273\55\0\1\274"+ + "\1\275\1\276\6\0\1\277\66\0\1\300\66\0\1\301"+ + "\66\0\1\302\55\0\1\303\1\304\65\0\1\305\1\306"+ + "\65\0\1\307\1\310\1\311\6\0\1\312\55\0\1\313"+ + "\1\314\1\315\6\0\1\316\66\0\1\317\66\0\1\320"+ + "\66\0\1\321\55\0\1\322\1\323\1\324\64\0\1\325"+ + "\1\326\1\327\64\0\1\330\1\331\1\332\64\0\1\333"+ + "\1\334\65\0\1\335\1\336\65\0\1\337\1\340\1\341"+ + "\64\0\1\342\1\343\1\344\64\0\1\345\1\346\1\347"+ + "\64\0\1\350\1\351\1\352\6\0\1\353\55\0\1\354"+ + "\1\355\1\356\6\0\1\357\66\0\1\360\66\0\1\361"+ + "\66\0\1\362\60\0\1\363\114\0\1\364\72\0\1\365"+ + "\62\0\1\366\3\0\1\367\21\0\3\202\2\0\1\202"+ + "\5\0\6\202\4\0\1\202\1\0\13\202\3\0\1\202"+ + "\1\2\4\0\3\202\4\0\1\202\4\0\3\202\2\0"+ + "\1\202\5\0\1\370\5\202\4\0\1\202\1\0\13\202"+ + "\3\0\1\202\1\2\4\0\3\202\4\0\1\202\4\0"+ + "\3\202\2\0\1\202\5\0\6\202\4\0\1\202\1\0"+ + "\12\202\1\371\3\0\1\202\1\2\4\0\3\202\4\0"+ + "\1\202\4\0\3\202\2\0\1\202\5\0\6\202\4\0"+ + "\1\202\1\0\13\202\3\0\1\372\1\2\4\0\3\202"+ + "\4\0\1\202\4\0\3\202\2\0\1\202\5\0\6\202"+ + "\4\0\1\202\1\0\12\202\1\373\3\0\1\374\1\2"+ + "\4\0\3\202\4\0\1\202\63\0\1\375\14\0\1\376"+ + "\5\0\1\377\1\u0100\1\u0101\1\u0102\1\0\1\u0103\1\u0104"+ + "\52\0\1\u0105\5\0\1\u0106\1\u0107\1\u0108\1\u0109\1\0"+ + "\1\u010a\1\u010b\52\0\1\u010c\6\0\1\u010d\1\u010e\2\0"+ + "\1\u010f\1\u0110\52\0\1\u0111\6\0\1\u0112\3\0\1\u0113"+ + "\53\0\1\u0114\5\0\1\u0115\1\u0116\1\u0117\1\u0118\1\u0119"+ + 
"\1\u011a\1\u011b\52\0\1\u011c\5\0\1\u011d\1\u011e\1\u011f"+ + "\1\u0120\1\u0121\1\u0122\1\u0123\52\0\1\u0124\6\0\1\u0125"+ + "\1\u0126\1\0\1\u0127\1\u0128\1\u0129\52\0\1\u012a\6\0"+ + "\1\u012b\3\0\1\u012c\113\0\1\u012d\66\0\1\u012e\42\0"+ + "\1\u012f\66\0\1\u0130\66\0\1\u0131\66\0\1\u0132\66\0"+ + "\1\u0133\66\0\1\u0134\66\0\1\u0135\66\0\1\u0136\66\0"+ + "\1\u0137\66\0\1\u0138\66\0\1\u0139\66\0\1\u013a\66\0"+ + "\1\u013b\66\0\1\u013c\66\0\1\u013d\66\0\1\u013e\66\0"+ + "\1\u013f\66\0\1\u0140\72\0\1\u0141\46\0\1\u0142\127\0"+ + "\1\u0143\25\0\1\u0144\127\0\1\u0145\20\0\3\202\2\0"+ + "\1\202\5\0\6\202\4\0\1\u0146\1\0\13\202\3\0"+ + "\1\202\1\2\4\0\3\202\4\0\1\202\4\0\3\202"+ + "\2\0\1\u0147\5\0\6\202\4\0\1\202\1\0\13\202"+ + "\3\0\1\202\1\2\4\0\3\202\4\0\1\202\4\0"+ + "\3\202\2\0\1\202\5\0\6\202\4\0\1\202\1\0"+ + "\13\202\3\0\1\202\1\u0143\4\0\3\202\4\0\1\202"+ + "\4\0\3\202\2\0\1\u0148\5\0\6\202\4\0\1\202"+ + "\1\0\13\202\3\0\1\202\1\2\4\0\3\202\4\0"+ + "\1\202\4\0\3\202\2\0\1\202\5\0\6\202\4\0"+ + "\1\202\1\0\13\202\3\0\1\202\1\u0145\4\0\3\202"+ + "\4\0\1\202\64\0\1\u0149\13\0\1\u014a\6\0\1\u014b"+ + "\3\0\1\u014c\53\0\1\u014d\6\0\1\u014e\3\0\1\u014f"+ + "\53\0\1\u0150\6\0\1\u0151\3\0\1\u0152\53\0\1\u0153"+ + "\6\0\1\u0154\3\0\1\u0155\53\0\1\u0156\6\0\1\u0157"+ + "\3\0\1\u0158\53\0\1\u0159\6\0\1\u015a\3\0\1\u015b"+ + "\114\0\1\u015c\66\0\1\111\65\0\1\u015d\46\0\1\u015e"+ + "\66\0\1\u015f\41\0\3\202\2\0\1\202\5\0\6\202"+ + "\4\0\1\202\1\0\13\202\3\0\1\u0160\1\2\4\0"+ + "\3\202\4\0\1\202\4\0\3\202\2\0\1\202\5\0"+ + "\6\202\4\0\1\u0161\1\0\13\202\3\0\1\202\1\2"+ + "\4\0\3\202\4\0\1\202\4\0\3\202\2\0\1\202"+ + "\5\0\6\202\4\0\1\u0162\1\0\13\202\3\0\1\202"+ + "\1\2\4\0\3\202\4\0\1\202\65\0\1\u0163\54\0"+ + "\1\117\65\0\1\u0164\66\0\1\u0165\66\0\1\u0166\20\0"+ + "\3\202\2\0\1\202\5\0\6\202\4\0\1\202\1\0"+ + "\13\202\3\0\1\202\1\u0164\4\0\3\202\4\0\1\202"+ + "\4\0\3\202\2\0\1\202\5\0\6\202\4\0\1\202"+ + "\1\0\13\202\3\0\1\202\1\u0165\4\0\3\202\4\0"+ + 
"\1\202\4\0\3\202\2\0\1\202\5\0\6\202\4\0"+ + "\1\202\1\0\13\202\3\0\1\202\1\u0166\4\0\3\202"+ + "\4\0\1\202\52\0\1\u0167\14\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[6930]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\1\0\1\11\2\1\2\11\1\1\2\11\1\1\5\11"+ + "\6\1\2\11\1\1\1\11\1\1\14\11\1\1\4\11"+ + "\1\0\3\11\1\1\2\0\6\1\21\11\1\0\1\11"+ + "\1\0\2\1\1\0\5\1\3\11\2\1\2\11\5\1"+ + "\1\11\2\1\5\11\1\0\2\1\2\11\2\1\5\11"+ + "\1\0\5\1\1\11\4\0\2\11\6\0\6\11\3\0"+ + "\5\11\1\0\1\11\3\0\6\11\1\0\23\11\2\0"+ + "\1\11\3\1\1\11\3\1\10\11\3\1\1\11\3\1"+ + "\32\11\3\1\1\11\3\1\4\11\13\0\6\11\1\0"+ + "\6\11\1\0\4\11\1\0\12\11\1\0\7\11\1\0"+ + "\5\11\1\0\3\11\2\0\22\11\2\0\1\11\1\0"+ + "\1\11\4\0\22\11\10\0\4\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[359]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int 
[] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + /* + * Betacode to Unicode conversion + */ + + private String toUnicode(int in) { + char c = (char) in; + Character ch = new Character(c); + String retString = ch.toString(); + return retString; + } + + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. 
+   *
+   * The reader is only stored here; no input is consumed until scanning
+   * starts.
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  public Betacode2UnicodeLex(java.io.Reader in) {
+    this.zzReader = in;
+  }
+
+  /**
+   * Creates a new scanner.
+   * There is also a java.io.Reader version of this constructor.
+   *
+   * The stream is wrapped in an InputStreamReader that is created without
+   * an explicit charset.
+   * NOTE(review): that InputStreamReader constructor decodes with the
+   * platform default charset - confirm this is intended, or have callers
+   * use the Reader constructor with an explicit charset.
+   *
+   * @param   in  the java.io.InputStream to read input from.
+   */
+  public Betacode2UnicodeLex(java.io.InputStream in) {
+    this(new java.io.InputStreamReader(in));
+  }
+
+  /**
+   * Unpacks the compressed character translation table.
+   *
+   * The packed string is read as consecutive (count, value) char pairs;
+   * each value is written into <code>count</code> consecutive slots of the
+   * result (at least one, because the copy loop is a do/while). Slots that
+   * are never written keep the array default.
+   *
+   * @param packed   the packed character translation table
+   * @return         the unpacked character translation table
+   *                 (0x10000 entries)
+   */
+  private static char [] zzUnpackCMap(String packed) {
+    char [] map = new char[0x10000];
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    /* the bound is a generator-emitted constant: only the first 134 chars
+       (67 count/value pairs) of packed are decoded */
+    while (i < 134) {
+      int  count = packed.charAt(i++);
+      char value = packed.charAt(i++);
+      do map[j++] = value; while (--count > 0);
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * Before reading, the unread part of the buffer is moved to the front
+   * and every stored position is shifted by the same amount.
+   *
+   * @return      false, iff there was new input.
+   *
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      /* slide [zzStartRead, zzEndRead) down to index 0; everything before
+         zzStartRead is no longer kept */
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? 
*/
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: blow it up */
+      /* the new array has twice the current scan position as its size;
+         the old contents are copied over unchanged */
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: fill the buffer with new input */
+    /* reads into the free space between zzEndRead and the end of the array */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                            zzBuffer.length-zzEndRead);
+
+    if (numRead > 0) {
+      zzEndRead+= numRead;
+      return false;
+    }
+    // unlikely but not impossible: read 0 characters, but not at end of stream
+    if (numRead == 0) {
+      // fall back to a single-character read to tell "nothing yet" from EOF
+      int c = zzReader.read();
+      if (c == -1) {
+        return true;
+      } else {
+        zzBuffer[zzEndRead++] = (char) c;
+        return false;
+      }
+    }
+
+    // numRead < 0
+    // the reader reported end of stream: tell the caller there is no new input
+    return true;
+  }
+
+
+  /**
+   * Closes the input stream.
+   *
+   * The scanner is marked as being at end of file and the readable part
+   * of the buffer is emptied before the reader (if any) is closed.
+   *
+   * @exception   java.io.IOException  if closing the reader fails
+   */
+  public final void yyclose() throws java.io.IOException {
+    zzAtEOF = true;            /* indicate end of file */
+    zzEndRead = zzStartRead;  /* invalidate buffer    */
+
+    if (zzReader != null)
+      zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All internal variables are reset, the old input stream
+   * cannot be reused (internal buffer is discarded and lost).
+   * Lexical state is set to YYINITIAL.
+   *
+   * @param reader   the new input stream
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+    zzAtBOL  = true;
+    zzAtEOF  = false;
+    zzEOFDone = false;
+    /* all buffer positions and the line/char/column counters restart at 0 */
+    zzEndRead = zzStartRead = 0;
+    zzCurrentPos = zzMarkedPos = 0;
+    yyline = yychar = yycolumn = 0;
+    zzLexicalState = YYINITIAL;
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   *
+   * @return the value last set by yybegin(int) or yyreset(java.io.Reader),
+   *         initially YYINITIAL
+   */
+  public final int yystate() {
+    return zzLexicalState;
+  }
+
+
+  /**
+   * Enters a new lexical state
+   *
+   * The value is stored as given; it is not validated here.
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression.
+ */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. 
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 139: + { return toUnicode(0x1FF8); + } + case 291: break; + case 85: + { return toUnicode(0x1F30); + } + case 292: break; + case 64: + { return toUnicode(0x03a7); + } + case 293: break; + case 60: + { return toUnicode(0x039e); + } + case 294: break; + case 151: + { return toUnicode(0x1F06); + } + case 295: break; + case 206: + { return toUnicode(0x1FF4); + } + case 296: break; + case 42: + { return toUnicode(0x03a3); + } + case 297: break; + case 56: + { return toUnicode(0x039a); + } + case 298: break; + case 149: + { return toUnicode(0x1F02); + } + case 299: break; + case 254: + { return toUnicode(0x1F87); + } + case 300: break; + case 83: + { return toUnicode(0x1FC6); + } + case 301: break; + case 32: + { return toUnicode(0x03bc); + } + case 302: break; + case 216: + { return toUnicode(0x1F2C); + } + case 303: break; + case 252: + { return toUnicode(0x1F83); + } + case 304: break; + case 172: + { return toUnicode(0x1FC2); + } + case 305: break; + case 127: + { return toUnicode(0x1F59); + } + case 306: break; + case 192: + { return toUnicode(0x1F55); + } + case 307: break; + case 129: + { return toUnicode(0x1FEC); + } + case 308: break; + case 97: + { return toUnicode(0x1F51); + } + case 309: break; + case 39: + { return toUnicode(0x03c8); + } + case 310: break; + case 170: + { return toUnicode(0x1F27); + } + case 311: break; + case 36: + { return toUnicode(0x03c4); + } + case 312: break; + case 168: + { return toUnicode(0x1F23); + } + case 313: break; + case 99: + { return toUnicode(0x1F7B); + } + case 314: break; + case 111: + { return toUnicode(0x1FBA); + } + case 315: break; + case 35: + { return toUnicode(0x03c0); + } + case 316: break; + case 196: + { return toUnicode(0x1FE7); + } + case 317: break; + case 238: + { return toUnicode(0x1F4D); + } + case 318: break; + case 195: + { return toUnicode(0x1FE3); + } + case 319: break; + case 115: + { return toUnicode(0x1FB9); + } + case 320: break; + case 87: + { return 
toUnicode(0x1F76); + } + case 321: break; + case 9: + { return toUnicode(0x0314); + } + case 322: break; + case 228: + { return toUnicode(0x1F1B); + } + case 323: break; + case 77: + { return toUnicode(0x1F72); + } + case 324: break; + case 46: + { return toUnicode(0x0399); + } + case 325: break; + case 74: + { return toUnicode(0x1FB1); + } + case 326: break; + case 120: + { return toUnicode(0x1F48); + } + case 327: break; + case 44: + { return toUnicode(0x0395); + } + case 328: break; + case 185: + { return toUnicode(0x1F44); + } + case 329: break; + case 273: + { return toUnicode(0x1F9C); + } + case 330: break; + case 136: + { return toUnicode(0x1FDB); + } + case 331: break; + case 43: + { return toUnicode(0x0391); + } + case 332: break; + case 92: + { return toUnicode(0x1F40); + } + case 333: break; + case 14: + { return toUnicode(0x03b7); + } + case 334: break; + case 268: + { return "<"; + } + case 335: break; + case 223: + { return toUnicode(0x1F6E); + } + case 336: break; + case 283: + { return toUnicode(0x1FAD); + } + case 337: break; + case 26: + { return toUnicode(0x03b3); + } + case 338: break; + case 160: + { return toUnicode(0x1F12); + } + case 339: break; + case 213: + { return toUnicode(0x1F6A); + } + case 340: break; + case 260: + { return toUnicode(0x1F97); + } + case 341: break; + case 89: + { return toUnicode(0x1FD6); + } + case 342: break; + case 217: + { return toUnicode(0x1F3C); + } + case 343: break; + case 258: + { return toUnicode(0x1F93); + } + case 344: break; + case 181: + { return toUnicode(0x1FD2); + } + case 345: break; + case 128: + { return toUnicode(0x1F69); + } + case 346: break; + case 226: + { return toUnicode(0x1FA8); + } + case 347: break; + case 220: + { return toUnicode(0x1F0E); + } + case 348: break; + case 202: + { return toUnicode(0x1F65); + } + case 349: break; + case 262: + { return toUnicode(0x1FA4); + } + case 350: break; + case 147: + { return toUnicode(0x1FFC); + } + case 351: break; + case 208: + { return 
toUnicode(0x1F0A); + } + case 352: break; + case 104: + { return toUnicode(0x1F61); + } + case 353: break; + case 288: + { return ")"; + } + case 354: break; + case 200: + { return toUnicode(0x1FA0); + } + case 355: break; + case 180: + { return toUnicode(0x1F37); + } + case 356: break; + case 284: + { return toUnicode(0x1F8F); + } + case 357: break; + case 287: + { return "|"; + } + case 358: break; + case 178: + { return toUnicode(0x1F33); + } + case 359: break; + case 278: + { return toUnicode(0x1F8B); + } + case 360: break; + case 132: + { return toUnicode(0x1FCA); + } + case 361: break; + case 122: + { return toUnicode(0x1F09); + } + case 362: break; + case 207: + { return toUnicode(0x1FF7); + } + case 363: break; + case 63: + { return toUnicode(0x03a6); + } + case 364: break; + case 59: + { return toUnicode(0x039d); + } + case 365: break; + case 154: + { return toUnicode(0x1F05); + } + case 366: break; + case 239: + { return toUnicode(0x1F5D); + } + case 367: break; + case 108: + { return toUnicode(0x1FF3); + } + case 368: break; + case 131: + { return toUnicode(0x1FC9); + } + case 369: break; + case 68: + { return toUnicode(0x1F01); + } + case 370: break; + case 16: + { return toUnicode(0x03bf); + } + case 371: break; + case 242: + { return toUnicode(0x1F2F); + } + case 372: break; + case 251: + { return toUnicode(0x1F86); + } + case 373: break; + case 6: + { return toUnicode(0x00B7); + } + case 374: break; + case 31: + { return toUnicode(0x03bb); + } + case 375: break; + case 229: + { return toUnicode(0x1F2B); + } + case 376: break; + case 249: + { return toUnicode(0x1F82); + } + case 377: break; + case 2: + { return "h"; + } + case 378: break; + case 189: + { return toUnicode(0x1F54); + } + case 379: break; + case 142: + { return toUnicode(0x1FEB); + } + case 380: break; + case 96: + { return toUnicode(0x1F50); + } + case 381: break; + case 38: + { return toUnicode(0x03c7); + } + case 382: break; + case 166: + { return toUnicode(0x1F26); + } + case 383: 
break; + case 4: + { return toUnicode(0x03c3); + } + case 384: break; + case 148: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { return toUnicode(0x03c3); + } + case 385: break; + case 164: + { return toUnicode(0x1F22); + } + case 386: break; + case 98: + { return toUnicode(0x1F7A); + } + case 387: break; + case 100: + { return toUnicode(0x1FE6); + } + case 388: break; + case 19: + { return toUnicode(0x0345); + } + case 389: break; + case 218: + { return toUnicode(0x1F4C); + } + case 390: break; + case 194: + { return toUnicode(0x1FE2); + } + case 391: break; + case 95: + { return toUnicode(0x1F79); + } + case 392: break; + case 114: + { return toUnicode(0x1FB8); + } + case 393: break; + case 82: + { return toUnicode(0x1F75); + } + case 394: break; + case 158: + { return toUnicode(0x1FB4); + } + case 395: break; + case 8: + { return toUnicode(0x0313); + } + case 396: break; + case 209: + { return toUnicode(0x1F1A); + } + case 397: break; + case 70: + { return toUnicode(0x1F71); + } + case 398: break; + case 40: + { return "H"; + } + case 399: break; + case 55: + { return toUnicode(0x0398); + } + case 400: break; + case 73: + { return toUnicode(0x1FB0); + } + case 401: break; + case 285: + { return toUnicode(0x1F9F); + } + case 402: break; + case 53: + { return toUnicode(0x0394); + } + case 403: break; + case 186: + { return toUnicode(0x1F43); + } + case 404: break; + case 279: + { return toUnicode(0x1F9B); + } + case 405: break; + case 135: + { return toUnicode(0x1FDA); + } + case 406: break; + case 123: + { return toUnicode(0x1F19); + } + case 407: break; + case 28: + { return toUnicode(0x03b6); + } + case 408: break; + case 163: + { return toUnicode(0x1F15); + } + case 409: break; + case 240: + { return toUnicode(0x1F6D); + } + case 410: break; + case 274: + { return toUnicode(0x1FAC); + } + case 411: break; + case 25: + { return toUnicode(0x03b2); + } + case 412: break; + case 138: + { return toUnicode(0x1FD9); + } + case 
413: break; + case 76: + { return toUnicode(0x1F11); + } + case 414: break; + case 243: + { return toUnicode(0x1F3F); + } + case 415: break; + case 257: + { return toUnicode(0x1F96); + } + case 416: break; + case 230: + { return toUnicode(0x1F3B); + } + case 417: break; + case 255: + { return toUnicode(0x1F92); + } + case 418: break; + case 91: + { return toUnicode(0x1FD1); + } + case 419: break; + case 121: + { return toUnicode(0x1F68); + } + case 420: break; + case 266: + { return toUnicode(0x1FA7); + } + case 421: break; + case 20: + { return toUnicode(0x0306); + } + case 422: break; + case 234: + { return toUnicode(0x1F0D); + } + case 423: break; + case 198: + { return toUnicode(0x1F64); + } + case 424: break; + case 264: + { return toUnicode(0x1FA3); + } + case 425: break; + case 146: + { return toUnicode(0x1FFB); + } + case 426: break; + case 12: + { return toUnicode(0x0302); + } + case 427: break; + case 103: + { return toUnicode(0x1F60); + } + case 428: break; + case 289: + { return "("; + } + case 429: break; + case 177: + { return toUnicode(0x1F36); + } + case 430: break; + case 275: + { return toUnicode(0x1F8E); + } + case 431: break; + case 175: + { return toUnicode(0x1F32); + } + case 432: break; + case 49: + { return toUnicode(0x03a9); + } + case 433: break; + case 269: + { return toUnicode(0x1F8A); + } + case 434: break; + case 116: + { return toUnicode(0x1F08); + } + case 435: break; + case 107: + { return toUnicode(0x1FF6); + } + case 436: break; + case 267: + { return ">"; + } + case 437: break; + case 48: + { return toUnicode(0x03a5); + } + case 438: break; + case 58: + { return toUnicode(0x039c); + } + case 439: break; + case 150: + { return toUnicode(0x1F04); + } + case 440: break; + case 205: + { return toUnicode(0x1FF2); + } + case 441: break; + case 50: + { return toUnicode(0x03a1); + } + case 442: break; + case 246: + { return toUnicode(0x1F89); + } + case 443: break; + case 130: + { return toUnicode(0x1FC8); + } + case 444: break; + case 
67: + { return toUnicode(0x1F00); + } + case 445: break; + case 34: + { return toUnicode(0x03be); + } + case 446: break; + case 221: + { return toUnicode(0x1F2E); + } + case 447: break; + case 253: + { return toUnicode(0x1F85); + } + case 448: break; + case 173: + { return toUnicode(0x1FC4); + } + case 449: break; + case 24: + { return toUnicode(0x0323); + } + case 450: break; + case 30: + { return toUnicode(0x03ba); + } + case 451: break; + case 210: + { return toUnicode(0x1F2A); + } + case 452: break; + case 156: + { return toUnicode(0x1F81); + } + case 453: break; + case 193: + { return toUnicode(0x1F57); + } + case 454: break; + case 191: + { return toUnicode(0x1F53); + } + case 455: break; + case 141: + { return toUnicode(0x1FEA); + } + case 456: break; + case 124: + { return toUnicode(0x1F29); + } + case 457: break; + case 37: + { return toUnicode(0x03c6); + } + case 458: break; + case 169: + { return toUnicode(0x1F25); + } + case 459: break; + case 106: + { return toUnicode(0x1F7D); + } + case 460: break; + case 113: + { return toUnicode(0x1FBC); + } + case 461: break; + case 66: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { return toUnicode(0x03c2); + } + case 462: break; + case 144: + { return toUnicode(0x1FE9); + } + case 463: break; + case 80: + { return toUnicode(0x1F21); + } + case 464: break; + case 110: + { return toUnicode(0x1FE5); + } + case 465: break; + case 1: + { return yytext(); + } + case 466: break; + case 231: + { return toUnicode(0x1F4B); + } + case 467: break; + case 102: + { return toUnicode(0x1FE1); + } + case 468: break; + case 94: + { return toUnicode(0x1F78); + } + case 469: break; + case 159: + { return toUnicode(0x1FB7); + } + case 470: break; + case 235: + { return toUnicode(0x1F1D); + } + case 471: break; + case 81: + { return toUnicode(0x1F74); + } + case 472: break; + case 72: + { return toUnicode(0x1FB3); + } + case 473: break; + case 69: + { return toUnicode(0x1F70); + } + case 474: 
break; + case 45: + { return toUnicode(0x0397); + } + case 475: break; + case 276: + { return toUnicode(0x1F9E); + } + case 476: break; + case 52: + { return toUnicode(0x0393); + } + case 477: break; + case 184: + { return toUnicode(0x1F42); + } + case 478: break; + case 15: + { return toUnicode(0x03b9); + } + case 479: break; + case 270: + { return toUnicode(0x1F9A); + } + case 480: break; + case 117: + { return toUnicode(0x1F18); + } + case 481: break; + case 286: + { return toUnicode(0x1FAF); + } + case 482: break; + case 13: + { return toUnicode(0x03b5); + } + case 483: break; + case 161: + { return toUnicode(0x1F14); + } + case 484: break; + case 219: + { return toUnicode(0x1F6C); + } + case 485: break; + case 280: + { return toUnicode(0x1FAB); + } + case 486: break; + case 7: + { return toUnicode(0x03b1); + } + case 487: break; + case 247: + { return toUnicode(0x1F99); + } + case 488: break; + case 137: + { return toUnicode(0x1FD8); + } + case 489: break; + case 75: + { return toUnicode(0x1F10); + } + case 490: break; + case 222: + { return toUnicode(0x1F3E); + } + case 491: break; + case 259: + { return toUnicode(0x1F95); + } + case 492: break; + case 211: + { return toUnicode(0x1F3A); + } + case 493: break; + case 171: + { return toUnicode(0x1F91); + } + case 494: break; + case 90: + { return toUnicode(0x1FD0); + } + case 495: break; + case 203: + { return toUnicode(0x1F67); + } + case 496: break; + case 263: + { return toUnicode(0x1FA6); + } + case 497: break; + case 214: + { return toUnicode(0x1F0C); + } + case 498: break; + case 201: + { return toUnicode(0x1F63); + } + case 499: break; + case 261: + { return toUnicode(0x1FA2); + } + case 500: break; + case 145: + { return toUnicode(0x1FFA); + } + case 501: break; + case 125: + { return toUnicode(0x1F39); + } + case 502: break; + case 11: + { return toUnicode(0x0301); + } + case 503: break; + case 290: + { return "'"; + } + case 504: break; + case 179: + { return toUnicode(0x1F35); + } + case 505: break; 
+ case 281: + { return toUnicode(0x1F8D); + } + case 506: break; + case 134: + { return toUnicode(0x1FCC); + } + case 507: break; + case 140: + { return toUnicode(0x1FF9); + } + case 508: break; + case 86: + { return toUnicode(0x1F31); + } + case 509: break; + case 65: + { return toUnicode(0x03a8); + } + case 510: break; + case 47: + { return toUnicode(0x039f); + } + case 511: break; + case 155: + { return toUnicode(0x1F07); + } + case 512: break; + case 244: + { return toUnicode(0x1F5F); + } + case 513: break; + case 62: + { return toUnicode(0x03a4); + } + case 514: break; + case 57: + { return toUnicode(0x039b); + } + case 515: break; + case 153: + { return toUnicode(0x1F03); + } + case 516: break; + case 232: + { return toUnicode(0x1F5B); + } + case 517: break; + case 61: + { return toUnicode(0x03a0); + } + case 518: break; + case 224: + { return toUnicode(0x1F88); + } + case 519: break; + case 174: + { return toUnicode(0x1FC7); + } + case 520: break; + case 33: + { return toUnicode(0x03bd); + } + case 521: break; + case 236: + { return toUnicode(0x1F2D); + } + case 522: break; + case 250: + { return toUnicode(0x1F84); + } + case 523: break; + case 84: + { return toUnicode(0x1FC3); + } + case 524: break; + case 152: + { return toUnicode(0x1F80); + } + case 525: break; + case 3: + { return "f"; + } + case 526: break; + case 190: + { return toUnicode(0x1F56); + } + case 527: break; + case 188: + { return toUnicode(0x1F52); + } + case 528: break; + case 18: + { return toUnicode(0x03c9); + } + case 529: break; + case 118: + { return toUnicode(0x1F28); + } + case 530: break; + case 17: + { return toUnicode(0x03c5); + } + case 531: break; + case 165: + { return toUnicode(0x1F24); + } + case 532: break; + case 105: + { return toUnicode(0x1F7C); + } + case 533: break; + case 112: + { return toUnicode(0x1FBB); + } + case 534: break; + case 23: + { return toUnicode(0x03c1); + } + case 535: break; + case 143: + { return toUnicode(0x1FE8); + } + case 536: break; + case 79: 
+ { return toUnicode(0x1F20); + } + case 537: break; + case 109: + { return toUnicode(0x1FE4); + } + case 538: break; + case 212: + { return toUnicode(0x1F4A); + } + case 539: break; + case 101: + { return toUnicode(0x1FE0); + } + case 540: break; + case 88: + { return toUnicode(0x1F77); + } + case 541: break; + case 71: + { return toUnicode(0x1FB6); + } + case 542: break; + case 215: + { return toUnicode(0x1F1C); + } + case 543: break; + case 78: + { return toUnicode(0x1F73); + } + case 544: break; + case 157: + { return toUnicode(0x1FB2); + } + case 545: break; + case 126: + { return toUnicode(0x1F49); + } + case 546: break; + case 41: + { return "F"; + } + case 547: break; + case 54: + { return toUnicode(0x0396); + } + case 548: break; + case 187: + { return toUnicode(0x1F45); + } + case 549: break; + case 282: + { return toUnicode(0x1F9D); + } + case 550: break; + case 51: + { return toUnicode(0x0392); + } + case 551: break; + case 93: + { return toUnicode(0x1F41); + } + case 552: break; + case 29: + { return toUnicode(0x03b8); + } + case 553: break; + case 245: + { return toUnicode(0x1F6F); + } + case 554: break; + case 277: + { return toUnicode(0x1FAE); + } + case 555: break; + case 27: + { return toUnicode(0x03b4); + } + case 556: break; + case 162: + { return toUnicode(0x1F13); + } + case 557: break; + case 233: + { return toUnicode(0x1F6B); + } + case 558: break; + case 271: + { return toUnicode(0x1FAA); + } + case 559: break; + case 225: + { return toUnicode(0x1F98); + } + case 560: break; + case 183: + { return toUnicode(0x1FD7); + } + case 561: break; + case 237: + { return toUnicode(0x1F3D); + } + case 562: break; + case 256: + { return toUnicode(0x1F94); + } + case 563: break; + case 182: + { return toUnicode(0x1FD3); + } + case 564: break; + case 248: + { return toUnicode(0x1FA9); + } + case 565: break; + case 22: + { return toUnicode(0x0308); + } + case 566: break; + case 167: + { return toUnicode(0x1F90); + } + case 567: break; + case 241: + { 
return toUnicode(0x1F0F); + } + case 568: break; + case 199: + { return toUnicode(0x1F66); + } + case 569: break; + case 5: + { return "."; + } + case 570: break; + case 265: + { return toUnicode(0x1FA5); + } + case 571: break; + case 21: + { return toUnicode(0x0304); + } + case 572: break; + case 227: + { return toUnicode(0x1F0B); + } + case 573: break; + case 197: + { return toUnicode(0x1F62); + } + case 574: break; + case 204: + { return toUnicode(0x1FA1); + } + case 575: break; + case 119: + { return toUnicode(0x1F38); + } + case 576: break; + case 10: + { return toUnicode(0x0300); + } + case 577: break; + case 176: + { return toUnicode(0x1F34); + } + case 578: break; + case 272: + { return toUnicode(0x1F8C); + } + case 579: break; + case 133: + { return toUnicode(0x1FCB); + } + case 580: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Buckwalter2Unicode.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Buckwalter2Unicode.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,121 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +%% +%{ + /* + * Betacode to Unicode conversion + */ + +%} + +%class Buckwalter2UnicodeLex +%public +%type java.lang.String +%unicode +%% + + +"<"[^>]+">" { return yytext(); } + +"'" { return "\u0621"; } /* Hamza */ +"|" { return "\u0622"; } /* ALEF WITH MADDA ABOVE from AraMorph */ +">" { return "\u0623"; } /* Hamza */ +"&" { return "\u0624"; } /* Hamza */ +"<" { return "\u0625"; } /* Alif + HamzaBelow */ +"}" { return "\u0626"; } /* Ya + HamzaAbove */ +"A" { return "\u0627"; } /* Alif */ +"b" { return "\u0628"; } /* Ba */ +"p" { return "\u0629"; } /* TaMarbuta */ +"t" { return "\u062A"; } /* Ta */ +"v" { return 
"\u062B"; } /* Tha */ +"j" { return "\u062C"; } /* Jeem */ +"H" { return "\u062D"; } /* HHa */ +"x" { return "\u062E"; } /* Kha */ +"d" { return "\u062F"; } /* Dal */ +"*" { return "\u0630"; } /* Thal */ +"r" { return "\u0631"; } /* Ra */ +"z" { return "\u0632"; } /* Zain */ +"s" { return "\u0633"; } /* Seen */ +"$" { return "\u0634"; } /* Sheen */ +"S" { return "\u0635"; } /* Sad */ +"D" { return "\u0636"; } /* DDad */ +"T" { return "\u0637"; } /* TTa */ +"Z" { return "\u0638"; } /* DTha */ +"E" { return "\u0639"; } /* Ain */ +"g" { return "\u063A"; } /* Ghain */ + +"_" { return "\u0640"; } /* Tatweel */ +"f" { return "\u0641"; } /* Fa */ +"q" { return "\u0642"; } /* Qaf */ +"k" { return "\u0643"; } /* Kaf */ +"l" { return "\u0644"; } /* Lam */ +"m" { return "\u0645"; } /* Meem */ +"n" { return "\u0646"; } /* Noon */ +"h" { return "\u0647"; } /* Ha */ +"w" { return "\u0648"; } /* Waw */ +"Y" { return "\u0649"; } /* AlifMaksura */ +"y" { return "\u064A"; } /* Ya */ +"F" { return "\u064B"; } /* Fathatan */ +"N" { return "\u064C"; } /* Dammatan */ +"K" { return "\u064D"; } /* Kasratan */ +"a" { return "\u064E"; } /* Fatha */ +"u" { return "\u064F"; } /* Damma */ +"i" { return "\u0650"; } /* Kasra */ +"~" { return "\u0651"; } /* Shadda */ +"o" { return "\u0652"; } /* Sukun */ +"^" { return "\u0653"; } /* Maddah */ +"#" { return "\u0654"; } /* HamzaAbove */ + +"`" { return "\u0670"; } /* AlifKhanjareeya */ +"{" { return "\u0671"; } /* Alif + HamzatWasl */ + +"P" { return "\u067E"; } /* PEH from AraMorph */ +"J" { return "\u0686"; } /* TCHEH from AraMorph */ +"V" { return "\u06A4"; } /* VEH from AraMorph */ +"G" { return "\u06AF"; } /* GAF from AraMorph */ +"R" { return "\u0698"; } /* JEH from AraMorph */ +"?" 
{ return "\u061F"; } /* QUESTION MARK from AraMorph */ + +":" { return "\u06DC"; } /* SmallHighSeen */ +"@" { return "\u06DF"; } /* SmallHighRoundedZero */ + +"[" { return "\u06E2"; } /* SmallHighMeemIsolatedForm */ +";" { return "\u06E3"; } /* SmallLowSeen */ +"," { return "\u06E5"; } /* SmallWaw */ +"." { return "\u06E6"; } /* SmallYa */ +"!" { return "\u06E8"; } /* SmallHighNoon */ +"-" { return "\u06EA"; } /* EmptyCentreLowStop */ +"+" { return "\u06EB"; } /* EmptyCentreHighStop */ +"%" { return "\u06EC"; } /* RoundedHighStopWithFilledCentre */ +"]" { return "\u06ED"; } /* SmallLowMeem */ + +[\&_]"vert;" { return "|"; } +[\&_]"lpar;" { return "("; } +[\&_]"rpar;" { return ")"; } +[\_\&]"lt;" { return "<"; } +[\_\&]"gt;" { return ">"; } +"'" { return "'"; } + +"&"[a-zA-Z]+";" { return yytext(); } + +. { return yytext(); } +\n { return yytext(); } + +/* disabled: this rule causes problems */ +/* "\\"" { return "\u06E0"; } SmallHighUprightRectangularZero */ + + +/* disabled: duplicate entries (these input characters are already mapped above) */ +/* "," { return "\u060C"; } COMMA from AraMorph */ +/* ";" { return "\u061B"; } SEMICOLON from AraMorph */ + +/* characters not contained in the Buckwalter transliteration */ +/* \u0679 : ARABIC LETTER TTEH */ +/* \u0688 : ARABIC LETTER DDAL */ +/* \u06A9 : ARABIC LETTER KEHEH */ +/* \u0691 : ARABIC LETTER RREH */ +/* \u06BA : ARABIC LETTER NOON GHUNNA */ +/* \u06BE : ARABIC LETTER HEH DOACHASHMEE */ +/* \u06C1 : ARABIC LETTER HEH GOAL */ +/* \u06D2 : ARABIC LETTER YEH BARREE */ + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Buckwalter2UnicodeLex.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Buckwalter2UnicodeLex.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,909 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.transcode; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 20.11.09 17:57 from the specification file + *
/Users/jwillenborg/java/existDevMai2009/mpdl/extensions/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Buckwalter2Unicode.lex + */ +public class Buckwalter2UnicodeLex { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int YYINITIAL = 0; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\0\26\0\1\76\1\0\1\57\1\24\1\101\1\5\1\3"+ + "\2\0\1\20\1\100\1\74\1\77\1\75\1\0\1\104\2\0\1\105"+ + "\5\0\1\106\1\70\1\73\1\1\1\0\1\2\1\67\1\71\1\7"+ + "\2\107\1\26\1\31\1\46\1\65\1\15\1\107\1\63\1\50\2\107"+ + "\1\47\1\107\1\62\1\107\1\66\1\25\1\27\1\107\1\64\2\107"+ + "\1\44\1\30\1\72\1\0\1\102\1\56\1\33\1\60\1\51\1\10"+ + "\1\107\1\17\1\103\1\34\1\32\1\42\1\53\1\14\1\36\1\37"+ + "\1\40\1\41\1\55\1\11\1\35\1\21\1\23\1\12\1\52\1\13"+ + "\1\43\1\16\1\45\1\22\1\61\1\4\1\6\1\54\uff81\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. 
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\1\0\1\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+ + "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+ + "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37"+ + "\1\40\1\41\1\42\1\43\1\44\1\45\1\46\1\47"+ + "\1\50\1\51\1\52\1\53\1\54\1\55\1\56\1\57"+ + "\1\60\1\61\1\62\1\63\1\64\1\65\1\66\1\67"+ + "\1\70\1\71\1\72\1\73\1\74\1\75\1\76\1\77"+ + "\1\100\1\101\1\102\1\103\30\0\1\104\1\0\1\105"+ + "\13\0\1\106\1\107\1\110\1\111"; + + private static int [] zzUnpackAction() { + int [] result = new int[110]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\110\0\220\0\110\0\110\0\110\0\330\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\u0120\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\u0168\0\u01b0\0\u01f8\0\u0240"+ + "\0\u0288\0\u02d0\0\u0318\0\u0360\0\u03a8\0\u03f0\0\u0438\0\u0480"+ + "\0\u04c8\0\u0510\0\u0558\0\u05a0\0\u05e8\0\u0630\0\u0678\0\u06c0"+ + "\0\u0708\0\u0750\0\u0798\0\u07e0\0\110\0\u0828\0\110\0\u0870"+ + 
"\0\u08b8\0\u0900\0\u0948\0\u0990\0\u09d8\0\u0a20\0\u0a68\0\u0ab0"+ + "\0\u0af8\0\u0b40\0\110\0\110\0\110\0\110"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[110]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\2\1\3\1\4\1\5\1\6\1\7\1\10\1\11"+ + "\1\12\1\13\1\14\1\15\1\16\1\17\1\20\1\21"+ + "\1\22\1\23\1\24\1\25\1\26\1\27\1\30\1\31"+ + "\1\32\1\33\1\34\1\35\1\36\1\37\1\40\1\41"+ + "\1\42\1\43\1\44\1\45\1\46\1\47\1\50\1\51"+ + "\1\52\1\53\1\54\1\55\1\56\1\57\1\60\1\61"+ + "\1\62\1\63\1\64\1\65\1\66\1\67\1\70\1\71"+ + "\1\72\1\73\1\74\1\75\1\76\1\77\1\100\1\101"+ + "\1\102\1\103\1\104\5\2\110\0\2\105\1\0\105\105"+ + "\7\0\4\106\1\107\4\106\1\0\1\110\2\106\1\0"+ + "\5\106\1\111\1\0\3\106\1\112\14\106\1\0\1\106"+ + "\1\0\1\113\2\0\5\106\14\0\1\106\3\0\1\106"+ + "\13\0\1\114\5\0\1\115\10\0\1\116\4\0\1\117"+ + "\50\0\2\105\1\2\105\105\7\0\11\106\1\0\3\106"+ + "\1\0\6\106\1\0\20\106\1\0\1\106\4\0\5\106"+ + "\4\0\1\2\7\0\1\106\3\0\1\106\7\0\11\106"+ + "\1\0\3\106\1\0\6\106\1\0\20\106\1\0\1\106"+ + "\4\0\5\106\4\0\1\2\7\0\1\120\3\0\1\106"+ + "\7\0\2\106\1\121\6\106\1\0\3\106\1\0\6\106"+ + "\1\0\20\106\1\0\1\106\4\0\5\106\4\0\1\2"+ + "\7\0\1\106\3\0\1\106\7\0\3\106\1\122\5\106"+ + "\1\0\3\106\1\0\6\106\1\0\20\106\1\0\1\106"+ + "\4\0\5\106\4\0\1\2\7\0\1\106\3\0\1\106"+ + "\7\0\2\106\1\123\1\124\5\106\1\0\3\106\1\0"+ + "\6\106\1\0\20\106\1\0\1\106\4\0\5\106\4\0"+ + "\1\2\7\0\1\106\3\0\1\106\104\0\1\125\106\0"+ + 
"\1\126\15\0\1\127\110\0\1\130\106\0\1\131\1\132"+ + "\104\0\11\106\1\0\1\133\2\106\1\0\6\106\1\0"+ + "\20\106\1\0\1\106\4\0\5\106\4\0\1\2\7\0"+ + "\1\106\3\0\1\106\7\0\11\106\1\0\3\106\1\0"+ + "\6\106\1\0\15\106\1\134\2\106\1\0\1\106\4\0"+ + "\5\106\4\0\1\2\7\0\1\106\3\0\1\106\7\0"+ + "\11\106\1\0\3\106\1\0\6\106\1\0\20\106\1\0"+ + "\1\106\4\0\5\106\4\0\1\135\7\0\1\106\3\0"+ + "\1\106\7\0\11\106\1\0\3\106\1\0\6\106\1\0"+ + "\15\106\1\136\2\106\1\0\1\106\4\0\5\106\4\0"+ + "\1\2\7\0\1\106\3\0\1\106\7\0\11\106\1\0"+ + "\3\106\1\0\6\106\1\0\20\106\1\0\1\106\4\0"+ + "\5\106\4\0\1\137\7\0\1\106\3\0\1\106\105\0"+ + "\1\140\23\0\1\141\137\0\1\142\131\0\1\135\65\0"+ + "\1\143\131\0\1\137\23\0\3\106\1\144\5\106\1\0"+ + "\3\106\1\0\6\106\1\0\20\106\1\0\1\106\4\0"+ + "\5\106\4\0\1\2\7\0\1\106\3\0\1\106\7\0"+ + "\11\106\1\0\1\145\2\106\1\0\6\106\1\0\20\106"+ + "\1\0\1\106\4\0\5\106\4\0\1\2\7\0\1\106"+ + "\3\0\1\106\7\0\11\106\1\0\1\146\2\106\1\0"+ + "\6\106\1\0\20\106\1\0\1\106\4\0\5\106\4\0"+ + "\1\2\7\0\1\106\3\0\1\106\106\0\1\147\13\0"+ + "\1\150\116\0\1\151\107\0\1\152\75\0\11\106\1\0"+ + "\3\106\1\0\6\106\1\0\20\106\1\0\1\106\4\0"+ + "\5\106\4\0\1\153\7\0\1\106\3\0\1\106\7\0"+ + "\11\106\1\0\3\106\1\0\6\106\1\0\20\106\1\0"+ + "\1\106\4\0\5\106\4\0\1\154\7\0\1\106\3\0"+ + "\1\106\7\0\11\106\1\0\3\106\1\0\6\106\1\0"+ + "\20\106\1\0\1\106\4\0\5\106\4\0\1\155\7\0"+ + "\1\106\3\0\1\106\73\0\1\156\107\0\1\153\107\0"+ + "\1\154\107\0\1\155\14\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[2952]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + 
+ /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above, indexed by error code */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState. As tested in yylex(): bit value 1 marks an accepting state; an accepting state that also has bit value 8 ends the scan immediately. + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\1\0\1\11\1\1\3\11\1\1\25\11\1\1\47\11"+ + "\30\0\1\11\1\0\1\11\13\0\4\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[110]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); /* run length */ + int value = packed.charAt(i++); /* value written count times */ + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the text position at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of
the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + /* + * Buckwalter transliteration to Unicode conversion + */ + + + + /** + * Creates a new scanner. + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public Buckwalter2UnicodeLex(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also a java.io.Reader version of this constructor. + * + * @param in the java.io.InputStream to read input from; it is wrapped in an InputStreamReader without an explicit charset, so the default charset is used for decoding. + */ + public Buckwalter2UnicodeLex(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table (pairs of run length and value) + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 178) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input.
+ * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) by moving the unread text to the start of the buffer */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: grow it to twice the current position */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0: no more input + return true; + } + + + /** + * Closes the input stream and marks the scanner as being at EOF. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset; the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to YYINITIAL.
+ * + * @param reader the new input reader + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state. + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression as a newly created String. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster (no String is created; pos is not checked against yylength()). + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occurred while scanning. + * + * In a well-formed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules.
+ * + * @param errorCode the code of the error message to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; /* code outside the table: use the generic message */ + } + + throw new Error(message); /* never returns normally */ + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by the next call of the scanning method. + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token, or null once the end of input is reached + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break
zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; /* no transition for this input: stop and use the last recorded match */ + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { /* bit 0 set: remember this state and position as the latest match */ + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; /* NOTE(review): bit 3 appears to mark states that cannot be extended - confirm against the JFlex skeleton */ + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 54: + { return "\u06AF"; + } + case 74: break; + case 10: + { return "\u0629"; + } + case 75: break; + case 26: + { return "\u0639"; + } + case 76: break; + case 9: + { return "\u0628"; + } + case 77: break; + case 37: + { return "\u0649"; + } + case 78: break; + case 25: + { return "\u0638"; + } + case 79: break; + case 8: + { return "\u0627"; + } + case 80: break; + case 58: + { return "\u06DF"; + } + case 81: break; + case 36: + { return "\u0648"; + } + case 82: break; + case 68: + { return ">"; + } + case 83: break; + case 24: + { return "\u0637"; + } + case 84: break; + case 7: + { return "\u0626"; + } + case 85: break; + case 35: + { return "\u0647"; + } + case 86: break; + case 23: + { return "\u0636"; + } + case 87: break; + case 2: + { return "\u0625"; + } + case 88: break; + case 69: + { return "<"; + } + case 89: break; + case 34: + { return "\u0646"; + } + case 90: break; + case 67: + { return "\u06ED"; + } + case 91: break; + case 22: + { return "\u0635"; + } + case 92: break; + case 6: + { return "\u0624"; + } + case 93: break; + case 57: + { return "\u06DC"; + } + case 94: break; + case 33: + { return "\u0645"; + } + case 95: break; + case 66: + { return "\u06EC"; + } + case 96: break; + case 21: + { return "\u0634"; + } + case 97: break; + case 3: + { return "\u0623"; + } + case 98: break; + case 32: + { return "\u0644"; + } + case 99: break; + case 70: + { return "|"; + } + case 100: break; + case 65: + { return "\u06EB"; + } +
case 101: break; + case 20: + { return "\u0633"; + } + case 102: break; + case 55: + { return "\u0698"; + } + case 103: break; + case 5: + { return "\u0622"; + } + case 104: break; + case 48: + { return "\u0654"; + } + case 105: break; + case 31: + { return "\u0643"; + } + case 106: break; + case 19: + { return "\u0632"; + } + case 107: break; + case 64: + { return "\u06EA"; + } + case 108: break; + case 4: + { return "\u0621"; + } + case 109: break; + case 52: + { return "\u0686"; + } + case 110: break; + case 47: + { return "\u0653"; + } + case 111: break; + case 30: + { return "\u0642"; + } + case 112: break; + case 18: + { return "\u0631"; + } + case 113: break; + case 46: + { return "\u0652"; + } + case 114: break; + case 29: + { return "\u0641"; + } + case 115: break; + case 17: + { return "\u0630"; + } + case 116: break; + case 45: + { return "\u0651"; + } + case 117: break; + case 28: + { return "\u0640"; + } + case 118: break; + case 44: + { return "\u0650"; + } + case 119: break; + case 1: + { return yytext(); /* the matched text is passed through unchanged */ + } + case 120: break; + case 50: + { return "\u0671"; + } + case 121: break; + case 49: + { return "\u0670"; + } + case 122: break; + case 63: + { return "\u06E8"; + } + case 123: break; + case 53: + { return "\u06A4"; + } + case 124: break; + case 56: + { return "\u061F"; + } + case 125: break; + case 16: + { return "\u062F"; + } + case 126: break; + case 62: + { return "\u06E6"; + } + case 127: break; + case 15: + { return "\u062E"; + } + case 128: break; + case 61: + { return "\u06E5"; + } + case 129: break; + case 43: + { return "\u064F"; + } + case 130: break; + case 14: + { return "\u062D"; + } + case 131: break; + case 42: + { return "\u064E"; + } + case 132: break; + case 60: + { return "\u06E3"; + } + case 133: break; + case 13: + { return "\u062C"; + } + case 134: break; + case 41: + { return "\u064D"; + } + case 135: break; + case 59: + { return "\u06E2"; + } + case 136: break; + case 12: + { return "\u062B"; + } + case 137: break; +
case 40: + { return "\u064C"; + } + case 138: break; + case 11: + { return "\u062A"; + } + case 139: break; + case 51: + { return "\u067E"; + } + case 140: break; + case 39: + { return "\u064B"; + } + case 141: break; + case 27: + { return "\u063A"; + } + case 142: break; + case 38: + { return "\u064A"; + } + case 143: break; + case 71: + { return ")"; + } + case 144: break; + case 72: + { return "("; + } + case 145: break; + case 73: + { return "'"; + } + case 146: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +}
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Transcoder.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Transcoder.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,258 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.text.transcode;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.io.UnsupportedEncodingException;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import edu.unc.epidoc.transcoder.TransCoder;
+
+/**
+ * Transcodes text between transliterations (Greek Beta Code, Arabic
+ * Buckwalter) and Unicode, and percent-encodes text as Big5 bytes.
+ */
+public class Transcoder {
+  private static Transcoder instance;
+  private TransCoder betaCodeTranscoder;
+
+  /**
+   * Returns the shared instance, creating it on first use.
+   * The lazy creation is not synchronized.
+   */
+  public static Transcoder getInstance() {
+    if (instance == null) {
+      instance = new Transcoder();
+    }
+    return instance;
+  }
+
+  /**
+   * Transcodes Beta Code to Unicode with the EpiDoc transcoder
+   * (parser "BetaCode", converter "UnicodeC").
+   *
+   * @param inputStr the Beta Code text
+   * @return the text delivered by the EpiDoc transcoder
+   * @throws ApplicationException wrapping any exception raised while
+   *         setting up or running the EpiDoc transcoder
+   */
+  public String transcodeFromBetaCode2UnicodeEpidoc(String inputStr) throws ApplicationException {
+    try {
+      if (betaCodeTranscoder == null) {
+        // cache the transcoder only once it is fully configured, so that a
+        // failing setup cannot leave a half-configured one for later calls
+        TransCoder transcoder = new TransCoder();
+        transcoder.setParser("BetaCode");
+        transcoder.setConverter("UnicodeC");
+        betaCodeTranscoder = transcoder;
+      }
+      return betaCodeTranscoder.getString(inputStr);
+    } catch (Exception e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  /**
+   * Transcodes Beta Code to Unicode with the Betacode2UnicodeLex scanner.
+   * Afterwards every small sigma (U+03C3) that is followed by whitespace or
+   * ends the text is replaced by the final sigma (U+03C2).
+   *
+   * @param inputStr the Beta Code text
+   * @return the Unicode text
+   * @throws ApplicationException if the scanner fails with an I/O error
+   */
+  public String transcodeFromBetaCode2Unicode(String inputStr) throws ApplicationException {
+    Betacode2UnicodeLex betacode2UnicodeLex = new Betacode2UnicodeLex(new StringReader(inputStr));
+    StringBuilder buf = new StringBuilder();
+    try {
+      // the scanner returns null once the input is used up
+      for (String token = betacode2UnicodeLex.yylex(); token != null; token = betacode2UnicodeLex.yylex()) {
+        buf.append(token);
+      }
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+    // a lookahead instead of a greedy "(.*)" prefix: every word-final sigma
+    // is converted, not only the last one of each line
+    return buf.toString().replaceAll("\u03c3(?=\\s|$)", "\u03c2");
+  }
+
+  /**
+   * Transcodes Unicode to Beta Code with the Unicode2BetacodeLex scanner.
+   *
+   * @param inputStr the Unicode text
+   * @return the Beta Code text
+   * @throws ApplicationException if the scanner fails with an I/O error
+   */
+  public String transcodeFromUnicode2BetaCode(String inputStr) throws ApplicationException {
+    Unicode2BetacodeLex unicode2BetacodeLex = new Unicode2BetacodeLex(new StringReader(inputStr));
+    StringBuilder buf = new StringBuilder();
+    try {
+      for (String token = unicode2BetacodeLex.yylex(); token != null; token = unicode2BetacodeLex.yylex()) {
+        buf.append(token);
+      }
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+    return buf.toString();
+  }
+
+  /**
+   * Transcodes Unicode to Buckwalter with the Unicode2BuckwalterLex scanner.
+   *
+   * @param inputStr the Unicode text
+   * @return the Buckwalter text
+   * @throws ApplicationException if the scanner fails with an I/O error
+   */
+  public String transcodeFromUnicode2Buckwalter(String inputStr) throws ApplicationException {
+    Unicode2BuckwalterLex unicode2BuckwalterLex = new Unicode2BuckwalterLex(new StringReader(inputStr));
+    StringBuilder buf = new StringBuilder();
+    try {
+      for (String token = unicode2BuckwalterLex.yylex(); token != null; token = unicode2BuckwalterLex.yylex()) {
+        buf.append(token);
+      }
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+    return buf.toString();
+  }
+
+  /**
+   * Transcodes Buckwalter to Unicode with the Buckwalter2UnicodeLex scanner.
+   *
+   * @param inputStr the Buckwalter text
+   * @return the Unicode text
+   * @throws ApplicationException if the scanner fails with an I/O error
+   */
+  public String transcodeFromBuckwalter2Unicode(String inputStr) throws ApplicationException {
+    Buckwalter2UnicodeLex buckwalter2UnicodeLex = new Buckwalter2UnicodeLex(new StringReader(inputStr));
+    StringBuilder buf = new StringBuilder();
+    try {
+      for (String token = buckwalter2UnicodeLex.yylex(); token != null; token = buckwalter2UnicodeLex.yylex()) {
+        buf.append(token);
+      }
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+    return buf.toString();
+  }
+
+  /**
+   * Transcodes Buckwalter to Unicode with the character table copied from
+   * AraMorph instead of a scanner.
+   *
+   * @param inputStr the Buckwalter text
+   * @return the Unicode text
+   */
+  public String transcodeFromBuckwalter2UnicodeAraMorph(String inputStr) {
+    return arabizeWord(inputStr);
+  }
+
+  /**
+   * Encodes the text as Big5 bytes and writes every byte as '%' followed by
+   * two lowercase hexadecimal digits.
+   *
+   * @param inputStr the text to encode
+   * @return the encoded text, or the empty string if the JVM does not
+   *         support the "big5" charset
+   */
+  public String encodeBig5(String inputStr) {
+    String charset = "big5";
+    StringBuilder resultStr = new StringBuilder();
+    try {
+      byte[] resultBytes = inputStr.getBytes(charset);
+      for (int i = 0; i < resultBytes.length; i++) {
+        int unsigned = unsignedByteToInt(resultBytes[i]);
+        // always two digits: Integer.toHexString drops the leading zero of
+        // values below 0x10, which would give a malformed escape like "%a"
+        resultStr.append('%');
+        resultStr.append(Character.forDigit(unsigned >> 4, 16));
+        resultStr.append(Character.forDigit(unsigned & 0xF, 16));
+      }
+    } catch (UnsupportedEncodingException e) {
+      // deliberately best effort: without the charset nothing can be
+      // encoded and the empty string is returned
+    }
+    return resultStr.toString();
+  }
+
+  /** Returns the value of the byte read as an unsigned number (0 to 255). */
+  private int unsignedByteToInt(byte b) {
+    return b & 0xFF;
+  }
+
+  /*
+   * copied from http://www.nongnu.org/aramorph/english/download.html
+   * Class: AraMorph
+   *
+   * Replaces each Buckwalter character by its Arabic Unicode character.
+   */
+  private String arabizeWord(String translitered) {
+    String tmp_word = translitered;
+    // convert from the Buckwalter transliteration to Arabic characters
+    tmp_word = tmp_word.replaceAll("'", "\u0621"); //\u0621 : ARABIC LETTER HAMZA
+    tmp_word = tmp_word.replaceAll("\\|", "\u0622"); //\u0622 : ARABIC LETTER ALEF WITH MADDA ABOVE
+    tmp_word = tmp_word.replaceAll(">", "\u0623"); //\u0623 : ARABIC LETTER ALEF WITH HAMZA ABOVE
+    tmp_word = tmp_word.replaceAll("&", "\u0624"); //\u0624 : ARABIC LETTER WAW WITH HAMZA ABOVE
+    tmp_word = tmp_word.replaceAll("<", "\u0625"); //\u0625 : ARABIC LETTER ALEF WITH HAMZA BELOW
+    tmp_word = tmp_word.replaceAll("}", "\u0626"); //\u0626 : ARABIC LETTER YEH WITH HAMZA ABOVE
+    tmp_word = tmp_word.replaceAll("A", "\u0627"); //\u0627 : ARABIC LETTER ALEF
+    tmp_word = tmp_word.replaceAll("b", "\u0628"); //\u0628 : ARABIC LETTER BEH
+    tmp_word = tmp_word.replaceAll("p", "\u0629"); //\u0629 : ARABIC LETTER TEH MARBUTA
+    tmp_word = tmp_word.replaceAll("t", "\u062A"); //\u062A : ARABIC LETTER TEH
+    tmp_word = tmp_word.replaceAll("v", "\u062B"); //\u062B : ARABIC LETTER THEH
+    tmp_word = tmp_word.replaceAll("j", "\u062C"); //\u062C : ARABIC LETTER JEEM
+    tmp_word = tmp_word.replaceAll("H", "\u062D"); //\u062D : ARABIC LETTER HAH
+    tmp_word = tmp_word.replaceAll("x", "\u062E"); //\u062E : ARABIC LETTER KHAH
+    tmp_word = tmp_word.replaceAll("d", "\u062F"); //\u062F : ARABIC LETTER DAL
+    tmp_word = tmp_word.replaceAll("\\*", "\u0630"); //\u0630 : ARABIC LETTER THAL
+    tmp_word = tmp_word.replaceAll("r", "\u0631"); //\u0631 : ARABIC LETTER REH
+    tmp_word = tmp_word.replaceAll("z", "\u0632"); //\u0632 : ARABIC LETTER ZAIN
+    tmp_word = tmp_word.replaceAll("s", "\u0633" ); //\u0633 : ARABIC LETTER SEEN
+    tmp_word = tmp_word.replaceAll("\\$", "\u0634"); //\u0634 : ARABIC LETTER SHEEN
+    tmp_word = tmp_word.replaceAll("S", "\u0635"); //\u0635 : ARABIC LETTER SAD
+    tmp_word = tmp_word.replaceAll("D", "\u0636"); //\u0636 : ARABIC LETTER DAD
+    tmp_word = tmp_word.replaceAll("T", "\u0637"); //\u0637 : ARABIC LETTER TAH
+    tmp_word = tmp_word.replaceAll("Z", "\u0638"); //\u0638 : ARABIC LETTER ZAH
+    tmp_word = tmp_word.replaceAll("E", "\u0639"); //\u0639 : ARABIC LETTER AIN
+    tmp_word = tmp_word.replaceAll("g", "\u063A"); //\u063A : ARABIC LETTER GHAIN
+    tmp_word = tmp_word.replaceAll("_", "\u0640"); //\u0640 : ARABIC TATWEEL
+    tmp_word = tmp_word.replaceAll("f", "\u0641"); //\u0641 : ARABIC LETTER FEH
+    tmp_word = tmp_word.replaceAll("q", "\u0642"); //\u0642 : ARABIC LETTER QAF
+    tmp_word = tmp_word.replaceAll("k", "\u0643"); //\u0643 : ARABIC LETTER KAF
+    tmp_word = tmp_word.replaceAll("l", "\u0644"); //\u0644 : ARABIC LETTER LAM
+    tmp_word = tmp_word.replaceAll("m", "\u0645"); //\u0645 : ARABIC LETTER MEEM
+    tmp_word = tmp_word.replaceAll("n", "\u0646"); //\u0646 : ARABIC LETTER NOON
+    tmp_word = tmp_word.replaceAll("h", "\u0647"); //\u0647 : ARABIC LETTER HEH
+    tmp_word = tmp_word.replaceAll("w", "\u0648"); //\u0648 : ARABIC LETTER WAW
+    tmp_word = tmp_word.replaceAll("Y", "\u0649"); //\u0649 : ARABIC LETTER ALEF MAKSURA
+    tmp_word = tmp_word.replaceAll("y", "\u064A"); //\u064A : ARABIC LETTER YEH
+    tmp_word = tmp_word.replaceAll("F", "\u064B"); //\u064B : ARABIC FATHATAN
+    tmp_word = tmp_word.replaceAll("N", "\u064C"); //\u064C : ARABIC DAMMATAN
+    tmp_word = tmp_word.replaceAll("K", "\u064D"); //\u064D : ARABIC KASRATAN
+    tmp_word = tmp_word.replaceAll("a", "\u064E"); //\u064E : ARABIC FATHA
+    tmp_word = tmp_word.replaceAll("u", "\u064F"); //\u064F : ARABIC DAMMA
+    tmp_word = tmp_word.replaceAll("i", "\u0650"); //\u0650 : ARABIC KASRA
+    tmp_word = tmp_word.replaceAll("~", "\u0651"); //\u0651 : ARABIC SHADDA
+    tmp_word = tmp_word.replaceAll("o", "\u0652"); //\u0652 : ARABIC SUKUN
+    tmp_word = tmp_word.replaceAll("`", "\u0670"); //\u0670 : ARABIC LETTER SUPERSCRIPT ALEF
+    tmp_word = tmp_word.replaceAll("\\{", "\u0671"); //\u0671 : ARABIC LETTER ALEF WASLA
+    tmp_word = tmp_word.replaceAll("P", "\u067E"); //\u067E : ARABIC LETTER PEH
+    tmp_word = tmp_word.replaceAll("J", "\u0686"); //\u0686 : ARABIC LETTER TCHEH
+    tmp_word = tmp_word.replaceAll("V", "\u06A4"); //\u06A4 : ARABIC LETTER VEH
+    tmp_word = tmp_word.replaceAll("G", "\u06AF"); //\u06AF : ARABIC LETTER GAF
+    tmp_word = tmp_word.replaceAll("R", "\u0698"); //\u0698 : ARABIC LETTER JEH (no more in Buckwalter system)
+    //Not in Buckwalter system \u0679 : ARABIC LETTER TTEH
+    //Not in Buckwalter system \u0688 : ARABIC LETTER DDAL
+    //Not in Buckwalter system \u06A9 : ARABIC LETTER KEHEH
+    //Not in Buckwalter system \u0691 : ARABIC LETTER RREH
+    //Not in Buckwalter system
\u06BA : ARABIC LETTER NOON GHUNNA + //Not in Buckwalter system \u06BE : ARABIC LETTER HEH DOACHASHMEE + //Not in Buckwalter system \u06C1 : ARABIC LETTER HEH GOAL + //Not in Buckwalter system \u06D2 : ARABIC LETTER YEH BARREE + tmp_word = tmp_word.replaceAll(",", "\u060C" ); //\u060C : ARABIC COMMA + tmp_word = tmp_word.replaceAll(";", "\u061B"); //\u061B : ARABIC SEMICOLON + tmp_word = tmp_word.replaceAll("\\?", "\u061F"); //\u061F : ARABIC QUESTION MARK + return tmp_word; + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2Betacode.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2Betacode.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,319 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +%% + +%class Unicode2BetacodeLex +%public +%type java.lang.String +%unicode +%% + + +"<"[^>]\u+">" { return yytext(); } + +"H" { return "*j"; } +"h" { return "j"; } +"F" { return "*v"; } +"f" { return "v"; } +"\u03a3" { return "*s"; } + +"." 
{ return "!"; } +"\u00B7" { return ":"; } /* MPDL update */ + +"\u1F00" { return "a)"; } +"\u1F01" { return "a("; } +"\u1F02" { return "a)\\"; } +"\u1F03" { return "a(\\"; } +"\u1F04" { return "a)/"; } +"\u1F05" { return "a(/"; } +"\u1F06" { return "a)="; } +"\u1F07" { return "a(="; } +"\u1F08" { return "*)a"; } +"\u1F09" { return "*(a"; } +"\u1F0A" { return "*)\\a"; } +"\u1F0B" { return "*(\\a"; } +"\u1F0C" { return "*)/a"; } +"\u1F0D" { return "*(/a"; } +"\u1F0E" { return "*)=a"; } +"\u1F0F" { return "*(=a"; } +"\u1F10" { return "e)"; } +"\u1F11" { return "e("; } +"\u1F12" { return "e)\\"; } +"\u1F13" { return "e(\\"; } +"\u1F14" { return "e)/"; } +"\u1F15" { return "e(/"; } +"\u1F18" { return "*)e"; } +"\u1F19" { return "*(e"; } +"\u1F1A" { return "*)\\e"; } +"\u1F1B" { return "*(\\e"; } +"\u1F1C" { return "*)/e"; } +"\u1F1D" { return "*(/e"; } +"\u1F20" { return "h)"; } +"\u1F21" { return "h("; } +"\u1F22" { return "h)\\"; } +"\u1F23" { return "h(\\"; } +"\u1F24" { return "h)/"; } +"\u1F25" { return "h(/"; } +"\u1F26" { return "h)="; } +"\u1F27" { return "h(="; } +"\u1F28" { return "*)h"; } +"\u1F29" { return "*(h"; } +"\u1F2A" { return "*)\\h"; } +"\u1F2B" { return "*(\\h"; } +"\u1F2C" { return "*)/h"; } +"\u1F2D" { return "*(/h"; } +"\u1F2E" { return "*)=h"; } +"\u1F2F" { return "*(=h"; } +"\u1F30" { return "i)"; } +"\u1F31" { return "i("; } +"\u1F32" { return "i)\\"; } +"\u1F33" { return "i(\\"; } +"\u1F34" { return "i)/"; } +"\u1F35" { return "i(/"; } +"\u1F36" { return "i)="; } +"\u1F37" { return "i(="; } +"\u1F38" { return "*)i"; } +"\u1F39" { return "*(i"; } +"\u1F3A" { return "*)\\i"; } +"\u1F3B" { return "*(\\i"; } +"\u1F3C" { return "*)/i"; } +"\u1F3D" { return "*(/i"; } +"\u1F3E" { return "*)=i"; } +"\u1F3F" { return "*(=i"; } +"\u1F40" { return "o)"; } +"\u1F41" { return "o("; } +"\u1F42" { return "o)\\"; } +"\u1F43" { return "o(\\"; } +"\u1F44" { return "o)/"; } +"\u1F45" { return "o(/"; } +"\u1F48" { return "*)o"; } +"\u1F49" { return "*(o"; } 
+"\u1F4A" { return "*)\\o"; } +"\u1F4B" { return "*(\\o"; } +"\u1F4C" { return "*)/o"; } +"\u1F4D" { return "*(/o"; } +"\u1F50" { return "u)"; } +"\u1F51" { return "u("; } +"\u1F52" { return "u)\\"; } +"\u1F53" { return "u(\\"; } +"\u1F54" { return "u)/"; } +"\u1F55" { return "u(/"; } +"\u1F56" { return "u)="; } +"\u1F57" { return "u(="; } +"\u1F59" { return "*(u"; } +"\u1F5B" { return "*(\\u"; } +"\u1F5D" { return "*(/u"; } +"\u1F5F" { return "*(=u"; } +"\u1F60" { return "w)"; } +"\u1F61" { return "w("; } +"\u1F62" { return "w)\\"; } +"\u1F63" { return "w(\\"; } +"\u1F64" { return "w)/"; } +"\u1F65" { return "w(/"; } +"\u1F66" { return "w)="; } +"\u1F67" { return "w(="; } +"\u1F68" { return "*)w"; } +"\u1F69" { return "*(w"; } +"\u1F6A" { return "*)\\w"; } +"\u1F6B" { return "*(\\w"; } +"\u1F6C" { return "*)/w"; } +"\u1F6D" { return "*(/w"; } +"\u1F6E" { return "*)=w"; } +"\u1F6F" { return "*(=w"; } +"\u1F70" { return "a\\"; } +"\u1F71" { return "a/"; } +"\u1F72" { return "e\\"; } +"\u1F73" { return "e/"; } +"\u1F74" { return "h\\"; } +"\u1F75" { return "h/"; } +"\u1F76" { return "i\\"; } +"\u1F77" { return "i/"; } +"\u1F78" { return "o\\"; } +"\u1F79" { return "o/"; } +"\u1F7A" { return "u\\"; } +"\u1F7B" { return "u/"; } +"\u1F7C" { return "w\\"; } +"\u1F7D" { return "w/"; } +"\u1F80" { return "a)|"; } +"\u1F81" { return "a(|"; } +"\u1F82" { return "a)\\|"; } +"\u1F83" { return "a(\\|"; } +"\u1F84" { return "a)/|"; } +"\u1F85" { return "a(/|"; } +"\u1F86" { return "a)=|"; } +"\u1F87" { return "a(=|"; } +"\u1F88" { return "*)|a"; } +"\u1F89" { return "*(|a"; } +"\u1F8A" { return "*)\\|a"; } +"\u1F8B" { return "*(\\|a"; } +"\u1F8C" { return "*)/|a"; } +"\u1F8D" { return "*(/|a"; } +"\u1F8E" { return "*)=|a"; } +"\u1F8F" { return "*(=|a"; } +"\u1F90" { return "h)|"; } +"\u1F91" { return "h(|"; } +"\u1F92" { return "h)\\|"; } +"\u1F93" { return "h(\\|"; } +"\u1F94" { return "h)/|"; } +"\u1F95" { return "h(/|"; } +"\u1F96" { return "h)=|"; } +"\u1F97" { return 
"h(=|"; } +"\u1F98" { return "*)|h"; } +"\u1F99" { return "*(|h"; } +"\u1F9A" { return "*)\\|h"; } +"\u1F9B" { return "*(\\|h"; } +"\u1F9C" { return "*)/|h"; } +"\u1F9D" { return "*(/|h"; } +"\u1F9E" { return "*)=|h"; } +"\u1F9F" { return "*(=|h"; } +"\u1FA0" { return "w)|"; } +"\u1FA1" { return "w(|"; } +"\u1FA2" { return "w)\\|"; } +"\u1FA3" { return "w(\\|"; } +"\u1FA4" { return "w)/|"; } +"\u1FA5" { return "w(/|"; } +"\u1FA6" { return "w)=|"; } +"\u1FA7" { return "w(=|"; } +"\u1FA8" { return "*)|w"; } +"\u1FA9" { return "*(|w"; } +"\u1FAA" { return "*)\\|w"; } +"\u1FAB" { return "*(\\|w"; } +"\u1FAC" { return "*)/|w"; } +"\u1FAD" { return "*(/|w"; } +"\u1FAE" { return "*)=|w"; } +"\u1FAF" { return "*(=|w"; } +"\u1FB0" { return "a^"; } +"\u1FB1" { return "a_"; } +"\u1FB2" { return "a\\|"; } +"\u1FB3" { return "a|"; } +"\u1FB4" { return "a/|"; } +"\u1FB6" { return "a="; } +"\u1FB7" { return "a=|"; } +"\u1FB8" { return "*a^"; } +"\u1FB9" { return "*a_"; } +"\u1FBA" { return "*a\\"; } +"\u1FBB" { return "*a/"; } +"\u1FBC" { return "*a|"; } +"\u1FC2" { return "h\\|"; } +"\u1FC3" { return "h|"; } +"\u1FC4" { return "h/|"; } +"\u1FC6" { return "h="; } +"\u1FC7" { return "h=|"; } +"\u1FC8" { return "*e\\"; } +"\u1FC9" { return "*e/"; } +"\u1FCA" { return "*h\\"; } +"\u1FCB" { return "*h/"; } +"\u1FCC" { return "*h|"; } +"\u1FD0" { return "i^"; } +"\u1FD1" { return "i_"; } +"\u1FD2" { return "i+\\"; } +"\u1FD3" { return "i+/"; } +"\u1FD6" { return "i="; } +"\u1FD7" { return "i+="; } +"\u1FD8" { return "*i^"; } +"\u1FD9" { return "*i_"; } +"\u1FDA" { return "*i\\"; } +"\u1FDB" { return "*i/"; } +"\u1FE0" { return "u^"; } +"\u1FE1" { return "u_"; } +"\u1FE2" { return "u+\\"; } +"\u1FE3" { return "u+/"; } +"\u1FE4" { return "r)"; } +"\u1FE5" { return "r("; } +"\u1FE6" { return "u="; } +"\u1FE7" { return "u+="; } +"\u1FE8" { return "*u^"; } +"\u1FE9" { return "*u_"; } +"\u1FEA" { return "*u\\"; } +"\u1FEB" { return "*u/"; } +"\u1FEC" { return "*(r"; } +"\u1FF2" { return 
"w\\|"; } +"\u1FF3" { return "w|"; } +"\u1FF4" { return "w/|"; } +"\u1FFA" { return "*w\\"; } +"\u1FFB" { return "*w/"; } +"\u1FFC" { return "*w|"; } +"\u1FF6" { return "w="; } +"\u1FF7" { return "w=|"; } +"\u1FF8" { return "*o\\"; } +"\u1FF9" { return "*o/"; } + +"\u0300" { return "\\"; } +"\u0301" { return "/"; } +"\u0304" { return "_"; } +"\u0306" { return "^"; } +"\u0308" { return "+"; } +"\u0302" { return "="; } +"\u0313" { return ")"; } +"\u0314" { return "("; } +"\u0323" { return "?"; } +"\u0345" { return "|"; } + +"\u03b1" { return "a"; } /* MPDL update */ +"\u0391" { return "*a"; } /* MPDL update */ +"\u03b2" { return "b"; } /* MPDL update */ +"\u0392" { return "*b"; } /* MPDL update */ +"\u03b3" { return "g"; } /* MPDL update */ +"\u0393" { return "*g"; } /* MPDL update */ +"\u03b4" { return "d"; } /* MPDL update */ +"\u0394" { return "*d"; } /* MPDL update */ +"\u03b5" { return "e"; } /* MPDL update */ +"\u0395" { return "*e"; } /* MPDL update */ +"\u03b6" { return "z"; } /* MPDL update */ +"\u0396" { return "*z"; } /* MPDL update */ +"\u03b7" { return "h"; } /* MPDL update */ +"\u0397" { return "*h"; } /* MPDL update */ +"\u03b8" { return "q"; } /* MPDL update */ +"\u0398" { return "*q"; } /* MPDL update */ +"\u03b9" { return "i"; } /* MPDL update */ +"\u0399" { return "*i"; } /* MPDL update */ +"\u03ba" { return "k"; } /* MPDL update */ +"\u039a" { return "*k"; } /* MPDL update */ +"\u03bb" { return "l"; } /* MPDL update */ +"\u039b" { return "*l"; } /* MPDL update */ +"\u03bc" { return "m"; } /* MPDL update */ +"\u039c" { return "*m"; } /* MPDL update */ +"\u03bd" { return "n"; } /* MPDL update */ +"\u039d" { return "*n"; } /* MPDL update */ +"\u03be" { return "c"; } /* MPDL update */ +"\u039e" { return "*c"; } /* MPDL update */ +"\u03bf" { return "o"; } /* MPDL update */ +"\u039f" { return "*o"; } /* MPDL update */ +"\u03c0" { return "p"; } /* MPDL update */ +"\u03a0" { return "*p"; } /* MPDL update */ +"\u03c1" { return "r"; } /* MPDL update */ 
+"\u03a1" { return "*r"; } /* MPDL update */ + +"\u03a3" { return "*s"; } /* MPDL update */ +"\u03c3" { return "s1"; } /* mdh 2002-01-07 */ +"\u03c2"/\-\- { return "s"; } +"\u03c3"/\> }[a-z\?\!0-9*=\/()\'\-] { return "s"; } /* MPDL update */ +"\u03c2"/\< { return "s"; } /* MPDL update */ +"\u03c3"/[\[\]][a-z\?\!0-9*=\/()\'\-] { return "s"; } /* MPDL update */ +"\u03c2"/\??[^a-z0-9*=\/()\'\-\[\?] { return "s"; } +"\u03c3" { return "s"; } /* MPDL update */ + +"\u03c4" { return "t"; } /* MPDL update */ +"\u03a4" { return "*t"; } /* MPDL update */ +"\u03c5" { return "u"; } /* MPDL update */ +"\u03a5" { return "*u"; } /* MPDL update */ +"\u03c6" { return "f"; } /* MPDL update */ +"\u03a6" { return "*f"; } /* MPDL update */ +"\u03c7" { return "x"; } /* MPDL update */ +"\u03a7" { return "*x"; } /* MPDL update */ +"\u03c8" { return "y"; } /* MPDL update */ +"\u03a8" { return "*y"; } /* MPDL update */ +"\u03c9" { return "w"; } /* MPDL update */ +"\u03a9" { return "*w"; } /* MPDL update */ + +[\&_]"vert;" { return "|"; } +[\&_]"lpar;" { return "("; } +[\&_]"rpar;" { return ")"; } +[\_\&]"lt;" { return "<"; } +[\_\&]"gt;" { return ">"; } +"'" { return "'"; } /* MPDL update */ + +"&"[a-zA-Z]+";" { return yytext(); } + +. 
{ return yytext(); } +\n { return yytext(); } \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2BetacodeLex.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2BetacodeLex.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1866 @@ +/* The following code was generated by JFlex 1.4.3 on 14.12.10 15:03 */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.transcode; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 14.12.10 15:03 from the specification file + * /Users/jwillenborg/test/jflex/Unicode2Betacode.lex + */ +public class Unicode2BetacodeLex { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int YYINITIAL = 0; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\0\26\0\1\u0118\1\0\1\u0130\2\0\1\u0113\4\u011c\2\0"+ + "\1\u0112\1\11\1\u011c\1\u0131\2\u011c\1\u0132\5\u011c\1\u0133\1\0\1\u0116"+ + "\1\1\1\u011c\1\2\1\u011b\1\0\5\u0134\1\6\1\u0134\1\4\22\u0134"+ + "\1\u011d\1\0\1\u011a\1\0\1\u012a\1\0\1\u012f\3\u0135\1\u012c\1\7"+ + "\1\u0114\1\5\3\u0135\1\u0119\3\u0135\1\u012e\1\u0135\1\u012d\1\u0135\1\u0115"+ + "\1\3\1\u012b\4\u0135\2\0\1\u0117\71\0\1\12\u0248\0\1\344\1\345"+ + "\1\351\1\0\1\346\1\0\1\347\1\0\1\350\12\0\1\352\1\353"+ + "\16\0\1\354\41\0\1\355\113\0\1\357\1\361\1\363\1\365\1\367"+ + 
"\1\371\1\373\1\375\1\377\1\u0101\1\u0103\1\u0105\1\u0107\1\u0109\1\u010b"+ + "\1\u010d\1\u010f\1\0\1\10\1\u011f\1\u0121\1\u0123\1\u0125\1\u0127\1\u0129"+ + "\7\0\1\356\1\360\1\362\1\364\1\366\1\370\1\372\1\374\1\376"+ + "\1\u0100\1\u0102\1\u0104\1\u0106\1\u0108\1\u010a\1\u010c\1\u010e\1\u0111\1\u0110"+ + "\1\u011e\1\u0120\1\u0122\1\u0124\1\u0126\1\u0128\u1b36\0\1\13\1\14\1\15"+ + "\1\16\1\17\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+ + "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37\1\40\2\0"+ + "\1\41\1\42\1\43\1\44\1\45\1\46\2\0\1\47\1\50\1\51"+ + "\1\52\1\53\1\54\1\55\1\56\1\57\1\60\1\61\1\62\1\63"+ + "\1\64\1\65\1\66\1\67\1\70\1\71\1\72\1\73\1\74\1\75"+ + "\1\76\1\77\1\100\1\101\1\102\1\103\1\104\1\105\1\106\1\107"+ + "\1\110\1\111\1\112\1\113\1\114\2\0\1\115\1\116\1\117\1\120"+ + "\1\121\1\122\2\0\1\123\1\124\1\125\1\126\1\127\1\130\1\131"+ + "\1\132\1\0\1\133\1\0\1\134\1\0\1\135\1\0\1\136\1\137"+ + "\1\140\1\141\1\142\1\143\1\144\1\145\1\146\1\147\1\150\1\151"+ + "\1\152\1\153\1\154\1\155\1\156\1\157\1\160\1\161\1\162\1\163"+ + "\1\164\1\165\1\166\1\167\1\170\1\171\1\172\1\173\1\174\2\0"+ + "\1\175\1\176\1\177\1\200\1\201\1\202\1\203\1\204\1\205\1\206"+ + "\1\207\1\210\1\211\1\212\1\213\1\214\1\215\1\216\1\217\1\220"+ + "\1\221\1\222\1\223\1\224\1\225\1\226\1\227\1\230\1\231\1\232"+ + "\1\233\1\234\1\235\1\236\1\237\1\240\1\241\1\242\1\243\1\244"+ + "\1\245\1\246\1\247\1\250\1\251\1\252\1\253\1\254\1\255\1\256"+ + "\1\257\1\260\1\261\1\0\1\262\1\263\1\264\1\265\1\266\1\267"+ + "\1\270\5\0\1\271\1\272\1\273\1\0\1\274\1\275\1\276\1\277"+ + "\1\300\1\301\1\302\3\0\1\303\1\304\1\305\1\306\2\0\1\307"+ + "\1\310\1\311\1\312\1\313\1\314\4\0\1\315\1\316\1\317\1\320"+ + "\1\321\1\322\1\323\1\324\1\325\1\326\1\327\1\330\1\331\5\0"+ + "\1\332\1\333\1\334\1\0\1\340\1\341\1\342\1\343\1\335\1\336"+ + "\1\337\ue003\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA 
states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\1\0\2\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+ + "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+ + "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37"+ + "\1\40\1\41\1\42\1\43\1\44\1\45\1\46\1\47"+ + "\1\50\1\51\1\52\1\53\1\54\1\55\1\56\1\57"+ + "\1\60\1\61\1\62\1\63\1\64\1\65\1\66\1\67"+ + "\1\70\1\71\1\72\1\73\1\74\1\75\1\76\1\77"+ + "\1\100\1\101\1\102\1\103\1\104\1\105\1\106\1\107"+ + "\1\110\1\111\1\112\1\113\1\114\1\115\1\116\1\117"+ + "\1\120\1\121\1\122\1\123\1\124\1\125\1\126\1\127"+ + "\1\130\1\131\1\132\1\133\1\134\1\135\1\136\1\137"+ + "\1\140\1\141\1\142\1\143\1\144\1\145\1\146\1\147"+ + "\1\150\1\151\1\152\1\153\1\154\1\155\1\156\1\157"+ + "\1\160\1\161\1\162\1\163\1\164\1\165\1\166\1\167"+ + "\1\170\1\171\1\172\1\173\1\174\1\175\1\176\1\177"+ + "\1\200\1\201\1\202\1\203\1\204\1\205\1\206\1\207"+ + "\1\210\1\211\1\212\1\213\1\214\1\215\1\216\1\217"+ + "\1\220\1\221\1\222\1\223\1\224\1\225\1\226\1\227"+ + "\1\230\1\231\1\232\1\233\1\234\1\235\1\236\1\237"+ + "\1\240\1\241\1\242\1\243\1\244\1\245\1\246\1\247"+ + "\1\250\1\251\1\252\1\253\1\254\1\255\1\256\1\257"+ + "\1\260\1\261\1\262\1\263\1\264\1\265\1\266\1\267"+ + "\1\270\1\271\1\272\1\273\1\274\1\275\1\276\1\277"+ + "\1\300\1\301\1\302\1\303\1\304\1\305\1\306\1\307"+ + "\1\310\1\311\1\312\1\313\1\314\1\315\1\316\1\317"+ + "\1\320\1\321\1\322\1\323\1\324\1\325\1\326\1\327"+ + "\1\330\1\331\1\332\1\333\1\334\1\335\1\336\1\337"+ + "\1\340\1\341\1\342\1\343\1\344\1\345\1\346\1\347"+ + "\1\350\1\351\1\352\1\353\1\354\1\355\1\356\1\357"+ + "\1\360\1\361\1\362\1\363\1\364\1\365\1\366\1\367"+ + "\1\370\1\371\1\372\1\373\1\374\1\375\1\376\1\377"+ + "\1\u0100\1\u0101\1\u0102\1\u0103\1\u0104\1\u0105\1\u0106\1\u0107"+ + "\1\u0108\1\u0109\1\u010a\1\u010b\1\u010c\1\u010d\1\u010e\2\1"+ + 
"\1\u010f\1\u0110\1\u0111\1\u0112\1\u0113\1\u0114\1\u0115\1\u0116"+ + "\1\u0117\1\u0118\1\u0119\1\u011a\1\1\3\0\1\u011b\1\0"+ + "\1\u011b\33\0\1\u011c\1\u011d\17\0\1\u011e"; + + private static int [] zzUnpackAction() { + int [] result = new int[338]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\u0136\0\u026c\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + 
"\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u03a2"+ + "\0\u04d8\0\u060e\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0744\0\u087a"+ + "\0\u09b0\0\u0ae6\0\u0136\0\u0c1c\0\u0d52\0\u0e88\0\u0fbe\0\u10f4"+ + "\0\u122a\0\u1360\0\u1496\0\u15cc\0\u1702\0\u1838\0\u196e\0\u1aa4"+ + "\0\u1bda\0\u1d10\0\u1e46\0\u1f7c\0\u20b2\0\u21e8\0\u231e\0\u2454"+ + "\0\u258a\0\u26c0\0\u27f6\0\u292c\0\u2a62\0\u2b98\0\u2cce\0\u2e04"+ + "\0\u0136\0\u0136\0\u2f3a\0\u3070\0\u31a6\0\u32dc\0\u3412\0\u3548"+ + "\0\u367e\0\u37b4\0\u38ea\0\u3a20\0\u3b56\0\u3c8c\0\u3dc2\0\u3ef8"+ + "\0\u402e\0\u0136"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[338]; + int offset 
= 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\2\1\3\2\2\1\4\1\5\1\6\1\7\1\10"+ + "\1\11\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+ + "\1\21\1\22\1\23\1\24\1\25\1\26\1\27\1\30"+ + "\1\31\1\32\1\33\1\34\1\35\1\36\1\37\1\40"+ + "\1\41\1\42\1\43\1\44\1\45\1\46\1\47\1\50"+ + "\1\51\1\52\1\53\1\54\1\55\1\56\1\57\1\60"+ + "\1\61\1\62\1\63\1\64\1\65\1\66\1\67\1\70"+ + "\1\71\1\72\1\73\1\74\1\75\1\76\1\77\1\100"+ + "\1\101\1\102\1\103\1\104\1\105\1\106\1\107\1\110"+ + "\1\111\1\112\1\113\1\114\1\115\1\116\1\117\1\120"+ + "\1\121\1\122\1\123\1\124\1\125\1\126\1\127\1\130"+ + "\1\131\1\132\1\133\1\134\1\135\1\136\1\137\1\140"+ + "\1\141\1\142\1\143\1\144\1\145\1\146\1\147\1\150"+ + "\1\151\1\152\1\153\1\154\1\155\1\156\1\157\1\160"+ + "\1\161\1\162\1\163\1\164\1\165\1\166\1\167\1\170"+ + "\1\171\1\172\1\173\1\174\1\175\1\176\1\177\1\200"+ + "\1\201\1\202\1\203\1\204\1\205\1\206\1\207\1\210"+ + "\1\211\1\212\1\213\1\214\1\215\1\216\1\217\1\220"+ + "\1\221\1\222\1\223\1\224\1\225\1\226\1\227\1\230"+ + "\1\231\1\232\1\233\1\234\1\235\1\236\1\237\1\240"+ + "\1\241\1\242\1\243\1\244\1\245\1\246\1\247\1\250"+ + "\1\251\1\252\1\253\1\254\1\255\1\256\1\257\1\260"+ + "\1\261\1\262\1\263\1\264\1\265\1\266\1\267\1\270"+ + "\1\271\1\272\1\273\1\274\1\275\1\276\1\277\1\300"+ + "\1\301\1\302\1\303\1\304\1\305\1\306\1\307\1\310"+ + "\1\311\1\312\1\313\1\314\1\315\1\316\1\317\1\320"+ + "\1\321\1\322\1\323\1\324\1\325\1\326\1\327\1\330"+ + 
"\1\331\1\332\1\333\1\334\1\335\1\336\1\337\1\340"+ + "\1\341\1\342\1\343\1\344\1\345\1\346\1\347\1\350"+ + "\1\351\1\352\1\353\1\354\1\355\1\356\1\357\1\360"+ + "\1\361\1\362\1\363\1\364\1\365\1\366\1\367\1\370"+ + "\1\371\1\372\1\373\1\374\1\375\1\376\1\377\1\u0100"+ + "\1\u0101\1\u0102\1\u0103\1\u0104\1\u0105\1\u0106\1\u0107\1\u0108"+ + "\1\u0109\1\u010a\1\u010b\1\u010c\1\u010d\1\u010e\1\u010f\1\u0110"+ + "\1\u0111\1\2\1\u0112\12\2\1\u0113\1\u0114\1\u0115\1\u0116"+ + "\1\u0117\1\u0118\1\u0119\1\u011a\1\u011b\1\u011c\1\u011d\1\u011e"+ + "\1\u011f\13\2\u0136\0\2\u0120\1\0\u0133\u0120\u0113\0\1\u0121"+ + "\6\0\1\u0122\2\0\1\u0122\30\0\3\u0123\1\0\1\u0123"+ + "\1\0\1\u0123\1\0\u010a\u0123\1\u0124\1\u0125\2\0\3\u0123"+ + "\1\0\1\u0123\1\u0126\2\0\15\u0123\5\0\1\u0123\3\0"+ + "\1\u0123\4\0\5\u0127\u010c\0\1\u0128\1\u0127\3\0\1\u0129"+ + "\21\0\1\u012a\1\u0127\1\u012b\2\u0127\1\u012c\3\0\2\u0127"+ + "\u0114\0\1\u012d\4\0\1\u012e\21\0\1\u012f\1\0\1\u0130"+ + "\13\0\1\u0131\u0246\0\1\u0132\44\0\1\u0123\1\0\1\u0123"+ + "\1\0\1\u0123\u010a\0\1\u0123\1\0\2\u0123\2\0\2\u0123"+ + "\1\0\2\u0123\16\0\5\u0123\1\0\3\u0123\1\0\1\u0123"+ + "\u0112\0\1\u0123\u013c\0\1\u0133\34\0\3\u0123\1\0\1\u0123"+ + "\1\0\1\u0123\1\0\u010a\u0123\1\0\1\u0123\2\0\3\u0123"+ + "\1\0\1\u0123\3\0\15\u0123\5\0\1\u0123\3\0\1\u0123"+ + "\4\0\5\u0127\u010c\0\2\u0127\1\2\2\0\1\u0127\21\0"+ + "\5\u0127\4\0\2\u0127\3\0\5\u0127\u010c\0\1\u0127\1\u0134"+ + "\1\2\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127\3\0"+ + "\5\u0127\u010c\0\1\u0127\1\u0135\1\2\2\0\1\u0127\21\0"+ + "\3\u0127\1\u0136\1\u0127\4\0\2\u0127\3\0\5\u0127\u010c\0"+ + "\2\u0127\1\2\2\0\1\u0127\21\0\1\u0127\1\u0137\3\u0127"+ + "\4\0\2\u0127\3\0\5\u0127\u010c\0\2\u0127\1\2\2\0"+ + "\1\u0127\21\0\3\u0127\1\u0138\1\u0127\4\0\2\u0127\u0131\0"+ + "\1\u0139\u0119\0\1\u013a\u0135\0\1\u013b\30\0\1\u013c\u0133\0"+ + "\1\u013d\u0137\0\1\u013e\11\0\1\2\1\u0131\u0247\0\1\u013f"+ + "\u0135\0\1\u0140\43\0\5\u0127\u010c\0\2\u0127\1\u0141\2\0"+ + 
"\1\u0127\21\0\5\u0127\4\0\2\u0127\3\0\5\u0127\u010c\0"+ + "\2\u0127\1\u0142\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127"+ + "\3\0\5\u0127\u010c\0\2\u0127\1\2\2\0\1\u0127\21\0"+ + "\4\u0127\1\u0143\4\0\2\u0127\3\0\5\u0127\u010c\0\2\u0127"+ + "\1\2\2\0\1\u0127\21\0\2\u0127\1\u0144\2\u0127\4\0"+ + "\2\u0127\3\0\5\u0127\u010c\0\2\u0127\1\2\2\0\1\u0127"+ + "\21\0\4\u0127\1\u0145\4\0\2\u0127\u0132\0\1\u0146\u0119\0"+ + "\1\u0141\u0135\0\1\u0142\u014e\0\1\u0147\u0133\0\1\u0148\u0137\0"+ + "\1\u0149\u011c\0\1\u014a\u0135\0\1\u0123\42\0\5\u0127\u010c\0"+ + "\2\u0127\1\2\2\0\1\u0127\21\0\2\u0127\1\u014b\2\u0127"+ + "\4\0\2\u0127\3\0\5\u0127\u010c\0\1\u0127\1\u014c\1\2"+ + "\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127\3\0\5\u0127"+ + "\u010c\0\2\u0127\1\2\2\0\1\u0127\21\0\2\u0127\1\u014d"+ + "\2\u0127\4\0\2\u0127\u0133\0\1\u014e\u012f\0\1\u014f\u011d\0"+ + "\1\u0150\u014d\0\1\u0151\u011f\0\1\u0122\41\0\5\u0127\u010c\0"+ + "\2\u0127\1\353\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127"+ + "\3\0\5\u0127\u010c\0\2\u0127\1\355\2\0\1\u0127\21\0"+ + "\5\u0127\4\0\2\u0127\3\0\5\u0127\u010c\0\2\u0127\1\352"+ + "\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127\u0116\0\1\u0152"+ + "\u0135\0\1\353\u0135\0\1\355\u0135\0\1\352\37\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[16740]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + 
"Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\1\0\1\11\1\1\u010c\11\3\1\14\11\1\1\3\0"+ + "\1\11\1\0\1\1\33\0\2\11\17\0\1\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[338]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int 
yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public Unicode2BetacodeLex(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public Unicode2BetacodeLex(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 724) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? 
*/ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. 
+ */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. 
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 266: + { return "p"; + } + case 287: break; + case 102: + { return "*(w"; + } + case 288: break; + case 20: + { return "*(\\a"; + } + case 289: break; + case 21: + { return "*)/a"; + } + case 290: break; + case 181: + { return "*a/"; + } + case 291: break; + case 237: + { return "*a"; + } + case 292: break; + case 260: + { return "n"; + } + case 293: break; + case 89: + { return "*(u"; + } + case 294: break; + case 16: + { return "a(="; + } + case 295: break; + case 30: + { return "e(/"; + } + case 296: break; + case 195: + { return "i+\\"; + } + case 297: break; + case 222: + { return "w="; + } + case 298: break; + case 210: + { return "u+="; + } + case 299: break; + case 99: + { return "w)="; + } + case 300: break; + case 256: + { return "l"; + } + case 301: break; + case 205: + { return "u+\\"; + } + case 302: break; + case 23: + { return "*)=a"; + } + case 303: break; + case 225: + { return "*o/"; + } + case 304: break; + case 44: + { return "h(="; + } + case 305: break; + case 3: + { return "j"; + } + case 306: break; + case 103: + { return "*)\\w"; + } + case 307: break; + case 152: + { return "*(/|h"; + } + case 308: break; + case 165: + { return "*)\\|w"; + } + case 309: break; + case 248: + { return "h"; + } + case 310: break; + case 76: + { return "*(o"; + } + case 311: break; + case 159: + { return "w)/|"; + } + case 312: break; + case 178: + { return "*a^"; + } + case 313: break; + case 141: + { return "h)\\|"; + } + case 314: break; + case 106: + { return "*(/w"; + } + case 315: break; + case 275: + { return "f"; + } + case 316: break; + case 227: + { return "/"; + } + case 317: break; + case 91: + { return "*(/u"; + } + case 318: break; + case 242: + { return "d"; + } + case 319: break; + case 161: + { return "w)=|"; + } + case 320: break; + case 57: + { return "i)/"; + } + case 321: break; + case 154: + { return "*(=|h"; + } + case 322: break; + case 95: + { return "w)\\"; + } + case 323: break; + case 108: + 
{ return "*(=w"; + } + case 324: break; + case 116: + { return "i/"; + } + case 325: break; + case 238: + { return "b"; + } + case 326: break; + case 207: + { return "r)"; + } + case 327: break; + case 147: + { return "*)|h"; + } + case 328: break; + case 62: + { return "*(i"; + } + case 329: break; + case 230: + { return "+"; + } + case 330: break; + case 77: + { return "*)\\o"; + } + case 331: break; + case 166: + { return "*(\\|w"; + } + case 332: break; + case 71: + { return "o)\\"; + } + case 333: break; + case 92: + { return "*(=u"; + } + case 334: break; + case 232: + { return ")"; + } + case 335: break; + case 14: + { return "a(/"; + } + case 336: break; + case 122: + { return "w/"; + } + case 337: break; + case 206: + { return "u+/"; + } + case 338: break; + case 80: + { return "*(/o"; + } + case 339: break; + case 97: + { return "w)/"; + } + case 340: break; + case 123: + { return "a)|"; + } + case 341: break; + case 229: + { return "^"; + } + case 342: break; + case 32: + { return "*(e"; + } + case 343: break; + case 286: + { return "'"; + } + case 344: break; + case 42: + { return "h(/"; + } + case 345: break; + case 53: + { return "i)"; + } + case 346: break; + case 174: + { return "a|"; + } + case 347: break; + case 63: + { return "*)\\i"; + } + case 348: break; + case 139: + { return "h)|"; + } + case 349: break; + case 193: + { return "i^"; + } + case 350: break; + case 18: + { return "*(a"; + } + case 351: break; + case 74: + { return "o(/"; + } + case 352: break; + case 93: + { return "w)"; + } + case 353: break; + case 66: + { return "*(/i"; + } + case 354: break; + case 101: + { return "*)w"; + } + case 355: break; + case 7: + { return "!"; + } + case 356: break; + case 33: + { return "*)\\e"; + } + case 357: break; + case 15: + { return "a)="; + } + case 358: break; + case 29: + { return "e)/"; + } + case 359: break; + case 68: + { return "*(=i"; + } + case 360: break; + case 125: + { return "a)\\|"; + } + case 361: break; + case 36: + { return 
"*(/e"; + } + case 362: break; + case 115: + { return "i\\"; + } + case 363: break; + case 201: + { return "*i\\"; + } + case 364: break; + case 112: + { return "e/"; + } + case 365: break; + case 218: + { return "w/|"; + } + case 366: break; + case 176: + { return "a="; + } + case 367: break; + case 19: + { return "*)\\a"; + } + case 368: break; + case 43: + { return "h)="; + } + case 369: break; + case 133: + { return "*)\\|a"; + } + case 370: break; + case 270: + { return "s1"; + } + case 371: break; + case 247: + { return "*z"; + } + case 372: break; + case 204: + { return "u_"; + } + case 373: break; + case 143: + { return "h)/|"; + } + case 374: break; + case 22: + { return "*(/a"; + } + case 375: break; + case 82: + { return "u("; + } + case 376: break; + case 75: + { return "*)o"; + } + case 377: break; + case 223: + { return "w=|"; + } + case 378: break; + case 278: + { return "*x"; + } + case 379: break; + case 121: + { return "w\\"; + } + case 380: break; + case 200: + { return "*i_"; + } + case 381: break; + case 219: + { return "*w\\"; + } + case 382: break; + case 25: + { return "e)"; + } + case 383: break; + case 145: + { return "h)=|"; + } + case 384: break; + case 151: + { return "*)/|h"; + } + case 385: break; + case 24: + { return "*(=a"; + } + case 386: break; + case 4: + { return "*v"; + } + case 387: break; + case 192: + { return "*h|"; + } + case 388: break; + case 39: + { return "h)\\"; + } + case 389: break; + case 272: + { return "*t"; + } + case 390: break; + case 134: + { return "*(\\|a"; + } + case 391: break; + case 214: + { return "*u/"; + } + case 392: break; + case 61: + { return "*)i"; + } + case 393: break; + case 269: + { return "*r"; + } + case 394: break; + case 160: + { return "w(/|"; + } + case 395: break; + case 13: + { return "a)/"; + } + case 396: break; + case 153: + { return "*)=|h"; + } + case 397: break; + case 267: + { return "*p"; + } + case 398: break; + case 111: + { return "e\\"; + } + case 399: break; + case 88: 
+ { return "u(="; + } + case 400: break; + case 31: + { return "*)e"; + } + case 401: break; + case 188: + { return "*e\\"; + } + case 402: break; + case 110: + { return "a/"; + } + case 403: break; + case 162: + { return "w(=|"; + } + case 404: break; + case 41: + { return "h)/"; + } + case 405: break; + case 261: + { return "*n"; + } + case 406: break; + case 226: + { return "\\"; + } + case 407: break; + case 96: + { return "w(\\"; + } + case 408: break; + case 148: + { return "*(|h"; + } + case 409: break; + case 257: + { return "*l"; + } + case 410: break; + case 211: + { return "*u^"; + } + case 411: break; + case 198: + { return "i+="; + } + case 412: break; + case 279: + { return "y"; + } + case 413: break; + case 17: + { return "*)a"; + } + case 414: break; + case 73: + { return "o)/"; + } + case 415: break; + case 72: + { return "o(\\"; + } + case 416: break; + case 118: + { return "o/"; + } + case 417: break; + case 168: + { return "*(/|w"; + } + case 418: break; + case 2: + { return "*j"; + } + case 419: break; + case 281: + { return "w"; + } + case 420: break; + case 48: + { return "*(\\h"; + } + case 421: break; + case 49: + { return "*)/h"; + } + case 422: break; + case 9: + { return "a)"; + } + case 423: break; + case 216: + { return "w\\|"; + } + case 424: break; + case 249: + { return "*h"; + } + case 425: break; + case 273: + { return "u"; + } + case 426: break; + case 171: + { return "a^"; + } + case 427: break; + case 175: + { return "a/|"; + } + case 428: break; + case 285: + { return "<"; + } + case 429: break; + case 276: + { return "*f"; + } + case 430: break; + case 38: + { return "h("; + } + case 431: break; + case 283: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { return "s"; + } + case 432: break; + case 51: + { return "*)=h"; + } + case 433: break; + case 127: + { return "a)/|"; + } + case 434: break; + case 170: + { return "*(=|w"; + } + case 435: break; + case 69: + { return "o)"; + } + case 
436: break; + case 243: + { return "*d"; + } + case 437: break; + case 185: + { return "h/|"; + } + case 438: break; + case 250: + { return "q"; + } + case 439: break; + case 163: + { return "*)|w"; + } + case 440: break; + case 8: + { return ":"; + } + case 441: break; + case 177: + { return "a=|"; + } + case 442: break; + case 239: + { return "*b"; + } + case 443: break; + case 158: + { return "w(\\|"; + } + case 444: break; + case 109: + { return "a\\"; + } + case 445: break; + case 264: + { return "o"; + } + case 446: break; + case 129: + { return "a)=|"; + } + case 447: break; + case 86: + { return "u(/"; + } + case 448: break; + case 180: + { return "*a\\"; + } + case 449: break; + case 11: + { return "a)\\"; + } + case 450: break; + case 187: + { return "h=|"; + } + case 451: break; + case 258: + { return "m"; + } + case 452: break; + case 191: + { return "*h/"; + } + case 453: break; + case 113: + { return "h\\"; + } + case 454: break; + case 190: + { return "*h\\"; + } + case 455: break; + case 196: + { return "i+/"; + } + case 456: break; + case 254: + { return "k"; + } + case 457: break; + case 215: + { return "*(r"; + } + case 458: break; + case 27: + { return "e)\\"; + } + case 459: break; + case 117: + { return "o\\"; + } + case 460: break; + case 252: + { return "i"; + } + case 461: break; + case 224: + { return "*o\\"; + } + case 462: break; + case 144: + { return "h(/|"; + } + case 463: break; + case 179: + { return "*a_"; + } + case 464: break; + case 221: + { return "*w|"; + } + case 465: break; + case 240: + { return "g"; + } + case 466: break; + case 55: + { return "i)\\"; + } + case 467: break; + case 209: + { return "u="; + } + case 468: break; + case 87: + { return "u)="; + } + case 469: break; + case 244: + { return "e"; + } + case 470: break; + case 146: + { return "h(=|"; + } + case 471: break; + case 83: + { return "u)\\"; + } + case 472: break; + case 40: + { return "h(\\"; + } + case 473: break; + case 262: + { return "c"; + } + case 
474: break; + case 136: + { return "*(/|a"; + } + case 475: break; + case 236: + { return "a"; + } + case 476: break; + case 208: + { return "r("; + } + case 477: break; + case 46: + { return "*(h"; + } + case 478: break; + case 228: + { return "_"; + } + case 479: break; + case 183: + { return "h\\|"; + } + case 480: break; + case 233: + { return "("; + } + case 481: break; + case 138: + { return "*(=|a"; + } + case 482: break; + case 194: + { return "i_"; + } + case 483: break; + case 167: + { return "*)/|w"; + } + case 484: break; + case 54: + { return "i("; + } + case 485: break; + case 131: + { return "*)|a"; + } + case 486: break; + case 47: + { return "*)\\h"; + } + case 487: break; + case 184: + { return "h|"; + } + case 488: break; + case 149: + { return "*)\\|h"; + } + case 489: break; + case 94: + { return "w("; + } + case 490: break; + case 50: + { return "*(/h"; + } + case 491: break; + case 120: + { return "u/"; + } + case 492: break; + case 85: + { return "u)/"; + } + case 493: break; + case 169: + { return "*)=|w"; + } + case 494: break; + case 156: + { return "w(|"; + } + case 495: break; + case 202: + { return "*i/"; + } + case 496: break; + case 52: + { return "*(=h"; + } + case 497: break; + case 128: + { return "a(/|"; + } + case 498: break; + case 157: + { return "w)\\|"; + } + case 499: break; + case 60: + { return "i(="; + } + case 500: break; + case 164: + { return "*(|w"; + } + case 501: break; + case 150: + { return "*(\\|h"; + } + case 502: break; + case 220: + { return "*w/"; + } + case 503: break; + case 186: + { return "h="; + } + case 504: break; + case 81: + { return "u)"; + } + case 505: break; + case 130: + { return "a(=|"; + } + case 506: break; + case 280: + { return "*y"; + } + case 507: break; + case 203: + { return "u^"; + } + case 508: break; + case 104: + { return "*(\\w"; + } + case 509: break; + case 12: + { return "a(\\"; + } + case 510: break; + case 105: + { return "*)/w"; + } + case 511: break; + case 182: + { return 
"*a|"; + } + case 512: break; + case 282: + { return "*w"; + } + case 513: break; + case 199: + { return "*i^"; + } + case 514: break; + case 100: + { return "w(="; + } + case 515: break; + case 90: + { return "*(\\u"; + } + case 516: break; + case 26: + { return "e("; + } + case 517: break; + case 1: + { return yytext(); + } + case 518: break; + case 142: + { return "h(\\|"; + } + case 519: break; + case 274: + { return "*u"; + } + case 520: break; + case 28: + { return "e(\\"; + } + case 521: break; + case 107: + { return "*)=w"; + } + case 522: break; + case 173: + { return "a\\|"; + } + case 523: break; + case 6: + { return "*s"; + } + case 524: break; + case 45: + { return "*)h"; + } + case 525: break; + case 251: + { return "*q"; + } + case 526: break; + case 119: + { return "u\\"; + } + case 527: break; + case 56: + { return "i(\\"; + } + case 528: break; + case 213: + { return "*u\\"; + } + case 529: break; + case 284: + { return ">"; + } + case 530: break; + case 78: + { return "*(\\o"; + } + case 531: break; + case 189: + { return "*e/"; + } + case 532: break; + case 79: + { return "*)/o"; + } + case 533: break; + case 265: + { return "*o"; + } + case 534: break; + case 135: + { return "*)/|a"; + } + case 535: break; + case 84: + { return "u(\\"; + } + case 536: break; + case 235: + { return "|"; + } + case 537: break; + case 58: + { return "i(/"; + } + case 538: break; + case 259: + { return "*m"; + } + case 539: break; + case 212: + { return "*u_"; + } + case 540: break; + case 114: + { return "h/"; + } + case 541: break; + case 246: + { return "z"; + } + case 542: break; + case 255: + { return "*k"; + } + case 543: break; + case 277: + { return "x"; + } + case 544: break; + case 64: + { return "*(\\i"; + } + case 545: break; + case 65: + { return "*)/i"; + } + case 546: break; + case 137: + { return "*)=|a"; + } + case 547: break; + case 253: + { return "*i"; + } + case 548: break; + case 98: + { return "w(/"; + } + case 549: break; + case 5: + { 
return "v"; + } + case 550: break; + case 124: + { return "a(|"; + } + case 551: break; + case 234: + { return "?"; + } + case 552: break; + case 172: + { return "a_"; + } + case 553: break; + case 217: + { return "w|"; + } + case 554: break; + case 10: + { return "a("; + } + case 555: break; + case 241: + { return "*g"; + } + case 556: break; + case 155: + { return "w)|"; + } + case 557: break; + case 37: + { return "h)"; + } + case 558: break; + case 271: + { return "t"; + } + case 559: break; + case 231: + { return "="; + } + case 560: break; + case 67: + { return "*)=i"; + } + case 561: break; + case 34: + { return "*(\\e"; + } + case 562: break; + case 35: + { return "*)/e"; + } + case 563: break; + case 140: + { return "h(|"; + } + case 564: break; + case 132: + { return "*(|a"; + } + case 565: break; + case 245: + { return "*e"; + } + case 566: break; + case 268: + { return "r"; + } + case 567: break; + case 59: + { return "i)="; + } + case 568: break; + case 70: + { return "o("; + } + case 569: break; + case 126: + { return "a(\\|"; + } + case 570: break; + case 263: + { return "*c"; + } + case 571: break; + case 197: + { return "i="; + } + case 572: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2Buckwalter.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2Buckwalter.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,121 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +%% +%{ + /* + * Betacode to Unicode conversion + */ + +%} + +%class Unicode2BuckwalterLex +%public +%type java.lang.String +%unicode +%% + + +"<"[^>]+">" { return yytext(); } + +"\u0621" { return "'"; } /* Hamza */ +"\u0622" { return "|"; } /* ALEF 
WITH MADDA ABOVE from AraMorph */ +"\u0623" { return ">"; } /* Hamza */ +"\u0624" { return "&"; } /* Hamza */ +"\u0625" { return "<"; } /* Alif + HamzaBelow */ +"\u0626" { return "}"; } /* Ya + HamzaAbove */ +"\u0627" { return "A"; } /* Alif */ +"\u0628" { return "b"; } /* Ba */ +"\u0629" { return "p"; } /* TaMarbuta */ +"\u062A" { return "t"; } /* Ta */ +"\u062B" { return "v"; } /* Tha */ +"\u062C" { return "j"; } /* Jeem */ +"\u062D" { return "H"; } /* HHa */ +"\u062E" { return "x"; } /* Kha */ +"\u062F" { return "d"; } /* Dal */ +"\u0630" { return "*"; } /* Thal */ +"\u0631" { return "r"; } /* Ra */ +"\u0632" { return "z"; } /* Zain */ +"\u0633" { return "s"; } /* Seen */ +"\u0634" { return "$"; } /* Sheen */ +"\u0635" { return "S"; } /* Sad */ +"\u0636" { return "D"; } /* DDad */ +"\u0637" { return "T"; } /* TTa */ +"\u0638" { return "Z"; } /* DTha */ +"\u0639" { return "E"; } /* Ain */ +"\u063A" { return "g"; } /* Ghain */ + +"\u0640" { return "_"; } /* Tatweel */ +"\u0641" { return "f"; } /* Fa */ +"\u0642" { return "q"; } /* Qaf */ +"\u0643" { return "k"; } /* Kaf */ +"\u0644" { return "l"; } /* Lam */ +"\u0645" { return "m"; } /* Meem */ +"\u0646" { return "n"; } /* Noon */ +"\u0647" { return "h"; } /* Ha */ +"\u0648" { return "w"; } /* Waw */ +"\u0649" { return "Y"; } /* AlifMaksura */ +"\u064A" { return "y"; } /* Ya */ +"\u064B" { return "F"; } /* Fathatan */ +"\u064C" { return "N"; } /* Dammatan */ +"\u064D" { return "K"; } /* Kasratan */ +"\u064E" { return "a"; } /* Fatha */ +"\u064F" { return "u"; } /* Damma */ +"\u0650" { return "i"; } /* Kasra */ +"\u0651" { return "~"; } /* Shadda */ +"\u0652" { return "o"; } /* Sukun */ +"\u0653" { return "^"; } /* Maddah */ +"\u0654" { return "#"; } /* HamzaAbove */ + +"\u0670" { return "`"; } /* AlifKhanjareeya */ +"\u0671" { return "{"; } /* Alif + HamzatWasl */ + +"\u067E" { return "P"; } /* PEH from AraMorph */ +"\u0686" { return "J"; } /* TCHEH from AraMorph */ +"\u06A4" { return "V"; } /* VEH from AraMorph 
*/ +"\u06AF" { return "G"; } /* GAF from AraMorph */ +"\u0698" { return "R"; } /* JEH from AraMorph */ +"\u061F" { return "?"; } /* QUESTION MARK from AraMorph */ + +"\u06DC" { return ":"; } /* SmallHighSeen */ +"\u06DF" { return "@"; } /* SmallHighRoundedZero */ + +"\u06E2" { return "["; } /* SmallHighMeemIsolatedForm */ +"\u06E3" { return ";"; } /* SmallLowSeen */ +"\u06E5" { return ","; } /* SmallWaw */ +"\u06E6" { return "."; } /* SmallYa */ +"\u06E8" { return "!"; } /* SmallHighNoon */ +"\u06EA" { return "-"; } /* EmptyCentreLowStop */ +"\u06EB" { return "+"; } /* EmptyCentreHighStop */ +"\u06EC" { return "%"; } /* RoundedHighStopWithFilledCentre */ +"\u06ED" { return "]"; } /* SmallLowMeem */ + +[\&_]"vert;" { return "|"; } +[\&_]"lpar;" { return "("; } +[\&_]"rpar;" { return ")"; } +[\_\&]"lt;" { return "<"; } +[\_\&]"gt;" { return ">"; } +"'" { return "'"; } + +"&"[a-zA-Z]+";" { return yytext(); } + +. { return yytext(); } +\n { return yytext(); } + +/* make problemes */ +/* "\u06E0" { return "\\""; } SmallHighUprightRectangularZero */ + + +/* double entries */ +/* "\u060C" { return ","; } COMMA from AraMorph */ +/* "\u061B" { return ";"; } SEMICOLON from AraMorph */ + +/* not in buckwalter contained */ +/* \u0679 : ARABIC LETTER TTEH */ +/* \u0688 : ARABIC LETTER DDAL */ +/* \u06A9 : ARABIC LETTER KEHEH */ +/* \u0691 : ARABIC LETTER RREH */ +/* \u06BA : ARABIC LETTER NOON GHUNNA */ +/* \u06BE : ARABIC LETTER HEH DOACHASHMEE */ +/* \u06C1 : ARABIC LETTER HEH GOAL */ +/* \u06D2 : ARABIC LETTER YEH BARREE */ + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2BuckwalterLex.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2BuckwalterLex.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,882 @@ +/* The following code was generated by JFlex 1.4.3 on 14.12.10 17:12 */ + +package 
de.mpg.mpiwg.berlin.mpdl.lt.text.transcode; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 14.12.10 17:12 from the specification file + * /Users/jwillenborg/test/jflex/Unicode2Buckwalter.lex + */ +public class Unicode2BuckwalterLex { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int YYINITIAL = 0; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\0\30\0\1\120\2\0\1\117\11\0\1\121\2\0\1\122"+ + "\5\0\1\123\1\0\1\112\1\1\1\0\1\2\2\0\32\124\4\0"+ + "\1\105\1\0\1\115\3\124\1\107\1\124\1\116\4\124\1\113\3\124"+ + "\1\114\1\124\1\110\1\124\1\111\1\124\1\106\4\124\u05a4\0\1\71"+ + "\1\0\1\3\1\4\1\5\1\6\1\7\1\10\1\11\1\12\1\13"+ + "\1\14\1\15\1\16\1\17\1\20\1\21\1\22\1\23\1\24\1\25"+ + "\1\26\1\27\1\30\1\31\1\32\1\33\1\34\5\0\1\35\1\36"+ + "\1\37\1\40\1\41\1\42\1\43\1\44\1\45\1\46\1\47\1\50"+ + "\1\51\1\52\1\53\1\54\1\55\1\56\1\57\1\60\1\61\33\0"+ + "\1\62\1\63\14\0\1\64\7\0\1\65\21\0\1\70\13\0\1\66"+ + "\12\0\1\67\54\0\1\72\2\0\1\73\2\0\1\74\1\75\1\0"+ + "\1\76\1\77\1\0\1\100\1\0\1\101\1\102\1\103\1\104\uf912\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. 
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\1\0\2\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+ + "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+ + "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37"+ + "\1\40\1\41\1\42\1\43\1\44\1\45\1\46\1\47"+ + "\1\50\1\51\1\52\1\53\1\54\1\55\1\56\1\57"+ + "\1\60\1\61\1\62\1\63\1\64\1\65\1\66\1\67"+ + "\1\70\1\71\1\72\1\73\1\74\1\75\1\76\1\77"+ + "\1\100\1\101\1\102\1\103\2\1\30\0\1\104\1\0"+ + "\1\105\13\0\1\106\1\107"; + + private static int [] zzUnpackAction() { + int [] result = new int[111]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\125\0\252\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\377\0\u0154\0\u01a9"+ + "\0\u01fe\0\u0253\0\u02a8\0\u02fd\0\u0352\0\u03a7\0\u03fc\0\u0451"+ + "\0\u04a6\0\u04fb\0\u0550\0\u05a5\0\u05fa\0\u064f\0\u06a4\0\u06f9"+ + "\0\u074e\0\u07a3\0\u07f8\0\u084d\0\u08a2\0\u08f7\0\u094c\0\125"+ + 
"\0\u09a1\0\125\0\u09f6\0\u0a4b\0\u0aa0\0\u0af5\0\u0b4a\0\u0b9f"+ + "\0\u0bf4\0\u0c49\0\u0c9e\0\u0cf3\0\u0d48\0\125\0\125"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[111]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\2\1\3\1\2\1\4\1\5\1\6\1\7\1\10"+ + "\1\11\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+ + "\1\21\1\22\1\23\1\24\1\25\1\26\1\27\1\30"+ + "\1\31\1\32\1\33\1\34\1\35\1\36\1\37\1\40"+ + "\1\41\1\42\1\43\1\44\1\45\1\46\1\47\1\50"+ + "\1\51\1\52\1\53\1\54\1\55\1\56\1\57\1\60"+ + "\1\61\1\62\1\63\1\64\1\65\1\66\1\67\1\70"+ + "\1\71\1\72\1\73\1\74\1\75\1\76\1\77\1\100"+ + "\1\101\1\102\1\103\1\104\1\105\1\106\11\2\1\107"+ + "\5\2\125\0\2\110\1\0\122\110\106\0\1\111\1\0"+ + "\1\112\2\0\1\113\2\0\1\114\114\0\1\115\1\116"+ + "\1\117\1\116\1\0\1\120\2\116\1\121\1\0\1\122"+ + "\3\0\1\116\2\110\1\2\122\110\107\0\1\123\131\0"+ + "\1\124\121\0\1\125\2\0\1\126\121\0\1\127\121\0"+ + "\1\116\1\130\2\116\1\2\4\116\5\0\1\116\106\0"+ + "\4\116\1\2\4\116\5\0\1\116\106\0\4\116\1\2"+ + "\1\116\1\131\2\116\5\0\1\116\106\0\3\116\1\132"+ + "\1\2\1\116\1\133\2\116\5\0\1\116\106\0\3\116"+ + "\1\134\1\2\4\116\5\0\1\116\121\0\1\135\113\0"+ + "\1\136\131\0\1\137\121\0\1\140\127\0\1\141\121\0"+ + "\1\142\120\0\2\116\1\143\1\116\1\2\4\116\5\0"+ + "\1\116\106\0\4\116\1\2\2\116\1\144\1\116\5\0"+ + "\1\116\106\0\4\116\1\140\4\116\5\0\1\116\106\0"+ + "\4\116\1\2\2\116\1\145\1\116\5\0\1\116\106\0"+ + 
"\4\116\1\142\4\116\5\0\1\116\122\0\1\146\113\0"+ + "\1\147\123\0\1\150\124\0\1\151\122\0\3\116\1\152"+ + "\1\2\4\116\5\0\1\116\106\0\2\116\1\153\1\116"+ + "\1\2\4\116\5\0\1\116\106\0\2\116\1\154\1\116"+ + "\1\2\4\116\5\0\1\116\123\0\1\155\113\0\1\5"+ + "\124\0\1\156\124\0\1\157\120\0\4\116\1\5\4\116"+ + "\5\0\1\116\106\0\4\116\1\156\4\116\5\0\1\116"+ + "\106\0\4\116\1\157\4\116\5\0\1\116\112\0\1\4"+ + "\12\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[3485]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\1\0\1\11\1\1\102\11\2\1\30\0\1\11\1\0"+ + "\1\11\13\0\2\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[111]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) 
{ + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + /* + * Betacode to Unicode conversion + */ + + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public Unicode2BuckwalterLex(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. 
+ */ + public Unicode2BuckwalterLex(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 240) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. 
+ */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. 
+ * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. 
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 23: + { return "D"; + } + case 72: break; + case 17: + { return "*"; + } + case 73: break; + case 46: + { return "o"; + } + case 74: break; + case 60: + { return ";"; + } + case 75: break; + case 63: + { return "!"; + } + case 76: break; + case 29: + { return "f"; + } + case 77: break; + case 36: + { return "w"; + } + case 78: break; + case 67: + { return "]"; + } + case 79: break; + case 70: + { return ")"; + } + case 80: break; + case 69: + { return ">"; + } + case 81: break; + case 34: + { return "n"; + } + case 82: break; + case 24: + { return "T"; + } + case 83: break; + case 57: + { return ":"; + } + case 84: break; + case 41: + { return "K"; + } + case 85: break; + case 12: + { return "v"; + } + case 86: break; + case 71: + { return "("; + } + case 87: break; + case 33: + { return "m"; + } + case 88: break; + case 22: + { return "S"; + } + case 89: break; + case 45: + { return "~"; + } + case 90: break; + case 16: + { return "d"; + } + case 91: break; + case 52: + { return "J"; + } + case 92: break; + case 43: + { return "u"; + } + case 93: break; + case 59: + { return "["; + } + case 94: break; + case 8: + { return "A"; + } + case 95: break; + case 2: + { return "'"; + } + case 96: break; + case 32: + { return "l"; + } + case 97: break; + case 55: + { return "R"; + } + case 98: break; + case 7: + { return "}"; + } + case 99: break; + case 11: + { return "t"; + } + case 100: break; + case 25: + { return "Z"; + } + case 101: break; + case 58: + { return "@"; + } + case 102: break; + case 5: + { return "&"; + } + case 103: break; + case 31: + { return "k"; + } + case 104: break; + case 3: + { return "|"; + } + case 105: break; + case 9: + { return "b"; + } + case 106: break; + case 14: + { return "H"; + } + case 107: break; + case 62: + { return "."; + } + case 108: break; + case 20: + { return "s"; + } + case 109: break; + case 37: + { return "Y"; + } + case 110: break; + case 56: + { return "?"; + } + case 111: break; 
+ case 66: + { return "%"; + } + case 112: break; + case 13: + { return "j"; + } + case 113: break; + case 51: + { return "P"; + } + case 114: break; + case 50: + { return "{"; + } + case 115: break; + case 1: + { return yytext(); + } + case 116: break; + case 42: + { return "a"; + } + case 117: break; + case 54: + { return "G"; + } + case 118: break; + case 64: + { return "-"; + } + case 119: break; + case 18: + { return "r"; + } + case 120: break; + case 4: + { return ">"; + } + case 121: break; + case 21: + { return "$"; + } + case 122: break; + case 44: + { return "i"; + } + case 123: break; + case 19: + { return "z"; + } + case 124: break; + case 68: + { return "<"; + } + case 125: break; + case 49: + { return "`"; + } + case 126: break; + case 39: + { return "F"; + } + case 127: break; + case 61: + { return ","; + } + case 128: break; + case 30: + { return "q"; + } + case 129: break; + case 48: + { return "#"; + } + case 130: break; + case 35: + { return "h"; + } + case 131: break; + case 40: + { return "N"; + } + case 132: break; + case 38: + { return "y"; + } + case 133: break; + case 28: + { return "_"; + } + case 134: break; + case 26: + { return "E"; + } + case 135: break; + case 65: + { return "+"; + } + case 136: break; + case 10: + { return "p"; + } + case 137: break; + case 53: + { return "V"; + } + case 138: break; + case 6: + { return "<"; + } + case 139: break; + case 27: + { return "g"; + } + case 140: break; + case 15: + { return "x"; + } + case 141: break; + case 47: + { return "^"; + } + case 142: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lucene/util/LuceneUtil.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lucene/util/LuceneUtil.java Wed 
Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,27 @@ +package de.mpg.mpiwg.berlin.mpdl.lucene.util; + +import java.util.ArrayList; + +public class LuceneUtil { + private static LuceneUtil instance; + + public static LuceneUtil getInstance() { + if (instance == null) { + instance = new LuceneUtil(); + } + return instance; + } + + public ArrayList getVariantsFromLuceneQuery(String queryString) { + ArrayList variants = new ArrayList(); + String[] variantTokens = queryString.split(" "); // TODO throw the phrases away (e.g.: "bla bla bla") + for (int i = 0; i < variantTokens.length; i++) { + String token = variantTokens[i]; + if (! (token.contains("*") || token.contains("?") || token.contains("~") || token.contains("-") || token.contains("+") || token.contains("^") || token.contains("OR") || token.contains("AND") || token.contains("NOT"))) { + variants.add(token); + } + } + return variants; + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/test/TestLocal.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/test/TestLocal.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,131 @@ +package de.mpg.mpiwg.berlin.mpdl.test; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.StringReader; +import java.net.URL; +import java.util.ArrayList; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token; +import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer; +import 
de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer; + +public class TestLocal { + private LexHandler lexHandler; + + public static void main(String[] args) throws ApplicationException { + try { + TestLocal test = new TestLocal(); + test.init(); + // test.testCalls(); + // test.tokenizeString(); + // test.tokenizeXmlFragment(); + test.getLexEntriesByLexiconBeginningWith("ls", "a"); + // test.end(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + private void init() throws ApplicationException { + lexHandler = LexHandler.getInstance(); + } + + private void end() throws ApplicationException { + lexHandler.end(); + } + + private ArrayList tokenizeString() throws ApplicationException { + ArrayList tokens = new ArrayList(); + try { + StringReader reader = new StringReader("edo philoſophi"); + // StringReader reader = new StringReader("扞盗則李兗州"); + Tokenizer tokenizer = new Tokenizer(reader); + tokenizer.setLanguage("lat"); + // tokenizer.setLanguage("zho"); + String[] normFunctions = new String[1]; + normFunctions[0] = "norm"; + tokenizer.setNormFunctions(normFunctions); + tokens = tokenizer.getTokens(); + tokenizer.end(); + tokenizer.close(); + } catch (IOException e) { + throw new ApplicationException(e); + } + return tokens; + } + + private String tokenizeXmlFragment() throws ApplicationException { + String result = null; + try { + String xmlFragment = new String(FileUtils.readFileToByteArray(new File("/Users/jwillenborg/tmp/testFragment2.xml")), "utf-8"); + String srcUrlStr = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Benedetti_1585.xml&mode=pureXml&pn=13"; + URL srcUrl = new URL(srcUrlStr); + InputStream inputStream = srcUrl.openStream(); + BufferedInputStream in = new BufferedInputStream(inputStream); + xmlFragment = IOUtils.toString(in, "utf-8"); + in.close(); + + XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(xmlFragment)); + xmlTokenizer.setLanguage("lat"); + String[] normFunctions = new 
String[1]; + normFunctions[0] = "norm"; + String[] stopElements = new String[1]; + stopElements[0] = "var"; + xmlTokenizer.setNormFunctions(normFunctions); + xmlTokenizer.setStopElements(stopElements); + result = xmlTokenizer.tokenize(); + System.out.println(result); + } catch (Exception e) { + throw new ApplicationException(e); + } + return result; + } + + private void testCalls() throws ApplicationException { + String query = "sum quibus"; + String language = "lat"; + // String query = "ἱκανῶσ"; + // String language = "el"; + String inputType = "form"; + String outputType = null; + String outputFormat = "html"; + String dictionaryName = null; + String normalization = "norm"; + getLexEntries(query, language, inputType, outputType, outputFormat, dictionaryName, normalization); + } + + private void getLexEntries(String query, String language, String inputType , String outputType, String outputFormat, String dictionaryName, String normalization) throws ApplicationException { + ArrayList lemmas = lexHandler.getLemmas(query, inputType, language, normalization); + ArrayList dictionaries = lexHandler.getLexEntries(lemmas, language, dictionaryName); + // String result = lexHandler.getLexEntries(query, language, inputType, outputType, outputFormat, dictionaryName, normalization); + String result = ""; + result = result + ""; + for (int i=0; i"; + System.out.println(result); + } + + private void getLexEntriesByLexiconBeginningWith(String lexiconName, String prefix) throws ApplicationException { + ArrayList lexEntries = lexHandler.getLexEntriesByLexiconBeginningWith(lexiconName, prefix, 1); + System.out.println(lexEntries); + } + + private void getLexEntriesBeginningWith(String language, String prefix) throws ApplicationException { + ArrayList lexEntries = lexHandler.getLexEntriesBeginningWith(language, prefix, 1); + System.out.println(lexEntries); + } +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 
software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/util/StringUtils.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/util/StringUtils.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,491 @@ +package de.mpg.mpiwg.berlin.mpdl.util; + +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; +import java.text.CharacterIterator; +import java.text.StringCharacterIterator; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class StringUtils { + + /** + * Puts a zwsp between two ideographic characters (e.g. in CJK Unified Ideographs) + * @param str + * @return + */ + public static String zwsp(String str) { + // based on Unicode 3.2 + String ideographic = "[\u3300-\u33ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]"; + String regex = "(" + ideographic + ")(" + ideographic + ")"; + String retStr = str.replaceAll(regex, "$1\u200b$2"); + retStr = retStr.replaceAll(regex, "$1\u200b$2"); + return retStr; + } + + + public static String deleteSpecialXmlEntities(String inputStr) { + inputStr = inputStr.replaceAll("<", ""); + inputStr = inputStr.replaceAll(">", ""); + inputStr = inputStr.replaceAll("&lt;", ""); + inputStr = inputStr.replaceAll("&gt;", ""); + return inputStr; + } + + public static String resolveXmlEntities(String inputStr) { + inputStr = inputStr.replaceAll("&", "&"); + inputStr = inputStr.replaceAll("<", "<"); + inputStr = inputStr.replaceAll(">", ">"); + inputStr = inputStr.replaceAll(""", "\""); + inputStr = inputStr.replaceAll("'", "'"); + return inputStr; + } + + public static String deresolveXmlEntities(String inputStr) { + StringBuffer buf = new StringBuffer(); + for (int i = 0; i < inputStr.length(); i++) { + char c = inputStr.charAt(i); + String replace = new String(); + switch (c) { + case '&': replace = "&"; break; + case '<': replace = "<"; break; + case '>': replace = ">"; break; + case '"': replace = """; break; + // case '\'': 
replace = "'"; break; // causes problems in DictionarizerContentHandler + default: replace += c; break; + } + buf.append(replace); + } + return buf.toString(); + } + + /** + * Escape characters for text appearing in HTML markup. + * + *

        This method exists as a defence against Cross Site Scripting (XSS) hacks. + * The idea is to neutralize control characters commonly used by scripts, such that + * they will not be executed by the browser. This is done by replacing the control + * characters with their escaped equivalents. + * See {@link hirondelle.web4j.security.SafeText} as well. + * + *

        The following characters are replaced with corresponding + * HTML character entities : + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
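        Character Replacement
        < &lt;
        > &gt;
        & &amp;
        " &quot;
        \t &#009;
        ! &#033;
        # &#035;
        $ &#036;
        % &#037;
        ' &#039;
        ( &#040;
        ) &#041;
        * &#042;
        + &#043;
        , &#044;
        - &#045;
        . &#046;
        / &#047;
        : &#058;
        ; &#059;
        = &#061;
        ? &#063;
        @ &#064;
        [ &#091;
        \ &#092;
        ] &#093;
        ^ &#094;
        _ &#095;
        ` &#096;
        { &#123;
        | &#124;
        } &#125;
        ~ &#126;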
        + * + *

        Note that JSTL's {@code } escapes only the first + * five of the above characters. + */ + public static String forHTML(String aText){ + final StringBuilder result = new StringBuilder(); + final StringCharacterIterator iterator = new StringCharacterIterator(aText); + char character = iterator.current(); + while (character != CharacterIterator.DONE ){ + if (character == '<') { + result.append("<"); + } + else if (character == '>') { + result.append(">"); + } + else if (character == '&') { + result.append("&"); + } + else if (character == '\"') { + result.append("""); + } + else if (character == '\t') { + addCharEntity(9, result); + } + else if (character == '!') { + addCharEntity(33, result); + } + else if (character == '#') { + addCharEntity(35, result); + } + else if (character == '$') { + addCharEntity(36, result); + } + else if (character == '%') { + addCharEntity(37, result); + } + else if (character == '\'') { + addCharEntity(39, result); + } + else if (character == '(') { + addCharEntity(40, result); + } + else if (character == ')') { + addCharEntity(41, result); + } + else if (character == '*') { + addCharEntity(42, result); + } + else if (character == '+') { + addCharEntity(43, result); + } + else if (character == ',') { + addCharEntity(44, result); + } + else if (character == '-') { + addCharEntity(45, result); + } + else if (character == '.') { + addCharEntity(46, result); + } + else if (character == '/') { + addCharEntity(47, result); + } + else if (character == ':') { + addCharEntity(58, result); + } + else if (character == ';') { + addCharEntity(59, result); + } + else if (character == '=') { + addCharEntity(61, result); + } + else if (character == '?') { + addCharEntity(63, result); + } + else if (character == '@') { + addCharEntity(64, result); + } + else if (character == '[') { + addCharEntity(91, result); + } + else if (character == '\\') { + addCharEntity(92, result); + } + else if (character == ']') { + addCharEntity(93, result); + } + 
else if (character == '^') { + addCharEntity(94, result); + } + else if (character == '_') { + addCharEntity(95, result); + } + else if (character == '`') { + addCharEntity(96, result); + } + else if (character == '{') { + addCharEntity(123, result); + } + else if (character == '|') { + addCharEntity(124, result); + } + else if (character == '}') { + addCharEntity(125, result); + } + else if (character == '~') { + addCharEntity(126, result); + } + else { + //the char is not a special one + //add it to the result as is + result.append(character); + } + character = iterator.next(); + } + return result.toString(); + } + + + /** + * Escape all ampersand characters in a URL. + * + *

        Replaces all '&' characters with '&amp;'. + * + *

        An ampersand character may appear in the query string of a URL. + * The ampersand character is indeed valid in a URL. + * However, URLs usually appear as an HREF attribute, and + * such attributes have the additional constraint that ampersands + * must be escaped. + * + *

        The JSTL tag does indeed perform proper URL encoding of + * query parameters. But it does not, in general, produce text which + * is valid as an HREF attribute, simply because it does + * not escape the ampersand character. This is a nuisance when + * multiple query parameters appear in the URL, since it requires a little + * extra work. + */ + public static String forHrefAmpersand(String aURL){ + return aURL.replace("&", "&"); + } + + /** + * Synonym for URLEncoder.encode(String, "UTF-8"). + * + *

        Used to ensure that HTTP query strings are in proper form, by escaping + * special characters such as spaces. + * + *

        It is important to note that if a query string appears in an HREF + * attribute, then there are two issues - ensuring the query string is valid HTTP + * (it is URL-encoded), and ensuring it is valid HTML (ensuring the + * ampersand is escaped). + */ + public static String forURL(String aURLFragment){ + String result = null; + try { + result = URLEncoder.encode(aURLFragment, "UTF-8"); + } + catch (UnsupportedEncodingException ex){ + throw new RuntimeException("UTF-8 not supported", ex); + } + return result; + } + + /** + * Escape characters for text appearing as XML data, between tags. + * + *

        The following characters are replaced with corresponding character entities : + * + * + * + * + * + * + * + *
        Character Encoding
        < &lt;
        > &gt;
        & &amp;
        " &quot;
        ' &#039;
        + * + *

        Note that JSTL's {@code } escapes the exact same set of + * characters as this method. That is, {@code } + * is good for escaping to produce valid XML, but not for producing safe + * HTML. + */ + public static String forXML(String aText){ + final StringBuilder result = new StringBuilder(); + final StringCharacterIterator iterator = new StringCharacterIterator(aText); + char character = iterator.current(); + while (character != CharacterIterator.DONE ){ + if (character == '<') { + result.append("<"); + } + else if (character == '>') { + result.append(">"); + } + else if (character == '\"') { + result.append("""); + } + else if (character == '\'') { + result.append("'"); + } + else if (character == '&') { + result.append("&"); + } + else { + //the char is not a special one + //add it to the result as is + result.append(character); + } + character = iterator.next(); + } + return result.toString(); + } + + /** + * Return aText with all '<' and '>' characters + * replaced by their escaped equivalents. + */ + public static String toDisableTags(String aText){ + final StringBuilder result = new StringBuilder(); + final StringCharacterIterator iterator = new StringCharacterIterator(aText); + char character = iterator.current(); + while (character != CharacterIterator.DONE ){ + if (character == '<') { + result.append("<"); + } + else if (character == '>') { + result.append(">"); + } + else { + //the char is not a special one + //add it to the result as is + result.append(character); + } + character = iterator.next(); + } + return result.toString(); + } + + + /** + * Replace characters having special meaning in regular expressions + * with their escaped equivalents, preceded by a '\' character. + * + *

        The escaped characters include : + *

          + *
        • . + *
        • \ + *
        • ?, * , and + + *
        • & + *
        • : + *
        • { and } + *
        • [ and ] + *
        • ( and ) + *
        • ^ and $ + *
        + */ + public static String forRegex(String aRegexFragment){ + final StringBuilder result = new StringBuilder(); + + final StringCharacterIterator iterator = + new StringCharacterIterator(aRegexFragment) + ; + char character = iterator.current(); + while (character != CharacterIterator.DONE ){ + /* + * All literals need to have backslashes doubled. + */ + if (character == '.') { + result.append("\\."); + } + else if (character == '\\') { + result.append("\\\\"); + } + else if (character == '?') { + result.append("\\?"); + } + else if (character == '*') { + result.append("\\*"); + } + else if (character == '+') { + result.append("\\+"); + } + else if (character == '&') { + result.append("\\&"); + } + else if (character == ':') { + result.append("\\:"); + } + else if (character == '{') { + result.append("\\{"); + } + else if (character == '}') { + result.append("\\}"); + } + else if (character == '[') { + result.append("\\["); + } + else if (character == ']') { + result.append("\\]"); + } + else if (character == '(') { + result.append("\\("); + } + else if (character == ')') { + result.append("\\)"); + } + else if (character == '^') { + result.append("\\^"); + } + else if (character == '$') { + result.append("\\$"); + } + else { + //the char is not a special one + //add it to the result as is + result.append(character); + } + character = iterator.next(); + } + return result.toString(); + } + + /** + * Escape '$' and '\' characters in replacement strings. + * + *

        Synonym for Matcher.quoteReplacement(String). + * + *

        The following methods use replacement strings which treat + * '$' and '\' as special characters: + *

          + *
        • String.replaceAll(String, String) + *
        • String.replaceFirst(String, String) + *
        • Matcher.appendReplacement(StringBuffer, String) + *
        + * + *

        If replacement text can contain arbitrary characters, then you + * will usually need to escape that text, to ensure special characters + * are interpreted literally. + */ + public static String forReplacementString(String aInput){ + return Matcher.quoteReplacement(aInput); + } + + /** + * Disable all ", Pattern.CASE_INSENSITIVE + ); + + private static void addCharEntity(Integer aIdx, StringBuilder aBuilder){ + String padding = ""; + if( aIdx <= 9 ){ + padding = "00"; + } + else if( aIdx <= 99 ){ + padding = "0"; + } + else { + //no prefix + } + String number = padding + aIdx.toString(); + aBuilder.append("&#" + number + ";"); + } + } diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/util/Util.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/util/Util.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,32 @@ +package de.mpg.mpiwg.berlin.mpdl.util; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.Date; +import java.util.Properties; + +public class Util { + + public Properties getProperties(String fullFileName) { + Properties props = new Properties(); + try { + File file = new File(fullFileName); + FileInputStream in = new FileInputStream(file); + props.load(in); + } catch (IOException e) { + e.printStackTrace(); + } + return props; + } + + public Double getSecondWithMillisecondsBetween(Date begin, Date end) { + long beginMS = begin.getTime(); + long endMS = end.getTime(); + long elapsedSeconds = (endMS - beginMS) / 1000; + long elapsedMilliSecondsAfterSeconds1 = (endMS - beginMS) - (elapsedSeconds * 1000); + Double seconds = new Double(elapsedSeconds + "." 
+ elapsedMilliSecondsAfterSeconds1); + return seconds; + } + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.classpath --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.classpath Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.externalToolBuilders/New_Builder.launch --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.externalToolBuilders/New_Builder.launch Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.project --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.project Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,46 @@ + + + mpiwg-mpdl-xml-web + + + + + + org.eclipse.wst.jsdt.core.javascriptValidator + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.wst.common.project.facet.core.builder + + + + + org.eclipse.wst.validation.validationbuilder + + + + + org.eclipse.ui.externaltools.ExternalToolBuilder + full,incremental, + + + LaunchConfigHandle + <project>/.externalToolBuilders/New_Builder.launch + + + + + + org.eclipse.jem.workbench.JavaEMFNature + org.eclipse.wst.common.modulecore.ModuleCoreNature + org.eclipse.wst.common.project.facet.core.nature + org.eclipse.jdt.core.javanature + org.eclipse.wst.jsdt.core.jsNature + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.settings/.jsdtscope --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.settings/.jsdtscope Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff -r 
dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.jdt.core.prefs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.jdt.core.prefs Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,12 @@ +#Mon Sep 12 15:41:45 CEST 2011 +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 +org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve +org.eclipse.jdt.core.compiler.compliance=1.6 +org.eclipse.jdt.core.compiler.debug.lineNumber=generate +org.eclipse.jdt.core.compiler.debug.localVariable=generate +org.eclipse.jdt.core.compiler.debug.sourceFile=generate +org.eclipse.jdt.core.compiler.problem.assertIdentifier=error +org.eclipse.jdt.core.compiler.problem.enumIdentifier=error +org.eclipse.jdt.core.compiler.source=1.6 diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.common.component --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.common.component Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,10 @@ + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.common.project.facet.core.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.common.project.facet.core.xml Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,10 @@ + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.jsdt.ui.superType.container --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.jsdt.ui.superType.container Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +org.eclipse.wst.jsdt.launching.baseBrowserLibrary \ No newline at end of file 
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.jsdt.ui.superType.name --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.jsdt.ui.superType.name Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +Window \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.ws.service.policy.prefs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.ws.service.policy.prefs Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,3 @@ +#Mon Sep 12 15:41:45 CEST 2011 +eclipse.preferences.version=1 +org.eclipse.wst.ws.service.policy.projectEnabled=false diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/META-INF/MANIFEST.MF --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/META-INF/MANIFEST.MF Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,3 @@ +Manifest-Version: 1.0 +Class-Path: + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/classes/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/classes/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/classes/constants.properties --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/classes/constants.properties Wed Nov 09 
15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +docDir=/usr/local/tomcat-mpdl/mpdl-data/xml/documents \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/commons-io-2.0.1.jar Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/commons-io-2.0.1.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/mpiwg-mpdl-xml-web.jar Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/mpiwg-mpdl-xml-web.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/mpiwg-mpdl-xml.jar Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/mpiwg-mpdl-xml.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/saxon.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/saxon.txt Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,3 @@ +Saxon: + +Release 9.1.0.5 (free version): releases < 9.1.0.7 support saxon extension functions diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/saxon9-s9api.jar Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/saxon9-s9api.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/saxon9.jar Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/saxon9.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/web.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 
+++ b/software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/web.xml Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,40 @@ + + + mpiwg-mpdl-xml-web + + index.html + + + Transform + Transform + Transform + de.mpg.mpiwg.berlin.mpdl.servlets.xml.Transform + + + Transform + /transform/Transform + + + GetFragment + GetFragment + GetFragment + de.mpg.mpiwg.berlin.mpdl.servlets.xml.GetFragment + + + GetFragment + /transform/GetFragment + + + XQuery + XQuery + XQuery + de.mpg.mpiwg.berlin.mpdl.servlets.xml.XQuery + + + XQuery + /xquery/XQuery + + + de.mpg.mpiwg.berlin.mpdl.servlets.xml.MpiwgMpdlXmlWebServletContextListener + + \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/index.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/index.html Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,130 @@ + + + +Max Planck Institute for the History of Science - Mpdl: XML Services + + +

        Max Planck Institute for the History of Science - Mpdl: XML Services

        +

        Available Services

        + +
          +
        • Url: /mpiwg-mpdl-xml-web/transform/Transform +
            +
          • Request parameters +
              +
            • srcUrl (required) +
                +
              • url of the Xml source document
              • +
              +
            • +
            • xslUrl (required) +
                +
              • url of the Xsl document which does the transformation of the Xml document
              • +
              +
            • +
            • parameters (optional) +
                +
              • parameters separated with blanks (e.g. "yourParam1=yourValue1 yourParam2=yourValue2")
              • +
              • default: no parameters
              • +
              +
            • +
            • outputProperties (optional) +
                +
              • output properties separated with blanks (e.g. "encoding=utf-8 indent=yes") +
                  +
                • "method=xhtml"
                • +
                • "indent=yes"
                • +
                • "media-type=text/html"
                • +
                • "encoding=utf-8"
                • +
                • default: "method=xml indent=yes media-type=text/xml encoding=utf-8"
                • +
                +
              • +
              +
            • +
            +
          • +
          • Response output + +
          • +
          +
        • + +
        • Url: /mpiwg-mpdl-xml-web/transform/GetFragment +
            +
          • Request parameters +
              +
            • docId (required) +
                +
              • document identifier of the Xml source document (e.g. "/tei/la/Test_1789.xml")
              • +
              +
            • +
            • ms1Name (required) +
                +
              • starting milestone element name (e.g. "pb")
              • +
              +
            • +
            • ms1Pos (required) +
                +
              • starting milestone position (e.g. "13")
              • +
              +
            • +
            • ms2Name (required) +
                +
              • ending milestone element name (e.g. "pb")
              • +
              +
            • +
            • ms2Pos (required) +
                +
              • ending milestone position (e.g. "14")
              • +
              +
            • +
            +
          • +
          • Response output + +
          • +
          +
        • + +
        • Url: /mpiwg-mpdl-xml-web/xquery/XQuery +
            +
          • Request parameters +
              +
            • inputString or srcUrl (required) +
                +
              • inputString +
                  +
                • XML string
                • +
                +
              • +
              • srcUrl +
                  +
                • source URL of XML document
                • +
                +
              • +
              +
            • +
            • xquery (required) +
                +
              • XQuery (or XPath) source code which should be executed
              • +
              +
            • +
            +
          • +
          • Response output + +
          • +
          +
        • +
        + + \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/xsl/generateId.xsl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/xsl/generateId.xsl Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,34 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/build/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/build/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/build/build.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/build/build.xml Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,57 @@ + + + mpiwg-mpdl-xml-web + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/conf/constants-mpdl-system.properties --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/conf/constants-mpdl-system.properties Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +docDir=/usr/local/tomcat-mpdl/mpdl-data/xml/documents \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/conf/constants.properties --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/conf/constants.properties Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +docDir=/Users/jwillenborg/mpdl/data/xml/documents \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/dist-remote/mpiwg-mpdl-xml-web.jar Binary file software/mpdl-services/mpiwg-mpdl-xml-web/dist-remote/mpiwg-mpdl-xml-web.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 
software/mpdl-services/mpiwg-mpdl-xml-web/dist-remote/mpiwg-mpdl-xml-web.war Binary file software/mpdl-services/mpiwg-mpdl-xml-web/dist-remote/mpiwg-mpdl-xml-web.war has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/dist/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/dist/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/dist/mpiwg-mpdl-xml-web.jar Binary file software/mpdl-services/mpiwg-mpdl-xml-web/dist/mpiwg-mpdl-xml-web.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/dist/mpiwg-mpdl-xml-web.war Binary file software/mpdl-services/mpiwg-mpdl-xml-web/dist/mpiwg-mpdl-xml-web.war has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/src/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/src/de/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 
software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/exception/ApplicationException.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/exception/ApplicationException.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,15 @@ +package de.mpg.mpiwg.berlin.mpdl.exception; + +public class ApplicationException extends Exception { + private static final long serialVersionUID = 1L; + + public ApplicationException(Exception e) { + super(e); + } + + public ApplicationException(String str) { + super(str); + } + +} + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/xml/GetFragment.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/xml/GetFragment.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,55 @@ +package de.mpg.mpiwg.berlin.mpdl.servlets.xml; + +import java.io.IOException; +import java.io.PrintWriter; + +import javax.servlet.ServletConfig; +import javax.servlet.ServletContext; +import javax.servlet.ServletException; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.xml.transform.FragmentTransformer; + +public class GetFragment extends HttpServlet { + private static final long serialVersionUID = 1L; + private FragmentTransformer fragmentTransformer; + private String documentDirectory; + public GetFragment() { + super(); + } + + public void init(ServletConfig config) throws ServletException { + super.init(config); + ServletContext context = getServletContext(); + fragmentTransformer = (FragmentTransformer) context.getAttribute("fragmentTransformer"); + documentDirectory = (String) context.getAttribute("documentDirectory"); + } + + protected void 
doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + request.setCharacterEncoding("utf-8"); + response.setCharacterEncoding("utf-8"); + String docId = request.getParameter("docId"); + String ms1Name = request.getParameter("ms1Name"); + int ms1Pos = new Integer(request.getParameter("ms1Pos")); + String ms2Name = request.getParameter("ms2Name"); + int ms2Pos = new Integer(request.getParameter("ms2Pos")); + try { + String xmlFileName = documentDirectory + docId; + String result = fragmentTransformer.getFragment(xmlFileName, ms1Name, ms1Pos, ms2Name, ms2Pos); + response.setContentType("text/xml"); + PrintWriter out = response.getWriter(); + out.print(result); + out.close(); + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + // TODO Auto-generated method stub + } + +}
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/xml/MpiwgMpdlXmlWebServletContextListener.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/xml/MpiwgMpdlXmlWebServletContextListener.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,57 @@
+package de.mpg.mpiwg.berlin.mpdl.servlets.xml;
+
+import javax.servlet.ServletContext;
+import javax.servlet.ServletContextEvent;
+import javax.servlet.ServletContextListener;
+
+import de.mpg.mpiwg.berlin.mpdl.xml.general.Constants;
+import de.mpg.mpiwg.berlin.mpdl.xml.transform.FragmentTransformer;
+
+/**
+ * Servlet context listener of the mpiwg-mpdl-xml-web application.
+ *
+ * On startup it stores a FragmentTransformer under the context attribute
+ * "fragmentTransformer" and the document directory reported by Constants under
+ * the context attribute "documentDirectory".
+ */
+public class MpiwgMpdlXmlWebServletContextListener implements ServletContextListener {
+  private ServletContext context = null;
+  private FragmentTransformer fragmentTransformer = null;
+
+  /**
+   * Creates the FragmentTransformer and publishes it, together with the
+   * document directory, as servlet context attributes.
+   *
+   * A failure is written to the container log and does not abort startup;
+   * attributes set before the failure stay in place, later ones are missing.
+   *
+   * @param event notification carrying the servlet context being started
+   */
+  @Override
+  public void contextInitialized(ServletContextEvent event) {
+    this.context = event.getServletContext();
+    try {
+      fragmentTransformer = new FragmentTransformer();
+      context.setAttribute("fragmentTransformer", fragmentTransformer);
+      String docDirectory = Constants.getInstance().getDocumentDir();
+      context.setAttribute("documentDirectory", docDirectory);
+      System.out.println(MpiwgMpdlXmlWebServletContextListener.class.getName() + ": contextInitialized (document directory= \"" + docDirectory + "\", set in constants.properties)");
+    } catch (Exception e) {
+      // Startup stays best effort, but the failure goes to the container log
+      // together with its stack trace instead of a bare dump on stderr.
+      context.log(MpiwgMpdlXmlWebServletContextListener.class.getName() + ": contextInitialized failed", e);
+    }
+  }
+
+  /**
+   * Drops the references held by this listener.
+   *
+   * @param e notification for the servlet context being shut down
+   */
+  @Override
+  public void contextDestroyed(ServletContextEvent e) {
+    this.context = null;
+    this.fragmentTransformer = null;
+    System.out.println(MpiwgMpdlXmlWebServletContextListener.class.getName() + ": contextDestroyed");
+  }
+}
\ No newline at end of file
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/xml/Transform.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/xml/Transform.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,78 @@
+package de.mpg.mpiwg.berlin.mpdl.servlets.xml;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+
+import javax.servlet.ServletConfig;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.xml.transform.BasicTransformer;
+
+/**
+ * Servlet that hands its request parameters to BasicTransformer.transform and
+ * returns the resulting string as text/xml.
+ *
+ * Request parameters: srcUrl, xslUrl, parameters, outputProperties.
+ */
+public class Transform extends HttpServlet {
+  private static final long serialVersionUID = 1L;
+
+  public Transform() {
+    super();
+  }
+
+  @Override
+  public void init(ServletConfig config) throws ServletException {
+    super.init(config);
+  }
+
+  /**
+   * Passes srcUrl, xslUrl, parameters and outputProperties to a new
+   * BasicTransformer and writes the string it returns to the response.
+   *
+   * Answers 400 (Bad Request) when srcUrl or xslUrl is missing.
+   *
+   * @throws ServletException wrapping an ApplicationException from the transformer
+   * @throws IOException if the response cannot be written
+   */
+  @Override
+  protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
+    request.setCharacterEncoding("utf-8");
+    response.setCharacterEncoding("utf-8");
+    String srcUrl = request.getParameter("srcUrl");
+    String xslUrl = request.getParameter("xslUrl");
+    String parameters = request.getParameter("parameters");
+    String outputProperties = request.getParameter("outputProperties");
+    // NOTE(review): assumes BasicTransformer cannot work without a source and a
+    // stylesheet URL - confirm before relying on the 400 for xslUrl.
+    if (srcUrl == null || xslUrl == null) {
+      response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Missing request parameter: srcUrl and xslUrl are required");
+      return;
+    }
+    // NOTE(review): both URLs come straight from the client; if they are resolved
+    // server-side and this servlet is reachable from untrusted networks, restrict them.
+    try {
+      BasicTransformer basicTransformer = new BasicTransformer();
+      String result = basicTransformer.transform(srcUrl, xslUrl, parameters, outputProperties);
+      response.setContentType("text/xml");
+      PrintWriter out = response.getWriter();
+      out.print(result);
+      out.close();
+    } catch (ApplicationException e) {
+      throw new ServletException(e);
+    }
+  }
+
+  /**
+   * Not implemented: the body is empty, so nothing is written for a POST request.
+   */
+  @Override
+  protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
+    // TODO Auto-generated method stub
+  }
+
+}
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/xml/XQuery.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/xml/XQuery.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,93 @@
+package de.mpg.mpiwg.berlin.mpdl.servlets.xml;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.net.MalformedURLException;
+import java.net.URL;
+
+import javax.servlet.ServletConfig;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator;
+
+/**
+ * Servlet that evaluates an XQuery expression with XQueryEvaluator and returns
+ * the result as text/xml.
+ *
+ * Request parameters: xquery, and either inputString or srcUrl; inputString
+ * takes precedence when both are given.
+ */
+public class XQuery extends HttpServlet {
+  private static final long serialVersionUID = 1L;
+  private XQueryEvaluator xqueryEvaluator;
+
+  public XQuery() {
+    super();
+  }
+
+  /**
+   * Creates the XQueryEvaluator that doGet uses.
+   */
+  @Override
+  public void init(ServletConfig config) throws ServletException {
+    super.init(config);
+    xqueryEvaluator = new XQueryEvaluator();
+  }
+
+  /**
+   * Evaluates the xquery parameter against inputString or, if that is absent,
+   * against the URL given in srcUrl, and writes the result to the response.
+   *
+   * Answers 400 (Bad Request) when xquery is missing, when neither inputString
+   * nor srcUrl is given, or when srcUrl is not a well-formed URL.
+   *
+   * @throws ServletException wrapping an ApplicationException from the evaluator
+   * @throws IOException if the response cannot be written
+   */
+  @Override
+  protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
+    request.setCharacterEncoding("utf-8");
+    response.setCharacterEncoding("utf-8");
+    String inputString = request.getParameter("inputString");
+    String srcUrlStr = request.getParameter("srcUrl");
+    String xqueryStr = request.getParameter("xquery");
+    if (xqueryStr == null) {
+      response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Missing request parameter: xquery");
+      return;
+    }
+    // With neither input there is nothing to evaluate; reject the request
+    // instead of answering with the text "null".
+    if (inputString == null && srcUrlStr == null) {
+      response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Missing request parameter: inputString or srcUrl is required");
+      return;
+    }
+    try {
+      String result;
+      if (inputString != null) {
+        result = xqueryEvaluator.evaluateAsString(inputString, xqueryStr);
+      } else {
+        URL srcUrl;
+        try {
+          srcUrl = new URL(srcUrlStr);
+        } catch (MalformedURLException e) {
+          response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Malformed request parameter: srcUrl");
+          return;
+        }
+        result = xqueryEvaluator.evaluateAsString(srcUrl, xqueryStr);
+      }
+      response.setContentType("text/xml");
+      PrintWriter out = response.getWriter();
+      // NOTE(review): the two empty literals look like markup lost from this
+      // patch text - confirm against the repository before simplifying.
+      out.print("" + result + "");
+      out.close();
+    } catch (ApplicationException e) {
+      throw new ServletException(e);
+    }
+  }
+
+}