# HG changeset patch # User Josef Willenborg # Date 1320849125 -3600 # Node ID 4a3641ae14d2107bda519c28257b915a00189a1c # Parent dc5e9fcb3fdcd3461235a09f4b0dbf7aa6b74558 Erstellung diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.classpath --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/.classpath Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.externalToolBuilders/Ant-Build.launch --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/.externalToolBuilders/Ant-Build.launch Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.project --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/.project Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,46 @@ + + + mpiwg-mpdl-lt-web + + + + + + org.eclipse.wst.jsdt.core.javascriptValidator + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.wst.common.project.facet.core.builder + + + + + org.eclipse.wst.validation.validationbuilder + + + + + org.eclipse.ui.externaltools.ExternalToolBuilder + full,incremental, + + + LaunchConfigHandle + <project>/.externalToolBuilders/Ant-Build.launch + + + + + + org.eclipse.jem.workbench.JavaEMFNature + org.eclipse.wst.common.modulecore.ModuleCoreNature + org.eclipse.wst.common.project.facet.core.nature + org.eclipse.jdt.core.javanature + org.eclipse.wst.jsdt.core.jsNature + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.settings/.jsdtscope --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/software/mpdl-services/mpiwg-mpdl-lt-web/.settings/.jsdtscope Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.jdt.core.prefs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.jdt.core.prefs Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,12 @@ +#Mon Sep 12 15:41:45 CEST 2011 +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 +org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve +org.eclipse.jdt.core.compiler.compliance=1.6 +org.eclipse.jdt.core.compiler.debug.lineNumber=generate +org.eclipse.jdt.core.compiler.debug.localVariable=generate +org.eclipse.jdt.core.compiler.debug.sourceFile=generate +org.eclipse.jdt.core.compiler.problem.assertIdentifier=error +org.eclipse.jdt.core.compiler.problem.enumIdentifier=error +org.eclipse.jdt.core.compiler.source=1.6 diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.common.component --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.common.component Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,10 @@ + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.common.project.facet.core.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.common.project.facet.core.xml Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,10 @@ + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.jsdt.ui.superType.container --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.jsdt.ui.superType.container Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +org.eclipse.wst.jsdt.launching.baseBrowserLibrary \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.jsdt.ui.superType.name --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.jsdt.ui.superType.name Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +Window \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.ws.service.policy.prefs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/.settings/org.eclipse.wst.ws.service.policy.prefs Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,3 @@ +#Mon Sep 12 15:41:45 CEST 2011 +eclipse.preferences.version=1 +org.eclipse.wst.ws.service.policy.projectEnabled=false diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/META-INF/MANIFEST.MF --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/META-INF/MANIFEST.MF Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,3 @@ +Manifest-Version: 1.0 +Class-Path: + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/classes/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/classes/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 
software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/classes/constants.properties --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/classes/constants.properties Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +dataDir=/usr/local/tomcat-mpdl/mpdl-data/lt \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/berkeley-db-3.3.82.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/berkeley-db-3.3.82.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/commons-io-2.0.1.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/commons-io-2.0.1.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/commons-lang3-3.0.1.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/commons-lang3-3.0.1.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/lucene-core-3.4.0.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/lucene-core-3.4.0.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/mpiwg-mpdl-lt-web.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/mpiwg-mpdl-lt-web.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/mpiwg-mpdl-lt.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/mpiwg-mpdl-lt.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 
software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/mpiwg-mpdl-xml.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/mpiwg-mpdl-xml.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/saxon9-s9api.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/saxon9-s9api.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/saxon9.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/saxon9.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/transcoder11.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/lib/transcoder11.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/web.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/WEB-INF/web.xml Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,70 @@ + + + mpiwg-mpdl-xml-web + + index.html + + + GetDictionaryEntries + GetDictionaryEntries + GetDictionaryEntries + de.mpg.mpiwg.berlin.mpdl.servlets.lt.GetDictionaryEntries + + + GetDictionaryEntries + /lt/GetDictionaryEntries + + + GetLemmas + GetLemmas + GetLemmas + de.mpg.mpiwg.berlin.mpdl.servlets.lt.GetLemmas + + + GetLemmas + /lt/GetLemmas + + + GetForms + GetForms + GetForms + de.mpg.mpiwg.berlin.mpdl.servlets.lt.GetForms + + + GetForms + /lt/GetForms + + + Tokenize + Tokenize + Tokenize + de.mpg.mpiwg.berlin.mpdl.servlets.lt.Tokenize + + + Tokenize + /text/Tokenize + + + Normalize + Normalize + Normalize + de.mpg.mpiwg.berlin.mpdl.servlets.lt.Normalize + + + Normalize + /text/Normalize + + + Transcode + Transcode + Transcode + de.mpg.mpiwg.berlin.mpdl.servlets.lt.Transcode + + + Transcode + /text/Transcode + + + 
de.mpg.mpiwg.berlin.mpdl.servlets.lt.MpiwgMpdlLtWebServletContextListener + + \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/2downarrow.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/2downarrow.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/2leftarrow.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/2leftarrow.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/2rightarrow.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/2rightarrow.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/2uparrow.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/2uparrow.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/book-pointer.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/book-pointer.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/book.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/book.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/camera.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/camera.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/copyleft.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/copyleft.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 
software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/dictionary.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/dictionary.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/dictionaryMorph.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/dictionaryMorph.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/dot.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/dot.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/download.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/download.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/echo.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/echo.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/figures.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/figures.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/help.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/help.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/image.jpg Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/image.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/imageU.jpg Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/imageU.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/info.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/info.png has changed diff -r 
dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/left.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/left.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/link.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/link.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/linkback.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/linkback.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/linkext.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/linkext.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/linkto.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/linkto.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/malcolm.jpg Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/malcolm.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/malcolm.tif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/malcolm.tif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/pirate-joey.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/pirate-joey.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/right.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/right.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/search.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/search.gif 
has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/searchMorph.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/searchMorph.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/searchStructural.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/searchStructural.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/searchXPath.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/searchXPath.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/slime_logo.png Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/slime_logo.png has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/text.jpg Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/text.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/textPollux.jpg Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/textPollux.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/textPolluxU.jpg Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/textPolluxU.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/textU.jpg Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/textU.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/toc.gif Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/toc.gif has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/xml.jpg Binary file 
software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/xml.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/xmlU.jpg Binary file software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/images/xmlU.jpg has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/index.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/WebContent/index.html Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,370 @@ + + + + +Max Planck Institute for the History of Science - Mpdl: Language technology services + + + + + + +

+ [This software is dedicated to Dr. Malcolm Hyman] +

+ [It is based on Donatus and Pollux] +

Max Planck Institute for the History of Science - Mpdl: Language technology services

Url: /mpiwg-mpdl-lt-web/lt/GetDictionaryEntries +
- Request parameters +
  - query (required) +
    - by one form or lemma (e.g. "revolution")
    - by a list of forms or lemmas (e.g. "revolution equality brotherliness")
    - by a prefix range: entries starting with a prefix (e.g. "a*")
    +
  - inputType (optional) +
    - "form"
    - "lemma"
    - default: "form"
    +
  - language (optional) +
    - ISO 639-3 specifier
    - default: "eng"
    +
  - dictionary (optional) +
    - dictionary name, e.g. "webster"
    - default: "all" (all dictionaries for the specified language)
    +
  - outputType (optional) +
    - "compact"
    - "full"
    - default: "compact"
    +
  - outputFormat (optional) +
    - "html"
    - "xml"
    - default: "xml"
    +
  - normalization (optional) +
    - "none"
    - "norm"
    - default: "norm"
    +
  - resultPage (optional) +
    - works only for range queries
    - page number of the result (e.g. "2": result entries from position 51 to 100)
    - default: "1"
    +
  +
- Response output +
  - depends on outputFormat, outputType and resultPage: morphology, dictionary and Wikipedia entries in XML or HTML format
  - Example: query=a*&dictionary=ls
  - Example: query=a*&language=lat&outputFormat=html
  - Example: query=revolution&language=lat
  - Example: query=multa&language=lat&outputFormat=html&outputType=full
  +
+
Url: /mpiwg-mpdl-lt-web/lt/GetLemmas +
- Request parameters +
  - query (required) +
    - one form or lemma (e.g. "revolution") or
    - blank separated list of forms or lemmas (e.g. "revolution equality brotherliness")
    +
  - inputType (optional) +
    - "form"
    - "lemma"
    - default: "form"
    +
  - language (optional) +
    - ISO 639-3 specifier
    - default: "eng"
    +
  - outputType (optional) +
    - "compact"
    - "full"
    - default: "compact"
    +
  - outputFormat (optional) +
    - "html"
    - "xml"
    - "string" (lemma names separated by a blank)
    - default: "xml"
    +
  - normalization (optional) +
    - "none"
    - "norm"
    - default: "norm"
    +
  +
- Response output +
  - depends on outputFormat and outputType: lemma entries in XML, HTML or string format
  - Example: query=multa&language=lat&outputFormat=html
  +
+
Url: /mpiwg-mpdl-lt-web/lt/GetForms +
- Request parameters +
  - query (required) +
    - one lemma (e.g. "revolution") or
    - blank-separated list of lemmas (e.g. "revolution equality brotherliness")
    +
  - language (optional) +
    - ISO 639-3 specifier
    - default: "eng"
    +
  - outputType (optional) +
    - "compact"
    - "full"
    - default: "compact"
    +
  - outputFormat (optional) +
    - "html"
    - "xml"
    - "string" (form names separated by a blank)
    - default: "xml"
    +
  - normalization (optional) +
    - "none"
    - "norm"
    - default: "norm"
    +
  +
- Response output +
  - depends on outputFormat and outputType: form entries in XML, HTML or string format
  - Example: query=edo sum&language=lat&outputFormat=string
  +
+
Url: /mpiwg-mpdl-lt-web/text/Tokenize +
- Request parameters +
  - inputString or srcUrl (required) +
    - inputString +
      - string which should be tokenized +
        +
        unstructured text
        +
        XML fragment/document
        +
        +
      +
    - srcUrl +
      - source URL +
        +
        unstructured text
        +
        XML fragment/document
        +
        +
      +
    +
  - language (optional) +
    - ISO 639-3 specifier
    - if input is XML and an element contains the attribute "xml:lang" this value is used for this element
    - default: "eng"
    +
  - normalization (optional) +
    - "none"
    - "norm"
    - default: "norm"
    +
  - dictionary (optional) +
    - "yes"
    - "no"
    - default: "yes"
    +
  - stopElements (optional) +
    - list of xml element names which should not be tokenized (e.g. "var")
    - default: empty list
    +
  - outputFormat (optional) +
    - "xml"
    - "string"
    - default: "xml"
    +
  - outputOptions (optional) +
    - output options separated with blanks (e.g. "withForms withLemmas") +
      - "withForms"
      - "withLemmas"
      - default: empty list
      +
    +
  +
- Response output +
  - outputFormat=xml +
    - tokenized inputString or document (enriched by element <w>) +
      - Example: <s><w lang="deu" form="dies" forms="dies, dieser, dieses, diesen" lemmas="dieser">Dies</w> <w + lang="deu" form="ist" forms="bin, bist, ist, seid, sind, sein, war, warst, wart" lemmas="sein">ist</w> <w + lang="deu" form="ein" forms="ein, eines, einer" lemmas="ein">ein</w> <w lang="deu" form="satz" + forms="satz, sätze, satzes" lemmas="satz">Satz</w></s> +
      +
    +
  - outputFormat=string +
    - word tokens of inputString or document (separated by a blank)
    +
  - Example: inputString=edo sum philoſophi&language=lat&outputFormat=xml
  - Example: srcUrl=http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Benedetti_1585.xml%26mode=pureXml%26pn=13&language=lat
  - Example: srcUrl=http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Benedetti_1585.xml%26mode=pureXml%26pn=13&language=lat&outputOptions=withForms withLemmas
  +
+
Url: /mpiwg-mpdl-lt-web/text/Normalize +
- Request parameters +
  - inputString (required) +
    - string which should be normalized
    +
  - language (optional) +
    - ISO 639-3 specifier
    - default: "eng"
    +
  - type (optional) +
    - "dictionary"
    - "display"
    - default: "display"
    +
  +
- Response output +
  - normalized string
  - Example: inputString=philoſophi&language=lat
  +
+
Url: /mpiwg-mpdl-lt-web/text/Transcode +
- Request parameters +
  - inputString (required) +
    - string which should be transcoded
    +
  - srcEncoding (required) +
    - "betacode"
    - "buckwalter"
    - "unicode"
    +
  - destEncoding (optional) +
    - "betacode"
    - "buckwalter"
    - "unicode"
    - default: "unicode"
    +
  +
- Response output +
  - transcoded string
  - Example: inputString=kai/&srcEncoding=betacode&destEncoding=unicode
  +
+

+ + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/build/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/build/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/build/build.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/build/build.xml Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,57 @@ + + + mpiwg-mpdl-lt-web + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/conf/constants-mpdl-system.properties --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/conf/constants-mpdl-system.properties Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +dataDir=/usr/local/tomcat-mpdl/mpdl-data/lt \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/conf/constants.properties --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/conf/constants.properties Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +dataDir=/Users/jwillenborg/mpdl/data/lt \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/dist-remote/mpiwg-mpdl-lt-web.jar Binary file software/mpdl-services/mpiwg-mpdl-lt-web/dist-remote/mpiwg-mpdl-lt-web.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/dist-remote/mpiwg-mpdl-lt-web.war Binary file software/mpdl-services/mpiwg-mpdl-lt-web/dist-remote/mpiwg-mpdl-lt-web.war has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/dist/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/dist/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/dist/mpiwg-mpdl-lt-web.jar 
Binary file software/mpdl-services/mpiwg-mpdl-lt-web/dist/mpiwg-mpdl-lt-web.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/dist/mpiwg-mpdl-lt-web.war Binary file software/mpdl-services/mpiwg-mpdl-lt-web/dist/mpiwg-mpdl-lt-web.war has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/src/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/src/de/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/exception/ApplicationException.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/exception/ApplicationException.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,14 @@ +package de.mpg.mpiwg.berlin.mpdl.exception; + +public class ApplicationException extends Exception { + + public ApplicationException(Exception e) { + super(e); + } + + public 
ApplicationException(String str) { + super(str); + } + +} + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/GetDictionaryEntries.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/GetDictionaryEntries.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,324 @@ +package de.mpg.mpiwg.berlin.mpdl.servlets.lt; + +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; + +import javax.servlet.ServletConfig; +import javax.servlet.ServletException; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.apache.commons.lang3.StringEscapeUtils; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.servlets.util.ServletUtil; + +public class GetDictionaryEntries extends HttpServlet { + private static final long serialVersionUID = 1L; + private LexHandler lexHandler; + + public GetDictionaryEntries() { + super(); + } + + public void init(ServletConfig config) throws ServletException { + super.init(config); + try { + lexHandler = LexHandler.getInstance(); + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + Date begin = new Date(); + request.setCharacterEncoding("utf-8"); + response.setCharacterEncoding("utf-8"); + String query = 
request.getParameter("query"); + String language = request.getParameter("language"); + String inputType = request.getParameter("inputType"); + String outputFormat = request.getParameter("outputFormat"); + String outputType = request.getParameter("outputType"); + String dictionary = request.getParameter("dictionary"); + String normalization = request.getParameter("normalization"); + String resultPage = request.getParameter("resultPage"); + if (query == null) + query = "a*"; + if (language == null) + language = "eng"; + if (inputType == null || ! (inputType.equals("form") || inputType.equals("lemma"))) + inputType = "form"; + if (outputFormat == null || ! (outputFormat.equals("xml") || outputFormat.equals("html"))) + outputFormat = "xml"; + if (outputType == null || ! (outputType.equals("compact") || outputType.equals("full"))) + outputType = "compact"; + if (normalization == null || ! (normalization.equals("none") || normalization.equals("reg") || normalization.equals("reg norm"))) + normalization = "norm"; + String xmlDict = "all"; + if (dictionary != null) + xmlDict = dictionary; + int pn = 1; + if (resultPage != null) + pn = new Integer(resultPage); + boolean isRangeQuery = false; + if (query.endsWith("*")) + isRangeQuery = true; + String xmlQueryString = "" + query + "" + "" + language + "" + "" + inputType + "" + + "" + outputFormat + "" + "" + outputType + "" + "" + xmlDict + "" + + "" + normalization + "" + ""; + try { + if (outputFormat.equals("xml")) + response.setContentType("text/xml"); + else if (outputFormat.equals("html")) + response.setContentType("text/html"); + else + response.setContentType("text/xml"); + PrintWriter out = response.getWriter(); + if (query == null || query.isEmpty()) { + out.print("request parameter query is empty. 
Please specify a query."); + out.close(); + return; + } + ArrayList lemmas = null; + ArrayList dictionaries = null; + if (isRangeQuery) { + String queryTmp = query.substring(0, query.length() - 1); // without last star + if (dictionary != null) + dictionaries = lexHandler.getLexEntriesByLexiconBeginningWith(dictionary, queryTmp, pn); + else + dictionaries = lexHandler.getLexEntriesBeginningWith(language, queryTmp, pn); + } else { + lemmas = lexHandler.getLemmas(query, inputType, language, normalization); + dictionaries = lexHandler.getLexEntries(lemmas, language, dictionary); + } + String baseUrl = ServletUtil.getInstance().getBaseUrl(request); + Date end = new Date(); + String elapsedTime = String.valueOf(end.getTime() - begin.getTime()); + String result = ""; + if (outputFormat == null || outputFormat.equals("xml")) + result = createXmlOutputString(query, lemmas, dictionaries, outputType, baseUrl, xmlQueryString, elapsedTime); + else if (outputFormat.equals("html")) + result = createHtmlOutputString(query, lemmas, dictionaries, outputType, elapsedTime); + else + result = createXmlOutputString(query, lemmas, dictionaries, outputType, baseUrl, xmlQueryString, elapsedTime); + out.print(result); + out.close(); + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + + } + + private String createXmlOutputString(String query, ArrayList lemmas, ArrayList lexicons, String outputType, String baseUrl, String xmlQueryString, String elapsedTime) { + String result = ""; + result = result + "" + "MPIWG MPDL language technology service (see: " + "" + baseUrl + "), Max Planck Institute for the History of Science, Berlin." + ""; + result = result + xmlQueryString; + result = result + "" + elapsedTime + ""; + if (lemmas != null && ! 
lemmas.isEmpty()) { + result = result + ""; + for (int i=0; i"; + result = result + "" + lemmaName + ""; + if (outputType != null && outputType.equals("full")) { + String lemmaProvider = lemma.getProvider(); + result = result + "" + lemmaProvider + ""; + result = result + "" + language + ""; + } + if (Language.getInstance().isArabic(language) || Language.getInstance().isLatin(language)) { + String remoteUrl = "http://www.perseus.tufts.edu/hopper/morph?l=" + lemmaName + "&la=" + language; + result = result + "" + remoteUrl + ""; + } else if (Language.getInstance().isGreek(language)) { + String remoteUrl = "http://www.perseus.tufts.edu/hopper/morph?l=" + lemmaName + "&la=" + "greek"; + result = result + "" + remoteUrl + ""; + } + if (outputType != null && outputType.equals("full")) { + ArrayList"; + } + } + result = result + ""; + } + result = result + ""; + } + if (lexicons != null) { + result = result + ""; + for (int i=0; i"; + } + if (outputType != null && outputType.equals("full") && lemmas != null && ! lemmas.isEmpty()) { + result = result + ""; + for (int i=0; i"; + result = result + "" + lemmaName + ""; + String wikiHrefExact = "http://" + language + ".wikipedia.org/wiki/" + lemmaName; + String wikiHrefSearch = "http://" + language + ".wikipedia.org/wiki/index.php?search=" + lemmaName; + result = result + "" + wikiHrefExact + ""; + result = result + "" + wikiHrefSearch + ""; + result = result + ""; + } + result = result + ""; + } + result = result + ""; + return result; + } + + private String createHtmlOutputString(String query, ArrayList lemmas, ArrayList lexicons, String outputType, String elapsedTime) { + String result = ""; + result = result + ""; + result = result + ""; + result = result + "Word information for: \"" + query + "\""; + result = result + ""; + result = result + ""; + result = result + ""; + result = result + ""; + result = result + "

[This is a MPIWG MPDL language technology service] $\"MPIWG$

"; + result = result + "

Word information for: \"" + query + "\"

"; + if (lemmas != null && ! lemmas.isEmpty()) { + result = result + "

Morphology

"; + result = result + "

"; + for (int i=0; i"; + result = result + lemmaName; + if (outputType != null && outputType.equals("full")) { + String lemmaProvider = lemma.getProvider(); + result = result + " (data provider: " + lemmaProvider + ")"; + } + if (Language.getInstance().isArabic(language) || Language.getInstance().isLatin(language)) + result = result + " (external link: " + lemmaName + ")"; + else if (Language.getInstance().isGreek(language)) + result = result + " (external link: " + lemmaName + ")"; + if (outputType != null && outputType.equals("full")) { + ArrayList

forms = lemma.getFormsList(); + Collections.sort(forms); + if (forms != null && ! forms.isEmpty()) { + result = result + "

"; + } + } + result = result + ""; + } + result = result + "

"; + } + if (lexicons != null && ! lexicons.isEmpty()) { + result = result + "

Dictionary

"; + result = result + "

"; + for (int i=0; i"; + result = result + "" + lexicon.getDescription() + ""; + result = result + "

entries = lexicon.getEntries(); + for (int j=0; j", ""); + repairedEntry = repairedEntry.replaceAll("", ""); + entryContent = entryContent + repairedEntry; // valid unicode content of the original entry + } else { + entryContent = entryContent + "[Remark: this dictionary entry has no valid XML/HTML content in database so a text version of this entry is shown.]:
"; + String originalEntry = entry.getOriginalEntry(); + originalEntry = originalEntry.replaceAll("", ""); + originalEntry = originalEntry.replaceAll("", ""); + originalEntry = StringEscapeUtils.escapeXml(originalEntry); // create text version of the invalid xml content + entryContent = entryContent + originalEntry; + } + if (entry.getRemoteUrl() != null) { + entryContent = entryContent + "

(external link: " + entry.getFormName() + ")

"; + } + } else { + if (entry.getRemoteUrl() != null) { + entryContent = entryContent + "external link: " + entry.getFormName() + ""; + } + } + String formName = entry.getFormName(); + String dictName = lexicon.getName(); + if (outputType != null && outputType.equals("full")) { + result = result + "

" + "" + formName + "

" + entryContent + "

"; + } else if (outputType != null && outputType.equals("compact")) { + result = result + "

" + "" + formName + "

"; + } + } + result = result + "

"; + result = result + "

"; + } + if (outputType != null && outputType.equals("full") && lemmas != null && ! lemmas.isEmpty()) { + result = result + "

Wikipedia

"; + result = result + "

"; + for (int i=0; i"; + String wikiHrefExact = "http://" + language + ".wikipedia.org/wiki/" + lemmaName; + String wikiHrefSearch = "http://" + language + ".wikipedia.org/wiki/index.php?search=" + lemmaName; + result = result + "Article: External link: " + lemmaName + " (or search for " + lemmaName + ")"; + result = result + ""; + } + result = result + "

"; + } + result = result + "[* external links may not function]"; + result = result + "

"; + result = result + "

"; + result = result + "Elapsed time: " + elapsedTime + " ms, see the service description of this page, if you find a bug let us know"; + result = result + ""; + result = result + ""; + return result; + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/GetForms.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/GetForms.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,210 @@ +package de.mpg.mpiwg.berlin.mpdl.servlets.lt; + +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.Hashtable; + +import javax.servlet.ServletConfig; +import javax.servlet.ServletException; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; + +public class GetForms extends HttpServlet { + private static final long serialVersionUID = 1L; + private LexHandler lexHandler; + + public GetForms() { + super(); + } + + public void init(ServletConfig config) throws ServletException { + super.init(config); + try { + lexHandler = LexHandler.getInstance(); + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + Date begin = new Date(); + request.setCharacterEncoding("utf-8"); + response.setCharacterEncoding("utf-8"); + String query = request.getParameter("query"); + String language = request.getParameter("language"); + String outputFormat = request.getParameter("outputFormat"); + String 
outputType = request.getParameter("outputType"); + String normalization = request.getParameter("normalization"); + if (language == null) + language = "eng"; + if (outputFormat == null || ! (outputFormat.equals("xml") || outputFormat.equals("html") || outputFormat.equals("string"))) + outputFormat = "xml"; + if (outputType == null || ! (outputType.equals("compact") || outputType.equals("full"))) + outputType = "compact"; + if (normalization == null || ! (normalization.equals("none") || normalization.equals("reg") || normalization.equals("reg norm"))) + normalization = "norm"; + + String xmlQueryString = "" + query + "" + "" + language + "" + + "" + outputFormat + "" + "" + outputType + "" + "" + normalization + "" + ""; + try { + if (outputFormat.equals("xml")) + response.setContentType("text/xml"); + else if (outputFormat.equals("html") || outputFormat.equals("string")) + response.setContentType("text/html"); + else + response.setContentType("text/xml"); + PrintWriter out = response.getWriter(); + if (query == null || query.isEmpty()) { + out.print("request parameter query is empty. Please specify a query."); + out.close(); + return; + } + ArrayList lemmas = lexHandler.getLemmas(query, "lemma", language, normalization); + Hashtable formsHashtable = new Hashtable(); + ArrayList forms = new ArrayList(); + if (lemmas != null && ! 
lemmas.isEmpty()) { + for (int i=0; i lemmaForms = lemma.getFormsList(); + for (int j=0; j < lemmaForms.size(); j++) { + Form form = lemmaForms.get(j); + formsHashtable.put(form.getFormName(), form); + } + } + } + forms.addAll(formsHashtable.values()); + Collections.sort(forms); + String baseUrl = getBaseUrl(request); + Date end = new Date(); + String elapsedTime = String.valueOf(end.getTime() - begin.getTime()); + String result = ""; + if (outputFormat == null || outputFormat.equals("xml")) + result = createXmlOutputString(query, forms, outputType, baseUrl, xmlQueryString, elapsedTime); + else if (outputFormat.equals("html")) + result = createHtmlOutputString(query, forms, outputType, elapsedTime); + else if (outputFormat.equals("string")) + result = createStringOutputString(forms); + else + result = createXmlOutputString(query, forms, outputType, baseUrl, xmlQueryString, elapsedTime); + out.print(result); + out.close(); + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + + } + + private String getBaseUrl( HttpServletRequest request ) { + if (request.getServerPort() == 80 || request.getServerPort() == 443) + return request.getScheme() + "://" + request.getServerName() + request.getContextPath(); + else + return request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort() + request.getContextPath(); + } + + + private String createXmlOutputString(String query, ArrayList forms, String outputType, String baseUrl, String xmlQueryString, String elapsedTime) { + String result = ""; + result = result + "" + "MPIWG MPDL language technology service (see: " + "" + baseUrl + "), Max Planck Institute for the History of Science, Berlin." + ""; + result = result + xmlQueryString; + result = result + "" + elapsedTime + ""; + if (forms != null && ! 
forms.isEmpty()) { + result = result + ""; + result = result + ""; + for (int i=0; i"; + Form f = forms.get(i); + String formName = f.getFormName(); + String language = f.getLanguage(); + String formProvider = f.getProvider(); + String lemmaName = f.getLemmaName(); + result = result + "" + formProvider + ""; + result = result + "" + language + ""; + result = result + "" + lemmaName + ""; + result = result + "" + formName + ""; + result = result + ""; + } + result = result + ""; + result = result + ""; + } + result = result + ""; + return result; + } + + private String createHtmlOutputString(String query, ArrayList

forms, String outputType, String elapsedTime) { + String result = ""; + result = result + ""; + result = result + ""; + result = result + "Lemmas for: \"" + query + "\""; + result = result + ""; + result = result + ""; + result = result + ""; + result = result + ""; + result = result + "

[This is a MPIWG MPDL language technology service] $\"MPIWG$

"; + result = result + "

Forms for: \"" + query + "\"

"; + if (forms != null && ! forms.isEmpty()) { + result = result + "

Morphology

"; + result = result + "

"; + if (outputType != null && outputType.equals("full")) { + for (int i=0; i"; + Form f = forms.get(i); + String formName = f.getFormName(); + String formProvider = f.getProvider(); + String language = f.getLanguage(); + String lemmaName = f.getLemmaName(); + result = result + formName + " (data provider: " + formProvider + ", language: " + language + ", lemmaName: " + lemmaName + ")"; + result = result + ""; + } + } else if (outputType == null || outputType.equals("compact")) { + result = result + "

"; + for (int i=0; i
"; + } else if (outputType.equals("string")) { + for (int i=0; i
"; + result = result + "
"; + result = result + "Elapsed time: " + elapsedTime + " ms, see the service description of this page, if you find a bug let us know"; + result = result + ""; + result = result + ""; + return result; + } + + private String createStringOutputString(ArrayList forms) { + String result = ""; + for (int i=0; i" + "" + language + "" + "" + inputType + "" + + "" + outputFormat + "" + "" + outputType + "" + "" + normalization + "" + ""; + try { + if (outputFormat.equals("xml")) + response.setContentType("text/xml"); + else if (outputFormat.equals("html") || outputFormat.equals("string")) + response.setContentType("text/html"); + else + response.setContentType("text/xml"); + PrintWriter out = response.getWriter(); + if (query == null || query.isEmpty()) { + out.print("request parameter query is empty. Please specify a query."); + out.close(); + return; + } + ArrayList lemmas = lexHandler.getLemmas(query, inputType, language, normalization); + String baseUrl = getBaseUrl(request); + Date end = new Date(); + String elapsedTime = String.valueOf(end.getTime() - begin.getTime()); + String result = ""; + if (outputFormat == null || outputFormat.equals("xml")) + result = createXmlOutputString(query, lemmas, outputType, baseUrl, xmlQueryString, elapsedTime); + else if (outputFormat.equals("html")) + result = createHtmlOutputString(query, lemmas, outputType, elapsedTime); + else if (outputFormat.equals("string")) + result = createStringOutputString(lemmas); + else + result = createXmlOutputString(query, lemmas, outputType, baseUrl, xmlQueryString, elapsedTime); + out.print(result); + out.close(); + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + + } + + private String getBaseUrl( HttpServletRequest request ) { + if (request.getServerPort() == 80 || request.getServerPort() == 443) + return request.getScheme() + "://" + 
request.getServerName() + request.getContextPath(); + else + return request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort() + request.getContextPath(); + } + + + private String createXmlOutputString(String query, ArrayList lemmas, String outputType, String baseUrl, String xmlQueryString, String elapsedTime) { + String result = ""; + result = result + "" + "MPIWG MPDL language technology service (see: " + "" + baseUrl + "), Max Planck Institute for the History of Science, Berlin." + ""; + result = result + xmlQueryString; + result = result + "" + elapsedTime + ""; + if (lemmas != null && ! lemmas.isEmpty()) { + result = result + ""; + for (int i=0; i"; + result = result + "" + lemmaName + ""; + if (outputType != null && outputType.equals("full")) { + String lemmaProvider = lemma.getProvider(); + result = result + "" + lemmaProvider + ""; + result = result + "" + language + ""; + } + if (Language.getInstance().isArabic(language) || Language.getInstance().isLatin(language)) { + String remoteUrl = "http://www.perseus.tufts.edu/hopper/morph?l=" + lemmaName + "&la=" + language; + result = result + "" + remoteUrl + ""; + } else if (Language.getInstance().isGreek(language)) { + String remoteUrl = "http://www.perseus.tufts.edu/hopper/morph?l=" + lemmaName + "&la=" + "greek"; + result = result + "" + remoteUrl + ""; + } + if (outputType != null && outputType.equals("full")) { + ArrayList forms = lemma.getFormsList(); + Collections.sort(forms); + if (forms != null && ! 
forms.isEmpty()) { + result = result + ""; + for (int j=0; j"; + Form f = forms.get(j); + String formName = f.getFormName(); + String formProvider = f.getProvider(); + result = result + "" + formProvider + ""; + result = result + "" + language + ""; + result = result + "" + formName + ""; + result = result + ""; + } + result = result + ""; + } + } + result = result + ""; + } + result = result + ""; + } + result = result + ""; + return result; + } + + private String createHtmlOutputString(String query, ArrayList lemmas, String outputType, String elapsedTime) { + String result = ""; + result = result + ""; + result = result + ""; + result = result + "Lemmas for: \"" + query + "\""; + result = result + ""; + result = result + ""; + result = result + ""; + result = result + ""; + result = result + "
[This is a MPIWG MPDL language technology service] $\"MPIWG$
"; + result = result + "
"; + result = result + "
Lemmas for: \"" + query + "\"
"; + if (lemmas != null && ! lemmas.isEmpty()) { + result = result + "
Morphology
"; + result = result + "
- ", "
- "); // TODO hack + } + // repair for all lexicons + newLexValueStr = newLexValueStr.replaceAll("type=style", "type=\"style\""); + newLexValueStr = newLexValueStr.replaceAll("type=dom", "type=\"dom\""); + newLexValueStr = newLexValueStr.replaceAll("<\\*>", ""); + newLexValueStr = newLexValueStr.replaceAll("
  ", "
  "); + LexiconEntry newLexEntryTemp = new LexiconEntry(lexiconName, lexDumpKeyStr, newLexValueStr); // lexDumpKeyStr is not transcoded yet but it will not be used in further in the code + LexiconEntry newLexEntry = xmlParseAndRepair(newLexEntryTemp); + String xmlValidString = "true"; + if (! newLexEntry.isXmlValid()) { + xmlValidString = "false"; + } + newLexValueStr = newLexEntry.getContent(); + // transcode the Betacode lexicon entries to Unicode (key and value) + if (lexicon.isBetacodeLexicon()) { + Transcoder transcoder = Transcoder.getInstance(); + lexDumpKeyStr = transcoder.transcodeFromBetaCode2Unicode(lexDumpKeyStr); + String elementName = "G"; + if (newLexEntry.isXmlValid()) { + newLexValueStr = transcodeByElementName("fromBetacode2Unicode", elementName, newLexValueStr); + } + } + // transcode the Buckwalter entries to Unicode (key and value) + if (lexicon.isBuckwalterLexicon()) { + Transcoder transcoder = Transcoder.getInstance(); + lexDumpKeyStr = transcoder.transcodeFromBuckwalter2Unicode(lexDumpKeyStr); + String elementName = "AR"; + if (newLexEntry.isXmlValid()) { + newLexValueStr = transcodeByElementName("fromBuckwalter2Unicode", elementName, newLexValueStr); + } + } + // put the entry into database + newLexValueStr = "" + xmlValidString + "" + lexDumpValueStr + "" + "" + newLexValueStr + "" + ""; + DatabaseEntry newLexDumpKey = new DatabaseEntry(lexDumpKeyStr.getBytes("utf-8")); + DatabaseEntry newLexValue = new DatabaseEntry(newLexValueStr.getBytes("utf-8")); + lexDB.put(null, newLexDumpKey, newLexValue); + } + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private void printSampleEntries(String lexiconName, int count) throws ApplicationException { + try { + int counter = 0; + dbEnvLexica.openDatabase(lexiconName); + Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); + Cursor cursor = lexDB.openCursor(null, null); + 
DatabaseEntry dbEntryKey = new DatabaseEntry(); + DatabaseEntry dbEntryValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS && counter < count) { + int size = dbEntryKey.getSize(); + if (size > 0) { + byte[] dbEntryKeyBytes = dbEntryKey.getData(); + String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8"); + System.out.println(lexiconName + ": key: " + dbEntryKeyStr + " value size: " + dbEntryValue.getSize()); + } + operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + counter++; + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private void testTranscoder() throws ApplicationException { + String testStr = "hfhf fdfdei)mi/ (sum), Aeol. e)/mmi hfhfh Sapph.2.15, Theoc.20.32; Cret. h)mi/ GDI 4959a; 2sg. ei)=, Ep. and Ion. ei)s Od.17.388, al., Aeol. e)/ssi, Ep. and Dor. e)ssi/ Il.1.176, Pi."; + String testStr2 = "aaaaa 1111a 2222a 3333a 1111a aaaaa bbbbb 1111b 2222b 3333b 1111b bbbbb "; + String testStr3 = "e)pano/rqwsin e)/xein, opp a)ni/aton ei)=nai *hi3. 1165 b18. --e)panorqw/seis kai boh/qeiai *rb5. 1383 a20."; + String testStr4 = "suni^hmi Ar.Av.946 (s. v.l.), Strato Com.1.3: with variation of quantity, plei=ston ou)=lon i(/ei [i^], i)/oulon i(/ei [i_] Carm.Pop. 1.]:—" + + ";
  release, let go, h(=ka ..po/das kai\\ xei=re fe/resqai Od.12.442; h(=ke fe/resqai let him float" + + "off, Il.21.120; let fall, ka\\d de\\ ka/rhtos h(=ke ko/mas made his locks flow down from his head, Od.<" + + "/author>6.231; [e)qei/ras] i(/ei lo/fon a)mfi/ .... ggg"; + String testStr5 = "plei=ston ou)=lon i(/ei "; + String testStr6 = "*a as< as as: *)a *s ss "; + Transcoder t = Transcoder.getInstance(); + String transcoded = t.transcodeFromBetaCode2Unicode(testStr4); + transcoded = t.transcodeFromBetaCode2Unicode(testStr5); + transcoded = t.transcodeFromBetaCode2Unicode(testStr6); + + String arabTestStr1 = "^nutaf"; + String arabTestStr2 = "min"; + String arabTestStr3 = "Aal-Hiyal (^qAla ^>arisTwTAlys) yataEaj~aba Aal-nAs minhA <im~A fy Aal->a$yA' Aal~aty taEriDu TabEAF fa-mim~A lA yuElamu Eil~atuhu wa-<im~A fy Aal->a$yA' Aal-muxAlifap li-l-TabE fa-mim~A yuEmalu bi-Aal-SinAEap li-manfaEap Aal-nAs li->an~a Aal-TabyEap tulzimu >abadAF jihap wAHidap wa->am~A manAfiE Aal-nAs fa-<in~ahA taxtalifu <ixtilAfAF kavyrAF."; + transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr1); + transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr2); + transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr3); + + // String deletedNestedTags = deleteNestedTags("G", testStr4); + // String regExpr = "(.*?)(.*?)(.*?)(.*?)(.*?)"; + String regExpr = "(.*?)(.*)(){1,}(.*?)"; + // String regExpr = "(.*?)(.*?)(.*?)(.*?)(.*?)"; + String replaceStr = testStr2.replaceAll(regExpr, "$1$2$4"); + // String replaceStr2 = testStr2.replaceAll("(.*)(.*)(.*)(.*)(.*)", "$2$3$4$5"); + regExpr = ".*?(.*?){1,}.*?"; + regExpr = "(.*?)(.*?)(.*?){1,}(.*?)"; + // String regExpr = "[a-zA-Z0-9]+?\\[.+?\\]/" + "|" + "[a-zA-Z0-9]+?/" + "|" + "[a-zA-Z0-9]+?\\[.+\\]$" + "|" + "[a-zA-Z0-9]+?$"; // pathName example: "/archimedes[@xmlns:xlink eq "http://www.w3.org/1999/xlink"]/text/body/chap/p[@type eq "main"]/s/foreign[@lang eq "en"]" + Pattern p = Pattern.compile(regExpr, Pattern.CASE_INSENSITIVE | 
Pattern.MULTILINE); // both flags enabled + Matcher m = p.matcher(testStr2); + while (m.find()) { + int msBeginPos = m.start(); + int msEndPos = m.end(); + String matchStr = testStr2.substring(msBeginPos, msEndPos); + String bla = ""; + } + + String retStr = transcodeByElementName("fromBetacode2Unicode", "G", testStr); + retStr = transcodeByElementName("fromBetacode2Unicode", "G", "bla"); + retStr = transcodeByElementName("fromBetacode2Unicode", "G", ""); + } + + private String transcodeByElementName(String transcodeDirection, String elementName, String inputStr) throws ApplicationException { + if (inputStr == null || elementName == null) + return null; + String elemBeginTag = "<" + elementName + ">"; + String elemEndTag = ""; + Transcoder transcoder = Transcoder.getInstance(); + String outputStr = ""; + int begin = inputStr.indexOf(elemBeginTag); + int end = inputStr.indexOf(elemEndTag); + while (begin != -1 && end != -1 && begin < end) { + String before = inputStr.substring(0, begin); + String origStr = inputStr.substring(begin + elemBeginTag.length(), end); + origStr = StringUtils.deleteSpecialXmlEntities(origStr); + String transcodedStr = origStr; + if (transcodeDirection.equals("fromBetacode2Unicode")) + transcodedStr = transcoder.transcodeFromBetaCode2Unicode(origStr); + else if (transcodeDirection.equals("fromBuckwalter2Unicode")) + transcodedStr = transcoder.transcodeFromBuckwalter2Unicode(origStr); + outputStr = outputStr + before + new String(elemBeginTag); + outputStr = outputStr + transcodedStr; + outputStr = outputStr + new String(elemEndTag); + inputStr = inputStr.substring(end + elemEndTag.length()); + begin = inputStr.indexOf(elemBeginTag); + end = inputStr.indexOf(elemEndTag); + } + outputStr = outputStr + inputStr; + return outputStr; + } + + private String deleteNestedTags(String elementName, String inputStr) { + String inputStrTmp = new String(inputStr); + String elemBeginTag = "<" + elementName + ">"; + String elemEndTag = ""; + String 
outputStr = ""; + int begin = inputStrTmp.indexOf(elemBeginTag); + int end = inputStrTmp.indexOf(elemEndTag); + while (begin != -1 && end != -1) { + end = getIndexClosedTag(begin, elementName, inputStrTmp); + String before = inputStrTmp.substring(0, begin); + String origStr = null; + if (end == -1) // if no end tag could be found + origStr = inputStrTmp.substring(begin + elemBeginTag.length(), inputStrTmp.length()); + else + origStr = inputStrTmp.substring(begin + elemBeginTag.length(), end); + origStr = origStr.replaceAll(elemBeginTag, ""); + origStr = origStr.replaceAll(elemEndTag, ""); + outputStr = outputStr + before + new String(elemBeginTag); + outputStr = outputStr + origStr; + outputStr = outputStr + new String(elemEndTag); + inputStrTmp = inputStrTmp.substring(end + elemEndTag.length()); + begin = inputStrTmp.indexOf(elemBeginTag); + } + outputStr = outputStr + inputStrTmp; + return outputStr; + } + + private int getIndexClosedTag(int begin, String elementName, String inputStr) { + int beginTmp = begin; + int retIndex = -1; + String elemBeginTag = "<" + elementName + ">"; + String elemEndTag = ""; + int indexEndTag = inputStr.indexOf(elemEndTag); + while (indexEndTag != -1) { + String betweenTmpStr = inputStr.substring(beginTmp + elemBeginTag.length(), indexEndTag); + int indexBeginTag = betweenTmpStr.indexOf(elemBeginTag); + if (indexBeginTag != -1) { + beginTmp = indexEndTag; + } else { + return indexEndTag; + } + indexEndTag = inputStr.indexOf(elemEndTag, indexEndTag + elemEndTag.length()); + } + return retIndex; + } + + private HashMap getWholeLexiconHashMap(String lexiconName) throws ApplicationException { + HashMap lexHashMap = new HashMap(); + try { + dbEnvLexica.openDatabase(lexiconName); + Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); + Cursor cursor = lexDB.openCursor(null, null); + DatabaseEntry dbEntryKey = new DatabaseEntry(); + DatabaseEntry dbEntryValue = new DatabaseEntry(); + OperationStatus operationStatus = 
cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + int size = dbEntryKey.getSize(); + if (size > 0) { + byte[] dbEntryKeyBytes = dbEntryKey.getData(); + String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8"); + DatabaseEntry newDbEntryValue = new DatabaseEntry(dbEntryValue.getData()); + lexHashMap.put(dbEntryKeyStr, newDbEntryValue); + } + operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return lexHashMap; + } + + private LexiconEntry xmlParseAndRepair(LexiconEntry lexEntry) throws ApplicationException { + String origLexEntryContent = lexEntry.getContent(); + String lexEntryContent = new String(origLexEntryContent); + lexEntry.setContent(lexEntryContent); + // parse and repair: try to repair it 3 times through parsing + LexiconEntry retLexiconEntry = xmParseAndRepairLocal(lexEntry); + retLexiconEntry = xmParseAndRepairLocal(retLexiconEntry); + retLexiconEntry = xmParseAndRepairLocal(retLexiconEntry); + // if it could not be repaired the original content (which is not XML valid) is delivered + if (! retLexiconEntry.isXmlValid()) + retLexiconEntry.setContent(origLexEntryContent); + return retLexiconEntry; + } + + private LexiconEntry xmParseAndRepairLocal(LexiconEntry lexEntry) throws ApplicationException { + if (! lexEntry.isXmlValid()) { + lexEntry = xmlParse(lexEntry); + } + if (! 
lexEntry.isXmlValid() && lexEntry.getValidationCode() != null && lexEntry.getValidationCode().equals("elementNotClosed")) { + String elementName = lexEntry.getValidationFailElementName(); + String lexiconEntryContent = lexEntry.getContent(); + lexiconEntryContent = lexiconEntryContent.replaceAll("<" + elementName + " .*?>", ""); + lexiconEntryContent = lexiconEntryContent.replaceAll("", ""); + lexEntry.setContent(lexiconEntryContent); + lexEntry.setXmlMadeValid(true); + } + return lexEntry; + } + + private LexiconEntry xmlParse(LexiconEntry lexEntry) throws ApplicationException { + String lexEntryContent = "" + lexEntry.getContent() + ""; + LexEntryContentHandler lexEntryContentHandler = new LexEntryContentHandler(); + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(lexEntryContentHandler); + LexEntryErrorHandler lexEntryErrorHandler = new LexEntryErrorHandler(); + xmlParser.setErrorHandler(lexEntryErrorHandler); + try { + Reader reader = new StringReader(lexEntryContent); + InputSource input = new InputSource(reader); + xmlParser.parse(input); + lexEntry.setXmlValid(true); + } catch (SAXException e) { + // nothing but following + lexEntry.setXmlValid(false); + String exceptionMessage = e.getMessage(); + if (exceptionMessage.matches("The element type .* must be terminated by the matching end-tag .*")) { + int begin = exceptionMessage.indexOf("\""); + if (begin != -1) { + String subStr = exceptionMessage.substring(begin + 1); + int end = subStr.indexOf("\""); + if (end != -1) { + String elementName = exceptionMessage.substring(begin + 1, begin + 1 + end); + lexEntry.setValidationCode("elementNotClosed"); + lexEntry.setValidationFailElementName(elementName); + } + } + } + } catch (IOException e) { + throw new ApplicationException(e); + } + return lexEntry; + } + + private void writeLexiconsToFiles() throws ApplicationException { + BufferedReader in = null; + BufferedOutputStream out = null; + try { + ArrayList lexicons = 
Lexica.getInstance().getLocalLexicons(); + for (int i=0; i lexHashMap = getWholeLexiconHashMap(lexiconName); + Iterator lexDumpIter = lexHashMap.keySet().iterator(); + File outputFile = new File(DATA_FILES_DIR_LEXICA + "/" + lexiconName + ".xml"); + out = new BufferedOutputStream(new FileOutputStream(outputFile)); + write("\n", out); + write("" + lexiconName + "\n", out); + write("" + lexicon.getDescription() + "\n", out); + write("\n", out); + while (lexDumpIter.hasNext()) { + write("\n", out); + String lexKeyStr = lexDumpIter.next(); + write("
  " + lexKeyStr + "
  \n", out); + DatabaseEntry lexValue = lexHashMap.get(lexKeyStr); + byte[] lexValueBytes = lexValue.getData(); + write(lexValueBytes, out); + write("\n", out); + } + write("\n", out); + write("\n", out); + } + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } finally { + // always close the stream + if (in != null) try { in.close(); } catch (Exception e) { } + if (out != null) try { out.close(); } catch (Exception e) { } + } + } + + private void write(byte[] inputBytes, BufferedOutputStream out) throws ApplicationException { + try { + out.write(inputBytes, 0, inputBytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void write(String outStr, BufferedOutputStream out) throws ApplicationException { + try { + byte[] bytes = outStr.getBytes("utf-8"); + out.write(bytes, 0, bytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/DbEnvLex.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/DbEnvLex.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,101 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.dict.db; + +import java.io.File; +import java.util.HashMap; + +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseConfig; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class DbEnvLex { + private String dataDir; + private File envPath; + private Environment env; + private EnvironmentConfig envConfig; + private 
DatabaseConfig dbConfig; + private HashMap
  lexiconDBs = new HashMap(); + + public DbEnvLex() { + } + + public void setDataDir(String dataDir) { + this.dataDir = dataDir; + } + + public void initReadOnly() throws ApplicationException { + try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void initReadWrite() throws ApplicationException { + try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envConfig.setReadOnly(false); + dbConfig.setReadOnly(false); + envConfig.setAllowCreate(true); + dbConfig.setAllowCreate(true); + envConfig.setTransactional(true); + dbConfig.setTransactional(true); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void openDatabase(String lexiconName) throws ApplicationException { + try { + Database lexDB = lexiconDBs.get(lexiconName); + if (lexDB == null) { + Database lexiconDB = env.openDatabase(null, lexiconName + ".db", dbConfig); + lexiconDBs.put(lexiconName, lexiconDB); + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void closeDatabase(String lexiconName) throws ApplicationException { + try { + if (lexiconDBs != null) { + Database lexiconDB = lexiconDBs.get(lexiconName); + if (lexiconDB != null) + lexiconDB.close(); + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public Environment getEnv() { + return env; + } + + public Database getLexiconDB(String lexiconName) { + Database lexiconDB = lexiconDBs.get(lexiconName); + return lexiconDB; + } + + public void close() throws ApplicationException { + if (env != null) { + try { + if (env != null) + env.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + } +} + diff -r dc5e9fcb3fdc 
-r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexEntryContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexEntryContentHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,43 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.dict.db; + +import org.xml.sax.*; + +public class LexEntryContentHandler implements ContentHandler { + + public LexEntryContentHandler() { + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + } + + public void characters(char[] c, int start, int length) throws SAXException { + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(org.xml.sax.Locator arg1) { + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + } + + public void endElement(String uri, String localName, String name) throws SAXException { + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexEntryErrorHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexEntryErrorHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,12 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.dict.db; + +import org.xml.sax.*; + +public class LexEntryErrorHandler implements ErrorHandler { + public void warning(SAXParseException exception) throws SAXException { + } + public void error(SAXParseException 
exception) throws SAXException { + } + public void fatalError(SAXParseException exception) throws SAXException { + } +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,353 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.dict.db; + +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.logging.Logger; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; + +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexica; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache; +import de.mpg.mpiwg.berlin.mpdl.lt.text.transcode.Transcoder; + +public class LexHandler { + private static LexHandler instance; + private static Logger LOGGER = Logger.getLogger(LexHandler.class.getName()); + private static String DATA_DIR = Constants.getInstance().getDataDir(); + private static String DB_DIR_LEXICA = DATA_DIR + "/dataBerkeleyDB/pollux"; + private DbEnvLex dbEnvLexica; + private Date beginOfOperation; + private Date endOfOperation; + + public static LexHandler getInstance() throws ApplicationException { + if (instance == null) { + instance = new 
LexHandler(); + instance.initReadOnly(); + } + return instance; + } + + public void end() throws ApplicationException { + ArrayList lexicons = Lexica.getInstance().getLocalLexicons(); + for (int i=0; i getLemmas(String query, String type, String language, String normalization) throws ApplicationException { + ArrayList lexLemmas = new ArrayList(); + // get lemmas of all forms in query + MorphologyCache morphologyCache = MorphologyCache.getInstance(); + String[] queryForms = query.split(" "); + for (int k=0; k lemmas = null; + if (type.equals("form")) { + if (normalization.equals("norm")) + lemmas = morphologyCache.getLemmasByFormName(language, queryForm, true); + else if (normalization.equals("none")) + lemmas = morphologyCache.getLemmasByFormName(language, queryForm, false); + else + lemmas = morphologyCache.getLemmasByFormName(language, queryForm, true); // TODO reg and reg+norm + } else if (type.equals("lemma")) { + lemmas = new ArrayList(); + Lemma l = null; + if (normalization.equals("norm")) + l = morphologyCache.getLemma(language, queryForm, true); + else if (normalization.equals("none")) + l = morphologyCache.getLemma(language, queryForm, false); + else + l = morphologyCache.getLemma(language, queryForm, true); + if (l != null) + lemmas.add(l); + } + if (lemmas != null && ! 
lemmas.isEmpty()) { + lexLemmas.addAll(lemmas); + } else { + Lemma l = new Lemma("created dynamically cause no lemma is available", language, queryForm); // at least the word form is added for finding it in the lexicon + lexLemmas.add(l); + } + } + Collections.sort(lexLemmas); + if (lexLemmas.isEmpty()) + return null; + else + return lexLemmas; + } + + public ArrayList getLexEntries(ArrayList lexLemmas, String language, String lexiconName) throws ApplicationException { + ArrayList retLexicons = new ArrayList(); + ArrayList lexicons = Lexica.getInstance().getLexicons(language); + if (lexiconName != null) { + lexicons = new ArrayList(); + Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName); + if (lexicon != null) + lexicons.add(lexicon); + } + if (lexicons != null) { + for (int i=0; i getLexEntryKeys(String formName, String language, boolean normalize) throws ApplicationException { + ArrayList lexEntryKeys = new ArrayList(); + MorphologyCache morphologyCache = MorphologyCache.getInstance(); + ArrayList formLemmas = morphologyCache.getLemmasByFormName(language, formName, normalize); + boolean hasLexEntry = false; + hasLexEntry = hasLexEntryKey(formName, language); + if (hasLexEntry) + lexEntryKeys.add(formName); + if (formLemmas != null) { + for (int j=0; j statLexicons = Lexica.getInstance().getLocalLexicons(language); + if (statLexicons != null) { + for (int i=0; i getLexEntriesBeginningWith(String language, String formPrefix, int pageNumber) throws ApplicationException { + int pageSize = 50; + int from = (pageNumber * pageSize) - pageSize + 1; + int to = pageNumber * pageSize; + ArrayList statLexicons = Lexica.getInstance().getLocalLexicons(language); + ArrayList retLexicons = null; + if (statLexicons != null) { + for (int i=0; i lexEntries = readEntriesBeginningWith(lexiconName, formPrefix, from, to); + // TODO merge the entries and remove duplicates + if (lexEntries != null) { + lexicon.addEntries(lexEntries); + if (retLexicons == null) + retLexicons = 
new ArrayList(); + retLexicons.add(lexicon); + } + } + } + return retLexicons; + } + + public ArrayList getLexEntriesByLexiconBeginningWith(String lexiconName, String formPrefix, int pageNumber) throws ApplicationException { + int pageSize = 50; + int from = (pageNumber * pageSize) - pageSize + 1; + int to = pageNumber * pageSize; + Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName).clone(); + ArrayList retLexicons = null; + if (lexicon != null) { + ArrayList lexEntries = readEntriesBeginningWith(lexiconName, formPrefix, from, to); + if (lexEntries != null) { + lexicon.addEntries(lexEntries); + retLexicons = new ArrayList(); + retLexicons.add(lexicon); + } + } + return retLexicons; + } + + private LexiconEntry getEntry(Lexicon lexicon, String formName) throws ApplicationException { + LexiconEntry lexEntry = null; + if (lexicon.isLocalLexicon()) { + lexEntry = readEntry(lexicon.getName(), formName); + String lexiconQueryUrl = lexicon.getQueryUrl(); + if (lexEntry != null && lexicon.getQueryUrl() != null) { + String language = lexicon.getSourceLanguage(); + if (Language.getInstance().isGreek(language)) { + formName = Transcoder.getInstance().transcodeFromUnicode2BetaCode(formName); + } else if (Language.getInstance().isArabic(language)) { + formName = Transcoder.getInstance().transcodeFromUnicode2Buckwalter(formName); + } + lexEntry.setRemoteUrl(lexiconQueryUrl + formName); + } + } else { + lexEntry = lexicon.getDynamicEntry(formName); + } + return lexEntry; + } + + private LexiconEntry readEntry(String lexiconName, String formName) throws ApplicationException { + LexiconEntry retLexEntry = null; + try { + String dbFoundValueStr = null; + String keyStr = formName; + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); + Cursor cursor = lexDB.openCursor(null, null); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = 
cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); + if (operationStatus == OperationStatus.SUCCESS) { + byte[] foundValueBytes = foundValue.getData(); + dbFoundValueStr = new String(foundValueBytes, "utf-8"); + } + cursor.close(); + if (dbFoundValueStr != null) { + retLexEntry = new LexiconEntry(lexiconName, formName, dbFoundValueStr); + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retLexEntry; + } + + private ArrayList readEntriesBeginningWith(String lexiconName, String formPrefix, int from, int to) throws ApplicationException { + ArrayList retLexEntries = new ArrayList();; + try { + String dbFoundValueStr = null; + String keyStr = formPrefix; + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); + Cursor cursor = lexDB.openCursor(null, null); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKeyRange(dbEntryKey, foundValue, LockMode.DEFAULT); + int counter = 1; + while (operationStatus == OperationStatus.SUCCESS && counter <= to) { + if (counter >= from) { + byte[] foundValueBytes = foundValue.getData(); + dbFoundValueStr = new String(foundValueBytes, "utf-8"); + byte[] foundKeyBytes = dbEntryKey.getData(); + String dbFoundKeyStr = new String(foundKeyBytes, "utf-8"); + LexiconEntry lexEntry = new LexiconEntry(lexiconName, dbFoundKeyStr, dbFoundValueStr); + retLexEntries.add(lexEntry); + } + operationStatus = cursor.getNext(dbEntryKey, foundValue, LockMode.DEFAULT); + counter++; + } + cursor.close(); + if (retLexEntries.isEmpty()) { + return null; + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retLexEntries; + } + + public static void main(String[] args) throws ApplicationException 
{ + getInstance(); + instance.beginOperation(); + System.out.print("Start ..."); + instance.readSampleData(); + instance.end(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + System.out.println("Needed time: " + elapsedTime + " seconds"); + } + + private void initReadOnly() throws ApplicationException { + dbEnvLexica = new DbEnvLex(); + dbEnvLexica.setDataDir(DB_DIR_LEXICA); + dbEnvLexica.initReadOnly(); + ArrayList lexicons = Lexica.getInstance().getLocalLexicons(); + for (int i=0; i dbNames = dbEnvLexica.getEnv().getDatabaseNames(); + String l1 = readEntry("autenrieth", "au)to/s").getContent(); // greek: see also bonitz and lsj + String l2 = readEntry("ls", "laudabilis").getContent(); // latin + System.out.println("Autenrieth: autos: " + l1); + System.out.println("Lewis & Short: Laudabilis: " + l2); + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Constants.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Constants.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,36 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +import java.net.URL; +import java.util.Properties; + +import de.mpg.mpiwg.berlin.mpdl.util.Util; + +public class Constants { + public static String DEFAULT_LANGUAGE = "en"; + public static int MORPHOLOGY_CACHE_SIZE = 1000000; + private static Constants instance; + private Properties properties; + + public static Constants getInstance() { + if (instance == null) { + instance = new Constants(); + instance.init(); + } + return instance; + } + + private void init() { + URL url = 
Constants.class.getClassLoader().getResource("constants.properties"); + if (url != null) { + String propertiesFileName = url.toString().substring(5); + properties = (new Util()).getProperties(propertiesFileName); + } + } + + public String getDataDir() { + if (properties != null) + return properties.getProperty("dataDir"); + else + return "no properties file"; + } +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,172 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +import java.util.HashMap; + +/** + * + * Language codes from ISO 639-3 + * + */ +public class Language { + private static Language instance; + private static HashMap languageIds = new HashMap(); + private static HashMap iso639Codes = new HashMap(); + + public static Language getInstance() { + if (instance == null) { + instance = new Language(); + instance.init(); + } + return instance; + } + + private void init() { + languageIds.put("ar", "ar"); + languageIds.put("ara", "ar"); + languageIds.put("de", "de"); + languageIds.put("ger", "de"); + languageIds.put("deu", "de"); + languageIds.put("el", "el"); + languageIds.put("grc", "el"); + languageIds.put("en", "en"); + languageIds.put("eng", "en"); + languageIds.put("fr", "fr"); + languageIds.put("fra", "fr"); + languageIds.put("it", "it"); + languageIds.put("ita", "it"); + languageIds.put("la", "la"); + languageIds.put("lat", "la"); + languageIds.put("nl", "nl"); + languageIds.put("nld", "nl"); + languageIds.put("zh", "zh"); + languageIds.put("zho", "zh"); + languageIds.put("zho-Hant", "zh"); + + iso639Codes.put("ar", "ara"); + iso639Codes.put("ara", "ara"); + iso639Codes.put("de", "ger"); + iso639Codes.put("ger", "ger"); + iso639Codes.put("deu", "ger"); + iso639Codes.put("el", "grc"); + 
iso639Codes.put("grc", "grc"); + iso639Codes.put("en", "eng"); + iso639Codes.put("eng", "eng"); + iso639Codes.put("fr", "fra"); + iso639Codes.put("fra", "fra"); + iso639Codes.put("it", "ita"); + iso639Codes.put("ita", "ita"); + iso639Codes.put("la", "lat"); + iso639Codes.put("lat", "lat"); + iso639Codes.put("nl", "nld"); + iso639Codes.put("nld", "nld"); + iso639Codes.put("zh", "zho"); + iso639Codes.put("zho", "zho"); + iso639Codes.put("zho-Hant", "zho"); + } + + public String getISO639Code(String language) { + if (language == null) + return null; + String retISO639Code = null; + retISO639Code = iso639Codes.get(language); + return retISO639Code; + } + + public String getLanguageId(String language) { + if (language == null) + return null; + String retLanguageId = null; + retLanguageId = languageIds.get(language); + return retLanguageId; + } + + public boolean isLatin(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("la")) + return true; + else + return false; + } + + public boolean isGerman(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("de")) + return true; + else + return false; + } + + public boolean isFrench(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("fr")) + return true; + else + return false; + } + + public boolean isEnglish(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("en")) + return true; + else + return false; + } + + public boolean isDutch(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("nl")) + return true; + else + return false; + } + + public boolean isGreek(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("el")) + return 
true; + else + return false; + } + + public boolean isArabic(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("ar")) + return true; + else + return false; + } + + public boolean isItalian(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("it")) + return true; + else + return false; + } + + public boolean isChinese(String language) { + String langId = getLanguageId(language); + if (langId == null) + return false; + if (langId.equals("zh")) + return true; + else + return false; + } +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/Form.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/Form.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,337 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; + +public class Form implements Comparable
  { + private String provider; + private String language; + private String formName; + private String lemmaName; + private String pos; + private String tense; + private String voice; + private String casus; + private String number; + private String mood; + private String person; + private String gender; + private String definite; + + public Form() { + } + + public Form(String provider, String language, String formName) { + this.provider = provider; + this.language = language; + this.formName = formName; + } + + public int compareTo(Form f) { + return formName.compareTo(f.formName); + } + + public void normalize() { + // lower case of form and lemma + formName = formName.toLowerCase(); + lemmaName = lemmaName.toLowerCase(); + // XML: special symbols + formName = formName.replaceAll("&", "&"); + formName = formName.replaceAll("'", "'"); + formName = formName.replaceAll("<", "<"); + formName = formName.replaceAll(">", ">"); + formName = formName.replaceAll("\"", """); + lemmaName = lemmaName.replaceAll("&", "&"); + lemmaName = lemmaName.replaceAll("'", "'"); + lemmaName = lemmaName.replaceAll("<", "<"); + lemmaName = lemmaName.replaceAll(">", ">"); + lemmaName = lemmaName.replaceAll("\"", """); + // unification of lemma names (homographs) TODO do not unificate the homographs + lemmaName = lemmaName.replaceAll("#[0-9]", ""); + if (isArabic()) { + if (lemmaName != null) { + int length = lemmaName.length(); + char lastChar = lemmaName.charAt(length - 1); + boolean isDigit = Character.isDigit(lastChar); + if (isDigit) + lemmaName = lemmaName.substring(0, length - 1); + } + } + // unification of forms and lemmas with hyphens: remove the hyphen + formName = formName.replaceAll("-", ""); + lemmaName = lemmaName.replaceAll("-", ""); + // unification of forms and lemmas with blanks (sequence of words): remove the blanks + formName = formName.replaceAll(" ", ""); + lemmaName = lemmaName.replaceAll(" ", ""); + // unification of forms and lemmas with plus symbols: remove the plus 
symbol + formName = formName.replaceAll("\\+", ""); + lemmaName = lemmaName.replaceAll("\\+", ""); + // TODO call MpdlMorphDataNormalizer (handle Umlauts in german, accents in french, character classes (longs, s, ...) ...) + + } + + public boolean isOk() { + boolean ret = true; + if (formName == null || lemmaName == null) + ret = false; + else if (formName.length() == 0 || lemmaName.length() == 0 || formName.length() == 1 || lemmaName.length() == 1) + ret = false; + return ret; + } + + public boolean isGreek() { + boolean ret = false; + if (language != null && language.equals("el")) + ret = true; + return ret; + } + + public boolean isArabic() { + boolean ret = false; + if (language != null && language.equals("ar")) + ret = true; + return ret; + } + + public boolean isRicherThan(Form otherForm) { + boolean richer = false; + if (! isOk()) + return false; + else if (! otherForm.isOk()) + return true; + String otherFormPos = otherForm.getPos(); + if (pos != null && pos.length() > 0 && (otherFormPos == null || otherFormPos.length() == 0)) + return true; + // TODO all other cases + return richer; + } + + public String getXmlString() { + String xmlString = "\n"; + if (provider != null) + xmlString += " " + provider + "\n"; + if (language != null) + xmlString += " " + language + "\n"; + if (formName != null) + xmlString += " " + formName + "\n"; + if (lemmaName != null) + xmlString += " " + lemmaName + "\n"; + if (pos != null) + xmlString += " " + pos + "\n"; + if (tense != null) + xmlString += " " + tense + "\n"; + if (voice != null) + xmlString += " " + voice + "\n"; + if (casus != null) + xmlString += " " + casus + "\n"; + if (number != null) + xmlString += " " + number + "\n"; + if (mood != null) + xmlString += " " + mood + "\n"; + if (person != null) + xmlString += " " + person + "\n"; + if (gender != null) + xmlString += " " + gender + "\n"; + if (definite != null) + xmlString += " " + definite + "\n"; + xmlString += "
  \n"; + return xmlString; + } + + public String toString() { + return getXmlString(); + } + + public String getTense() { + return tense; + } + + public void setTense(String tense) { + this.tense = tense; + } + + public void addTense(String newTense) { + if (tense == null) + this.tense = newTense; + else + tense += newTense; + } + + public String getVoice() { + return voice; + } + + public void setVoice(String voice) { + this.voice = voice; + } + + public void addVoice(String newVoice) { + if (voice == null) + this.voice = newVoice; + else + voice += newVoice; + } + + public String getCasus() { + return casus; + } + + public void setCasus(String casus) { + this.casus = casus; + } + + public void addCasus(String newCasus) { + if (casus == null) + this.casus = newCasus; + else + casus += newCasus; + } + + public String getNumber() { + return number; + } + + public void setNumber(String number) { + this.number = number; + } + + public void addNumber(String newNumber) { + if (number == null) + this.number = newNumber; + else + number += newNumber; + } + + public String getMood() { + return mood; + } + + public void setMood(String mood) { + this.mood = mood; + } + + public void addMood(String newMood) { + if (mood == null) + this.mood = newMood; + else + mood += newMood; + } + + public String getPerson() { + return person; + } + + public void setPerson(String person) { + this.person = person; + } + + public void addPerson(String newPerson) { + if (person == null) + this.person = newPerson; + else + person += newPerson; + } + + public String getGender() { + return gender; + } + + public void setGender(String gender) { + this.gender = gender; + } + + public void addGender(String newGender) { + if (gender == null) + this.gender = newGender; + else + gender += newGender; + } + + public String getDefinite() { + return definite; + } + + public void setDefinite(String definite) { + this.definite = definite; + } + + public void addDefinite(String newDefinite) { + if (definite 
== null) + this.definite = newDefinite; + else + definite += newDefinite; + } + + public String getLemmaName() { + return lemmaName; + } + + public String getPos() { + return pos; + } + + public String getProvider() { + return provider; + } + + public void setProvider(String provider) { + this.provider = provider; + } + + public void addProvider(String newProvider) { + if (provider == null) + this.provider = newProvider; + else + provider += newProvider; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public void addLanguage(String newLanguage) { + if (language == null) + this.language = newLanguage; + else + language += newLanguage; + } + + public String getFormName() { + return formName; + } + + public void setFormName(String formName) { + this.formName = formName; + } + + public void addFormName(String newFormName) { + if (formName == null) + this.formName = newFormName; + else + formName += newFormName; + } + + public void setLemmaName(String lemmaName) { + this.lemmaName = lemmaName; + } + + public void addLemmaName(String newLemmaName) { + if (lemmaName == null) + this.lemmaName = newLemmaName; + else + lemmaName += newLemmaName; + } + + public void setPos(String pos) { + this.pos = pos; + } + + public void addPos(String newPos) { + if (pos == null) + this.pos = newPos; + else + pos += newPos; + } + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/Lemma.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/Lemma.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,152 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; + +import java.util.ArrayList; +import java.util.Enumeration; +import java.util.Hashtable; + + +public class Lemma implements Comparable { + private String provider; + private String language; + private 
String lemmaName; + private Hashtable forms; + + public Lemma() { + } + + public Lemma(String provider, String language, String lemmaName) { + this.provider = provider; + this.language = language; + this.lemmaName = lemmaName; + this.forms = new Hashtable(); + // always contains the form with the same lemma name + Form form = new Form(provider, language, lemmaName); + addForm(form); + } + + public String getProvider() { + return provider; + } + + public void setProvider(String provider) { + this.provider = provider; + } + + public void addProvider(String newProvider) { + if (provider == null) + this.provider = newProvider; + else + provider += newProvider; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public void addLanguage(String newLanguage) { + if (language == null) + this.language = newLanguage; + else + language += newLanguage; + } + + public String getLemmaName() { + return lemmaName; + } + + public void setLemmaName(String lemmaName) { + this.lemmaName = lemmaName; + } + + public void addLemmaName(String newLemmaName) { + if (lemmaName == null) + this.lemmaName = newLemmaName; + else + lemmaName += newLemmaName; + } + + public Hashtable getForms() { + return forms; + } + + public ArrayList
  getForms(String provider) { + ArrayList result = new ArrayList(); + Enumeration keys = forms.keys(); + while (keys.hasMoreElements()) { + String key = keys.nextElement(); + Form form = forms.get(key); + String prov = form.getProvider(); + if (prov.equals(provider)) + result.add(form); + } + return result; + } + + public ArrayList getFormsList() { + ArrayList result = new ArrayList(); + if(forms != null) { + Enumeration keys = forms.keys(); + while (keys.hasMoreElements()) { + String key = keys.nextElement(); + Form form = forms.get(key); + result.add(form); + } + } + return result; + } + + public void setForms(ArrayList forms) { + for (int i=0; i(); + Form f = forms.get(formKey); + if (f == null) { + forms.put(formKey, newForm); + } else { + if(newForm.isRicherThan(f)) + forms.put(formKey, newForm); + } + } + + public Form getForm(String formKey) { + return forms.get(formKey); + } + + public String getXmlString() { + String xmlString = "\n"; + xmlString += " " + provider + "\n"; + xmlString += " " + language + "\n"; + xmlString += " " + lemmaName + "\n"; + xmlString += ""; + return xmlString; + } + + public String toString() { + return getXmlString(); + } + + public int compareTo(Lemma l) { + if (l.getLemmaName() == null && this.getLemmaName() == null) { + return 0; + } + if (this.getLemmaName() == null) { + return 1; + } + if (l.getLemmaName() == null) { + return -1; + } + return this.getLemmaName().compareTo(l.getLemmaName()); + } + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphFileReaderContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphFileReaderContentHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,127 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; + +import java.util.Hashtable; + +import org.xml.sax.*; + +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import 
de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; + +public class MorphFileReaderContentHandler implements ContentHandler { + private Hashtable forms; + private Hashtable lemmas; + private Element currentElement; + private Form currentForm; + + public MorphFileReaderContentHandler(Hashtable forms, Hashtable lemmas) { + this.forms = forms; + this.lemmas = lemmas; + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + } + + public void characters(char[] c, int start, int length) throws SAXException { + if (currentElement != null) { + String elemName = currentElement.name; + if (currentForm != null) { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (charactersStr != null && ! (charactersStr.trim().equals(""))) { + if (elemName.equals("provider")) { + currentForm.setProvider(charactersStr); + } else if (elemName.equals("language")) { + currentForm.setLanguage(charactersStr); + } else if (elemName.equals("form-name")) { + currentForm.setFormName(charactersStr); + } else if (elemName.equals("lemma-name")) { + currentForm.setLemmaName(charactersStr); + } else if (elemName.equals("pos")) { + currentForm.setPos(charactersStr); + } else if (elemName.equals("tense")) { + currentForm.setTense(charactersStr); + } else if (elemName.equals("voice")) { + currentForm.setVoice(charactersStr); + } else if (elemName.equals("casus")) { + currentForm.setCasus(charactersStr); + } else if (elemName.equals("number")) { + currentForm.setNumber(charactersStr); + } else if (elemName.equals("mood")) { + currentForm.setMood(charactersStr); + } else if (elemName.equals("person")) { + currentForm.setPerson(charactersStr); + } else if (elemName.equals("gender")) { + currentForm.setGender(charactersStr); + } else if (elemName.equals("definite")) { + currentForm.setDefinite(charactersStr); + } + } + } + } + } + + public void ignorableWhitespace(char[] c, int 
start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(org.xml.sax.Locator arg1) { + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void endElement(String uri, String localName, String name) throws SAXException { + currentElement = null; + if (name.equals("form")) { + String provider = currentForm.getProvider(); + String language = currentForm.getLanguage(); + String formName = currentForm.getFormName(); + String lemmaName = currentForm.getLemmaName(); + String formKey = language + "###" + formName; + forms.put(formKey, currentForm); + String lemmaKey = language + "###" + lemmaName; + Lemma lemma = lemmas.get(lemmaKey); + if(lemma == null) { + Lemma l = new Lemma(provider, language, lemmaName); + l.addForm(currentForm); + lemmas.put(lemmaKey, l); + } else { + lemma.addForm(currentForm); + } + currentForm = null; + } + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + currentElement = new Element(name); + if (name.equals("form")) { + currentForm = new Form(); + } + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + + private class Element { + String name; + String value; + + Element(String name) { + this.name = name; + } + + Element(String name, String value) { + this.name = name; + this.value = value; + } + } +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,295 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; + +import java.util.ArrayList; +import 
java.util.Collections; +import java.util.Date; +import java.util.Enumeration; +import java.util.Hashtable; + +import java.util.logging.Logger; + +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.db.DBMorphHandler; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; +import de.mpg.mpiwg.berlin.mpdl.lucene.util.LuceneUtil; +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class MorphologyCache { + private static MorphologyCache instance; + private static Logger LOGGER = Logger.getLogger(MorphologyCache.class.getName()); + private static String DATA_DIR = Constants.getInstance().getDataDir(); + private static String DB_DIR_DONATUS = DATA_DIR + "/dataBerkeleyDB/donatus"; + public static int QUERY_MODE = 0; + public static int DOCUMENT_MODE = 1; + private static int MAX_HASHTABLE_SIZE = Constants.MORPHOLOGY_CACHE_SIZE; + protected int mode = QUERY_MODE; + private Hashtable> forms = new Hashtable>(); // cache of forms: hashKey is formName + private Hashtable lemmas = new Hashtable(); // cache of lemmas: hashKey is lemmaName + private DBMorphHandler dbMorphHandlerStatic; // handles static morph data (BerkeleyDB) + private Date beginOfOperation; + private Date endOfOperation; + + public static MorphologyCache getInstance() throws ApplicationException { + if (instance == null) { + instance = new MorphologyCache(); + instance.init(); + } + return instance; + } + + private void init() throws ApplicationException { + instance.beginOperation(); + dbMorphHandlerStatic = new DBMorphHandler(DB_DIR_DONATUS); + dbMorphHandlerStatic.start(); + dbMorphHandlerStatic.openDatabases(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, 
instance.endOfOperation); + LOGGER.info("Morphology db cache: opened (needed " + elapsedTime + " seconds)"); + } + + public int getMode() { + return mode; + } + + public void setMode(int newMode) { + this.mode = newMode; + } + + public void end() throws ApplicationException { + dbMorphHandlerStatic.closeDatabases(); + LOGGER.info("Morphology db cache: closed"); + } + + public ArrayList getLemmasByFormName(String lang, String formNameArg, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList retFormLemmas = null; + String formName = formNameArg; + if (normalize) { + Normalizer normalizer = new Normalizer(language); + formName = normalizer.normalize(formNameArg); + } + // first look in local cache + String key = language + "###" + formName; + Hashtable formLemmasHashtable = forms.get(key); + if (formLemmasHashtable == null) { + ArrayList dbFormLemmas = readLemmasByFormName(language, formName); + // put lemmas into local cache + int localHashTableSize = forms.size(); + if (localHashTableSize >= MAX_HASHTABLE_SIZE) { + clearCache(); + } + if (dbFormLemmas != null && ! 
dbFormLemmas.isEmpty()) { + formLemmasHashtable = new Hashtable(); + for (int i=0; i lemmaForms = readFormsByLemmaName(language, lemmaName); + lemma.setForms(lemmaForms); + lemmas.put(lemmaKey, lemma); + } else { + lemma = localLemma; + } + formLemmasHashtable.put(lemmaKey, lemma); + } + forms.put(key, formLemmasHashtable); + } + } + retFormLemmas = new ArrayList(); + if (formLemmasHashtable != null) { + Enumeration formLemmasKeys = formLemmasHashtable.keys(); + while(formLemmasKeys.hasMoreElements()) { + String lemmaKey = formLemmasKeys.nextElement(); + Lemma l = formLemmasHashtable.get(lemmaKey); + retFormLemmas.add(l); + } + } + Collections.sort(retFormLemmas); + return retFormLemmas; + } + + public Lemma getLemma(String lang, String lemmaNameArg, boolean normalize) throws ApplicationException { // returns the cached or database-loaded lemma; null if the database holds no forms for it + String language = Language.getInstance().getLanguageId(lang); + String lemmaName = lemmaNameArg; + if (normalize) { + Normalizer normalizer = new Normalizer(language); + lemmaName = normalizer.normalize(lemmaNameArg); + } + // first look in local cache + String key = language + "###" + lemmaName; + Lemma lemma = lemmas.get(key); + if (lemma == null) { + ArrayList dbLemmaForms = readFormsByLemmaName(language, lemmaName); + if (dbLemmaForms != null && dbLemmaForms.size() > 0) { + lemma = new Lemma(); + lemma.setLemmaName(lemmaName); + lemma.setLanguage(language); + lemma.setProvider(dbLemmaForms.get(0).getProvider()); + lemma.setForms(dbLemmaForms); + lemmas.put(key, lemma); // store under the same language-qualified key used for the lookup above, otherwise the entry is never found again + } + } + return lemma; + } + + public ArrayList getFormsByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList result = new ArrayList(); + luceneQueryString = luceneQueryString.toLowerCase(); + ArrayList formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); + if (!
(formsFromQuery == null || formsFromQuery.isEmpty())) { + for (int i=0; i formLemmas = null; + // lemma mode: if formName starts with "lemmalemma" then the lemma itself is fetched + if (formStr.startsWith("lemmalemma")) { + formLemmas = new ArrayList(); + String lemmaName = formStr.substring(10); + Lemma lemma = getLemma(language, lemmaName, false); + if (lemma != null) formLemmas.add(lemma); // getLemma returns null for an unknown lemma; never collect null entries + } else { + formLemmas = getLemmasByFormName(language, formStr, false); + } + if (formLemmas != null && ! formLemmas.isEmpty()) { + for (int j=0; j lemmaForms = l.getFormsList(); + result.addAll(lemmaForms); + } + } + } + } + return result; + } + + public ArrayList getLemmasByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + Hashtable lemmas = new Hashtable(); + luceneQueryString = luceneQueryString.toLowerCase(); + ArrayList formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); + if (! (formsFromQuery == null || formsFromQuery.isEmpty())) { + for (int i=0; i formLemmas = null; + // lemma mode: if formName starts with "lemmalemma" then the lemma itself is fetched + if (formStr.startsWith("lemmalemma")) { + formLemmas = new ArrayList(); + String lemmaName = formStr.substring(10); + Lemma lemma = getLemma(language, lemmaName, false); + if (lemma != null) formLemmas.add(lemma); // getLemma returns null for an unknown lemma; never collect null entries + } else { + formLemmas = getLemmasByFormName(language, formStr, false); + } + if (formLemmas != null) { + for (int j=0; j result = new ArrayList(); + if (lemmas != null) { + Enumeration formLemmasKeys = lemmas.keys(); + while(formLemmasKeys.hasMoreElements()) { + String lemmaKey = formLemmasKeys.nextElement(); + Lemma l = lemmas.get(lemmaKey); + result.add(l); + } + } + Collections.sort(result); + if (result.isEmpty()) + return null; + else + return result; + } + + public ArrayList getIndexKeysByLemmaNames(String lang, ArrayList lemmaNames) throws ApplicationException { + String language =
Language.getInstance().getLanguageId(lang); + Hashtable indexKeys = new Hashtable(); + for (int j=0; j lemmaForms = lemma.getFormsList(); + for (int k=0; k fLemmas = getLemmasByFormName(language, form.getFormName(), false); + if (fLemmas != null) { + String indexKey = ""; + if (fLemmas.size() == 1) { + indexKey = fLemmas.get(0).getLemmaName(); + } else { + for (int l=0; l result = new ArrayList(); + if (indexKeys != null) { + Enumeration indexKeysKeys = indexKeys.keys(); + while(indexKeysKeys.hasMoreElements()) { + String indexKey = indexKeysKeys.nextElement(); + result.add(indexKey); + } + } + Collections.sort(result); + if (result.isEmpty()) + return null; + else + return result; + } + + private void clearCache() { + forms = null; + lemmas = null; + forms = new Hashtable>(); + lemmas = new Hashtable(); + } + + private ArrayList readLemmasByFormName(String lang, String formName) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList lemmasStatic = dbMorphHandlerStatic.readLemmas(language, formName); + return lemmasStatic; + } + + private ArrayList readFormsByLemmaName(String lang, String lemmaName) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList formsStatic = dbMorphHandlerStatic.readForms(language, lemmaName); + return formsStatic; + } + + private ArrayList getVariantsFromLuceneQuery(String queryString) { + LuceneUtil luceneUtil = LuceneUtil.getInstance(); + ArrayList variants = luceneUtil.getVariantsFromLuceneQuery(queryString); + return variants; + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/SimpleMorphContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/SimpleMorphContentHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,119 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; + +import org.xml.sax.*; + + +public class SimpleMorphContentHandler implements ContentHandler { + private Element currentElement; + private Lemma lemma; + private Form form; + + public SimpleMorphContentHandler() { + } + + public Form getForm() { + return form; + } + + public Lemma getLemma() { + return lemma; + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + } + + public void characters(char[] c, int start, int length) throws SAXException { + if (currentElement != null) { + String elemName = currentElement.name; + if (form != null) { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (elemName.equals("provider")) + form.setProvider(charactersStr); + else if (elemName.equals("language")) + form.setLanguage(charactersStr); + else if (elemName.equals("form-name")) + form.setFormName(charactersStr); + else if (elemName.equals("lemma-name")) + form.setLemmaName(charactersStr); + else if (elemName.equals("pos")) + form.setPos(charactersStr); + else if (elemName.equals("tense")) + form.setTense(charactersStr); + else if (elemName.equals("voice")) + form.setVoice(charactersStr); + else if (elemName.equals("casus")) + form.setCasus(charactersStr); + else if (elemName.equals("number")) + form.setNumber(charactersStr); + else if (elemName.equals("mood")) + form.setMood(charactersStr); + else if (elemName.equals("person")) + form.setPerson(charactersStr); + else if (elemName.equals("gender")) + form.setGender(charactersStr); + else if (elemName.equals("definite")) + form.setDefinite(charactersStr); + } else if (lemma != null) { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String 
charactersStr = String.valueOf(cCopy); + if (elemName.equals("provider")) + lemma.setProvider(charactersStr); + else if (elemName.equals("language")) + lemma.setLanguage(charactersStr); + else if (elemName.equals("lemma-name")) + lemma.setLemmaName(charactersStr); + } + } + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(org.xml.sax.Locator arg1) { + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void endElement(String uri, String localName, String name) throws SAXException { + currentElement = null; + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + currentElement = new Element(name); + if (name.equals("form")) { + form = new Form(); + } else if (name.equals("lemma")) { + lemma = new Lemma(); + } + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + + private class Element { + String name; + String value; + + Element(String name) { + this.name = name; + } + + Element(String name, String value) { + this.name = name; + this.value = value; + } + } +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,242 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Hashtable; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; 
+import org.xml.sax.XMLReader; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.SimpleMorphContentHandler; + +public class DBMorphHandler { + private String dbDirectory; + private DbEnvMorph morphDbEnv; + + public DBMorphHandler(String dbDir) { + this.dbDirectory = dbDir; + } + + public void start() throws ApplicationException { + morphDbEnv = new DbEnvMorph(); + morphDbEnv.setDataDir(dbDirectory); + morphDbEnv.init(); // open databases in read/write mode + } + + public void openDatabases() throws ApplicationException { + morphDbEnv.openDatabases(); + } + + public void closeDatabases() throws ApplicationException { + morphDbEnv.close(); + } + + public void deleteMorphData() throws ApplicationException { + morphDbEnv.removeDatabases(); + } + + public long getSize() throws ApplicationException { + long size = 0; + try { + Database formDB = morphDbEnv.getFormDB(); + size = formDB.count(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + return size; + } + + + public void writeFormLemma(Form form, Lemma lemma) throws ApplicationException { + try { + String lang = Language.getInstance().getLanguageId(form.getLanguage()); + String keyStr = lang + "###" + form.getFormName(); + String valueStr = lemma.getXmlString(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + DatabaseEntry dbEntryValue = new DatabaseEntry(valueStr.getBytes("utf-8")); + Database formDB = morphDbEnv.getFormDB(); + formDB.put(null, dbEntryKey, 
dbEntryValue); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + public void writeLemmaForm(Lemma lemma, Form form) throws ApplicationException { + try { + String lang = Language.getInstance().getLanguageId(lemma.getLanguage()); + String keyStr = lang + "###" + lemma.getLemmaName(); + String valueStr = form.getXmlString(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + DatabaseEntry dbEntryValue = new DatabaseEntry(valueStr.getBytes("utf-8")); + Database lemmaDB = morphDbEnv.getLemmaDB(); + lemmaDB.put(null, dbEntryKey, dbEntryValue); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + public void deleteLemma(Lemma lemma) throws ApplicationException { + try { + String lang = Language.getInstance().getLanguageId(lemma.getLanguage()); + String keyStr = lang + "###" + lemma.getLemmaName(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + Database lemmaDB = morphDbEnv.getLemmaDB(); + lemmaDB.delete(null, dbEntryKey); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + public void deleteForm(Form form) throws ApplicationException { + try { + String lang = Language.getInstance().getLanguageId(form.getLanguage()); + String keyStr = lang + "###" + form.getFormName(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + Database formDB = morphDbEnv.getFormDB(); + formDB.delete(null, dbEntryKey); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + public ArrayList readForms(String language, String lemmaName) throws ApplicationException { + ArrayList 
retForms = new ArrayList(); + String lang = Language.getInstance().getLanguageId(language); + String hashKey = lang + "###" + lemmaName; + try { + Database lemmaDB = morphDbEnv.getLemmaDB(); + Cursor cursor = lemmaDB.openCursor(null, null); + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundFormValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundFormValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + byte[] foundFormValueBytes = foundFormValue.getData(); + String foundFormValueStr = new String(foundFormValueBytes, "utf-8"); + Form f = parseXmlFormString(foundFormValueStr); + retForms.add(f); + operationStatus = cursor.getNextDup(dbEntryKey, foundFormValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retForms; + } + + // TODO diese Methode wird nicht verwendet bis jetzt + public Hashtable readForms() throws ApplicationException { + Hashtable retForms = new Hashtable(); + try { + Database lemmaDB = morphDbEnv.getLemmaDB(); + Cursor cursor = lemmaDB.openCursor(null, null); + DatabaseEntry dbEntryKey = new DatabaseEntry(); + DatabaseEntry foundFormValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getFirst(dbEntryKey, foundFormValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + byte[] foundFormValueBytes = foundFormValue.getData(); + String foundFormValueStr = new String(foundFormValueBytes, "utf-8"); + Form f = parseXmlFormString(foundFormValueStr); + String formHashKey = f.getLanguage() + "###" + f.getFormName(); + retForms.put(formHashKey, f); + operationStatus = cursor.getNext(dbEntryKey, foundFormValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new 
ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retForms; + } + + public ArrayList readLemmas(String language, String formName) throws ApplicationException { + ArrayList retForms = new ArrayList(); + String lang = Language.getInstance().getLanguageId(language); + String hashKey = lang + "###" + formName; + try { + Database formDB = morphDbEnv.getFormDB(); + Cursor cursor = formDB.openCursor(null, null); + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundLemmaValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundLemmaValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + byte[] foundLemmaValueBytes = foundLemmaValue.getData(); + String foundLemmaValueStr = new String(foundLemmaValueBytes, "utf-8"); + Lemma l = parseXmlLemmaString(foundLemmaValueStr); + retForms.add(l); + operationStatus = cursor.getNextDup(dbEntryKey, foundLemmaValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retForms; + } + + private Form parseXmlFormString(String xmlString) throws ApplicationException { + Form form = null; + try { + XMLReader xmlParser = new SAXParser(); + SimpleMorphContentHandler morphContentHandler = new SimpleMorphContentHandler(); + xmlParser.setContentHandler(morphContentHandler); + Reader reader = new StringReader(xmlString); + InputSource input = new InputSource(reader); + xmlParser.parse(input); + form = morphContentHandler.getForm(); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return form; + } + + private Lemma parseXmlLemmaString(String xmlString) throws ApplicationException { + Lemma lemma = null; 
+ try { + XMLReader xmlParser = new SAXParser(); + SimpleMorphContentHandler morphContentHandler = new SimpleMorphContentHandler(); + xmlParser.setContentHandler(morphContentHandler); + Reader reader = new StringReader(xmlString); + InputSource input = new InputSource(reader); + xmlParser.parse(input); + lemma = morphContentHandler.getLemma(); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return lemma; + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphSupWriter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphSupWriter.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,265 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.Date; +import java.util.HashMap; +import java.util.Iterator; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; +import com.sleepycat.je.util.DbLoad; + +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; +import de.mpg.mpiwg.berlin.mpdl.lt.text.transcode.Transcoder; + +public class DBMorphSupWriter { + private static DBMorphSupWriter instance; + private static String DATA_DIR = Constants.getInstance().getDataDir(); + private static String DATA_FILES_DIR_DONATUS_ADD_SUP = DATA_DIR + 
"/dataFiles/donatusAdditionalSup"; + private static String DB_DIR_DONATUS_ADD_SUP = DATA_DIR + "/dataFiles/donatusAdditionalSup/db"; + private static String[] DONATUS_SUP_DUMPS = {"cache-la", "cache-el", "cache-it"}; + private DbEnvMorphSup dbEnvMorphSup; + private Date beginOfOperation; + private Date endOfOperation; + + public static DBMorphSupWriter getInstance() throws ApplicationException { + if (instance == null) { + instance = new DBMorphSupWriter(); + } + return instance; + } + + public static void main(String[] args) throws ApplicationException { + getInstance(); + instance.beginOperation(); + System.out.print("Start ..."); + instance.initReadWrite(); + // instance.loadDonatusSupDbDumpsToDb(); + instance.printSizeOfAllMorphSupDBs(); + // instance.writeDonatusSupsToFiles(); + instance.end(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + System.out.println("Needed time: " + elapsedTime + " seconds"); + } + + private void initReadWrite() throws ApplicationException { + dbEnvMorphSup = new DbEnvMorphSup(); + dbEnvMorphSup.setDataDir(DB_DIR_DONATUS_ADD_SUP); + dbEnvMorphSup.initReadWrite(); + } + + private void loadDonatusSupDbDumpsToDb() throws ApplicationException { + for (int i=0; i getWholeMorphHashMap(String donatusSupName) throws ApplicationException { + HashMap morphHashMap = new HashMap(); + try { + dbEnvMorphSup.openDatabase(donatusSupName + "Dump"); + Database morphDB = dbEnvMorphSup.getMorphSupDB(donatusSupName + "Dump"); + Cursor cursor = morphDB.openCursor(null, null); + DatabaseEntry dbEntryKey = new DatabaseEntry(); + DatabaseEntry dbEntryValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + int size = dbEntryKey.getSize(); + if (size > 0) { + byte[] dbEntryKeyBytes = dbEntryKey.getData(); 
+ String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8"); + DatabaseEntry newDbEntryValue = new DatabaseEntry(dbEntryValue.getData()); + morphHashMap.put(dbEntryKeyStr, newDbEntryValue); + } + operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return morphHashMap; + } + + private void writeDonatusSupsToFiles() throws ApplicationException { + BufferedReader in = null; + BufferedOutputStream out = null; + try { + for (int i=0; i morphHashMap = getWholeMorphHashMap(donatusSupName); + Iterator morphDumpIter = morphHashMap.keySet().iterator(); + File outputFile = new File(DATA_FILES_DIR_DONATUS_ADD_SUP + "/donatus-sup-" + donatusSupName + ".xml"); + out = new BufferedOutputStream(new FileOutputStream(outputFile)); + write("\n", out); + while (morphDumpIter.hasNext()) { + write("\n", out); + write("" + "donatus-sup" + "\n", out); + String language = "unknown"; + if (donatusSupName.startsWith("cache-")) + language = donatusSupName.substring(6); + write("" + language + "\n", out); + String morphKeyStr = morphDumpIter.next(); + String formStr = morphKeyStr; + if (language.equals("el")) + formStr = transcodeFromBetaCode2Unicode(formStr); + formStr = formStr.toLowerCase(); + write("" + formStr + "\n", out); + DatabaseEntry morphValue = morphHashMap.get(morphKeyStr); + byte[] morphValueBytes = morphValue.getData(); + String wholeLemmaStr = new String(morphValueBytes, "utf-8"); + // only first lemma is recognized TODO recognize all lemmas for the form + char splitSymbol = '\u0009'; + int firstIndexOfSplitSymbol = wholeLemmaStr.indexOf(splitSymbol); + String lemmaForm = wholeLemmaStr; + if (firstIndexOfSplitSymbol != -1) + lemmaForm = wholeLemmaStr.substring(0, firstIndexOfSplitSymbol); + else + lemmaForm = lemmaForm + "XXXXXX"; + char splitSymbol2 = '\u000B'; + int 
firstIndexOfSplitSymbol2 = lemmaForm.indexOf(splitSymbol2); + if (firstIndexOfSplitSymbol2 != -1) + lemmaForm = lemmaForm.substring(0, firstIndexOfSplitSymbol2); + if (language.equals("el")) + lemmaForm = transcodeFromBetaCode2Unicode(lemmaForm); + lemmaForm = lemmaForm.replaceAll("#\\d", ""); + lemmaForm = lemmaForm.toLowerCase(); + write("" + lemmaForm + "\n", out); + write("\n", out); + } + write("\n", out); + } + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } finally { + // always close the stream + if (in != null) try { in.close(); } catch (Exception e) { } + if (out != null) try { out.close(); } catch (Exception e) { } + } + } + + private void write(byte[] inputBytes, BufferedOutputStream out) throws ApplicationException { + try { + out.write(inputBytes, 0, inputBytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void write(String outStr, BufferedOutputStream out) throws ApplicationException { + try { + byte[] bytes = outStr.getBytes("utf-8"); + out.write(bytes, 0, bytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private String transcodeFromBetaCode2Unicode(String inputStr) throws ApplicationException { + Transcoder transcoder = Transcoder.getInstance(); + String encodedUnicodeForm = transcoder.transcodeFromBetaCode2Unicode(inputStr); + return encodedUnicodeForm; + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriter.java Wed Nov 09 
15:32:05 2011 +0100 @@ -0,0 +1,168 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Date; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.db.DBMorphHandler; +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class DBMorphWriter { + private static DBMorphWriter instance; + private static String DATA_DIR = Constants.getInstance().getDataDir(); + private static String DB_DIR_DONATUS = DATA_DIR + "/dataBerkeleyDB/donatus"; + private static String DATA_FILES_DIR = DATA_DIR + "/dataFiles"; + private DBMorphHandler dbMorphHandler; + private Date beginOfOperation; + private Date endOfOperation; + + public static DBMorphWriter getInstance() throws ApplicationException { + if (instance == null) { + instance = new DBMorphWriter(); + instance.init(); + } + return instance; + } + + /** + * + */ + public static void main(String[] args) throws ApplicationException { + getInstance(); + instance.beginOperation(); + System.out.println("Start ..."); + instance.init(); + instance.openMorphData(); + // instance.deleteMorphData(); + long size = instance.getSize(); + System.out.println("Count forms: " + size); + // instance.writeMorphData(); + // instance.readSampleData(); + instance.end(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + System.out.println("Needed time: " + 
elapsedTime + " seconds"); + } + + private void init() throws ApplicationException { + dbMorphHandler = new DBMorphHandler(DB_DIR_DONATUS); + dbMorphHandler.start(); + } + + private void openMorphData() throws ApplicationException { + dbMorphHandler.openDatabases(); + } + + private void deleteMorphData() throws ApplicationException { + dbMorphHandler.deleteMorphData(); + } + + private void writeMorphData() throws ApplicationException { + String inputFileNameLatin = DATA_FILES_DIR + "/" + "perseus-latin-forms.xml"; + instance.write(inputFileNameLatin); + String inputFileNameGreek = DATA_FILES_DIR + "/" + "perseus-greek-forms.xml"; + instance.write(inputFileNameGreek); + String inputFileNameArabic = DATA_FILES_DIR + "/" + "perseus-arabic-forms.xml"; + instance.write(inputFileNameArabic); + String inputFileNameDutch = DATA_FILES_DIR + "/" + "celex-dutch-forms.xml"; + instance.write(inputFileNameDutch); + String inputFileNameGerman = DATA_FILES_DIR + "/" + "celex-german-forms.xml"; + instance.write(inputFileNameGerman); + String inputFileNameEnglish = DATA_FILES_DIR + "/" + "celex-english-forms.xml"; + instance.write(inputFileNameEnglish); + String inputFileNameFrench = DATA_FILES_DIR + "/" + "lexique-french-forms.xml"; + instance.write(inputFileNameFrench); + String inputFileNameItalian = DATA_FILES_DIR + "/" + "donatus-italian-forms.xml"; + instance.write(inputFileNameItalian); + String[] languages = {"ar", "de", "en", "el", "fr", "it", "la"}; + for (int i = 0; i < languages.length; i++) { + String language = languages[i]; + String inputFileNameDonatusSup = DATA_FILES_DIR + "/" + "donatus-sup-" + language + "-forms.xml"; + instance.write(inputFileNameDonatusSup); + } + String[] donatusAdditionalSups = {"cache-la", "cache-el", "cache-it"}; + for (int i = 0; i < donatusAdditionalSups.length; i++) { + String donatusAdditionalSupName = donatusAdditionalSups[i]; + String inputFileNameDonatusAddSup = DATA_FILES_DIR + "/donatusAdditionalSup/" + "donatus-sup-" + 
donatusAdditionalSupName + ".xml"; + instance.write(inputFileNameDonatusAddSup); + } + } + + private void write(String inputFileName) throws ApplicationException { + File inputFile = new File(inputFileName); + if (! inputFile.exists()) { + System.out.println("Input file: " + inputFile.getAbsolutePath() + " does not exist."); + return; + } + DBMorphWriterContentHandler morphContentHandler = new DBMorphWriterContentHandler(dbMorphHandler); + try { + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(morphContentHandler); + InputStream inputStream = new FileInputStream(inputFile); + BufferedInputStream bufferedInputStream = new BufferedInputStream(inputStream); + InputSource input = new InputSource(bufferedInputStream); + xmlParser.parse(input); + bufferedInputStream.close(); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private long getSize() throws ApplicationException { + long size = dbMorphHandler.getSize(); + return size; + } + + private void addSampleData() throws ApplicationException { + Lemma l1 = new Lemma("perseus", "la", "abrogo"); + Form f1 = new Form("perseus", "la", "abrogare"); + Form f2 = new Form("perseus", "la", "abroges"); + dbMorphHandler.writeFormLemma(f1, l1); + dbMorphHandler.writeLemmaForm(l1, f1); + dbMorphHandler.writeLemmaForm(l1, f2); + } + + private void readSampleData() throws ApplicationException { + ArrayList
  forms = dbMorphHandler.readForms("la", "abrogo"); + System.out.println("Forms: " + forms); + } + + private void deleteSampleData() throws ApplicationException { + Lemma l1 = new Lemma("perseus", "la", "abrogo"); + Form f1 = new Form("perseus", "la", "abrogare"); + Form f2 = new Form("perseus", "la", "abroges"); + dbMorphHandler.deleteLemma(l1); + dbMorphHandler.deleteForm(f1); + dbMorphHandler.deleteForm(f2); + } + + private void end() throws ApplicationException { + dbMorphHandler.closeDatabases(); + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriterContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriterContentHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,133 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.util.Hashtable; + +import org.xml.sax.*; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; + +public class DBMorphWriterContentHandler implements ContentHandler { + private DBMorphHandler dbMorphHandler; + private Element currentElement; + private Form form; + private Lemma lemma; + private Hashtable forms; + + public DBMorphWriterContentHandler(DBMorphHandler dbMorphHandler) { + this.dbMorphHandler = dbMorphHandler; + } + + public void startDocument() throws SAXException { + forms = new Hashtable(); + } + + public void endDocument() throws SAXException { + forms = null; + } + + // TODO setPos etc. ersetzen durch addPos etc. 
+ public void characters(char[] c, int start, int length) throws SAXException { + if (currentElement != null) { + String elemName = currentElement.name; + if (form != null) { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (charactersStr != null && ! (charactersStr.trim().equals(""))) { + if (elemName.equals("provider")) { + form.addProvider(charactersStr); + lemma.addProvider(charactersStr); + } else if (elemName.equals("language")) { + form.addLanguage(charactersStr); + lemma.addLanguage(charactersStr); + } else if (elemName.equals("form-name")) { + form.addFormName(charactersStr); + } else if (elemName.equals("lemma-name")) { + form.addLemmaName(charactersStr); + lemma.addLemmaName(charactersStr); + } else if (elemName.equals("pos")) { + form.addPos(charactersStr); + } else if (elemName.equals("tense")) { + form.addTense(charactersStr); + } else if (elemName.equals("voice")) { + form.addVoice(charactersStr); + } else if (elemName.equals("casus")) { + form.addCasus(charactersStr); + } else if (elemName.equals("number")) { + form.addNumber(charactersStr); + } else if (elemName.equals("mood")) { + form.addMood(charactersStr); + } else if (elemName.equals("person")) { + form.addPerson(charactersStr); + } else if (elemName.equals("gender")) { + form.addGender(charactersStr); + } else if (elemName.equals("definite")) { + form.addDefinite(charactersStr); + } + } + } + } + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(org.xml.sax.Locator arg1) { + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + 
currentElement = new Element(name, ""); + if (localName.equals("form")) { + form = new Form(); + lemma = new Lemma(); + } + } + + public void endElement(String uri, String localName, String name) throws SAXException { + currentElement = null; + if (localName.equals("form")) { + String keyStr = form.getFormName(); + forms.put(keyStr, form); + write(form, lemma); + form = null; + lemma = null; + } + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + + private void write(Form form, Lemma lemma) throws SAXException { + try { + dbMorphHandler.writeFormLemma(form, lemma); + dbMorphHandler.writeLemmaForm(lemma, form); + } catch (ApplicationException e) { + throw new SAXException(e); + } + } + + private class Element { + String name; + String value; + + Element(String name) { + this.name = name; + } + + Element(String name, String value) { + this.name = name; + this.value = value; + } + } +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DbEnvMorph.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DbEnvMorph.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,105 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.io.File; + +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseConfig; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class DbEnvMorph { + private String dataDir; + private File envPath; + private Environment env; + private EnvironmentConfig envConfig; + private DatabaseConfig dbConfig; + private Database lemmaDB; + private Database formDB; + + public DbEnvMorph() { + } + + public void setDataDir(String dataDir) { + this.dataDir = dataDir; + } + + public void init() throws ApplicationException { + 
try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envConfig.setReadOnly(false); + dbConfig.setReadOnly(false); + envConfig.setAllowCreate(true); + dbConfig.setAllowCreate(true); + envConfig.setTransactional(true); + dbConfig.setTransactional(true); + // allow duplicates for keys + dbConfig.setSortedDuplicates(true); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void openDatabases() throws ApplicationException { + try { + // open databases (and create them if they do not exist) + lemmaDB = env.openDatabase(null, "LemmaDB", dbConfig); + formDB = env.openDatabase(null, "FormDB", dbConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void removeDatabases() throws ApplicationException { + try { + if (formDB != null) + formDB.close(); + if (lemmaDB != null) + lemmaDB.close(); + env.removeDatabase(null, "LemmaDB"); + env.removeDatabase(null, "FormDB"); + formDB = null; + lemmaDB = null; + /* + boolean bla = true; + env.truncateDatabase(null, "LemmaDB", bla); + env.truncateDatabase(null, "FormDB", bla); + */ + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public Environment getEnv() { + return env; + } + + public Database getLemmaDB() { + return lemmaDB; + } + + public Database getFormDB() { + return formDB; + } + + public void close() throws ApplicationException { + if (env != null) { + try { + if (formDB != null) + formDB.close(); + if (lemmaDB != null) + lemmaDB.close(); + if (env != null) + env.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + } +} + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DbEnvMorphSup.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DbEnvMorphSup.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,101 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.io.File; +import java.util.HashMap; + +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseConfig; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class DbEnvMorphSup { + private String dataDir; + private File envPath; + private Environment env; + private EnvironmentConfig envConfig; + private DatabaseConfig dbConfig; + private HashMap morphSupDBs = new HashMap(); + + public DbEnvMorphSup() { + } + + public void setDataDir(String dataDir) { + this.dataDir = dataDir; + } + + public void initReadOnly() throws ApplicationException { + try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void initReadWrite() throws ApplicationException { + try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envConfig.setReadOnly(false); + dbConfig.setReadOnly(false); + envConfig.setAllowCreate(true); + dbConfig.setAllowCreate(true); + envConfig.setTransactional(true); + dbConfig.setTransactional(true); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void openDatabase(String morphSupName) throws ApplicationException { + try { + Database lexDB = morphSupDBs.get(morphSupName); + if (lexDB == null) { + Database morphSupDB = env.openDatabase(null, morphSupName + ".db", dbConfig); + morphSupDBs.put(morphSupName, morphSupDB); + } + } catch (DatabaseException e) { + throw new 
ApplicationException(e); + } + } + + public void closeDatabase(String morphSupName) throws ApplicationException { + try { + if (morphSupDBs != null) { + Database morphSupDB = morphSupDBs.get(morphSupName); + if (morphSupDB != null) + morphSupDB.close(); + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public Environment getEnv() { + return env; + } + + public Database getMorphSupDB(String morphSupName) { + Database morphSupDB = morphSupDBs.get(morphSupName); + return morphSupDB; + } + + public void close() throws ApplicationException { + if (env != null) { + try { + if (env != null) + env.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + } +} + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/Normalizer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/Normalizer.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1208 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexAR; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexDE; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexEL; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexEN; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexFR; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexIT; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexLA; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexNL; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexZH; +import 
de.mpg.mpiwg.berlin.mpdl.lt.text.reg.Regularization; +import de.mpg.mpiwg.berlin.mpdl.lt.text.reg.RegularizationManager; + +public class Normalizer { + public static int DISPLAY = 1; // normalization in DISPLAY mode + public static int DICTIONARY = 2; // normalization in DICTIONARY mode + public static int SEARCH = 3; // normalization in SEARCH mode; never used so far in indexing because it does not support the morph. lexicons such as CELEX (e.g. eingeschränkt would not be stemmed to eingeschraenkt) + private int normMode = DISPLAY; // Default e.g. for indexing and querying + private String[] normFunctions = {"norm"}; // default is to use the norm function + private String language; + private int[] offsets; + + public Normalizer(String[] normFunctions, String lang) { + this.normFunctions = normFunctions; + String language = Language.getInstance().getLanguageId(lang); + this.language = language; + } + + public Normalizer(String language) { + this.language = language; + } + + public String getLanguage() { + return language; + } + + public void setNormMode(int normMode) { + this.normMode = normMode; + } + + /** + * Applies the normalization rules in language to + * s, without offset tracking. 
+ * + * @param s source string + * @return normalized string + */ + public String normalize(String s) throws ApplicationException { + String normStr = s; + if (useSpecialNormFunction()) + normStr = removeSpecialNWDMarks(normStr); + if (useRegFunction()) { + // try to regularize the string to the norm form over predefined regularizations + RegularizationManager regManager = RegularizationManager.getInstance(); + ArrayList regs = regManager.findRegsByOrig(language, s); + if (regs != null && regs.size() > 0) { + Regularization reg = regs.get(0); // only one: the first one + String regNormStr = reg.getNorm(); + normStr = regNormStr; + } + } + if (useNormFunction()) { + // normalize the string by string replacements + if (normMode == DICTIONARY) { + normStr = normalize(normStr, DICTIONARY); + } else if (normMode == DISPLAY) { + normStr = normalize(normStr, DISPLAY); + } else if (normMode == SEARCH) { + normStr = normalize(normStr, SEARCH); + } + } + if (useSpecialNormFunction()) + normStr = insertSpecialNWDMarks(normStr); + return normStr; + } + + private boolean useRegFunction() { + boolean useReg = false; + for (int i=0; i< normFunctions.length; i++) { + String function = normFunctions[i]; + if (function.equals("reg")) + return true; + } + return useReg; + } + + private boolean useNormFunction() { + boolean useNorm = false; + for (int i=0; i< normFunctions.length; i++) { + String function = normFunctions[i]; + if (function.equals("norm") || function.equals("specialNorm")) + return true; + } + return useNorm; + } + + private boolean useSpecialNormFunction() { + boolean useNorm = false; + for (int i=0; i< normFunctions.length; i++) { + String function = normFunctions[i]; + if (function.equals("specialNorm")) + return true; + } + return useNorm; + } + + private String normalize(String s, int mode) { + String inputStr = s; + StringReader strReader = new StringReader(inputStr + "\n"); + String retStr = ""; + String token = ""; + try { + if 
(Language.getInstance().isLatin(language)) { + MpdlNormalizerLexLA mpdlNormalizerLex = new MpdlNormalizerLexLA(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexLA.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexLA.DICT); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexLA.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isArabic(language)) { + MpdlNormalizerLexAR mpdlNormalizerLex = new MpdlNormalizerLexAR(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexAR.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexAR.DICT); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexAR.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isGerman(language)) { + MpdlNormalizerLexDE mpdlNormalizerLex = new MpdlNormalizerLexDE(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexDE.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexDE.CELEX); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexDE.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isGreek(language)) { + MpdlNormalizerLexEL mpdlNormalizerLex = new MpdlNormalizerLexEL(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexEL.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexEL.SIGMA); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexEL.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if 
(Language.getInstance().isEnglish(language)) { + MpdlNormalizerLexEN mpdlNormalizerLex = new MpdlNormalizerLexEN(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexEN.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexEN.DICT); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexEN.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isFrench(language)) { + MpdlNormalizerLexFR mpdlNormalizerLex = new MpdlNormalizerLexFR(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DICT_ASCII); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isItalian(language)) { + MpdlNormalizerLexIT mpdlNormalizerLex = new MpdlNormalizerLexIT(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexIT.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexIT.DICT); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexIT.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if (Language.getInstance().isDutch(language)) { + MpdlNormalizerLexNL mpdlNormalizerLex = new MpdlNormalizerLexNL(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexNL.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexNL.DICT); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexNL.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else if 
(Language.getInstance().isChinese(language)) { + MpdlNormalizerLexZH mpdlNormalizerLex = new MpdlNormalizerLexZH(strReader); + if (mode == DISPLAY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexZH.DISP); + else if (mode == DICTIONARY) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexZH.DICT); + else if (mode == SEARCH) + mpdlNormalizerLex.yybegin(MpdlNormalizerLexZH.SEARCH); + while (token != null) { + token = mpdlNormalizerLex.yylex(); + if (token != null) + retStr += token; + } + } else { + retStr = s; // return the string unchanged + } + } catch (IOException e ) { + // nothing cause IOException is not needed for a StringReader + } + return retStr; + } + + + // used only in XmlTokenizerContentHandler // TODO make it better + private String removeSpecialNWDMarks(String inputString) { + String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element + String cleanedWord = inputString; + boolean startsWithNWDMark = cleanedWord.startsWith(COMPLEX_ELEMENT_NWD_MARK); + if (startsWithNWDMark) + cleanedWord = cleanedWord.substring(1); + int countNWDMarks = cleanedWord.length() - cleanedWord.replaceAll(COMPLEX_ELEMENT_NWD_MARK, "").length(); + if (countNWDMarks > 1) + cleanedWord = cleanedWord.replaceAll(COMPLEX_ELEMENT_NWD_MARK + "+", COMPLEX_ELEMENT_NWD_MARK); + // boolean notHyphenPlusNWD = cleanedWord.matches(".*[^-]+" + COMPLEX_ELEMENT_NWD_MARK + "+.*"); // e.g. "praebi ta" + // if (notHyphenPlusNWD) + // cleanedWord = cleanedWord.replaceAll("([^-]+)" + COMPLEX_ELEMENT_NWD_MARK + "+", "$1-" + COMPLEX_ELEMENT_NWD_MARK); // e.g. 
"praebi ta" is replaced by "praebi- ta" + cleanedWord = cleanedWord.replaceAll(COMPLEX_ELEMENT_NWD_MARK, " "); + return cleanedWord; + } + + private String insertSpecialNWDMarks(String inputString) { + String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element + String retStr = inputString; + boolean startsWithNWDMark = retStr.startsWith(COMPLEX_ELEMENT_NWD_MARK); + int countNWDMarks = retStr.length() - retStr.replaceAll(COMPLEX_ELEMENT_NWD_MARK, "").length(); + retStr = retStr.replaceAll(" ", COMPLEX_ELEMENT_NWD_MARK); + // if (notHyphenPlusNWD) + // normalizedWordStr = normalizedWordStr.replaceAll("-" + COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_NWD_MARK); // e.g. "praebi- ta" is replaced by "praebi ta" + if (countNWDMarks > 1) { + String nwdStr = ""; + for (int i=0; ilanguage to + * s, with offset tracking.
  + * + * WARNING: + * Arboreal will not work properly if a normalization substitution + * replaces a source character with more than two target characters! + * This is simply a BUG, and should be fixed. Fortunately, however, + * one does not often need such a replacement.
  + * + * @param s source string + * @param offsets character offset table + * @return normalized string + */ + private String normalize4Lexica(String s, int[] offsets) { + this.offsets = offsets; + if (language.equals("la") || language.equals("lat")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case 'j': replace = "i"; break; + case 'v': replace = "u"; break; + /* + * Linguistic note: /u/ and /v/ are rarely phonemic + * in Latin, as in alui 's/he nourished' vs. + * alvi 'of a belly', volui 's/he wished' or 'it rolled' + * vs. volvi 'to be rolled', (in)seruit 's/he joined + * together' vs. (in)servit 's/he serves'. + */ + case 'q': + if ((i < s.length() - 1) && (s.charAt(i + 1) == ';')) + replace = "qu"; + else + replace = "q"; + break; + case ';': + if ((i > 0) && (s.charAt(i - 1) == 'q')) + replace = "e"; + else if ((i == 0) || ! Character.isLetter(s.charAt(i - 1))) + replace = ";"; + else + replace = ""; + break; + case '\u0300': replace = ""; break; // COMBINING GRAVE ACCENT + case '\u0301': replace = ""; break; // COMBINING ACCUTE ACCENT + case '\u0302': replace = ""; break; // COMBINING CIRCUMFLEX ACCENT + + case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE + case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE + case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX + case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS + case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E + case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA + case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE + case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE + case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX + case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS + case '\u00cc': 
replace = "I"; break; // LATIN CAPITAL LETTER I GRAVE; + case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE + case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX + case '\u00cf': replace = "I"; break; // LATIN CAPITAL LETTER I DIAERESIS + case '\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE + case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE + case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX + case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS + case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE + case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE + case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX + case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS + case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE + case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE + case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX + case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS + case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E + case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA + case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE + case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE + case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX + case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS + case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE + case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE + case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX + case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS + case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE + case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O ACUTE + case 
'\u00f4': replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX + case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS + case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE + case '\u00fa': replace = "u"; break; // LATIN SMALL LETTER U ACUTE + case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX + case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS + case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON + case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON + case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE + case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE + case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON + case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON + case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE + case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE + case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK + case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK + case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON + case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON + case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE + case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE + case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON + case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON + case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE + case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE + case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E + case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E + case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON + case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON + case '\u016c': replace = 
"U"; break; // LATIN CAPITAL LETTER U BREVE + case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE + case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S + case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S + case '\u00ad': break; // soft hyphen + // new in MPDL project by J. Willenborg + case '\u1e14': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e15': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e16': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e17': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e18': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e19': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e1a': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e1b': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e1c': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e1d': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1eb8': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1eb9': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1eba': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ebb': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ebc': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ebd': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ebe': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ebf': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec0': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec1': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec2': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec3': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec4': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec5': replace = "e"; break; // LATIN ... 
LETTER E WITH ... + case '\u1ec6': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec7': replace = "e"; break; // LATIN ... LETTER E WITH ... + // by Malcolm + case '\u2329': break; // BRA + case '\u232a': break; // KET + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("it")) { + // new Mpdl code: added by J. Willenborg: some of Malcolms code did not work without errors so it has to be taken away, also all latin stuff is imported + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE + case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE + case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX + case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS + case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E + case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA + case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE + case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE + case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX + case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS + case '\u00cc': replace = "I"; break; // LATIN CAPITAL LETTER I GRAVE; + case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE + case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX + case '\u00cf': replace = "I"; break; // LATIN CAPITAL LETTER I DIAERESIS + 
case '\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE + case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE + case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX + case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS + case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE + case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE + case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX + case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS + case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE + case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE + case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX + case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS + case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E + case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA + case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE + case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE + case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX + case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS + case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE + case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE + case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX + case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS + case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE + case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O ACUTE + case '\u00f4': replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX + case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS + case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE + case '\u00fa': replace = "u"; break; // LATIN SMALL LETTER U ACUTE 
+ case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX + case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS + case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON + case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON + case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE + case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE + case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON + case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON + case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE + case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE + case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK + case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK + case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON + case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON + case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE + case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE + case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON + case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON + case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE + case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE + case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E + case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E + case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON + case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON + case '\u016c': replace = "U"; break; // LATIN CAPITAL LETTER U BREVE + case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE + case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S + case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S + // new in MPDL project 
by J. Willenborg + case '\u1e8d': replace = "e"; break; // LATIN SMALL LETTER E WITH TILDE + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + // new Mpdl code: added by J. Willenborg: most of the latin replacements also in english + } else if (language.equals("en")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u0300': replace = ""; break; // COMBINING GRAVE ACCENT + case '\u0301': replace = ""; break; // COMBINING ACCUTE ACCENT + case '\u0302': replace = ""; break; // COMBINING CIRCUMFLEX ACCENT + + case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE + case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE + case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX + case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS + case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E + case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA + case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE + case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE + case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX + case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS + case '\u00cc': replace = "I"; break; // LATIN CAPITAL LETTER I GRAVE; + case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE + case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX + case '\u00cf': replace = "I"; break; // LATIN CAPITAL LETTER I DIAERESIS + case 
'\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE + case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE + case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX + case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS + case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE + case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE + case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX + case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS + case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE + case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE + case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX + case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS + case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E + case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA + case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE + case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE + case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX + case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS + case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE + case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE + case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX + case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS + case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE + case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O ACUTE + case '\u00f4': replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX + case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS + case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE + case '\u00fa': replace = "u"; break; // LATIN SMALL LETTER U ACUTE + 
case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX + case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS + case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON + case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON + case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE + case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE + case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON + case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON + case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE + case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE + case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK + case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK + case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON + case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON + case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE + case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE + case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON + case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON + case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE + case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE + case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E + case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E + case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON + case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON + case '\u016c': replace = "U"; break; // LATIN CAPITAL LETTER U BREVE + case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE + case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S + case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S + // new in MPDL project by 
J. Willenborg + case '\u1e8d': replace = "e"; break; // LATIN SMALL LETTER E WITH TILDE + // by Malcolm + case '\u00ad': break; // soft hyphen + case '\u2329': break; // BRA + case '\u232a': break; // KET + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("fr")) { + // new Mpdl code: added by J. Willenborg: some of Malcolms code did not work without errors so it has to be taken away + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E + case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S + case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S + case '\u00ad': break; // soft hyphen + case '-': break; + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("de")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u00c4': replace = "Ae"; break; + case '\u00d6': replace = "Oe"; break; + case '\u00dc': replace = "Ue"; break; + case '\u00df': replace = "ss"; break; + case '\u00e4': replace = "ae"; break; + case '\u00f6': replace = "oe"; break; 
+ case '\u00fc': replace = "ue"; break; + case '\u00ad': break; // soft hyphen + case '\u00e9': replace = "e"; break; + // new in MPDL project by J. Willenborg + case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S + // case '-': break; + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("zh")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u00b9': replace = "1"; break; + case '\u00b2': replace = "2"; break; + case '\u00b3': replace = "3"; break; + case '\u2074': replace = "4"; break; + case '\u2075': replace = "5"; break; + // original by Malcolm Hyman: with the following replacements + // case '\u3000': replace = " "; break; + // case '\u3001': replace = ","; break; + // case '\u3002': replace = "."; break; + // case '\u200b': break; // BREAKS EVERYTHING! 
+ default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("akk") || + language.equals("qam") || + language.equals("qpc") || + language.equals("elx") || + language.equals("sux") || + language.equals("hit") || + language.equals("qhu") || + language.equals("peo") || + language.equals("uga") || + language.equals("ura") || + language.equals("qcu")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + char last = '\u0000'; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + c = Character.toLowerCase(c); + String replace = new String(); + switch (c) { + case '{': replace += "-"; break; + case '}': replace += "-"; break; + // These are from PSD::ATF::Unicode by Steve Tinney + case '\u0161': replace += "sz"; break; + case '\u1e63': replace += "s,"; break; + case '\u1e6d': replace += "t,"; break; + case '\u014b': replace += "j"; break; + case '\u015b': replace += "s'"; break; + case '\u2080': replace += "0"; break; + case '\u2081': replace += "1"; break; + case '\u2082': replace += "2"; break; + case '\u2083': replace += "3"; break; + case '\u2084': replace += "4"; break; + case '\u2085': replace += "5"; break; + case '\u2086': replace += "6"; break; + case '\u2087': replace += "7"; break; + case '\u2088': replace += "8"; break; + case '\u2089': replace += "9"; break; + + case 'c': // shin (except where used as modifier) + if ((i > 0) && ((last == '~') || (last == '@'))) + replace += "c"; + else replace += "sz"; + break; + default: replace += c; break; + } + // suppress grapheme boundary before or after word boundary + if (replace.equals("-")) { + if ((i + 1 == s.length()) || (s.charAt(i + 1) == ' ') || (i == 
0) || (s.charAt(i - 1) == ' ')) + replace = ""; + } + last = c; + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("el") || language.equals("grc")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u03c2': replace = "\u03c3"; break; // GREEK SMALL LETTER FINAL SIGMA + case '<': break; + case '>': break; + case '[': break; + case ']': break; + case '1': break; + case '2': break; + case '\u03ac': replace = "\u1f71"; break; + case '\u03ad': replace = "\u1f73"; break; + case '\u03ae': replace = "\u1f75"; break; + case '\u03af': replace = "\u1f77"; break; + case '\u03cc': replace = "\u1f79"; break; + case '\u03cd': replace = "\u1f7b"; break; + case '\u03ce': replace = "\u1f7d"; break; + case '-': break; // same treatment as soft hyphen + case '\u00ad': break; // soft hyphen + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("el_atonic")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u03c2': replace = "\u03c3"; break; // GREEK SMALL LETTER FINAL SIGMA + // map characters with diacritics to their plain equivalent + // cf. 
BetaCode.java + case '\u03aa': replace = "\u0399"; break; + case '\u03ab': replace = "\u03a5"; break; + case '\u03ac': replace = "\u0381"; break; + case '\u03ad': replace = "\u0385"; break; + case '\u03ae': replace = "\u0387"; break; + case '\u03af': replace = "\u0389"; break; + case '\u03ca': replace = "\u03b9"; break; + case '\u03cb': replace = "\u03c5"; break; + case '\u03cc': replace = "\u03bf"; break; + case '\u03cd': replace = "\u03c5"; break; + case '\u03ce': replace = "\u03c9"; break; + case '\u1f00': replace = "\u03b1"; break; + case '\u1f01': replace = "\u03b1"; break; + case '\u1f02': replace = "\u03b1"; break; + case '\u1f03': replace = "\u03b1"; break; + case '\u1f04': replace = "\u03b1"; break; + case '\u1f05': replace = "\u03b1"; break; + case '\u1f06': replace = "\u03b1"; break; + case '\u1f07': replace = "\u03b1"; break; + case '\u1f08': replace = "\u0391"; break; + case '\u1f09': replace = "\u0391"; break; + case '\u1f0a': replace = "\u0391"; break; + case '\u1f0b': replace = "\u0391"; break; + case '\u1f0c': replace = "\u0391"; break; + case '\u1f0d': replace = "\u0391"; break; + case '\u1f0e': replace = "\u0391"; break; + case '\u1f0f': replace = "\u0391"; break; + case '\u1f10': replace = "\u03b5"; break; + case '\u1f11': replace = "\u03b5"; break; + case '\u1f12': replace = "\u03b5"; break; + case '\u1f13': replace = "\u03b5"; break; + case '\u1f14': replace = "\u03b5"; break; + case '\u1f15': replace = "\u03b5"; break; + case '\u1f18': replace = "\u0395"; break; + case '\u1f19': replace = "\u0395"; break; + case '\u1f1a': replace = "\u0395"; break; + case '\u1f1b': replace = "\u0395"; break; + case '\u1f1c': replace = "\u0395"; break; + case '\u1f1d': replace = "\u0395"; break; + case '\u1f20': replace = "\u03b7"; break; + case '\u1f21': replace = "\u03b7"; break; + case '\u1f22': replace = "\u03b7"; break; + case '\u1f23': replace = "\u03b7"; break; + case '\u1f24': replace = "\u03b7"; break; + case '\u1f25': replace = "\u03b7"; break; + 
case '\u1f26': replace = "\u03b7"; break; + case '\u1f27': replace = "\u03b7"; break; + case '\u1f28': replace = "\u0397"; break; + case '\u1f29': replace = "\u0397"; break; + case '\u1f2a': replace = "\u0397"; break; + case '\u1f2b': replace = "\u0397"; break; + case '\u1f2c': replace = "\u0397"; break; + case '\u1f2d': replace = "\u0397"; break; + case '\u1f2e': replace = "\u0397"; break; + case '\u1f2f': replace = "\u0397"; break; + case '\u1f30': replace = "\u03b9"; break; + case '\u1f31': replace = "\u03b9"; break; + case '\u1f32': replace = "\u03b9"; break; + case '\u1f33': replace = "\u03b9"; break; + case '\u1f34': replace = "\u03b9"; break; + case '\u1f35': replace = "\u03b9"; break; + case '\u1f36': replace = "\u03b9"; break; + case '\u1f37': replace = "\u03b9"; break; + case '\u1f38': replace = "\u0399"; break; + case '\u1f39': replace = "\u0399"; break; + case '\u1f3a': replace = "\u0399"; break; + case '\u1f3b': replace = "\u0399"; break; + case '\u1f3c': replace = "\u0399"; break; + case '\u1f3d': replace = "\u0399"; break; + case '\u1f3e': replace = "\u0399"; break; + case '\u1f3f': replace = "\u0399"; break; + case '\u1f40': replace = "\u03bf"; break; + case '\u1f41': replace = "\u03bf"; break; + case '\u1f42': replace = "\u03bf"; break; + case '\u1f43': replace = "\u03bf"; break; + case '\u1f44': replace = "\u03bf"; break; + case '\u1f45': replace = "\u03bf"; break; + case '\u1f48': replace = "\u039f"; break; + case '\u1f49': replace = "\u039f"; break; + case '\u1f4a': replace = "\u039f"; break; + case '\u1f4b': replace = "\u039f"; break; + case '\u1f4c': replace = "\u039f"; break; + case '\u1f4d': replace = "\u039f"; break; + case '\u1f50': replace = "\u03c5"; break; + case '\u1f51': replace = "\u03c5"; break; + case '\u1f52': replace = "\u03c5"; break; + case '\u1f53': replace = "\u03c5"; break; + case '\u1f54': replace = "\u03c5"; break; + case '\u1f55': replace = "\u03c5"; break; + case '\u1f56': replace = "\u03c5"; break; + case '\u1f57': 
replace = "\u03c5"; break; + case '\u1f58': replace = "\u03a5"; break; + case '\u1f59': replace = "\u03a5"; break; + case '\u1f5a': replace = "\u03a5"; break; + case '\u1f5b': replace = "\u03a5"; break; + case '\u1f5c': replace = "\u03a5"; break; + case '\u1f5d': replace = "\u03a5"; break; + case '\u1f5e': replace = "\u03a5"; break; + case '\u1f5f': replace = "\u03a5"; break; + case '\u1f60': replace = "\u03c9"; break; + case '\u1f61': replace = "\u03c9"; break; + case '\u1f62': replace = "\u03c9"; break; + case '\u1f63': replace = "\u03c9"; break; + case '\u1f64': replace = "\u03c9"; break; + case '\u1f65': replace = "\u03c9"; break; + case '\u1f66': replace = "\u03c9"; break; + case '\u1f67': replace = "\u03c9"; break; + case '\u1f68': replace = "\u03a9"; break; + case '\u1f69': replace = "\u03a9"; break; + case '\u1f6a': replace = "\u03a9"; break; + case '\u1f6b': replace = "\u03a9"; break; + case '\u1f6c': replace = "\u03a9"; break; + case '\u1f6d': replace = "\u03a9"; break; + case '\u1f6e': replace = "\u03a9"; break; + case '\u1f6f': replace = "\u03a9"; break; + case '\u1f70': replace = "\u03b1"; break; + case '\u1f71': replace = "\u03b1"; break; + case '\u1f72': replace = "\u03b5"; break; + case '\u1f73': replace = "\u03b5"; break; + case '\u1f74': replace = "\u03b7"; break; + case '\u1f75': replace = "\u03b7"; break; + case '\u1f76': replace = "\u03b9"; break; + case '\u1f77': replace = "\u03b9"; break; + case '\u1f78': replace = "\u03bf"; break; + case '\u1f79': replace = "\u03bf"; break; + case '\u1f7a': replace = "\u03c5"; break; + case '\u1f7b': replace = "\u03c5"; break; + case '\u1f7c': replace = "\u03c9"; break; + case '\u1f7d': replace = "\u03c9"; break; + case '\u1f80': replace = "\u03b1"; break; + case '\u1f81': replace = "\u03b1"; break; + case '\u1f82': replace = "\u03b1"; break; + case '\u1f83': replace = "\u03b1"; break; + case '\u1f84': replace = "\u03b1"; break; + case '\u1f85': replace = "\u03b1"; break; + case '\u1f86': replace = "\u03b1"; 
break; + case '\u1f87': replace = "\u03b1"; break; + case '\u1f88': replace = "\u0391"; break; + case '\u1f89': replace = "\u0391"; break; + case '\u1f8a': replace = "\u0391"; break; + case '\u1f8b': replace = "\u0391"; break; + case '\u1f8c': replace = "\u0391"; break; + case '\u1f8d': replace = "\u0391"; break; + case '\u1f8e': replace = "\u0391"; break; + case '\u1f8f': replace = "\u0391"; break; + case '\u1f90': replace = "\u03b7"; break; + case '\u1f91': replace = "\u03b7"; break; + case '\u1f92': replace = "\u03b7"; break; + case '\u1f93': replace = "\u03b7"; break; + case '\u1f94': replace = "\u03b7"; break; + case '\u1f95': replace = "\u03b7"; break; + case '\u1f96': replace = "\u03b7"; break; + case '\u1f97': replace = "\u03b7"; break; + case '\u1f98': replace = "\u0397"; break; + case '\u1f99': replace = "\u0397"; break; + case '\u1f9a': replace = "\u0397"; break; + case '\u1f9b': replace = "\u0397"; break; + case '\u1f9c': replace = "\u0397"; break; + case '\u1f9d': replace = "\u0397"; break; + case '\u1f9e': replace = "\u0397"; break; + case '\u1f9f': replace = "\u0397"; break; + case '\u1fa0': replace = "\u03c9"; break; + case '\u1fa1': replace = "\u03c9"; break; + case '\u1fa2': replace = "\u03c9"; break; + case '\u1fa3': replace = "\u03c9"; break; + case '\u1fa4': replace = "\u03c9"; break; + case '\u1fa5': replace = "\u03c9"; break; + case '\u1fa6': replace = "\u03c9"; break; + case '\u1fa7': replace = "\u03c9"; break; + case '\u1fa8': replace = "\u03a9"; break; + case '\u1fa9': replace = "\u03a9"; break; + case '\u1faa': replace = "\u03a9"; break; + case '\u1fab': replace = "\u03a9"; break; + case '\u1fac': replace = "\u03a9"; break; + case '\u1fad': replace = "\u03a9"; break; + case '\u1fae': replace = "\u03a9"; break; + case '\u1faf': replace = "\u03a9"; break; + case '\u1fb2': replace = "\u03b1"; break; + case '\u1fb3': replace = "\u03b1"; break; + case '\u1fb4': replace = "\u03b1"; break; + case '\u1fb6': replace = "\u03b1"; break; + case 
'\u1fb7': replace = "\u03b1"; break; + case '\u1fba': replace = "\u0391"; break; + case '\u1fbb': replace = "\u0391"; break; + case '\u1fbc': replace = "\u0391"; break; + case '\u1fc2': replace = "\u03b7"; break; + case '\u1fc3': replace = "\u03b7"; break; + case '\u1fc4': replace = "\u03b7"; break; + case '\u1fc6': replace = "\u03b7"; break; + case '\u1fc7': replace = "\u03b7"; break; + case '\u1fca': replace = "\u0397"; break; + case '\u1fcb': replace = "\u0397"; break; + case '\u1fcc': replace = "\u0397"; break; + case '\u1fd2': replace = "\u03b9"; break; + case '\u1fd3': replace = "\u03b9"; break; + case '\u1fd6': replace = "\u03b9"; break; + case '\u1fd7': replace = "\u03b9"; break; + case '\u1fda': replace = "\u0399"; break; + case '\u1fdb': replace = "\u039f"; break; + case '\u1fe2': replace = "\u03c5"; break; + case '\u1fe3': replace = "\u03c5"; break; + case '\u1fe4': replace = "\u03c1"; break; + case '\u1fe5': replace = "\u03c1"; break; + case '\u1fe6': replace = "\u03c5"; break; + case '\u1fe7': replace = "\u03c5"; break; + case '\u1fea': replace = "\u03a5"; break; + case '\u1feb': replace = "\u03a5"; break; + case '\u1fec': replace = "\u03a1"; break; + case '\u1ff2': replace = "\u03c9"; break; + case '\u1ff3': replace = "\u03c9"; break; + case '\u1ff4': replace = "\u03c9"; break; + case '\u1ff6': replace = "\u03c9"; break; + case '\u1ff7': replace = "\u03c9"; break; + case '\u1ff8': replace = "\u039f"; break; + case '\u1ff9': replace = "\u039f"; break; + case '\u1ffa': replace = "\u03a9"; break; + case '\u1ffb': replace = "\u03a9"; break; + case '\u1ffc': replace = "\u03a9"; break; + + case '<': break; + case '>': break; + case '-': break; // same treatment as soft hyphen + case '\u00ad': break; // soft hyphen + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 
2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else { // unknown or no language + return s; + } + } + + /* + // explicit words + normStr = normStr.replaceAll("aliàs", "alias"); + normStr = normStr.replaceAll("hîc", "hic"); + normStr = normStr.replaceAll("quòd", "quod"); + normStr = normStr.replaceAll("Quòd", "Quod"); + normStr = normStr.replaceAll("QVòd", "Quod"); + normStr = normStr.replaceAll("Cùmque", "Cumque"); + normStr = normStr.replaceAll("aër", "aer"); + // ij + normStr = normStr.replaceAll("ij", "ii"); + // qu/qv + normStr = normStr.replaceAll("qv", "qu"); + // normStr = normStr.replaceAll("qV", "qU"); + normStr = normStr.replaceAll("Qv", "Qu"); + normStr = normStr.replaceAll("QV", "QU"); + // u/v + String vowels = getVowels(); + String consonants = getConsonants(); + normStr = normStr.replaceAll("([" + vowels + "])([-]*)u([" + vowels +"])", "$1$2v$3"); // vowel + u + vowel --> vowel + v + vowel + normStr = normStr.replaceAll("([" + vowels + "])([-]*)U([" + vowels +"])", "$1$2V$3"); // vowel + U + vowel --> vowel + V + vowel + normStr = normStr.replaceAll("([" + consonants + "])([-]*)v([" + consonants +"])", "$1$2u$3"); // consonant + v + consonant --> consonant + u + consonant + normStr = normStr.replaceAll("([" + consonants + "])([-]*)V([" + consonants +"])", "$1$2U$3"); // consonant + V + consonant --> consonant + U + consonant + normStr = normStr.replaceAll("^v([" + consonants +"])", "u$1"); // v + consonant --> u + consonant + normStr = normStr.replaceAll("^V([" + consonants +"])", "U$1"); // V + consonant --> U + consonant + // end of word: diacritica + normStr = normStr.replaceAll("à$", "a"); + normStr = normStr.replaceAll("è$", "e"); + normStr = normStr.replaceAll("ò$", "o"); + normStr = normStr.replaceAll("àm$", "am"); + normStr = normStr.replaceAll("ùm$", "um"); + String normStrTmp = normStr; + normStr = ""; + for (int i = 0; i < normStrTmp.length(); i++) { 
+ char c = normStrTmp.charAt(i); + String replace = ""; + switch (c) { + case 'ſ': replace = "s"; break; + case 'ß': replace = "ss"; break; + case 'æ': replace = "ae"; break; + case 'Æ': replace = "AE"; break; + case 'ę': replace = "ae"; break; + case 'œ': replace = "oe"; break; + default: replace += c; break; + } + normStr = normStr + replace; + } + + + private String getVowels() { + String retStr = null; + if (Language.getInstance().isItalian(language)) { + retStr = "AEIOUaeiou" + + "\u00c6\u00e6" + // AE ligatures + "\u0152\u0153"; // OE ligatures + } else if (Language.getInstance().isLatin(language)) { + retStr = "AEIOUaeiouÆœęàèòù"; + } + return retStr; + } + + private String getConsonants() { + String retStr = null; + if (Language.getInstance().isItalian(language)) { + retStr = "BCDFGHKLMNPQRSTVWXZ" + + "bcdfghklmnpqrstvwxz" + + "ſß"; // long/sharp S + } else if (Language.getInstance().isLatin(language)) { + retStr = "BCDFGHKLMNPQRSTVWXZ" + + "bcdfghklmnpqrstvwxz" + + "ſß"; // long/sharp S + } + return retStr; + } + + + + + + * + * + * + * + */ + + + + + + + /** + * Returns a copy of an integer array with the element at + * index removed ("killed"). + * + * @param array integer array + * @param index index of element to remove + */ + private int[] arrayKill(int[] array, int index) { + int[] newArray = new int[array.length - 1]; + System.arraycopy(array, 0, newArray, 0, index); + System.arraycopy(array, index + 1, newArray, index, array.length - index - 1); + return newArray; + } + + /** + * Returns a copy of an integer array with count elements + * inserted at index. 
+ * + * @param array integer array + * @param index index to insert new elements + * @param value value to insert into new slots + * @param count number of new slots to insert + */ + private int[] arrayInsert(int[] array, int index, int value, int count) { + int[] newArray = new int[array.length + count]; + System.arraycopy(array, 0, newArray, 0, index); + for (int i = 0; i < count; i++) newArray[index + i] = value; + System.arraycopy(array, index, newArray, index + count, array.length - index); + return newArray; + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAR.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAR.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,584 @@ +/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */ + +/* + * Normalization rules for Arabic text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-02-28 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 21.07.11 11:22 from the specification file + * MpdlNormalizerLexAR.lex + */ +public class MpdlNormalizerLexAR { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int 
ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 3, 3 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\4"+ + "\40\0\1\1\2\0\1\1\20\0\1\1\5\0\1\1\1\0\1\1"+ + "\uff82\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\4\0\2\1\1\2\1\3\1\4\1\5"; + + private static int [] zzUnpackAction() { + int [] result = new int[10]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\5\0\12\0\17\0\24\0\31\0\24\0\24"+ + "\0\24\0\24"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[10]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + 
private static final String ZZ_TRANS_PACKED_0 = + "\1\5\1\6\1\5\1\0\1\7\1\5\1\6\1\5"+ + "\1\10\1\7\1\5\1\6\1\5\1\11\1\7\1\5"+ + "\1\6\1\5\1\12\1\7\7\0\1\5\2\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[30]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\4\0\1\11\1\1\4\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[10]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state 
*/ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexAR(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. 
+ */ + public MpdlNormalizerLexAR(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 42) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. 
+ */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. 
+ * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. 
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 5: + { switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, ""); + } + } + case 6: break; + case 4: + { switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } + case 7: break; + case 2: + { problem = 1; add(yytext()); + } + case 8: break; + case 3: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 9: break; + case 1: + { add(yytext()); + } + case 10: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAR.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAR.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,90 @@ +/* + * Normalization rules for Arabic text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-02-28 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexAR +%type java.lang.String +%unicode + +// Arabic: ar + +%states DISP, DICT, SEARCH + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +hyphen = [-\u{00ad}] // hyphen and soft hyphen +LB = {hyphen} \u0020 +// lb = ({hyphen} \u0020)? + +END = \n + +%% + +@ { problem = 1; add(yytext()); } +{LB} { add(yytext()); } +. 
{ add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } +} + +
  { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, ""); + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert + +TO DO: + +AR: fehlt noch + +*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAll.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAll.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,143 @@ +/* + * Normalization rules for all languages + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * 2011-01-25 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexAll +%type java.lang.String +%unicode +// %debug + +%states LA, ZH + +%{ + int cv = 0; // consonant = 1, vowel = 2, everything else = 0 +%} + +VOWEL=[AEIOUaeiouÆæęàèòùœ] +CONS=[BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß] +LR=[lLrR] +QUE=(que)? +END=\n + +%% + + { + +// 1. simple replacements + +// 1.1 single characters +ſ { cv = 1; return "s"; } +ß { cv = 1; return "ss"; } +[æę] { cv = 2; return "ae"; } +Æ { cv = 2; return "AE"; } +œ { cv = 2; return "oe"; } +// 1.2 character combinations +ij { cv = 2; return "ii"; } + +// 2. diacritics + +// 2.1 superfluous diacritics in single words +^ hîc {END} { return "hic"; } + +// 2.2 superfluous diacritics at the end of a word +// 2.2.1 common cases +à / {QUE} {END} { return "a"; } +àm / {QUE} {END} { return "am"; } +às / {QUE} {END} { return "as"; } // (-àsque will likely never occur) +// à / [ms]? 
{QUE} {END} { return "a"; } +è / {QUE} {END} { return "e"; } +ò / {QUE} {END} { return "o"; } +òd / {QUE} {END} { return "od"; } +ùm / {QUE} {END} { return "um"; } +ùs / {QUE} {END} { return "us"; } + +// 2.3 superfluous diacritics within a word +// 2.3.1 common cases +aë { cv = 2; return "ae"; } +oë { cv = 2; return "oe"; } +// 2.3.2 rare cases +oï { cv = 2; return "oi"; } +uï { cv = 2; return "ui"; } +// 2.3.3 extremely rare cases +uü { cv = 2; return "uu"; } + +// 3. rules for u and v + +// 3.1 rules for u + +u/{VOWEL} { + switch(cv) { + case 2: return "v"; + default: cv = 2; return "u"; + } + } +U/{VOWEL} { + switch(cv) { + case 2: return "V"; + default: cv = 2; return "U"; + } + } + +// 3.2 rules for v + +qv { cv = 1; return "qu"; } // the replaced v still counts as consonant +Qv { cv = 1; return "Qu"; } +QV { cv = 1; return "QU"; } + +{LR}v { + switch(cv) { + case 1: return yytext().replace("v", "u"); + default: cv = 1; return yytext(); + } + } +{LR}V { + switch(cv) { + case 1: return yytext().replace("V", "U"); + default: cv = 1; return yytext(); + } + } + +v/{CONS} { cv = 1; return "u"; } +V/{CONS} { cv = 1; return "U"; } + + +// default + +{VOWEL} { cv = 2; return yytext(); } +{CONS} { cv = 1; return yytext(); } +\n { cv = 0; return ""; } +. { cv = 0; return yytext(); } + +} + + { + +// Codepoint < FFFF + +竒 { return "奇"; } // 7AD2 --> 5947 +旹 { return "時"; } // 65F9 --> 6642 +歴 { return "歷"; } // 6B74 --> 6B77 +精 { return "精"; } // FA1D --> 7CBE (FA1D is a compatibility ideograph) + +// Codepoint > FFFF + +庶 { return "庶"; } // 2F88D --> 5EB6 (2F88D is a compatibility ideograph) + + +} + + +// default (can be overridden by individual languages) + +\n { return ""; } +. 
{ return yytext(); } diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,648 @@ +/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:34 */ + +/* + * Normalization rules for German text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-08-10 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 05.09.11 10:34 from the specification file + * MpdlNormalizerLexDE.lex + */ +public class MpdlNormalizerLexDE { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 10; + public static final int DICT_ASCII = 6; + public static final int SEARCH_ASCII = 12; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + public static final int GRIMM = 8; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\20"+ + "\32\4\6\0\1\11\2\4\1\5\12\4\1\13\5\4\1\7\5\4"+ + "\1\1\1\0\1\1\106\0\1\14\21\0\1\15\5\0\1\16\2\0"+ + "\1\17\4\0\1\14\21\0\1\15\5\0\1\16\202\0\1\6\u01e4\0"+ + "\1\12\1\0\1\10\ufc99\0"; + + /** + 
* Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\7\0\2\1\1\2\1\3\1\4\3\1\1\5\1\3"+ + "\3\1\1\6\1\7\1\10\1\11\1\12\1\13\1\14"+ + "\1\15\1\16\1\17"; + + private static int [] zzUnpackAction() { + int [] result = new int[30]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\21\0\42\0\63\0\104\0\125\0\146\0\167"+ + "\0\210\0\167\0\167\0\167\0\231\0\252\0\273\0\167"+ + "\0\210\0\314\0\335\0\356\0\167\0\167\0\167\0\167"+ + "\0\167\0\167\0\167\0\167\0\167\0\167"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[30]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + 
"\1\10\1\11\1\10\1\0\1\10\1\11\1\12\1\11"+ + "\1\10\1\11\6\10\1\13\1\10\1\11\1\10\1\14"+ + "\1\10\1\11\1\12\1\15\1\10\1\16\1\10\1\17"+ + "\4\10\1\13\1\10\1\11\1\10\1\20\1\10\1\11"+ + "\1\12\1\15\1\10\1\16\1\10\1\17\4\10\2\13"+ + "\1\21\1\13\1\20\1\10\1\11\1\12\1\22\1\13"+ + "\1\23\1\13\1\24\1\25\1\26\1\27\1\30\1\13"+ + "\1\10\1\11\1\10\1\20\1\10\1\11\1\12\1\15"+ + "\1\10\1\16\1\10\1\17\3\10\1\31\1\13\1\10"+ + "\1\11\1\10\1\32\1\10\1\11\1\12\1\15\1\10"+ + "\1\16\1\10\1\17\4\10\2\13\1\21\1\13\1\32"+ + "\1\10\1\11\1\12\1\22\1\13\1\23\1\13\1\24"+ + "\1\25\1\26\1\27\1\30\1\13\23\0\1\10\20\0"+ + "\1\10\5\0\1\33\1\0\1\34\10\0\1\10\7\0"+ + "\1\35\20\0\1\36\10\0\1\10\5\0\1\33\1\0"+ + "\1\27\10\0\1\10\7\0\1\25\20\0\1\26\6\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[255]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\7\0\1\11\1\1\3\11\3\1\1\11\4\1\12\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[30]; + int offset = 0; + offset = 
zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + public static final int CELEX = DICT_ASCII; + + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String 
norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexDE(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexDE(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 88) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? 
*/ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. 
+ */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. 
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 10: + { add("sz"); + } + case 16: break; + case 3: + { problem = 1; add(yytext()); + } + case 17: break; + case 6: + { add("ae"); + } + case 18: break; + case 2: + { add("s"); + } + case 19: break; + case 4: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 20: break; + case 13: + { add("ü"); + } + case 21: break; + case 8: + { add("ue"); + } + case 22: break; + case 11: + { switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } + case 23: break; + case 12: + { add("u"); + } + case 24: break; + case 14: + { add("ä"); + } + case 25: break; + case 1: + { add(yytext()); + } + case 26: break; + case 9: + { add("ss"); + } + case 27: break; + case 7: + { add("oe"); + } + case 28: break; + case 15: + { add("ö"); + } + case 29: break; + case 5: + { switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } + case 30: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,134 @@ +/* + * Normalization rules for German text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + +%% + +%public +%class MpdlNormalizerLexDE +%type java.lang.String +%unicode + +// German: de, deu, ger + +%states DISP, DICT, SEARCH +%state CELEX, GRIMM + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void 
add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +hyphen = [-\u00ad] // hyphen and soft hyphen (four-digit escape, as JFlex 1.4.3 requires) +LB = {hyphen} \u0020 +// lb = ({hyphen} \u0020)? + +END = \n + +Alphabet = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ] + +%% + +ſ { add("s"); } + +// Fraktur + + { + +uͦ {add("u"); } +aͤ {add("ä"); } +oͤ {add("ö"); } +uͤ {add("ü"); } + +} + + { + +// normalize ä ö ü ß only for Celex! + +ä | Ä | aͤ { add("ae"); } +ö | Ö | oͤ { add("oe"); } +ü | Ü | uͤ { add("ue"); } +uͦ {add("u"); } +ß { add("ss"); } + +{Alphabet} { add(yytext()); } + +. { problem = 1; add(yytext()); } + +} + + { + +ß { add("sz"); } + +} + + +// default + +@ { problem = 1; add(yytext()); } +{LB} { add(yytext()); } +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } +} + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(java.util.Locale.ROOT); // locale-independent lower-casing + } + } +} + + + +/* + +Assumptions: +- the routine is called word by word, with a \n at the end of the string +- line breaks inside a word are marked as hyphen/soft hyphen plus space + +TO DO: + +DE: separate German and Fraktur? +DE: Celex: remove hyphens? 
+ +*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEL.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEL.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,711 @@ +/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:35 */ + +/* + * Normalization rules for Greek text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-08-03 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 05.09.11 10:35 from the specification file + * MpdlNormalizerLexEL.lex + */ +public class MpdlNormalizerLexEL { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int SIGMA = 8; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 3, 3, 4, 4 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\5"+ + "\32\5\6\0\1\6\2\5\1\6\20\5\1\6\5\5\1\1\1\0"+ + "\1\1\u032e\0\1\7\1\10\1\11\1\12\15\0\1\4\3\0\1\4"+ + "\1\30\11\0\1\13\1\14\1\15\u1ba1\0\1\16\1\0\1\20\1\0"+ + "\1\21\1\0\1\23\1\0\1\24\1\0\1\25\1\0\1\26\65\0"+ + "\1\17\17\0\1\22\57\0\1\27\ue00d\0"; + + /** + * Translates characters to character classes + */ + 
private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\5\0\2\1\2\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\12\1\1\14\1\15\1\16"+ + "\1\0\1\17\1\0\1\20\1\0\1\21\1\0\1\22"+ + "\1\0\1\23\1\0\1\24\1\0\1\25\1\0\1\26"+ + "\1\0\1\27\1\0"; + + private static int [] zzUnpackAction() { + int [] result = new int[50]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\31\0\62\0\113\0\144\0\175\0\226\0\175"+ + "\0\226\0\175\0\175\0\175\0\175\0\175\0\175\0\175"+ + "\0\175\0\175\0\257\0\310\0\341\0\372\0\u0113\0\u012c"+ + "\0\u0145\0\u015e\0\u0177\0\u0190\0\175\0\175\0\175\0\u01a9"+ + "\0\175\0\u01c2\0\175\0\u01db\0\175\0\u01f4\0\175\0\u020d"+ + "\0\175\0\u0226\0\175\0\u023f\0\175\0\u0258\0\175\0\u0271"+ + "\0\175\0\u028a"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[50]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); 
+ } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\6\1\7\1\6\1\0\1\6\1\10\1\11\1\12"+ + "\1\13\1\14\1\15\1\16\1\17\1\20\14\6\1\7"+ + "\1\6\1\21\1\6\1\10\1\11\1\12\1\13\1\14"+ + "\1\15\1\16\1\17\1\20\14\6\1\7\1\6\1\22"+ + "\1\6\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+ + "\1\17\1\20\1\23\1\24\1\25\1\26\1\27\1\30"+ + "\1\31\1\32\1\33\1\34\2\6\1\7\1\6\1\35"+ + "\1\6\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+ + "\1\17\1\20\1\23\1\24\1\25\1\26\1\27\1\30"+ + "\1\31\1\32\1\33\1\34\2\6\1\7\1\6\1\22"+ + "\1\6\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+ + "\1\17\1\20\1\23\1\24\1\25\1\26\1\27\1\30"+ + "\1\31\1\32\1\33\1\34\1\36\33\0\1\6\31\0"+ + "\1\37\1\40\23\0\1\40\3\0\1\41\1\42\23\0"+ + "\1\42\3\0\1\43\1\44\23\0\1\44\3\0\1\45"+ + "\1\46\23\0\1\46\3\0\1\47\1\50\23\0\1\50"+ + "\3\0\1\51\1\52\23\0\1\52\3\0\1\53\1\54"+ + "\23\0\1\54\3\0\1\55\1\56\23\0\1\56\3\0"+ + "\1\57\1\60\23\0\1\60\3\0\1\61\1\62\23\0"+ + "\1\62\3\0\1\37\30\0\1\41\30\0\1\43\30\0"+ + "\1\45\30\0\1\47\30\0\1\51\30\0\1\53\30\0"+ + "\1\55\30\0\1\57\30\0\1\61\25\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[675]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could 
not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\5\0\1\11\1\1\1\11\1\1\11\11\12\1\3\11"+ + "\1\0\1\11\1\0\1\11\1\0\1\11\1\0\1\11"+ + "\1\0\1\11\1\0\1\11\1\0\1\11\1\0\1\11"+ + "\1\0\1\11\1\0"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[50]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + 
private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexEL(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexEL(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 112) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. 
+ * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. 
+ * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. 
+ * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break 
zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 23: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ῴ"); + } + case 24: break; + case 5: + { add("ή"); + } + case 25: break; + case 17: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ή"); + } + case 26: break; + case 13: + { add("σ"); + } + case 27: break; + case 6: + { add("ί"); + } + case 28: break; + case 1: + { add(yytext()); + } + case 29: break; + case 22: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ώ"); + } + case 30: break; + case 11: + { switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } + case 31: break; + case 19: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ί"); + } + case 32: break; + case 15: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ᾴ"); + } + case 33: break; + case 7: + { add("ό"); + } + case 34: break; + case 14: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ά"); + } + case 35: break; + case 12: + { switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } + case 36: break; + case 8: + { add("ύ"); + } + case 37: break; + case 2: + { problem = 1; add(yytext()); + } + case 38: break; + case 20: + // lookahead expression with fixed base length + zzMarkedPos = 
zzStartRead + 1; + { add("ό"); + } + case 39: break; + case 3: + { add("ά"); + } + case 40: break; + case 10: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 41: break; + case 9: + { add("ώ"); + } + case 42: break; + case 16: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("έ"); + } + case 43: break; + case 18: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ῄ"); + } + case 44: break; + case 4: + { add("έ"); + } + case 45: break; + case 21: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("ύ"); + } + case 46: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEL.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEL.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,139 @@ +/* + * Normalization rules for Greek text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-08-03 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + +%% + +%public +%class MpdlNormalizerLexEL +%type java.lang.String +%unicode + +// Greek: el, grc + +%states DISP, DICT, SEARCH +%state SIGMA + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +hyphen = [-\u00ad] // hyphen and soft hyphen (four-digit escape, as JFlex 1.4.3 requires) +LB = {hyphen} \u0020 +// lb = ({hyphen} \u0020)? + +END = \n + +wordend = [νρς]? 
{END} + +Latin = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ] + + +%% + + +// always replace tonos by oxia +// (although this should really be corrected in the text rather than normalized) +ά { add("ά"); } +έ { add("έ"); } +ή { add("ή"); } +ί { add("ί"); } +ό { add("ό"); } +ύ { add("ύ"); } +ώ { add("ώ"); } + + + { + +ὰ / {wordend} { add("ά"); } +ᾲ / {wordend} { add("ᾴ"); } +ὲ / {wordend} { add("έ"); } +ὴ / {wordend} { add("ή"); } +ῂ / {wordend} { add("ῄ"); } +ὶ / {wordend} { add("ί"); } +ὸ / {wordend} { add("ό"); } +ὺ / {wordend} { add("ύ"); } +ὼ / {wordend} { add("ώ"); } +ῲ / {wordend} { add("ῴ"); } + +// other candidates: Ὰ Ὲ Ὴ Ὶ Ὺ Ὸ Ὼ + +} + + { + +ς { add("σ"); } + +} + +// default + +@ { problem = 1; add(yytext()); } +{Latin} { problem = 1; add(yytext()); } + +{LB} { add(yytext()); } +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } +} + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert + +TO DO: + +EL: tonos --> oxia wieder rausnehmen, weil es im Text geändert werden muss? +EL: gibt es noch weitere Fälle, wo legitimerweise ein Gravis vorkommen kann? +EL: kommen Großbuchstaben mit Gravis bei uns jemals vor, und sollen sie normalisiert werden? +EL: neuer State BETACODE ? 
+EL: nicht falsche Zeichen definieren, sondern erlaubte Zeichen + +*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEN.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEN.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,589 @@ +/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */ + +/* + * Normalization rules for English text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 21.07.11 11:22 from the specification file + * MpdlNormalizerLexEN.lex + */ +public class MpdlNormalizerLexEN { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 3, 3 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\5"+ + "\40\0\1\1\2\0\1\1\20\0\1\1\5\0\1\1\1\0\1\1"+ + "\u0101\0\1\4\ufe80\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. 
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\4\0\2\1\1\2\1\3\1\4\1\5\1\6"; + + private static int [] zzUnpackAction() { + int [] result = new int[11]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\6\0\14\0\22\0\30\0\36\0\30\0\30"+ + "\0\30\0\30\0\30"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[11]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\5\1\6\1\5\1\0\1\5\1\7\1\5\1\6"+ + "\1\5\1\10\1\11\1\7\1\5\1\6\1\5\1\12"+ + "\1\11\1\7\1\5\1\6\1\5\1\13\1\11\1\7"+ + "\10\0\1\5\3\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[36]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 
0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\4\0\1\11\1\1\5\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[11]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string 
in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexEN(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also a java.io.Reader version of this constructor. + * + * @param in the java.io.InputStream to read input from. + */ + public MpdlNormalizerLexEN(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table.
+ * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 46) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. 
+ * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to YYINITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occurred while scanning. + * + * In a well-formed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules.
+ * + * @param errorCode the code of the error message to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by the next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break
zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 5: + { switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } + case 7: break; + case 2: + { problem = 1; add(yytext()); + } + case 8: break; + case 4: + { add("s"); + } + case 9: break; + case 3: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 10: break; + case 6: + { switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } + case 11: break; + case 1: + { add(yytext()); + } + case 12: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEN.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEN.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,99 @@ +/* + * Normalization rules for English text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexEN +%type java.lang.String +%unicode + +// 1.5 English: en + +%states DISP, DICT, SEARCH + +%{ + private String original = ""; + private String normalized = ""; + private int 
problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +hyphen = [-\u{00ad}] // hyphen and soft hyphen +LB = {hyphen} \u0020 +// lb = ({hyphen} \u0020)? + +END = \n + +%% + + { + +ſ { add("s"); } + +} + + +// default + +@ { problem = 1; add(yytext()); } +{LB} { add(yytext()); } +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } +} + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert + +TO DO: + +EN: vollständig? + +*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexFR.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexFR.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,635 @@ +/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:35 */ + +/* + * Normalization rules for French text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-08-10 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 05.09.11 10:35 from the specification file + * MpdlNormalizerLexFR.lex + */ +public class MpdlNormalizerLexFR { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** 
lexical states */ + public static final int DICT_ASCII = 8; + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 3, 3, 4, 4 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\20"+ + "\32\4\6\0\1\5\2\4\1\5\20\4\1\5\5\4\1\1\1\0"+ + "\1\1\141\0\1\7\3\12\3\0\1\10\1\0\3\13\1\0\3\14"+ + "\3\0\3\15\4\0\3\16\126\0\2\11\53\0\1\6\u1e99\0\1\17"+ + "\udfe6\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. 
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\5\0\2\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\2\1\11\1\12\1\13\1\14\1\15\1\16"+ + "\1\17"; + + private static int [] zzUnpackAction() { + int [] result = new int[22]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\21\0\42\0\63\0\104\0\125\0\146\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\146\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[22]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\6\1\7\1\6\1\0\1\6\1\7\12\6\1\10"+ + "\1\6\1\7\1\6\1\11\1\6\1\7\1\12\1\13"+ + "\1\14\7\6\1\10\1\6\1\7\1\6\1\15\1\6"+ + "\1\7\1\12\1\13\1\14\7\6\1\10\1\6\1\7"+ + "\1\6\1\16\1\6\1\7\1\12\1\13\1\14\7\6"+ + "\2\10\1\17\1\10\1\15\1\6\1\7\1\12\1\13"+ + 
"\1\14\1\20\1\21\1\22\1\23\1\24\1\25\1\26"+ + "\1\10\23\0\1\6\16\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[119]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\5\0\1\11\1\1\7\11\1\1\7\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[22]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the 
current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the text position at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexFR(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also a java.io.Reader version of this constructor. + * + * @param in the java.io.InputStream to read input from. + */ + public MpdlNormalizerLexFR(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table.
+ * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 82) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. 
+ * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to YYINITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occurred while scanning. + * + * In a well-formed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules.
+ * + * @param errorCode the code of the error message to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by the next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break
zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 2: + { problem = 1; add(yytext()); + } + case 16: break; + case 6: + { add("ae"); + } + case 17: break; + case 4: + { add("s"); + } + case 18: break; + case 13: + { add("o"); + } + case 19: break; + case 3: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 20: break; + case 8: + { switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } + case 21: break; + case 14: + { add("u"); + } + case 22: break; + case 1: + { add(yytext()); + } + case 23: break; + case 12: + { add("i"); + } + case 24: break; + case 15: + { add(""); + } + case 25: break; + case 11: + { add("e"); + } + case 26: break; + case 10: + { add("a"); + } + case 27: break; + case 9: + { add("oe"); + } + case 28: break; + case 5: + { add("ss"); + } + case 29: break; + case 7: + { switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } + case 30: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexFR.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexFR.lex Wed Nov 09 15:32:05 2011 +0100 @@ 
-0,0 +1,119 @@ +/* + * Normalization rules for French text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexFR +%type java.lang.String +%unicode + +// French: fr + +%states DISP, DICT, SEARCH +%state CELEX + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +hyphen = [-\u{00ad}] // hyphen and soft hyphen +LB = {hyphen} \u0020 +// lb = ({hyphen} \u0020)? + +END = \n + +Alphabet = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ] + +%% + + { + +ſ { add("s"); } +ß { add("ss"); } +æ { add("ae"); } + +} + + { + +[œŒ] { add("oe"); } +[áàâ] { add("a"); } +[éèê] { add("e"); } +[íìî] { add("i"); } +[óòô] { add("o"); } +[úùû] { add("u"); } +’ { add(""); } + +{Alphabet} { add(yytext()); } + +. { problem = 1; add(yytext()); } // in particular "@" + +} + +// default + +@ { problem = 1; add(yytext()); } +{LB} { add(yytext()); } +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } +} + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert + +TO DO: + +FR: richtig? vollständig? 
+ +*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexIT.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexIT.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,887 @@ +/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */ + +/* + * Normalization rules for Italian text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 21.07.11 11:22 from the specification file + * MpdlNormalizerLexIT.lex + */ +public class MpdlNormalizerLexIT { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 2, 3, 4, 5, 6 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\6\25\0\1\5\14\0\1\4\22\0\1\52\1\1\3\2"+ + "\1\1\3\2\1\41\1\0\1\2\1\3\2\2\1\42\1\2\1\50"+ + "\1\3\1\2\1\40\1\45\1\51\2\2\1\0\1\2\6\0\1\44"+ + "\3\2\1\12\2\2\1\43\1\7\1\36\1\2\1\3\1\2\1\10"+ + "\1\37\1\14\1\46\1\13\1\2\1\11\1\16\1\47\2\2\1\0"+ + "\1\2\62\0\1\4\22\0\1\17\5\0\1\33\1\0\1\20\3\0"+ + "\1\21\5\0\1\22\6\0\1\23\5\0\1\31\1\24\5\0\1\32"+ + 
"\1\0\1\25\3\0\1\26\5\0\1\27\6\0\1\30\37\0\1\1"+ + "\70\0\1\35\1\34\53\0\1\15\ufe80\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\11\0\1\1\1\2\2\3\1\1\1\4\1\2\1\3"+ + "\1\5\1\2\1\6\1\7\1\10\1\11\1\12\5\3"+ + "\1\13\1\2\1\3\1\5\1\2\1\14\1\15\1\16"+ + "\1\17\1\20\1\21\1\22\1\23\1\24\1\25\1\26"+ + "\1\27\1\30\4\0\1\31\1\32\1\33\1\0\1\34"+ + "\1\0\1\35\1\36\1\0\1\37\1\40\1\41\4\0"+ + "\1\42\6\0\1\43\1\44\4\0\1\45\1\0\1\46"+ + "\10\0\1\47\4\0\1\45\2\0\1\50"; + + private static int [] zzUnpackAction() { + int [] result = new int[100]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\53\0\126\0\201\0\254\0\327\0\u0102\0\u012d"+ + "\0\u0158\0\0\0\0\0\0\0\u0183\0\u01ae\0\0\0\u01d9"+ + "\0\u0204\0\0\0\u022f\0\0\0\0\0\0\0\0\0\0"+ + "\0\u025a\0\u0285\0\u02b0\0\u02db\0\u0306\0\0\0\u0331\0\u035c"+ + "\0\u0387\0\u03b2\0\u03dd\0\0\0\0\0\0\0\0\0\0"+ + "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\u0408"+ + "\0\u0433\0\u045e\0\u0489\0\0\0\0\0\0\0\u04b4\0\0"+ + "\0\u04df\0\0\0\0\0\u050a\0\0\0\0\0\0\0\u0535"+ + "\0\u0560\0\u058b\0\u05b6\0\0\0\u05e1\0\u060c\0\u0637\0\u0662"+ + "\0\u068d\0\0\0\0\0\0\0\u06b8\0\u06e3\0\u070e\0\u035c"+ + 
"\0\u0739\0\u0764\0\0\0\u078f\0\u07ba\0\u07e5\0\0\0\u0810"+ + "\0\u083b\0\u0866\0\u0891\0\0\0\u08bc\0\u08e7\0\u0912\0\u093d"+ + "\0\0\0\u0968\0\u0993\0\0"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[100]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\53\0\1\12\1\13\1\14\1\15\1\16\1\12\1\17"+ + "\1\20\1\14\1\21\1\13\1\15\1\14\1\22\1\23"+ + "\5\12\2\13\1\12\2\13\1\24\1\25\1\26\1\27"+ + "\1\30\1\12\1\13\1\31\2\13\1\14\1\13\1\23"+ + "\1\32\1\33\1\34\1\35\1\36\1\12\1\13\1\14"+ + "\1\15\1\16\1\12\1\17\1\37\1\14\1\21\1\13"+ + "\1\15\1\40\1\41\1\42\5\12\2\13\1\12\2\13"+ + "\1\24\1\25\1\26\1\27\1\30\1\12\1\13\1\31"+ + "\2\13\1\43\1\13\1\42\1\32\1\33\1\34\1\35"+ + "\1\36\1\12\1\13\1\14\1\15\1\16\1\12\1\44"+ + "\1\20\1\14\1\21\1\13\1\15\1\14\1\22\1\23"+ + "\1\45\1\46\1\47\1\50\1\51\1\52\1\53\1\54"+ + "\1\55\1\56\1\24\1\25\1\26\1\27\1\30\1\12"+ + "\1\13\1\31\2\13\1\14\1\13\1\23\1\32\1\33"+ + "\1\34\1\35\1\36\1\12\1\13\1\14\1\15\1\16"+ + "\1\12\1\44\1\37\1\14\1\21\1\13\1\15\1\40"+ + "\1\41\1\42\1\45\1\46\1\47\1\50\1\51\1\52"+ + "\1\53\1\54\1\55\1\56\1\24\1\25\1\26\1\27"+ + "\1\30\1\12\1\13\1\31\2\13\1\43\1\13\1\42"+ + "\1\32\1\33\1\34\1\35\1\36\1\12\1\13\1\14"+ + "\1\15\1\16\1\12\1\57\1\20\1\14\1\21\1\13"+ + "\1\15\1\14\1\22\1\23\1\45\1\46\1\47\1\50"+ + "\1\51\1\52\1\53\1\54\1\55\1\56\1\24\1\25"+ + "\1\26\1\27\1\30\1\12\1\13\1\31\2\13\1\14"+ + "\1\13\1\23\1\32\1\33\1\34\1\35\1\36\1\12"+ + 
"\1\13\1\14\1\15\1\16\1\12\1\57\1\37\1\14"+ + "\1\21\1\13\1\15\1\40\1\41\1\42\1\45\1\46"+ + "\1\47\1\50\1\51\1\52\1\53\1\54\1\55\1\56"+ + "\1\24\1\25\1\26\1\27\1\30\1\12\1\13\1\31"+ + "\2\13\1\43\1\13\1\42\1\32\1\33\1\34\1\35"+ + "\1\36\7\0\1\60\4\0\1\61\1\62\42\0\1\63"+ + "\114\0\1\64\1\0\1\64\6\0\1\65\103\0\1\66"+ + "\23\0\1\67\44\0\1\70\5\0\1\70\2\0\1\70"+ + "\3\0\1\70\5\0\2\70\1\0\2\70\1\0\3\70"+ + "\2\0\1\70\1\0\2\70\1\0\2\70\46\0\1\71"+ + "\60\0\1\72\5\0\2\73\1\74\3\0\2\73\1\0"+ + "\3\73\13\0\1\73\6\0\1\73\2\0\1\73\2\0"+ + "\4\73\50\0\1\75\1\0\1\76\3\0\2\77\1\100"+ + "\3\0\2\77\1\0\3\77\13\0\1\77\6\0\1\77"+ + "\2\0\1\77\2\0\4\77\11\0\1\101\25\0\1\66"+ + "\26\0\1\102\52\0\1\102\3\0\1\103\35\0\1\104"+ + "\5\0\1\104\2\0\1\104\3\0\1\104\5\0\2\104"+ + "\1\0\2\104\1\0\3\104\2\0\1\104\1\0\2\104"+ + "\1\0\2\104\44\0\1\105\4\0\1\106\16\0\1\107"+ + "\54\0\1\110\52\0\1\110\3\0\1\111\40\0\1\112"+ + "\105\0\1\113\55\0\1\114\15\0\1\115\52\0\1\116"+ + "\51\0\1\117\4\0\1\120\54\0\1\121\43\0\1\122"+ + "\7\0\1\120\44\0\1\123\52\0\1\123\1\124\1\125"+ + "\46\0\1\126\4\0\1\61\54\0\1\127\43\0\1\130"+ + "\7\0\1\61\40\0\2\73\4\0\2\73\1\0\3\73"+ + "\13\0\1\73\6\0\1\73\2\0\1\73\2\0\4\73"+ + "\3\0\2\77\4\0\2\77\1\0\3\77\13\0\1\77"+ + "\6\0\1\77\2\0\1\77\2\0\4\77\6\0\1\131"+ + "\51\0\1\132\53\0\1\133\53\0\1\134\50\0\1\135"+ + "\3\0\1\136\47\0\1\137\52\0\1\140\56\0\1\120"+ + "\46\0\1\141\61\0\1\120\43\0\1\142\104\0\1\143"+ + "\24\0\1\61\55\0\1\61\46\0\1\136\50\0\1\144"+ + "\44\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[2494]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 
0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\1\10\7\0\1\1\3\11\2\1\1\11\2\1\1\11"+ + "\1\1\5\11\5\1\1\11\5\1\14\11\4\0\3\11"+ + "\1\0\1\11\1\0\2\11\1\0\3\11\4\0\1\11"+ + "\5\0\3\11\4\0\1\1\1\0\1\11\3\0\1\11"+ + "\4\0\1\11\4\0\1\11\2\0\1\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[100]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the 
last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /** For the backwards DFA of general lookahead statements */ + private boolean [] zzFin = new boolean [ZZ_BUFFERSIZE+1]; + + /* user code: */ + private static final int CONS = 1; + private static final int VOWEL = 2; + private int cv = 0; // consonant = 1, vowel = 2, everything else = 0 + + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexIT(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexIT(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. 
+ * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 172) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. 
+ * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. 
+ * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + if (zzMarkedPosL > zzStartRead) { + switch (zzBufferL[zzMarkedPosL-1]) { + case '\n': + case '\u000B': + case '\u000C': + case '\u0085': + case '\u2028': + case '\u2029': + zzAtBOL = true; + break; + case '\r': + if (zzMarkedPosL < zzEndReadL) + zzAtBOL = zzBufferL[zzMarkedPosL] != '\n'; + else if (zzAtEOF) + zzAtBOL = false; + else { + boolean eof = zzRefill(); + zzMarkedPosL = zzMarkedPos; + zzEndReadL = zzEndRead; + zzBufferL = zzBuffer; + if (eof) + zzAtBOL = false; + else + zzAtBOL = zzBufferL[zzMarkedPosL] != '\n'; + } + break; + default: + zzAtBOL = false; + } + } + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = 
zzMarkedPosL; + + if (zzAtBOL) + zzState = ZZ_LEXSTATE[zzLexicalState+1]; + else + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 33: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = CONS; add("U"); + } + case 41: break; + case 14: + { add("Á"); + } + case 42: break; + case 40: + // lookahead expression with fixed lookahead length + yypushback(1); + { add(yytext()); + } + case 43: break; + case 39: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 3; + { add(yytext()); + } + case 44: break; + case 38: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add(yytext()); + } + case 45: break; + case 26: + { add(yytext()); + } + case 46: break; + case 21: + { add("í"); + } + case 47: break; + case 8: + { cv = VOWEL; add("AE"); + } + case 48: break; + case 11: + { problem = 1; cv = 0; add(yytext()); + } + case 49: break; + case 4: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 50: break; + case 30: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = CONS; add("u"); + } + case 51: break; + case 19: + { add("á"); + } + case 52: break; + case 1: + { cv = 0; add(yytext()); + } + case 53: break; + case 24: + { switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } + case 54: break; + case 34: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); + } + case 55: break; + case 35: + { cv = VOWEL; add("zio"); + } + case 56: break; + case 10: + { cv = VOWEL; add("OE"); + } + case 57: break; + case 18: + { add("Ú"); + } + case 58: break; + case 37: + // general lookahead, find correct zzMarkedPos + { int zzFState = 7; + int zzFPos = zzStartRead; + if (zzFin.length <= zzBufferL.length) { zzFin = new boolean[zzBufferL.length+1]; } + boolean zzFinL[] = zzFin; + while (zzFState != -1 && zzFPos < zzMarkedPos) { + if 
((zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; } + zzInput = zzBufferL[zzFPos++]; + zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ]; + } + if (zzFState != -1 && (zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; } + + zzFState = 8; + zzFPos = zzMarkedPos; + while (!zzFinL[zzFPos] || (zzAttrL[zzFState] & 1) != 1) { + zzInput = zzBufferL[--zzFPos]; + zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ]; + }; + zzMarkedPos = zzFPos; + } + { cv = VOWEL; add(yytext().replace("ſ", "s")); + } + case 59: break; + case 3: + { cv = CONS; add(yytext()); + } + case 60: break; + case 32: + { cv = CONS; add("QU"); + } + case 61: break; + case 15: + { add("É"); + } + case 62: break; + case 28: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { switch(cv) { + case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break; + default: cv = VOWEL; add(yytext()); break; + } + } + case 63: break; + case 6: + { cv = CONS; add("ss"); + } + case 64: break; + case 5: + { cv = CONS; add("s"); + } + case 65: break; + case 13: + { switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } + case 66: break; + case 36: + { cv = VOWEL; add("ZIO"); + } + case 67: break; + case 2: + { cv = VOWEL; add(yytext()); + } + case 68: break; + case 17: + { add("Ó"); + } + case 69: break; + case 23: + { add("ú"); + } + case 70: break; + case 31: + { cv = CONS; add("Qu"); + } + case 71: break; + case 20: + { add("é"); + } + case 72: break; + case 7: + { cv = VOWEL; add("ae"); + } + case 73: break; + case 12: + { add(""); + } + case 74: break; + case 22: + { add("ó"); + } + case 75: break; + case 9: + { cv = VOWEL; add("oe"); + } + case 76: break; + case 29: + { cv = CONS; add("qu"); + } + case 77: break; + case 25: + { switch(cv) { + case CONS: add(yytext().replace("v", "u").replace("V", "U")); break; + default: cv = CONS; add(yytext()); break; + } + } + case 78: break; + case 27: + { cv = 
VOWEL; add("ii"); + } + case 79: break; + case 16: + { add("Í"); + } + case 80: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexIT.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexIT.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,183 @@ +/* + * Normalization rules for Italian text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexIT +%type java.lang.String +%unicode + +// Italian: it, ita + +%states DISP, DICT, SEARCH + +%{ + private static final int CONS = 1; + private static final int VOWEL = 2; + private int cv = 0; // consonant = 1, vowel = 2, everything else = 0 + + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +Vowel = [AEIOUaeiouÆæęàèòùœ] +Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß] +LR = [lLrR] + + +hyphen = [\u002d\u00ad] // hyphen and soft hyphen +LB = {hyphen} \u0020 +lb = ({hyphen} \u0020)? 
+ +END = \n + +prefixCons = (in{lb}ter | per | ſu{lb}per | ſer) + +%% + + { + +À { add("Á"); } +È { add("É"); } +Ì { add("Í"); } +Ò { add("Ó"); } +Ù { add("Ú"); } +à { add("á"); } +è { add("é"); } +ì { add("í"); } +ò { add("ó"); } +ù { add("ú"); } + +} + + { + +ſ { cv = CONS; add("s"); } +ß { cv = CONS; add("ss"); } +æ { cv = VOWEL; add("ae"); } +Æ { cv = VOWEL; add("AE"); } +œ { cv = VOWEL; add("oe"); } +Œ { cv = VOWEL; add("OE"); } + +ij { cv = VOWEL; add("ii"); } + +tio { cv = VOWEL; add("zio"); } +TIO { cv = VOWEL; add("ZIO"); } + +// h-Regeln aus Arboreal: +^ ha / {END} { add(yytext()); } +^ hai / {END} { add(yytext()); } +^ han{lb}no / {END} { add(yytext()); } +^ ho / {END} { add(yytext()); } +^ h { add(""); } + + +// u/v rules are taken from MpdlNormalizerLexLA.lex + +// 1. rules for u --> v + +^ {prefixCons} / {lb} { cv = VOWEL; add(yytext().replace("ſ", "s")); } + +^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); } + + +[uU] / {Vowel} { + switch(cv) { + case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break; + default: cv = VOWEL; add(yytext()); break; + } + } + +// 2. rules for v --> u + +qv { cv = CONS; add("qu"); } // the replaced v still counts as consonant +Qv { cv = CONS; add("Qu"); } +QV { cv = CONS; add("QU"); } + +{LR} [vV] { + switch(cv) { + case CONS: add(yytext().replace("v", "u").replace("V", "U")); break; + default: cv = CONS; add(yytext()); break; + } + } + +v / {lb} {Cons} { cv = CONS; add("u"); } +V / {lb} {Cons} { cv = CONS; add("U"); } + +// 3. override default rule for . + +{Vowel} { cv = VOWEL; add(yytext()); } +{Cons} { cv = CONS; add(yytext()); } +@ { problem = 1; cv = 0; add(yytext()); } +{LB} { add(yytext()); } +. 
{ cv = 0; add(yytext()); } + +} + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } +} + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert + +TO DO: + +IT: all these rules are taken from Arboreal; do we need them all? +IT: richtig? vollständig? +IT: Sind die u/v-Regeln wirklich genau wie in LA ? insbesondere: gleiche Vokal-Klasse? +IT: Änderungen in den lateinischen u/v-Regeln übernehmen? +IT: italienische Beispielwörter für die u/v-Regeln angeben +IT: Brauchen wir die Gravis-Regeln aus Arboreal in DICT wirklich? +IT: wenn ja: gehört À --> Á etc. in die Wörterbuch-Schicht? Und einschränken auf letzte Silbe? +IT: ist prefixCons = (inter | per | ſuper | ſer) auch für Italienisch gültig? 
+ +*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1024 @@ +/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:35 */ + +/* + * Normalization rules for Latin text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 05.09.11 10:35 from the specification file + * MpdlNormalizerLexLA.lex + */ +public class MpdlNormalizerLexLA { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int RENAISSANCE_DICT = 8; + public static final int SEARCH = 10; + public static final int RENAISSANCE_DISP = 4; + public static final int DICT = 6; + public static final int YYINITIAL = 0; + public static final int RENAISSANCE_SEARCH = 12; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 2, 1, 2, 3, 4, 3, 4, 5, 6, 5, 6 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\6\25\0\1\5\14\0\1\4\22\0\1\0\1\1\3\2"+ + "\1\1\2\2\1\53\1\1\1\0\1\2\1\3\2\2\1\1\1\2"+ + "\1\46\1\3\2\2\1\64\1\65\2\2\1\66\1\2\6\0\1\57"+ + "\1\2\1\47\1\43\1\11\2\2\1\51\1\14\1\27\1\2\1\50"+ + 
"\1\40\1\13\1\61\1\17\1\7\1\16\1\32\1\15\1\10\1\12"+ + "\2\2\1\66\1\2\62\0\1\4\30\0\1\25\30\0\1\23\1\37"+ + "\1\31\1\55\3\0\1\24\1\0\1\41\1\33\1\0\1\60\1\45"+ + "\1\34\1\52\1\62\2\0\1\42\1\35\1\54\4\0\1\44\1\36"+ + "\1\56\1\63\34\0\1\24\71\0\1\26\53\0\1\20\u0181\0\1\30"+ + "\ud4fe\0\1\21\u0590\0\1\22\u226e\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\12\0\1\1\1\2\2\3\1\1\1\4\1\3\1\2"+ + "\1\3\1\2\1\5\1\1\1\6\1\7\1\10\1\11"+ + "\11\1\1\3\2\1\3\2\1\3\1\12\1\3\2\2"+ + "\1\3\1\5\3\3\1\1\1\2\1\13\1\14\4\0"+ + "\1\15\1\16\1\17\1\20\1\0\1\21\1\22\1\23"+ + "\1\24\1\0\1\25\20\0\1\26\3\0\1\27\3\0"+ + "\1\30\1\0\1\31\3\0\1\32\1\33\1\34\1\0"+ + "\1\35\1\36\2\0\1\37\20\0\1\40\1\0\1\41"+ + "\1\0\1\42\1\0\1\43\1\44\1\45\1\46\1\0"+ + "\1\47\1\0\1\50\1\0\1\51\1\0\1\52\4\0"+ + "\1\53\10\0\1\54\6\0\1\55\3\0\1\56\1\57"+ + "\1\60\2\0\1\61\5\0\1\53"; + + private static int [] zzUnpackAction() { + int [] result = new int[179]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\67\0\156\0\245\0\334\0\u0113\0\u014a\0\u0181"+ + "\0\u01b8\0\u01ef\0\u0226\0\u0226\0\u0226\0\u025d\0\u0294\0\u0226"+ + 
"\0\u02cb\0\u0302\0\u0339\0\u0370\0\u0226\0\u01ef\0\u0226\0\u0226"+ + "\0\u0226\0\u0226\0\u03a7\0\u03de\0\u0415\0\u044c\0\u0483\0\u04ba"+ + "\0\u04f1\0\u0528\0\u055f\0\u0596\0\u05cd\0\u0604\0\u063b\0\u0672"+ + "\0\u06a9\0\u06e0\0\u0226\0\u0717\0\u074e\0\u0785\0\u07bc\0\u07f3"+ + "\0\u082a\0\u0861\0\u0898\0\u08cf\0\u0906\0\u0226\0\u0226\0\u093d"+ + "\0\u0974\0\u09ab\0\u09e2\0\u0226\0\u0226\0\u0226\0\u0226\0\u0a19"+ + "\0\u0226\0\u0226\0\u0226\0\u0226\0\u0a50\0\u0226\0\u0a87\0\u0abe"+ + "\0\u0af5\0\u0b2c\0\u0b63\0\u0b9a\0\u0bd1\0\u0c08\0\u0c3f\0\u0c76"+ + "\0\u0cad\0\u0ce4\0\u0d1b\0\u0d52\0\u0d89\0\u0dc0\0\u0226\0\u0df7"+ + "\0\u0e2e\0\u0e65\0\u0226\0\u0e9c\0\u0ed3\0\u0f0a\0\u0226\0\u0f41"+ + "\0\u0226\0\u0f78\0\u0faf\0\u0fe6\0\u0226\0\u0226\0\u0226\0\u101d"+ + "\0\u0226\0\u0226\0\u1054\0\u108b\0\u0226\0\u10c2\0\u10f9\0\u1130"+ + "\0\u1167\0\u119e\0\u11d5\0\u120c\0\u1243\0\u127a\0\u0226\0\u12b1"+ + "\0\u12e8\0\u131f\0\u1356\0\u138d\0\u08cf\0\u0226\0\u13c4\0\u0226"+ + "\0\u13fb\0\u0226\0\u1432\0\u0226\0\u0226\0\u0226\0\u0226\0\u1469"+ + "\0\u0226\0\u14a0\0\u0226\0\u14d7\0\u0226\0\u150e\0\u0226\0\u1545"+ + "\0\u157c\0\u15b3\0\u07bc\0\u15ea\0\u1621\0\u1658\0\u168f\0\u16c6"+ + "\0\u16fd\0\u0226\0\u1734\0\u176b\0\u0226\0\u17a2\0\u17d9\0\u1810"+ + "\0\u1847\0\u187e\0\u18b5\0\u0226\0\u18ec\0\u1923\0\u195a\0\u0226"+ + "\0\u0226\0\u0226\0\u1991\0\u19c8\0\u0226\0\u19ff\0\u1a36\0\u1a6d"+ + "\0\u1aa4\0\u1adb\0\u0226"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[179]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final 
int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\21\0\1\12\45\0\1\13\1\14\1\15\1\16\1\17"+ + "\1\13\1\20\1\21\1\22\1\14\1\23\1\15\1\24"+ + "\1\15\1\16\1\15\1\25\1\26\1\13\1\27\1\30"+ + "\1\31\1\32\2\13\1\33\1\15\1\34\1\35\1\36"+ + "\1\37\1\40\1\15\1\41\1\42\1\15\1\43\1\13"+ + "\1\44\1\15\1\16\1\15\1\13\1\15\1\13\1\45"+ + "\1\46\1\47\1\13\1\50\2\13\1\51\1\52\1\53"+ + "\1\13\1\14\1\15\1\16\1\17\1\13\1\20\1\54"+ + "\1\55\1\14\1\23\1\15\1\56\1\15\1\16\1\57"+ + "\1\60\1\26\1\13\1\27\1\30\1\31\1\32\2\13"+ + "\1\33\1\15\1\34\1\35\1\36\1\37\1\40\1\15"+ + "\1\41\1\42\1\15\1\43\1\13\1\61\1\15\1\16"+ + "\1\62\1\13\1\63\1\64\1\45\1\46\1\47\1\13"+ + "\1\50\2\13\1\65\1\52\1\53\1\13\1\14\1\15"+ + "\1\16\1\17\1\13\1\66\1\21\1\22\1\14\1\23"+ + "\1\15\1\24\1\15\1\16\1\15\1\25\1\26\1\13"+ + "\1\27\1\30\1\31\1\32\2\13\1\33\1\15\1\34"+ + "\1\35\1\36\1\37\1\40\1\15\1\41\1\42\1\15"+ + "\1\43\1\13\1\44\1\15\1\16\1\15\1\13\1\15"+ + "\1\13\1\45\1\46\1\47\1\13\1\50\2\13\1\51"+ + "\1\52\1\53\1\13\1\14\1\15\1\16\1\17\1\13"+ + "\1\66\1\54\1\55\1\14\1\23\1\15\1\56\1\15"+ + "\1\16\1\57\1\60\1\26\1\13\1\27\1\30\1\31"+ + "\1\32\2\13\1\33\1\15\1\34\1\35\1\36\1\37"+ + "\1\40\1\15\1\41\1\42\1\15\1\43\1\13\1\61"+ + "\1\15\1\16\1\62\1\13\1\63\1\64\1\45\1\46"+ + "\1\47\1\13\1\50\2\13\1\65\1\52\1\53\1\13"+ + "\1\14\1\15\1\16\1\17\1\13\1\67\1\21\1\22"+ + "\1\14\1\23\1\15\1\24\1\15\1\16\1\15\1\25"+ + "\1\26\1\13\1\27\1\30\1\31\1\32\2\13\1\33"+ + "\1\15\1\34\1\35\1\36\1\37\1\40\1\15\1\41"+ + "\1\42\1\15\1\43\1\13\1\44\1\15\1\16\1\15"+ + "\1\13\1\15\1\13\1\45\1\46\1\47\1\13\1\50"+ + "\2\13\1\51\1\52\1\53\1\13\1\14\1\15\1\16"+ + "\1\17\1\13\1\67\1\54\1\55\1\14\1\23\1\15"+ + "\1\56\1\15\1\16\1\57\1\60\1\26\1\13\1\27"+ + "\1\30\1\31\1\32\2\13\1\33\1\15\1\34\1\35"+ + "\1\36\1\37\1\40\1\15\1\41\1\42\1\15\1\43"+ + "\1\13\1\61\1\15\1\16\1\62\1\13\1\63\1\64"+ + "\1\45\1\46\1\47\1\13\1\50\2\13\1\65\1\52"+ + "\1\53\14\0\1\70\2\0\1\71\1\72\53\0\1\73"+ + 
"\103\0\1\74\145\0\1\75\52\0\1\75\6\0\1\76"+ + "\73\0\1\77\15\0\1\100\37\0\1\101\6\0\2\101"+ + "\2\0\1\101\7\0\3\101\30\0\1\101\1\0\1\101"+ + "\1\102\1\103\1\101\4\0\2\104\1\105\2\0\1\104"+ + "\2\0\2\104\1\0\4\104\2\0\1\104\6\0\1\104"+ + "\5\0\1\104\2\0\1\104\2\0\4\104\1\0\1\104"+ + "\11\0\1\104\30\0\1\106\46\0\1\107\2\0\2\110"+ + "\1\0\2\111\13\0\1\111\5\0\1\111\35\0\1\112"+ + "\2\0\2\113\1\0\2\114\13\0\1\114\5\0\1\114"+ + "\35\0\1\115\2\0\2\116\1\0\2\117\13\0\1\117"+ + "\5\0\1\117\35\0\1\120\2\0\2\121\1\0\2\122"+ + "\13\0\1\122\5\0\1\122\35\0\1\123\1\0\1\124"+ + "\2\125\1\0\2\126\13\0\1\126\5\0\1\126\34\0"+ + "\1\127\1\107\22\0\1\130\5\0\1\131\6\0\1\132"+ + "\25\0\1\133\1\112\5\0\1\134\1\135\13\0\1\136"+ + "\42\0\1\137\1\120\33\0\1\140\31\0\1\141\23\0"+ + "\1\142\5\0\1\143\7\0\1\144\30\0\1\145\52\0"+ + "\1\146\7\0\1\127\1\107\6\0\1\147\102\0\1\150"+ + "\114\0\1\30\66\0\1\32\1\0\1\151\5\0\1\101"+ + "\6\0\2\101\2\0\1\101\7\0\3\101\30\0\1\101"+ + "\1\0\1\101\2\0\1\101\4\0\2\152\1\153\2\0"+ + "\1\152\2\0\2\152\1\0\4\152\2\0\1\152\6\0"+ + "\1\152\5\0\1\152\2\0\1\152\2\0\4\152\1\0"+ + "\1\152\11\0\1\152\11\0\1\154\1\0\1\77\15\0"+ + "\1\100\37\0\1\155\6\0\2\155\2\0\1\155\7\0"+ + "\3\155\30\0\1\155\1\0\1\155\1\102\1\103\1\155"+ + "\15\0\1\156\13\0\1\106\50\0\1\157\65\0\1\160"+ + "\1\157\65\0\1\161\1\0\1\145\52\0\1\146\53\0"+ + "\1\162\66\0\1\163\22\0\1\137\61\0\1\155\6\0"+ + "\2\155\2\0\1\155\7\0\3\155\30\0\1\155\1\0"+ + "\1\155\2\0\1\155\15\0\1\164\64\0\1\165\65\0"+ + "\1\166\1\165\61\0\1\167\72\0\1\170\63\0\1\171"+ + "\71\0\1\110\67\0\1\172\64\0\1\107\2\0\2\110"+ + "\63\0\1\113\67\0\1\173\64\0\1\112\2\0\2\113"+ + "\63\0\1\116\67\0\1\174\64\0\1\115\2\0\2\116"+ + "\63\0\1\121\67\0\1\175\64\0\1\120\2\0\2\121"+ + "\63\0\1\125\64\0\1\176\71\0\1\177\64\0\1\123"+ + "\2\0\2\125\61\0\1\200\1\201\65\0\1\202\1\203"+ + "\65\0\1\204\66\0\1\205\66\0\1\206\66\0\1\207"+ + "\1\210\65\0\1\211\1\212\65\0\1\213\1\214\65\0"+ + "\1\215\1\216\65\0\1\217\66\0\1\213\65\0\1\220"+ + 
"\126\0\1\221\25\0\1\222\10\0\1\223\67\0\1\224"+ + "\54\0\1\225\12\0\1\223\114\0\1\226\70\0\1\227"+ + "\66\0\1\230\23\0\1\231\10\0\1\71\67\0\1\232"+ + "\54\0\1\233\12\0\1\71\60\0\1\234\57\0\2\104"+ + "\3\0\1\104\2\0\2\104\1\0\4\104\2\0\1\104"+ + "\6\0\1\104\5\0\1\104\2\0\1\104\2\0\4\104"+ + "\1\0\1\104\11\0\1\104\7\0\1\127\66\0\1\133"+ + "\66\0\1\235\66\0\1\141\70\0\1\236\66\0\1\237"+ + "\66\0\1\240\66\0\1\241\66\0\1\242\66\0\1\243"+ + "\60\0\2\152\3\0\1\152\2\0\2\152\1\0\4\152"+ + "\2\0\1\152\6\0\1\152\5\0\1\152\2\0\1\152"+ + "\2\0\4\152\1\0\1\152\11\0\1\152\7\0\1\244"+ + "\65\0\1\245\65\0\1\246\67\0\1\247\67\0\1\250"+ + "\66\0\1\251\66\0\1\252\65\0\1\253\66\0\1\254"+ + "\67\0\1\255\71\0\1\256\66\0\1\257\66\0\1\260"+ + "\66\0\1\261\66\0\1\150\66\0\1\262\72\0\1\223"+ + "\56\0\1\263\100\0\1\223\64\0\1\71\70\0\1\71"+ + "\55\0\1\200\66\0\1\202\66\0\1\207\66\0\1\211"+ + "\66\0\1\215\60\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[6930]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String 
ZZ_ATTRIBUTE_PACKED_0 = + "\10\0\1\1\1\0\3\11\2\1\1\11\4\1\1\11"+ + "\1\1\4\11\20\1\1\11\12\1\2\11\4\0\4\11"+ + "\1\0\4\11\1\0\1\11\20\0\1\11\3\0\1\11"+ + "\3\0\1\11\1\0\1\11\3\0\3\11\1\0\2\11"+ + "\2\0\1\11\11\0\1\11\6\0\1\11\1\0\1\11"+ + "\1\0\1\11\1\0\4\11\1\0\1\11\1\0\1\11"+ + "\1\0\1\11\1\0\1\11\4\0\1\1\5\0\1\11"+ + "\2\0\1\11\6\0\1\11\3\0\3\11\2\0\1\11"+ + "\5\0\1\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[179]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + 
* zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /** For the backwards DFA of general lookahead statements */ + private boolean [] zzFin = new boolean [ZZ_BUFFERSIZE+1]; + + /* user code (same declarations as the %{ ... %} section of MpdlNormalizerLexLA.lex): */ + private static final int CONS = 1; + private static final int VOWEL = 2; + private int cv = 0; // consonant = 1, vowel = 2, everything else = 0 + + private String original = ""; /* matched input text, accumulated by add() */ + private String normalized = ""; /* replacement text, accumulated by add() */ + private int problem = 0; /* set to 1 by the fallback rule actions; the end-of-word actions then return original or "" instead of normalized */ + + private void add (String norm) { /* appends the current match to original and norm to normalized */ + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; /* regex: hyphen or soft hyphen followed by a space; passed to replaceAll by the end-of-word actions */ + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexLA(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also a java.io.Reader version of this constructor. + * The stream is wrapped in an InputStreamReader without an explicit charset, + * so the bytes are decoded with the platform default charset. + * + * @param in the java.io.InputStream to read input from. + */ + public MpdlNormalizerLexLA(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * The packed form is a sequence of (count, value) char pairs; only the + * first 190 chars of packed are read. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 190) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false if new input was read into the buffer, true if the end of the stream was reached.
+ * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: replace it with a buffer of twice the current position */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0: end of stream, nothing was added to the buffer + return true; + } + + + /** + * Closes the input stream. + * Also marks the scanner as being at EOF and invalidates the buffer. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal scanner variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to YYINITIAL. + * The user-code fields (original, normalized, problem, cv) are not reset.
+ * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state. + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + * A new String is built from the buffer on every call. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster. + * No range check is made against yylength(). + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occurred while scanning. + * + * In a well-formed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules.
+ * + * @param errorCode the code of the error message to display + * @throws Error always, carrying the message for errorCode (unknown codes fall back to the ZZ_UNKNOWN_ERROR message) + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by the next call of the scanning method. + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the string returned by a matching end-of-word ({END}) rule action, + * or null once the end of input is reached + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + if (zzMarkedPosL > zzStartRead) { + switch (zzBufferL[zzMarkedPosL-1]) { + case '\n': + case '\u000B': + case '\u000C': + case '\u0085': + case '\u2028': + case '\u2029': + zzAtBOL = true; + break; + case '\r': + if (zzMarkedPosL < zzEndReadL) + zzAtBOL = zzBufferL[zzMarkedPosL] != '\n'; + else if (zzAtEOF) + zzAtBOL = false; + else { + boolean eof = zzRefill(); + zzMarkedPosL = zzMarkedPos; + zzEndReadL = zzEndRead; + zzBufferL = zzBuffer; + if (eof) + zzAtBOL = false; + else + zzAtBOL = zzBufferL[zzMarkedPosL] != '\n'; + } + break; + default: + zzAtBOL = false; + } + } + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead =
zzMarkedPosL; + + if (zzAtBOL) + zzState = ZZ_LEXSTATE[zzLexicalState+1]; + else + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ?
zzAction : ZZ_ACTION[zzAction]) { + case 41: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("um"); + } + case 50: break; + case 30: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = CONS; add("U"); + } + case 51: break; + case 15: + { add(yytext()); + } + case 52: break; + case 48: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 3; + { add("Hic"); + } + case 53: break; + case 8: + { cv = VOWEL; add("AE"); + } + case 54: break; + case 1: + { problem = 1; cv = 0; add(yytext()); + } + case 55: break; + case 4: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 56: break; + case 20: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = CONS; add("u"); + } + case 57: break; + case 10: + { cv = 0; add(yytext()); + } + case 58: break; + case 12: + { switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } + case 59: break; + case 36: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("et"); + } + case 60: break; + case 23: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("e"); + } + case 61: break; + case 31: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); + } + case 62: break; + case 43: + // general lookahead, find correct zzMarkedPos + { int zzFState = 7; + int zzFPos = zzStartRead; + if (zzFin.length <= zzBufferL.length) { zzFin = new boolean[zzBufferL.length+1]; } + boolean zzFinL[] = zzFin; + while (zzFState != -1 && zzFPos < zzMarkedPos) { + if ((zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; } + zzInput = zzBufferL[zzFPos++]; + zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ]; + } + if (zzFState != -1 &&
(zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; } + + zzFState = 8; + zzFPos = zzMarkedPos; + while (!zzFinL[zzFPos] || (zzAttrL[zzFState] & 1) != 1) { + zzInput = zzBufferL[--zzFPos]; + zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ]; + }; + zzMarkedPos = zzFPos; + } + { cv = VOWEL; add(yytext().replace("ſ", "s")); + } + case 63: break; + case 3: + { cv = CONS; add(yytext()); + } + case 64: break; + case 29: + { cv = VOWEL; add("oi"); + } + case 65: break; + case 27: + { cv = CONS; add("QU"); + } + case 66: break; + case 17: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { switch(cv) { + case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break; + default: cv = VOWEL; add(yytext()); break; + } + } + case 67: break; + case 6: + { cv = CONS; add("ss"); + } + case 68: break; + case 5: + { cv = CONS; add("s"); + } + case 69: break; + case 11: + { switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } + case 70: break; + case 24: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("o"); + } + case 71: break; + case 35: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("ac"); + } + case 72: break; + case 2: + { cv = VOWEL; add(yytext()); + } + case 73: break; + case 45: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 3; + { add("qui"); + } + case 74: break; + case 37: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("er"); + } + case 75: break; + case 26: + { cv = CONS; add("Qu"); + } + case 76: break; + case 32: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("ve"); + } + case 77: break; + case 40: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("us"); + } + case 78: break; + case 34: + // lookahead expression with fixed base
length + zzMarkedPos = zzStartRead + 2; + { add("am"); + } + case 79: break; + case 7: + { cv = VOWEL; add("ae"); + } + case 80: break; + case 28: + { add("ar"); + } + case 81: break; + case 47: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 3; + { add("hic"); + } + case 82: break; + case 19: + { cv = VOWEL; add("uu"); + } + case 83: break; + case 42: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("ul"); + } + case 84: break; + case 22: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("a"); + } + case 85: break; + case 9: + { cv = VOWEL; add("oe"); + } + case 86: break; + case 18: + { cv = VOWEL; add("ui"); + } + case 87: break; + case 16: + { cv = CONS; add("qu"); + } + case 88: break; + case 49: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 4; + { add("que"); + } + case 89: break; + case 25: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("u"); + } + case 90: break; + case 38: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("es"); + } + case 91: break; + case 46: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 3; + { add("Qui"); + } + case 92: break; + case 44: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { add("i"); + } + case 93: break; + case 13: + { add("X"); + } + case 94: break; + case 14: + { switch(cv) { + case CONS: add(yytext().replace("v", "u").replace("V", "U")); break; + default: cv = CONS; add(yytext()); break; + } + } + case 95: break; + case 21: + { cv = VOWEL; add("ii"); + } + case 96: break; + case 33: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("as"); + } + case 97: break; + case 39: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 2; + { add("od"); + } + case 98: break; +
default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,228 @@ +/* + * Normalization rules for Latin text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; /* NOTE(review): this package (lt.analyzer.lang) does not match the directory the file is added in (lt/text/norm/lang) -- confirm before regenerating the scanner */ + +%% + +%public +%class MpdlNormalizerLexLA +%type java.lang.String +%unicode + +// Latin: la, lat + +%states DISP, DICT, SEARCH +%states RENAISSANCE_DISP, RENAISSANCE_DICT, RENAISSANCE_SEARCH + +%{ + private static final int CONS = 1; + private static final int VOWEL = 2; + private int cv = 0; // consonant = 1, vowel = 2, everything else = 0 + + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +Vowel = [AEIOUaeiouÆæęœ] // without àèòù etc. +Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß] +// y counts neither as Vowel nor as Cons, see the default rule below: [yY] { cv = 0; add(yytext()); } + +LR = [lLrR] + +hyphen = [\u002d\u00ad] // hyphen and soft hyphen +LB = {hyphen} \u0020 +lb = ({hyphen} \u0020)? + +END = \n + +que = (que)? // optional -que +enclitic = (que | ve | ne) +prefixCons = (in{lb}ter | per | ſu{lb}per | ſer) // "ſer" for forms of ſervare + +%% + + +// TEST, see Benedetti page 444 +𐆑 { add("X"); } // (U+10191; D800+DD91) + + + /* NOTE(review): no lexical-state list precedes this brace; it appears to have been lost from this copy (presumably the DISP, DICT and SEARCH states) -- confirm against the original spec */ { + +// 1.
simple replacements + +// 1.1 single characters +ſ { cv = CONS; add("s"); } +ß { cv = CONS; add("ss"); } +[æę] { cv = VOWEL; add("ae"); } +Æ { cv = VOWEL; add("AE"); } +œ { cv = VOWEL; add("oe"); } + +// 1.2 character combinations +ij { cv = VOWEL; add("ii"); } + +// 2. superfluous diacritics (in "x / y" the part after the slash is trailing context: it must follow x but is not consumed by the rule) + +// 2.1 acute accent -- NOTE(review): the W / G / ?? tags at the end of the rules below are not explained in this file; presumably they record where a rule was attested -- add a legend +q́ue / {END} { add("que"); } // G +á / [mrst]? {enclitic} {END} { add("a"); } // G +é / [mrst]? {enclitic} {END} { add("e"); } // G +í / [mrst]? {enclitic} {END} { add("i"); } // G +ó / [mrst]? {enclitic} {END} { add("o"); } // G +ú / [mrst]? {enclitic} {END} { add("u"); } // G + +úe / {END} { add("ve"); } // W ?? + +// 2.2 grave accent +à / {que} {END} { add("a"); } // W G +àm / {que} {END} { add("am"); } // W (G) +às / {que} {END} { add("as"); } // W (G) (-àsque will likely never occur) +è / {que} {END} { add("e"); } // W G +ò / {que} {END} { add("o"); } // W G +òd / {que} {END} { add("od"); } // W (G) +ùm / {que} {END} { add("um"); } // W (G) +ùs / {que} {END} { add("us"); } // W G + +ès / {que} {END} { add("es"); } // (G) +^ quì / {END} { add("qui"); } // W ?? +^ Quì / {END} { add("Qui"); } // W ?? +àc / {END} { add("ac"); } // W ?? +èr / {END} { add("er"); } // W ?? +èt / {END} { add("et"); } // W ?? +ù / {END} { add("u"); } // W ?? +ùl / {END} { add("ul"); } // W ?? + +// 2.3 circumflex accent +^ hîc / {END} { add("hic"); } // W G +^ Hîc / {END} { add("Hic"); } // W G +^ ô / {END} { add("o"); } // G +â / {que} {END} { add("a"); } // W G +ûs / {END} { add("us"); } // W G +âr { add("ar"); } // W (G) --> this is only a rough approximation! + +// 2.4 trema +// 2.4.1 common cases +aë { cv = VOWEL; add("ae"); } +oë { cv = VOWEL; add("oe"); } +// 2.4.2 rare cases +oï { cv = VOWEL; add("oi"); } +uï { cv = VOWEL; add("ui"); } +// 2.4.3 extremely rare cases +uü { cv = VOWEL; add("uu"); } + + +// 3.
rules for u and v + +// 3.1 rules for u --> v + +// peruenias --> pervenias, interuallum --> intervallum +^ {prefixCons} / {lb} { cv = VOWEL; add(yytext().replace("ſ", "s")); } // not cv = CONS ! + +// uellet --> vellet +^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); } + +// diuidatur --> dividatur +// ut, volui: unchanged +// no rule for veruina because we cannot distinguish it from volui +[uU] / {Vowel} { + switch(cv) { + case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break; + default: cv = VOWEL; add(yytext()); break; + } + } + +// 3.2 rules for v --> u + +// qvam --> quam +qv { cv = CONS; add("qu"); } // the replaced v still counts as consonant +Qv { cv = CONS; add("Qu"); } +QV { cv = CONS; add("QU"); } + +// febrvarius --> februarius +// curva: unchanged +{LR} [vV] { + switch(cv) { + case CONS: add(yytext().replace("v", "u").replace("V", "U")); break; + default: cv = CONS; add(yytext()); break; + } + } + +// februarivs --> februarius +v / {lb} {Cons} { cv = CONS; add("u"); } +V / {lb} {Cons} { cv = CONS; add("U"); } + +// 3.3 override default rule for . + +{Vowel} { cv = VOWEL; add(yytext()); } +{Cons} { cv = CONS; add(yytext()); } +[yY] { cv = 0; add(yytext()); } + +@ { problem = 1; cv = 0; add(yytext()); } +{LB} { add(yytext()); } +.
{ problem = 1; cv = 0; add(yytext()); } // in particular from Arboreal: "〈" (2329), "〉" (232A), Ç, ç + +} + + /* NOTE(review): the three blocks below have no lexical-state list in front of their opening brace; it appears to have been lost from this copy. Judging by the return values they are presumably the DISP, DICT and SEARCH end rules, in that order -- confirm against the original spec before regenerating the scanner. */ + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } +} + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } +} + + +/* + +Assumptions: +- the routine is called word by word, with a \n at the end of the string +- line breaks inside the word are marked as hyphen/soft hyphen plus space + + +TO DO: + +LA: Reconsider whether Ææęàèòùœ can be left out of the vowel class. They do no harm, though. (Or do they !?) Distinguish vowel classes before and after the u ? +LA: Go through the diacritics again with Paul +LA: The disambiguations via the diacritics are still missing. +LA: is J really a problem case? +LA: are there words like super-rv... or super-lv... in lower or upper case?
+*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexNL.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexNL.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,589 @@ +/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */ + +/* + * Normalization rules for Dutch text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 21.07.11 11:22 from the specification file + * MpdlNormalizerLexNL.lex + */ +public class MpdlNormalizerLexNL { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non-negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 3, 3 + }; + + /** + * Packed form of the character-class table; it is unpacked into + * ZZ_CMAP by zzUnpackCMap. + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\5"+ + "\40\0\1\1\2\0\1\1\20\0\1\1\5\0\1\1\1\0\1\1"+ + "\u0101\0\1\4\ufe80\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels.
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\4\0\2\1\1\2\1\3\1\4\1\5\1\6"; + + private static int [] zzUnpackAction() { + int [] result = new int[11]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); /* run-length decoding: each (count, value) char pair yields count copies of value */ + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\6\0\14\0\22\0\30\0\36\0\30\0\30"+ + "\0\30\0\30\0\30"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[11]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); /* each entry is stored as two chars: the high 16 bits, then the low 16 bits */ + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\5\1\6\1\5\1\0\1\5\1\7\1\5\1\6"+ + "\1\5\1\10\1\11\1\7\1\5\1\6\1\5\1\12"+ + "\1\11\1\7\1\5\1\6\1\5\1\13\1\11\1\7"+ + "\10\0\1\5\3\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[36]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i =
0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; /* values are stored incremented by one, so a packed 0 becomes -1 */ + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\4\0\1\11\1\1\5\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[11]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); /* same (count, value) run-length decoding as zzUnpackAction */ + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the text position at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string
in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexNL(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexNL(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. 
+ * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 46) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. 
+ * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. 
+ * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break 
zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 5: + { switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } + case 7: break; + case 2: + { problem = 1; add(yytext()); + } + case 8: break; + case 4: + { add("s"); + } + case 9: break; + case 3: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 10: break; + case 6: + { switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } + case 11: break; + case 1: + { add(yytext()); + } + case 12: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexNL.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexNL.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,99 @@ +/* + * Normalization rules for Dutch text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexNL +%type java.lang.String +%unicode + +// Dutch: nl + +%states DISP, DICT, SEARCH + +%{ + private String original = ""; + private String normalized = ""; + private int problem 
= 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +hyphen = [-\u{00ad}] // hyphen and soft hyphen +LB = {hyphen} \u0020 +// lb = ({hyphen} \u0020)? + +END = \n + +%% + + { + +ſ { add("s"); } + +} + + +// default + +@ { problem = 1; add(yytext()); } +{LB} { add(yytext()); } +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } +} + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } +} + + +/* + +Annahmen: +- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings +- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert + +TO DO: + +NL: vollständig? + +*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexTemplate.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexTemplate.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,89 @@ +/* + * Template for normalization rules + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-07-12 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexTemplate +%type java.lang.String +%unicode + +// Language: list of ISO codes + +%states DISP, DICT, SEARCH + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += norm; + } + + private static final String LB = "[\u002d\u00ad] "; +%} + +hyphen = [-\u{00ad}] // hyphen and soft hyphen +LB = {hyphen} 
\u0020 +// lb = ({hyphen} \u0020)? + +END = \n + +%% + + { + +ſ { add("s"); } // sample rule + +} + + +// default rules + +@ { problem = 1; add(yytext()); } +{LB} { add(yytext()); } +. { add(yytext()); } + + +// at the end, determine which string to return + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized.replaceAll(LB, ""); + } + } +} + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized.replaceAll(LB, "").toLowerCase(); + } + } +} + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexZH.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexZH.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,637 @@ +/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */ + +/* + * Normalization rules for Chinese text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-02-28 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 21.07.11 11:22 from the specification file + * MpdlNormalizerLexZH.lex + */ +public class MpdlNormalizerLexZH { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int SEARCH = 6; + public static final int DICT = 4; + public static final int YYINITIAL = 0; + public static final int DISP = 2; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, 
k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 3, 3 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\2\45\0\1\1\1\0\1\1\15\0\1\20\41\0\1\1"+ + "\22\0\1\1\5\0\1\1\1\0\1\1\u4f84\0\1\3\176\0\1\4"+ + "\u035a\0\1\4\u0a9a\0\1\6\u0781\0\1\10\u057a\0\1\11\u06bd\0\1\12"+ + "\15\0\1\7\u0891\0\1\5\u1baf\0\1\13\340\0\1\14\u411a\0\1\16"+ + "\u040e\0\1\17\u1d8f\0\1\15\u05e2\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\4\0\1\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\1"+ + "\1\17\1\20\1\21"; + + private static int [] zzUnpackAction() { + int [] result = new int[22]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\21\0\42\0\63\0\104\0\104\0\104\0\104"+ + "\0\104\0\104\0\104\0\104\0\104\0\104\0\104\0\104"+ + "\0\104\0\104\0\125\0\104\0\104\0\104"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[22]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, 
int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\2\5\1\0\15\5\1\6\2\5\1\7\1\10\1\11"+ + "\1\12\1\13\1\14\1\15\1\16\1\17\1\20\1\21"+ + "\1\22\1\23\1\5\1\6\1\5\1\24\1\25\1\10"+ + "\1\11\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+ + "\1\21\1\22\1\23\1\5\1\6\1\5\1\24\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+ + "\1\20\1\21\1\22\1\23\1\5\1\6\40\0\1\26"+ + "\1\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[102]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\4\0\16\11\1\1\3\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[22]; + int offset = 0; + offset = 
zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += yytext(); + normalized += 
norm; + } + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public MpdlNormalizerLexZH(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public MpdlNormalizerLexZH(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 90) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? 
*/ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. 
+ */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. 
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 17: + { add("庶"); + } + case 18: break; + case 9: + { add("時"); + } + case 19: break; + case 2: + { problem = 1; add(yytext()); + } + case 20: break; + case 3: + { switch (problem) { + case 1: return original; + default: return normalized; + } + } + case 21: break; + case 10: + { add("歷"); + } + case 22: break; + case 13: + { add("面"); + } + case 23: break; + case 14: + { add("精"); + } + case 24: break; + case 12: + { add("陰"); + } + case 25: break; + case 8: + { add("床"); + } + case 26: break; + case 1: + { add(yytext()); + } + case 27: break; + case 15: + { add(""); + } + case 28: break; + case 7: + { add("并"); + } + case 29: break; + case 4: + { add("併"); + } + case 30: break; + case 11: + { add("為"); + } + case 31: break; + case 6: + { add("奇"); + } + case 32: break; + case 5: + { add("叟"); + } + case 33: break; + case 16: + { switch (problem) { + case 1: return ""; + default: return normalized; + } + } + case 34: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexZH.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexZH.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,120 @@ +/* + * Normalization rules for Chinese text + * [this is a JFlex specification] + * + * Wolfgang Schmidle + * version 2011-02-28 + * + */ + +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +%% + +%public +%class MpdlNormalizerLexZH +%type java.lang.String +%unicode + +// classical Chinese: zh, zho, zho-Hant + +%states DISP, DICT, SEARCH + +%{ + private String original = ""; + private String normalized = ""; + private int problem = 0; + + private void add (String norm) { + original += 
yytext(); + normalized += norm; + } +%} + +ZWS = [\u{200b}] + +END = \n + +%% + +// Normalization in Chinese means that character variants will be replaced by their standard characters +// if there is no doubt about what the standard character is. + +// The input is supposed to be a single Chinese character, but strings of characters are also handled correctly. + + { + +// Codepoint < FFFF + +倂 { add("併"); } // 5002 --> 4F75 +傁 | 叜 { add("叟"); } // 5081, 53DC --> 53DF +竒 { add("奇"); } // 7AD2 --> 5947 +幷 { add("并"); } // 5E77 --> 5E76 +牀 { add("床"); } // 7240 --> 5E8A +旹 { add("時"); } // 65F9 --> 6642 +歴 { add("歷"); } // 6B74 --> 6B77 +爲 { add("為"); } // 7232 --> 70BA +隂 { add("陰"); } // 9682 --> 9670 +靣 { add("面"); } // 9763 --> 9762 +精 { add("精"); } // FA1D --> 7CBE (FA1D is a compatibility ideograph) + +// Codepoint > FFFF + +// note that [ABC] is not equivalent to A | B | C for codepoints above FFFF due to their internal encoding: +// for example, 庶 (U+2F88D) is represented as a sequence of two codepoints: D87E DC8D +// i.e. never use [ABC] but A | B | C + +庶 { add("庶"); } // 2F88D --> 5EB6 (2F88D is a compatibility ideograph) + +} + + { + +// remove Zero Width Space (if there is any in the the input string) + +{ZWS} { add(""); } + +} + +// default + +@ { problem = 1; add(yytext()); } +. { add(yytext()); } + + + { + +{END} { + switch (problem) { + case 1: return original; + default: return normalized; + } + } +} + + { + +{END} { + switch (problem) { + case 1: return ""; + default: return normalized; + } + } +} + + +/* + +Annahmen: +- die Routine wird zeichenweise (oder mit mehr als einem Zeichen) aufgerufen, mit einem \n am Ende des Strings +- es gibt keine Zeilenumbrüche + +TO DO: + +ZH: Liste ergänzen +ZH: was ist, wenn man wirklich die Variante, die im Text steht, nachschlagen will? Dann muss man das Zeichen wohl selbst rauskopieren. +ZH: sollen lateinische Buchstaben bewirken, dass problem = 1 ist? 
+ZH: sollen Zeilenumbrüche rausgenommen werden, auch wenn sie in korrekt markiertem Text nicht vorkommen? +ZH: was ist, wenn beijing übergeben wird und einen Zeilenumbruch enthält? Verlässt sich der Wrapper darauf, dass die Zeichenzahl gleich bleibt, oder macht er ein hyphen rein? was macht oder ? + +*/ diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/DBRegularizationHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/DBRegularizationHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,146 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.reg; + +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; + +public class DBRegularizationHandler { + private String dbDirectory; + private DbEnvRegularization regDbEnv; + + public DBRegularizationHandler(String dbDir) { + this.dbDirectory = dbDir; + } + + public void start() throws ApplicationException { + regDbEnv = new DbEnvRegularization(); + regDbEnv.setDataDir(dbDirectory); + regDbEnv.init(); // open databases in read/write mode + } + + public void openDatabases() throws ApplicationException { + regDbEnv.openDatabases(); + } + + public void closeDatabases() throws ApplicationException { + regDbEnv.close(); + } + + public void deleteData() throws ApplicationException { + regDbEnv.removeDatabases(); + } + + public void writeOrigReg(Regularization reg) throws ApplicationException { + try { + String language = Language.getInstance().getLanguageId(reg.getLanguage()); + String keyStr = language + "###" + 
reg.getOrig(); + String valueStr = reg.getXmlString(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + DatabaseEntry dbEntryValue = new DatabaseEntry(valueStr.getBytes("utf-8")); + Database origDB = regDbEnv.getOrigDB(); + origDB.put(null, dbEntryKey, dbEntryValue); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + public void writeNormReg(Regularization reg) throws ApplicationException { + try { + String language = Language.getInstance().getLanguageId(reg.getLanguage()); + String keyStr = language + "###" + reg.getNorm(); + String valueStr = reg.getXmlString(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + DatabaseEntry dbEntryValue = new DatabaseEntry(valueStr.getBytes("utf-8")); + Database normDB = regDbEnv.getNormDB(); + normDB.put(null, dbEntryKey, dbEntryValue); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + public void deleteReg(Regularization reg) throws ApplicationException { + try { + String language = Language.getInstance().getLanguageId(reg.getLanguage()); + String keyStrOrig = language + "###" + reg.getOrig(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStrOrig.getBytes("utf-8")); + Database origDB = regDbEnv.getOrigDB(); + origDB.delete(null, dbEntryKey); + String keyStrNorm = reg.getLanguage() + "###" + reg.getNorm(); + dbEntryKey = new DatabaseEntry(keyStrNorm.getBytes("utf-8")); + Database normDB = regDbEnv.getNormDB(); + normDB.delete(null, dbEntryKey); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + public ArrayList readRegsByOrig(String lang, String orig) throws ApplicationException { + String language = 
Language.getInstance().getLanguageId(lang); + ArrayList retRegs = new ArrayList(); + String hashKey = language + "###" + orig; + try { + Database origDB = regDbEnv.getOrigDB(); + Cursor cursor = origDB.openCursor(null, null); + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + byte[] foundValueBytes = foundValue.getData(); + String foundValueStr = new String(foundValueBytes, "utf-8"); + Regularization reg = Regularization.getInstance(foundValueStr); + retRegs.add(reg); + operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retRegs; + } + + public ArrayList readRegsByNorm(String lang, String norm) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList retRegs = new ArrayList(); + String hashKey = language + "###" + norm; + try { + Database normDB = regDbEnv.getNormDB(); + Cursor cursor = normDB.openCursor(null, null); + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + byte[] foundValueBytes = foundValue.getData(); + String foundValueStr = new String(foundValueBytes, "utf-8"); + Regularization reg = Regularization.getInstance(foundValueStr); + retRegs.add(reg); + operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + 
throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retRegs; + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/DbEnvRegularization.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/DbEnvRegularization.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,100 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.reg; + +import java.io.File; + +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseConfig; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class DbEnvRegularization { + private String dataDir; + private File envPath; + private Environment env; + private EnvironmentConfig envConfig; + private DatabaseConfig dbConfig; + private Database origDB; + private Database normDB; + + public DbEnvRegularization() { + } + + public void setDataDir(String dataDir) { + this.dataDir = dataDir; + } + + public void init() throws ApplicationException { + try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envConfig.setReadOnly(false); + dbConfig.setReadOnly(false); + envConfig.setAllowCreate(true); + dbConfig.setAllowCreate(true); + envConfig.setTransactional(true); + dbConfig.setTransactional(true); + // allow duplicates for keys + dbConfig.setSortedDuplicates(true); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void openDatabases() throws ApplicationException { + try { + // open databases (and create them if they do not exist) + origDB = env.openDatabase(null, "OrigDB", dbConfig); + normDB = env.openDatabase(null, 
"NormDB", dbConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void removeDatabases() throws ApplicationException { + try { + if (origDB != null) + origDB.close(); + if (normDB != null) + normDB.close(); + env.removeDatabase(null, "OrigDB"); + env.removeDatabase(null, "NormDB"); + origDB = null; + normDB = null; + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public Environment getEnv() { + return env; + } + + public Database getNormDB() { + return normDB; + } + + public Database getOrigDB() { + return origDB; + } + + public void close() throws ApplicationException { + if (env != null) { + try { + if (origDB != null) + origDB.close(); + if (normDB != null) + normDB.close(); + if (env != null) + env.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + } +} + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/Regularization.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/Regularization.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,89 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.reg; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; +import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator; + +public class Regularization { + private String language; + private String orig; + private String norm; + private String source; + private int sourcePosition; + + public Regularization(String language, String orig, String norm, String source) { + this.language = language; + this.orig = orig; + this.norm = norm; + this.source = source; + } + + public static Regularization getInstance(String xmlStr) throws ApplicationException { + XQueryEvaluator xQueryEvaluator = new XQueryEvaluator(); + String language = 
xQueryEvaluator.evaluateAsStringValue(xmlStr, "//language"); + String orig = xQueryEvaluator.evaluateAsStringValue(xmlStr, "//orig"); + String norm = xQueryEvaluator.evaluateAsStringValue(xmlStr, "//norm"); + String source = xQueryEvaluator.evaluateAsStringValue(xmlStr, "//source"); + String sourcePosStr = xQueryEvaluator.evaluateAsStringValue(xmlStr, "//source/@position"); + int sourcePos = new Integer(sourcePosStr); + Regularization reg = new Regularization(language, orig, norm, source); + reg.setSourcePosition(sourcePos); + return reg; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public String getOrig() { + return orig; + } + + public void setOrig(String orig) { + this.orig = orig; + } + + public String getNorm() { + return norm; + } + + public void setNorm(String norm) { + this.norm = norm; + } + + public String getSource() { + return source; + } + + public void setSource(String source) { + this.source = source; + } + + public int getSourcePosition() { + return sourcePosition; + } + + public void setSourcePosition(int sourcePosition) { + this.sourcePosition = sourcePosition; + } + + public String getXmlString() { + String xmlString = "\n"; + if (language != null) + xmlString += " " + language + "\n"; + if (orig != null) + xmlString += " " + StringUtils.deresolveXmlEntities(orig) + "\n"; + if (norm != null) + xmlString += " " + StringUtils.deresolveXmlEntities(norm) + "\n"; + if (source != null) + xmlString += " " + StringUtils.deresolveXmlEntities(source) + "\n"; + xmlString += "\n"; + return xmlString; + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/RegularizationManager.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/RegularizationManager.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,118 @@ +package 
de.mpg.mpiwg.berlin.mpdl.lt.text.reg; + +import java.util.ArrayList; +import java.util.Date; +import java.util.Hashtable; +import java.util.logging.Logger; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; +import de.mpg.mpiwg.berlin.mpdl.lucene.util.LuceneUtil; +import de.mpg.mpiwg.berlin.mpdl.util.Util; + +public class RegularizationManager { + private static RegularizationManager instance; + private static String DATA_DIR = Constants.getInstance().getDataDir(); + private static String REGULARIZATION_DB_DIR = DATA_DIR + "/dataBerkeleyDB/regularization"; + private static Logger LOGGER = Logger.getLogger(RegularizationManager.class.getName()); + private DBRegularizationHandler dbRegHandler; + private Hashtable> regsOrig; + private Hashtable> regsNorm; + private Date beginOfOperation; + private Date endOfOperation; + + public static RegularizationManager getInstance() throws ApplicationException { + if (instance == null) { + instance = new RegularizationManager(); + instance.init(); + } + return instance; + } + + public static void main(String[] args) throws ApplicationException { + getInstance(); + instance.beginOperation(); + System.out.print("Start ..."); + ArrayList regs = instance.findRegsByNorm("la", "Illiusque"); + ArrayList regs2 = instance.findRegsByNorm("la", "Itaque"); + Regularization bla = regs.get(0); + Regularization bla2 = regs2.get(0); + + instance.end(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + System.out.println("Needed time: " + elapsedTime + " seconds"); + } + + private void init() throws ApplicationException { + regsOrig = new Hashtable>(); + regsNorm = new Hashtable>(); + dbRegHandler = new DBRegularizationHandler(REGULARIZATION_DB_DIR); + dbRegHandler.start(); + dbRegHandler.openDatabases(); + LOGGER.info("Regularization db cache: 
opened"); + } + + public ArrayList findRegsByOrig(String language, String orig) throws ApplicationException { + orig = orig.toLowerCase(); + String hashKey = language + "###" + orig; + ArrayList regs = regsOrig.get(hashKey); + if (regs == null) { + regs = dbRegHandler.readRegsByOrig(language, orig); + if (regs == null || regs.isEmpty()) + regsOrig.put(hashKey, new ArrayList()); + else + regsOrig.put(hashKey, regs); + } + return regs; + } + + public ArrayList findRegsByNorm(String language, String norm) throws ApplicationException { + norm = norm.toLowerCase(); + String hashKey = language + "###" + norm; + ArrayList regs = regsNorm.get(hashKey); + if (regs == null || regs.isEmpty()) { + regs = dbRegHandler.readRegsByNorm(language, norm); + if (regs == null) + regsNorm.put(hashKey, new ArrayList()); + else + regsNorm.put(hashKey, regs); + } + return regs; + } + + public ArrayList getRegOrigsByNormLuceneQueryString(String language, String luceneQueryString) throws ApplicationException { + ArrayList regForms = new ArrayList(); + LuceneUtil luceneUtil = LuceneUtil.getInstance(); + ArrayList variants = luceneUtil.getVariantsFromLuceneQuery(luceneQueryString); + if (variants != null) { + for (int i=0; i regs = findRegsByNorm(language, variant); + if (regs != null) { + for (int j=0; j getTokens() throws ApplicationException { + if (Language.getInstance().isChinese(language)) { + return getTokensByChineseTokenizer(input, normFunctions); + } + ArrayList tokens = new ArrayList(); + try { + reset(input); + CharTermAttribute charTermAttribute = getAttribute(CharTermAttribute.class); + OffsetAttribute offsetAttribute = getAttribute(OffsetAttribute.class); + while (incrementToken()) { + String term = charTermAttribute.toString(); + int start = offsetAttribute.startOffset(); + int end = offsetAttribute.endOffset(); + String normedTerm = normalizer.normalize(term); + Token token = new Token(start, end, normedTerm); + tokens.add(token); + } + end(); // TODO needed ? 
+ close(); // TODO needed ? + } catch (IOException e) { + throw new ApplicationException(e); + } + return tokens; + } + + /** Returns true iff a character should be included in a token. */ + protected boolean isTokenChar(int codepoint) { + boolean isTokenChar = true; + char c = (char) codepoint; + switch (c) { + case ' ': isTokenChar = false; break; + case '.': isTokenChar = false; break; + case ',': isTokenChar = false; break; + case '!': isTokenChar = false; break; + case '?': isTokenChar = false; break; + case ';': isTokenChar = false; break; + case ':': isTokenChar = false; break; + case '(': isTokenChar = false; break; + case ')': isTokenChar = false; break; + case '[': isTokenChar = false; break; + case ']': isTokenChar = false; break; + case '{': isTokenChar = false; break; + case '}': isTokenChar = false; break; + case '<': isTokenChar = false; break; + case '>': isTokenChar = false; break; + case '/': isTokenChar = false; break; + case '=': isTokenChar = false; break; + case '&': isTokenChar = false; break; + case '+': isTokenChar = false; break; + case '#': isTokenChar = false; break; + case '"': isTokenChar = false; break; + case '�': isTokenChar = false; break; + case '�': isTokenChar = false; break; + case '�': isTokenChar = false; break; + case '�': isTokenChar = false; break; + case '\'': isTokenChar = false; break; + case '\t': isTokenChar = false; break; // do not break words which have tabs in it + case '\n': isTokenChar = false; break; // do not break words which are on another line + case '\u2425': isTokenChar = false; break; // special char for marking xml elements + } + return isTokenChar; + } + + /** Called on each token character to normalize it before it is added to the + * token. The default implementation does nothing. Subclasses may use this + * to, e.g., lowercase tokens. */ + protected char normalize(char c) { + return c; + } + protected int normalize(int c) { + return c; + } + + /* + * Code is copied from Lucene 3.4. 
CharTokenizer.incrementToken() + * @see org.apache.lucene.analysis.TokenStream#incrementToken() + */ + public boolean incrementToken() throws IOException { + clearAttributes(); + int length = 0; + int start = -1; // this variable is always initialized + char[] buffer = termAtt.buffer(); + while (true) { + if (bufferIndex >= dataLen) { + offset += dataLen; + if(! charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils + dataLen = 0; // so next offset += dataLen won't decrement offset + if (length > 0) { + break; + } else { + finalOffset = correctOffset(offset); + return false; + } + } + dataLen = ioBuffer.getLength(); + bufferIndex = 0; + } + // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone + int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex); + bufferIndex += Character.charCount(c); + if (isTokenChar(c)) { // if it's a token char + if (length == 0) { // start of token + start = offset + bufferIndex - 1; + } else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds + buffer = termAtt.resizeBuffer(2 + length); // make sure a supplementary fits in the buffer + } + length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized + if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test + break; + } else if (length > 0) // at non-Letter w/ chars + break; // return 'em + } + termAtt.setLength(length); + offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start + length)); + return true; + } + + /* + * Code is copied from Lucene 3.4. CharTokenizer.end() + * @see org.apache.lucene.analysis.TokenStream#end() + */ + @Override + public final void end() { + // set final offset + offsetAtt.setOffset(finalOffset, finalOffset); + } + + /* + * Code is copied from Lucene 3.4. 
CharTokenizer.reset() + * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader) + */ + @Override + public void reset(Reader input) throws IOException { + super.reset(input); + bufferIndex = 0; + offset = 0; + dataLen = 0; + finalOffset = 0; + ioBuffer.reset(); // make sure to reset the IO buffer!! + this.normalizer = new Normalizer(normFunctions, language); + } + + private ArrayList getTokensByChineseTokenizer(Reader input, String[] normFunctions) throws ApplicationException { + StandardTokenizer chineseTokenizer = new StandardTokenizer(Version.LUCENE_34, input); // is recommended instead of ChineseTokenizer which is deprecated + ArrayList tokens = new ArrayList(); + try { + reset(input); + chineseTokenizer.reset(input); + CharTermAttribute charTermAttribute = chineseTokenizer.getAttribute(CharTermAttribute.class); + OffsetAttribute offsetAttribute = chineseTokenizer.getAttribute(OffsetAttribute.class); + while (chineseTokenizer.incrementToken()) { + String term = charTermAttribute.toString(); + String normedTerm = normalizer.normalize(term); + int start = offsetAttribute.startOffset(); + int end = offsetAttribute.endOffset(); + Token token = new Token(start, end, normedTerm); + tokens.add(token); + } + chineseTokenizer.end(); // TODO needed ? + chineseTokenizer.close(); // TODO needed ? + end(); // TODO needed ? + close(); // TODO needed ? 
+ } catch (IOException e) { + throw new ApplicationException(e); + } + return tokens; + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,68 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; + +import java.io.IOException; +import java.io.Reader; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; + +public class XmlTokenizer { + private Reader input; + private String language = "eng"; // default: english + private String[] normFunctions = {"specialNorm"}; // default: use special norm function + private String[] nwbElements = {"lb", "br", "cb", "figure", "image", "handwritten", "anchor", "emph", "note"}; // non word breaking elements, default: these elements + private String[] stopElements = {}; // default: no stop elements + private String[] outputOptions = {}; + + public XmlTokenizer(Reader input) { + this.input = input; + } + + public void setLanguage(String lang) { + String language = Language.getInstance().getLanguageId(lang); + this.language = language; + } + + public void setNormFunctions(String[] normFunctions) { + this.normFunctions = normFunctions; + } + + public void setNWBElements(String[] nwbElements) { + this.nwbElements = nwbElements; + } + + public void setStopElements(String[] stopElements) { + this.stopElements = stopElements; + } + + public void setOutputOptions(String[] outputOptions) { + this.outputOptions = outputOptions; + } + + public String tokenize() throws ApplicationException { + String 
retString = null; + try { + XmlTokenizerContentHandler dictContentHandler = new XmlTokenizerContentHandler(normFunctions, language); + dictContentHandler.setStopElements(stopElements); + dictContentHandler.setNWBElements(nwbElements); + dictContentHandler.setOutputOptions(outputOptions); + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(dictContentHandler); + InputSource inputSource = new InputSource(input); + xmlParser.parse(inputSource); + retString = dictContentHandler.getXmlFragment(); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return retString; + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,426 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; + +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Hashtable; + +import org.xml.sax.*; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; + +public class XmlTokenizerContentHandler implements ContentHandler { + private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString(); // word delimiting element + private static String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element + private static int COMPLEX_ELEMENT_MARK_SIZE = COMPLEX_ELEMENT_MARK.length(); + private 
static int ELEMENT_TYPE_CHARACTERS = 1; + private static int ELEMENT_TYPE_COMPLEX = 2; + private String[] normalizeFunctions = {}; // default: without normalize functions + private String[] nwbElements = {}; // non word breaking elements, default: these elements + private String[] stopElements = {}; // default: no stop elements + private String[] outputOptions = {}; + private String xmlnsString = ""; + private String language; + private String outputXmlFragment = ""; + private Element rootElement; + private Element currentElement; + private ArrayList elementQueue; + + public XmlTokenizerContentHandler(String[] normalizeFunctions, String language) throws ApplicationException { + if (normalizeFunctions == null) { + String[] emptyFunctions = {}; + this.normalizeFunctions = emptyFunctions; + } else { + this.normalizeFunctions = normalizeFunctions; + } + this.language = language; + } + + public void setNWBElements(String[] nwbElements) { + this.nwbElements = nwbElements; + } + + public void setStopElements(String[] stopElements) { + this.stopElements = stopElements; + } + + public void setOutputOptions(String[] outputOptions) { + this.outputOptions = outputOptions; + } + + public String getXmlFragment() { + return outputXmlFragment; + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + try { + String rootElemToStr = rootElement.toXmlString(); + write(rootElemToStr); + write("\n"); + } catch (NullPointerException e) { + throw new SAXException(e); + } + } + + public void characters(char[] c, int start, int length) throws SAXException { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (charactersStr != null && ! 
charactersStr.equals("")) { + if (currentElement != null) { + Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS); + charElement.value = StringUtils.deresolveXmlEntities(charactersStr); + if (currentElement.composites == null) + currentElement.composites = new ArrayList(); + currentElement.composites.add(charElement); + } + } + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(Locator locator) { + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; + if (prefix != null && prefix.equals("")) + xmlnsString = "xmlns" + prefix + "=\"" + uri + "\" "; + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + if (elementQueue == null) + elementQueue = new ArrayList(); + Element newElement = new Element(name); // element of type: complex + if (currentElement != null) { + if (currentElement.composites == null) + currentElement.composites = new ArrayList(); + if (currentElement.lang != null) + newElement.lang = currentElement.lang; // language is inherited to childs + currentElement.composites.add(newElement); + } + currentElement = newElement; + int attrSize = attrs.getLength(); + String attrString = ""; + for (int i=0; i 0) { + int lastIndex = elementQueue.size() - 1; + elementQueue.remove(lastIndex); + } + if (elementQueue != null && elementQueue.size() > 0) { + int lastIndex = elementQueue.size() - 1; + currentElement = elementQueue.get(lastIndex); + } else { + currentElement = null; + } + } + + private boolean withForms() { + boolean result = false; + for (int i=0; i< outputOptions.length; 
i++) { + String function = outputOptions[i]; + if (function.equals("withForms")) + return true; + } + return result; + } + + private boolean withLemmas() { + boolean result = false; + for (int i=0; i< outputOptions.length; i++) { + String function = outputOptions[i]; + if (function.equals("withLemmas")) + return true; + } + return result; + } + + private void write(String outStr) throws SAXException { + outputXmlFragment += outStr; + } + + private class Element { + private int type; + private String name; + private String xmlnsString; + private String attrString; + private String value; + private String lang; // normally value of attribute xml:lang or the inherited xml:lang value of the father node + private ArrayList composites; + + private Element(String name) { + this.type = ELEMENT_TYPE_COMPLEX; + this.name = name; + } + + private Element(String name, int type) { + this.type = type; + this.name = name; + } + + private boolean isComplex() { + boolean isComplex = false; + if (type == ELEMENT_TYPE_COMPLEX) + isComplex = true; + return isComplex; + } + + private boolean isWordDelimiterElement() { + boolean isWordDelimiterElement = true; + for (int i=0; i"; + } else { + retString = retString + "<" + name + " " + xmlNsString + attrString + ">"; + } + if (composites != null) { + String compositesCharsWithMarks = ""; + ArrayList complexElements = new ArrayList(); + for (int i=0; i) + } else { + compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_MARK; // add a special mark symbol at the position of the "word delimiter element" (e.g. ) + } + complexElements.add(composite); + } + } + // compositesCharsWithMarks = compositesCharsWithMarks.replaceAll(COMPLEX_ELEMENT_NWD_MARK + " +", COMPLEX_ELEMENT_NWD_MARK); // remove Blanks after the non word breaking mark (e.g. 
"praebi ta" is changed to "praebita") + String compositesCharsWithMarksWithWordTags = insertWordTags(compositesCharsWithMarks, elemLanguage); + compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.replaceAll(COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_MARK); // mark symbols are unified to COMPLEX_ELEMENT_MARK to replace them by the element values + if (complexElements.size() > 0) { + for (int i=0; i 0) { + firstPiece = compositesCharsWithMarksWithWordTags.substring(0, indexComplexElemCompositesCharsWithMarks); + compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(indexComplexElemCompositesCharsWithMarks); + } + retString = retString + firstPiece + complexElementStr; + compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(COMPLEX_ELEMENT_MARK_SIZE); + } + retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added + } else { + retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added + } + } + retString = retString + ""; + } + return retString; + } + + private String insertWordTags(String charactersStrDeresolved, String language) throws SAXException { + String charactersStr = StringUtils.resolveXmlEntities(charactersStrDeresolved); + String retStr = ""; + try { + Tokenizer tokenizer = new Tokenizer(new StringReader(charactersStr)); + tokenizer.setLanguage(language); + tokenizer.setNormFunctions(normalizeFunctions); + ArrayList tokens = tokenizer.getTokens(); + int endPos = 0; + for (int i=0; i < tokens.size(); i++) { + Token token = tokens.get(i); + String wordForm = token.getContent(); + int startPos = token.getStart(); + String beforeStr = charactersStr.substring(endPos, startPos); + endPos = token.getEnd(); + String beforeStrDeresolved = StringUtils.deresolveXmlEntities(beforeStr); + String origWordForm = charactersStr.substring(startPos, endPos); + String wordTag = insertWordTags(wordForm, language, 
origWordForm); + retStr = retStr + beforeStrDeresolved + wordTag; + } + String lastAfterStr = charactersStr.substring(endPos); + String lastAfterStrDeresolved = StringUtils.deresolveXmlEntities(lastAfterStr); + retStr = retStr + lastAfterStrDeresolved; + } catch (ApplicationException e) { + throw new SAXException(e); + } + return retStr; + } + + private String insertWordTags(String wordForm, String language, String origWordForm) throws ApplicationException { + String wordTag = null; + if (origWordForm != null && origWordForm.equals(COMPLEX_ELEMENT_NWD_MARK)) + return origWordForm; + if (isStopElement()) + return origWordForm; + wordForm = removeSpecialSymbols(wordForm); + wordForm = wordForm.toLowerCase(); + String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm); + ArrayList lemmas = null; + if (withForms() || withLemmas()) { + LexHandler lexHandler = LexHandler.getInstance(); + lemmas = lexHandler.getLemmas(wordForm, "form", language, "none"); + } + wordTag = insertWordTags(origWordFormDeresolved, wordForm, language, null, lemmas); + return wordTag; + } + + /** + * + * @param origWordToken could contain nwd marks + * @param wordForm contains no nwd marks + * @param language + * @param origWordFormNormalized + * @param lemmas + * @return for each substring between nwd marks create a word tag + */ + private String insertWordTags(String origWordToken, String wordForm, String language, String origWordFormNormalized, ArrayList lemmas) { + if (origWordToken.isEmpty()) + return origWordToken; + if (origWordToken.equals(COMPLEX_ELEMENT_NWD_MARK)) + return COMPLEX_ELEMENT_NWD_MARK; + String retWordTags = ""; + String origWordTokenTmp = origWordToken; + while (! 
origWordTokenTmp.isEmpty()) { + if (origWordTokenTmp.startsWith(COMPLEX_ELEMENT_NWD_MARK)) { // single nwd mark + origWordTokenTmp = origWordTokenTmp.substring(1); + retWordTags = retWordTags + COMPLEX_ELEMENT_NWD_MARK; + } else { + int indexUpToNWD = origWordTokenTmp.indexOf(COMPLEX_ELEMENT_NWD_MARK); + if (indexUpToNWD != -1) { // not end of string reached + String origWordTokenFragment = origWordTokenTmp.substring(0, indexUpToNWD); + String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas); + retWordTags = retWordTags + origWordTokenFragmentWithTags + COMPLEX_ELEMENT_NWD_MARK; + origWordTokenTmp = origWordTokenTmp.substring(indexUpToNWD + 1); + } else { // end of string reached + String origWordTokenFragment = origWordTokenTmp.substring(0, origWordTokenTmp.length()); + String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas); + retWordTags = retWordTags + origWordTokenFragmentWithTags; + origWordTokenTmp = ""; // finente + } + } + } + return retWordTags; + } + + private String getWordTag(String origWordForm, String wordForm, String language, String origWordFormNormalized, ArrayList lemmas) { + if (origWordForm == null || origWordForm.isEmpty()) + return ""; + String langISOCode = Language.getInstance().getISO639Code(language); + String retStr = " formsHashtable = new Hashtable(); + for (int i=0; i < lemmas.size(); i++) { + Lemma lemma = lemmas.get(i); + ArrayList lemmaForms = lemma.getFormsList(); + for (int j=0; j < lemmaForms.size(); j++) { + Form form = lemmaForms.get(j); + formsHashtable.put(form.getFormName(), form); + } + String lemmaName = lemma.getLemmaName(); + lemmasStr = lemmasStr + lemmaName + " "; + } + ArrayList forms = new ArrayList(); + forms.addAll(formsHashtable.values()); + Collections.sort(forms); + for (int i=0; i < forms.size(); i++) { + Form form = forms.get(i); + String formName = form.getFormName(); + 
formName = StringUtils.forXML(formName); + formsStr = formsStr + formName + " "; + } + if (formsStr.endsWith(" ")) + formsStr = formsStr.substring(0, formsStr.length() - 1); + if (lemmasStr.endsWith(" ")) + lemmasStr = lemmasStr.substring(0, lemmasStr.length() - 1); + if (withForms()) + retStr = retStr + " forms=\"" + formsStr + "\""; + if (withLemmas()) + retStr = retStr + " lemmas=\"" + lemmasStr + "\""; + } + retStr = retStr + ">" + origWordForm + ""; + return retStr; + } + + private String removeSpecialSymbols(String inputStr) { + String retStr = inputStr.replaceAll(" |\n|\t|-|\u2424|\u2425", ""); + return retStr; + } + + } +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Betacode2Unicode.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Betacode2Unicode.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,332 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +%% +%{ + /* + * Betacode to Unicode conversion + */ + + private String toUnicode(int in) { + char c = (char) in; + Character ch = new Character(c); + String retString = ch.toString(); + return retString; + } + +%} + +%class Betacode2UnicodeLex +%public +%type java.lang.String +%unicode +%% + + +"<"[^>]+">" { return yytext(); } + +"*j" { return "H"; } +"j" { return "h"; } +"*v" { return "F"; } +"v" { return "f"; } +"*s" { return toUnicode(0x03a3); } + +"!" 
{ return "."; } +":" { return toUnicode(0x00B7); } /* MPDL update */ + +"a)" { return toUnicode(0x1F00); } +"a(" { return toUnicode(0x1F01); } +"a)\\" { return toUnicode(0x1F02); } +"a(\\" { return toUnicode(0x1F03); } +"a)/" { return toUnicode(0x1F04); } +"a(/" { return toUnicode(0x1F05); } +"a)=" { return toUnicode(0x1F06); } +"a(=" { return toUnicode(0x1F07); } +"*)a" { return toUnicode(0x1F08); } +"*(a" { return toUnicode(0x1F09); } +"*)\\a" { return toUnicode(0x1F0A); } +"*(\\a" { return toUnicode(0x1F0B); } +"*)/a" { return toUnicode(0x1F0C); } +"*(/a" { return toUnicode(0x1F0D); } +"*)=a" { return toUnicode(0x1F0E); } +"*(=a" { return toUnicode(0x1F0F); } +"e)" { return toUnicode(0x1F10); } +"e(" { return toUnicode(0x1F11); } +"e)\\" { return toUnicode(0x1F12); } +"e(\\" { return toUnicode(0x1F13); } +"e)/" { return toUnicode(0x1F14); } +"e(/" { return toUnicode(0x1F15); } +"*)e" { return toUnicode(0x1F18); } +"*(e" { return toUnicode(0x1F19); } +"*)\\e" { return toUnicode(0x1F1A); } +"*(\\e" { return toUnicode(0x1F1B); } +"*)/e" { return toUnicode(0x1F1C); } +"*(/e" { return toUnicode(0x1F1D); } +"h)" { return toUnicode(0x1F20); } +"h(" { return toUnicode(0x1F21); } +"h)\\" { return toUnicode(0x1F22); } +"h(\\" { return toUnicode(0x1F23); } +"h)/" { return toUnicode(0x1F24); } +"h(/" { return toUnicode(0x1F25); } +"h)=" { return toUnicode(0x1F26); } +"h(=" { return toUnicode(0x1F27); } +"*)h" { return toUnicode(0x1F28); } +"*(h" { return toUnicode(0x1F29); } +"*)\\h" { return toUnicode(0x1F2A); } +"*(\\h" { return toUnicode(0x1F2B); } +"*)/h" { return toUnicode(0x1F2C); } +"*(/h" { return toUnicode(0x1F2D); } +"*)=h" { return toUnicode(0x1F2E); } +"*(=h" { return toUnicode(0x1F2F); } +"i)" { return toUnicode(0x1F30); } +"i(" { return toUnicode(0x1F31); } +"i)\\" { return toUnicode(0x1F32); } +"i(\\" { return toUnicode(0x1F33); } +"i)/" { return toUnicode(0x1F34); } +"i(/" { return toUnicode(0x1F35); } +"i)=" { return toUnicode(0x1F36); } +"i(=" { return 
toUnicode(0x1F37); } +"*)i" { return toUnicode(0x1F38); } +"*(i" { return toUnicode(0x1F39); } +"*)\\i" { return toUnicode(0x1F3A); } +"*(\\i" { return toUnicode(0x1F3B); } +"*)/i" { return toUnicode(0x1F3C); } +"*(/i" { return toUnicode(0x1F3D); } +"*)=i" { return toUnicode(0x1F3E); } +"*(=i" { return toUnicode(0x1F3F); } +"o)" { return toUnicode(0x1F40); } +"o(" { return toUnicode(0x1F41); } +"o)\\" { return toUnicode(0x1F42); } +"o(\\" { return toUnicode(0x1F43); } +"o)/" { return toUnicode(0x1F44); } +"o(/" { return toUnicode(0x1F45); } +"*)o" { return toUnicode(0x1F48); } +"*(o" { return toUnicode(0x1F49); } +"*)\\o" { return toUnicode(0x1F4A); } +"*(\\o" { return toUnicode(0x1F4B); } +"*)/o" { return toUnicode(0x1F4C); } +"*(/o" { return toUnicode(0x1F4D); } +"u)" { return toUnicode(0x1F50); } +"u(" { return toUnicode(0x1F51); } +"u)\\" { return toUnicode(0x1F52); } +"u(\\" { return toUnicode(0x1F53); } +"u)/" { return toUnicode(0x1F54); } +"u(/" { return toUnicode(0x1F55); } +"u)=" { return toUnicode(0x1F56); } +"u(=" { return toUnicode(0x1F57); } +"*(u" { return toUnicode(0x1F59); } +"*(\\u" { return toUnicode(0x1F5B); } +"*(/u" { return toUnicode(0x1F5D); } +"*(=u" { return toUnicode(0x1F5F); } +"w)" { return toUnicode(0x1F60); } +"w(" { return toUnicode(0x1F61); } +"w)\\" { return toUnicode(0x1F62); } +"w(\\" { return toUnicode(0x1F63); } +"w)/" { return toUnicode(0x1F64); } +"w(/" { return toUnicode(0x1F65); } +"w)=" { return toUnicode(0x1F66); } +"w(=" { return toUnicode(0x1F67); } +"*)w" { return toUnicode(0x1F68); } +"*(w" { return toUnicode(0x1F69); } +"*)\\w" { return toUnicode(0x1F6A); } +"*(\\w" { return toUnicode(0x1F6B); } +"*)/w" { return toUnicode(0x1F6C); } +"*(/w" { return toUnicode(0x1F6D); } +"*)=w" { return toUnicode(0x1F6E); } +"*(=w" { return toUnicode(0x1F6F); } +"a\\" { return toUnicode(0x1F70); } +"a/" { return toUnicode(0x1F71); } +"e\\" { return toUnicode(0x1F72); } +"e/" { return toUnicode(0x1F73); } +"h\\" { return 
toUnicode(0x1F74); } +"h/" { return toUnicode(0x1F75); } +"i\\" { return toUnicode(0x1F76); } +"i/" { return toUnicode(0x1F77); } +"o\\" { return toUnicode(0x1F78); } +"o/" { return toUnicode(0x1F79); } +"u\\" { return toUnicode(0x1F7A); } +"u/" { return toUnicode(0x1F7B); } +"w\\" { return toUnicode(0x1F7C); } +"w/" { return toUnicode(0x1F7D); } +"a)|" { return toUnicode(0x1F80); } +"a(|" { return toUnicode(0x1F81); } +"a)\\|" { return toUnicode(0x1F82); } +"a(\\|" { return toUnicode(0x1F83); } +"a)/|" { return toUnicode(0x1F84); } +"a(/|" { return toUnicode(0x1F85); } +"a)=|" { return toUnicode(0x1F86); } +"a(=|" { return toUnicode(0x1F87); } +"*)|a" { return toUnicode(0x1F88); } +"*(|a" { return toUnicode(0x1F89); } +"*)\\|a" { return toUnicode(0x1F8A); } +"*(\\|a" { return toUnicode(0x1F8B); } +"*)/|a" { return toUnicode(0x1F8C); } +"*(/|a" { return toUnicode(0x1F8D); } +"*)=|a" { return toUnicode(0x1F8E); } +"*(=|a" { return toUnicode(0x1F8F); } +"h)|" { return toUnicode(0x1F90); } +"h(|" { return toUnicode(0x1F91); } +"h)\\|" { return toUnicode(0x1F92); } +"h(\\|" { return toUnicode(0x1F93); } +"h)/|" { return toUnicode(0x1F94); } +"h(/|" { return toUnicode(0x1F95); } +"h)=|" { return toUnicode(0x1F96); } +"h(=|" { return toUnicode(0x1F97); } +"*)|h" { return toUnicode(0x1F98); } +"*(|h" { return toUnicode(0x1F99); } +"*)\\|h" { return toUnicode(0x1F9A); } +"*(\\|h" { return toUnicode(0x1F9B); } +"*)/|h" { return toUnicode(0x1F9C); } +"*(/|h" { return toUnicode(0x1F9D); } +"*)=|h" { return toUnicode(0x1F9E); } +"*(=|h" { return toUnicode(0x1F9F); } +"w)|" { return toUnicode(0x1FA0); } +"w(|" { return toUnicode(0x1FA1); } +"w)\\|" { return toUnicode(0x1FA2); } +"w(\\|" { return toUnicode(0x1FA3); } +"w)/|" { return toUnicode(0x1FA4); } +"w(/|" { return toUnicode(0x1FA5); } +"w)=|" { return toUnicode(0x1FA6); } +"w(=|" { return toUnicode(0x1FA7); } +"*)|w" { return toUnicode(0x1FA8); } +"*(|w" { return toUnicode(0x1FA9); } +"*)\\|w" { return toUnicode(0x1FAA); 
} +"*(\\|w" { return toUnicode(0x1FAB); } +"*)/|w" { return toUnicode(0x1FAC); } +"*(/|w" { return toUnicode(0x1FAD); } +"*)=|w" { return toUnicode(0x1FAE); } +"*(=|w" { return toUnicode(0x1FAF); } +"a^" { return toUnicode(0x1FB0); } +"a_" { return toUnicode(0x1FB1); } +"a\\|" { return toUnicode(0x1FB2); } +"a|" { return toUnicode(0x1FB3); } +"a/|" { return toUnicode(0x1FB4); } +"a=" { return toUnicode(0x1FB6); } +"a=|" { return toUnicode(0x1FB7); } +"*a^" { return toUnicode(0x1FB8); } +"*a_" { return toUnicode(0x1FB9); } +"*a\\" { return toUnicode(0x1FBA); } +"*a/" { return toUnicode(0x1FBB); } +"*a|" { return toUnicode(0x1FBC); } +"h\\|" { return toUnicode(0x1FC2); } +"h|" { return toUnicode(0x1FC3); } +"h/|" { return toUnicode(0x1FC4); } +"h=" { return toUnicode(0x1FC6); } +"h=|" { return toUnicode(0x1FC7); } +"*e\\" { return toUnicode(0x1FC8); } +"*e/" { return toUnicode(0x1FC9); } +"*h\\" { return toUnicode(0x1FCA); } +"*h/" { return toUnicode(0x1FCB); } +"*h|" { return toUnicode(0x1FCC); } +"i^" { return toUnicode(0x1FD0); } +"i_" { return toUnicode(0x1FD1); } +"i+\\" { return toUnicode(0x1FD2); } +"i+/" { return toUnicode(0x1FD3); } +"i=" { return toUnicode(0x1FD6); } +"i+=" { return toUnicode(0x1FD7); } +"*i^" { return toUnicode(0x1FD8); } +"*i_" { return toUnicode(0x1FD9); } +"*i\\" { return toUnicode(0x1FDA); } +"*i/" { return toUnicode(0x1FDB); } +"u^" { return toUnicode(0x1FE0); } +"u_" { return toUnicode(0x1FE1); } +"u+\\" { return toUnicode(0x1FE2); } +"u+/" { return toUnicode(0x1FE3); } +"r)" { return toUnicode(0x1FE4); } +"r(" { return toUnicode(0x1FE5); } +"u=" { return toUnicode(0x1FE6); } +"u+=" { return toUnicode(0x1FE7); } +"*u^" { return toUnicode(0x1FE8); } +"*u_" { return toUnicode(0x1FE9); } +"*u\\" { return toUnicode(0x1FEA); } +"*u/" { return toUnicode(0x1FEB); } +"*(r" { return toUnicode(0x1FEC); } +"w\\|" { return toUnicode(0x1FF2); } +"w|" { return toUnicode(0x1FF3); } +"w/|" { return toUnicode(0x1FF4); } +"*w\\" { return 
toUnicode(0x1FFA); } +"*w/" { return toUnicode(0x1FFB); } +"*w|" { return toUnicode(0x1FFC); } +"w=" { return toUnicode(0x1FF6); } +"w=|" { return toUnicode(0x1FF7); } +"*o\\" { return toUnicode(0x1FF8); } +"*o/" { return toUnicode(0x1FF9); } + +"\\" { return toUnicode(0x0300); } +"/" { return toUnicode(0x0301); } +"_" { return toUnicode(0x0304); } +"^" { return toUnicode(0x0306); } +"+" { return toUnicode(0x0308); } +"=" { return toUnicode(0x0302); } +")" { return toUnicode(0x0313); } +"(" { return toUnicode(0x0314); } +"?" { return toUnicode(0x0323); } +"|" { return toUnicode(0x0345); } + +"a" { return toUnicode(0x03b1); } /* MPDL update */ +"*a" { return toUnicode(0x0391); } /* MPDL update */ +"b" { return toUnicode(0x03b2); } /* MPDL update */ +"*b" { return toUnicode(0x0392); } /* MPDL update */ +"g" { return toUnicode(0x03b3); } /* MPDL update */ +"*g" { return toUnicode(0x0393); } /* MPDL update */ +"d" { return toUnicode(0x03b4); } /* MPDL update */ +"*d" { return toUnicode(0x0394); } /* MPDL update */ +"e" { return toUnicode(0x03b5); } /* MPDL update */ +"*e" { return toUnicode(0x0395); } /* MPDL update */ +"z" { return toUnicode(0x03b6); } /* MPDL update */ +"*z" { return toUnicode(0x0396); } /* MPDL update */ +"h" { return toUnicode(0x03b7); } /* MPDL update */ +"*h" { return toUnicode(0x0397); } /* MPDL update */ +"q" { return toUnicode(0x03b8); } /* MPDL update */ +"*q" { return toUnicode(0x0398); } /* MPDL update */ +"i" { return toUnicode(0x03b9); } /* MPDL update */ +"*i" { return toUnicode(0x0399); } /* MPDL update */ +"k" { return toUnicode(0x03ba); } /* MPDL update */ +"*k" { return toUnicode(0x039a); } /* MPDL update */ +"l" { return toUnicode(0x03bb); } /* MPDL update */ +"*l" { return toUnicode(0x039b); } /* MPDL update */ +"m" { return toUnicode(0x03bc); } /* MPDL update */ +"*m" { return toUnicode(0x039c); } /* MPDL update */ +"n" { return toUnicode(0x03bd); } /* MPDL update */ +"*n" { return toUnicode(0x039d); } /* MPDL update */ +"c" { 
return toUnicode(0x03be); } /* MPDL update */ +"*c" { return toUnicode(0x039e); } /* MPDL update */ +"o" { return toUnicode(0x03bf); } /* MPDL update */ +"*o" { return toUnicode(0x039f); } /* MPDL update */ +"p" { return toUnicode(0x03c0); } /* MPDL update */ +"*p" { return toUnicode(0x03a0); } /* MPDL update */ +"r" { return toUnicode(0x03c1); } /* MPDL update */ +"*r" { return toUnicode(0x03a1); } /* MPDL update */ + +"*s" { return toUnicode(0x03a3); } /* MPDL update */ +"s1" { return toUnicode(0x03c3); } /* mdh 2002-01-07 */ +"s"/\-\- { return toUnicode(0x03c2); } +"s"/\> }[a-z\?\!0-9*=\/()\'\-] { return toUnicode(0x03c3); } /* MPDL update */ +"s"/\< { return toUnicode(0x03c2); } /* MPDL update */ +"s"/[\[\]][a-z\?\!0-9*=\/()\'\-] { return toUnicode(0x03c3); } /* MPDL update */ +"s"/\??[^a-z0-9*=\/()\'\-\[\?] { return toUnicode(0x03c2); } +"s" { return toUnicode(0x03c3); } /* MPDL update */ + +"t" { return toUnicode(0x03c4); } /* MPDL update */ +"*t" { return toUnicode(0x03a4); } /* MPDL update */ +"u" { return toUnicode(0x03c5); } /* MPDL update */ +"*u" { return toUnicode(0x03a5); } /* MPDL update */ +"f" { return toUnicode(0x03c6); } /* MPDL update */ +"*f" { return toUnicode(0x03a6); } /* MPDL update */ +"x" { return toUnicode(0x03c7); } /* MPDL update */ +"*x" { return toUnicode(0x03a7); } /* MPDL update */ +"y" { return toUnicode(0x03c8); } /* MPDL update */ +"*y" { return toUnicode(0x03a8); } /* MPDL update */ +"w" { return toUnicode(0x03c9); } /* MPDL update */ +"*w" { return toUnicode(0x03a9); } /* MPDL update */ + +[\&_]"vert;" { return "|"; } +[\&_]"lpar;" { return "("; } +[\&_]"rpar;" { return ")"; } +[\_\&]"lt;" { return "<"; } +[\_\&]"gt;" { return ">"; } +"'" { return "'"; } /* MPDL update */ + +"&"[a-zA-Z]+";" { return yytext(); } + +. 
{ return yytext(); } +\n { return yytext(); } \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Betacode2Unicode.lex.old --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Betacode2Unicode.lex.old Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,318 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +%% +%{ + /* + * Betacode to Unicode conversion + */ + + private int isUpper = 0; + + private String toUnicodeGreek(int in) { + String retStr = toUnicode(in - (isUpper * 0x0020)); + isUpper = 0; + return retStr; + } + + private String toUnicode(int in) { + char c = (char) in; + Character ch = new Character(c); + String retString = ch.toString(); + return retString; + } + +%} + +%class Betacode2UnicodeLex +%public +%type java.lang.String +%unicode +%% + + +"<"[^>]+">" { return yytext(); } + +"*j" { return "H"; } +"j" { return "h"; } +"*v" { return "F"; } +"v" { return "f"; } +"*s" { return toUnicode(0x03a3); } + +"!" 
{ return "."; } +":" { return toUnicode(0x00B7); } /* MPDL update */ + +"a)" { return toUnicode(0x1F00); } +"a(" { return toUnicode(0x1F01); } +"a)\\" { return toUnicode(0x1F02); } +"a(\\" { return toUnicode(0x1F03); } +"a)/" { return toUnicode(0x1F04); } +"a(/" { return toUnicode(0x1F05); } +"a)=" { return toUnicode(0x1F06); } +"a(=" { return toUnicode(0x1F07); } +"*)a" { return toUnicode(0x1F08); } +"*(a" { return toUnicode(0x1F09); } +"*)\\a" { return toUnicode(0x1F0A); } +"*(\\a" { return toUnicode(0x1F0B); } +"*)/a" { return toUnicode(0x1F0C); } +"*(/a" { return toUnicode(0x1F0D); } +"*)=a" { return toUnicode(0x1F0E); } +"*(=a" { return toUnicode(0x1F0F); } +"e)" { return toUnicode(0x1F10); } +"e(" { return toUnicode(0x1F11); } +"e)\\" { return toUnicode(0x1F12); } +"e(\\" { return toUnicode(0x1F13); } +"e)/" { return toUnicode(0x1F14); } +"e(/" { return toUnicode(0x1F15); } +"*)e" { return toUnicode(0x1F18); } +"*(e" { return toUnicode(0x1F19); } +"*)\\e" { return toUnicode(0x1F1A); } +"*(\\e" { return toUnicode(0x1F1B); } +"*)/e" { return toUnicode(0x1F1C); } +"*(/e" { return toUnicode(0x1F1D); } +"h)" { return toUnicode(0x1F20); } +"h(" { return toUnicode(0x1F21); } +"h)\\" { return toUnicode(0x1F22); } +"h(\\" { return toUnicode(0x1F23); } +"h)/" { return toUnicode(0x1F24); } +"h(/" { return toUnicode(0x1F25); } +"h)=" { return toUnicode(0x1F26); } +"h(=" { return toUnicode(0x1F27); } +"*)h" { return toUnicode(0x1F28); } +"*(h" { return toUnicode(0x1F29); } +"*)\\h" { return toUnicode(0x1F2A); } +"*(\\h" { return toUnicode(0x1F2B); } +"*)/h" { return toUnicode(0x1F2C); } +"*(/h" { return toUnicode(0x1F2D); } +"*)=h" { return toUnicode(0x1F2E); } +"*(=h" { return toUnicode(0x1F2F); } +"i)" { return toUnicode(0x1F30); } +"i(" { return toUnicode(0x1F31); } +"i)\\" { return toUnicode(0x1F32); } +"i(\\" { return toUnicode(0x1F33); } +"i)/" { return toUnicode(0x1F34); } +"i(/" { return toUnicode(0x1F35); } +"i)=" { return toUnicode(0x1F36); } +"i(=" { return 
toUnicode(0x1F37); } +"*)i" { return toUnicode(0x1F38); } +"*(i" { return toUnicode(0x1F39); } +"*)\\i" { return toUnicode(0x1F3A); } +"*(\\i" { return toUnicode(0x1F3B); } +"*)/i" { return toUnicode(0x1F3C); } +"*(/i" { return toUnicode(0x1F3D); } +"*)=i" { return toUnicode(0x1F3E); } +"*(=i" { return toUnicode(0x1F3F); } +"o)" { return toUnicode(0x1F40); } +"o(" { return toUnicode(0x1F41); } +"o)\\" { return toUnicode(0x1F42); } +"o(\\" { return toUnicode(0x1F43); } +"o)/" { return toUnicode(0x1F44); } +"o(/" { return toUnicode(0x1F45); } +"*)o" { return toUnicode(0x1F48); } +"*(o" { return toUnicode(0x1F49); } +"*)\\o" { return toUnicode(0x1F4A); } +"*(\\o" { return toUnicode(0x1F4B); } +"*)/o" { return toUnicode(0x1F4C); } +"*(/o" { return toUnicode(0x1F4D); } +"u)" { return toUnicode(0x1F50); } +"u(" { return toUnicode(0x1F51); } +"u)\\" { return toUnicode(0x1F52); } +"u(\\" { return toUnicode(0x1F53); } +"u)/" { return toUnicode(0x1F54); } +"u(/" { return toUnicode(0x1F55); } +"u)=" { return toUnicode(0x1F56); } +"u(=" { return toUnicode(0x1F57); } +"*(u" { return toUnicode(0x1F59); } +"*(\\u" { return toUnicode(0x1F5B); } +"*(/u" { return toUnicode(0x1F5D); } +"*(=u" { return toUnicode(0x1F5F); } +"w)" { return toUnicode(0x1F60); } +"w(" { return toUnicode(0x1F61); } +"w)\\" { return toUnicode(0x1F62); } +"w(\\" { return toUnicode(0x1F63); } +"w)/" { return toUnicode(0x1F64); } +"w(/" { return toUnicode(0x1F65); } +"w)=" { return toUnicode(0x1F66); } +"w(=" { return toUnicode(0x1F67); } +"*)w" { return toUnicode(0x1F68); } +"*(w" { return toUnicode(0x1F69); } +"*)\\w" { return toUnicode(0x1F6A); } +"*(\\w" { return toUnicode(0x1F6B); } +"*)/w" { return toUnicode(0x1F6C); } +"*(/w" { return toUnicode(0x1F6D); } +"*)=w" { return toUnicode(0x1F6E); } +"*(=w" { return toUnicode(0x1F6F); } +"a\\" { return toUnicode(0x1F70); } +"a/" { return toUnicode(0x1F71); } +"e\\" { return toUnicode(0x1F72); } +"e/" { return toUnicode(0x1F73); } +"h\\" { return 
toUnicode(0x1F74); } +"h/" { return toUnicode(0x1F75); } +"i\\" { return toUnicode(0x1F76); } +"i/" { return toUnicode(0x1F77); } +"o\\" { return toUnicode(0x1F78); } +"o/" { return toUnicode(0x1F79); } +"u\\" { return toUnicode(0x1F7A); } +"u/" { return toUnicode(0x1F7B); } +"w\\" { return toUnicode(0x1F7C); } +"w/" { return toUnicode(0x1F7D); } +"a)|" { return toUnicode(0x1F80); } +"a(|" { return toUnicode(0x1F81); } +"a)\\|" { return toUnicode(0x1F82); } +"a(\\|" { return toUnicode(0x1F83); } +"a)/|" { return toUnicode(0x1F84); } +"a(/|" { return toUnicode(0x1F85); } +"a)=|" { return toUnicode(0x1F86); } +"a(=|" { return toUnicode(0x1F87); } +"*)|a" { return toUnicode(0x1F88); } +"*(|a" { return toUnicode(0x1F89); } +"*)\\|a" { return toUnicode(0x1F8A); } +"*(\\|a" { return toUnicode(0x1F8B); } +"*)/|a" { return toUnicode(0x1F8C); } +"*(/|a" { return toUnicode(0x1F8D); } +"*)=|a" { return toUnicode(0x1F8E); } +"*(=|a" { return toUnicode(0x1F8F); } +"h)|" { return toUnicode(0x1F90); } +"h(|" { return toUnicode(0x1F91); } +"h)\\|" { return toUnicode(0x1F92); } +"h(\\|" { return toUnicode(0x1F93); } +"h)/|" { return toUnicode(0x1F94); } +"h(/|" { return toUnicode(0x1F95); } +"h)=|" { return toUnicode(0x1F96); } +"h(=|" { return toUnicode(0x1F97); } +"*)|h" { return toUnicode(0x1F98); } +"*(|h" { return toUnicode(0x1F99); } +"*)\\|h" { return toUnicode(0x1F9A); } +"*(\\|h" { return toUnicode(0x1F9B); } +"*)/|h" { return toUnicode(0x1F9C); } +"*(/|h" { return toUnicode(0x1F9D); } +"*)=|h" { return toUnicode(0x1F9E); } +"*(=|h" { return toUnicode(0x1F9F); } +"w)|" { return toUnicode(0x1FA0); } +"w(|" { return toUnicode(0x1FA1); } +"w)\\|" { return toUnicode(0x1FA2); } +"w(\\|" { return toUnicode(0x1FA3); } +"w)/|" { return toUnicode(0x1FA4); } +"w(/|" { return toUnicode(0x1FA5); } +"w)=|" { return toUnicode(0x1FA6); } +"w(=|" { return toUnicode(0x1FA7); } +"*)|w" { return toUnicode(0x1FA8); } +"*(|w" { return toUnicode(0x1FA9); } +"*)\\|w" { return toUnicode(0x1FAA); 
} +"*(\\|w" { return toUnicode(0x1FAB); } +"*)/|w" { return toUnicode(0x1FAC); } +"*(/|w" { return toUnicode(0x1FAD); } +"*)=|w" { return toUnicode(0x1FAE); } +"*(=|w" { return toUnicode(0x1FAF); } +"a^" { return toUnicode(0x1FB0); } +"a_" { return toUnicode(0x1FB1); } +"a\\|" { return toUnicode(0x1FB2); } +"a|" { return toUnicode(0x1FB3); } +"a/|" { return toUnicode(0x1FB4); } +"a=" { return toUnicode(0x1FB6); } +"a=|" { return toUnicode(0x1FB7); } +"*a^" { return toUnicode(0x1FB8); } +"*a_" { return toUnicode(0x1FB9); } +"*a\\" { return toUnicode(0x1FBA); } +"*a/" { return toUnicode(0x1FBB); } +"*a|" { return toUnicode(0x1FBC); } +"h\\|" { return toUnicode(0x1FC2); } +"h|" { return toUnicode(0x1FC3); } +"h/|" { return toUnicode(0x1FC4); } +"h=" { return toUnicode(0x1FC6); } +"h=|" { return toUnicode(0x1FC7); } +"*e\\" { return toUnicode(0x1FC8); } +"*e/" { return toUnicode(0x1FC9); } +"*h\\" { return toUnicode(0x1FCA); } +"*h/" { return toUnicode(0x1FCB); } +"*h|" { return toUnicode(0x1FCC); } +"i^" { return toUnicode(0x1FD0); } +"i_" { return toUnicode(0x1FD1); } +"i+\\" { return toUnicode(0x1FD2); } +"i+/" { return toUnicode(0x1FD3); } +"i=" { return toUnicode(0x1FD6); } +"i+=" { return toUnicode(0x1FD7); } +"*i^" { return toUnicode(0x1FD8); } +"*i_" { return toUnicode(0x1FD9); } +"*i\\" { return toUnicode(0x1FDA); } +"*i/" { return toUnicode(0x1FDB); } +"u^" { return toUnicode(0x1FE0); } +"u_" { return toUnicode(0x1FE1); } +"u+\\" { return toUnicode(0x1FE2); } +"u+/" { return toUnicode(0x1FE3); } +"r)" { return toUnicode(0x1FE4); } +"r(" { return toUnicode(0x1FE5); } +"u=" { return toUnicode(0x1FE6); } +"u+=" { return toUnicode(0x1FE7); } +"*u^" { return toUnicode(0x1FE8); } +"*u_" { return toUnicode(0x1FE9); } +"*u\\" { return toUnicode(0x1FEA); } +"*u/" { return toUnicode(0x1FEB); } +"*(r" { return toUnicode(0x1FEC); } +"w\\|" { return toUnicode(0x1FF2); } +"w|" { return toUnicode(0x1FF3); } +"w/|" { return toUnicode(0x1FF4); } +"*w\\" { return 
toUnicode(0x1FFA); } +"*w/" { return toUnicode(0x1FFB); } +"*w|" { return toUnicode(0x1FFC); } +"w=" { return toUnicode(0x1FF6); } +"w=|" { return toUnicode(0x1FF7); } +"*o\\" { return toUnicode(0x1FF8); } +"*o/" { return toUnicode(0x1FF9); } + +"*" isUpper = 1; + +"\\" { return toUnicode(0x0300); } +"/" { return toUnicode(0x0301); } +"_" { return toUnicode(0x0304); } +"^" { return toUnicode(0x0306); } +"+" { return toUnicode(0x0308); } +"=" { return toUnicode(0x0302); } +")" { return toUnicode(0x0313); } +"(" { return toUnicode(0x0314); } +"?" { return toUnicode(0x0323); } +"|" { return toUnicode(0x0345); } + +"a" { return toUnicodeGreek(0x03b1); } +"b" { return toUnicodeGreek(0x03b2); } +"g" { return toUnicodeGreek(0x03b3); } +"d" { return toUnicodeGreek(0x03b4); } +"e" { return toUnicodeGreek(0x03b5); } +"z" { return toUnicodeGreek(0x03b6); } +"h" { return toUnicodeGreek(0x03b7); } +"q" { return toUnicodeGreek(0x03b8); } +"i" { return toUnicodeGreek(0x03b9); } +"k" { return toUnicodeGreek(0x03ba); } +"l" { return toUnicodeGreek(0x03bb); } +"m" { return toUnicodeGreek(0x03bc); } +"n" { return toUnicodeGreek(0x03bd); } +"c" { return toUnicodeGreek(0x03be); } +"o" { return toUnicodeGreek(0x03bf); } +"p" { return toUnicodeGreek(0x03c0); } +"r" { return toUnicodeGreek(0x03c1); } + +"s1" { return toUnicode(0x03c3); } /* mdh 2002-01-07 */ +"s"/\-\- { return toUnicode(0x03c2); } +"s"/\> }[a-z\?\!0-9*=\/()\'\-] { return toUnicodeGreek(0x03c3); } +"s"/\< { return toUnicodeGreek(0x03c2); } /* MPDL update */ +"s"/[\[\]][a-z\?\!0-9*=\/()\'\-] { return toUnicodeGreek(0x03c3); } +"s"/\??[^a-z0-9*=\/()\'\-\[\?] 
{ return toUnicode(0x03c2); } +"s" { return toUnicodeGreek(0x03c3); } + +"t" { return toUnicodeGreek(0x03c4); } +"u" { return toUnicodeGreek(0x03c5); } +"f" { return toUnicodeGreek(0x03c6); } +"x" { return toUnicodeGreek(0x03c7); } +"y" { return toUnicodeGreek(0x03c8); } +"w" { return toUnicodeGreek(0x03c9); } + +[\&_]"vert;" { return "|"; } +[\&_]"lpar;" { return "("; } +[\&_]"rpar;" { return ")"; } +[\_\&]"lt;" { return "<"; } +[\_\&]"gt;" { return ">"; } +"'" { return "'"; } /* MPDL update */ + +"&"[a-zA-Z]+";" { return yytext(); } + +. { return yytext(); } +\n { return yytext(); } \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Betacode2UnicodeLex.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Betacode2UnicodeLex.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1908 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.transcode; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 19.11.09 20:01 from the specification file + * /Users/jwillenborg/java/existDevMai2009/mpdl/extensions/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Betacode2Unicode.lex + */ +public class Betacode2UnicodeLex { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int YYINITIAL = 0; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + 
"\12\0\1\0\26\0\1\7\1\0\1\62\2\0\1\50\1\54\1\13"+ + "\1\12\1\3\1\30\1\0\1\47\1\0\1\15\1\63\1\46\1\54"+ + "\1\64\5\54\1\65\1\10\1\52\1\1\1\16\1\2\1\32\1\0"+ + "\32\66\1\56\1\14\1\55\1\26\1\27\1\0\1\11\1\33\1\44"+ + "\1\35\1\17\1\57\1\34\1\20\1\21\1\4\1\40\1\41\1\42"+ + "\1\43\1\22\1\45\1\37\1\31\1\6\1\51\1\23\1\5\1\24"+ + "\1\60\1\61\1\36\1\0\1\25\1\53\uff82\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\1\0\3\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+ + "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+ + "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37"+ + "\1\40\1\41\1\42\1\43\1\1\1\44\1\45\1\46"+ + "\1\47\1\0\1\50\1\51\1\52\1\53\2\0\1\54"+ + "\1\55\1\56\1\57\1\60\1\61\1\62\1\63\1\64"+ + "\1\65\1\66\1\67\1\70\1\71\1\72\1\73\1\74"+ + "\1\75\1\76\1\77\1\100\1\101\1\102\1\0\1\4"+ + "\1\0\2\102\1\0\1\103\1\104\1\105\1\106\1\107"+ + "\1\110\1\111\1\112\1\113\1\114\1\115\1\116\1\117"+ + "\1\120\1\121\1\122\1\123\1\124\1\125\1\126\1\127"+ + "\1\130\1\131\1\132\1\133\1\0\1\134\1\135\1\136"+ + "\1\137\1\140\1\141\1\142\1\143\1\144\1\145\1\146"+ + "\1\0\1\147\1\150\1\151\1\152\1\153\1\154\4\0"+ + "\1\155\1\156\6\0\1\157\1\160\1\161\1\162\1\163"+ + "\1\164\3\0\1\165\1\166\1\167\1\170\1\171\1\0"+ + "\1\172\3\0\1\173\1\174\1\175\1\176\1\177\1\200"+ + "\1\0\1\201\1\202\1\203\1\204\1\205\1\206\1\207"+ + "\1\210\1\211\1\212\1\213\1\214\1\215\1\216\1\217"+ + "\1\220\1\221\1\222\1\223\2\0\1\224\1\225\1\226"+ + "\1\227\1\230\1\231\1\232\1\233\1\234\1\235\1\236"+ + "\1\237\1\240\1\241\1\242\1\243\1\244\1\245\1\246"+ + "\1\247\1\250\1\251\1\252\1\253\1\254\1\255\1\256"+ + "\1\257\1\260\1\261\1\262\1\263\1\264\1\265\1\266"+ + "\1\267\1\270\1\271\1\272\1\273\1\274\1\275\1\276"+ + 
"\1\277\1\300\1\301\1\302\1\303\1\304\1\305\1\306"+ + "\1\307\1\310\1\311\1\312\1\313\1\314\1\315\1\316"+ + "\1\317\13\0\1\320\1\321\1\322\1\323\1\324\1\325"+ + "\1\0\1\326\1\327\1\330\1\331\1\332\1\333\1\0"+ + "\1\334\1\335\1\336\1\337\1\0\1\340\1\341\1\342"+ + "\1\343\1\344\1\345\1\346\1\347\1\350\1\351\1\0"+ + "\1\352\1\353\1\354\1\355\1\356\1\357\1\360\1\0"+ + "\1\361\1\362\1\363\1\364\1\365\1\0\1\366\1\367"+ + "\1\370\2\0\1\371\1\372\1\373\1\374\1\375\1\376"+ + "\1\377\1\u0100\1\u0101\1\u0102\1\u0103\1\u0104\1\u0105\1\u0106"+ + "\1\u0107\1\u0108\1\u0109\1\u010a\2\0\1\u010b\1\0\1\u010c"+ + "\4\0\1\u010d\1\u010e\1\u010f\1\u0110\1\u0111\1\u0112\1\u0113"+ + "\1\u0114\1\u0115\1\u0116\1\u0117\1\u0118\1\u0119\1\u011a\1\u011b"+ + "\1\u011c\1\u011d\1\u011e\10\0\1\u011f\1\u0120\1\u0121\1\u0122"; + + private static int [] zzUnpackAction() { + int [] result = new int[359]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\67\0\156\0\245\0\67\0\67\0\334\0\67"+ + "\0\67\0\u0113\0\67\0\67\0\67\0\67\0\67\0\u014a"+ + "\0\u0181\0\u01b8\0\u01ef\0\u0226\0\u025d\0\67\0\67\0\u0294"+ + "\0\67\0\u02cb\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\u0302\0\67"+ + "\0\67\0\67\0\67\0\u0339\0\67\0\67\0\67\0\u0370"+ + "\0\u03a7\0\u03de\0\u0415\0\u044c\0\u0483\0\u04ba\0\u04f1\0\u0528"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ 
+ "\0\67\0\u055f\0\67\0\u0596\0\u05cd\0\u0604\0\u0604\0\u063b"+ + "\0\u0672\0\u06a9\0\u06e0\0\u0717\0\67\0\67\0\67\0\u074e"+ + "\0\u0785\0\67\0\67\0\u07bc\0\u07f3\0\u082a\0\u0861\0\u0898"+ + "\0\67\0\u08cf\0\u0906\0\67\0\67\0\67\0\67\0\67"+ + "\0\u093d\0\u0974\0\u09ab\0\67\0\67\0\u09e2\0\u0a19\0\67"+ + "\0\67\0\67\0\67\0\67\0\u0a50\0\u0a87\0\u0abe\0\u0af5"+ + "\0\u0b2c\0\u0b63\0\67\0\u0b9a\0\u0bd1\0\u0c08\0\u0c3f\0\67"+ + "\0\67\0\u0c76\0\u0cad\0\u0ce4\0\u0d1b\0\u0d52\0\u0d89\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\u0dc0\0\u0df7\0\u0e2e"+ + "\0\67\0\67\0\67\0\67\0\67\0\u0e65\0\67\0\u0e9c"+ + "\0\u0ed3\0\u0f0a\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\u0f41\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\u0f78\0\u0faf\0\67\0\u0fe6"+ + "\0\u101d\0\u1054\0\67\0\u108b\0\u10c2\0\u10f9\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\u1130\0\u1167"+ + "\0\u119e\0\67\0\u11d5\0\u120c\0\u1243\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\u127a"+ + "\0\u12b1\0\u12e8\0\67\0\u131f\0\u1356\0\u138d\0\67\0\67"+ + "\0\67\0\67\0\u13c4\0\u13fb\0\u1432\0\u1469\0\u14a0\0\u14d7"+ + "\0\u150e\0\u1545\0\u157c\0\u15b3\0\u15ea\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\u1621\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\u1658\0\67\0\67\0\67\0\67\0\u168f"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\u16c6\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\u16fd\0\67\0\67\0\67\0\67\0\67"+ + "\0\u1734\0\67\0\67\0\67\0\u176b\0\u17a2\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\u17d9\0\u1810\0\67\0\u1847\0\67\0\u187e\0\u18b5\0\u18ec"+ + "\0\u1923\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\u195a\0\u1991\0\u19c8\0\u19ff\0\u1a36"+ + "\0\u1a6d\0\u1aa4\0\u1adb\0\67\0\67\0\67\0\67"; + + private static int [] zzUnpackRowMap() 
{ + int [] result = new int[359]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\2\1\3\1\2\1\4\1\5\1\6\1\7\1\10"+ + "\1\11\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+ + "\1\21\1\22\1\23\1\24\1\25\1\26\1\27\1\30"+ + "\1\31\1\32\1\33\1\34\1\35\1\36\1\37\1\40"+ + "\1\41\1\42\1\43\1\44\1\45\1\46\2\2\1\47"+ + "\1\50\5\2\1\51\1\52\1\53\5\2\67\0\2\54"+ + "\1\0\64\54\4\0\1\55\1\56\1\57\2\0\1\60"+ + "\1\61\1\62\3\0\1\63\1\64\1\65\1\66\1\67"+ + "\1\70\4\0\1\71\1\0\1\72\1\73\1\74\1\75"+ + "\1\76\1\77\1\100\1\101\1\102\1\103\1\104\3\0"+ + "\1\105\5\0\1\106\1\107\1\110\5\0\3\111\4\0"+ + "\2\111\3\0\1\111\10\0\4\111\1\0\1\112\13\0"+ + "\1\113\1\114\1\115\1\0\2\111\1\0\1\116\1\117"+ + "\3\0\1\111\3\0\1\111\12\0\1\120\1\121\1\122"+ + "\1\123\1\124\6\0\1\125\1\126\1\127\51\0\1\130"+ + "\1\131\1\132\1\133\63\0\1\134\1\135\1\136\1\137"+ + "\1\140\6\0\1\141\53\0\1\142\1\143\1\144\1\145"+ + "\1\146\7\0\1\147\1\150\1\151\50\0\1\152\1\153"+ + "\1\154\1\155\63\0\1\156\1\157\1\160\1\161\1\162"+ + "\7\0\1\163\1\164\1\165\50\0\1\166\1\167\1\170"+ + "\1\171\1\172\6\0\1\173\46\0\1\174\23\0\1\175"+ + "\2\0\1\176\4\0\1\177\37\0\1\200\1\201\57\0"+ + "\1\202\1\203\1\202\2\0\1\202\5\0\6\202\4\0"+ + "\1\204\1\0\1\202\1\205\4\202\1\206\4\202\3\0"+ + "\1\202\5\0\3\202\1\207\3\0\1\202\2\54\1\2"+ + "\64\54\14\0\1\210\1\211\7\0\1\212\1\213\1\214"+ + "\50\0\1\215\2\0\1\216\1\217\1\220\1\221\1\222"+ + "\1\223\1\224\1\0\1\225\1\226\52\0\1\227\2\0"+ + 
"\1\230\1\231\1\232\1\233\1\234\1\235\1\236\1\237"+ + "\1\240\1\241\3\0\1\242\51\0\1\243\1\244\65\0"+ + "\1\245\1\246\7\0\1\247\55\0\1\250\1\251\10\0"+ + "\1\252\1\253\53\0\1\254\1\255\65\0\1\256\1\257"+ + "\10\0\1\260\1\261\53\0\1\262\1\263\7\0\1\264"+ + "\41\0\3\111\4\0\2\111\3\0\1\111\10\0\4\111"+ + "\17\0\1\111\1\0\2\111\1\0\1\111\4\0\1\111"+ + "\3\0\1\111\47\0\1\111\53\0\1\265\4\0\1\266"+ + "\30\0\5\267\1\0\3\267\1\0\10\267\4\0\17\267"+ + "\1\0\1\267\2\0\1\267\2\0\3\267\1\0\3\267"+ + "\15\0\1\270\1\271\1\272\6\0\1\273\55\0\1\274"+ + "\1\275\1\276\6\0\1\277\66\0\1\300\66\0\1\301"+ + "\66\0\1\302\55\0\1\303\1\304\65\0\1\305\1\306"+ + "\65\0\1\307\1\310\1\311\6\0\1\312\55\0\1\313"+ + "\1\314\1\315\6\0\1\316\66\0\1\317\66\0\1\320"+ + "\66\0\1\321\55\0\1\322\1\323\1\324\64\0\1\325"+ + "\1\326\1\327\64\0\1\330\1\331\1\332\64\0\1\333"+ + "\1\334\65\0\1\335\1\336\65\0\1\337\1\340\1\341"+ + "\64\0\1\342\1\343\1\344\64\0\1\345\1\346\1\347"+ + "\64\0\1\350\1\351\1\352\6\0\1\353\55\0\1\354"+ + "\1\355\1\356\6\0\1\357\66\0\1\360\66\0\1\361"+ + "\66\0\1\362\60\0\1\363\114\0\1\364\72\0\1\365"+ + "\62\0\1\366\3\0\1\367\21\0\3\202\2\0\1\202"+ + "\5\0\6\202\4\0\1\202\1\0\13\202\3\0\1\202"+ + "\1\2\4\0\3\202\4\0\1\202\4\0\3\202\2\0"+ + "\1\202\5\0\1\370\5\202\4\0\1\202\1\0\13\202"+ + "\3\0\1\202\1\2\4\0\3\202\4\0\1\202\4\0"+ + "\3\202\2\0\1\202\5\0\6\202\4\0\1\202\1\0"+ + "\12\202\1\371\3\0\1\202\1\2\4\0\3\202\4\0"+ + "\1\202\4\0\3\202\2\0\1\202\5\0\6\202\4\0"+ + "\1\202\1\0\13\202\3\0\1\372\1\2\4\0\3\202"+ + "\4\0\1\202\4\0\3\202\2\0\1\202\5\0\6\202"+ + "\4\0\1\202\1\0\12\202\1\373\3\0\1\374\1\2"+ + "\4\0\3\202\4\0\1\202\63\0\1\375\14\0\1\376"+ + "\5\0\1\377\1\u0100\1\u0101\1\u0102\1\0\1\u0103\1\u0104"+ + "\52\0\1\u0105\5\0\1\u0106\1\u0107\1\u0108\1\u0109\1\0"+ + "\1\u010a\1\u010b\52\0\1\u010c\6\0\1\u010d\1\u010e\2\0"+ + "\1\u010f\1\u0110\52\0\1\u0111\6\0\1\u0112\3\0\1\u0113"+ + "\53\0\1\u0114\5\0\1\u0115\1\u0116\1\u0117\1\u0118\1\u0119"+ + 
"\1\u011a\1\u011b\52\0\1\u011c\5\0\1\u011d\1\u011e\1\u011f"+ + "\1\u0120\1\u0121\1\u0122\1\u0123\52\0\1\u0124\6\0\1\u0125"+ + "\1\u0126\1\0\1\u0127\1\u0128\1\u0129\52\0\1\u012a\6\0"+ + "\1\u012b\3\0\1\u012c\113\0\1\u012d\66\0\1\u012e\42\0"+ + "\1\u012f\66\0\1\u0130\66\0\1\u0131\66\0\1\u0132\66\0"+ + "\1\u0133\66\0\1\u0134\66\0\1\u0135\66\0\1\u0136\66\0"+ + "\1\u0137\66\0\1\u0138\66\0\1\u0139\66\0\1\u013a\66\0"+ + "\1\u013b\66\0\1\u013c\66\0\1\u013d\66\0\1\u013e\66\0"+ + "\1\u013f\66\0\1\u0140\72\0\1\u0141\46\0\1\u0142\127\0"+ + "\1\u0143\25\0\1\u0144\127\0\1\u0145\20\0\3\202\2\0"+ + "\1\202\5\0\6\202\4\0\1\u0146\1\0\13\202\3\0"+ + "\1\202\1\2\4\0\3\202\4\0\1\202\4\0\3\202"+ + "\2\0\1\u0147\5\0\6\202\4\0\1\202\1\0\13\202"+ + "\3\0\1\202\1\2\4\0\3\202\4\0\1\202\4\0"+ + "\3\202\2\0\1\202\5\0\6\202\4\0\1\202\1\0"+ + "\13\202\3\0\1\202\1\u0143\4\0\3\202\4\0\1\202"+ + "\4\0\3\202\2\0\1\u0148\5\0\6\202\4\0\1\202"+ + "\1\0\13\202\3\0\1\202\1\2\4\0\3\202\4\0"+ + "\1\202\4\0\3\202\2\0\1\202\5\0\6\202\4\0"+ + "\1\202\1\0\13\202\3\0\1\202\1\u0145\4\0\3\202"+ + "\4\0\1\202\64\0\1\u0149\13\0\1\u014a\6\0\1\u014b"+ + "\3\0\1\u014c\53\0\1\u014d\6\0\1\u014e\3\0\1\u014f"+ + "\53\0\1\u0150\6\0\1\u0151\3\0\1\u0152\53\0\1\u0153"+ + "\6\0\1\u0154\3\0\1\u0155\53\0\1\u0156\6\0\1\u0157"+ + "\3\0\1\u0158\53\0\1\u0159\6\0\1\u015a\3\0\1\u015b"+ + "\114\0\1\u015c\66\0\1\111\65\0\1\u015d\46\0\1\u015e"+ + "\66\0\1\u015f\41\0\3\202\2\0\1\202\5\0\6\202"+ + "\4\0\1\202\1\0\13\202\3\0\1\u0160\1\2\4\0"+ + "\3\202\4\0\1\202\4\0\3\202\2\0\1\202\5\0"+ + "\6\202\4\0\1\u0161\1\0\13\202\3\0\1\202\1\2"+ + "\4\0\3\202\4\0\1\202\4\0\3\202\2\0\1\202"+ + "\5\0\6\202\4\0\1\u0162\1\0\13\202\3\0\1\202"+ + "\1\2\4\0\3\202\4\0\1\202\65\0\1\u0163\54\0"+ + "\1\117\65\0\1\u0164\66\0\1\u0165\66\0\1\u0166\20\0"+ + "\3\202\2\0\1\202\5\0\6\202\4\0\1\202\1\0"+ + "\13\202\3\0\1\202\1\u0164\4\0\3\202\4\0\1\202"+ + "\4\0\3\202\2\0\1\202\5\0\6\202\4\0\1\202"+ + "\1\0\13\202\3\0\1\202\1\u0165\4\0\3\202\4\0"+ + 
"\1\202\4\0\3\202\2\0\1\202\5\0\6\202\4\0"+ + "\1\202\1\0\13\202\3\0\1\202\1\u0166\4\0\3\202"+ + "\4\0\1\202\52\0\1\u0167\14\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[6930]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\1\0\1\11\2\1\2\11\1\1\2\11\1\1\5\11"+ + "\6\1\2\11\1\1\1\11\1\1\14\11\1\1\4\11"+ + "\1\0\3\11\1\1\2\0\6\1\21\11\1\0\1\11"+ + "\1\0\2\1\1\0\5\1\3\11\2\1\2\11\5\1"+ + "\1\11\2\1\5\11\1\0\2\1\2\11\2\1\5\11"+ + "\1\0\5\1\1\11\4\0\2\11\6\0\6\11\3\0"+ + "\5\11\1\0\1\11\3\0\6\11\1\0\23\11\2\0"+ + "\1\11\3\1\1\11\3\1\10\11\3\1\1\11\3\1"+ + "\32\11\3\1\1\11\3\1\4\11\13\0\6\11\1\0"+ + "\6\11\1\0\4\11\1\0\12\11\1\0\7\11\1\0"+ + "\5\11\1\0\3\11\2\0\22\11\2\0\1\11\1\0"+ + "\1\11\4\0\22\11\10\0\4\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[359]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int 
[] result) { + int i = 0; /* index in packed string; two chars (count, value) are consumed per iteration */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + /* run-length expansion: value is written once, then repeated while --count stays above 0 */ do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device; assigned by the Reader constructor and replaced by yyreset(java.io.Reader) */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state; starts as YYINITIAL and is changed through yybegin(int) */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string; it starts with ZZ_BUFFERSIZE chars and + is replaced by a larger array in zzRefill() when it is full */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF; set by yyclose(), cleared by yyreset() */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + /* + * Betacode to Unicode conversion + * + * toUnicode(int) turns a numeric code unit into a one-character String. + * The argument is narrowed with a (char) cast, so only its low 16 bits + * are kept; values above 0xFFFF cannot be represented by this helper. + * + * NOTE(review): new Character(char) is deprecated on newer JDKs; + * String.valueOf((char) in) looks like an equivalent replacement. This + * class is generated by JFlex, so such a change presumably belongs in the + * user-code section of the .lex specification -- confirm before editing. + */ + + private String toUnicode(int in) { + char c = (char) in; + Character ch = new Character(c); + String retString = ch.toString(); + return retString; + } + + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor.
+ * + * @param in the java.io.Reader to read input from. + */ + public Betacode2UnicodeLex(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also a java.io.Reader version of this constructor. + * + * The stream is wrapped in a java.io.InputStreamReader that is created + * without a charset argument, so the bytes are decoded with the platform + * default charset. Callers that need a specific encoding can build their + * own Reader and use the Reader constructor instead. + * + * @param in the java.io.InputStream to read input from. + */ + public Betacode2UnicodeLex(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * The packed string is a sequence of (count, value) char pairs; each pair + * fills consecutive entries of the 0x10000-element map with value. + * The loop bound 134 is hard-coded and equals the number of chars in + * ZZ_CMAP_PACKED, the only string this method is called with here. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 134) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input; true when the reader + * delivered no further characters and nothing was appended. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) by moving the chars from zzStartRead up to zzEndRead to the front of the buffer */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions by the same offset so they keep pointing at the same characters */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough?
*/ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up -- allocate a new array of zzCurrentPos*2 chars and copy the old contents over */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input, appending at index zzEndRead */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + * + * Also flags the scanner as being at end of file and marks the buffer + * as empty (zzEndRead = zzStartRead). The reader is closed only when it + * is non-null. + * + * @exception java.io.IOException propagated from closing the reader + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Buffer positions and the line, character and column counters are + * set to 0. Lexical state is set to YYINITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + * + * @return the current lexical state + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state. + * The value is stored as given; no validation is performed here. + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression.
+ */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster, because it + * indexes the buffer directly instead of building a String. + * No check against yylength() is performed here. + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + * + * @return zzMarkedPos - zzStartRead, the number of chars in the match + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occurred while scanning. + * + * In a well-formed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * This method never returns normally: it always throws a + * java.lang.Error carrying the message for errorCode. A code outside + * ZZ_ERROR_MSG falls back to the ZZ_UNKNOWN_ERROR message. + * + * @param errorCode the code of the error message to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by the next call of the scanning method. + * Implemented by moving zzMarkedPos back by number; a number larger + * than yylength() is reported through zzScanError(ZZ_PUSHBACK_2BIG). + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs.
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 139: + { return toUnicode(0x1FF8); + } + case 291: break; + case 85: + { return toUnicode(0x1F30); + } + case 292: break; + case 64: + { return toUnicode(0x03a7); + } + case 293: break; + case 60: + { return toUnicode(0x039e); + } + case 294: break; + case 151: + { return toUnicode(0x1F06); + } + case 295: break; + case 206: + { return toUnicode(0x1FF4); + } + case 296: break; + case 42: + { return toUnicode(0x03a3); + } + case 297: break; + case 56: + { return toUnicode(0x039a); + } + case 298: break; + case 149: + { return toUnicode(0x1F02); + } + case 299: break; + case 254: + { return toUnicode(0x1F87); + } + case 300: break; + case 83: + { return toUnicode(0x1FC6); + } + case 301: break; + case 32: + { return toUnicode(0x03bc); + } + case 302: break; + case 216: + { return toUnicode(0x1F2C); + } + case 303: break; + case 252: + { return toUnicode(0x1F83); + } + case 304: break; + case 172: + { return toUnicode(0x1FC2); + } + case 305: break; + case 127: + { return toUnicode(0x1F59); + } + case 306: break; + case 192: + { return toUnicode(0x1F55); + } + case 307: break; + case 129: + { return toUnicode(0x1FEC); + } + case 308: break; + case 97: + { return toUnicode(0x1F51); + } + case 309: break; + case 39: + { return toUnicode(0x03c8); + } + case 310: break; + case 170: + { return toUnicode(0x1F27); + } + case 311: break; + case 36: + { return toUnicode(0x03c4); + } + case 312: break; + case 168: + { return toUnicode(0x1F23); + } + case 313: break; + case 99: + { return toUnicode(0x1F7B); + } + case 314: break; + case 111: + { return toUnicode(0x1FBA); + } + case 315: break; + case 35: + { return toUnicode(0x03c0); + } + case 316: break; + case 196: + { return toUnicode(0x1FE7); + } + case 317: break; + case 238: + { return toUnicode(0x1F4D); + } + case 318: break; + case 195: + { return toUnicode(0x1FE3); + } + case 319: break; + case 115: + { return toUnicode(0x1FB9); + } + case 320: break; + case 87: + { return 
toUnicode(0x1F76); + } + case 321: break; + case 9: + { return toUnicode(0x0314); + } + case 322: break; + case 228: + { return toUnicode(0x1F1B); + } + case 323: break; + case 77: + { return toUnicode(0x1F72); + } + case 324: break; + case 46: + { return toUnicode(0x0399); + } + case 325: break; + case 74: + { return toUnicode(0x1FB1); + } + case 326: break; + case 120: + { return toUnicode(0x1F48); + } + case 327: break; + case 44: + { return toUnicode(0x0395); + } + case 328: break; + case 185: + { return toUnicode(0x1F44); + } + case 329: break; + case 273: + { return toUnicode(0x1F9C); + } + case 330: break; + case 136: + { return toUnicode(0x1FDB); + } + case 331: break; + case 43: + { return toUnicode(0x0391); + } + case 332: break; + case 92: + { return toUnicode(0x1F40); + } + case 333: break; + case 14: + { return toUnicode(0x03b7); + } + case 334: break; + case 268: + { return "<"; + } + case 335: break; + case 223: + { return toUnicode(0x1F6E); + } + case 336: break; + case 283: + { return toUnicode(0x1FAD); + } + case 337: break; + case 26: + { return toUnicode(0x03b3); + } + case 338: break; + case 160: + { return toUnicode(0x1F12); + } + case 339: break; + case 213: + { return toUnicode(0x1F6A); + } + case 340: break; + case 260: + { return toUnicode(0x1F97); + } + case 341: break; + case 89: + { return toUnicode(0x1FD6); + } + case 342: break; + case 217: + { return toUnicode(0x1F3C); + } + case 343: break; + case 258: + { return toUnicode(0x1F93); + } + case 344: break; + case 181: + { return toUnicode(0x1FD2); + } + case 345: break; + case 128: + { return toUnicode(0x1F69); + } + case 346: break; + case 226: + { return toUnicode(0x1FA8); + } + case 347: break; + case 220: + { return toUnicode(0x1F0E); + } + case 348: break; + case 202: + { return toUnicode(0x1F65); + } + case 349: break; + case 262: + { return toUnicode(0x1FA4); + } + case 350: break; + case 147: + { return toUnicode(0x1FFC); + } + case 351: break; + case 208: + { return 
toUnicode(0x1F0A); + } + case 352: break; + case 104: + { return toUnicode(0x1F61); + } + case 353: break; + case 288: + { return ")"; + } + case 354: break; + case 200: + { return toUnicode(0x1FA0); + } + case 355: break; + case 180: + { return toUnicode(0x1F37); + } + case 356: break; + case 284: + { return toUnicode(0x1F8F); + } + case 357: break; + case 287: + { return "|"; + } + case 358: break; + case 178: + { return toUnicode(0x1F33); + } + case 359: break; + case 278: + { return toUnicode(0x1F8B); + } + case 360: break; + case 132: + { return toUnicode(0x1FCA); + } + case 361: break; + case 122: + { return toUnicode(0x1F09); + } + case 362: break; + case 207: + { return toUnicode(0x1FF7); + } + case 363: break; + case 63: + { return toUnicode(0x03a6); + } + case 364: break; + case 59: + { return toUnicode(0x039d); + } + case 365: break; + case 154: + { return toUnicode(0x1F05); + } + case 366: break; + case 239: + { return toUnicode(0x1F5D); + } + case 367: break; + case 108: + { return toUnicode(0x1FF3); + } + case 368: break; + case 131: + { return toUnicode(0x1FC9); + } + case 369: break; + case 68: + { return toUnicode(0x1F01); + } + case 370: break; + case 16: + { return toUnicode(0x03bf); + } + case 371: break; + case 242: + { return toUnicode(0x1F2F); + } + case 372: break; + case 251: + { return toUnicode(0x1F86); + } + case 373: break; + case 6: + { return toUnicode(0x00B7); + } + case 374: break; + case 31: + { return toUnicode(0x03bb); + } + case 375: break; + case 229: + { return toUnicode(0x1F2B); + } + case 376: break; + case 249: + { return toUnicode(0x1F82); + } + case 377: break; + case 2: + { return "h"; + } + case 378: break; + case 189: + { return toUnicode(0x1F54); + } + case 379: break; + case 142: + { return toUnicode(0x1FEB); + } + case 380: break; + case 96: + { return toUnicode(0x1F50); + } + case 381: break; + case 38: + { return toUnicode(0x03c7); + } + case 382: break; + case 166: + { return toUnicode(0x1F26); + } + case 383: 
break; + case 4: + { return toUnicode(0x03c3); + } + case 384: break; + case 148: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { return toUnicode(0x03c3); + } + case 385: break; + case 164: + { return toUnicode(0x1F22); + } + case 386: break; + case 98: + { return toUnicode(0x1F7A); + } + case 387: break; + case 100: + { return toUnicode(0x1FE6); + } + case 388: break; + case 19: + { return toUnicode(0x0345); + } + case 389: break; + case 218: + { return toUnicode(0x1F4C); + } + case 390: break; + case 194: + { return toUnicode(0x1FE2); + } + case 391: break; + case 95: + { return toUnicode(0x1F79); + } + case 392: break; + case 114: + { return toUnicode(0x1FB8); + } + case 393: break; + case 82: + { return toUnicode(0x1F75); + } + case 394: break; + case 158: + { return toUnicode(0x1FB4); + } + case 395: break; + case 8: + { return toUnicode(0x0313); + } + case 396: break; + case 209: + { return toUnicode(0x1F1A); + } + case 397: break; + case 70: + { return toUnicode(0x1F71); + } + case 398: break; + case 40: + { return "H"; + } + case 399: break; + case 55: + { return toUnicode(0x0398); + } + case 400: break; + case 73: + { return toUnicode(0x1FB0); + } + case 401: break; + case 285: + { return toUnicode(0x1F9F); + } + case 402: break; + case 53: + { return toUnicode(0x0394); + } + case 403: break; + case 186: + { return toUnicode(0x1F43); + } + case 404: break; + case 279: + { return toUnicode(0x1F9B); + } + case 405: break; + case 135: + { return toUnicode(0x1FDA); + } + case 406: break; + case 123: + { return toUnicode(0x1F19); + } + case 407: break; + case 28: + { return toUnicode(0x03b6); + } + case 408: break; + case 163: + { return toUnicode(0x1F15); + } + case 409: break; + case 240: + { return toUnicode(0x1F6D); + } + case 410: break; + case 274: + { return toUnicode(0x1FAC); + } + case 411: break; + case 25: + { return toUnicode(0x03b2); + } + case 412: break; + case 138: + { return toUnicode(0x1FD9); + } + case 
413: break; + case 76: + { return toUnicode(0x1F11); + } + case 414: break; + case 243: + { return toUnicode(0x1F3F); + } + case 415: break; + case 257: + { return toUnicode(0x1F96); + } + case 416: break; + case 230: + { return toUnicode(0x1F3B); + } + case 417: break; + case 255: + { return toUnicode(0x1F92); + } + case 418: break; + case 91: + { return toUnicode(0x1FD1); + } + case 419: break; + case 121: + { return toUnicode(0x1F68); + } + case 420: break; + case 266: + { return toUnicode(0x1FA7); + } + case 421: break; + case 20: + { return toUnicode(0x0306); + } + case 422: break; + case 234: + { return toUnicode(0x1F0D); + } + case 423: break; + case 198: + { return toUnicode(0x1F64); + } + case 424: break; + case 264: + { return toUnicode(0x1FA3); + } + case 425: break; + case 146: + { return toUnicode(0x1FFB); + } + case 426: break; + case 12: + { return toUnicode(0x0302); + } + case 427: break; + case 103: + { return toUnicode(0x1F60); + } + case 428: break; + case 289: + { return "("; + } + case 429: break; + case 177: + { return toUnicode(0x1F36); + } + case 430: break; + case 275: + { return toUnicode(0x1F8E); + } + case 431: break; + case 175: + { return toUnicode(0x1F32); + } + case 432: break; + case 49: + { return toUnicode(0x03a9); + } + case 433: break; + case 269: + { return toUnicode(0x1F8A); + } + case 434: break; + case 116: + { return toUnicode(0x1F08); + } + case 435: break; + case 107: + { return toUnicode(0x1FF6); + } + case 436: break; + case 267: + { return ">"; + } + case 437: break; + case 48: + { return toUnicode(0x03a5); + } + case 438: break; + case 58: + { return toUnicode(0x039c); + } + case 439: break; + case 150: + { return toUnicode(0x1F04); + } + case 440: break; + case 205: + { return toUnicode(0x1FF2); + } + case 441: break; + case 50: + { return toUnicode(0x03a1); + } + case 442: break; + case 246: + { return toUnicode(0x1F89); + } + case 443: break; + case 130: + { return toUnicode(0x1FC8); + } + case 444: break; + case 
67: + { return toUnicode(0x1F00); + } + case 445: break; + case 34: + { return toUnicode(0x03be); + } + case 446: break; + case 221: + { return toUnicode(0x1F2E); + } + case 447: break; + case 253: + { return toUnicode(0x1F85); + } + case 448: break; + case 173: + { return toUnicode(0x1FC4); + } + case 449: break; + case 24: + { return toUnicode(0x0323); + } + case 450: break; + case 30: + { return toUnicode(0x03ba); + } + case 451: break; + case 210: + { return toUnicode(0x1F2A); + } + case 452: break; + case 156: + { return toUnicode(0x1F81); + } + case 453: break; + case 193: + { return toUnicode(0x1F57); + } + case 454: break; + case 191: + { return toUnicode(0x1F53); + } + case 455: break; + case 141: + { return toUnicode(0x1FEA); + } + case 456: break; + case 124: + { return toUnicode(0x1F29); + } + case 457: break; + case 37: + { return toUnicode(0x03c6); + } + case 458: break; + case 169: + { return toUnicode(0x1F25); + } + case 459: break; + case 106: + { return toUnicode(0x1F7D); + } + case 460: break; + case 113: + { return toUnicode(0x1FBC); + } + case 461: break; + case 66: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { return toUnicode(0x03c2); + } + case 462: break; + case 144: + { return toUnicode(0x1FE9); + } + case 463: break; + case 80: + { return toUnicode(0x1F21); + } + case 464: break; + case 110: + { return toUnicode(0x1FE5); + } + case 465: break; + case 1: + { return yytext(); + } + case 466: break; + case 231: + { return toUnicode(0x1F4B); + } + case 467: break; + case 102: + { return toUnicode(0x1FE1); + } + case 468: break; + case 94: + { return toUnicode(0x1F78); + } + case 469: break; + case 159: + { return toUnicode(0x1FB7); + } + case 470: break; + case 235: + { return toUnicode(0x1F1D); + } + case 471: break; + case 81: + { return toUnicode(0x1F74); + } + case 472: break; + case 72: + { return toUnicode(0x1FB3); + } + case 473: break; + case 69: + { return toUnicode(0x1F70); + } + case 474: 
break; + case 45: + { return toUnicode(0x0397); + } + case 475: break; + case 276: + { return toUnicode(0x1F9E); + } + case 476: break; + case 52: + { return toUnicode(0x0393); + } + case 477: break; + case 184: + { return toUnicode(0x1F42); + } + case 478: break; + case 15: + { return toUnicode(0x03b9); + } + case 479: break; + case 270: + { return toUnicode(0x1F9A); + } + case 480: break; + case 117: + { return toUnicode(0x1F18); + } + case 481: break; + case 286: + { return toUnicode(0x1FAF); + } + case 482: break; + case 13: + { return toUnicode(0x03b5); + } + case 483: break; + case 161: + { return toUnicode(0x1F14); + } + case 484: break; + case 219: + { return toUnicode(0x1F6C); + } + case 485: break; + case 280: + { return toUnicode(0x1FAB); + } + case 486: break; + case 7: + { return toUnicode(0x03b1); + } + case 487: break; + case 247: + { return toUnicode(0x1F99); + } + case 488: break; + case 137: + { return toUnicode(0x1FD8); + } + case 489: break; + case 75: + { return toUnicode(0x1F10); + } + case 490: break; + case 222: + { return toUnicode(0x1F3E); + } + case 491: break; + case 259: + { return toUnicode(0x1F95); + } + case 492: break; + case 211: + { return toUnicode(0x1F3A); + } + case 493: break; + case 171: + { return toUnicode(0x1F91); + } + case 494: break; + case 90: + { return toUnicode(0x1FD0); + } + case 495: break; + case 203: + { return toUnicode(0x1F67); + } + case 496: break; + case 263: + { return toUnicode(0x1FA6); + } + case 497: break; + case 214: + { return toUnicode(0x1F0C); + } + case 498: break; + case 201: + { return toUnicode(0x1F63); + } + case 499: break; + case 261: + { return toUnicode(0x1FA2); + } + case 500: break; + case 145: + { return toUnicode(0x1FFA); + } + case 501: break; + case 125: + { return toUnicode(0x1F39); + } + case 502: break; + case 11: + { return toUnicode(0x0301); + } + case 503: break; + case 290: + { return "'"; + } + case 504: break; + case 179: + { return toUnicode(0x1F35); + } + case 505: break; 
+ case 281: + { return toUnicode(0x1F8D); + } + case 506: break; + case 134: + { return toUnicode(0x1FCC); + } + case 507: break; + case 140: + { return toUnicode(0x1FF9); + } + case 508: break; + case 86: + { return toUnicode(0x1F31); + } + case 509: break; + case 65: + { return toUnicode(0x03a8); + } + case 510: break; + case 47: + { return toUnicode(0x039f); + } + case 511: break; + case 155: + { return toUnicode(0x1F07); + } + case 512: break; + case 244: + { return toUnicode(0x1F5F); + } + case 513: break; + case 62: + { return toUnicode(0x03a4); + } + case 514: break; + case 57: + { return toUnicode(0x039b); + } + case 515: break; + case 153: + { return toUnicode(0x1F03); + } + case 516: break; + case 232: + { return toUnicode(0x1F5B); + } + case 517: break; + case 61: + { return toUnicode(0x03a0); + } + case 518: break; + case 224: + { return toUnicode(0x1F88); + } + case 519: break; + case 174: + { return toUnicode(0x1FC7); + } + case 520: break; + case 33: + { return toUnicode(0x03bd); + } + case 521: break; + case 236: + { return toUnicode(0x1F2D); + } + case 522: break; + case 250: + { return toUnicode(0x1F84); + } + case 523: break; + case 84: + { return toUnicode(0x1FC3); + } + case 524: break; + case 152: + { return toUnicode(0x1F80); + } + case 525: break; + case 3: + { return "f"; + } + case 526: break; + case 190: + { return toUnicode(0x1F56); + } + case 527: break; + case 188: + { return toUnicode(0x1F52); + } + case 528: break; + case 18: + { return toUnicode(0x03c9); + } + case 529: break; + case 118: + { return toUnicode(0x1F28); + } + case 530: break; + case 17: + { return toUnicode(0x03c5); + } + case 531: break; + case 165: + { return toUnicode(0x1F24); + } + case 532: break; + case 105: + { return toUnicode(0x1F7C); + } + case 533: break; + case 112: + { return toUnicode(0x1FBB); + } + case 534: break; + case 23: + { return toUnicode(0x03c1); + } + case 535: break; + case 143: + { return toUnicode(0x1FE8); + } + case 536: break; + case 79: 
+ { return toUnicode(0x1F20); + } + case 537: break; + case 109: + { return toUnicode(0x1FE4); + } + case 538: break; + case 212: + { return toUnicode(0x1F4A); + } + case 539: break; + case 101: + { return toUnicode(0x1FE0); + } + case 540: break; + case 88: + { return toUnicode(0x1F77); + } + case 541: break; + case 71: + { return toUnicode(0x1FB6); + } + case 542: break; + case 215: + { return toUnicode(0x1F1C); + } + case 543: break; + case 78: + { return toUnicode(0x1F73); + } + case 544: break; + case 157: + { return toUnicode(0x1FB2); + } + case 545: break; + case 126: + { return toUnicode(0x1F49); + } + case 546: break; + case 41: + { return "F"; + } + case 547: break; + case 54: + { return toUnicode(0x0396); + } + case 548: break; + case 187: + { return toUnicode(0x1F45); + } + case 549: break; + case 282: + { return toUnicode(0x1F9D); + } + case 550: break; + case 51: + { return toUnicode(0x0392); + } + case 551: break; + case 93: + { return toUnicode(0x1F41); + } + case 552: break; + case 29: + { return toUnicode(0x03b8); + } + case 553: break; + case 245: + { return toUnicode(0x1F6F); + } + case 554: break; + case 277: + { return toUnicode(0x1FAE); + } + case 555: break; + case 27: + { return toUnicode(0x03b4); + } + case 556: break; + case 162: + { return toUnicode(0x1F13); + } + case 557: break; + case 233: + { return toUnicode(0x1F6B); + } + case 558: break; + case 271: + { return toUnicode(0x1FAA); + } + case 559: break; + case 225: + { return toUnicode(0x1F98); + } + case 560: break; + case 183: + { return toUnicode(0x1FD7); + } + case 561: break; + case 237: + { return toUnicode(0x1F3D); + } + case 562: break; + case 256: + { return toUnicode(0x1F94); + } + case 563: break; + case 182: + { return toUnicode(0x1FD3); + } + case 564: break; + case 248: + { return toUnicode(0x1FA9); + } + case 565: break; + case 22: + { return toUnicode(0x0308); + } + case 566: break; + case 167: + { return toUnicode(0x1F90); + } + case 567: break; + case 241: + { 
return toUnicode(0x1F0F); + } + case 568: break; + case 199: + { return toUnicode(0x1F66); + } + case 569: break; + case 5: + { return "."; + } + case 570: break; + case 265: + { return toUnicode(0x1FA5); + } + case 571: break; + case 21: + { return toUnicode(0x0304); + } + case 572: break; + case 227: + { return toUnicode(0x1F0B); + } + case 573: break; + case 197: + { return toUnicode(0x1F62); + } + case 574: break; + case 204: + { return toUnicode(0x1FA1); + } + case 575: break; + case 119: + { return toUnicode(0x1F38); + } + case 576: break; + case 10: + { return toUnicode(0x0300); + } + case 577: break; + case 176: + { return toUnicode(0x1F34); + } + case 578: break; + case 272: + { return toUnicode(0x1F8C); + } + case 579: break; + case 133: + { return toUnicode(0x1FCB); + } + case 580: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Buckwalter2Unicode.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Buckwalter2Unicode.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,121 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +%% +%{ + /* + * Buckwalter transliteration to Unicode (Arabic) conversion + */ + +%} + +%class Buckwalter2UnicodeLex +%public +%type java.lang.String +%unicode +%% + + +"<"[^>]+">" { return yytext(); } + +"'" { return "\u0621"; } /* Hamza */ +"|" { return "\u0622"; } /* ALEF WITH MADDA ABOVE from AraMorph */ +">" { return "\u0623"; } /* Alif + HamzaAbove */ +"&" { return "\u0624"; } /* Waw + HamzaAbove */ +"<" { return "\u0625"; } /* Alif + HamzaBelow */ +"}" { return "\u0626"; } /* Ya + HamzaAbove */ +"A" { return "\u0627"; } /* Alif */ +"b" { return "\u0628"; } /* Ba */ +"p" { return "\u0629"; } /* TaMarbuta */ +"t" { return "\u062A"; } /* Ta */ +"v" { return
"\u062B"; } /* Tha */ +"j" { return "\u062C"; } /* Jeem */ +"H" { return "\u062D"; } /* HHa */ +"x" { return "\u062E"; } /* Kha */ +"d" { return "\u062F"; } /* Dal */ +"*" { return "\u0630"; } /* Thal */ +"r" { return "\u0631"; } /* Ra */ +"z" { return "\u0632"; } /* Zain */ +"s" { return "\u0633"; } /* Seen */ +"$" { return "\u0634"; } /* Sheen */ +"S" { return "\u0635"; } /* Sad */ +"D" { return "\u0636"; } /* DDad */ +"T" { return "\u0637"; } /* TTa */ +"Z" { return "\u0638"; } /* DTha */ +"E" { return "\u0639"; } /* Ain */ +"g" { return "\u063A"; } /* Ghain */ + +"_" { return "\u0640"; } /* Tatweel */ +"f" { return "\u0641"; } /* Fa */ +"q" { return "\u0642"; } /* Qaf */ +"k" { return "\u0643"; } /* Kaf */ +"l" { return "\u0644"; } /* Lam */ +"m" { return "\u0645"; } /* Meem */ +"n" { return "\u0646"; } /* Noon */ +"h" { return "\u0647"; } /* Ha */ +"w" { return "\u0648"; } /* Waw */ +"Y" { return "\u0649"; } /* AlifMaksura */ +"y" { return "\u064A"; } /* Ya */ +"F" { return "\u064B"; } /* Fathatan */ +"N" { return "\u064C"; } /* Dammatan */ +"K" { return "\u064D"; } /* Kasratan */ +"a" { return "\u064E"; } /* Fatha */ +"u" { return "\u064F"; } /* Damma */ +"i" { return "\u0650"; } /* Kasra */ +"~" { return "\u0651"; } /* Shadda */ +"o" { return "\u0652"; } /* Sukun */ +"^" { return "\u0653"; } /* Maddah */ +"#" { return "\u0654"; } /* HamzaAbove */ + +"`" { return "\u0670"; } /* AlifKhanjareeya */ +"{" { return "\u0671"; } /* Alif + HamzatWasl */ + +"P" { return "\u067E"; } /* PEH from AraMorph */ +"J" { return "\u0686"; } /* TCHEH from AraMorph */ +"V" { return "\u06A4"; } /* VEH from AraMorph */ +"G" { return "\u06AF"; } /* GAF from AraMorph */ +"R" { return "\u0698"; } /* JEH from AraMorph */ +"?" 
{ return "\u061F"; } /* QUESTION MARK from AraMorph */ + +":" { return "\u06DC"; } /* SmallHighSeen */ +"@" { return "\u06DF"; } /* SmallHighRoundedZero */ + +"[" { return "\u06E2"; } /* SmallHighMeemIsolatedForm */ +";" { return "\u06E3"; } /* SmallLowSeen */ +"," { return "\u06E5"; } /* SmallWaw */ +"." { return "\u06E6"; } /* SmallYa */ +"!" { return "\u06E8"; } /* SmallHighNoon */ +"-" { return "\u06EA"; } /* EmptyCentreLowStop */ +"+" { return "\u06EB"; } /* EmptyCentreHighStop */ +"%" { return "\u06EC"; } /* RoundedHighStopWithFilledCentre */ +"]" { return "\u06ED"; } /* SmallLowMeem */ + +[\&_]"vert;" { return "|"; } +[\&_]"lpar;" { return "("; } +[\&_]"rpar;" { return ")"; } +[\_\&]"lt;" { return "<"; } +[\_\&]"gt;" { return ">"; } +"'" { return "'"; } + +"&"[a-zA-Z]+";" { return yytext(); } + +. { return yytext(); } +\n { return yytext(); } + +/* causes problems */ +/* "\\"" { return "\u06E0"; } SmallHighUprightRectangularZero */ + + +/* duplicate entries */ +/* "," { return "\u060C"; } COMMA from AraMorph */ +/* ";" { return "\u061B"; } SEMICOLON from AraMorph */ + +/* not contained in Buckwalter */ +/* \u0679 : ARABIC LETTER TTEH */ +/* \u0688 : ARABIC LETTER DDAL */ +/* \u06A9 : ARABIC LETTER KEHEH */ +/* \u0691 : ARABIC LETTER RREH */ +/* \u06BA : ARABIC LETTER NOON GHUNNA */ +/* \u06BE : ARABIC LETTER HEH DOACHASHMEE */ +/* \u06C1 : ARABIC LETTER HEH GOAL */ +/* \u06D2 : ARABIC LETTER YEH BARREE */ + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Buckwalter2UnicodeLex.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Buckwalter2UnicodeLex.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,909 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.transcode; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 20.11.09 17:57 from the specification file + *
/Users/jwillenborg/java/existDevMai2009/mpdl/extensions/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Buckwalter2Unicode.lex + */ +public class Buckwalter2UnicodeLex { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int YYINITIAL = 0; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\0\26\0\1\76\1\0\1\57\1\24\1\101\1\5\1\3"+ + "\2\0\1\20\1\100\1\74\1\77\1\75\1\0\1\104\2\0\1\105"+ + "\5\0\1\106\1\70\1\73\1\1\1\0\1\2\1\67\1\71\1\7"+ + "\2\107\1\26\1\31\1\46\1\65\1\15\1\107\1\63\1\50\2\107"+ + "\1\47\1\107\1\62\1\107\1\66\1\25\1\27\1\107\1\64\2\107"+ + "\1\44\1\30\1\72\1\0\1\102\1\56\1\33\1\60\1\51\1\10"+ + "\1\107\1\17\1\103\1\34\1\32\1\42\1\53\1\14\1\36\1\37"+ + "\1\40\1\41\1\55\1\11\1\35\1\21\1\23\1\12\1\52\1\13"+ + "\1\43\1\16\1\45\1\22\1\61\1\4\1\6\1\54\uff81\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. 
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\1\0\1\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+ + "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+ + "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37"+ + "\1\40\1\41\1\42\1\43\1\44\1\45\1\46\1\47"+ + "\1\50\1\51\1\52\1\53\1\54\1\55\1\56\1\57"+ + "\1\60\1\61\1\62\1\63\1\64\1\65\1\66\1\67"+ + "\1\70\1\71\1\72\1\73\1\74\1\75\1\76\1\77"+ + "\1\100\1\101\1\102\1\103\30\0\1\104\1\0\1\105"+ + "\13\0\1\106\1\107\1\110\1\111"; + + private static int [] zzUnpackAction() { + int [] result = new int[110]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\110\0\220\0\110\0\110\0\110\0\330\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\u0120\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\u0168\0\u01b0\0\u01f8\0\u0240"+ + "\0\u0288\0\u02d0\0\u0318\0\u0360\0\u03a8\0\u03f0\0\u0438\0\u0480"+ + "\0\u04c8\0\u0510\0\u0558\0\u05a0\0\u05e8\0\u0630\0\u0678\0\u06c0"+ + "\0\u0708\0\u0750\0\u0798\0\u07e0\0\110\0\u0828\0\110\0\u0870"+ + 
"\0\u08b8\0\u0900\0\u0948\0\u0990\0\u09d8\0\u0a20\0\u0a68\0\u0ab0"+ + "\0\u0af8\0\u0b40\0\110\0\110\0\110\0\110"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[110]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\2\1\3\1\4\1\5\1\6\1\7\1\10\1\11"+ + "\1\12\1\13\1\14\1\15\1\16\1\17\1\20\1\21"+ + "\1\22\1\23\1\24\1\25\1\26\1\27\1\30\1\31"+ + "\1\32\1\33\1\34\1\35\1\36\1\37\1\40\1\41"+ + "\1\42\1\43\1\44\1\45\1\46\1\47\1\50\1\51"+ + "\1\52\1\53\1\54\1\55\1\56\1\57\1\60\1\61"+ + "\1\62\1\63\1\64\1\65\1\66\1\67\1\70\1\71"+ + "\1\72\1\73\1\74\1\75\1\76\1\77\1\100\1\101"+ + "\1\102\1\103\1\104\5\2\110\0\2\105\1\0\105\105"+ + "\7\0\4\106\1\107\4\106\1\0\1\110\2\106\1\0"+ + "\5\106\1\111\1\0\3\106\1\112\14\106\1\0\1\106"+ + "\1\0\1\113\2\0\5\106\14\0\1\106\3\0\1\106"+ + "\13\0\1\114\5\0\1\115\10\0\1\116\4\0\1\117"+ + "\50\0\2\105\1\2\105\105\7\0\11\106\1\0\3\106"+ + "\1\0\6\106\1\0\20\106\1\0\1\106\4\0\5\106"+ + "\4\0\1\2\7\0\1\106\3\0\1\106\7\0\11\106"+ + "\1\0\3\106\1\0\6\106\1\0\20\106\1\0\1\106"+ + "\4\0\5\106\4\0\1\2\7\0\1\120\3\0\1\106"+ + "\7\0\2\106\1\121\6\106\1\0\3\106\1\0\6\106"+ + "\1\0\20\106\1\0\1\106\4\0\5\106\4\0\1\2"+ + "\7\0\1\106\3\0\1\106\7\0\3\106\1\122\5\106"+ + "\1\0\3\106\1\0\6\106\1\0\20\106\1\0\1\106"+ + "\4\0\5\106\4\0\1\2\7\0\1\106\3\0\1\106"+ + "\7\0\2\106\1\123\1\124\5\106\1\0\3\106\1\0"+ + "\6\106\1\0\20\106\1\0\1\106\4\0\5\106\4\0"+ + "\1\2\7\0\1\106\3\0\1\106\104\0\1\125\106\0"+ + 
"\1\126\15\0\1\127\110\0\1\130\106\0\1\131\1\132"+ + "\104\0\11\106\1\0\1\133\2\106\1\0\6\106\1\0"+ + "\20\106\1\0\1\106\4\0\5\106\4\0\1\2\7\0"+ + "\1\106\3\0\1\106\7\0\11\106\1\0\3\106\1\0"+ + "\6\106\1\0\15\106\1\134\2\106\1\0\1\106\4\0"+ + "\5\106\4\0\1\2\7\0\1\106\3\0\1\106\7\0"+ + "\11\106\1\0\3\106\1\0\6\106\1\0\20\106\1\0"+ + "\1\106\4\0\5\106\4\0\1\135\7\0\1\106\3\0"+ + "\1\106\7\0\11\106\1\0\3\106\1\0\6\106\1\0"+ + "\15\106\1\136\2\106\1\0\1\106\4\0\5\106\4\0"+ + "\1\2\7\0\1\106\3\0\1\106\7\0\11\106\1\0"+ + "\3\106\1\0\6\106\1\0\20\106\1\0\1\106\4\0"+ + "\5\106\4\0\1\137\7\0\1\106\3\0\1\106\105\0"+ + "\1\140\23\0\1\141\137\0\1\142\131\0\1\135\65\0"+ + "\1\143\131\0\1\137\23\0\3\106\1\144\5\106\1\0"+ + "\3\106\1\0\6\106\1\0\20\106\1\0\1\106\4\0"+ + "\5\106\4\0\1\2\7\0\1\106\3\0\1\106\7\0"+ + "\11\106\1\0\1\145\2\106\1\0\6\106\1\0\20\106"+ + "\1\0\1\106\4\0\5\106\4\0\1\2\7\0\1\106"+ + "\3\0\1\106\7\0\11\106\1\0\1\146\2\106\1\0"+ + "\6\106\1\0\20\106\1\0\1\106\4\0\5\106\4\0"+ + "\1\2\7\0\1\106\3\0\1\106\106\0\1\147\13\0"+ + "\1\150\116\0\1\151\107\0\1\152\75\0\11\106\1\0"+ + "\3\106\1\0\6\106\1\0\20\106\1\0\1\106\4\0"+ + "\5\106\4\0\1\153\7\0\1\106\3\0\1\106\7\0"+ + "\11\106\1\0\3\106\1\0\6\106\1\0\20\106\1\0"+ + "\1\106\4\0\5\106\4\0\1\154\7\0\1\106\3\0"+ + "\1\106\7\0\11\106\1\0\3\106\1\0\6\106\1\0"+ + "\20\106\1\0\1\106\4\0\5\106\4\0\1\155\7\0"+ + "\1\106\3\0\1\106\73\0\1\156\107\0\1\153\107\0"+ + "\1\154\107\0\1\155\14\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[2952]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + 
+ /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\1\0\1\11\1\1\3\11\1\1\25\11\1\1\47\11"+ + "\30\0\1\11\1\0\1\11\13\0\4\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[110]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of 
the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + /* + * Buckwalter transliteration to Unicode (Arabic) conversion + */ + + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public Buckwalter2UnicodeLex(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also a java.io.Reader version of this constructor. + * + * @param in the java.io.InputStream to read input from (decoded with the platform default charset). + */ + public Buckwalter2UnicodeLex(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 178) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input.
+ * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to YYINITIAL.
+ * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occurred while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules.
+ * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break 
zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 54: + { return "\u06AF"; + } + case 74: break; + case 10: + { return "\u0629"; + } + case 75: break; + case 26: + { return "\u0639"; + } + case 76: break; + case 9: + { return "\u0628"; + } + case 77: break; + case 37: + { return "\u0649"; + } + case 78: break; + case 25: + { return "\u0638"; + } + case 79: break; + case 8: + { return "\u0627"; + } + case 80: break; + case 58: + { return "\u06DF"; + } + case 81: break; + case 36: + { return "\u0648"; + } + case 82: break; + case 68: + { return ">"; + } + case 83: break; + case 24: + { return "\u0637"; + } + case 84: break; + case 7: + { return "\u0626"; + } + case 85: break; + case 35: + { return "\u0647"; + } + case 86: break; + case 23: + { return "\u0636"; + } + case 87: break; + case 2: + { return "\u0625"; + } + case 88: break; + case 69: + { return "<"; + } + case 89: break; + case 34: + { return "\u0646"; + } + case 90: break; + case 67: + { return "\u06ED"; + } + case 91: break; + case 22: + { return "\u0635"; + } + case 92: break; + case 6: + { return "\u0624"; + } + case 93: break; + case 57: + { return "\u06DC"; + } + case 94: break; + case 33: + { return "\u0645"; + } + case 95: break; + case 66: + { return "\u06EC"; + } + case 96: break; + case 21: + { return "\u0634"; + } + case 97: break; + case 3: + { return "\u0623"; + } + case 98: break; + case 32: + { return "\u0644"; + } + case 99: break; + case 70: + { return "|"; + } + case 100: break; + case 65: + { return "\u06EB"; + } + 
case 101: break; + case 20: + { return "\u0633"; + } + case 102: break; + case 55: + { return "\u0698"; + } + case 103: break; + case 5: + { return "\u0622"; + } + case 104: break; + case 48: + { return "\u0654"; + } + case 105: break; + case 31: + { return "\u0643"; + } + case 106: break; + case 19: + { return "\u0632"; + } + case 107: break; + case 64: + { return "\u06EA"; + } + case 108: break; + case 4: + { return "\u0621"; + } + case 109: break; + case 52: + { return "\u0686"; + } + case 110: break; + case 47: + { return "\u0653"; + } + case 111: break; + case 30: + { return "\u0642"; + } + case 112: break; + case 18: + { return "\u0631"; + } + case 113: break; + case 46: + { return "\u0652"; + } + case 114: break; + case 29: + { return "\u0641"; + } + case 115: break; + case 17: + { return "\u0630"; + } + case 116: break; + case 45: + { return "\u0651"; + } + case 117: break; + case 28: + { return "\u0640"; + } + case 118: break; + case 44: + { return "\u0650"; + } + case 119: break; + case 1: + { return yytext(); + } + case 120: break; + case 50: + { return "\u0671"; + } + case 121: break; + case 49: + { return "\u0670"; + } + case 122: break; + case 63: + { return "\u06E8"; + } + case 123: break; + case 53: + { return "\u06A4"; + } + case 124: break; + case 56: + { return "\u061F"; + } + case 125: break; + case 16: + { return "\u062F"; + } + case 126: break; + case 62: + { return "\u06E6"; + } + case 127: break; + case 15: + { return "\u062E"; + } + case 128: break; + case 61: + { return "\u06E5"; + } + case 129: break; + case 43: + { return "\u064F"; + } + case 130: break; + case 14: + { return "\u062D"; + } + case 131: break; + case 42: + { return "\u064E"; + } + case 132: break; + case 60: + { return "\u06E3"; + } + case 133: break; + case 13: + { return "\u062C"; + } + case 134: break; + case 41: + { return "\u064D"; + } + case 135: break; + case 59: + { return "\u06E2"; + } + case 136: break; + case 12: + { return "\u062B"; + } + case 137: break; + 
case 40: + { return "\u064C"; + } + case 138: break; + case 11: + { return "\u062A"; + } + case 139: break; + case 51: + { return "\u067E"; + } + case 140: break; + case 39: + { return "\u064B"; + } + case 141: break; + case 27: + { return "\u063A"; + } + case 142: break; + case 38: + { return "\u064A"; + } + case 143: break; + case 71: + { return ")"; + } + case 144: break; + case 72: + { return "("; + } + case 145: break; + case 73: + { return "'"; + } + case 146: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +}
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Transcoder.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Transcoder.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,267 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.text.transcode;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.io.UnsupportedEncodingException;
+import java.util.regex.Pattern;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import edu.unc.epidoc.transcoder.TransCoder;
+
+/**
+ * Converts text between transliteration schemes and Unicode: Greek Beta Code
+ * and Arabic Buckwalter in both directions, plus a percent-escaped Big5 form.
+ */
+public class Transcoder {
+  /** Small sigma directly followed by white space or by the end of the text. */
+  private static final Pattern WORD_FINAL_SIGMA = Pattern.compile("\u03C3(?=\\s|$)");
+
+  /** Created when the class is loaded, so getInstance() needs no locking. */
+  private static final Transcoder instance = new Transcoder();
+
+  /** Epidoc Beta Code converter, created and configured on first use. */
+  private TransCoder betaCodeTranscoder;
+
+  /**
+   * Returns the shared instance.
+   */
+  public static Transcoder getInstance() {
+    return instance;
+  }
+
+  /**
+   * Converts Beta Code to Unicode with the Epidoc transcoder
+   * (parser "BetaCode", converter "UnicodeC").
+   *
+   * @param inputStr the Beta Code text
+   * @return the text produced by the Epidoc transcoder
+   * @throws ApplicationException wrapping any exception raised by the transcoder
+   */
+  public String transcodeFromBetaCode2UnicodeEpidoc(String inputStr) throws ApplicationException {
+    try {
+      if (betaCodeTranscoder == null) {
+        // assign the field last so that a failed setup is retried on the next call
+        TransCoder configured = new TransCoder();
+        configured.setParser("BetaCode");
+        configured.setConverter("UnicodeC");
+        betaCodeTranscoder = configured;
+      }
+      return betaCodeTranscoder.getString(inputStr);
+    } catch (Exception e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  /**
+   * Converts Beta Code to Unicode with the JFlex scanner Betacode2UnicodeLex.
+   * Afterwards every small sigma that stands directly before white space or at
+   * the end of the text is replaced by the final form of the letter.
+   *
+   * @param inputStr the Beta Code text, must not be null
+   * @return the converted text
+   * @throws ApplicationException if the scanner fails to read its input
+   */
+  public String transcodeFromBetaCode2Unicode(String inputStr) throws ApplicationException {
+    Betacode2UnicodeLex lexer = new Betacode2UnicodeLex(new StringReader(inputStr));
+    StringBuilder result = new StringBuilder();
+    try {
+      for (String token = lexer.yylex(); token != null; token = lexer.yylex()) {
+        result.append(token);
+      }
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+    // The sigma is matched on its own with a lookahead: a greedy "(.*)" in front
+    // of it converts only the last matching sigma of each line.
+    return WORD_FINAL_SIGMA.matcher(result).replaceAll("\u03C2");
+  }
+
+  /**
+   * Converts Unicode Greek to Beta Code with the JFlex scanner Unicode2BetacodeLex.
+   *
+   * @param inputStr the Unicode text, must not be null
+   * @return the converted text
+   * @throws ApplicationException if the scanner fails to read its input
+   */
+  public String transcodeFromUnicode2BetaCode(String inputStr) throws ApplicationException {
+    Unicode2BetacodeLex lexer = new Unicode2BetacodeLex(new StringReader(inputStr));
+    StringBuilder result = new StringBuilder();
+    try {
+      for (String token = lexer.yylex(); token != null; token = lexer.yylex()) {
+        result.append(token);
+      }
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+    return result.toString();
+  }
+
+  /**
+   * Converts Arabic Unicode to Buckwalter transliteration with the JFlex scanner
+   * Unicode2BuckwalterLex.
+   *
+   * @param inputStr the Unicode text, must not be null
+   * @return the converted text
+   * @throws ApplicationException if the scanner fails to read its input
+   */
+  public String transcodeFromUnicode2Buckwalter(String inputStr) throws ApplicationException {
+    Unicode2BuckwalterLex lexer = new Unicode2BuckwalterLex(new StringReader(inputStr));
+    StringBuilder result = new StringBuilder();
+    try {
+      for (String token = lexer.yylex(); token != null; token = lexer.yylex()) {
+        result.append(token);
+      }
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+    return result.toString();
+  }
+
+  /**
+   * Converts Buckwalter transliteration to Arabic Unicode with the JFlex scanner
+   * Buckwalter2UnicodeLex.
+   *
+   * @param inputStr the transliterated text, must not be null
+   * @return the converted text
+   * @throws ApplicationException if the scanner fails to read its input
+   */
+  public String transcodeFromBuckwalter2Unicode(String inputStr) throws ApplicationException {
+    Buckwalter2UnicodeLex lexer = new Buckwalter2UnicodeLex(new StringReader(inputStr));
+    StringBuilder result = new StringBuilder();
+    try {
+      for (String token = lexer.yylex(); token != null; token = lexer.yylex()) {
+        result.append(token);
+      }
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+    return result.toString();
+  }
+
+  /**
+   * Converts Buckwalter transliteration to Arabic Unicode with the character
+   * table copied from AraMorph instead of the JFlex scanner.
+   *
+   * @param inputStr the transliterated text, must not be null
+   * @return the converted text
+   */
+  public String transcodeFromBuckwalter2UnicodeAraMorph(String inputStr) {
+    return arabizeWord(inputStr);
+  }
+
+  /**
+   * Encodes the text as Big5 and writes every byte as "%" followed by two
+   * lower case hex digits.
+   *
+   * @param inputStr the text to encode, must not be null
+   * @return the escaped text, or the empty string if this JVM has no Big5 charset
+   */
+  public String encodeBig5(String inputStr) {
+    StringBuilder escaped = new StringBuilder();
+    try {
+      byte[] big5Bytes = inputStr.getBytes("big5");
+      for (int i = 0; i < big5Bytes.length; i++) {
+        int unsigned = unsignedByteToInt(big5Bytes[i]);
+        escaped.append('%');
+        if (unsigned < 0x10) {
+          // toHexString drops the leading zero, and "%a" is not a valid escape
+          escaped.append('0');
+        }
+        escaped.append(Integer.toHexString(unsigned));
+      }
+    } catch (UnsupportedEncodingException ignored) {
+      // kept as best effort: callers get the empty string, as before
+    }
+    return escaped.toString();
+  }
+
+  /** Widens a byte to its unsigned value (0 to 255). */
+  private int unsignedByteToInt(byte b) {
+    return b & 0xFF;
+  }
+
+  /*
+   * Character table copied from http://www.nongnu.org/aramorph/english/download.html
+   * Class: AraMorph
+   *
+   * Every character is looked up once; characters without a table entry are kept.
+   */
+  private String arabizeWord(String translitered) {
+    StringBuilder arabic = new StringBuilder(translitered.length());
+    for (int i = 0; i < translitered.length(); i++) {
+      arabic.append(arabizeChar(translitered.charAt(i)));
+    }
+    return arabic.toString();
+  }
+
+  /** Maps one Buckwalter character to its Arabic character or returns it unchanged. */
+  private static char arabizeChar(char c) {
+    switch (c) {
+      case '\'': return '\u0621'; // ARABIC LETTER HAMZA
+      case '|': return '\u0622'; // ARABIC LETTER ALEF WITH MADDA ABOVE
+      case '>': return '\u0623'; // ARABIC LETTER ALEF WITH HAMZA ABOVE
+      case '&': return '\u0624'; // ARABIC LETTER WAW WITH HAMZA ABOVE
+      case '<': return '\u0625'; // ARABIC LETTER ALEF WITH HAMZA BELOW
+      case '}': return '\u0626'; // ARABIC LETTER YEH WITH HAMZA ABOVE
+      case 'A': return '\u0627'; // ARABIC LETTER ALEF
+      case 'b': return '\u0628'; // ARABIC LETTER BEH
+      case 'p': return '\u0629'; // ARABIC LETTER TEH MARBUTA
+      case 't': return '\u062A'; // ARABIC LETTER TEH
+      case 'v': return '\u062B'; // ARABIC LETTER THEH
+      case 'j': return '\u062C'; // ARABIC LETTER JEEM
+      case 'H': return '\u062D'; // ARABIC LETTER HAH
+      case 'x': return '\u062E'; // ARABIC LETTER KHAH
+      case 'd': return '\u062F'; // ARABIC LETTER DAL
+      case '*': return '\u0630'; // ARABIC LETTER THAL
+      case 'r': return '\u0631'; // ARABIC LETTER REH
+      case 'z': return '\u0632'; // ARABIC LETTER ZAIN
+      case 's': return '\u0633'; // ARABIC LETTER SEEN
+      case '$': return '\u0634'; // ARABIC LETTER SHEEN
+      case 'S': return '\u0635'; // ARABIC LETTER SAD
+      case 'D': return '\u0636'; // ARABIC LETTER DAD
+      case 'T': return '\u0637'; // ARABIC LETTER TAH
+      case 'Z': return '\u0638'; // ARABIC LETTER ZAH
+      case 'E': return '\u0639'; // ARABIC LETTER AIN
+      case 'g': return '\u063A'; // ARABIC LETTER GHAIN
+      case '_': return '\u0640'; // ARABIC TATWEEL
+      case 'f': return '\u0641'; // ARABIC LETTER FEH
+      case 'q': return '\u0642'; // ARABIC LETTER QAF
+      case 'k': return '\u0643'; // ARABIC LETTER KAF
+      case 'l': return '\u0644'; // ARABIC LETTER LAM
+      case 'm': return '\u0645'; // ARABIC LETTER MEEM
+      case 'n': return '\u0646'; // ARABIC LETTER NOON
+      case 'h': return '\u0647'; // ARABIC LETTER HEH
+      case 'w': return '\u0648'; // ARABIC LETTER WAW
+      case 'Y': return '\u0649'; // ARABIC LETTER ALEF MAKSURA
+      case 'y': return '\u064A'; // ARABIC LETTER YEH
+      case 'F': return '\u064B'; // ARABIC FATHATAN
+      case 'N': return '\u064C'; // ARABIC DAMMATAN
+      case 'K': return '\u064D'; // ARABIC KASRATAN
+      case 'a': return '\u064E'; // ARABIC FATHA
+      case 'u': return '\u064F'; // ARABIC DAMMA
+      case 'i': return '\u0650'; // ARABIC KASRA
+      case '~': return '\u0651'; // ARABIC SHADDA
+      case 'o': return '\u0652'; // ARABIC SUKUN
+      case '`': return '\u0670'; // ARABIC LETTER SUPERSCRIPT ALEF
+      case '{': return '\u0671'; // ARABIC LETTER ALEF WASLA
+      case 'P': return '\u067E'; // ARABIC LETTER PEH
+      case 'J': return '\u0686'; // ARABIC LETTER TCHEH
+      case 'V': return '\u06A4'; // ARABIC LETTER VEH
+      case 'G': return '\u06AF'; // ARABIC LETTER GAF
+      case 'R': return '\u0698'; // ARABIC LETTER JEH (no more in Buckwalter system)
+      // Not in Buckwalter system: U+0679 ARABIC LETTER TTEH
+      // Not in Buckwalter system: U+0688 ARABIC LETTER DDAL
+      // Not in Buckwalter system: U+06A9 ARABIC LETTER KEHEH
+      // Not in Buckwalter system: U+0691 ARABIC LETTER RREH
+      // Not in Buckwalter system: U+06BA ARABIC LETTER NOON GHUNNA
+      // Not in Buckwalter system: U+06BE ARABIC LETTER HEH DOACHASHMEE
+      // Not in Buckwalter system: U+06C1 ARABIC LETTER HEH GOAL
+      // Not in Buckwalter system: U+06D2 ARABIC LETTER YEH BARREE
+      case ',': return '\u060C'; // ARABIC COMMA
+      case ';': return '\u061B'; // ARABIC SEMICOLON
+      case '?': return '\u061F'; // ARABIC QUESTION MARK
+      default: return c;
+    }
+  }
+
+}
\ No newline at end of file
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2Betacode.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2Betacode.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,319 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +%% + +%class Unicode2BetacodeLex +%public +%type java.lang.String +%unicode +%% + + +"<"[^>]\u+">" { return yytext(); } + +"H" { return "*j"; } +"h" { return "j"; } +"F" { return "*v"; } +"f" { return "v"; } +"\u03a3" { return "*s"; } + +"." 
{ return "!"; } +"\u00B7" { return ":"; } /* MPDL update */ + +"\u1F00" { return "a)"; } +"\u1F01" { return "a("; } +"\u1F02" { return "a)\\"; } +"\u1F03" { return "a(\\"; } +"\u1F04" { return "a)/"; } +"\u1F05" { return "a(/"; } +"\u1F06" { return "a)="; } +"\u1F07" { return "a(="; } +"\u1F08" { return "*)a"; } +"\u1F09" { return "*(a"; } +"\u1F0A" { return "*)\\a"; } +"\u1F0B" { return "*(\\a"; } +"\u1F0C" { return "*)/a"; } +"\u1F0D" { return "*(/a"; } +"\u1F0E" { return "*)=a"; } +"\u1F0F" { return "*(=a"; } +"\u1F10" { return "e)"; } +"\u1F11" { return "e("; } +"\u1F12" { return "e)\\"; } +"\u1F13" { return "e(\\"; } +"\u1F14" { return "e)/"; } +"\u1F15" { return "e(/"; } +"\u1F18" { return "*)e"; } +"\u1F19" { return "*(e"; } +"\u1F1A" { return "*)\\e"; } +"\u1F1B" { return "*(\\e"; } +"\u1F1C" { return "*)/e"; } +"\u1F1D" { return "*(/e"; } +"\u1F20" { return "h)"; } +"\u1F21" { return "h("; } +"\u1F22" { return "h)\\"; } +"\u1F23" { return "h(\\"; } +"\u1F24" { return "h)/"; } +"\u1F25" { return "h(/"; } +"\u1F26" { return "h)="; } +"\u1F27" { return "h(="; } +"\u1F28" { return "*)h"; } +"\u1F29" { return "*(h"; } +"\u1F2A" { return "*)\\h"; } +"\u1F2B" { return "*(\\h"; } +"\u1F2C" { return "*)/h"; } +"\u1F2D" { return "*(/h"; } +"\u1F2E" { return "*)=h"; } +"\u1F2F" { return "*(=h"; } +"\u1F30" { return "i)"; } +"\u1F31" { return "i("; } +"\u1F32" { return "i)\\"; } +"\u1F33" { return "i(\\"; } +"\u1F34" { return "i)/"; } +"\u1F35" { return "i(/"; } +"\u1F36" { return "i)="; } +"\u1F37" { return "i(="; } +"\u1F38" { return "*)i"; } +"\u1F39" { return "*(i"; } +"\u1F3A" { return "*)\\i"; } +"\u1F3B" { return "*(\\i"; } +"\u1F3C" { return "*)/i"; } +"\u1F3D" { return "*(/i"; } +"\u1F3E" { return "*)=i"; } +"\u1F3F" { return "*(=i"; } +"\u1F40" { return "o)"; } +"\u1F41" { return "o("; } +"\u1F42" { return "o)\\"; } +"\u1F43" { return "o(\\"; } +"\u1F44" { return "o)/"; } +"\u1F45" { return "o(/"; } +"\u1F48" { return "*)o"; } +"\u1F49" { return "*(o"; } 
+"\u1F4A" { return "*)\\o"; } +"\u1F4B" { return "*(\\o"; } +"\u1F4C" { return "*)/o"; } +"\u1F4D" { return "*(/o"; } +"\u1F50" { return "u)"; } +"\u1F51" { return "u("; } +"\u1F52" { return "u)\\"; } +"\u1F53" { return "u(\\"; } +"\u1F54" { return "u)/"; } +"\u1F55" { return "u(/"; } +"\u1F56" { return "u)="; } +"\u1F57" { return "u(="; } +"\u1F59" { return "*(u"; } +"\u1F5B" { return "*(\\u"; } +"\u1F5D" { return "*(/u"; } +"\u1F5F" { return "*(=u"; } +"\u1F60" { return "w)"; } +"\u1F61" { return "w("; } +"\u1F62" { return "w)\\"; } +"\u1F63" { return "w(\\"; } +"\u1F64" { return "w)/"; } +"\u1F65" { return "w(/"; } +"\u1F66" { return "w)="; } +"\u1F67" { return "w(="; } +"\u1F68" { return "*)w"; } +"\u1F69" { return "*(w"; } +"\u1F6A" { return "*)\\w"; } +"\u1F6B" { return "*(\\w"; } +"\u1F6C" { return "*)/w"; } +"\u1F6D" { return "*(/w"; } +"\u1F6E" { return "*)=w"; } +"\u1F6F" { return "*(=w"; } +"\u1F70" { return "a\\"; } +"\u1F71" { return "a/"; } +"\u1F72" { return "e\\"; } +"\u1F73" { return "e/"; } +"\u1F74" { return "h\\"; } +"\u1F75" { return "h/"; } +"\u1F76" { return "i\\"; } +"\u1F77" { return "i/"; } +"\u1F78" { return "o\\"; } +"\u1F79" { return "o/"; } +"\u1F7A" { return "u\\"; } +"\u1F7B" { return "u/"; } +"\u1F7C" { return "w\\"; } +"\u1F7D" { return "w/"; } +"\u1F80" { return "a)|"; } +"\u1F81" { return "a(|"; } +"\u1F82" { return "a)\\|"; } +"\u1F83" { return "a(\\|"; } +"\u1F84" { return "a)/|"; } +"\u1F85" { return "a(/|"; } +"\u1F86" { return "a)=|"; } +"\u1F87" { return "a(=|"; } +"\u1F88" { return "*)|a"; } +"\u1F89" { return "*(|a"; } +"\u1F8A" { return "*)\\|a"; } +"\u1F8B" { return "*(\\|a"; } +"\u1F8C" { return "*)/|a"; } +"\u1F8D" { return "*(/|a"; } +"\u1F8E" { return "*)=|a"; } +"\u1F8F" { return "*(=|a"; } +"\u1F90" { return "h)|"; } +"\u1F91" { return "h(|"; } +"\u1F92" { return "h)\\|"; } +"\u1F93" { return "h(\\|"; } +"\u1F94" { return "h)/|"; } +"\u1F95" { return "h(/|"; } +"\u1F96" { return "h)=|"; } +"\u1F97" { return 
"h(=|"; } +"\u1F98" { return "*)|h"; } +"\u1F99" { return "*(|h"; } +"\u1F9A" { return "*)\\|h"; } +"\u1F9B" { return "*(\\|h"; } +"\u1F9C" { return "*)/|h"; } +"\u1F9D" { return "*(/|h"; } +"\u1F9E" { return "*)=|h"; } +"\u1F9F" { return "*(=|h"; } +"\u1FA0" { return "w)|"; } +"\u1FA1" { return "w(|"; } +"\u1FA2" { return "w)\\|"; } +"\u1FA3" { return "w(\\|"; } +"\u1FA4" { return "w)/|"; } +"\u1FA5" { return "w(/|"; } +"\u1FA6" { return "w)=|"; } +"\u1FA7" { return "w(=|"; } +"\u1FA8" { return "*)|w"; } +"\u1FA9" { return "*(|w"; } +"\u1FAA" { return "*)\\|w"; } +"\u1FAB" { return "*(\\|w"; } +"\u1FAC" { return "*)/|w"; } +"\u1FAD" { return "*(/|w"; } +"\u1FAE" { return "*)=|w"; } +"\u1FAF" { return "*(=|w"; } +"\u1FB0" { return "a^"; } +"\u1FB1" { return "a_"; } +"\u1FB2" { return "a\\|"; } +"\u1FB3" { return "a|"; } +"\u1FB4" { return "a/|"; } +"\u1FB6" { return "a="; } +"\u1FB7" { return "a=|"; } +"\u1FB8" { return "*a^"; } +"\u1FB9" { return "*a_"; } +"\u1FBA" { return "*a\\"; } +"\u1FBB" { return "*a/"; } +"\u1FBC" { return "*a|"; } +"\u1FC2" { return "h\\|"; } +"\u1FC3" { return "h|"; } +"\u1FC4" { return "h/|"; } +"\u1FC6" { return "h="; } +"\u1FC7" { return "h=|"; } +"\u1FC8" { return "*e\\"; } +"\u1FC9" { return "*e/"; } +"\u1FCA" { return "*h\\"; } +"\u1FCB" { return "*h/"; } +"\u1FCC" { return "*h|"; } +"\u1FD0" { return "i^"; } +"\u1FD1" { return "i_"; } +"\u1FD2" { return "i+\\"; } +"\u1FD3" { return "i+/"; } +"\u1FD6" { return "i="; } +"\u1FD7" { return "i+="; } +"\u1FD8" { return "*i^"; } +"\u1FD9" { return "*i_"; } +"\u1FDA" { return "*i\\"; } +"\u1FDB" { return "*i/"; } +"\u1FE0" { return "u^"; } +"\u1FE1" { return "u_"; } +"\u1FE2" { return "u+\\"; } +"\u1FE3" { return "u+/"; } +"\u1FE4" { return "r)"; } +"\u1FE5" { return "r("; } +"\u1FE6" { return "u="; } +"\u1FE7" { return "u+="; } +"\u1FE8" { return "*u^"; } +"\u1FE9" { return "*u_"; } +"\u1FEA" { return "*u\\"; } +"\u1FEB" { return "*u/"; } +"\u1FEC" { return "*(r"; } +"\u1FF2" { return 
"w\\|"; } +"\u1FF3" { return "w|"; } +"\u1FF4" { return "w/|"; } +"\u1FFA" { return "*w\\"; } +"\u1FFB" { return "*w/"; } +"\u1FFC" { return "*w|"; } +"\u1FF6" { return "w="; } +"\u1FF7" { return "w=|"; } +"\u1FF8" { return "*o\\"; } +"\u1FF9" { return "*o/"; } + +"\u0300" { return "\\"; } +"\u0301" { return "/"; } +"\u0304" { return "_"; } +"\u0306" { return "^"; } +"\u0308" { return "+"; } +"\u0302" { return "="; } +"\u0313" { return ")"; } +"\u0314" { return "("; } +"\u0323" { return "?"; } +"\u0345" { return "|"; } + +"\u03b1" { return "a"; } /* MPDL update */ +"\u0391" { return "*a"; } /* MPDL update */ +"\u03b2" { return "b"; } /* MPDL update */ +"\u0392" { return "*b"; } /* MPDL update */ +"\u03b3" { return "g"; } /* MPDL update */ +"\u0393" { return "*g"; } /* MPDL update */ +"\u03b4" { return "d"; } /* MPDL update */ +"\u0394" { return "*d"; } /* MPDL update */ +"\u03b5" { return "e"; } /* MPDL update */ +"\u0395" { return "*e"; } /* MPDL update */ +"\u03b6" { return "z"; } /* MPDL update */ +"\u0396" { return "*z"; } /* MPDL update */ +"\u03b7" { return "h"; } /* MPDL update */ +"\u0397" { return "*h"; } /* MPDL update */ +"\u03b8" { return "q"; } /* MPDL update */ +"\u0398" { return "*q"; } /* MPDL update */ +"\u03b9" { return "i"; } /* MPDL update */ +"\u0399" { return "*i"; } /* MPDL update */ +"\u03ba" { return "k"; } /* MPDL update */ +"\u039a" { return "*k"; } /* MPDL update */ +"\u03bb" { return "l"; } /* MPDL update */ +"\u039b" { return "*l"; } /* MPDL update */ +"\u03bc" { return "m"; } /* MPDL update */ +"\u039c" { return "*m"; } /* MPDL update */ +"\u03bd" { return "n"; } /* MPDL update */ +"\u039d" { return "*n"; } /* MPDL update */ +"\u03be" { return "c"; } /* MPDL update */ +"\u039e" { return "*c"; } /* MPDL update */ +"\u03bf" { return "o"; } /* MPDL update */ +"\u039f" { return "*o"; } /* MPDL update */ +"\u03c0" { return "p"; } /* MPDL update */ +"\u03a0" { return "*p"; } /* MPDL update */ +"\u03c1" { return "r"; } /* MPDL update */ 
+"\u03a1" { return "*r"; } /* MPDL update */ + +"\u03a3" { return "*s"; } /* MPDL update; NOTE(review): the same capital sigma rule already appears near the top of the rules, and JFlex takes the first of equally long matches, so this copy looks unreachable - confirm */ +"\u03c3" { return "s1"; } /* mdh 2002-01-07 */ +"\u03c2"/\-\- { return "s"; } +"\u03c3"/\> }[a-z\?\!0-9*=\/()\'\-] { return "s"; } /* MPDL update */ +"\u03c2"/\< { return "s"; } /* MPDL update */ +"\u03c3"/[\[\]][a-z\?\!0-9*=\/()\'\-] { return "s"; } /* MPDL update */ +"\u03c2"/\??[^a-z0-9*=\/()\'\-\[\?] { return "s"; } +"\u03c3" { return "s"; } /* MPDL update; NOTE(review): looks shadowed by the earlier plain small sigma rule that returns "s1" - confirm which result is intended */ + +"\u03c4" { return "t"; } /* MPDL update */ +"\u03a4" { return "*t"; } /* MPDL update */ +"\u03c5" { return "u"; } /* MPDL update */ +"\u03a5" { return "*u"; } /* MPDL update */ +"\u03c6" { return "f"; } /* MPDL update */ +"\u03a6" { return "*f"; } /* MPDL update */ +"\u03c7" { return "x"; } /* MPDL update */ +"\u03a7" { return "*x"; } /* MPDL update */ +"\u03c8" { return "y"; } /* MPDL update */ +"\u03a8" { return "*y"; } /* MPDL update */ +"\u03c9" { return "w"; } /* MPDL update */ +"\u03a9" { return "*w"; } /* MPDL update */ + +[\&_]"vert;" { return "|"; } +[\&_]"lpar;" { return "("; } +[\&_]"rpar;" { return ")"; } +[\_\&]"lt;" { return "<"; } +[\_\&]"gt;" { return ">"; } +"'" { return "'"; } /* MPDL update */ + +"&"[a-zA-Z]+";" { return yytext(); } + +. 
{ return yytext(); } +\n { return yytext(); } \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2BetacodeLex.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2BetacodeLex.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1866 @@ +/* The following code was generated by JFlex 1.4.3 on 14.12.10 15:03 */ + +package de.mpg.mpiwg.berlin.mpdl.lt.text.transcode; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 14.12.10 15:03 from the specification file + * /Users/jwillenborg/test/jflex/Unicode2Betacode.lex + */ +public class Unicode2BetacodeLex { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int YYINITIAL = 0; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\0\26\0\1\u0118\1\0\1\u0130\2\0\1\u0113\4\u011c\2\0"+ + "\1\u0112\1\11\1\u011c\1\u0131\2\u011c\1\u0132\5\u011c\1\u0133\1\0\1\u0116"+ + "\1\1\1\u011c\1\2\1\u011b\1\0\5\u0134\1\6\1\u0134\1\4\22\u0134"+ + "\1\u011d\1\0\1\u011a\1\0\1\u012a\1\0\1\u012f\3\u0135\1\u012c\1\7"+ + "\1\u0114\1\5\3\u0135\1\u0119\3\u0135\1\u012e\1\u0135\1\u012d\1\u0135\1\u0115"+ + "\1\3\1\u012b\4\u0135\2\0\1\u0117\71\0\1\12\u0248\0\1\344\1\345"+ + "\1\351\1\0\1\346\1\0\1\347\1\0\1\350\12\0\1\352\1\353"+ + "\16\0\1\354\41\0\1\355\113\0\1\357\1\361\1\363\1\365\1\367"+ + 
"\1\371\1\373\1\375\1\377\1\u0101\1\u0103\1\u0105\1\u0107\1\u0109\1\u010b"+ + "\1\u010d\1\u010f\1\0\1\10\1\u011f\1\u0121\1\u0123\1\u0125\1\u0127\1\u0129"+ + "\7\0\1\356\1\360\1\362\1\364\1\366\1\370\1\372\1\374\1\376"+ + "\1\u0100\1\u0102\1\u0104\1\u0106\1\u0108\1\u010a\1\u010c\1\u010e\1\u0111\1\u0110"+ + "\1\u011e\1\u0120\1\u0122\1\u0124\1\u0126\1\u0128\u1b36\0\1\13\1\14\1\15"+ + "\1\16\1\17\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+ + "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37\1\40\2\0"+ + "\1\41\1\42\1\43\1\44\1\45\1\46\2\0\1\47\1\50\1\51"+ + "\1\52\1\53\1\54\1\55\1\56\1\57\1\60\1\61\1\62\1\63"+ + "\1\64\1\65\1\66\1\67\1\70\1\71\1\72\1\73\1\74\1\75"+ + "\1\76\1\77\1\100\1\101\1\102\1\103\1\104\1\105\1\106\1\107"+ + "\1\110\1\111\1\112\1\113\1\114\2\0\1\115\1\116\1\117\1\120"+ + "\1\121\1\122\2\0\1\123\1\124\1\125\1\126\1\127\1\130\1\131"+ + "\1\132\1\0\1\133\1\0\1\134\1\0\1\135\1\0\1\136\1\137"+ + "\1\140\1\141\1\142\1\143\1\144\1\145\1\146\1\147\1\150\1\151"+ + "\1\152\1\153\1\154\1\155\1\156\1\157\1\160\1\161\1\162\1\163"+ + "\1\164\1\165\1\166\1\167\1\170\1\171\1\172\1\173\1\174\2\0"+ + "\1\175\1\176\1\177\1\200\1\201\1\202\1\203\1\204\1\205\1\206"+ + "\1\207\1\210\1\211\1\212\1\213\1\214\1\215\1\216\1\217\1\220"+ + "\1\221\1\222\1\223\1\224\1\225\1\226\1\227\1\230\1\231\1\232"+ + "\1\233\1\234\1\235\1\236\1\237\1\240\1\241\1\242\1\243\1\244"+ + "\1\245\1\246\1\247\1\250\1\251\1\252\1\253\1\254\1\255\1\256"+ + "\1\257\1\260\1\261\1\0\1\262\1\263\1\264\1\265\1\266\1\267"+ + "\1\270\5\0\1\271\1\272\1\273\1\0\1\274\1\275\1\276\1\277"+ + "\1\300\1\301\1\302\3\0\1\303\1\304\1\305\1\306\2\0\1\307"+ + "\1\310\1\311\1\312\1\313\1\314\4\0\1\315\1\316\1\317\1\320"+ + "\1\321\1\322\1\323\1\324\1\325\1\326\1\327\1\330\1\331\5\0"+ + "\1\332\1\333\1\334\1\0\1\340\1\341\1\342\1\343\1\335\1\336"+ + "\1\337\ue003\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA 
states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\1\0\2\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+ + "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+ + "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37"+ + "\1\40\1\41\1\42\1\43\1\44\1\45\1\46\1\47"+ + "\1\50\1\51\1\52\1\53\1\54\1\55\1\56\1\57"+ + "\1\60\1\61\1\62\1\63\1\64\1\65\1\66\1\67"+ + "\1\70\1\71\1\72\1\73\1\74\1\75\1\76\1\77"+ + "\1\100\1\101\1\102\1\103\1\104\1\105\1\106\1\107"+ + "\1\110\1\111\1\112\1\113\1\114\1\115\1\116\1\117"+ + "\1\120\1\121\1\122\1\123\1\124\1\125\1\126\1\127"+ + "\1\130\1\131\1\132\1\133\1\134\1\135\1\136\1\137"+ + "\1\140\1\141\1\142\1\143\1\144\1\145\1\146\1\147"+ + "\1\150\1\151\1\152\1\153\1\154\1\155\1\156\1\157"+ + "\1\160\1\161\1\162\1\163\1\164\1\165\1\166\1\167"+ + "\1\170\1\171\1\172\1\173\1\174\1\175\1\176\1\177"+ + "\1\200\1\201\1\202\1\203\1\204\1\205\1\206\1\207"+ + "\1\210\1\211\1\212\1\213\1\214\1\215\1\216\1\217"+ + "\1\220\1\221\1\222\1\223\1\224\1\225\1\226\1\227"+ + "\1\230\1\231\1\232\1\233\1\234\1\235\1\236\1\237"+ + "\1\240\1\241\1\242\1\243\1\244\1\245\1\246\1\247"+ + "\1\250\1\251\1\252\1\253\1\254\1\255\1\256\1\257"+ + "\1\260\1\261\1\262\1\263\1\264\1\265\1\266\1\267"+ + "\1\270\1\271\1\272\1\273\1\274\1\275\1\276\1\277"+ + "\1\300\1\301\1\302\1\303\1\304\1\305\1\306\1\307"+ + "\1\310\1\311\1\312\1\313\1\314\1\315\1\316\1\317"+ + "\1\320\1\321\1\322\1\323\1\324\1\325\1\326\1\327"+ + "\1\330\1\331\1\332\1\333\1\334\1\335\1\336\1\337"+ + "\1\340\1\341\1\342\1\343\1\344\1\345\1\346\1\347"+ + "\1\350\1\351\1\352\1\353\1\354\1\355\1\356\1\357"+ + "\1\360\1\361\1\362\1\363\1\364\1\365\1\366\1\367"+ + "\1\370\1\371\1\372\1\373\1\374\1\375\1\376\1\377"+ + "\1\u0100\1\u0101\1\u0102\1\u0103\1\u0104\1\u0105\1\u0106\1\u0107"+ + "\1\u0108\1\u0109\1\u010a\1\u010b\1\u010c\1\u010d\1\u010e\2\1"+ + 
"\1\u010f\1\u0110\1\u0111\1\u0112\1\u0113\1\u0114\1\u0115\1\u0116"+ + "\1\u0117\1\u0118\1\u0119\1\u011a\1\1\3\0\1\u011b\1\0"+ + "\1\u011b\33\0\1\u011c\1\u011d\17\0\1\u011e"; + + private static int [] zzUnpackAction() { + int [] result = new int[338]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\u0136\0\u026c\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + 
"\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u03a2"+ + "\0\u04d8\0\u060e\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+ + "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0744\0\u087a"+ + "\0\u09b0\0\u0ae6\0\u0136\0\u0c1c\0\u0d52\0\u0e88\0\u0fbe\0\u10f4"+ + "\0\u122a\0\u1360\0\u1496\0\u15cc\0\u1702\0\u1838\0\u196e\0\u1aa4"+ + "\0\u1bda\0\u1d10\0\u1e46\0\u1f7c\0\u20b2\0\u21e8\0\u231e\0\u2454"+ + "\0\u258a\0\u26c0\0\u27f6\0\u292c\0\u2a62\0\u2b98\0\u2cce\0\u2e04"+ + "\0\u0136\0\u0136\0\u2f3a\0\u3070\0\u31a6\0\u32dc\0\u3412\0\u3548"+ + "\0\u367e\0\u37b4\0\u38ea\0\u3a20\0\u3b56\0\u3c8c\0\u3dc2\0\u3ef8"+ + "\0\u402e\0\u0136"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[338]; + int offset 
= 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\2\1\3\2\2\1\4\1\5\1\6\1\7\1\10"+ + "\1\11\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+ + "\1\21\1\22\1\23\1\24\1\25\1\26\1\27\1\30"+ + "\1\31\1\32\1\33\1\34\1\35\1\36\1\37\1\40"+ + "\1\41\1\42\1\43\1\44\1\45\1\46\1\47\1\50"+ + "\1\51\1\52\1\53\1\54\1\55\1\56\1\57\1\60"+ + "\1\61\1\62\1\63\1\64\1\65\1\66\1\67\1\70"+ + "\1\71\1\72\1\73\1\74\1\75\1\76\1\77\1\100"+ + "\1\101\1\102\1\103\1\104\1\105\1\106\1\107\1\110"+ + "\1\111\1\112\1\113\1\114\1\115\1\116\1\117\1\120"+ + "\1\121\1\122\1\123\1\124\1\125\1\126\1\127\1\130"+ + "\1\131\1\132\1\133\1\134\1\135\1\136\1\137\1\140"+ + "\1\141\1\142\1\143\1\144\1\145\1\146\1\147\1\150"+ + "\1\151\1\152\1\153\1\154\1\155\1\156\1\157\1\160"+ + "\1\161\1\162\1\163\1\164\1\165\1\166\1\167\1\170"+ + "\1\171\1\172\1\173\1\174\1\175\1\176\1\177\1\200"+ + "\1\201\1\202\1\203\1\204\1\205\1\206\1\207\1\210"+ + "\1\211\1\212\1\213\1\214\1\215\1\216\1\217\1\220"+ + "\1\221\1\222\1\223\1\224\1\225\1\226\1\227\1\230"+ + "\1\231\1\232\1\233\1\234\1\235\1\236\1\237\1\240"+ + "\1\241\1\242\1\243\1\244\1\245\1\246\1\247\1\250"+ + "\1\251\1\252\1\253\1\254\1\255\1\256\1\257\1\260"+ + "\1\261\1\262\1\263\1\264\1\265\1\266\1\267\1\270"+ + "\1\271\1\272\1\273\1\274\1\275\1\276\1\277\1\300"+ + "\1\301\1\302\1\303\1\304\1\305\1\306\1\307\1\310"+ + "\1\311\1\312\1\313\1\314\1\315\1\316\1\317\1\320"+ + "\1\321\1\322\1\323\1\324\1\325\1\326\1\327\1\330"+ + 
"\1\331\1\332\1\333\1\334\1\335\1\336\1\337\1\340"+ + "\1\341\1\342\1\343\1\344\1\345\1\346\1\347\1\350"+ + "\1\351\1\352\1\353\1\354\1\355\1\356\1\357\1\360"+ + "\1\361\1\362\1\363\1\364\1\365\1\366\1\367\1\370"+ + "\1\371\1\372\1\373\1\374\1\375\1\376\1\377\1\u0100"+ + "\1\u0101\1\u0102\1\u0103\1\u0104\1\u0105\1\u0106\1\u0107\1\u0108"+ + "\1\u0109\1\u010a\1\u010b\1\u010c\1\u010d\1\u010e\1\u010f\1\u0110"+ + "\1\u0111\1\2\1\u0112\12\2\1\u0113\1\u0114\1\u0115\1\u0116"+ + "\1\u0117\1\u0118\1\u0119\1\u011a\1\u011b\1\u011c\1\u011d\1\u011e"+ + "\1\u011f\13\2\u0136\0\2\u0120\1\0\u0133\u0120\u0113\0\1\u0121"+ + "\6\0\1\u0122\2\0\1\u0122\30\0\3\u0123\1\0\1\u0123"+ + "\1\0\1\u0123\1\0\u010a\u0123\1\u0124\1\u0125\2\0\3\u0123"+ + "\1\0\1\u0123\1\u0126\2\0\15\u0123\5\0\1\u0123\3\0"+ + "\1\u0123\4\0\5\u0127\u010c\0\1\u0128\1\u0127\3\0\1\u0129"+ + "\21\0\1\u012a\1\u0127\1\u012b\2\u0127\1\u012c\3\0\2\u0127"+ + "\u0114\0\1\u012d\4\0\1\u012e\21\0\1\u012f\1\0\1\u0130"+ + "\13\0\1\u0131\u0246\0\1\u0132\44\0\1\u0123\1\0\1\u0123"+ + "\1\0\1\u0123\u010a\0\1\u0123\1\0\2\u0123\2\0\2\u0123"+ + "\1\0\2\u0123\16\0\5\u0123\1\0\3\u0123\1\0\1\u0123"+ + "\u0112\0\1\u0123\u013c\0\1\u0133\34\0\3\u0123\1\0\1\u0123"+ + "\1\0\1\u0123\1\0\u010a\u0123\1\0\1\u0123\2\0\3\u0123"+ + "\1\0\1\u0123\3\0\15\u0123\5\0\1\u0123\3\0\1\u0123"+ + "\4\0\5\u0127\u010c\0\2\u0127\1\2\2\0\1\u0127\21\0"+ + "\5\u0127\4\0\2\u0127\3\0\5\u0127\u010c\0\1\u0127\1\u0134"+ + "\1\2\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127\3\0"+ + "\5\u0127\u010c\0\1\u0127\1\u0135\1\2\2\0\1\u0127\21\0"+ + "\3\u0127\1\u0136\1\u0127\4\0\2\u0127\3\0\5\u0127\u010c\0"+ + "\2\u0127\1\2\2\0\1\u0127\21\0\1\u0127\1\u0137\3\u0127"+ + "\4\0\2\u0127\3\0\5\u0127\u010c\0\2\u0127\1\2\2\0"+ + "\1\u0127\21\0\3\u0127\1\u0138\1\u0127\4\0\2\u0127\u0131\0"+ + "\1\u0139\u0119\0\1\u013a\u0135\0\1\u013b\30\0\1\u013c\u0133\0"+ + "\1\u013d\u0137\0\1\u013e\11\0\1\2\1\u0131\u0247\0\1\u013f"+ + "\u0135\0\1\u0140\43\0\5\u0127\u010c\0\2\u0127\1\u0141\2\0"+ + 
"\1\u0127\21\0\5\u0127\4\0\2\u0127\3\0\5\u0127\u010c\0"+ + "\2\u0127\1\u0142\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127"+ + "\3\0\5\u0127\u010c\0\2\u0127\1\2\2\0\1\u0127\21\0"+ + "\4\u0127\1\u0143\4\0\2\u0127\3\0\5\u0127\u010c\0\2\u0127"+ + "\1\2\2\0\1\u0127\21\0\2\u0127\1\u0144\2\u0127\4\0"+ + "\2\u0127\3\0\5\u0127\u010c\0\2\u0127\1\2\2\0\1\u0127"+ + "\21\0\4\u0127\1\u0145\4\0\2\u0127\u0132\0\1\u0146\u0119\0"+ + "\1\u0141\u0135\0\1\u0142\u014e\0\1\u0147\u0133\0\1\u0148\u0137\0"+ + "\1\u0149\u011c\0\1\u014a\u0135\0\1\u0123\42\0\5\u0127\u010c\0"+ + "\2\u0127\1\2\2\0\1\u0127\21\0\2\u0127\1\u014b\2\u0127"+ + "\4\0\2\u0127\3\0\5\u0127\u010c\0\1\u0127\1\u014c\1\2"+ + "\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127\3\0\5\u0127"+ + "\u010c\0\2\u0127\1\2\2\0\1\u0127\21\0\2\u0127\1\u014d"+ + "\2\u0127\4\0\2\u0127\u0133\0\1\u014e\u012f\0\1\u014f\u011d\0"+ + "\1\u0150\u014d\0\1\u0151\u011f\0\1\u0122\41\0\5\u0127\u010c\0"+ + "\2\u0127\1\353\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127"+ + "\3\0\5\u0127\u010c\0\2\u0127\1\355\2\0\1\u0127\21\0"+ + "\5\u0127\4\0\2\u0127\3\0\5\u0127\u010c\0\2\u0127\1\352"+ + "\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127\u0116\0\1\u0152"+ + "\u0135\0\1\353\u0135\0\1\355\u0135\0\1\352\37\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[16740]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + 
"Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\1\0\1\11\1\1\u010c\11\3\1\14\11\1\1\3\0"+ + "\1\11\1\0\1\1\33\0\2\11\17\0\1\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[338]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int 
yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public Unicode2BetacodeLex(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public Unicode2BetacodeLex(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 724) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? 
*/ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. 
+ */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. 
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 266: + { return "p"; + } + case 287: break; + case 102: + { return "*(w"; + } + case 288: break; + case 20: + { return "*(\\a"; + } + case 289: break; + case 21: + { return "*)/a"; + } + case 290: break; + case 181: + { return "*a/"; + } + case 291: break; + case 237: + { return "*a"; + } + case 292: break; + case 260: + { return "n"; + } + case 293: break; + case 89: + { return "*(u"; + } + case 294: break; + case 16: + { return "a(="; + } + case 295: break; + case 30: + { return "e(/"; + } + case 296: break; + case 195: + { return "i+\\"; + } + case 297: break; + case 222: + { return "w="; + } + case 298: break; + case 210: + { return "u+="; + } + case 299: break; + case 99: + { return "w)="; + } + case 300: break; + case 256: + { return "l"; + } + case 301: break; + case 205: + { return "u+\\"; + } + case 302: break; + case 23: + { return "*)=a"; + } + case 303: break; + case 225: + { return "*o/"; + } + case 304: break; + case 44: + { return "h(="; + } + case 305: break; + case 3: + { return "j"; + } + case 306: break; + case 103: + { return "*)\\w"; + } + case 307: break; + case 152: + { return "*(/|h"; + } + case 308: break; + case 165: + { return "*)\\|w"; + } + case 309: break; + case 248: + { return "h"; + } + case 310: break; + case 76: + { return "*(o"; + } + case 311: break; + case 159: + { return "w)/|"; + } + case 312: break; + case 178: + { return "*a^"; + } + case 313: break; + case 141: + { return "h)\\|"; + } + case 314: break; + case 106: + { return "*(/w"; + } + case 315: break; + case 275: + { return "f"; + } + case 316: break; + case 227: + { return "/"; + } + case 317: break; + case 91: + { return "*(/u"; + } + case 318: break; + case 242: + { return "d"; + } + case 319: break; + case 161: + { return "w)=|"; + } + case 320: break; + case 57: + { return "i)/"; + } + case 321: break; + case 154: + { return "*(=|h"; + } + case 322: break; + case 95: + { return "w)\\"; + } + case 323: break; + case 108: + 
{ return "*(=w"; + } + case 324: break; + case 116: + { return "i/"; + } + case 325: break; + case 238: + { return "b"; + } + case 326: break; + case 207: + { return "r)"; + } + case 327: break; + case 147: + { return "*)|h"; + } + case 328: break; + case 62: + { return "*(i"; + } + case 329: break; + case 230: + { return "+"; + } + case 330: break; + case 77: + { return "*)\\o"; + } + case 331: break; + case 166: + { return "*(\\|w"; + } + case 332: break; + case 71: + { return "o)\\"; + } + case 333: break; + case 92: + { return "*(=u"; + } + case 334: break; + case 232: + { return ")"; + } + case 335: break; + case 14: + { return "a(/"; + } + case 336: break; + case 122: + { return "w/"; + } + case 337: break; + case 206: + { return "u+/"; + } + case 338: break; + case 80: + { return "*(/o"; + } + case 339: break; + case 97: + { return "w)/"; + } + case 340: break; + case 123: + { return "a)|"; + } + case 341: break; + case 229: + { return "^"; + } + case 342: break; + case 32: + { return "*(e"; + } + case 343: break; + case 286: + { return "'"; + } + case 344: break; + case 42: + { return "h(/"; + } + case 345: break; + case 53: + { return "i)"; + } + case 346: break; + case 174: + { return "a|"; + } + case 347: break; + case 63: + { return "*)\\i"; + } + case 348: break; + case 139: + { return "h)|"; + } + case 349: break; + case 193: + { return "i^"; + } + case 350: break; + case 18: + { return "*(a"; + } + case 351: break; + case 74: + { return "o(/"; + } + case 352: break; + case 93: + { return "w)"; + } + case 353: break; + case 66: + { return "*(/i"; + } + case 354: break; + case 101: + { return "*)w"; + } + case 355: break; + case 7: + { return "!"; + } + case 356: break; + case 33: + { return "*)\\e"; + } + case 357: break; + case 15: + { return "a)="; + } + case 358: break; + case 29: + { return "e)/"; + } + case 359: break; + case 68: + { return "*(=i"; + } + case 360: break; + case 125: + { return "a)\\|"; + } + case 361: break; + case 36: + { return 
"*(/e"; + } + case 362: break; + case 115: + { return "i\\"; + } + case 363: break; + case 201: + { return "*i\\"; + } + case 364: break; + case 112: + { return "e/"; + } + case 365: break; + case 218: + { return "w/|"; + } + case 366: break; + case 176: + { return "a="; + } + case 367: break; + case 19: + { return "*)\\a"; + } + case 368: break; + case 43: + { return "h)="; + } + case 369: break; + case 133: + { return "*)\\|a"; + } + case 370: break; + case 270: + { return "s1"; + } + case 371: break; + case 247: + { return "*z"; + } + case 372: break; + case 204: + { return "u_"; + } + case 373: break; + case 143: + { return "h)/|"; + } + case 374: break; + case 22: + { return "*(/a"; + } + case 375: break; + case 82: + { return "u("; + } + case 376: break; + case 75: + { return "*)o"; + } + case 377: break; + case 223: + { return "w=|"; + } + case 378: break; + case 278: + { return "*x"; + } + case 379: break; + case 121: + { return "w\\"; + } + case 380: break; + case 200: + { return "*i_"; + } + case 381: break; + case 219: + { return "*w\\"; + } + case 382: break; + case 25: + { return "e)"; + } + case 383: break; + case 145: + { return "h)=|"; + } + case 384: break; + case 151: + { return "*)/|h"; + } + case 385: break; + case 24: + { return "*(=a"; + } + case 386: break; + case 4: + { return "*v"; + } + case 387: break; + case 192: + { return "*h|"; + } + case 388: break; + case 39: + { return "h)\\"; + } + case 389: break; + case 272: + { return "*t"; + } + case 390: break; + case 134: + { return "*(\\|a"; + } + case 391: break; + case 214: + { return "*u/"; + } + case 392: break; + case 61: + { return "*)i"; + } + case 393: break; + case 269: + { return "*r"; + } + case 394: break; + case 160: + { return "w(/|"; + } + case 395: break; + case 13: + { return "a)/"; + } + case 396: break; + case 153: + { return "*)=|h"; + } + case 397: break; + case 267: + { return "*p"; + } + case 398: break; + case 111: + { return "e\\"; + } + case 399: break; + case 88: 
+ { return "u(="; + } + case 400: break; + case 31: + { return "*)e"; + } + case 401: break; + case 188: + { return "*e\\"; + } + case 402: break; + case 110: + { return "a/"; + } + case 403: break; + case 162: + { return "w(=|"; + } + case 404: break; + case 41: + { return "h)/"; + } + case 405: break; + case 261: + { return "*n"; + } + case 406: break; + case 226: + { return "\\"; + } + case 407: break; + case 96: + { return "w(\\"; + } + case 408: break; + case 148: + { return "*(|h"; + } + case 409: break; + case 257: + { return "*l"; + } + case 410: break; + case 211: + { return "*u^"; + } + case 411: break; + case 198: + { return "i+="; + } + case 412: break; + case 279: + { return "y"; + } + case 413: break; + case 17: + { return "*)a"; + } + case 414: break; + case 73: + { return "o)/"; + } + case 415: break; + case 72: + { return "o(\\"; + } + case 416: break; + case 118: + { return "o/"; + } + case 417: break; + case 168: + { return "*(/|w"; + } + case 418: break; + case 2: + { return "*j"; + } + case 419: break; + case 281: + { return "w"; + } + case 420: break; + case 48: + { return "*(\\h"; + } + case 421: break; + case 49: + { return "*)/h"; + } + case 422: break; + case 9: + { return "a)"; + } + case 423: break; + case 216: + { return "w\\|"; + } + case 424: break; + case 249: + { return "*h"; + } + case 425: break; + case 273: + { return "u"; + } + case 426: break; + case 171: + { return "a^"; + } + case 427: break; + case 175: + { return "a/|"; + } + case 428: break; + case 285: + { return "<"; + } + case 429: break; + case 276: + { return "*f"; + } + case 430: break; + case 38: + { return "h("; + } + case 431: break; + case 283: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { return "s"; + } + case 432: break; + case 51: + { return "*)=h"; + } + case 433: break; + case 127: + { return "a)/|"; + } + case 434: break; + case 170: + { return "*(=|w"; + } + case 435: break; + case 69: + { return "o)"; + } + case 
436: break; + case 243: + { return "*d"; + } + case 437: break; + case 185: + { return "h/|"; + } + case 438: break; + case 250: + { return "q"; + } + case 439: break; + case 163: + { return "*)|w"; + } + case 440: break; + case 8: + { return ":"; + } + case 441: break; + case 177: + { return "a=|"; + } + case 442: break; + case 239: + { return "*b"; + } + case 443: break; + case 158: + { return "w(\\|"; + } + case 444: break; + case 109: + { return "a\\"; + } + case 445: break; + case 264: + { return "o"; + } + case 446: break; + case 129: + { return "a)=|"; + } + case 447: break; + case 86: + { return "u(/"; + } + case 448: break; + case 180: + { return "*a\\"; + } + case 449: break; + case 11: + { return "a)\\"; + } + case 450: break; + case 187: + { return "h=|"; + } + case 451: break; + case 258: + { return "m"; + } + case 452: break; + case 191: + { return "*h/"; + } + case 453: break; + case 113: + { return "h\\"; + } + case 454: break; + case 190: + { return "*h\\"; + } + case 455: break; + case 196: + { return "i+/"; + } + case 456: break; + case 254: + { return "k"; + } + case 457: break; + case 215: + { return "*(r"; + } + case 458: break; + case 27: + { return "e)\\"; + } + case 459: break; + case 117: + { return "o\\"; + } + case 460: break; + case 252: + { return "i"; + } + case 461: break; + case 224: + { return "*o\\"; + } + case 462: break; + case 144: + { return "h(/|"; + } + case 463: break; + case 179: + { return "*a_"; + } + case 464: break; + case 221: + { return "*w|"; + } + case 465: break; + case 240: + { return "g"; + } + case 466: break; + case 55: + { return "i)\\"; + } + case 467: break; + case 209: + { return "u="; + } + case 468: break; + case 87: + { return "u)="; + } + case 469: break; + case 244: + { return "e"; + } + case 470: break; + case 146: + { return "h(=|"; + } + case 471: break; + case 83: + { return "u)\\"; + } + case 472: break; + case 40: + { return "h(\\"; + } + case 473: break; + case 262: + { return "c"; + } + case 
474: break; + case 136: + { return "*(/|a"; + } + case 475: break; + case 236: + { return "a"; + } + case 476: break; + case 208: + { return "r("; + } + case 477: break; + case 46: + { return "*(h"; + } + case 478: break; + case 228: + { return "_"; + } + case 479: break; + case 183: + { return "h\\|"; + } + case 480: break; + case 233: + { return "("; + } + case 481: break; + case 138: + { return "*(=|a"; + } + case 482: break; + case 194: + { return "i_"; + } + case 483: break; + case 167: + { return "*)/|w"; + } + case 484: break; + case 54: + { return "i("; + } + case 485: break; + case 131: + { return "*)|a"; + } + case 486: break; + case 47: + { return "*)\\h"; + } + case 487: break; + case 184: + { return "h|"; + } + case 488: break; + case 149: + { return "*)\\|h"; + } + case 489: break; + case 94: + { return "w("; + } + case 490: break; + case 50: + { return "*(/h"; + } + case 491: break; + case 120: + { return "u/"; + } + case 492: break; + case 85: + { return "u)/"; + } + case 493: break; + case 169: + { return "*)=|w"; + } + case 494: break; + case 156: + { return "w(|"; + } + case 495: break; + case 202: + { return "*i/"; + } + case 496: break; + case 52: + { return "*(=h"; + } + case 497: break; + case 128: + { return "a(/|"; + } + case 498: break; + case 157: + { return "w)\\|"; + } + case 499: break; + case 60: + { return "i(="; + } + case 500: break; + case 164: + { return "*(|w"; + } + case 501: break; + case 150: + { return "*(\\|h"; + } + case 502: break; + case 220: + { return "*w/"; + } + case 503: break; + case 186: + { return "h="; + } + case 504: break; + case 81: + { return "u)"; + } + case 505: break; + case 130: + { return "a(=|"; + } + case 506: break; + case 280: + { return "*y"; + } + case 507: break; + case 203: + { return "u^"; + } + case 508: break; + case 104: + { return "*(\\w"; + } + case 509: break; + case 12: + { return "a(\\"; + } + case 510: break; + case 105: + { return "*)/w"; + } + case 511: break; + case 182: + { return 
"*a|"; + } + case 512: break; + case 282: + { return "*w"; + } + case 513: break; + case 199: + { return "*i^"; + } + case 514: break; + case 100: + { return "w(="; + } + case 515: break; + case 90: + { return "*(\\u"; + } + case 516: break; + case 26: + { return "e("; + } + case 517: break; + case 1: + { return yytext(); + } + case 518: break; + case 142: + { return "h(\\|"; + } + case 519: break; + case 274: + { return "*u"; + } + case 520: break; + case 28: + { return "e(\\"; + } + case 521: break; + case 107: + { return "*)=w"; + } + case 522: break; + case 173: + { return "a\\|"; + } + case 523: break; + case 6: + { return "*s"; + } + case 524: break; + case 45: + { return "*)h"; + } + case 525: break; + case 251: + { return "*q"; + } + case 526: break; + case 119: + { return "u\\"; + } + case 527: break; + case 56: + { return "i(\\"; + } + case 528: break; + case 213: + { return "*u\\"; + } + case 529: break; + case 284: + { return ">"; + } + case 530: break; + case 78: + { return "*(\\o"; + } + case 531: break; + case 189: + { return "*e/"; + } + case 532: break; + case 79: + { return "*)/o"; + } + case 533: break; + case 265: + { return "*o"; + } + case 534: break; + case 135: + { return "*)/|a"; + } + case 535: break; + case 84: + { return "u(\\"; + } + case 536: break; + case 235: + { return "|"; + } + case 537: break; + case 58: + { return "i(/"; + } + case 538: break; + case 259: + { return "*m"; + } + case 539: break; + case 212: + { return "*u_"; + } + case 540: break; + case 114: + { return "h/"; + } + case 541: break; + case 246: + { return "z"; + } + case 542: break; + case 255: + { return "*k"; + } + case 543: break; + case 277: + { return "x"; + } + case 544: break; + case 64: + { return "*(\\i"; + } + case 545: break; + case 65: + { return "*)/i"; + } + case 546: break; + case 137: + { return "*)=|a"; + } + case 547: break; + case 253: + { return "*i"; + } + case 548: break; + case 98: + { return "w(/"; + } + case 549: break; + case 5: + { 
return "v"; + } + case 550: break; + case 124: + { return "a(|"; + } + case 551: break; + case 234: + { return "?"; + } + case 552: break; + case 172: + { return "a_"; + } + case 553: break; + case 217: + { return "w|"; + } + case 554: break; + case 10: + { return "a("; + } + case 555: break; + case 241: + { return "*g"; + } + case 556: break; + case 155: + { return "w)|"; + } + case 557: break; + case 37: + { return "h)"; + } + case 558: break; + case 271: + { return "t"; + } + case 559: break; + case 231: + { return "="; + } + case 560: break; + case 67: + { return "*)=i"; + } + case 561: break; + case 34: + { return "*(\\e"; + } + case 562: break; + case 35: + { return "*)/e"; + } + case 563: break; + case 140: + { return "h(|"; + } + case 564: break; + case 132: + { return "*(|a"; + } + case 565: break; + case 245: + { return "*e"; + } + case 566: break; + case 268: + { return "r"; + } + case 567: break; + case 59: + { return "i)="; + } + case 568: break; + case 70: + { return "o("; + } + case 569: break; + case 126: + { return "a(\\|"; + } + case 570: break; + case 263: + { return "*c"; + } + case 571: break; + case 197: + { return "i="; + } + case 572: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2Buckwalter.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2Buckwalter.lex Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,121 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +%% +%{ + /* + * Betacode to Unicode conversion + */ + +%} + +%class Unicode2BuckwalterLex +%public +%type java.lang.String +%unicode +%% + + +"<"[^>]+">" { return yytext(); } + +"\u0621" { return "'"; } /* Hamza */ +"\u0622" { return "|"; } /* ALEF 
WITH MADDA ABOVE from AraMorph */ +"\u0623" { return ">"; } /* Hamza */ +"\u0624" { return "&"; } /* Hamza */ +"\u0625" { return "<"; } /* Alif + HamzaBelow */ +"\u0626" { return "}"; } /* Ya + HamzaAbove */ +"\u0627" { return "A"; } /* Alif */ +"\u0628" { return "b"; } /* Ba */ +"\u0629" { return "p"; } /* TaMarbuta */ +"\u062A" { return "t"; } /* Ta */ +"\u062B" { return "v"; } /* Tha */ +"\u062C" { return "j"; } /* Jeem */ +"\u062D" { return "H"; } /* HHa */ +"\u062E" { return "x"; } /* Kha */ +"\u062F" { return "d"; } /* Dal */ +"\u0630" { return "*"; } /* Thal */ +"\u0631" { return "r"; } /* Ra */ +"\u0632" { return "z"; } /* Zain */ +"\u0633" { return "s"; } /* Seen */ +"\u0634" { return "$"; } /* Sheen */ +"\u0635" { return "S"; } /* Sad */ +"\u0636" { return "D"; } /* DDad */ +"\u0637" { return "T"; } /* TTa */ +"\u0638" { return "Z"; } /* DTha */ +"\u0639" { return "E"; } /* Ain */ +"\u063A" { return "g"; } /* Ghain */ + +"\u0640" { return "_"; } /* Tatweel */ +"\u0641" { return "f"; } /* Fa */ +"\u0642" { return "q"; } /* Qaf */ +"\u0643" { return "k"; } /* Kaf */ +"\u0644" { return "l"; } /* Lam */ +"\u0645" { return "m"; } /* Meem */ +"\u0646" { return "n"; } /* Noon */ +"\u0647" { return "h"; } /* Ha */ +"\u0648" { return "w"; } /* Waw */ +"\u0649" { return "Y"; } /* AlifMaksura */ +"\u064A" { return "y"; } /* Ya */ +"\u064B" { return "F"; } /* Fathatan */ +"\u064C" { return "N"; } /* Dammatan */ +"\u064D" { return "K"; } /* Kasratan */ +"\u064E" { return "a"; } /* Fatha */ +"\u064F" { return "u"; } /* Damma */ +"\u0650" { return "i"; } /* Kasra */ +"\u0651" { return "~"; } /* Shadda */ +"\u0652" { return "o"; } /* Sukun */ +"\u0653" { return "^"; } /* Maddah */ +"\u0654" { return "#"; } /* HamzaAbove */ + +"\u0670" { return "`"; } /* AlifKhanjareeya */ +"\u0671" { return "{"; } /* Alif + HamzatWasl */ + +"\u067E" { return "P"; } /* PEH from AraMorph */ +"\u0686" { return "J"; } /* TCHEH from AraMorph */ +"\u06A4" { return "V"; } /* VEH from AraMorph 
*/ +"\u06AF" { return "G"; } /* GAF from AraMorph */ +"\u0698" { return "R"; } /* JEH from AraMorph */ +"\u061F" { return "?"; } /* QUESTION MARK from AraMorph */ + +"\u06DC" { return ":"; } /* SmallHighSeen */ +"\u06DF" { return "@"; } /* SmallHighRoundedZero */ + +"\u06E2" { return "["; } /* SmallHighMeemIsolatedForm */ +"\u06E3" { return ";"; } /* SmallLowSeen */ +"\u06E5" { return ","; } /* SmallWaw */ +"\u06E6" { return "."; } /* SmallYa */ +"\u06E8" { return "!"; } /* SmallHighNoon */ +"\u06EA" { return "-"; } /* EmptyCentreLowStop */ +"\u06EB" { return "+"; } /* EmptyCentreHighStop */ +"\u06EC" { return "%"; } /* RoundedHighStopWithFilledCentre */ +"\u06ED" { return "]"; } /* SmallLowMeem */ + +[\&_]"vert;" { return "|"; } +[\&_]"lpar;" { return "("; } +[\&_]"rpar;" { return ")"; } +[\_\&]"lt;" { return "<"; } +[\_\&]"gt;" { return ">"; } +"'" { return "'"; } + +"&"[a-zA-Z]+";" { return yytext(); } + +. { return yytext(); } +\n { return yytext(); } + +/* make problemes */ +/* "\u06E0" { return "\\""; } SmallHighUprightRectangularZero */ + + +/* double entries */ +/* "\u060C" { return ","; } COMMA from AraMorph */ +/* "\u061B" { return ";"; } SEMICOLON from AraMorph */ + +/* not in buckwalter contained */ +/* \u0679 : ARABIC LETTER TTEH */ +/* \u0688 : ARABIC LETTER DDAL */ +/* \u06A9 : ARABIC LETTER KEHEH */ +/* \u0691 : ARABIC LETTER RREH */ +/* \u06BA : ARABIC LETTER NOON GHUNNA */ +/* \u06BE : ARABIC LETTER HEH DOACHASHMEE */ +/* \u06C1 : ARABIC LETTER HEH GOAL */ +/* \u06D2 : ARABIC LETTER YEH BARREE */ + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2BuckwalterLex.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2BuckwalterLex.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,882 @@ +/* The following code was generated by JFlex 1.4.3 on 14.12.10 17:12 */ + +package 
de.mpg.mpiwg.berlin.mpdl.lt.text.transcode; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 14.12.10 17:12 from the specification file + * /Users/jwillenborg/test/jflex/Unicode2Buckwalter.lex + */ +public class Unicode2BuckwalterLex { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int YYINITIAL = 0; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\0\30\0\1\120\2\0\1\117\11\0\1\121\2\0\1\122"+ + "\5\0\1\123\1\0\1\112\1\1\1\0\1\2\2\0\32\124\4\0"+ + "\1\105\1\0\1\115\3\124\1\107\1\124\1\116\4\124\1\113\3\124"+ + "\1\114\1\124\1\110\1\124\1\111\1\124\1\106\4\124\u05a4\0\1\71"+ + "\1\0\1\3\1\4\1\5\1\6\1\7\1\10\1\11\1\12\1\13"+ + "\1\14\1\15\1\16\1\17\1\20\1\21\1\22\1\23\1\24\1\25"+ + "\1\26\1\27\1\30\1\31\1\32\1\33\1\34\5\0\1\35\1\36"+ + "\1\37\1\40\1\41\1\42\1\43\1\44\1\45\1\46\1\47\1\50"+ + "\1\51\1\52\1\53\1\54\1\55\1\56\1\57\1\60\1\61\33\0"+ + "\1\62\1\63\14\0\1\64\7\0\1\65\21\0\1\70\13\0\1\66"+ + "\12\0\1\67\54\0\1\72\2\0\1\73\2\0\1\74\1\75\1\0"+ + "\1\76\1\77\1\0\1\100\1\0\1\101\1\102\1\103\1\104\uf912\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. 
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\1\0\2\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+ + "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+ + "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37"+ + "\1\40\1\41\1\42\1\43\1\44\1\45\1\46\1\47"+ + "\1\50\1\51\1\52\1\53\1\54\1\55\1\56\1\57"+ + "\1\60\1\61\1\62\1\63\1\64\1\65\1\66\1\67"+ + "\1\70\1\71\1\72\1\73\1\74\1\75\1\76\1\77"+ + "\1\100\1\101\1\102\1\103\2\1\30\0\1\104\1\0"+ + "\1\105\13\0\1\106\1\107"; + + private static int [] zzUnpackAction() { + int [] result = new int[111]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\125\0\252\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+ + "\0\125\0\125\0\125\0\125\0\125\0\377\0\u0154\0\u01a9"+ + "\0\u01fe\0\u0253\0\u02a8\0\u02fd\0\u0352\0\u03a7\0\u03fc\0\u0451"+ + "\0\u04a6\0\u04fb\0\u0550\0\u05a5\0\u05fa\0\u064f\0\u06a4\0\u06f9"+ + "\0\u074e\0\u07a3\0\u07f8\0\u084d\0\u08a2\0\u08f7\0\u094c\0\125"+ + 
"\0\u09a1\0\125\0\u09f6\0\u0a4b\0\u0aa0\0\u0af5\0\u0b4a\0\u0b9f"+ + "\0\u0bf4\0\u0c49\0\u0c9e\0\u0cf3\0\u0d48\0\125\0\125"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[111]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\2\1\3\1\2\1\4\1\5\1\6\1\7\1\10"+ + "\1\11\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+ + "\1\21\1\22\1\23\1\24\1\25\1\26\1\27\1\30"+ + "\1\31\1\32\1\33\1\34\1\35\1\36\1\37\1\40"+ + "\1\41\1\42\1\43\1\44\1\45\1\46\1\47\1\50"+ + "\1\51\1\52\1\53\1\54\1\55\1\56\1\57\1\60"+ + "\1\61\1\62\1\63\1\64\1\65\1\66\1\67\1\70"+ + "\1\71\1\72\1\73\1\74\1\75\1\76\1\77\1\100"+ + "\1\101\1\102\1\103\1\104\1\105\1\106\11\2\1\107"+ + "\5\2\125\0\2\110\1\0\122\110\106\0\1\111\1\0"+ + "\1\112\2\0\1\113\2\0\1\114\114\0\1\115\1\116"+ + "\1\117\1\116\1\0\1\120\2\116\1\121\1\0\1\122"+ + "\3\0\1\116\2\110\1\2\122\110\107\0\1\123\131\0"+ + "\1\124\121\0\1\125\2\0\1\126\121\0\1\127\121\0"+ + "\1\116\1\130\2\116\1\2\4\116\5\0\1\116\106\0"+ + "\4\116\1\2\4\116\5\0\1\116\106\0\4\116\1\2"+ + "\1\116\1\131\2\116\5\0\1\116\106\0\3\116\1\132"+ + "\1\2\1\116\1\133\2\116\5\0\1\116\106\0\3\116"+ + "\1\134\1\2\4\116\5\0\1\116\121\0\1\135\113\0"+ + "\1\136\131\0\1\137\121\0\1\140\127\0\1\141\121\0"+ + "\1\142\120\0\2\116\1\143\1\116\1\2\4\116\5\0"+ + "\1\116\106\0\4\116\1\2\2\116\1\144\1\116\5\0"+ + "\1\116\106\0\4\116\1\140\4\116\5\0\1\116\106\0"+ + "\4\116\1\2\2\116\1\145\1\116\5\0\1\116\106\0"+ + 
"\4\116\1\142\4\116\5\0\1\116\122\0\1\146\113\0"+ + "\1\147\123\0\1\150\124\0\1\151\122\0\3\116\1\152"+ + "\1\2\4\116\5\0\1\116\106\0\2\116\1\153\1\116"+ + "\1\2\4\116\5\0\1\116\106\0\2\116\1\154\1\116"+ + "\1\2\4\116\5\0\1\116\123\0\1\155\113\0\1\5"+ + "\124\0\1\156\124\0\1\157\120\0\4\116\1\5\4\116"+ + "\5\0\1\116\106\0\4\116\1\156\4\116\5\0\1\116"+ + "\106\0\4\116\1\157\4\116\5\0\1\116\112\0\1\4"+ + "\12\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[3485]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\1\0\1\11\1\1\102\11\2\1\30\0\1\11\1\0"+ + "\1\11\13\0\2\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[111]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) 
{ + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + /* + * Betacode to Unicode conversion + */ + + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public Unicode2BuckwalterLex(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. 
+ */ + public Unicode2BuckwalterLex(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 240) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. 
+ */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. 
+ * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. 
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 23: + { return "D"; + } + case 72: break; + case 17: + { return "*"; + } + case 73: break; + case 46: + { return "o"; + } + case 74: break; + case 60: + { return ";"; + } + case 75: break; + case 63: + { return "!"; + } + case 76: break; + case 29: + { return "f"; + } + case 77: break; + case 36: + { return "w"; + } + case 78: break; + case 67: + { return "]"; + } + case 79: break; + case 70: + { return ")"; + } + case 80: break; + case 69: + { return ">"; + } + case 81: break; + case 34: + { return "n"; + } + case 82: break; + case 24: + { return "T"; + } + case 83: break; + case 57: + { return ":"; + } + case 84: break; + case 41: + { return "K"; + } + case 85: break; + case 12: + { return "v"; + } + case 86: break; + case 71: + { return "("; + } + case 87: break; + case 33: + { return "m"; + } + case 88: break; + case 22: + { return "S"; + } + case 89: break; + case 45: + { return "~"; + } + case 90: break; + case 16: + { return "d"; + } + case 91: break; + case 52: + { return "J"; + } + case 92: break; + case 43: + { return "u"; + } + case 93: break; + case 59: + { return "["; + } + case 94: break; + case 8: + { return "A"; + } + case 95: break; + case 2: + { return "'"; + } + case 96: break; + case 32: + { return "l"; + } + case 97: break; + case 55: + { return "R"; + } + case 98: break; + case 7: + { return "}"; + } + case 99: break; + case 11: + { return "t"; + } + case 100: break; + case 25: + { return "Z"; + } + case 101: break; + case 58: + { return "@"; + } + case 102: break; + case 5: + { return "&"; + } + case 103: break; + case 31: + { return "k"; + } + case 104: break; + case 3: + { return "|"; + } + case 105: break; + case 9: + { return "b"; + } + case 106: break; + case 14: + { return "H"; + } + case 107: break; + case 62: + { return "."; + } + case 108: break; + case 20: + { return "s"; + } + case 109: break; + case 37: + { return "Y"; + } + case 110: break; + case 56: + { return "?"; + } + case 111: break; 
+ case 66: + { return "%"; + } + case 112: break; + case 13: + { return "j"; + } + case 113: break; + case 51: + { return "P"; + } + case 114: break; + case 50: + { return "{"; + } + case 115: break; + case 1: + { return yytext(); + } + case 116: break; + case 42: + { return "a"; + } + case 117: break; + case 54: + { return "G"; + } + case 118: break; + case 64: + { return "-"; + } + case 119: break; + case 18: + { return "r"; + } + case 120: break; + case 4: + { return ">"; + } + case 121: break; + case 21: + { return "$"; + } + case 122: break; + case 44: + { return "i"; + } + case 123: break; + case 19: + { return "z"; + } + case 124: break; + case 68: + { return "<"; + } + case 125: break; + case 49: + { return "`"; + } + case 126: break; + case 39: + { return "F"; + } + case 127: break; + case 61: + { return ","; + } + case 128: break; + case 30: + { return "q"; + } + case 129: break; + case 48: + { return "#"; + } + case 130: break; + case 35: + { return "h"; + } + case 131: break; + case 40: + { return "N"; + } + case 132: break; + case 38: + { return "y"; + } + case 133: break; + case 28: + { return "_"; + } + case 134: break; + case 26: + { return "E"; + } + case 135: break; + case 65: + { return "+"; + } + case 136: break; + case 10: + { return "p"; + } + case 137: break; + case 53: + { return "V"; + } + case 138: break; + case 6: + { return "<"; + } + case 139: break; + case 27: + { return "g"; + } + case 140: break; + case 15: + { return "x"; + } + case 141: break; + case 47: + { return "^"; + } + case 142: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lucene/util/LuceneUtil.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lucene/util/LuceneUtil.java Wed 
Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,27 @@ +package de.mpg.mpiwg.berlin.mpdl.lucene.util; + +import java.util.ArrayList; + +public class LuceneUtil { + private static LuceneUtil instance; + + public static LuceneUtil getInstance() { + if (instance == null) { + instance = new LuceneUtil(); + } + return instance; + } + + public ArrayList getVariantsFromLuceneQuery(String queryString) { + ArrayList variants = new ArrayList(); + String[] variantTokens = queryString.split(" "); // TODO throw the phrases away (e.g.: "bla bla bla") + for (int i = 0; i < variantTokens.length; i++) { + String token = variantTokens[i]; + if (! (token.contains("*") || token.contains("?") || token.contains("~") || token.contains("-") || token.contains("+") || token.contains("^") || token.contains("OR") || token.contains("AND") || token.contains("NOT"))) { + variants.add(token); + } + } + return variants; + } + +} \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/test/TestLocal.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/test/TestLocal.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,131 @@ +package de.mpg.mpiwg.berlin.mpdl.test; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.StringReader; +import java.net.URL; +import java.util.ArrayList; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token; +import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer; +import 
de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer; + +public class TestLocal { + private LexHandler lexHandler; + + public static void main(String[] args) throws ApplicationException { + try { + TestLocal test = new TestLocal(); + test.init(); + // test.testCalls(); + // test.tokenizeString(); + // test.tokenizeXmlFragment(); + test.getLexEntriesByLexiconBeginningWith("ls", "a"); + // test.end(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + private void init() throws ApplicationException { + lexHandler = LexHandler.getInstance(); + } + + private void end() throws ApplicationException { + lexHandler.end(); + } + + private ArrayList tokenizeString() throws ApplicationException { + ArrayList tokens = new ArrayList(); + try { + StringReader reader = new StringReader("edo philoſophi"); + // StringReader reader = new StringReader("扞盗則李兗州"); + Tokenizer tokenizer = new Tokenizer(reader); + tokenizer.setLanguage("lat"); + // tokenizer.setLanguage("zho"); + String[] normFunctions = new String[1]; + normFunctions[0] = "norm"; + tokenizer.setNormFunctions(normFunctions); + tokens = tokenizer.getTokens(); + tokenizer.end(); + tokenizer.close(); + } catch (IOException e) { + throw new ApplicationException(e); + } + return tokens; + } + + private String tokenizeXmlFragment() throws ApplicationException { + String result = null; + try { + String xmlFragment = new String(FileUtils.readFileToByteArray(new File("/Users/jwillenborg/tmp/testFragment2.xml")), "utf-8"); + String srcUrlStr = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Benedetti_1585.xml&mode=pureXml&pn=13"; + URL srcUrl = new URL(srcUrlStr); + InputStream inputStream = srcUrl.openStream(); + BufferedInputStream in = new BufferedInputStream(inputStream); + xmlFragment = IOUtils.toString(in, "utf-8"); + in.close(); + + XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(xmlFragment)); + xmlTokenizer.setLanguage("lat"); + String[] normFunctions = new 
String[1]; + normFunctions[0] = "norm"; + String[] stopElements = new String[1]; + stopElements[0] = "var"; + xmlTokenizer.setNormFunctions(normFunctions); + xmlTokenizer.setStopElements(stopElements); + result = xmlTokenizer.tokenize(); + System.out.println(result); + } catch (Exception e) { + throw new ApplicationException(e); + } + return result; + } + + private void testCalls() throws ApplicationException { + String query = "sum quibus"; + String language = "lat"; + // String query = "ἱκανῶσ"; + // String language = "el"; + String inputType = "form"; + String outputType = null; + String outputFormat = "html"; + String dictionaryName = null; + String normalization = "norm"; + getLexEntries(query, language, inputType, outputType, outputFormat, dictionaryName, normalization); + } + + private void getLexEntries(String query, String language, String inputType , String outputType, String outputFormat, String dictionaryName, String normalization) throws ApplicationException { + ArrayList lemmas = lexHandler.getLemmas(query, inputType, language, normalization); + ArrayList dictionaries = lexHandler.getLexEntries(lemmas, language, dictionaryName); + // String result = lexHandler.getLexEntries(query, language, inputType, outputType, outputFormat, dictionaryName, normalization); + String result = ""; + result = result + ""; + for (int i=0; i"; + System.out.println(result); + } + + private void getLexEntriesByLexiconBeginningWith(String lexiconName, String prefix) throws ApplicationException { + ArrayList lexEntries = lexHandler.getLexEntriesByLexiconBeginningWith(lexiconName, prefix, 1); + System.out.println(lexEntries); + } + + private void getLexEntriesBeginningWith(String language, String prefix) throws ApplicationException { + ArrayList lexEntries = lexHandler.getLexEntriesBeginningWith(language, prefix, 1); + System.out.println(lexEntries); + } +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 
software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/util/StringUtils.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/util/StringUtils.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,491 @@ +package de.mpg.mpiwg.berlin.mpdl.util; + +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; +import java.text.CharacterIterator; +import java.text.StringCharacterIterator; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class StringUtils { + + /** + * Puts a zwsp between two ideographic characters (e.g. in CJK Unified Ideographs) + * @param str + * @return + */ + public static String zwsp(String str) { + // based on Unicode 3.2 + String ideographic = "[\u3300-\u33ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]"; + String regex = "(" + ideographic + ")(" + ideographic + ")"; + String retStr = str.replaceAll(regex, "$1\u200b$2"); + retStr = retStr.replaceAll(regex, "$1\u200b$2"); + return retStr; + } + + + public static String deleteSpecialXmlEntities(String inputStr) { + inputStr = inputStr.replaceAll("<", ""); + inputStr = inputStr.replaceAll(">", ""); + inputStr = inputStr.replaceAll("<", ""); + inputStr = inputStr.replaceAll(">", ""); + return inputStr; + } + + public static String resolveXmlEntities(String inputStr) { + inputStr = inputStr.replaceAll("&", "&"); + inputStr = inputStr.replaceAll("<", "<"); + inputStr = inputStr.replaceAll(">", ">"); + inputStr = inputStr.replaceAll(""", "\""); + inputStr = inputStr.replaceAll("'", "'"); + return inputStr; + } + + public static String deresolveXmlEntities(String inputStr) { + StringBuffer buf = new StringBuffer(); + for (int i = 0; i < inputStr.length(); i++) { + char c = inputStr.charAt(i); + String replace = new String(); + switch (c) { + case '&': replace = "&"; break; + case '<': replace = "<"; break; + case '>': replace = ">"; break; + case '"': replace = """; break; + // case '\'': replace = 
"'"; break; // causes problems in DictionarizerContentHandler + default: replace += c; break; + } + buf.append(replace); + } + return buf.toString(); + } + + /** + * Escape characters for text appearing in HTML markup. + * + * This method exists as a defence against Cross Site Scripting (XSS) hacks. + * The idea is to neutralize control characters commonly used by scripts, such that + * they will not be executed by the browser. This is done by replacing the control + * characters with their escaped equivalents. + * See {@link hirondelle.web4j.security.SafeText} as well. + * + * The following characters are replaced with corresponding + * HTML character entities : + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * Character Replacement < < > > & & " " \t ! ! # # $ $ % % ' ' ( ( ) ) * * + + , , - - . . / / : : ; ; = = ? ? @ @ [ [ \ \ ] ] ^ ^ _ _ ` ` { { | | } } ~ ~ + * + * Note that JSTL's {@code } escapes only the first + * five of the above characters. 
+ */ + public static String forHTML(String aText){ + final StringBuilder result = new StringBuilder(); + final StringCharacterIterator iterator = new StringCharacterIterator(aText); + char character = iterator.current(); + while (character != CharacterIterator.DONE ){ + if (character == '<') { + result.append("<"); + } + else if (character == '>') { + result.append(">"); + } + else if (character == '&') { + result.append("&"); + } + else if (character == '\"') { + result.append("""); + } + else if (character == '\t') { + addCharEntity(9, result); + } + else if (character == '!') { + addCharEntity(33, result); + } + else if (character == '#') { + addCharEntity(35, result); + } + else if (character == '$') { + addCharEntity(36, result); + } + else if (character == '%') { + addCharEntity(37, result); + } + else if (character == '\'') { + addCharEntity(39, result); + } + else if (character == '(') { + addCharEntity(40, result); + } + else if (character == ')') { + addCharEntity(41, result); + } + else if (character == '*') { + addCharEntity(42, result); + } + else if (character == '+') { + addCharEntity(43, result); + } + else if (character == ',') { + addCharEntity(44, result); + } + else if (character == '-') { + addCharEntity(45, result); + } + else if (character == '.') { + addCharEntity(46, result); + } + else if (character == '/') { + addCharEntity(47, result); + } + else if (character == ':') { + addCharEntity(58, result); + } + else if (character == ';') { + addCharEntity(59, result); + } + else if (character == '=') { + addCharEntity(61, result); + } + else if (character == '?') { + addCharEntity(63, result); + } + else if (character == '@') { + addCharEntity(64, result); + } + else if (character == '[') { + addCharEntity(91, result); + } + else if (character == '\\') { + addCharEntity(92, result); + } + else if (character == ']') { + addCharEntity(93, result); + } + else if (character == '^') { + addCharEntity(94, result); + } + else if (character == '_') { + 
addCharEntity(95, result); + } + else if (character == '`') { + addCharEntity(96, result); + } + else if (character == '{') { + addCharEntity(123, result); + } + else if (character == '|') { + addCharEntity(124, result); + } + else if (character == '}') { + addCharEntity(125, result); + } + else if (character == '~') { + addCharEntity(126, result); + } + else { + //the char is not a special one + //add it to the result as is + result.append(character); + } + character = iterator.next(); + } + return result.toString(); + } + + + /** + * Escape all ampersand characters in a URL. + * + * Replaces all '&' characters with '&'. + * + * An ampersand character may appear in the query string of a URL. + * The ampersand character is indeed valid in a URL. + * However, URLs usually appear as an HREF attribute, and + * such attributes have the additional constraint that ampersands + * must be escaped. + * + * The JSTL tag does indeed perform proper URL encoding of + * query parameters. But it does not, in general, produce text which + * is valid as an HREF attribute, simply because it does + * not escape the ampersand character. This is a nuisance when + * multiple query parameters appear in the URL, since it requires a little + * extra work. + */ + public static String forHrefAmpersand(String aURL){ + return aURL.replace("&", "&"); + } + + /** + * Synonym for URLEncoder.encode(String, "UTF-8"). + * + * Used to ensure that HTTP query strings are in proper form, by escaping + * special characters such as spaces. + * + * It is important to note that if a query string appears in an HREF + * attribute, then there are two issues - ensuring the query string is valid HTTP + * (it is URL-encoded), and ensuring it is valid HTML (ensuring the + * ampersand is escaped). 
+ */ + public static String forURL(String aURLFragment){ + String result = null; + try { + result = URLEncoder.encode(aURLFragment, "UTF-8"); + } + catch (UnsupportedEncodingException ex){ + throw new RuntimeException("UTF-8 not supported", ex); + } + return result; + } + + /** + * Escape characters for text appearing as XML data, between tags. + * + * The following characters are replaced with corresponding character entities : + * + * + * + * + * + * + * + * Character Encoding < < > > & & " " ' ' + * + * Note that JSTL's {@code } escapes the exact same set of + * characters as this method. That is, {@code } + * is good for escaping to produce valid XML, but not for producing safe + * HTML. + */ + public static String forXML(String aText){ + final StringBuilder result = new StringBuilder(); + final StringCharacterIterator iterator = new StringCharacterIterator(aText); + char character = iterator.current(); + while (character != CharacterIterator.DONE ){ + if (character == '<') { + result.append("<"); + } + else if (character == '>') { + result.append(">"); + } + else if (character == '\"') { + result.append("""); + } + else if (character == '\'') { + result.append("'"); + } + else if (character == '&') { + result.append("&"); + } + else { + //the char is not a special one + //add it to the result as is + result.append(character); + } + character = iterator.next(); + } + return result.toString(); + } + + /** + * Return aText with all '<' and '>' characters + * replaced by their escaped equivalents. 
+ */ + public static String toDisableTags(String aText){ + final StringBuilder result = new StringBuilder(); + final StringCharacterIterator iterator = new StringCharacterIterator(aText); + char character = iterator.current(); + while (character != CharacterIterator.DONE ){ + if (character == '<') { + result.append("<"); + } + else if (character == '>') { + result.append(">"); + } + else { + //the char is not a special one + //add it to the result as is + result.append(character); + } + character = iterator.next(); + } + return result.toString(); + } + + + /** + * Replace characters having special meaning in regular expressions + * with their escaped equivalents, preceded by a '\' character. + * + * The escaped characters include : + * + *. + * \ + * ?, * , and + + * & + * : + * { and } + * [ and ] + * ( and ) + * ^ and $ + * + */ + public static String forRegex(String aRegexFragment){ + final StringBuilder result = new StringBuilder(); + + final StringCharacterIterator iterator = + new StringCharacterIterator(aRegexFragment) + ; + char character = iterator.current(); + while (character != CharacterIterator.DONE ){ + /* + * All literals need to have backslashes doubled. 
+ */ + if (character == '.') { + result.append("\\."); + } + else if (character == '\\') { + result.append("\\\\"); + } + else if (character == '?') { + result.append("\\?"); + } + else if (character == '*') { + result.append("\\*"); + } + else if (character == '+') { + result.append("\\+"); + } + else if (character == '&') { + result.append("\\&"); + } + else if (character == ':') { + result.append("\\:"); + } + else if (character == '{') { + result.append("\\{"); + } + else if (character == '}') { + result.append("\\}"); + } + else if (character == '[') { + result.append("\\["); + } + else if (character == ']') { + result.append("\\]"); + } + else if (character == '(') { + result.append("\$"); + } + else if (character == ')') { + result.append("\$"); + } + else if (character == '^') { + result.append("\\^"); + } + else if (character == '$') { + result.append("\\$"); + } + else { + //the char is not a special one + //add it to the result as is + result.append(character); + } + character = iterator.next(); + } + return result.toString(); + } + + /** + * Escape '$' and '\' characters in replacement strings. + * + * Synonym for Matcher.quoteReplacement(String). + * + * The following methods use replacement strings which treat + * '$' and '\' as special characters: + * + * String.replaceAll(String, String) + * String.replaceFirst(String, String) + * Matcher.appendReplacement(StringBuffer, String) + * + * + * If replacement text can contain arbitrary characters, then you + * will usually need to escape that text, to ensure special characters + * are interpreted literally. 
+ */ + public static String forReplacementString(String aInput){ + return Matcher.quoteReplacement(aInput); + } + + /** + * Disable all ", Pattern.CASE_INSENSITIVE + ); + + private static void addCharEntity(Integer aIdx, StringBuilder aBuilder){ + String padding = ""; + if( aIdx <= 9 ){ + padding = "00"; + } + else if( aIdx <= 99 ){ + padding = "0"; + } + else { + //no prefix + } + String number = padding + aIdx.toString(); + aBuilder.append("&#" + number + ";"); + } + } diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/util/Util.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/util/Util.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,32 @@ +package de.mpg.mpiwg.berlin.mpdl.util; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.Date; +import java.util.Properties; + +public class Util { + + public Properties getProperties(String fullFileName) { + Properties props = new Properties(); + try { + File file = new File(fullFileName); + FileInputStream in = new FileInputStream(file); + props.load(in); + } catch (IOException e) { + e.printStackTrace(); + } + return props; + } + + public Double getSecondWithMillisecondsBetween(Date begin, Date end) { + long beginMS = begin.getTime(); + long endMS = end.getTime(); + long elapsedSeconds = (endMS - beginMS) / 1000; + long elapsedMilliSecondsAfterSeconds1 = (endMS - beginMS) - (elapsedSeconds * 1000); + Double seconds = new Double(elapsedSeconds + "." 
+ elapsedMilliSecondsAfterSeconds1); + return seconds; + } + +} diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.classpath --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.classpath Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.externalToolBuilders/New_Builder.launch --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.externalToolBuilders/New_Builder.launch Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.project --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.project Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,46 @@ + + + mpiwg-mpdl-xml-web + + + + + + org.eclipse.wst.jsdt.core.javascriptValidator + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.wst.common.project.facet.core.builder + + + + + org.eclipse.wst.validation.validationbuilder + + + + + org.eclipse.ui.externaltools.ExternalToolBuilder + full,incremental, + + + LaunchConfigHandle + <project>/.externalToolBuilders/New_Builder.launch + + + + + + org.eclipse.jem.workbench.JavaEMFNature + org.eclipse.wst.common.modulecore.ModuleCoreNature + org.eclipse.wst.common.project.facet.core.nature + org.eclipse.jdt.core.javanature + org.eclipse.wst.jsdt.core.jsNature + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.settings/.jsdtscope --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.settings/.jsdtscope Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff -r 
dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.jdt.core.prefs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.jdt.core.prefs Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,12 @@ +#Mon Sep 12 15:41:45 CEST 2011 +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 +org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve +org.eclipse.jdt.core.compiler.compliance=1.6 +org.eclipse.jdt.core.compiler.debug.lineNumber=generate +org.eclipse.jdt.core.compiler.debug.localVariable=generate +org.eclipse.jdt.core.compiler.debug.sourceFile=generate +org.eclipse.jdt.core.compiler.problem.assertIdentifier=error +org.eclipse.jdt.core.compiler.problem.enumIdentifier=error +org.eclipse.jdt.core.compiler.source=1.6 diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.common.component --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.common.component Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,10 @@ + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.common.project.facet.core.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.common.project.facet.core.xml Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,10 @@ + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.jsdt.ui.superType.container --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.jsdt.ui.superType.container Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +org.eclipse.wst.jsdt.launching.baseBrowserLibrary \ No newline at end of file 
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.jsdt.ui.superType.name --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.jsdt.ui.superType.name Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +Window \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.ws.service.policy.prefs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/.settings/org.eclipse.wst.ws.service.policy.prefs Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,3 @@ +#Mon Sep 12 15:41:45 CEST 2011 +eclipse.preferences.version=1 +org.eclipse.wst.ws.service.policy.projectEnabled=false diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/META-INF/MANIFEST.MF --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/META-INF/MANIFEST.MF Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,3 @@ +Manifest-Version: 1.0 +Class-Path: + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/classes/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/classes/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/classes/constants.properties --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/classes/constants.properties Wed Nov 09 
15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +docDir=/usr/local/tomcat-mpdl/mpdl-data/xml/documents \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/commons-io-2.0.1.jar Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/commons-io-2.0.1.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/mpiwg-mpdl-xml-web.jar Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/mpiwg-mpdl-xml-web.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/mpiwg-mpdl-xml.jar Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/mpiwg-mpdl-xml.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/saxon.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/saxon.txt Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,3 @@ +Saxon: + +Release 9.1.0.5 (free version): releases < 9.1.0.7 support saxon extension functions diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/saxon9-s9api.jar Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/saxon9-s9api.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/saxon9.jar Binary file software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/lib/saxon9.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/web.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 
+++ b/software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/WEB-INF/web.xml Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,40 @@ + + + mpiwg-mpdl-xml-web + + index.html + + + Transform + Transform + Transform + de.mpg.mpiwg.berlin.mpdl.servlets.xml.Transform + + + Transform + /transform/Transform + + + GetFragment + GetFragment + GetFragment + de.mpg.mpiwg.berlin.mpdl.servlets.xml.GetFragment + + + GetFragment + /transform/GetFragment + + + XQuery + XQuery + XQuery + de.mpg.mpiwg.berlin.mpdl.servlets.xml.XQuery + + + XQuery + /xquery/XQuery + + + de.mpg.mpiwg.berlin.mpdl.servlets.xml.MpiwgMpdlXmlWebServletContextListener + + \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/index.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/index.html Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,130 @@ + + + +Max Planck Institute for the History of Science - Mpdl: XML Services + + + Max Planck Institute for the History of Science - Mpdl: XML Services +Available Services + + + Url: /mpiwg-mpdl-xml-web/transform/Transform + + Request parameters + + srcUrl (required) + + url of the Xml source document + + + xslUrl (required) + + url of the Xsl document which does the transformation of the Xml document + + + parameters (optional) + + parameters separated with blanks (e.g. "yourParam1=yourValue1 yourParam2=yourValue2") + default: no parameters + + + outputProperties (optional) + + output properties separated with blanks (e.g. 
"encoding=utf-8 indent=yes") + + "method=xhtml" + "indent=yes" + "media-type=text/html" + "encoding=utf-8" + default: "method=xml indent=yes media-type=text/xml encoding=utf-8" + + + + + + + Response output + + transformed Xml document + Example: Generate ids incrementally for sentences in example document + + + + + + Url: /mpiwg-mpdl-xml-web/transform/GetFragment + + Request parameters + + docId (required) + + document identifier of the Xml source document (e.g. "/tei/la/Test_1789.xml") + + + ms1Name (required) + + starting milestone element name (e.g. "pb") + + + ms1Pos (required) + + starting milestone position (e.g. "13") + + + ms2Name (required) + + ending milestone element name (e.g. "pb") + + + ms2Pos (required) + + ending milestone position (e.g. "14") + + + + + Response output + + fragment between the two milestones in the Xml document + Example: get the fragment between the second and the third page break + + + + + + Url: /mpiwg-mpdl-xml-web/xquery/XQuery + + Request parameters + + inputString or srcUrl (required) + + inputString + + XML string + + + srcUrl + + source URL of XML document + + + + + xquery (required) + + XQuery (or XPath) source code which should be executed + + + + + Response output + + XML result of the XQuery + Example: XPath: all sentences of example document + + + + + + + \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/xsl/generateId.xsl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/WebContent/xsl/generateId.xsl Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,34 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/build/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/build/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/build/build.xml --- /dev/null Thu Jan 01 
00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/build/build.xml Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,57 @@ + + + mpiwg-mpdl-xml-web + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/conf/constants-mpdl-system.properties --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/conf/constants-mpdl-system.properties Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +docDir=/usr/local/tomcat-mpdl/mpdl-data/xml/documents \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/conf/constants.properties --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/conf/constants.properties Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,1 @@ +docDir=/Users/jwillenborg/mpdl/data/xml/documents \ No newline at end of file diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/dist-remote/mpiwg-mpdl-xml-web.jar Binary file software/mpdl-services/mpiwg-mpdl-xml-web/dist-remote/mpiwg-mpdl-xml-web.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/dist-remote/mpiwg-mpdl-xml-web.war Binary file software/mpdl-services/mpiwg-mpdl-xml-web/dist-remote/mpiwg-mpdl-xml-web.war has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/dist/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/dist/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/dist/mpiwg-mpdl-xml-web.jar Binary file software/mpdl-services/mpiwg-mpdl-xml-web/dist/mpiwg-mpdl-xml-web.jar has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/dist/mpiwg-mpdl-xml-web.war Binary file 
software/mpdl-services/mpiwg-mpdl-xml-web/dist/mpiwg-mpdl-xml-web.war has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/src/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/src/de/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/.DS_Store Binary file software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/.DS_Store has changed diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/exception/ApplicationException.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/exception/ApplicationException.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,15 @@ +package de.mpg.mpiwg.berlin.mpdl.exception; + +public class ApplicationException extends Exception { + private static final long serialVersionUID = 1L; + + public ApplicationException(Exception e) { + super(e); + } + + public ApplicationException(String str) { + super(str); + } + +} + diff -r dc5e9fcb3fdc -r 4a3641ae14d2 
software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/xml/GetFragment.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/xml/GetFragment.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,105 @@
+package de.mpg.mpiwg.berlin.mpdl.servlets.xml;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+
+import javax.servlet.ServletConfig;
+import javax.servlet.ServletContext;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.xml.transform.FragmentTransformer;
+
+/**
+ * Servlet that returns the result of FragmentTransformer.getFragment as text/xml.
+ *
+ * Request parameters: docId (appended to the configured document directory to
+ * form the XML file name), ms1Name, ms1Pos, ms2Name and ms2Pos. ms1Pos and
+ * ms2Pos must be integers; all four ms values are handed to the transformer.
+ */
+public class GetFragment extends HttpServlet {
+  private static final long serialVersionUID = 1L;
+  private FragmentTransformer fragmentTransformer;
+  private String documentDirectory;
+
+  public GetFragment() {
+    super();
+  }
+
+  /**
+   * Reads the transformer and the document directory from the servlet context
+   * attributes "fragmentTransformer" and "documentDirectory".
+   *
+   * @throws ServletException if either attribute is missing, so that the problem
+   *         shows up at initialisation and not as a NullPointerException per request
+   */
+  public void init(ServletConfig config) throws ServletException {
+    super.init(config);
+    ServletContext context = getServletContext();
+    fragmentTransformer = (FragmentTransformer) context.getAttribute("fragmentTransformer");
+    documentDirectory = (String) context.getAttribute("documentDirectory");
+    if (fragmentTransformer == null || documentDirectory == null) {
+      throw new ServletException("GetFragment: servlet context attributes fragmentTransformer and documentDirectory are required");
+    }
+  }
+
+  /**
+   * Writes the requested fragment to the response.
+   * Answers 400 (Bad Request) if docId is missing or contains a ".." path segment,
+   * or if ms1Pos or ms2Pos is missing or not an integer.
+   *
+   * @throws ServletException wrapping an ApplicationException raised by the transformer
+   */
+  protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
+    request.setCharacterEncoding("utf-8");
+    response.setCharacterEncoding("utf-8");
+    String docId = request.getParameter("docId");
+    String ms1Name = request.getParameter("ms1Name");
+    String ms2Name = request.getParameter("ms2Name");
+    if (docId == null || hasParentSegment(docId)) {
+      // docId is client input that becomes part of a file name; keep it below the document directory
+      response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Parameter docId is missing or invalid");
+      return;
+    }
+    int ms1Pos;
+    int ms2Pos;
+    try {
+      ms1Pos = Integer.parseInt(request.getParameter("ms1Pos"));
+      ms2Pos = Integer.parseInt(request.getParameter("ms2Pos"));
+    } catch (NumberFormatException e) {
+      // Integer.parseInt(null) throws NumberFormatException too, so missing values end up here
+      response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Parameters ms1Pos and ms2Pos must be integers");
+      return;
+    }
+    try {
+      String xmlFileName = documentDirectory + docId;
+      String result = fragmentTransformer.getFragment(xmlFileName, ms1Name, ms1Pos, ms2Name, ms2Pos);
+      response.setContentType("text/xml");
+      PrintWriter out = response.getWriter();
+      out.print(result);
+      out.close();
+    } catch (ApplicationException e) {
+      throw new ServletException(e);
+    }
+  }
+
+  /** Handles POST like GET instead of answering with an empty 200 response. */
+  protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
+    doGet(request, response);
+  }
+
+  /** Returns true if path, split at slashes and backslashes, has a segment equal to "..". */
+  private static boolean hasParentSegment(String path) {
+    String[] segments = path.split("[/\\\\]");
+    for (int i = 0; i < segments.length; i++) {
+      if ("..".equals(segments[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+}
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/xml/MpiwgMpdlXmlWebServletContextListener.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/xml/MpiwgMpdlXmlWebServletContextListener.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,45 @@
+package de.mpg.mpiwg.berlin.mpdl.servlets.xml;
+
+import javax.servlet.ServletContext;
+import javax.servlet.ServletContextEvent;
+import javax.servlet.ServletContextListener;
+
+import de.mpg.mpiwg.berlin.mpdl.xml.general.Constants;
+import de.mpg.mpiwg.berlin.mpdl.xml.transform.FragmentTransformer;
+
+/**
+ * Context listener that stores the objects used by the servlets of this web
+ * application in the servlet context attributes "fragmentTransformer" and
+ * "documentDirectory".
+ */
+public class MpiwgMpdlXmlWebServletContextListener implements ServletContextListener {
+  private ServletContext context = null;
+  private FragmentTransformer fragmentTransformer = null;
+
+  /**
+   * Creates the FragmentTransformer, reads the document directory from Constants
+   * and stores both in the servlet context.
+   */
+  public void contextInitialized(ServletContextEvent event) {
+    this.context = event.getServletContext();
+    try {
+      fragmentTransformer = new FragmentTransformer();
+      context.setAttribute("fragmentTransformer", fragmentTransformer);
+      String docDirectory = Constants.getInstance().getDocumentDir();
+      context.setAttribute("documentDirectory", docDirectory);
+      System.out.println(MpiwgMpdlXmlWebServletContextListener.class.getName() + ": contextInitialized (document directory= \"" + docDirectory + "\", set in constants.properties)");
+    } catch (Exception e) {
+      // Start-up goes on (Transform and XQuery do not use these attributes), but the
+      // failure is written to the container log with its stack trace; GetFragment
+      // checks the attributes in init().
+      context.log(MpiwgMpdlXmlWebServletContextListener.class.getName() + ": contextInitialized failed", e);
+    }
+  }
+
+  /** Releases the references held by this listener. */
+  public void contextDestroyed(ServletContextEvent e) {
+    this.context = null;
+    this.fragmentTransformer = null;
+    System.out.println(MpiwgMpdlXmlWebServletContextListener.class.getName() + ": contextDestroyed");
+  }
+}
\ No newline at end of file
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/xml/Transform.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/xml/Transform.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,62 @@
+package de.mpg.mpiwg.berlin.mpdl.servlets.xml;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+
+import javax.servlet.ServletConfig;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.xml.transform.BasicTransformer;
+
+/**
+ * Servlet that passes the request parameters srcUrl, xslUrl, parameters and
+ * outputProperties to BasicTransformer.transform and returns the result as text/xml.
+ */
+public class Transform extends HttpServlet {
+  private static final long serialVersionUID = 1L;
+
+  public Transform() {
+    super();
+  }
+
+  public void init(ServletConfig config) throws ServletException {
+    super.init(config);
+  }
+
+  /**
+   * Runs the transformation and writes its result to the response.
+   *
+   * @throws ServletException wrapping an ApplicationException raised by the transformer
+   */
+  protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
+    request.setCharacterEncoding("utf-8");
+    response.setCharacterEncoding("utf-8");
+    // NOTE(review): srcUrl and xslUrl are client input and are passed on unchecked. If the
+    // transformer fetches them, clients can make this server request arbitrary URLs;
+    // consider restricting them to known hosts.
+    String srcUrl = request.getParameter("srcUrl");
+    String xslUrl = request.getParameter("xslUrl");
+    String parameters = request.getParameter("parameters");
+    String outputProperties = request.getParameter("outputProperties");
+    try {
+      BasicTransformer basicTransformer = new BasicTransformer();
+      String result = basicTransformer.transform(srcUrl, xslUrl, parameters, outputProperties);
+      response.setContentType("text/xml");
+      PrintWriter out = response.getWriter();
+      out.print(result);
+      out.close();
+    } catch (ApplicationException e) {
+      throw new ServletException(e);
+    }
+  }
+
+  /** Handles POST like GET instead of answering with an empty 200 response. */
+  protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
+    doGet(request, response);
+  }
+
+}
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/xml/XQuery.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-xml-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/xml/XQuery.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,82 @@
+package de.mpg.mpiwg.berlin.mpdl.servlets.xml;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.net.MalformedURLException;
+import java.net.URL;
+
+import javax.servlet.ServletConfig;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator;
+
+/**
+ * Servlet that evaluates the request parameter xquery with XQueryEvaluator, either
+ * on the parameter inputString or, if that is absent, on the URL given in srcUrl,
+ * and returns the result as text/xml.
+ */
+public class XQuery extends HttpServlet {
+  private static final long serialVersionUID = 1L;
+  private XQueryEvaluator xqueryEvaluator;
+
+  public XQuery() {
+    super();
+  }
+
+  public void init(ServletConfig config) throws ServletException {
+    super.init(config);
+    xqueryEvaluator = new XQueryEvaluator();
+  }
+
+  /**
+   * Evaluates the query and writes the result to the response.
+   * Answers 400 (Bad Request) if neither inputString nor srcUrl is given, or if
+   * srcUrl is used and is not a well-formed URL.
+   *
+   * @throws ServletException wrapping an ApplicationException raised by the evaluator
+   */
+  protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
+    request.setCharacterEncoding("utf-8");
+    response.setCharacterEncoding("utf-8");
+    String inputString = request.getParameter("inputString");
+    String srcUrlStr = request.getParameter("srcUrl");
+    String xqueryStr = request.getParameter("xquery");
+    URL srcUrl = null;
+    if (inputString == null) {
+      if (srcUrlStr == null) {
+        // nothing to evaluate; do not answer 200 with the text "null"
+        response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Parameter inputString or srcUrl is required");
+        return;
+      }
+      try {
+        // NOTE(review): srcUrl is client input. If the evaluator fetches it, clients can make
+        // this server request arbitrary URLs; consider restricting it to known hosts.
+        srcUrl = new URL(srcUrlStr);
+      } catch (MalformedURLException e) {
+        response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Parameter srcUrl is not a valid URL");
+        return;
+      }
+    }
+    try {
+      String result;
+      if (inputString != null) {
+        result = xqueryEvaluator.evaluateAsString(inputString, xqueryStr);
+      } else {
+        result = xqueryEvaluator.evaluateAsString(srcUrl, xqueryStr);
+      }
+      response.setContentType("text/xml");
+      PrintWriter out = response.getWriter();
+      // NOTE(review): the empty string operands look like markup that was lost from this
+      // listing (possibly a wrapper element around the result) - confirm against the repository.
+      out.print("" + result + "");
+      out.close();
+    } catch (ApplicationException e) {
+      throw new ServletException(e);
+    }
+  }
+
+}

Character	Replacement
<	<
>	>
&	&
"	"
\t
!	!
#	#
$	$
%	%
'	'
(	(
)	)
*	*
+	+
,	,
-	-
.	.
/	/
:	:
;	;
=	=
?	?
@	@
[	[
\	\
]	]
^	^
_	_
`	`
{	{
\|	\|
}	}
~	~

Character	Encoding
<	&lt;
>	&gt;
&	&amp;
"	&quot;
'	&apos;

Character	Replacement
<	<
>	>
&	&
"	"
\t
!	!
#	#
$	$
%	%
'	'
(	(
)	)
*	*
+	+
,	,
-	-
.	.
/	/
:	:
;	;
=	=
?	?
@	@
[	[
\	\
]	]
^	^
_	_
`	`
{	{
\|	\|
}	}
~	~

Max Planck Institute for the History of Science - Mpdl: Language technology services

Word information for: \"" + query + "\"

Morphology

Dictionary

Wikipedia

Forms for: \"" + query + "\"

Morphology

Lemmas for: \"" + query + "\"

Morphology

Max Planck Institute for the History of Science - Mpdl: XML Services

`Max Planck Institute for the History of Science - Mpdl: XML Services`

Character	Replacement
<	<
>	>
&	&
"	"
\t
!	!
#	#
$	$
%	%
'	'
(	(
)	)
*	*
+	+
,	,
-	-
.	.
/	/
:	:
;	;
=	=
?	?
@	@
[	[
\	\
]	]
^	^
_	_
`	`
{	{
\|	\|
}	}
~	~