# HG changeset patch # User Josef Willenborg # Date 1290615863 -3600 # Node ID 408254cf2f1d7e8e664c6933f24a6964cac1991e Erstellung diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/build.properties --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/build.properties Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,7 @@ +########################################################### +# This file specifies which XQuery extension modules should be compiled and +# $Id: $ +########################################################## + +include.module.example = true +include.module.text = true diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/build.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/build.xml Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,109 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/mpdl-system.properties --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/mpdl-system.properties Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,18 @@ +# eXist settings: XML-RPC-Interface, doc-interface +exist.fullHostname=mpdl-proto.mpiwg-berlin.mpg.de +exist.hostname=localhost +exist.port=30030 +exist.adminUserName=admin +exist.adminUserPW= +exist.echoRelaxNGPath=/exist/rest/db/mpdl/schema/echo/echo.rnc + +# eSciDoc settings +escidoc.hostname=euler.mpiwg-berlin.mpg.de +escidoc.port=8080 +escidoc.oumId=/oum/organizational-unit/escidoc:ex3 +escidoc.cmmId=/cmm/content-model/escidoc:persistent4 +escidoc.contextId=/ir/context/escidoc:38600 +escidoc.archimedesContainerId=/ir/container/escidoc:42507 +escidoc.echoContainerId=/ir/container/escidoc:38602 +escidoc.adminUserName=jwillenborg +escidoc.adminUserPW= diff -r 000000000000 -r 408254cf2f1d 
software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/analysis/MpdlStandardAnalyzer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/analysis/MpdlStandardAnalyzer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,103 @@ +package de.mpg.mpiwg.berlin.mpdl.analysis; + +import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +import java.io.File; +import java.io.IOException; +import java.io.Reader; +import java.util.Set; + +/** + * StandardAnalyzer variant that does not lower-case tokens (no LowerCaseFilter in method tokenStream + * and reusableTokenStream), so terms keep their original case, i.e. matching is case sensitive. + * + */ +public class MpdlStandardAnalyzer extends Analyzer { + private Set stopSet; // stop words handed to the StopFilter in tokenStream and reusableTokenStream + /** An array containing some common English words that are usually not + useful for searching. */ + public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS; + + /** Builds an analyzer with the default stop words ({@link #STOP_WORDS}). */ + public MpdlStandardAnalyzer() { + this(STOP_WORDS); + } + + /** Builds an analyzer with the given stop words. */ + public MpdlStandardAnalyzer(Set stopWords) { + stopSet = stopWords; + } + + /** Builds an analyzer with the given stop words. */ + public MpdlStandardAnalyzer(String[] stopWords) { + stopSet = StopFilter.makeStopSet(stopWords); + } + + /** Builds an analyzer with the stop words from the given file. + * @see WordlistLoader#getWordSet(File) + */ + public MpdlStandardAnalyzer(File stopwords) throws IOException { + stopSet = WordlistLoader.getWordSet(stopwords); + } + + /** Builds an analyzer with the stop words from the given reader. 
+ * @see WordlistLoader#getWordSet(Reader) + */ + public MpdlStandardAnalyzer(Reader stopwords) throws IOException { + stopSet = WordlistLoader.getWordSet(stopwords); + } + + /** Constructs a {@link StandardTokenizer} filtered by a {@link + StandardFilter} and a {@link StopFilter}; no {@link LowerCaseFilter} is applied, so token case is preserved. */ + public TokenStream tokenStream(String fieldName, Reader reader) { + StandardTokenizer tokenStream = new StandardTokenizer(reader); + tokenStream.setMaxTokenLength(maxTokenLength); + TokenStream result = new StandardFilter(tokenStream); + result = new StopFilter(result, stopSet); + return result; + } + /** Tokenizer and filter chain stored through setPreviousTokenStream so that reusableTokenStream can reuse it. */ + private static final class SavedStreams { + StandardTokenizer tokenStream; + TokenStream filteredTokenStream; + } + + /** Default maximum allowed token length */ + public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; + + private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; + + /** + * Set maximum allowed token length. If a token is seen + * that exceeds this length then it is discarded. This + * setting only takes effect the next time tokenStream or + * reusableTokenStream is called. 
+ */ + public void setMaxTokenLength(int length) { + maxTokenLength = length; + } + + /** + * @see #setMaxTokenLength + */ + public int getMaxTokenLength() { + return maxTokenLength; + } + /** Returns the saved tokenizer/filter chain reset to the given reader; on first use the chain is built (same filters as tokenStream) and saved. */ + public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + setPreviousTokenStream(streams); + streams.tokenStream = new StandardTokenizer(reader); + streams.filteredTokenStream = new StandardFilter(streams.tokenStream); + streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet); + } else { + streams.tokenStream.reset(reader); + } + streams.tokenStream.setMaxTokenLength(maxTokenLength); // applied on every call so that a changed limit takes effect + return streams.filteredTokenStream; + } +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/client/DocumentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/client/DocumentHandler.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,370 @@ +package de.mpg.mpiwg.berlin.mpdl.client; + +import java.io.File; +import java.io.FilenameFilter; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Date; + +import de.mpg.mpiwg.berlin.mpdl.escidoc.ESciDocIngestor; +import de.mpg.mpiwg.berlin.mpdl.escidoc.MetadataRecord; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; +import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.RegularizationManager; +import de.mpg.mpiwg.berlin.mpdl.schedule.MpdlDocOperation; +import de.mpg.mpiwg.berlin.mpdl.util.FileUtil; +import de.mpg.mpiwg.berlin.mpdl.util.MpdlITextRenderer; +import de.mpg.mpiwg.berlin.mpdl.xml.SchemaHandler; +import de.mpg.mpiwg.berlin.mpdl.xmlrpc.FilenameFilterExtension; +import de.mpg.mpiwg.berlin.mpdl.xmlrpc.MpdlXmlRpcDocHandler; + +/** + * 
Handler for eXist collections and documents (created through its public constructors; the class itself does not enforce a single instance). + * Your local directory structure should look like this: + * documents + * archimedes + * ar + * yourDoc1.xml + * ... + * ... + * zh + * yourDoc1.xml + * ... + * echo + * ar + * yourDoc1.xml + * ... + * ... + * zh + * yourDoc1.xml + * ... + * + */ +public class DocumentHandler { + private MpdlXmlRpcDocHandler mpdlXmlRpcDocHandler; + private ESciDocIngestor eSciDocIngestor; // stays null when the one-argument constructor is used; createOrUpdate and delete call execute() on it + + private String[] docBases = {"archimedes", "echo"}; + private String[] languages = {"ar", "de", "el", "en", "fr", "it", "la", "nl", "zh"}; + private String documentRootCollectionMorph = "/db/mpdl/documents/morph"; + private String documentRootCollectionStandard = "/db/mpdl/documents/standard"; + private String presentationRootCollection = "/db/mpdl/presentation"; + private String schemaRootCollection = "/db/mpdl/schema"; + private String localDocumentDirectory = "/Users/jwillenborg/texts/mpdl/documents"; // NOTE(review): developer-specific absolute path, read by saveDocumentFiles and generatePdfHtmlDocumentFiles + + private long beginOfOperation; + private long endOfOperation; + + /** Creates a handler that only talks to eXist; eSciDocIngestor is left null. */ + public DocumentHandler(MpdlXmlRpcDocHandler mpdlXmlRpcDocHandler) throws ApplicationException { + this.mpdlXmlRpcDocHandler = mpdlXmlRpcDocHandler; + } + /** Creates a handler that talks to eXist and additionally to eSciDoc through the given ingestor. */ + public DocumentHandler(MpdlXmlRpcDocHandler mpdlXmlRpcDocHandler, ESciDocIngestor eSciDocIngestor) throws ApplicationException { + this.mpdlXmlRpcDocHandler = mpdlXmlRpcDocHandler; + this.eSciDocIngestor = eSciDocIngestor; + } + /** Dispatches on docOperation.getName() to the matching private method; an operation name that matches no branch is ignored without error. */ + public void doOperation(MpdlDocOperation docOperation) throws ApplicationException{ + String operationName = docOperation.getName(); + if (operationName.equals("create") || operationName.equals("update")) { + createOrUpdate(docOperation); + } else if (operationName.equals("delete")) { + delete(docOperation); + } else if (operationName.equals("updateExist")) { + updateExist(docOperation); + } else if (operationName.equals("deleteExist")) { + deleteExist(docOperation); + } else if (operationName.equals("importAllDocumentsLocallyExist")) { + importAllDocumentsLocallyExist(); + } else if 
(operationName.equals("generatePdfHtmlDocumentFiles")) { + generatePdfHtmlDocumentFiles(); + } + } + + private void importAllDocumentsLocallyExist() throws ApplicationException { + System.out.println("Start of DocumentHandler. This operation could be time consuming because documents are indexed on eXist (normal indexing times are 10 seconds for a document) ..."); + beginOperation(); + // deletePresentationCollection(); + // createPresentationCollection(); + // deleteSchemaCollection(); + // createSchemaCollection(); + + deleteDocumentCollections(); + createDocumentCollections(); + saveDocumentFiles(); + endOperation(); + System.out.println("The DocumentHandler needed: " + (endOfOperation - beginOfOperation) + " ms" ); + } + /** Creates or updates a document: copies the source URL into the local eXist data directory, validates it, saves its regularizations and the document on eXist, optionally renders PDF/HTML and finally runs the eSciDoc ingestor. Throws ApplicationException if the existence precondition of the operation fails, if no source URL is given or if the URL is malformed. */ + private void createOrUpdate(MpdlDocOperation docOperation) throws ApplicationException { + try { + String operationName = docOperation.getName(); + String language = docOperation.getLanguage(); + String srcUrlStr = docOperation.getSrcUrl(); + String eXistIdentifier = docOperation.getDestUrl(); + String destFileName = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifier; + URL srcUrl = null; + String protocol = null; + if (srcUrlStr != null && ! srcUrlStr.equals("empty")) { + srcUrl = new URL(srcUrlStr); + protocol = srcUrl.getProtocol(); + } + SchemaHandler schemaHandler = new SchemaHandler(); + boolean docExists = mpdlXmlRpcDocHandler.documentExists(docOperation); + if (operationName.equals("create") && docExists) { + throw new ApplicationException("Document:" + eXistIdentifier + " already exists. Please use another name or perform the operation \"Update\" of that document."); + } + if (operationName.equals("update") && ! docExists) { + throw new ApplicationException("Document:" + eXistIdentifier + " does not exist. 
Please use a name that exists and perform the operation \"Update\" again or perform the operation \"Create\" of that document"); + } + // load file to local file system; a missing source URL (null or "empty") leaves srcUrl and protocol null and used to end in a NullPointerException + if (srcUrl == null) { throw new ApplicationException("No source URL specified for document: " + eXistIdentifier); } else if (protocol.equals("file")) { + docOperation.setStatus("upload file: " + srcUrlStr + " to eXist server"); + } else { + docOperation.setStatus("download file from: " + srcUrlStr + " to eXist server"); + } + FileUtil.getInstance().saveUrlToLocalFile(srcUrl, destFileName); + // perform validations + docOperation.setStatus("validate document: " + eXistIdentifier); + schemaHandler.validate(destFileName, docOperation); + // perform operation on eXist + docOperation.setStatus(operationName + " regularizations of document: " + eXistIdentifier + " on eXist server"); + RegularizationManager regManager = RegularizationManager.getInstance(); + regManager.saveRegularizations(language, destFileName); + docOperation.setStatus(operationName + " document: " + eXistIdentifier + " on eXist server"); + mpdlXmlRpcDocHandler.saveDocumentFile(docOperation); + // save PDF and HTML versions of the document + boolean includePdf = docOperation.includePdf(); + if (includePdf) { + docOperation.setStatus("create PDF and HTML versions of the document: " + eXistIdentifier); + MpdlITextRenderer mpdlRenderer = MpdlITextRenderer.getInstance(); + MetadataRecord mdRecord = docOperation.getMdRecord(); // after validation, docOperation has a mdRecord + mpdlRenderer.createFile(true, true, "text", mdRecord); // generate Pdf/Html document + } + // perform operation on eSciDoc (NOTE(review): eSciDocIngestor is null if this handler was built with the one-argument constructor; confirm that callers never do that for create/update) + eSciDocIngestor.execute(docOperation); + } catch (MalformedURLException e) { + throw new ApplicationException(e); + } + } + + private void delete(MpdlDocOperation docOperation) throws ApplicationException { + String operationName = docOperation.getName(); + String eXistIdentifier = docOperation.getDestUrl(); + String fileName = docOperation.getFileName(); + if (fileName == null || fileName.trim().equals("")) + throw new ApplicationException("Your document file 
name is empty. Please specify a file name for your document."); + if (! fileName.endsWith(".xml")) + throw new ApplicationException("Your document file name does not end with \".xml\". Please specify a file name with the suffix \".xml\" for your document."); + boolean docExists = mpdlXmlRpcDocHandler.documentExists(docOperation); + if (! docExists) { + throw new ApplicationException("Document:" + eXistIdentifier + " does not exists. Please use a name that exists and perform the operation \"Delete\" again."); + } + // perform operation on eXist + docOperation.setStatus(operationName + " document: " + eXistIdentifier + " on eXist server"); + // delete file on local eXist file system: xml, pdf and html + String eXistIdentifierWithoutExtension = eXistIdentifier.substring(0, eXistIdentifier.length() - 4); // without ".xml" + String destFileNameXml = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifier; + FileUtil.getInstance().deleteFile(destFileNameXml); + boolean includePdf = docOperation.includePdf(); + if (includePdf) { + String destFileNamePdf = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + ".pdf"; + String destFileNameHtml = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + ".html"; + FileUtil.getInstance().deleteFile(destFileNamePdf); + FileUtil.getInstance().deleteFile(destFileNameHtml); + } + // delete document in eXist + mpdlXmlRpcDocHandler.deleteDocumentFile(docOperation); + // perform operation on eSciDoc + eSciDocIngestor.execute(docOperation); + } + /** Updates a document on eXist only (no eSciDoc step): copies the source URL into the local eXist data directory, validates it, saves its regularizations and the document on eXist and optionally renders PDF/HTML. Throws ApplicationException if no source URL is given or if the URL is malformed. */ + private void updateExist(MpdlDocOperation docOperation) throws ApplicationException { + try { + String operationName = docOperation.getName(); + String language = docOperation.getLanguage(); + String srcUrlStr = docOperation.getSrcUrl(); + String eXistIdentifier = docOperation.getDestUrl(); + String destFileName = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifier; + URL srcUrl = null; + String protocol = 
null; + if (srcUrlStr != null && ! srcUrlStr.equals("empty")) { + srcUrl = new URL(srcUrlStr); + protocol = srcUrl.getProtocol(); + } + SchemaHandler schemaHandler = new SchemaHandler(); + if (srcUrl == null) { throw new ApplicationException("No source URL specified for document: " + eXistIdentifier); } else if (protocol.equals("file")) { // a null or "empty" source URL leaves protocol null and used to end in a NullPointerException here + docOperation.setStatus("upload file: " + srcUrlStr + " to eXist server"); + } else { + docOperation.setStatus("download file from: " + srcUrlStr + " to eXist server"); + } + // load file to local file system + FileUtil.getInstance().saveUrlToLocalFile(srcUrl, destFileName); + // validation + docOperation.setStatus("validate document: " + eXistIdentifier); + schemaHandler.validate(destFileName, docOperation); + // save regularizations of the document + docOperation.setStatus(operationName + " regularizations of document: " + eXistIdentifier + " on eXist server"); + RegularizationManager regManager = RegularizationManager.getInstance(); + regManager.saveRegularizations(language, destFileName); + // perform operation on eXist + docOperation.setStatus(operationName + " document: " + eXistIdentifier + " on eXist server"); + mpdlXmlRpcDocHandler.saveDocumentFile(docOperation); + // save PDF and HTML versions of the document + boolean includePdf = docOperation.includePdf(); + if (includePdf) { + docOperation.setStatus("create PDF and HTML versions of the document: " + eXistIdentifier); + MpdlITextRenderer mpdlRenderer = MpdlITextRenderer.getInstance(); + MetadataRecord mdRecord = docOperation.getMdRecord(); // after validation, docOperation has a mdRecord + mpdlRenderer.createFile(true, true, "text", mdRecord); // generate Pdf/Html document + } + } catch (MalformedURLException e) { + throw new ApplicationException(e); + } + } + + private void deleteExist(MpdlDocOperation docOperation) throws ApplicationException { + String operationName = docOperation.getName(); + String eXistIdentifier = docOperation.getDestUrl(); + String fileName = docOperation.getFileName(); + if (fileName == null || fileName.trim().equals("")) + throw new ApplicationException("Your 
document file name is empty. Please specify a file name for your document."); + if (! fileName.endsWith(".xml")) + throw new ApplicationException("Your document file name does not end with \".xml\". Please specify a file name with the suffix \".xml\" for your document."); + boolean docExists = mpdlXmlRpcDocHandler.documentExists(docOperation); + if (! docExists) + throw new ApplicationException("Document:" + eXistIdentifier + " does not exist."); + // perform operation + docOperation.setStatus(operationName + " document: " + eXistIdentifier + " on eXist server"); + // delete file on local file system: xml, pdf and html + String eXistIdentifierWithoutExtension = eXistIdentifier.substring(0, eXistIdentifier.length() - 4); // without ".xml" (NOTE(review): only fileName is checked for the suffix above, not eXistIdentifier) + String destFileNameXml = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifier; + FileUtil.getInstance().deleteFile(destFileNameXml); + boolean includePdf = docOperation.includePdf(); + if (includePdf) { + String destFileNamePdf = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + ".pdf"; + String destFileNameHtml = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + ".html"; + FileUtil.getInstance().deleteFile(destFileNamePdf); + FileUtil.getInstance().deleteFile(destFileNameHtml); + } + // delete document in eXist + mpdlXmlRpcDocHandler.deleteDocumentFile(docOperation); + } + /** Deletes the two document root collections (morph and standard) on eXist. */ + private void deleteDocumentCollections() throws ApplicationException { + mpdlXmlRpcDocHandler.deleteCollection(documentRootCollectionMorph); + mpdlXmlRpcDocHandler.deleteCollection(documentRootCollectionStandard); + } + /** Creates one morph and one standard collection on eXist for every combination of docBases and languages. */ + private void createDocumentCollections() throws ApplicationException { + for (int i=0; i < docBases.length; i++) { + String docBase = docBases[i]; + for (int j=0; j < languages.length; j++) { + String language = languages[j]; + String documentCollectionMorph = documentRootCollectionMorph + "/" + docBase + "/" + language; + 
mpdlXmlRpcDocHandler.createCollection(documentCollectionMorph); + String documentCollectionStandard = documentRootCollectionStandard + "/" + docBase + "/" + language; + mpdlXmlRpcDocHandler.createCollection(documentCollectionStandard); + } + } + } + /** Imports every .xml file found in the local directory of each docBase/language combination into eXist by running an "updateExist" operation per file; directories that are missing or unreadable are skipped. */ + private void saveDocumentFiles() throws ApplicationException { + int counter = 0; + for (int i=0; i < docBases.length; i++) { + String docBase = docBases[i]; + for (int j=0; j < languages.length; j++) { + String language = languages[j]; + String documentCollection = "/" + docBase + "/" + language; + String localFileDirStr = localDocumentDirectory + "/" + docBase + "/" + language; + File localFileDir = new File(localFileDirStr); + FilenameFilter filter = new FilenameFilterExtension("xml"); + File[] files = localFileDir.listFiles(filter); + System.out.println("Adding all documents in path: \"" + localFileDirStr + "\" to eXist collection: \"" + documentCollection + "\" ..."); + for (int k=0; files != null && k < files.length; k++) { // listFiles returns null when the directory is missing or unreadable + File f = files[k]; + String localFileNameWithoutPath = f.getName(); + String fullLocalFileName = f.getPath(); + String srcUrl = "file://" + fullLocalFileName; + MpdlDocOperation docOperation = new MpdlDocOperation("updateExist", srcUrl, null, docBase, language, localFileNameWithoutPath); + long begin = new Date().getTime(); + doOperation(docOperation); + long end = new Date().getTime(); + System.out.println("Added document \"" + fullLocalFileName + "\" to eXist collection: \"" + documentCollection + "\" (" + (end - begin) + " ms)" ); + counter++; + } + } + } + System.out.println("Imported documents: " + counter); + } + /** Renders PDF/HTML versions for every local .xml document whose PDF does not exist yet in the eXist data directory, pausing 60 seconds after each rendered document; directories that are missing or unreadable are skipped. */ + private void generatePdfHtmlDocumentFiles() throws ApplicationException { + int counter = 0; + MpdlITextRenderer mpdlRenderer = MpdlITextRenderer.getInstance(); + for (int i=0; i < docBases.length; i++) { + String docBase = docBases[i]; + for (int j=0; j < languages.length; j++) { + String language = languages[j]; + String localFileDirStr = localDocumentDirectory + "/" + docBase + "/" + 
language; + File localFileDir = new File(localFileDirStr); + FilenameFilter filter = new FilenameFilterExtension("xml"); + File[] files = localFileDir.listFiles(filter); + System.out.println("Generating Pdf/Html documents in path: \"" + localFileDirStr + "\" ..."); + for (int k=0; files != null && k < files.length; k++) { // listFiles returns null when the directory is missing or unreadable + File f = files[k]; + String localFileName = f.getName(); + String fullLocalFileName = f.getPath(); + String srcUrl = "file://" + fullLocalFileName; + String localFileNameWithoutExtension = localFileName.substring(0, localFileName.length() - 4); // without ".xml" + String fullLocalPdfFileName = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents/" + docBase + "/" + language + "/" + localFileNameWithoutExtension + ".pdf"; + File localPdfFile = new File(fullLocalPdfFileName); + boolean pdfFileAlreadyExists = localPdfFile.exists(); + // generate Pdf/Html file only if pdf file does not already exist + if (! pdfFileAlreadyExists) { + MpdlDocOperation docOperation = new MpdlDocOperation("generatePdf", srcUrl, null, docBase, language, localFileName); + SchemaHandler schemaHandler = new SchemaHandler(); + schemaHandler.validate(fullLocalFileName, docOperation); + long begin = new Date().getTime(); + MetadataRecord mdRecord = docOperation.getMdRecord(); // after validation, docOperation has a mdRecord + mpdlRenderer.createFile(true, true, "text", mdRecord); // generate Pdf/Html document + long end = new Date().getTime(); + System.out.println("Generate Pdf/Html document for: \"" + fullLocalFileName + "\" (" + (end - begin) + " ms)" ); + counter++; + try { + Thread.sleep(60000); // delay so that called servers (digilib, eXist) are not stressed too much + } catch (InterruptedException e) { + throw new ApplicationException(e); + } + } + } + } + } + System.out.println("Generated documents: " + counter); + } + + private void deletePresentationCollection() throws ApplicationException { + mpdlXmlRpcDocHandler.deleteCollection(presentationRootCollection); + } + + private void 
createPresentationCollection() throws ApplicationException { + mpdlXmlRpcDocHandler.createCollection(presentationRootCollection); + } + + private void deleteSchemaCollection() throws ApplicationException { + mpdlXmlRpcDocHandler.deleteCollection(schemaRootCollection); + } + + private void createSchemaCollection() throws ApplicationException { + mpdlXmlRpcDocHandler.createCollection(schemaRootCollection); + } + + private void beginOperation() { + beginOfOperation = new Date().getTime(); + } + + private void endOperation() { + endOfOperation = new Date().getTime(); + } + +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/doc/GetDocServlet.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/doc/GetDocServlet.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,54 @@ +package de.mpg.mpiwg.berlin.mpdl.doc; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.net.URLConnection; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; + +public class GetDocServlet extends HttpServlet { + private static final long serialVersionUID = -4889427839010526185L; + /** Serves the file named by the request parameter "doc", resolved below the documents directory of the eXist data directory; answers with a "does not exist" text when the parameter is missing, the path leaves that directory or no regular file is found. */ + protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + String docEXistIdentifier = request.getParameter("doc"); + String docFileName = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + docEXistIdentifier; + File docFile = new File(docFileName); + if (docEXistIdentifier != null && docFile.isFile() && docFile.getCanonicalPath().startsWith(new File(MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents").getCanonicalPath() + File.separator)) // the parameter is untrusted: only serve regular files whose canonical path stays inside the documents directory (blocks "../" traversal) + write(response, docFile); + else + write(response, "Document: " + docEXistIdentifier + " does not exist"); + } + + private void 
write(HttpServletResponse response, File file) throws IOException { + String fileName = file.getName(); + OutputStream out = response.getOutputStream(); + BufferedInputStream is = new BufferedInputStream(new FileInputStream(file)); + String contentType = URLConnection.guessContentTypeFromName(fileName); // other methods: URLConnection.guessContentTypeFromStream(is); or MIMEUtils.getMIMEType(file); + if (contentType != null) + response.setContentType(contentType); + response.setHeader("Content-Disposition", "filename=" + fileName); + byte[] buf = new byte[20000*1024]; // 20MB buffer + int bytesRead; + while ((bytesRead = is.read(buf)) != -1) { + out.write(buf, 0, bytesRead); + } + is.close(); + out.flush(); + out.close(); + } + + private void write(HttpServletResponse response, String str) throws IOException { + PrintWriter out = response.getWriter(); + out.write(str); + } + +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/DonatusAnalyzer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/DonatusAnalyzer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,181 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis; +import java.io.File; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Hashtable; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WordlistLoader; +import org.apache.lucene.analysis.br.BrazilianAnalyzer; +import org.apache.lucene.analysis.cz.CzechAnalyzer; +import org.apache.lucene.analysis.de.GermanAnalyzer; +import 
org.apache.lucene.analysis.fr.FrenchAnalyzer; +import org.apache.lucene.analysis.nl.DutchAnalyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusConstants; + +/** + * Analyzer for specific languages. Supports an external list of stopwords (words that + * will not be indexed at all) and an external list of exclusions (word that will + * not be stemmed, but indexed). + * A default set of stopwords is used unless an alternative list is specified, the + * exclusion list is empty by default. + */ +public class DonatusAnalyzer extends Analyzer { + protected String language = DonatusConstants.DEFAULT_LANGUAGE; + + /** + * Contains the stopwords used with the StopFilter. + */ + protected Set stopSet = new HashSet(); + + /** + * Contains words that should be indexed but not stemmed. + */ + protected Set exclusionSet = new HashSet(); + + /** + * Builds an analyzer with the stop words for the given language + * (GERMAN_STOP_WORDS). + */ + public DonatusAnalyzer() { + String[] stopWords = getStopWords(language); // stopwords for the language + stopSet = StopFilter.makeStopSet(stopWords); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzer(String[] stopwords) { + stopSet = StopFilter.makeStopSet(stopwords); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzer(Hashtable stopwords) { + stopSet = new HashSet(stopwords.keySet()); + } + + /** + * Builds an analyzer with the given stop words. 
+ */ + public DonatusAnalyzer(File stopwords) throws IOException { + stopSet = WordlistLoader.getWordSet(stopwords); + } + + public String getLanguage() { + return language; + } + + protected void setLanguage(String lang) { + this.language = lang; + } + + /** + * Get stopwords for the language: fetch them from the open language analyzers for some languages + * TODO other languages + * @param language + * @return stopwords + * + * +Taken from: http://www.perseus.tufts.edu/hopper/stopwords +# English: a, a's, able, about, above, according, accordingly, across, actually, after, afterwards, again, against, ain't, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, appear, appreciate, appropriate, are, aren't, around, as, aside, ask, asking, associated, at, available, away, awfully, b, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, both, brief, but, by, c, c'mon, c's, came, can, can't, cannot, cant, cause, causes, certain, certainly, changes, clearly, co, com, come, comes, concerning, consequently, consider, considering, contain, containing, contains, corresponding, could, couldn't, course, currently, d, definitely, described, despite, did, didn't, different, do, does, doesn't, doing, don't, done, down, downwards, during, e, each, edu, eg, eight, either, else, elsewhere, enough, entirely, especially, et, etc, even, ever, every, everybody, everyone, everything, everywhere, ex, exactly, example, except, f, far, few, fifth, first, five, followed, following, follows, for, former, formerly, forth, four, from, further, furthermore, g, get, gets, getting, given, gives, go, goes, going, gone, got, gotten, greetings, h, had, hadn't, happens, hardly, has, hasn't, have, haven't, having, he, he's, hello, help, hence, her, here, here's, 
hereafter, hereby, herein, hereupon, hers, herself, hi, him, himself, his, hither, hopefully, how, howbeit, however, i, i'd, i'll, i'm, i've, ie, if, ignored, immediate, in, inasmuch, inc, indeed, indicate, indicated, indicates, inner, insofar, instead, into, inward, is, isn't, it, it'd, it'll, it's, its, itself, j, just, k, keep, keeps, kept, know, known, knows, l, last, lately, later, latter, latterly, least, less, lest, let, let's, like, liked, likely, little, look, looking, looks, ltd, m, mainly, many, may, maybe, me, mean, meanwhile, merely, might, more, moreover, most, mostly, much, must, my, myself, n, name, namely, nd, near, nearly, necessary, need, needs, neither, never, nevertheless, new, next, nine, no, nobody, non, none, noone, nor, normally, not, nothing, novel, now, nowhere, o, obviously, of, off, often, oh, ok, okay, old, on, once, one, ones, only, onto, or, other, others, otherwise, ought, our, ours, ourselves, out, outside, over, overall, own, p, particular, particularly, per, perhaps, placed, please, plus, possible, presumably, probably, provides, q, que, quite, qv, r, rather, rd, re, really, reasonably, regarding, regardless, regards, relatively, respectively, right, s, said, same, saw, say, saying, says, second, secondly, see, seeing, seem, seemed, seeming, seems, seen, self, selves, sensible, sent, serious, seriously, seven, several, shall, she, should, shouldn't, since, six, so, some, somebody, somehow, someone, something, sometime, sometimes, somewhat, somewhere, soon, sorry, specified, specify, specifying, still, sub, such, sup, sure, t, t's, take, taken, tell, tends, th, than, thank, thanks, thanx, that, that's, thats, the, their, theirs, them, themselves, then, thence, there, there's, thereafter, thereby, therefore, therein, theres, thereupon, these, they, they'd, they'll, they're, they've, think, third, this, thorough, thoroughly, those, though, three, through, throughout, thru, thus, to, together, too, took, toward, towards, tried, 
tries, truly, try, trying, twice, two, u, un, under, unfortunately, unless, unlikely, until, unto, up, upon, us, use, used, useful, uses, using, usually, uucp, v, value, various, very, via, viz, vs, w, want, wants, was, wasn't, way, we, we'd, we'll, we're, we've, welcome, well, went, were, weren't, what, what's, whatever, when, whence, whenever, where, where's, whereafter, whereas, whereby, wherein, whereupon, wherever, whether, which, while, whilst, whither, who, who's, whoever, whole, whom, whose, why, will, willing, wish, with, within, without, won't, wonder, would, wouldn't, x, y, yes, yet, you, you'd, you'll, you're, you've, your, yours, yourself, yourselves, z, zero + +# Greek: a)/llos, a)/n, a)/ra, a)ll', a)lla/, a)po/, au)to/s, d', dai/, dai/s, de/, dh/, dia/, e(autou=, e)/ti, e)a/n, e)gw/, e)k, e)mo/s, e)n, e)pi/, ei), ei)/mi, ei)mi/, ei)s, ga/r, ga^, ge, h(, h)/, kai/, kata/, me/n, meta/, mh/, o(, o(/de, o(/s, o(/stis, o(/ti, oi(, ou(/tws, ou(=tos, ou), ou)/te, ou)=n, ou)de/, ou)dei/s, ou)k, para/, peri/, pro/s, so/s, su/, su/n, ta/, te, th/n, th=s, th=|, ti, ti/, ti/s, tis, to/, to/n, toi/, toiou=tos, tou/s, tou=, tw=n, tw=|, u(mo/s, u(pe/r, u(po/, w(/ste, w(s, w)= + +# Latin: ab, ac, ad, adhic, aliqui, aliquis, an, ante, apud, at, atque, aut, autem, cum, cur, de, deinde, dum, ego, enim, ergo, es, est, et, etiam, etsi, ex, fio, haud, hic, iam, idem, igitur, ille, in, infra, inter, interim, ipse, is, ita, magis, modo, mox, nam, ne, nec, necque, neque, nisi, non, nos, o, ob, per, possum, post, pro, quae, quam, quare, qui, quia, quicumque, quidem, quilibet, quis, quisnam, quisquam, quisque, quisquis, quo, quoniam, sed, si, sic, sive, sub, sui, sum, super, suus, tam, tamen, trans, tu, tum, ubi, uel, uero, unus, ut + +# Italian: a, ad, agli, al, alcun, alcuno, all', alla, alle, allo, altra, altre, altri, altro, assai, avere, bene, c', ch', che, chi, ci, cio, co', col, come, con, cosi, cosi\, d', da, dal, dall', dalla, dalle, de, de', degli, dei, del, dell', 
della, delle, dello, di, duo, e, ed, egli, essere, et, gia, gia\, gli, gran, grande, i, il, in, io, l', la, le, li, lo, ma, maggior, maggiore, mai, mio, molto, ne, ne', nel, nell', nella, nelle, non, o, ogn', ogni, oue, ove, per, perche, piu, piu\, poco, poi, puo, qual, qualche, qualcun, qualcuno, quale, quanta, quante, quanti, quanto, quasi, quella, quelle, quelli, quello, questa, queste, questi, questo, qui, s', se, sempre, senza, si, sotto, su, sua, sue, sui, suo, tal, tanta, tante, tanti, tanto, tra, tre, tutta, tutte, tutti, tutto, un, una, uno, vn, vna, vno + +# German: aber, alle, als, also, am, an, andern, auch, auf, aus, bei, bey, bis, da, daher, das, dass, de, dem, den, der, des, die, diese, dieser, dieses, doch, durch, eben, ein, eine, einem, einen, einer, eines, er, es, fur, gegen, haben, hat, ihre, im, in, ist, kan, man, mehr, mit, nach, nicht, noch, nur, oder, ohne, sehr, sei, selbst, sey, sich, sie, sind, so, uber, um, und, unter, vgl, vom, von, weil, welche, wenn, werden, wie, wird, zu, zur + +# French: a, amp, au, auec, aussi, autre, autres, aux, bien, car, ce, ces, cette, ceux, chose, choses, comme, d', dans, de, des, deux, dire, dont, du, elle, elles, en, encore, est, estre, et, faire, fait, faut, force, grande, ie, il, ils, l', la, le, les, leur, leurs, lors, luy, mais, mesme, n', ne, nous, on, ont, or, ou, par, parce, pas, peut, plus, plusieurs, point, pour, pourquoy, puis, qu', quand, que, qui, quoy, sa, sans, se, ses, si, soit, son, sont, sur, tous, tout, toutes, vn, vne, y + */ + public String[] getStopWords(String language) { + String[] stopwords = new String[0]; + if (language != null) { + if (language.equals("en")) + stopwords = StandardAnalyzer.STOP_WORDS; + else if(language.equals("br")) + stopwords = BrazilianAnalyzer.BRAZILIAN_STOP_WORDS; + else if(language.equals("cz")) + stopwords = CzechAnalyzer.CZECH_STOP_WORDS; + else if(language.equals("de")) + stopwords = GermanAnalyzer.GERMAN_STOP_WORDS; + else if(language.equals("fr")) + 
stopwords = FrenchAnalyzer.FRENCH_STOP_WORDS; + else if(language.equals("nl")) + stopwords = DutchAnalyzer.DUTCH_STOP_WORDS; + } + return stopwords; + } + + /** + * Builds an exclusionlist from an array of Strings. + */ + public void setStemExclusionTable(String[] exclusionlist) { + exclusionSet = StopFilter.makeStopSet(exclusionlist); + } + + /** + * Builds an exclusionlist from a Hashtable. + */ + public void setStemExclusionTable(Hashtable exclusionlist) { + exclusionSet = new HashSet(exclusionlist.keySet()); + } + + /** + * Builds an exclusionlist from the words contained in the given file. + */ + public void setStemExclusionTable(File exclusionlist) throws IOException { + exclusionSet = WordlistLoader.getWordSet(exclusionlist); + } + + /** + * Creates a TokenStream which tokenizes all the text in the provided Reader. + * + * @return A TokenStream build from a StandardTokenizer filtered with + * StandardFilter, LowerCaseFilter, StopFilter, DonatusStemFilter + */ + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new StandardTokenizer(reader); + result = new StandardFilter(result); + result = new LowerCaseFilter(result); + result = new StopFilter(result, stopSet); + result = new DonatusStemFilter(this, result, exclusionSet); + return result; + } + + public ArrayList getToken(String inputString) { + ArrayList token = new ArrayList(); + try { + Reader reader = new StringReader(inputString); + TokenStream result = new StandardTokenizer(reader); + result = new StandardFilter(result); + result = new LowerCaseFilter(result); + result = new StopFilter(result, stopSet); + Token t = result.next(); + while (t != null) { + String currentToken = String.valueOf(t.termBuffer()); + token.add(currentToken); + t = result.next(); + } + } catch (IOException e) { + e.printStackTrace(); + } + return token; + } + +} diff -r 000000000000 -r 408254cf2f1d 
software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/DonatusStemFilter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/DonatusStemFilter.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,53 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; + +import java.io.IOException; +import java.util.Set; + +public final class DonatusStemFilter extends TokenFilter { + private DonatusAnalyzer analyzer; + private Token token = null; + private DonatusStemmer stemmer = null; + private Set exclusionSet = null; + + public DonatusStemFilter(TokenStream in) { + super(in); + stemmer = new DonatusStemmer(); + } + + public DonatusStemFilter(DonatusAnalyzer analyzer, TokenStream in, Set exclusionSet) { + this(in); + this.analyzer = analyzer; + this.exclusionSet = exclusionSet; + this.stemmer.setLanguage(analyzer.getLanguage()); + } + + public final Token next() throws IOException { + if (( token = input.next()) == null) { + return null; + } else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) { + return token; + } else { + String s = stemmer.stem(token.termText()); + // If not stemmed, dont waste the time creating a new token + if ( !s.equals( token.termText() ) ) { + return new Token( s, token.startOffset(), + token.endOffset(), token.type() ); + } + return token; + } + } + + public void setStemmer(DonatusStemmer stemmer) { + if ( stemmer != null ) { + this.stemmer = stemmer; + } + } + + public void setExclusionSet(Set exclusionSet) { + this.exclusionSet = exclusionSet; + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/DonatusStemmer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/DonatusStemmer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,146 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis; + +import org.apache.log4j.Logger; + +import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusCache; +import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusConstants; +import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusLemma; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class DonatusStemmer { + private String language = DonatusConstants.DEFAULT_LANGUAGE; + + protected void setLanguage(String language) { + this.language = language; + } + + /** + * Used for indexing documents and for querying + * @param term + * @return + */ + protected String stem(String term) { + String stem = null; + term = term.toLowerCase(); + // try to find the stem by the DonatusCache + DonatusLemma donatusLemma = null; + try { + DonatusCache donatusCache = DonatusCache.getInstance(); + donatusLemma = donatusCache.getLemmaByVariantForm(language, term); + } catch (ApplicationException e) { + // nothing, do not disturb + } + if (donatusLemma != null) + stem = donatusLemma.getForm(); + // if not found by Donatus try to use Snowball (or later other language specific stemmers) + if (stem == null) { + stem = stemBySnowball(term, language); + // if term is not equal to the base form and also the stem is not too short (> 2 characters) then add this Snowball variant to the lemmas in cache + if ((! 
stem.equals(term)) && stem.length() > 2) { + try { + DonatusCache donatusCache = DonatusCache.getInstance(); + if (donatusCache.getMode() == DonatusCache.DOCUMENT_MODE) { + donatusCache.addVariant(language, stem, DonatusConstants.TYPE_SNOWBALL, term); + } + } catch (ApplicationException e) { + Logger.getLogger(DonatusStemmer.class).warn("DonatusCache: an exception was caught while indexing a document: " + e.getMessage(), e); + } + } + } + /* TODO if Snowball is too bad (for some languages) use Lucene analyzers + if (stem == null) { + stem = stemByLanguageStemmers(term, this.language); + } + */ + return stem; + } + + private String stemBySnowball(String term, String language) { + String stem = null; + if (language.equals("de")) { + net.sf.snowball.ext.GermanStemmer stemmer = new net.sf.snowball.ext.GermanStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("en")) { + net.sf.snowball.ext.EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("nl")) { + net.sf.snowball.ext.DutchStemmer stemmer = new net.sf.snowball.ext.DutchStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("fi")) { + net.sf.snowball.ext.FinnishStemmer stemmer = new net.sf.snowball.ext.FinnishStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("fr")) { + net.sf.snowball.ext.FrenchStemmer stemmer = new net.sf.snowball.ext.FrenchStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("it")) { + net.sf.snowball.ext.ItalianStemmer stemmer = new net.sf.snowball.ext.ItalianStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("no")) { + net.sf.snowball.ext.NorwegianStemmer 
stemmer = new net.sf.snowball.ext.NorwegianStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("pt")) { + net.sf.snowball.ext.PortugueseStemmer stemmer = new net.sf.snowball.ext.PortugueseStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("ru")) { + net.sf.snowball.ext.RussianStemmer stemmer = new net.sf.snowball.ext.RussianStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("es")) { + net.sf.snowball.ext.SpanishStemmer stemmer = new net.sf.snowball.ext.SpanishStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("sv")) { + net.sf.snowball.ext.SwedishStemmer stemmer = new net.sf.snowball.ext.SwedishStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else { + stem = term; // if no language fits deliver the term itself as the stem form + } + return stem; + } + + /* + private String stemByLanguageStemmers(String term, String language) { + // TODO provide other languages + String stem = null; + if (language.equals("br")) { + BrazilianStemmer stemmer = new BrazilianStemmer(); + stem = stemmer.stem(term); + } else if (language.equals("de")) { + GermanStemmer stemmer = new GermanStemmer(); + stem = stemmer.stem(term); + } else if (language.equals("fr")) { + FrenchStemmer stemmer = new FrenchStemmer(); + stem = stemmer.stem(term); + } else if (language.equals("nl")) { + DutchStemmer stemmer = new DutchStemmer(); + stem = stemmer.stem(term); + } else if (language.equals("ru")) { + RussianStemmer stemmer = new RussianStemmer(); + stem = stemmer.stem(term); + } else { + stem = term; // if no language fits deliver the term itself as the stem form + } + return stem; + } + */ +} diff -r 000000000000 -r 408254cf2f1d 
software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/BrazilianStemmer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/BrazilianStemmer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,1021 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A stemmer for Brazilian words. + */ +public class BrazilianStemmer { + + /** + * Changed term + */ + private String TERM ; + private String CT ; + private String R1 ; + private String R2 ; + private String RV ; + + + public BrazilianStemmer() { + } + + /** + * Stemms the given term to an unique discriminator. + * + * @param term The term that should be stemmed. 
+ * @return Discriminator for term + */ + public String stem( String term ) { + boolean altered = false ; // altered the term + + // creates CT + createCT(term) ; + + if ( !isIndexable( CT ) ) { + return null; + } + if ( !isStemmable( CT ) ) { + return CT ; + } + + R1 = getR1(CT) ; + R2 = getR1(R1) ; + RV = getRV(CT) ; + TERM = term + ";" +CT ; + + altered = step1() ; + if (!altered) { + altered = step2() ; + } + + if (altered) { + step3(); + } else { + step4(); + } + + step5() ; + + return CT ; + } + + /** + * Checks a term if it can be processed correctly. + * + * @return true if, and only if, the given term consists in letters. + */ + private boolean isStemmable( String term ) { + for ( int c = 0; c < term.length(); c++ ) { + // Discard terms that contain non-letter characters. + if ( !Character.isLetter(term.charAt(c))) { + return false; + } + } + return true; + } + + /** + * Checks a term if it can be processed indexed. + * + * @return true if it can be indexed + */ + private boolean isIndexable( String term ) { + return (term.length() < 30) && (term.length() > 2) ; + } + + /** + * See if string is 'a','e','i','o','u' + * + * @return true if is vowel + */ + private boolean isVowel( char value ) { + return (value == 'a') || + (value == 'e') || + (value == 'i') || + (value == 'o') || + (value == 'u') ; + } + + /** + * Gets R1 + * + * R1 - is the region after the first non-vowel follwing a vowel, + * or is the null region at the end of the word if there is + * no such non-vowel. + * + * @return null or a string representing R1 + */ + private String getR1( String value ) { + int i; + int j; + + // be-safe !!! 
+ if (value == null) { + return null ; + } + + // find 1st vowel + i = value.length()-1 ; + for (j=0 ; j < i ; j++) { + if (isVowel(value.charAt(j))) { + break ; + } + } + + if (!(j < i)) { + return null ; + } + + // find 1st non-vowel + for ( ; j < i ; j++) { + if (!(isVowel(value.charAt(j)))) { + break ; + } + } + + if (!(j < i)) { + return null ; + } + + return value.substring(j+1) ; + } + + /** + * Gets RV + * + * RV - IF the second letter is a consoant, RV is the region after + * the next following vowel, + * + * OR if the first two letters are vowels, RV is the region + * after the next consoant, + * + * AND otherwise (consoant-vowel case) RV is the region after + * the third letter. + * + * BUT RV is the end of the word if this positions cannot be + * found. + * + * @return null or a string representing RV + */ + private String getRV( String value ) { + int i; + int j; + + // be-safe !!! + if (value == null) { + return null ; + } + + i = value.length()-1 ; + + // RV - IF the second letter is a consoant, RV is the region after + // the next following vowel, + if ((i > 0) && !isVowel(value.charAt(1))) { + // find 1st vowel + for (j=2 ; j < i ; j++) { + if (isVowel(value.charAt(j))) { + break ; + } + } + + if (j < i) { + return value.substring(j+1) ; + } + } + + + // RV - OR if the first two letters are vowels, RV is the region + // after the next consoant, + if ((i > 1) && + isVowel(value.charAt(0)) && + isVowel(value.charAt(1))) { + // find 1st consoant + for (j=2 ; j < i ; j++) { + if (!isVowel(value.charAt(j))) { + break ; + } + } + + if (j < i) { + return value.substring(j+1) ; + } + } + + // RV - AND otherwise (consoant-vowel case) RV is the region after + // the third letter. 
+ if (i > 2) { + return value.substring(3) ; + } + + return null ; + } + + /** + * 1) Turn to lowercase + * 2) Remove accents + * 3) ã -> a ; õ -> o + * 4) ç -> c + * + * @return null or a string transformed + */ + private String changeTerm( String value ) { + int j; + String r = "" ; + + // be-safe !!! + if (value == null) { + return null ; + } + + value = value.toLowerCase() ; + for (j=0 ; j < value.length() ; j++) { + if ((value.charAt(j) == 'á') || + (value.charAt(j) == 'â') || + (value.charAt(j) == 'ã')) { + r= r + "a" ; continue ; + } + if ((value.charAt(j) == 'é') || + (value.charAt(j) == 'ê')) { + r= r + "e" ; continue ; + } + if (value.charAt(j) == 'í') { + r= r + "i" ; continue ; + } + if ((value.charAt(j) == 'ó') || + (value.charAt(j) == 'ô') || + (value.charAt(j) == 'õ')) { + r= r + "o" ; continue ; + } + if ((value.charAt(j) == 'ú') || + (value.charAt(j) == 'ü')) { + r= r + "u" ; continue ; + } + if (value.charAt(j) == 'ç') { + r= r + "c" ; continue ; + } + if (value.charAt(j) == 'ñ') { + r= r + "n" ; continue ; + } + + r= r+ value.charAt(j) ; + } + + return r ; + } + + /** + * Check if a string ends with a suffix + * + * @return true if the string ends with the specified suffix + */ + private boolean suffix( String value, String suffix ) { + + // be-safe !!! + if ((value == null) || (suffix == null)) { + return false ; + } + + if (suffix.length() > value.length()) { + return false ; + } + + return value.substring(value.length()-suffix.length()).equals(suffix); + } + + /** + * Replace a string suffix by another + * + * @return the replaced String + */ + private String replaceSuffix( String value, String toReplace, String changeTo ) { + String vvalue ; + + // be-safe !!! 
+ if ((value == null) || + (toReplace == null) || + (changeTo == null) ) { + return value ; + } + + vvalue = removeSuffix(value,toReplace) ; + + if (value.equals(vvalue)) { + return value ; + } else { + return vvalue + changeTo ; + } + } + + /** + * Remove a string suffix + * + * @return the String without the suffix + */ + private String removeSuffix( String value, String toRemove ) { + // be-safe !!! + if ((value == null) || + (toRemove == null) || + !suffix(value,toRemove) ) { + return value ; + } + + return value.substring(0,value.length()-toRemove.length()) ; + } + + /** + * See if a suffix is preceded by a String + * + * @return true if the suffix is preceded + */ + private boolean suffixPreceded( String value, String suffix, String preceded ) { + // be-safe !!! + if ((value == null) || + (suffix == null) || + (preceded == null) || + !suffix(value,suffix) ) { + return false ; + } + + return suffix(removeSuffix(value,suffix),preceded) ; + } + + /** + * Creates CT (changed term) , substituting * 'ã' and 'õ' for 'a~' and 'o~'. + */ + private void createCT( String term ) { + CT = changeTerm(term) ; + + if (CT.length() < 2) return ; + + // if the first character is ... , remove it + if ((CT.charAt(0) == '"') || + (CT.charAt(0) == '\'') || + (CT.charAt(0) == '-') || + (CT.charAt(0) == ',') || + (CT.charAt(0) == ';') || + (CT.charAt(0) == '.') || + (CT.charAt(0) == '?') || + (CT.charAt(0) == '!') + ) { + CT = CT.substring(1); + } + + if (CT.length() < 2) return ; + + // if the last character is ... , remove it + if ((CT.charAt(CT.length()-1) == '-') || + (CT.charAt(CT.length()-1) == ',') || + (CT.charAt(CT.length()-1) == ';') || + (CT.charAt(CT.length()-1) == '.') || + (CT.charAt(CT.length()-1) == '?') || + (CT.charAt(CT.length()-1) == '!') || + (CT.charAt(CT.length()-1) == '\'') || + (CT.charAt(CT.length()-1) == '"') + ) { + CT = CT.substring(0,CT.length()-1); + } + } + + + /** + * Standart suffix removal. 
+ * Search for the longest among the following suffixes, and perform + * the following actions: + * + * @return false if no ending was removed + */ + private boolean step1() { + if (CT == null) return false ; + + // suffix lenght = 7 + if (suffix(CT,"uciones") && suffix(R2,"uciones")) { + CT = replaceSuffix(CT,"uciones","u") ; return true; + } + + // suffix lenght = 6 + if (CT.length() >= 6) { + if (suffix(CT,"imentos") && suffix(R2,"imentos")) { + CT = removeSuffix(CT,"imentos") ; return true; + } + if (suffix(CT,"amentos") && suffix(R2,"amentos")) { + CT = removeSuffix(CT,"amentos") ; return true; + } + if (suffix(CT,"adores") && suffix(R2,"adores")) { + CT = removeSuffix(CT,"adores") ; return true; + } + if (suffix(CT,"adoras") && suffix(R2,"adoras")) { + CT = removeSuffix(CT,"adoras") ; return true; + } + if (suffix(CT,"logias") && suffix(R2,"logias")) { + replaceSuffix(CT,"logias","log") ; return true; + } + if (suffix(CT,"encias") && suffix(R2,"encias")) { + CT = replaceSuffix(CT,"encias","ente") ; return true; + } + if (suffix(CT,"amente") && suffix(R1,"amente")) { + CT = removeSuffix(CT,"amente") ; return true; + } + if (suffix(CT,"idades") && suffix(R2,"idades")) { + CT = removeSuffix(CT,"idades") ; return true; + } + } + + // suffix lenght = 5 + if (CT.length() >= 5) { + if (suffix(CT,"acoes") && suffix(R2,"acoes")) { + CT = removeSuffix(CT,"acoes") ; return true; + } + if (suffix(CT,"imento") && suffix(R2,"imento")) { + CT = removeSuffix(CT,"imento") ; return true; + } + if (suffix(CT,"amento") && suffix(R2,"amento")) { + CT = removeSuffix(CT,"amento") ; return true; + } + if (suffix(CT,"adora") && suffix(R2,"adora")) { + CT = removeSuffix(CT,"adora") ; return true; + } + if (suffix(CT,"ismos") && suffix(R2,"ismos")) { + CT = removeSuffix(CT,"ismos") ; return true; + } + if (suffix(CT,"istas") && suffix(R2,"istas")) { + CT = removeSuffix(CT,"istas") ; return true; + } + if (suffix(CT,"logia") && suffix(R2,"logia")) { + CT = 
replaceSuffix(CT,"logia","log") ; return true; + } + if (suffix(CT,"ucion") && suffix(R2,"ucion")) { + CT = replaceSuffix(CT,"ucion","u") ; return true; + } + if (suffix(CT,"encia") && suffix(R2,"encia")) { + CT = replaceSuffix(CT,"encia","ente") ; return true; + } + if (suffix(CT,"mente") && suffix(R2,"mente")) { + CT = removeSuffix(CT,"mente") ; return true; + } + if (suffix(CT,"idade") && suffix(R2,"idade")) { + CT = removeSuffix(CT,"idade") ; return true; + } + } + + // suffix lenght = 4 + if (CT.length() >= 4) { + if (suffix(CT,"acao") && suffix(R2,"acao")) { + CT = removeSuffix(CT,"acao") ; return true; + } + if (suffix(CT,"ezas") && suffix(R2,"ezas")) { + CT = removeSuffix(CT,"ezas") ; return true; + } + if (suffix(CT,"icos") && suffix(R2,"icos")) { + CT = removeSuffix(CT,"icos") ; return true ; + } + if (suffix(CT,"icas") && suffix(R2,"icas")) { + CT = removeSuffix(CT,"icas") ; return true ; + } + if (suffix(CT,"ismo") && suffix(R2,"ismo")) { + CT = removeSuffix(CT,"ismo") ; return true ; + } + if (suffix(CT,"avel") && suffix(R2,"avel")) { + CT = removeSuffix(CT,"avel") ; return true ; + } + if (suffix(CT,"ivel") && suffix(R2,"ivel")) { + CT = removeSuffix(CT,"ivel") ; return true ; + } + if (suffix(CT,"ista") && suffix(R2,"ista")) { + CT = removeSuffix(CT,"ista") ; return true ; + } + if (suffix(CT,"osos") && suffix(R2,"osos")) { + CT = removeSuffix(CT,"osos") ; return true ; + } + if (suffix(CT,"osas") && suffix(R2,"osas")) { + CT = removeSuffix(CT,"osas") ; return true ; + } + if (suffix(CT,"ador") && suffix(R2,"ador")) { + CT = removeSuffix(CT,"ador") ; return true ; + } + if (suffix(CT,"ivas") && suffix(R2,"ivas")) { + CT = removeSuffix(CT,"ivas") ; return true ; + } + if (suffix(CT,"ivos") && suffix(R2,"ivos")) { + CT = removeSuffix(CT,"ivos") ; return true ; + } + if (suffix(CT,"iras") && + suffix(RV,"iras") && + suffixPreceded(CT,"iras","e")) { + CT = replaceSuffix(CT,"iras","ir") ; return true ; + } + } + + // suffix lenght = 3 + if (CT.length() >= 
3) { + if (suffix(CT,"eza") && suffix(R2,"eza")) { + CT = removeSuffix(CT,"eza") ; return true ; + } + if (suffix(CT,"ico") && suffix(R2,"ico")) { + CT = removeSuffix(CT,"ico") ; return true ; + } + if (suffix(CT,"ica") && suffix(R2,"ica")) { + CT = removeSuffix(CT,"ica") ; return true ; + } + if (suffix(CT,"oso") && suffix(R2,"oso")) { + CT = removeSuffix(CT,"oso") ; return true ; + } + if (suffix(CT,"osa") && suffix(R2,"osa")) { + CT = removeSuffix(CT,"osa") ; return true ; + } + if (suffix(CT,"iva") && suffix(R2,"iva")) { + CT = removeSuffix(CT,"iva") ; return true ; + } + if (suffix(CT,"ivo") && suffix(R2,"ivo")) { + CT = removeSuffix(CT,"ivo") ; return true ; + } + if (suffix(CT,"ira") && + suffix(RV,"ira") && + suffixPreceded(CT,"ira","e")) { + CT = replaceSuffix(CT,"ira","ir") ; return true ; + } + } + + // no ending was removed by step1 + return false ; + } + + + /** + * Verb suffixes. + * + * Search for the longest among the following suffixes in RV, + * and if found, delete. + * + * @return false if no ending was removed + */ + private boolean step2() { + if (RV == null) return false ; + + // suffix lenght = 7 + if (RV.length() >= 7) { + if (suffix(RV,"issemos")) { + CT = removeSuffix(CT,"issemos") ; return true; + } + if (suffix(RV,"essemos")) { + CT = removeSuffix(CT,"essemos") ; return true; + } + if (suffix(RV,"assemos")) { + CT = removeSuffix(CT,"assemos") ; return true; + } + if (suffix(RV,"ariamos")) { + CT = removeSuffix(CT,"ariamos") ; return true; + } + if (suffix(RV,"eriamos")) { + CT = removeSuffix(CT,"eriamos") ; return true; + } + if (suffix(RV,"iriamos")) { + CT = removeSuffix(CT,"iriamos") ; return true; + } + } + + // suffix lenght = 6 + if (RV.length() >= 6) { + if (suffix(RV,"iremos")) { + CT = removeSuffix(CT,"iremos") ; return true; + } + if (suffix(RV,"eremos")) { + CT = removeSuffix(CT,"eremos") ; return true; + } + if (suffix(RV,"aremos")) { + CT = removeSuffix(CT,"aremos") ; return true; + } + if (suffix(RV,"avamos")) { + CT = 
removeSuffix(CT,"avamos") ; return true; + } + if (suffix(RV,"iramos")) { + CT = removeSuffix(CT,"iramos") ; return true; + } + if (suffix(RV,"eramos")) { + CT = removeSuffix(CT,"eramos") ; return true; + } + if (suffix(RV,"aramos")) { + CT = removeSuffix(CT,"aramos") ; return true; + } + if (suffix(RV,"asseis")) { + CT = removeSuffix(CT,"asseis") ; return true; + } + if (suffix(RV,"esseis")) { + CT = removeSuffix(CT,"esseis") ; return true; + } + if (suffix(RV,"isseis")) { + CT = removeSuffix(CT,"isseis") ; return true; + } + if (suffix(RV,"arieis")) { + CT = removeSuffix(CT,"arieis") ; return true; + } + if (suffix(RV,"erieis")) { + CT = removeSuffix(CT,"erieis") ; return true; + } + if (suffix(RV,"irieis")) { + CT = removeSuffix(CT,"irieis") ; return true; + } + } + + + // suffix lenght = 5 + if (RV.length() >= 5) { + if (suffix(RV,"irmos")) { + CT = removeSuffix(CT,"irmos") ; return true; + } + if (suffix(RV,"iamos")) { + CT = removeSuffix(CT,"iamos") ; return true; + } + if (suffix(RV,"armos")) { + CT = removeSuffix(CT,"armos") ; return true; + } + if (suffix(RV,"ermos")) { + CT = removeSuffix(CT,"ermos") ; return true; + } + if (suffix(RV,"areis")) { + CT = removeSuffix(CT,"areis") ; return true; + } + if (suffix(RV,"ereis")) { + CT = removeSuffix(CT,"ereis") ; return true; + } + if (suffix(RV,"ireis")) { + CT = removeSuffix(CT,"ireis") ; return true; + } + if (suffix(RV,"asses")) { + CT = removeSuffix(CT,"asses") ; return true; + } + if (suffix(RV,"esses")) { + CT = removeSuffix(CT,"esses") ; return true; + } + if (suffix(RV,"isses")) { + CT = removeSuffix(CT,"isses") ; return true; + } + if (suffix(RV,"astes")) { + CT = removeSuffix(CT,"astes") ; return true; + } + if (suffix(RV,"assem")) { + CT = removeSuffix(CT,"assem") ; return true; + } + if (suffix(RV,"essem")) { + CT = removeSuffix(CT,"essem") ; return true; + } + if (suffix(RV,"issem")) { + CT = removeSuffix(CT,"issem") ; return true; + } + if (suffix(RV,"ardes")) { + CT = removeSuffix(CT,"ardes") ; 
return true; + } + if (suffix(RV,"erdes")) { + CT = removeSuffix(CT,"erdes") ; return true; + } + if (suffix(RV,"irdes")) { + CT = removeSuffix(CT,"irdes") ; return true; + } + if (suffix(RV,"ariam")) { + CT = removeSuffix(CT,"ariam") ; return true; + } + if (suffix(RV,"eriam")) { + CT = removeSuffix(CT,"eriam") ; return true; + } + if (suffix(RV,"iriam")) { + CT = removeSuffix(CT,"iriam") ; return true; + } + if (suffix(RV,"arias")) { + CT = removeSuffix(CT,"arias") ; return true; + } + if (suffix(RV,"erias")) { + CT = removeSuffix(CT,"erias") ; return true; + } + if (suffix(RV,"irias")) { + CT = removeSuffix(CT,"irias") ; return true; + } + if (suffix(RV,"estes")) { + CT = removeSuffix(CT,"estes") ; return true; + } + if (suffix(RV,"istes")) { + CT = removeSuffix(CT,"istes") ; return true; + } + if (suffix(RV,"areis")) { + CT = removeSuffix(CT,"areis") ; return true; + } + if (suffix(RV,"aveis")) { + CT = removeSuffix(CT,"aveis") ; return true; + } + } + + // suffix lenght = 4 + if (RV.length() >= 4) { + if (suffix(RV,"aria")) { + CT = removeSuffix(CT,"aria") ; return true; + } + if (suffix(RV,"eria")) { + CT = removeSuffix(CT,"eria") ; return true; + } + if (suffix(RV,"iria")) { + CT = removeSuffix(CT,"iria") ; return true; + } + if (suffix(RV,"asse")) { + CT = removeSuffix(CT,"asse") ; return true; + } + if (suffix(RV,"esse")) { + CT = removeSuffix(CT,"esse") ; return true; + } + if (suffix(RV,"isse")) { + CT = removeSuffix(CT,"isse") ; return true; + } + if (suffix(RV,"aste")) { + CT = removeSuffix(CT,"aste") ; return true; + } + if (suffix(RV,"este")) { + CT = removeSuffix(CT,"este") ; return true; + } + if (suffix(RV,"iste")) { + CT = removeSuffix(CT,"iste") ; return true; + } + if (suffix(RV,"arei")) { + CT = removeSuffix(CT,"arei") ; return true; + } + if (suffix(RV,"erei")) { + CT = removeSuffix(CT,"erei") ; return true; + } + if (suffix(RV,"irei")) { + CT = removeSuffix(CT,"irei") ; return true; + } + if (suffix(RV,"aram")) { + CT = 
removeSuffix(CT,"aram") ; return true; + } + if (suffix(RV,"eram")) { + CT = removeSuffix(CT,"eram") ; return true; + } + if (suffix(RV,"iram")) { + CT = removeSuffix(CT,"iram") ; return true; + } + if (suffix(RV,"avam")) { + CT = removeSuffix(CT,"avam") ; return true; + } + if (suffix(RV,"arem")) { + CT = removeSuffix(CT,"arem") ; return true; + } + if (suffix(RV,"erem")) { + CT = removeSuffix(CT,"erem") ; return true; + } + if (suffix(RV,"irem")) { + CT = removeSuffix(CT,"irem") ; return true; + } + if (suffix(RV,"ando")) { + CT = removeSuffix(CT,"ando") ; return true; + } + if (suffix(RV,"endo")) { + CT = removeSuffix(CT,"endo") ; return true; + } + if (suffix(RV,"indo")) { + CT = removeSuffix(CT,"indo") ; return true; + } + if (suffix(RV,"arao")) { + CT = removeSuffix(CT,"arao") ; return true; + } + if (suffix(RV,"erao")) { + CT = removeSuffix(CT,"erao") ; return true; + } + if (suffix(RV,"irao")) { + CT = removeSuffix(CT,"irao") ; return true; + } + if (suffix(RV,"adas")) { + CT = removeSuffix(CT,"adas") ; return true; + } + if (suffix(RV,"idas")) { + CT = removeSuffix(CT,"idas") ; return true; + } + if (suffix(RV,"aras")) { + CT = removeSuffix(CT,"aras") ; return true; + } + if (suffix(RV,"eras")) { + CT = removeSuffix(CT,"eras") ; return true; + } + if (suffix(RV,"iras")) { + CT = removeSuffix(CT,"iras") ; return true; + } + if (suffix(RV,"avas")) { + CT = removeSuffix(CT,"avas") ; return true; + } + if (suffix(RV,"ares")) { + CT = removeSuffix(CT,"ares") ; return true; + } + if (suffix(RV,"eres")) { + CT = removeSuffix(CT,"eres") ; return true; + } + if (suffix(RV,"ires")) { + CT = removeSuffix(CT,"ires") ; return true; + } + if (suffix(RV,"ados")) { + CT = removeSuffix(CT,"ados") ; return true; + } + if (suffix(RV,"idos")) { + CT = removeSuffix(CT,"idos") ; return true; + } + if (suffix(RV,"amos")) { + CT = removeSuffix(CT,"amos") ; return true; + } + if (suffix(RV,"emos")) { + CT = removeSuffix(CT,"emos") ; return true; + } + if (suffix(RV,"imos")) { + CT 
= removeSuffix(CT,"imos") ; return true;
+      }
+      // A second test for "iras" was dropped here: it could never match
+      // because "iras" is already handled earlier in this length-4 group.
+      if (suffix(RV,"ieis")) {
+        CT = removeSuffix(CT,"ieis") ; return true;
+      }
+    }
+
+    // suffix length = 3
+    if (RV.length() >= 3) {
+      if (suffix(RV,"ada")) {
+        CT = removeSuffix(CT,"ada") ; return true;
+      }
+      if (suffix(RV,"ida")) {
+        CT = removeSuffix(CT,"ida") ; return true;
+      }
+      if (suffix(RV,"ara")) {
+        CT = removeSuffix(CT,"ara") ; return true;
+      }
+      if (suffix(RV,"era")) {
+        CT = removeSuffix(CT,"era") ; return true;
+      }
+      // Fixed: this branch used to test "ira" while stripping "ava", so it
+      // reported success without removing anything and also shadowed the
+      // real "ira" branch further down.
+      if (suffix(RV,"ava")) {
+        CT = removeSuffix(CT,"ava") ; return true;
+      }
+      if (suffix(RV,"iam")) {
+        CT = removeSuffix(CT,"iam") ; return true;
+      }
+      if (suffix(RV,"ado")) {
+        CT = removeSuffix(CT,"ado") ; return true;
+      }
+      if (suffix(RV,"ido")) {
+        CT = removeSuffix(CT,"ido") ; return true;
+      }
+      if (suffix(RV,"ias")) {
+        CT = removeSuffix(CT,"ias") ; return true;
+      }
+      if (suffix(RV,"ais")) {
+        CT = removeSuffix(CT,"ais") ; return true;
+      }
+      if (suffix(RV,"eis")) {
+        CT = removeSuffix(CT,"eis") ; return true;
+      }
+      if (suffix(RV,"ira")) {
+        CT = removeSuffix(CT,"ira") ; return true;
+      }
+      if (suffix(RV,"ear")) {
+        CT = removeSuffix(CT,"ear") ; return true;
+      }
+    }
+
+    // suffix length = 2
+    if (RV.length() >= 2) {
+      if (suffix(RV,"ia")) {
+        CT = removeSuffix(CT,"ia") ; return true;
+      }
+      if (suffix(RV,"ei")) {
+        CT = removeSuffix(CT,"ei") ; return true;
+      }
+      if (suffix(RV,"am")) {
+        CT = removeSuffix(CT,"am") ; return true;
+      }
+      if (suffix(RV,"em")) {
+        CT = removeSuffix(CT,"em") ; return true;
+      }
+      if (suffix(RV,"ar")) {
+        CT = removeSuffix(CT,"ar") ; return true;
+      }
+      if (suffix(RV,"er")) {
+        CT = removeSuffix(CT,"er") ; return true;
+      }
+      if (suffix(RV,"ir")) {
+        CT = removeSuffix(CT,"ir") ; return true;
+      }
+      if (suffix(RV,"as")) {
+        CT = removeSuffix(CT,"as") ; return true;
+      }
+      if (suffix(RV,"es")) {
+        CT = removeSuffix(CT,"es") ; return true;
+      }
+      if (suffix(RV,"is")) {
+        CT = removeSuffix(CT,"is") ; return true;
+      }
+      if (suffix(RV,"eu")) {
+        CT = removeSuffix(CT,"eu") ; return true;
+      }
+      // (a verbatim duplicate of the "iu" test was removed; it was unreachable)
+      if (suffix(RV,"iu")) {
+        CT = removeSuffix(CT,"iu") ; return true;
+      }
+      if (suffix(RV,"ou")) {
+        CT = removeSuffix(CT,"ou") ; return true;
+      }
+    }
+
+    // no ending was removed by step2
+    return false ;
+  }
+
+  /**
+   * Delete suffix 'i' if in RV and preceded by 'c'.
+   *
+   * Does nothing when RV is null. Only CT is modified.
+   */
+  private void step3() {
+    if (RV == null) return ;
+
+    if (suffix(RV,"i") && suffixPreceded(RV,"i","c")) {
+      CT = removeSuffix(CT,"i") ;
+    }
+
+  }
+
+  /**
+   * Residual suffix.
+   *
+   * If RV ends with one of the suffixes "os", "a", "i" or "o", delete that
+   * suffix from CT. The first matching suffix wins and at most one suffix is
+   * removed. Does nothing when RV is null.
+   *
+   * NOTE(review): the algorithm description also lists the accented forms
+   * (á í ó); they are not tested here, presumably because accents are
+   * normalized before this step runs - confirm against the term
+   * preparation code.
+   */
+  private void step4() {
+    if (RV == null) return ;
+
+    if (suffix(RV,"os")) {
+      CT = removeSuffix(CT,"os") ; return ;
+    }
+    if (suffix(RV,"a")) {
+      CT = removeSuffix(CT,"a") ; return ;
+    }
+    if (suffix(RV,"i")) {
+      CT = removeSuffix(CT,"i") ; return ;
+    }
+    if (suffix(RV,"o")) {
+      CT = removeSuffix(CT,"o") ; return ;
+    }
+
+  }
+
+  /**
+   * If RV ends with 'e', delete it from CT; if that 'e' is preceded by
+   * 'gu' (or 'ci'), additionally delete the 'u' (or 'i').
+   *
+   * Does nothing when RV is null.
+   *
+   * NOTE(review): the algorithm description also mentions the accented
+   * endings (é ê) and removing the cedilla from a final 'ç'. Neither is
+   * done in this method; presumably both are normalized before stemming
+   * starts - confirm.
+   */
+  private void step5() {
+    if (RV == null) return ;
+
+    if (suffix(RV,"e")) {
+      if (suffixPreceded(RV,"e","gu")) {
+        CT = removeSuffix(CT,"e") ;
+        CT = removeSuffix(CT,"u") ;
+        return ;
+      }
+
+      if (suffixPreceded(RV,"e","ci")) {
+        CT = removeSuffix(CT,"e") ;
+        CT = removeSuffix(CT,"i") ;
+        return ;
+      }
+
+      CT = removeSuffix(CT,"e") ; return ;
+    }
+  }
+
+  /**
+   * For log and debug purposes.
+   *
+   * @return the current values of TERM, CT, RV, R1 and R2 in one string
+   */
+  public String log() {
+    return " (TERM = " + TERM + ")" +
+           " (CT = " + CT +")" +
+           " (RV = " + RV +")" +
+           " (R1 = " + R1 +")" +
+           " (R2 = " + R2 +")" ;
+  }
+
+}
+
diff -r 000000000000 -r 408254cf2f1d
software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerAR.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerAR.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer; + +public class DonatusAnalyzerAR extends DonatusAnalyzer { + private static String LANGUAGE = "ar"; + + public DonatusAnalyzerAR() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerAR(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerAR(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerAR(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerDE.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerDE.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer; + +public class DonatusAnalyzerDE extends DonatusAnalyzer { + private static String LANGUAGE = "de"; + + public DonatusAnalyzerDE() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. 
+ */ + public DonatusAnalyzerDE(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerDE(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerDE(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerEL.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerEL.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer; + +public class DonatusAnalyzerEL extends DonatusAnalyzer { + private static String LANGUAGE = "el"; + + public DonatusAnalyzerEL() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerEL(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerEL(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. 
+ */ + public DonatusAnalyzerEL(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerEN.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerEN.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer; + +public class DonatusAnalyzerEN extends DonatusAnalyzer { + private static String LANGUAGE = "en"; + + public DonatusAnalyzerEN() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerEN(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerEN(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. 
+ */ + public DonatusAnalyzerEN(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerFR.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerFR.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer; + +public class DonatusAnalyzerFR extends DonatusAnalyzer { + private static String LANGUAGE = "fr"; + + public DonatusAnalyzerFR() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerFR(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerFR(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. 
+ */ + public DonatusAnalyzerFR(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerIT.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerIT.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer; + +public class DonatusAnalyzerIT extends DonatusAnalyzer { + private static String LANGUAGE = "it"; + + public DonatusAnalyzerIT() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerIT(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerIT(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. 
+ */ + public DonatusAnalyzerIT(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerLA.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerLA.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer; + +public class DonatusAnalyzerLA extends DonatusAnalyzer { + private static String LANGUAGE = "la"; + + public DonatusAnalyzerLA() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerLA(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerLA(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. 
+ */ + public DonatusAnalyzerLA(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerNL.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerNL.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer; + +public class DonatusAnalyzerNL extends DonatusAnalyzer { + private static String LANGUAGE = "nl"; + + public DonatusAnalyzerNL() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerNL(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerNL(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. 
+ */ + public DonatusAnalyzerNL(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerZH.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DonatusAnalyzerZH.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer; + +public class DonatusAnalyzerZH extends DonatusAnalyzer { + private static String LANGUAGE = "zh"; + + public DonatusAnalyzerZH() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerZH(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerZH(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public DonatusAnalyzerZH(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DutchStemmer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/DutchStemmer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,407 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +/** + * + * A stemmer for Dutch words. The algorithm is an implementation of + * the dutch stemming + * algorithm in Martin Porter's snowball project. + * + * @author Edwin de Jonge (ejne at cbs.nl) + */ + +public class DutchStemmer { + /** + * Buffer for the terms while stemming them. + */ + private StringBuffer sb = new StringBuffer(); + private boolean _removedE; + private Map _stemDict; + + private int _R1; + private int _R2; + + //TODO convert to internal + /* + * Stemms the given term to an unique discriminator. + * + * @param term The term that should be stemmed. + * @return Discriminator for term + */ + public String stem(String term) { + term = term.toLowerCase(); + if (!isStemmable(term)) + return term; + if (_stemDict != null && _stemDict.containsKey(term)) + if (_stemDict.get(term) instanceof String) + return (String) _stemDict.get(term); + else + return null; + + // Reset the StringBuffer. + sb.delete(0, sb.length()); + sb.insert(0, term); + // Stemming starts here... 
+ substitute(sb); + storeYandI(sb); + _R1 = getRIndex(sb, 0); + _R1 = Math.max(3, _R1); + step1(sb); + step2(sb); + _R2 = getRIndex(sb, _R1); + step3a(sb); + step3b(sb); + step4(sb); + reStoreYandI(sb); + return sb.toString(); + } + + private boolean enEnding(StringBuffer sb) { + String[] enend = new String[]{"ene", "en"}; + for (int i = 0; i < enend.length; i++) { + String end = enend[i]; + String s = sb.toString(); + int index = s.length() - end.length(); + if (s.endsWith(end) && + index >= _R1 && + isValidEnEnding(sb, index - 1) + ) { + sb.delete(index, index + end.length()); + unDouble(sb, index); + return true; + } + } + return false; + } + + + private void step1(StringBuffer sb) { + if (_R1 >= sb.length()) + return; + + String s = sb.toString(); + int lengthR1 = sb.length() - _R1; + int index; + + if (s.endsWith("heden")) { + sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid")); + return; + } + + if (enEnding(sb)) + return; + + if (s.endsWith("se") && + (index = s.length() - 2) >= _R1 && + isValidSEnding(sb, index - 1) + ) { + sb.delete(index, index + 2); + return; + } + if (s.endsWith("s") && + (index = s.length() - 1) >= _R1 && + isValidSEnding(sb, index - 1)) { + sb.delete(index, index + 1); + } + } + + /** + * Delete suffix e if in R1 and + * preceded by a non-vowel, and then undouble the ending + * + * @param sb String being stemmed + */ + private void step2(StringBuffer sb) { + _removedE = false; + if (_R1 >= sb.length()) + return; + String s = sb.toString(); + int index = s.length() - 1; + if (index >= _R1 && + s.endsWith("e") && + !isVowel(sb.charAt(index - 1))) { + sb.delete(index, index + 1); + unDouble(sb); + _removedE = true; + } + } + + /** + * Delete "heid" + * + * @param sb String being stemmed + */ + private void step3a(StringBuffer sb) { + if (_R2 >= sb.length()) + return; + String s = sb.toString(); + int index = s.length() - 4; + if (s.endsWith("heid") && index >= _R2 && sb.charAt(index - 1) != 'c') 
{ + sb.delete(index, index + 4); //remove heid + enEnding(sb); + } + } + + /** + *

A d-suffix, or derivational suffix, enables a new word, + * often with a different grammatical category, or with a different + * sense, to be built from another word. Whether a d-suffix can be + * attached is discovered not from the rules of grammar, but by + * referring to a dictionary. So in English, ness can be added to + * certain adjectives to form corresponding nouns (littleness, + * kindness, foolishness ...) but not to all adjectives + * (not for example, to big, cruel, wise ...) d-suffixes can be + * used to change meaning, often in rather exotic ways.

+ * Remove "ing", "end", "ig", "lijk", "baar" and "bar" + * + * @param sb String being stemmed + */ + private void step3b(StringBuffer sb) { + if (_R2 >= sb.length()) + return; + String s = sb.toString(); + int index = 0; + + if ((s.endsWith("end") || s.endsWith("ing")) && + (index = s.length() - 3) >= _R2) { + sb.delete(index, index + 3); + if (sb.charAt(index - 2) == 'i' && + sb.charAt(index - 1) == 'g') { + if (sb.charAt(index - 3) != 'e' & index - 2 >= _R2) { + index -= 2; + sb.delete(index, index + 2); + } + } else { + unDouble(sb, index); + } + return; + } + if (s.endsWith("ig") && + (index = s.length() - 2) >= _R2 + ) { + if (sb.charAt(index - 1) != 'e') + sb.delete(index, index + 2); + return; + } + if (s.endsWith("lijk") && + (index = s.length() - 4) >= _R2 + ) { + sb.delete(index, index + 4); + step2(sb); + return; + } + if (s.endsWith("baar") && + (index = s.length() - 4) >= _R2 + ) { + sb.delete(index, index + 4); + return; + } + if (s.endsWith("bar") && + (index = s.length() - 3) >= _R2 + ) { + if (_removedE) + sb.delete(index, index + 3); + return; + } + } + + /** + * undouble vowel + * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod). + * + * @param sb String being stemmed + */ + private void step4(StringBuffer sb) { + if (sb.length() < 4) + return; + String end = sb.substring(sb.length() - 4, sb.length()); + char c = end.charAt(0); + char v1 = end.charAt(1); + char v2 = end.charAt(2); + char d = end.charAt(3); + if (v1 == v2 && + d != 'I' && + v1 != 'i' && + isVowel(v1) && + !isVowel(d) && + !isVowel(c)) { + sb.delete(sb.length() - 2, sb.length() - 1); + } + } + + /** + * Checks if a term could be stemmed. + * + * @return true if, and only if, the given term consists in letters. 
+ */ + private boolean isStemmable(String term) { + for (int c = 0; c < term.length(); c++) { + if (!Character.isLetter(term.charAt(c))) return false; + } + return true; + } + + /** + * Substitute ä, ë, ï, ö, ü, á , é, í, ó, ú + */ + private void substitute(StringBuffer buffer) { + for (int i = 0; i < buffer.length(); i++) { + switch (buffer.charAt(i)) { + case 'ä': + case 'á': + { + buffer.setCharAt(i, 'a'); + break; + } + case 'ë': + case 'é': + { + buffer.setCharAt(i, 'e'); + break; + } + case 'ü': + case 'ú': + { + buffer.setCharAt(i, 'u'); + break; + } + case 'ï': + case 'i': + { + buffer.setCharAt(i, 'i'); + break; + } + case 'ö': + case 'ó': + { + buffer.setCharAt(i, 'o'); + break; + } + } + } + } + + /*private boolean isValidSEnding(StringBuffer sb) { + return isValidSEnding(sb, sb.length() - 1); + }*/ + + private boolean isValidSEnding(StringBuffer sb, int index) { + char c = sb.charAt(index); + if (isVowel(c) || c == 'j') + return false; + return true; + } + + /*private boolean isValidEnEnding(StringBuffer sb) { + return isValidEnEnding(sb, sb.length() - 1); + }*/ + + private boolean isValidEnEnding(StringBuffer sb, int index) { + char c = sb.charAt(index); + if (isVowel(c)) + return false; + if (c < 3) + return false; + // ends with "gem"? 
+ if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index - 1) == 'e') + return false; + return true; + } + + private void unDouble(StringBuffer sb) { + unDouble(sb, sb.length()); + } + + private void unDouble(StringBuffer sb, int endIndex) { + String s = sb.substring(0, endIndex); + if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn") || s.endsWith("mm") || s.endsWith("ff")) { + sb.delete(endIndex - 1, endIndex); + } + } + + private int getRIndex(StringBuffer sb, int start) { + if (start == 0) + start = 1; + int i = start; + for (; i < sb.length(); i++) { + //first non-vowel preceded by a vowel + if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) { + return i + 1; + } + } + return i + 1; + } + + private void storeYandI(StringBuffer sb) { + if (sb.charAt(0) == 'y') + sb.setCharAt(0, 'Y'); + + int last = sb.length() - 1; + + for (int i = 1; i < last; i++) { + switch (sb.charAt(i)) { + case 'i': + { + if (isVowel(sb.charAt(i - 1)) && + isVowel(sb.charAt(i + 1)) + ) + sb.setCharAt(i, 'I'); + break; + } + case 'y': + { + if (isVowel(sb.charAt(i - 1))) + sb.setCharAt(i, 'Y'); + break; + } + } + } + if (last > 0 && sb.charAt(last) == 'y' && isVowel(sb.charAt(last - 1))) + sb.setCharAt(last, 'Y'); + } + + private void reStoreYandI(StringBuffer sb) { + String tmp = sb.toString(); + sb.delete(0, sb.length()); + sb.insert(0, tmp.replaceAll("I", "i").replaceAll("Y", "y")); + } + + private boolean isVowel(char c) { + switch (c) { + case 'e': + case 'a': + case 'o': + case 'i': + case 'u': + case 'y': + case 'è': + { + return true; + } + } + return false; + } + + void setStemDictionary(Map dict) { + _stemDict = dict; + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/FrenchStemmer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/FrenchStemmer.java Wed Nov 24 17:24:23 2010 +0100 @@ 
-0,0 +1,709 @@
+package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A stemmer for French words. The algorithm is based on the work of
+ * Dr Martin Porter on his snowball project;
+ * refer to http://snowball.sourceforge.net/french/stemmer.html
+ * (French stemming algorithm) for details.
+ *
+ * @author Patrick Talbot
+ */
+
+public class FrenchStemmer {
+
+  /**
+   * Buffer for the terms while stemming them.
+   */
+  private StringBuffer sb = new StringBuffer();
+
+  /**
+   * A temporary buffer, used to reconstruct R2 (it is loaded with R1).
+   */
+  private StringBuffer tb = new StringBuffer();
+
+  /**
+   * Region R0 is equal to the whole buffer
+   */
+  private String R0;
+
+  /**
+   * Region RV
+   * "If the word begins with two vowels, RV is the region after the third letter,
+   * otherwise the region after the first vowel not at the beginning of the word,
+   * or the end of the word if these positions cannot be found."
+   */
+  private String RV;
+
+  /**
+   * Region R1
+   * "R1 is the region after the first non-vowel following a vowel
+   * or is the null region at the end of the word if there is no such non-vowel"
+   */
+  private String R1;
+
+  /**
+   * Region R2
+   * "R2 is the region after the first non-vowel in R1 following a vowel
+   * or is the null region at the end of the word if there is no such non-vowel"
+   */
+  private String R2;
+
+
+  /**
+   * Set to true if we need to perform step 2
+   */
+  private boolean suite;
+
+  /**
+   * Set to true if the buffer was modified
+   */
+  private boolean modified;
+
+
+  /**
+   * Stems the given term to a unique discriminator.
+   *
+   * @param term java.lang.String The term that should be stemmed
+   * @return java.lang.String Discriminator for term (the term itself if it is not stemmable)
+   */
+  public String stem( String term ) {
+    if ( !isStemmable( term ) ) {
+      return term;
+    }
+
+    // Use lowercase for medium stemming.
+    term = term.toLowerCase();
+
+    // Reset the StringBuffer.
+    sb.delete( 0, sb.length() );
+    sb.insert( 0, term );
+
+    // reset the booleans
+    modified = false;
+    suite = false;
+
+    sb = treatVowels( sb );
+
+    setStrings();
+
+    step1();
+
+    if (!modified || suite) // step 1 changed nothing, or it asked for step 2
+    {
+      if (RV != null)
+      {
+        suite = step2a(); // from here on 'suite' means "step 2a changed the buffer"
+        if (!suite)
+          step2b();
+      }
+    }
+
+    if (modified || suite) // something was changed so far: step 3, otherwise step 4
+      step3();
+    else
+      step4();
+
+    step5();
+
+    step6();
+
+    return sb.toString();
+  }
+
+  /**
+   * Sets the search region Strings;
+ * it needs to be done each time the buffer was modified + */ + private void setStrings() { + // set the strings + R0 = sb.toString(); + RV = retrieveRV( sb ); + R1 = retrieveR( sb ); + if ( R1 != null ) + { + tb.delete( 0, tb.length() ); + tb.insert( 0, R1 ); + R2 = retrieveR( tb ); + } + else + R2 = null; + } + + /** + * First step of the Porter Algorithmn
+   * (standard suffix removal); the rules below are tried in this order.
+   * Sets 'suite' when one of the amment / emment / ment(s) rules fired.
+   * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
+   */
+  private void step1( ) {
+    String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
+    deleteFrom( R2, suffix );
+
+    replaceFrom( R2, new String[] { "logies", "logie" }, "log" );
+    replaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" );
+    replaceFrom( R2, new String[] { "ences", "ence" }, "ent" );
+
+    String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"};
+    deleteButSuffixFromElseReplace( R2, search, "ic", true, R0, "iqU" );
+
+    deleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" );
+    deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false );
+    deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false );
+    deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false );
+    deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false );
+
+    deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 );
+    deleteFrom( RV, new String[] { "ements", "ement" } );
+
+    deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0, "abl" );
+    deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU" );
+    deleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true );
+
+    String[] autre = { "ifs", "ives", "if", "ive" };
+    deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" );
+    deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" );
+
+    replaceFrom( R0, new String[] { "eaux" }, "eau" );
+
+    replaceFrom( R1, new String[] { "aux" }, "al" );
+
+    deleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" );
+
+    deleteFrom( R2, new String[] { "eux" } );
+
+    // if one of the next steps is performed, we will need to perform step2a
+    boolean temp = false;
+    temp = replaceFrom( RV, new String[] { "amment" }, "ant" );
+    if (temp == true)
+      suite = true;
+    temp = replaceFrom( RV, new String[] { "emment" }, "ent" );
+    if (temp == true)
+      suite = true;
+    temp = deleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV );
+    if (temp == true)
+      suite = true;
+
+  }
+
+  /**
+   * Second step (A) of the Porter Algorithm
+   * Will be performed if nothing changed in the first step
+   * or if changes were made to the amment, emment, ments or ment suffixes
+   * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
+   *
+   * @return boolean - true if something changed in the StringBuffer
+   */
+  private boolean step2a() {
+    String[] search = { "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira",
+              "irent", "iriez", "irez", "irions", "irons", "iront",
+              "issaIent", "issais", "issantes", "issante", "issants", "issant",
+              "issait", "issais", "issions", "issons", "issiez", "issez", "issent", // NOTE(review): "issais" is listed twice
+              "isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" };
+    return deleteFromIfTestVowelBeforeIn( RV, search, false, RV );
+  }
+
+  /**
+   * Second step (B) of the Porter Algorithm
+   * Will be performed if step 2 A did not change anything
+   * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
+   */
+  private void step2b() {
+    String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
+              "erons", "eront","erez", "èrent", "era", "ées", "iez",
+              "ée", "és", "er", "ez", "é" };
+    deleteFrom( RV, suffix );
+
+    String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent",
+              "antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant", // NOTE(review): "aIent" is listed twice
+              "ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" };
+    deleteButSuffixFrom( RV, search, "e", true );
+
+    deleteFrom( R2, new String[] { "ions" } );
+  }
+
+  /**
+   * Third step of the Porter Algorithm
+ * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation + */ + private void step3() { + if (sb.length()>0) + { + char ch = sb.charAt( sb.length()-1 ); + if (ch == 'Y') + { + sb.setCharAt( sb.length()-1, 'i' ); + setStrings(); + } + else if (ch == 'ç') + { + sb.setCharAt( sb.length()-1, 'c' ); + setStrings(); + } + } + } + + /** + * Fourth step of the Porter Algorithmn
+ * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation + */ + private void step4() { + if (sb.length() > 1) + { + char ch = sb.charAt( sb.length()-1 ); + if (ch == 's') + { + char b = sb.charAt( sb.length()-2 ); + if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's') + { + sb.delete( sb.length() - 1, sb.length()); + setStrings(); + } + } + } + boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" ); + if (!found) + found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" ); + + replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" ); + deleteFrom( RV, new String[] { "e" } ); + deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" ); + } + + /** + * Fifth step of the Porter Algorithmn
+ * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation + */ + private void step5() { + if (R0 != null) + { + if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill")) + { + sb.delete( sb.length() - 1, sb.length() ); + setStrings(); + } + } + } + + /** + * Sixth (and last!) step of the Porter Algorithmn
+ * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation + */ + private void step6() { + if (R0!=null && R0.length()>0) + { + boolean seenVowel = false; + boolean seenConson = false; + int pos = -1; + for (int i = R0.length()-1; i > -1; i--) + { + char ch = R0.charAt(i); + if (isVowel(ch)) + { + if (!seenVowel) + { + if (ch == 'é' || ch == 'è') + { + pos = i; + break; + } + } + seenVowel = true; + } + else + { + if (seenVowel) + break; + else + seenConson = true; + } + } + if (pos > -1 && seenConson && !seenVowel) + sb.setCharAt(pos, 'e'); + } + } + + /** + * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string + * + * @param source java.lang.String - the primary source zone for search + * @param search java.lang.String[] - the strings to search for suppression + * @param from java.lang.String - the secondary source zone for search + * @param prefix java.lang.String - the prefix to add to the search string to test + * @return boolean - true if modified + */ + private boolean deleteFromIfPrecededIn( String source, String[] search, String from, String prefix ) { + boolean found = false; + if (source!=null ) + { + for (int i = 0; i < search.length; i++) { + if ( source.endsWith( search[i] )) + { + if (from!=null && from.endsWith( prefix + search[i] )) + { + sb.delete( sb.length() - search[i].length(), sb.length()); + found = true; + setStrings(); + break; + } + } + } + } + return found; + } + + /** + * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel + * + * @param source java.lang.String - the primary source zone for search + * @param search java.lang.String[] - the strings to search for suppression + * @param vowel boolean - true if we need a vowel before the search string + * @param from java.lang.String - the secondary source zone for search (where vowel could be) + * @return boolean - true if modified + */ + private boolean deleteFromIfTestVowelBeforeIn( String 
source, String[] search, boolean vowel, String from ) { + boolean found = false; + if (source!=null && from!=null) + { + for (int i = 0; i < search.length; i++) { + if ( source.endsWith( search[i] )) + { + if ((search[i].length() + 1) <= from.length()) + { + boolean test = isVowel(sb.charAt(sb.length()-(search[i].length()+1))); + if (test == vowel) + { + sb.delete( sb.length() - search[i].length(), sb.length()); + modified = true; + found = true; + setStrings(); + break; + } + } + } + } + } + return found; + } + + /** + * Delete a suffix searched in zone "source" if preceded by the prefix + * + * @param source java.lang.String - the primary source zone for search + * @param search java.lang.String[] - the strings to search for suppression + * @param prefix java.lang.String - the prefix to add to the search string to test + * @param without boolean - true if it will be deleted even without prefix found + */ + private void deleteButSuffixFrom( String source, String[] search, String prefix, boolean without ) { + if (source!=null) + { + for (int i = 0; i < search.length; i++) { + if ( source.endsWith( prefix + search[i] )) + { + sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() ); + modified = true; + setStrings(); + break; + } + else if ( without && source.endsWith( search[i] )) + { + sb.delete( sb.length() - search[i].length(), sb.length() ); + modified = true; + setStrings(); + break; + } + } + } + } + + /** + * Delete a suffix searched in zone "source" if preceded by prefix
+ * or replace it with the replace string if preceded by the prefix in the zone "from"
+ * or delete the suffix if specified + * + * @param source java.lang.String - the primary source zone for search + * @param search java.lang.String[] - the strings to search for suppression + * @param prefix java.lang.String - the prefix to add to the search string to test + * @param without boolean - true if it will be deleted even without prefix found + */ + private void deleteButSuffixFromElseReplace( String source, String[] search, String prefix, boolean without, String from, String replace ) { + if (source!=null) + { + for (int i = 0; i < search.length; i++) { + if ( source.endsWith( prefix + search[i] )) + { + sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() ); + modified = true; + setStrings(); + break; + } + else if ( from!=null && from.endsWith( prefix + search[i] )) + { + sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace ); + modified = true; + setStrings(); + break; + } + else if ( without && source.endsWith( search[i] )) + { + sb.delete( sb.length() - search[i].length(), sb.length() ); + modified = true; + setStrings(); + break; + } + } + } + } + + /** + * Replace a search string with another within the source zone + * + * @param source java.lang.String - the source zone for search + * @param search java.lang.String[] - the strings to search for replacement + * @param replace java.lang.String - the replacement string + */ + private boolean replaceFrom( String source, String[] search, String replace ) { + boolean found = false; + if (source!=null) + { + for (int i = 0; i < search.length; i++) { + if ( source.endsWith( search[i] )) + { + sb.replace( sb.length() - search[i].length(), sb.length(), replace ); + modified = true; + found = true; + setStrings(); + break; + } + } + } + return found; + } + + /** + * Delete a search string within the source zone + * + * @param source the source zone for search + * @param suffix the strings to search for suppression + */ + private void 
deleteFrom(String source, String[] suffix ) { + if (source!=null) + { + for (int i = 0; i < suffix.length; i++) { + if (source.endsWith( suffix[i] )) + { + sb.delete( sb.length() - suffix[i].length(), sb.length()); + modified = true; + setStrings(); + break; + } + } + } + } + + /** + * Test if a char is a french vowel, including accentuated ones + * + * @param ch the char to test + * @return boolean - true if the char is a vowel + */ + private boolean isVowel(char ch) { + switch (ch) + { + case 'a': + case 'e': + case 'i': + case 'o': + case 'u': + case 'y': + case 'â': + case 'à': + case 'ë': + case 'é': + case 'ê': + case 'è': + case 'ï': + case 'î': + case 'ô': + case 'ü': + case 'ù': + case 'û': + return true; + default: + return false; + } + } + + /** + * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string
+ * "R is the region after the first non-vowel following a vowel + * or is the null region at the end of the word if there is no such non-vowel"
+ * @param buffer java.lang.StringBuffer - the in buffer + * @return java.lang.String - the resulting string + */ + private String retrieveR( StringBuffer buffer ) { + int len = buffer.length(); + int pos = -1; + for (int c = 0; c < len; c++) { + if (isVowel( buffer.charAt( c ))) + { + pos = c; + break; + } + } + if (pos > -1) + { + int consonne = -1; + for (int c = pos; c < len; c++) { + if (!isVowel(buffer.charAt( c ))) + { + consonne = c; + break; + } + } + if (consonne > -1 && (consonne+1) < len) + return buffer.substring( consonne+1, len ); + else + return null; + } + else + return null; + } + + /** + * Retrieve the "RV zone" from a buffer an return the corresponding string
+ * "If the word begins with two vowels, RV is the region after the third letter, + * otherwise the region after the first vowel not at the beginning of the word, + * or the end of the word if these positions cannot be found."
+ * @param buffer java.lang.StringBuffer - the in buffer + * @return java.lang.String - the resulting string + */ + private String retrieveRV( StringBuffer buffer ) { + int len = buffer.length(); + if ( buffer.length() > 3) + { + if ( isVowel(buffer.charAt( 0 )) && isVowel(buffer.charAt( 1 ))) { + return buffer.substring(3,len); + } + else + { + int pos = 0; + for (int c = 1; c < len; c++) { + if (isVowel( buffer.charAt( c ))) + { + pos = c; + break; + } + } + if ( pos+1 < len ) + return buffer.substring( pos+1, len ); + else + return null; + } + } + else + return null; + } + + + + /** + * Turns u and i preceded AND followed by a vowel to UpperCase
+ * Turns y preceded OR followed by a vowel to UpperCase
+ * Turns u preceded by q to UpperCase
+ * + * @param buffer java.util.StringBuffer - the buffer to treat + * @return java.util.StringBuffer - the treated buffer + */ + private StringBuffer treatVowels( StringBuffer buffer ) { + for ( int c = 0; c < buffer.length(); c++ ) { + char ch = buffer.charAt( c ); + + if (c == 0) // first char + { + if (buffer.length()>1) + { + if (ch == 'y' && isVowel(buffer.charAt( c + 1 ))) + buffer.setCharAt( c, 'Y' ); + } + } + else if (c == buffer.length()-1) // last char + { + if (ch == 'u' && buffer.charAt( c - 1 ) == 'q') + buffer.setCharAt( c, 'U' ); + if (ch == 'y' && isVowel(buffer.charAt( c - 1 ))) + buffer.setCharAt( c, 'Y' ); + } + else // other cases + { + if (ch == 'u') + { + if (buffer.charAt( c - 1) == 'q') + buffer.setCharAt( c, 'U' ); + else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 ))) + buffer.setCharAt( c, 'U' ); + } + if (ch == 'i') + { + if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 ))) + buffer.setCharAt( c, 'I' ); + } + if (ch == 'y') + { + if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 ))) + buffer.setCharAt( c, 'Y' ); + } + } + } + + return buffer; + } + + /** + * Checks a term if it can be processed correctly. + * + * @return boolean - true if, and only if, the given term consists in letters. + */ + private boolean isStemmable( String term ) { + boolean upper = false; + int first = -1; + for ( int c = 0; c < term.length(); c++ ) { + // Discard terms that contain non-letter characters. + if ( !Character.isLetter( term.charAt( c ) ) ) { + return false; + } + // Discard terms that contain multiple uppercase letters. + if ( Character.isUpperCase( term.charAt( c ) ) ) { + if ( upper ) { + return false; + } + // First encountered uppercase letter, set flag and save + // position. + else { + first = c; + upper = true; + } + } + } + // Discard the term if it contains a single uppercase letter that + // is not starting the term. 
+ if ( first > 0 ) { + return false; + } + return true; + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/GermanStemmer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/GermanStemmer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,267 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang; +// This file is encoded in UTF-8 + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A stemmer for German words. The algorithm is based on the report + * "A Fast and Simple Stemming Algorithm for German Words" by Jörg + * Caumanns (joerg.caumanns at isst.fhg.de). + * + * + * @version $Id: GermanStemmer.java 564236 2007-08-09 15:21:19Z gsingers $ + */ +public class GermanStemmer +{ + /** + * Buffer for the terms while stemming them. + */ + private StringBuffer sb = new StringBuffer(); + + /** + * Amount of characters that are removed with substitute() while stemming. + */ + private int substCount = 0; + + /** + * Stemms the given term to an unique discriminator. + * + * @param term The term that should be stemmed. 
+ * @return Discriminator for term + */ + public String stem( String term ) + { + // Use lowercase for medium stemming. + term = term.toLowerCase(); + if ( !isStemmable( term ) ) + return term; + // Reset the StringBuffer. + sb.delete( 0, sb.length() ); + sb.insert( 0, term ); + // Stemming starts here... + substitute( sb ); + strip( sb ); + optimize( sb ); + resubstitute( sb ); + removeParticleDenotion( sb ); + return sb.toString(); + } + + /** + * Checks if a term could be stemmed. + * + * @return true if, and only if, the given term consists in letters. + */ + private boolean isStemmable( String term ) + { + for ( int c = 0; c < term.length(); c++ ) { + if ( !Character.isLetter( term.charAt( c ) ) ) + return false; + } + return true; + } + + /** + * suffix stripping (stemming) on the current term. The stripping is reduced + * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd", + * from which all regular suffixes are build of. The simplification causes + * some overstemming, and way more irregular stems, but still provides unique. + * discriminators in the most of those cases. + * The algorithm is context free, except of the length restrictions. 
+ */ + private void strip( StringBuffer buffer ) + { + boolean doMore = true; + while ( doMore && buffer.length() > 3 ) { + if ( ( buffer.length() + substCount > 5 ) && + buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) ) + { + buffer.delete( buffer.length() - 2, buffer.length() ); + } + else if ( ( buffer.length() + substCount > 4 ) && + buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) { + buffer.delete( buffer.length() - 2, buffer.length() ); + } + else if ( ( buffer.length() + substCount > 4 ) && + buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) { + buffer.delete( buffer.length() - 2, buffer.length() ); + } + else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) { + buffer.deleteCharAt( buffer.length() - 1 ); + } + else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) { + buffer.deleteCharAt( buffer.length() - 1 ); + } + else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) { + buffer.deleteCharAt( buffer.length() - 1 ); + } + // "t" occurs only as suffix of verbs. + else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) { + buffer.deleteCharAt( buffer.length() - 1 ); + } + else { + doMore = false; + } + } + } + + /** + * Does some optimizations on the term. This optimisations are + * contextual. + */ + private void optimize( StringBuffer buffer ) + { + // Additional step for female plurals of professions and inhabitants. + if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) { + buffer.deleteCharAt( buffer.length() -1 ); + strip( buffer ); + } + // Additional step for irregular plural nouns like "Matrizen -> Matrix". + if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) { + buffer.setCharAt( buffer.length() - 1, 'x' ); + } + } + + /** + * Removes a particle denotion ("ge") from a term. 
+ */ + private void removeParticleDenotion( StringBuffer buffer ) + { + if ( buffer.length() > 4 ) { + for ( int c = 0; c < buffer.length() - 3; c++ ) { + if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) { + buffer.delete( c, c + 2 ); + return; + } + } + } + } + + /** + * Do some substitutions for the term to reduce overstemming: + * + * - Substitute Umlauts with their corresponding vowel: äöü -> aou, + * "ß" is substituted by "ss" + * - Substitute a second char of a pair of equal characters with + * an asterisk: ?? -> ?* + * - Substitute some common character combinations with a token: + * sch/ch/ei/ie/ig/st -> $/§/%/&/#/! + */ + private void substitute( StringBuffer buffer ) + { + substCount = 0; + for ( int c = 0; c < buffer.length(); c++ ) { + // Replace the second char of a pair of the equal characters with an asterisk + if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) { + buffer.setCharAt( c, '*' ); + } + // Substitute Umlauts. + else if ( buffer.charAt( c ) == 'ä' ) { + buffer.setCharAt( c, 'a' ); + } + else if ( buffer.charAt( c ) == 'ö' ) { + buffer.setCharAt( c, 'o' ); + } + else if ( buffer.charAt( c ) == 'ü' ) { + buffer.setCharAt( c, 'u' ); + } + // Fix bug so that 'ß' at the end of a word is replaced. 
+ else if ( buffer.charAt( c ) == 'ß' ) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, 's' ); + substCount++; + } + // Take care that at least one character is left left side from the current one + if ( c < buffer.length() - 1 ) { + // Masking several common character combinations with an token + if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' && + buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' ) + { + buffer.setCharAt( c, '$' ); + buffer.delete( c + 1, c + 3 ); + substCount =+ 2; + } + else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) { + buffer.setCharAt( c, '§' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) { + buffer.setCharAt( c, '%' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) { + buffer.setCharAt( c, '&' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) { + buffer.setCharAt( c, '#' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) { + buffer.setCharAt( c, '!' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + } + } + } + + /** + * Undoes the changes made by substitute(). That are character pairs and + * character combinations. Umlauts will remain as their corresponding vowel, + * as "ß" remains as "ss". 
+ */ + private void resubstitute( StringBuffer buffer ) + { + for ( int c = 0; c < buffer.length(); c++ ) { + if ( buffer.charAt( c ) == '*' ) { + char x = buffer.charAt( c - 1 ); + buffer.setCharAt( c, x ); + } + else if ( buffer.charAt( c ) == '$' ) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 ); + } + else if ( buffer.charAt( c ) == '§' ) { + buffer.setCharAt( c, 'c' ); + buffer.insert( c + 1, 'h' ); + } + else if ( buffer.charAt( c ) == '%' ) { + buffer.setCharAt( c, 'e' ); + buffer.insert( c + 1, 'i' ); + } + else if ( buffer.charAt( c ) == '&' ) { + buffer.setCharAt( c, 'i' ); + buffer.insert( c + 1, 'e' ); + } + else if ( buffer.charAt( c ) == '#' ) { + buffer.setCharAt( c, 'i' ); + buffer.insert( c + 1, 'g' ); + } + else if ( buffer.charAt( c ) == '!' ) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, 't' ); + } + } + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/RussianStemmer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/RussianStemmer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,630 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description). + * + * + * @version $Id: RussianStemmer.java 564236 2007-08-09 15:21:19Z gsingers $ + */ +public class RussianStemmer +{ + private char[] charset; + + // positions of RV, R1 and R2 respectively + private int RV, R1, R2; + + // letters (currently unused letters are commented out) + private final static char A = 0; + //private final static char B = 1; + private final static char V = 2; + private final static char G = 3; + //private final static char D = 4; + private final static char E = 5; + //private final static char ZH = 6; + //private final static char Z = 7; + private final static char I = 8; + private final static char I_ = 9; + //private final static char K = 10; + private final static char L = 11; + private final static char M = 12; + private final static char N = 13; + private final static char O = 14; + //private final static char P = 15; + //private final static char R = 16; + private final static char S = 17; + private final static char T = 18; + private final static char U = 19; + //private final static char F = 20; + private final static char X = 21; + //private final static char TS = 22; + //private final static char CH = 23; + private final static char SH = 24; + private final static char SHCH = 25; + //private final static char HARD = 26; + private final static char Y = 27; + private final static char SOFT = 28; + private final static char AE = 29; + private final static char IU = 30; + private final static char IA = 31; + + // stem definitions + private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA }; + + private static char[][] perfectiveGerundEndings1 = { + { V }, + { V, SH, I }, + { V, SH, I, S, SOFT } + }; + + private static char[][] perfectiveGerund1Predessors = { + { A }, + { IA } + }; + + private static 
char[][] perfectiveGerundEndings2 = { { I, V }, { + Y, V }, { + I, V, SH, I }, { + Y, V, SH, I }, { + I, V, SH, I, S, SOFT }, { + Y, V, SH, I, S, SOFT } + }; + + private static char[][] adjectiveEndings = { + { E, E }, + { I, E }, + { Y, E }, + { O, E }, + { E, I_ }, + { I, I_ }, + { Y, I_ }, + { O, I_ }, + { E, M }, + { I, M }, + { Y, M }, + { O, M }, + { I, X }, + { Y, X }, + { U, IU }, + { IU, IU }, + { A, IA }, + { IA, IA }, + { O, IU }, + { E, IU }, + { I, M, I }, + { Y, M, I }, + { E, G, O }, + { O, G, O }, + { E, M, U }, + {O, M, U } + }; + + private static char[][] participleEndings1 = { + { SHCH }, + { E, M }, + { N, N }, + { V, SH }, + { IU, SHCH } + }; + + private static char[][] participleEndings2 = { + { I, V, SH }, + { Y, V, SH }, + { U, IU, SHCH } + }; + + private static char[][] participle1Predessors = { + { A }, + { IA } + }; + + private static char[][] reflexiveEndings = { + { S, IA }, + { S, SOFT } + }; + + private static char[][] verbEndings1 = { + { I_ }, + { L }, + { N }, + { L, O }, + { N, O }, + { E, T }, + { IU, T }, + { L, A }, + { N, A }, + { L, I }, + { E, M }, + { N, Y }, + { E, T, E }, + { I_, T, E }, + { T, SOFT }, + { E, SH, SOFT }, + { N, N, O } + }; + + private static char[][] verbEndings2 = { + { IU }, + { U, IU }, + { E, N }, + { E, I_ }, + { IA, T }, + { U, I_ }, + { I, L }, + { Y, L }, + { I, M }, + { Y, M }, + { I, T }, + { Y, T }, + { I, L, A }, + { Y, L, A }, + { E, N, A }, + { I, T, E }, + { I, L, I }, + { Y, L, I }, + { I, L, O }, + { Y, L, O }, + { E, N, O }, + { U, E, T }, + { U, IU, T }, + { E, N, Y }, + { I, T, SOFT }, + { Y, T, SOFT }, + { I, SH, SOFT }, + { E, I_, T, E }, + { U, I_, T, E } + }; + + private static char[][] verb1Predessors = { + { A }, + { IA } + }; + + private static char[][] nounEndings = { + { A }, + { U }, + { I_ }, + { O }, + { U }, + { E }, + { Y }, + { I }, + { SOFT }, + { IA }, + { E, V }, + { O, V }, + { I, E }, + { SOFT, E }, + { IA, X }, + { I, IU }, + { E, I }, + { I, I }, + { E, I_ }, + { 
O, I_ }, + { E, M }, + { A, M }, + { O, M }, + { A, X }, + { SOFT, IU }, + { I, IA }, + { SOFT, IA }, + { I, I_ }, + { IA, M }, + { IA, M, I }, + { A, M, I }, + { I, E, I_ }, + { I, IA, M }, + { I, E, M }, + { I, IA, X }, + { I, IA, M, I } + }; + + private static char[][] superlativeEndings = { + { E, I_, SH }, + { E, I_, SH, E } + }; + + private static char[][] derivationalEndings = { + { O, S, T }, + { O, S, T, SOFT } + }; + + /** + * RussianStemmer constructor comment. + */ + public RussianStemmer() + { + super(); + } + + /** + * RussianStemmer constructor comment. + */ + public RussianStemmer(char[] charset) + { + super(); + this.charset = charset; + } + + /** + * Adjectival ending is an adjective ending, + * optionally preceded by participle ending. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean adjectival(StringBuffer stemmingZone) + { + // look for adjective ending in a stemming zone + if (!findAndRemoveEnding(stemmingZone, adjectiveEndings)) + return false; + // if adjective ending was found, try for participle ending. 
+ // variable r is unused, we are just interested in the side effect of + // findAndRemoveEnding(): + boolean r = + findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors) + || + findAndRemoveEnding(stemmingZone, participleEndings2); + return true; + } + + /** + * Derivational endings + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean derivational(StringBuffer stemmingZone) + { + int endingLength = findEnding(stemmingZone, derivationalEndings); + if (endingLength == 0) + // no derivational ending found + return false; + else + { + // Ensure that the ending locates in R2 + if (R2 - RV <= stemmingZone.length() - endingLength) + { + stemmingZone.setLength(stemmingZone.length() - endingLength); + return true; + } + else + { + return false; + } + } + } + + /** + * Finds ending among given ending class and returns the length of ending found(0, if not found). + * Creation date: (17/03/2002 8:18:34 PM) + */ + private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass) + { + boolean match = false; + for (int i = theEndingClass.length - 1; i >= 0; i--) + { + char[] theEnding = theEndingClass[i]; + // check if the ending is bigger than stemming zone + if (startIndex < theEnding.length - 1) + { + match = false; + continue; + } + match = true; + int stemmingIndex = startIndex; + for (int j = theEnding.length - 1; j >= 0; j--) + { + if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]]) + { + match = false; + break; + } + } + // check if ending was found + if (match) + { + return theEndingClass[i].length; // cut ending + } + } + return 0; + } + + private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass) + { + return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass); + } + + /** + * Finds the ending among the given class of endings and removes it from stemming zone. 
+ * Creation date: (17/03/2002 8:18:34 PM) + */ + private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass) + { + int endingLength = findEnding(stemmingZone, theEndingClass); + if (endingLength == 0) + // not found + return false; + else { + stemmingZone.setLength(stemmingZone.length() - endingLength); + // cut the ending found + return true; + } + } + + /** + * Finds the ending among the given class of endings, then checks if this ending was + * preceded by any of given predessors, and if so, removes it from stemming zone. + * Creation date: (17/03/2002 8:18:34 PM) + */ + private boolean findAndRemoveEnding(StringBuffer stemmingZone, + char[][] theEndingClass, char[][] thePredessors) + { + int endingLength = findEnding(stemmingZone, theEndingClass); + if (endingLength == 0) + // not found + return false; + else + { + int predessorLength = + findEnding(stemmingZone, + stemmingZone.length() - endingLength - 1, + thePredessors); + if (predessorLength == 0) + return false; + else { + stemmingZone.setLength(stemmingZone.length() - endingLength); + // cut the ending found + return true; + } + } + + } + + /** + * Marks positions of RV, R1 and R2 in a given word. + * Creation date: (16/03/2002 3:40:11 PM) + */ + private void markPositions(String word) + { + RV = 0; + R1 = 0; + R2 = 0; + int i = 0; + // find RV + while (word.length() > i && !isVowel(word.charAt(i))) + { + i++; + } + if (word.length() - 1 < ++i) + return; // RV zone is empty + RV = i; + // find R1 + while (word.length() > i && isVowel(word.charAt(i))) + { + i++; + } + if (word.length() - 1 < ++i) + return; // R1 zone is empty + R1 = i; + // find R2 + while (word.length() > i && !isVowel(word.charAt(i))) + { + i++; + } + if (word.length() - 1 < ++i) + return; // R2 zone is empty + while (word.length() > i && isVowel(word.charAt(i))) + { + i++; + } + if (word.length() - 1 < ++i) + return; // R2 zone is empty + R2 = i; + } + + /** + * Checks if character is a vowel.. 
+ * Creation date: (16/03/2002 10:47:03 PM) + * @return boolean + * @param letter char + */ + private boolean isVowel(char letter) + { + for (int i = 0; i < vowels.length; i++) + { + if (letter == charset[vowels[i]]) + return true; + } + return false; + } + + /** + * Noun endings. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean noun(StringBuffer stemmingZone) + { + return findAndRemoveEnding(stemmingZone, nounEndings); + } + + /** + * Perfective gerund endings. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean perfectiveGerund(StringBuffer stemmingZone) + { + return findAndRemoveEnding( + stemmingZone, + perfectiveGerundEndings1, + perfectiveGerund1Predessors) + || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2); + } + + /** + * Reflexive endings. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean reflexive(StringBuffer stemmingZone) + { + return findAndRemoveEnding(stemmingZone, reflexiveEndings); + } + + /** + * Insert the method's description here. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean removeI(StringBuffer stemmingZone) + { + if (stemmingZone.length() > 0 + && stemmingZone.charAt(stemmingZone.length() - 1) == charset[I]) + { + stemmingZone.setLength(stemmingZone.length() - 1); + return true; + } + else + { + return false; + } + } + + /** + * Insert the method's description here. 
+ * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean removeSoft(StringBuffer stemmingZone) + { + if (stemmingZone.length() > 0 + && stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT]) + { + stemmingZone.setLength(stemmingZone.length() - 1); + return true; + } + else + { + return false; + } + } + + /** + * Insert the method's description here. + * Creation date: (16/03/2002 10:58:42 PM) + * @param newCharset char[] + */ + public void setCharset(char[] newCharset) + { + charset = newCharset; + } + + /** + * Finds the stem for given Russian word. + * Creation date: (16/03/2002 3:36:48 PM) + * @return java.lang.String + * @param input java.lang.String + */ + public String stem(String input) + { + markPositions(input); + if (RV == 0) + return input; //RV wasn't detected, nothing to stem + StringBuffer stemmingZone = new StringBuffer(input.substring(RV)); + // stemming goes on in RV + // Step 1 + + if (!perfectiveGerund(stemmingZone)) + { + reflexive(stemmingZone); + // variable r is unused, we are just interested in the flow that gets + // created by logical expression: apply adjectival(); if that fails, + // apply verb() etc + boolean r = + adjectival(stemmingZone) + || verb(stemmingZone) + || noun(stemmingZone); + } + // Step 2 + removeI(stemmingZone); + // Step 3 + derivational(stemmingZone); + // Step 4 + superlative(stemmingZone); + undoubleN(stemmingZone); + removeSoft(stemmingZone); + // return result + return input.substring(0, RV) + stemmingZone.toString(); + } + + /** + * Superlative endings. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean superlative(StringBuffer stemmingZone) + { + return findAndRemoveEnding(stemmingZone, superlativeEndings); + } + + /** + * Undoubles N. 
+ * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean undoubleN(StringBuffer stemmingZone) + { + char[][] doubleN = { + { N, N } + }; + if (findEnding(stemmingZone, doubleN) != 0) + { + stemmingZone.setLength(stemmingZone.length() - 1); + return true; + } + else + { + return false; + } + } + + /** + * Verb endings. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean verb(StringBuffer stemmingZone) + { + return findAndRemoveEnding( + stemmingZone, + verbEndings1, + verb1Predessors) + || findAndRemoveEnding(stemmingZone, verbEndings2); + } + + /** + * Static method for stemming with different charsets + */ + public static String stem(String theWord, char[] charset) + { + RussianStemmer stemmer = new RussianStemmer(); + stemmer.setCharset(charset); + return stemmer.stem(theWord); + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/example/ExampleDonatusHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/example/ExampleDonatusHandler.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,154 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.example; + +import java.io.BufferedInputStream; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.util.Date; + +import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusXmlRpcClient; + +/** + * Example Handler for eXist XML documents (singleton). + * Local document files could be stored into eXist collections over XML-RPC. + * The eXistXmlRpcInterface could not be used in a multi threading environment. + * Collections could be configured language specific. You should ask your eXist + * administrator for the name of the document collection and for the language + * collection names which could be used. 
+ * Then you have to set the instance variables: "serverName", "serverPort", + * "userName", "pw", "documentCollectionName", "localDirectoryName" and + * "languages" (see below). That's all. + * + * For example your local directory structure could look like this: + * archimedesForEXist + * documents + * ar + * yourDoc1.xml + * yourDoc2.xml + * ... + * de + * yourDoc1.xml + * yourDoc2.xml + * ... + * el + * yourDoc1.xml + * yourDoc2.xml + * ... + * ... + */ +public class ExampleDonatusHandler { + private static ExampleDonatusHandler instance; + private DonatusXmlRpcClient donatusXmlRpcClient = null; + + private String documentCollectionName = "/db/mpdl-example/archimedes/documents"; + private String localDirectoryName = "/Users/jwillenborg/texts/archimedesForEXist/documents"; + + private String exampleDocumentName = "achil_propo_087_la_1545.xml"; + + private long beginOfOperation; + private long endOfOperation; + + public static ExampleDonatusHandler getInstance() { + if (instance == null) { + instance = new ExampleDonatusHandler(); + instance.init(); + } + return instance; + } + + public static void main(String[] args) { + getInstance(); + instance.beginOperation(); + System.out.println("Start ... "); + String result = instance.analyzeExampleDocumentFile(); // example for analyzing one document + instance.endOperation(); + System.out.println(result); + System.out.println("End of operation. 
Elapsed time: " + (instance.endOfOperation - instance.beginOfOperation) + " ms" ); + } + + private void init() { + donatusXmlRpcClient = new DonatusXmlRpcClient(); // default server is "archimedes.fas.harvard.edu" + } + + private String analyzeExampleDocument() { + String locator = "xxx"; // TODO take uri + String language = "la"; + String s1 = "An recentiores Mathematici Aristotelem in errore deprehenderint proportionum regulas docentem, quibus motus invicem comparantur, disputandum."; + String s2 = "Praesens opus in quatuor secatur partes."; + String s3 = "Primo fundamenta quaedam subiiciam."; + String s4 = "Secundo regulas quasdam asseram."; + String s5 = "Tertio conclusiones aliquas probabo."; + String s6 = "Quarto ad obiecta respondebo."; + String s7 = "Hic deus lumen infundat."; + String sentences = s1 + s2 + s3 + s4 + s5 + s6 + s7; + String doc = "" + sentences + ""; + String morphDocTypeXmlStr = instance.donatusXmlRpcClient.analyze(locator, language, doc); + return morphDocTypeXmlStr; + } + + private String analyzeExampleDocumentFile() { + String morphDocTypeXmlStr = null; + try { + String locator = "xxx"; // TODO take uri + String language = "la"; + String exampleLocalFile = instance.localDirectoryName + "/" + language + "/" + instance.exampleDocumentName; // TODO example document with sentences + StringBuffer docStringArray = new StringBuffer(""); + int chunkSize = 20000 * 1024; // copies data from a file in 20 MB chunks to server file so that not too much RAM is consumed on server + InputStream localFileInputStream = new BufferedInputStream(new FileInputStream(exampleLocalFile)); + byte[] chunk = new byte[chunkSize]; + while ((chunk = readBytes(localFileInputStream, chunkSize)) != null) { + docStringArray.append(new String(chunk)); + } + morphDocTypeXmlStr = instance.donatusXmlRpcClient.analyze(locator, language, docStringArray.toString()); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } + return morphDocTypeXmlStr; + } + + private 
String analyzeExampleSentence() { + String locator = "xxx"; // TODO take uri + String language = "la"; + String s = "Secundoregulasquasdamasseram"; + // String s = "Secundoregulasquasdamasseram"; + String morphDocTypeXmlStr = instance.donatusXmlRpcClient.analyze(locator, language, s); + return morphDocTypeXmlStr; + } + + /** + * Reads a chunk of data of an input stream. + * Does not close the stream until last bytes are read + * @in in the input stream to be read + * @chunkSize chunkSize length of the chunk which is read + * @return byte[] of bytes read + */ + private byte[] readBytes(InputStream in, int chunkSize) { + byte[] resultBytes = new byte[chunkSize]; + try { + int len = in.read(resultBytes, 0, chunkSize); + if (len == -1) { + try { in.close(); } catch (Exception e) { } // close the stream if end of file is reached + resultBytes = null; + } else if (len < chunkSize && len != chunkSize) { // if read chunk is last chunk of the file it delivers this chunk + byte[] tmp = new byte[len]; + System.arraycopy(resultBytes, 0, tmp, 0, len); + resultBytes = tmp; + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + return resultBytes; + } + + private void beginOperation() { + beginOfOperation = new Date().getTime(); + } + + private void endOperation() { + endOfOperation = new Date().getTime(); + } + +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/example/ParseDonatusResult.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/example/ParseDonatusResult.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,63 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.example; + +import org.xml.sax.*; + +public class ParseDonatusResult implements ContentHandler { + public String value = null; + + public void startDocument() throws SAXException { + System.out.println(""); + } 
+ + public void endDocument() throws SAXException { + + } + + + public void characters(char[] c, int start, int length) throws SAXException { + //System.out.print(new String(c, start, length)); + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + //characters(c, start, length); + } + + public void processingInstruction(String target, String data) throws SAXException { + /*System.out.print(" 0) + System.out.print(" " + data); + System.out.println("?>");*/ + } + + public void setDocumentLocator(org.xml.sax.Locator arg1) { + + } + + public void endElement(String uri, String localName, String name) throws SAXException { + // TODO Auto-generated method stub + } + + public void endPrefixMapping(String prefix) throws SAXException { + // TODO Auto-generated method stub + } + + public void skippedEntity(String name) throws SAXException { + // TODO Auto-generated method stub + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + if (name.equals("lemma")) + if (attrs != null) { + int length = attrs.getLength(); + for (int i = 0; (i < length) & (value==null); i++) { + if (attrs.getLocalName(i).equals("form")) + value = attrs.getValue(i); + } + } + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + // TODO Auto-generated method stub + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/example/TestDonatus.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/example/TestDonatus.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,179 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.example; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.StringReader; +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; 
+import java.net.URL; +import java.util.ArrayList; +import java.util.Hashtable; + +import org.apache.xmlrpc.XmlRpcException; +import org.apache.xmlrpc.client.XmlRpcClient; +import org.apache.xmlrpc.client.XmlRpcClientConfigImpl; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import java.util.Vector; + +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusMorphologyDocument; +import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusMorphologyDocumentContentHandler; +import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusWtagContainer; +import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusWtagDocument; +import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusWtagSection; +import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusXmlRpcClient; + +public class TestDonatus { + private DonatusXmlRpcClient donatusXmlRpcClient= null; + + public static void main(String[] args) { + TestDonatus testDonatus = new TestDonatus(); + // testDonatus.lineInputAnalyzer(); + DonatusMorphologyDocument result = testDonatus.analyzeMonte037(); + + } + + public TestDonatus() { + init(); + } + + private void init() { + donatusXmlRpcClient = new DonatusXmlRpcClient(); + } + + private DonatusMorphologyDocument analyze(DonatusWtagDocument doc) { + DonatusMorphologyDocument resultDoc = donatusXmlRpcClient.analyze(doc); + return resultDoc; + } + + private DonatusMorphologyDocument analyzeMonte037() { + String locator = "http://archimedes/037.xml"; + int docId = 4711; + String languageIt = "it"; + ArrayList s1It = new ArrayList(); + s1It.add("LE"); + s1It.add("MECHANICHE"); + s1It.add("DELL"); + s1It.add("ILLVSTRISS"); + s1It.add("SIG"); + s1It.add("GVIDO"); + s1It.add("VBALDO"); + s1It.add("DE"); + s1It.add("MARCHESI"); + s1It.add("DEL"); + s1It.add("MONTE"); + s1It.add("LE"); + ArrayList s2It = new ArrayList(); + s2It.add("TRADOTTE"); + s2It.add("IN"); + s2It.add("VOLGARE"); + 
s2It.add("DAL"); + s2It.add("SIG"); + s2It.add("FILIPPO"); + s2It.add("PIGAFETTA"); + s2It.add("Nellequali"); + s2It.add("ſi"); + s2It.add("contiene"); + s2It.add("la"); + DonatusWtagContainer sIt1 = new DonatusWtagContainer("s", "1", s1It); + DonatusWtagContainer sIt2 = new DonatusWtagContainer("s", "2", s2It); + DonatusWtagSection sectionIt = new DonatusWtagSection(languageIt); + sectionIt.addContainer(sIt1); + sectionIt.addContainer(sIt2); + String languageLa = "la"; + ArrayList s1La = new ArrayList(); + s1La.add("GVIDIVBALDI"); + s1La.add("E"); + s1La.add("MARCHIONIBVS"); + s1La.add("MONTIS"); + s1La.add("MECHANICORVM"); + s1La.add("LIBER"); + DonatusWtagContainer sLa1 = new DonatusWtagContainer("s", "1", s1La); + DonatusWtagSection sectionLa = new DonatusWtagSection(languageLa); + sectionLa.addContainer(sLa1); + DonatusWtagDocument donatusWtagDocument = new DonatusWtagDocument(locator); + donatusWtagDocument.addSection(sectionIt); + donatusWtagDocument.addSection(sectionLa); + DonatusMorphologyDocument donatusXmlMorphologyDoc = analyze(donatusWtagDocument); + return donatusXmlMorphologyDoc; + } + + private String analyze(String locator, String language, String documentString) { + String resultDoc = donatusXmlRpcClient.analyze(locator, language, documentString); + return resultDoc; + } + + private String askDonatus(String term) throws XmlRpcException, IOException { + String WTAG_START="
"; + String WTAG_END="
"; + ParseDonatusResult ch = new ParseDonatusResult(); + try { + URL url = new URL("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc"); + + XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl(); + config.setServerURL(url); + XmlRpcClient xmlClient = new XmlRpcClient(); + xmlClient.setConfig(config); + + String wtag = WTAG_START + term + WTAG_END; + Vector params = new Vector(); + params.add(wtag.getBytes("UTF8")); + Hashtable donatusReturn = (Hashtable) xmlClient.execute("donatus.analyze", params); + Object s = donatusReturn.get("morphData"); + String st = new String((byte[])s); + XMLReader parser = new SAXParser(); + parser.setContentHandler(ch); + Reader reader = new StringReader(st); + InputSource input = new InputSource(reader); + parser.parse(input); + } catch (SAXException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (MalformedURLException e) { + e.printStackTrace(); + } + return ch.value; + } + + private void lineInputAnalyzer() { + BufferedReader in = null; + try { + in = new BufferedReader(new InputStreamReader(System.in, "UTF8")); + } catch (UnsupportedEncodingException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + while(true) { + // prompt the user + System.out.println("Enter query: "); + String line = null; + try { + line = in.readLine(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if (line == null || line.length() == -1) + break; + line = line.trim(); + if (line.length() == 0) + break; + try { + String donatusResponse = askDonatus(line); + System.out.println(donatusResponse); + } catch (XmlRpcException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusAnalysis.java --- /dev/null 
Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusAnalysis.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,22 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc; + +public class DonatusAnalysis { + private String desc; + private String xlinkType; + + public DonatusAnalysis(String desc, String xlinkType) { + this.desc = desc; + this.xlinkType = xlinkType; + } + + public String getDesc() { + return desc; + } + + public String getXmlString() { + String xmlString = ""; + xmlString += ""; + return xmlString; + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusBerkeleyDbEnv.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusBerkeleyDbEnv.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,75 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc; + +import java.io.File; + +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseConfig; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; + +public class DonatusBerkeleyDbEnv { + private File envPath; + private Environment env; + private Database lemmaDb; + private Database variantDb; + + public DonatusBerkeleyDbEnv() { + } + + public void setup(boolean readOnly) throws DatabaseException { + EnvironmentConfig envConfig = new EnvironmentConfig(); + DatabaseConfig dbConfig = new DatabaseConfig(); + // If the environment is read-only, then make the databases read-only too. + envConfig.setReadOnly(readOnly); + dbConfig.setReadOnly(readOnly); + // If the environment is opened for write, then we want to be able to create the environment and databases if they do not exist. 
+ envConfig.setAllowCreate(!readOnly); + dbConfig.setAllowCreate(!readOnly); + // Allow transactions if we are writing to the database + envConfig.setTransactional(!readOnly); + dbConfig.setTransactional(!readOnly); + // Open the environment + envPath = new File(DonatusConstants.BERKELEY_DB_DIR); + env = new Environment(envPath, envConfig); + // open databases (and create them if not existent) + lemmaDb = env.openDatabase(null, "LemmaDB", dbConfig); + variantDb = env.openDatabase(null, "VariantDB", dbConfig); + } + + public void removeDatabases() { + try { + lemmaDb.close(); + variantDb.close(); + env.removeDatabase(null, "LemmaDB"); + env.removeDatabase(null, "VariantDB"); + } catch (DatabaseException e) { + e.printStackTrace(); // TODO + } + } + + public Environment getEnv() { + return env; + } + + public Database getLemmaDB() { + return lemmaDb; + } + + public Database getVariantDB() { + return variantDb; + } + + public void close() { + if (env != null) { + try { + lemmaDb.close(); + variantDb.close(); + env.close(); + } catch(DatabaseException e) { + e.printStackTrace(); // TODO + } + } + } +} + diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusCache.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusCache.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,375 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Date; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import 
com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; +import com.sleepycat.je.Transaction; +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.util.FileUtil; + +public class DonatusCache { + private static DonatusCache instance; + private DonatusBerkeleyDbEnv berkeleyDBEnv = null; + private Date state = null; // last time the cache is written + + // for performance reasons these variables are needed + public static int QUERY_MODE = 0; + public static int DOCUMENT_MODE = 1; + protected int mode = QUERY_MODE; + // for performance reasons the cache contains a donatusMorphologyDocument which + // caches all lemmas for one document (in DOCUMENT_MODE) + private DonatusMorphologyDocument donatusMorphologyDocument = null; + + public static DonatusCache getInstance() throws ApplicationException { + if (instance == null) { + instance = new DonatusCache(); + instance.init(); + } + return instance; + } + + private void init() throws ApplicationException { + try { + berkeleyDBEnv = new DonatusBerkeleyDbEnv(); + berkeleyDBEnv.setup(false); // open databases in read/write mode + state = new Date(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public int getMode() { + return mode; + } + + public void setMode(int newMode) { + this.mode = newMode; + if (newMode == QUERY_MODE) + donatusMorphologyDocument = null; // reset the morphology document + } + + public void close() { + berkeleyDBEnv.close(); + } + + // TODO Aufruf über RPC-API: execute(String path, HashMap parameters); spez. 
MPDL-Funktion zum Administrieren von BerkeleyDB: org.exist.xquery.modules.mpdldb.BerkeleyDBAdmin + public void deleteCache() { + berkeleyDBEnv.removeDatabases(); + state = new Date(); + } + + public void analyze(DonatusAnalyzer analyzer, String docUri, ArrayList sentences) throws ApplicationException { + DonatusHandler donatusHandler = new DonatusHandler(analyzer); + donatusMorphologyDocument = donatusHandler.analyze(docUri, sentences); + } + + public void addVariant(String language, String lemmaForm, String type, String variantForm) throws ApplicationException { + DonatusLemma lemma = getLemmaByVariantForm(language, variantForm); + // if variantForm is already cached in a lemma then do nothing + if (lemma == null) { + // if lemmaForm is already cached as a lemma then do nothing else build the new lemma with the variant + lemma = getLemmaByLemmaForm(language, lemmaForm); + if (lemma == null) { + lemma = new DonatusLemma(donatusMorphologyDocument, language, type, lemmaForm); + donatusMorphologyDocument.putLemma(lemma); + } else { + // nothing + } + } + DonatusVariant v = new DonatusVariant(lemma, type, variantForm); + lemma.addVariant(v); + } + + public void saveLemmas() throws ApplicationException { + try { + String docUri = donatusMorphologyDocument.getDocUri(); + URL url = new URL(docUri); + String path = url.getPath(); + writeLemmas(donatusMorphologyDocument); + Date endOfOperation2 = new Date(); + String donMorphPath = path.replaceFirst(".xml", "-donatus-morph-v" + endOfOperation2.getTime() + ".xml"); + String morphDocFilePathStr = DonatusConstants.BERKELEY_DB_DIR + "/donatusAnalyzedFiles" + donMorphPath; + FileUtil fileUtil = new FileUtil(); + byte[] morphDocBytes = donatusMorphologyDocument.getDocumentBytes(); + fileUtil.saveFile(morphDocBytes, morphDocFilePathStr); + String donWtagPath = path.replaceFirst(".xml", "-donatus-wtag-v" + endOfOperation2.getTime() + ".xml"); + String wtagFilePathStr = DonatusConstants.BERKELEY_DB_DIR + "/donatusAnalyzedFiles" + 
donWtagPath; + byte[] wtagBytes = donatusMorphologyDocument.getWtagBytes(); + fileUtil.saveFile(wtagBytes, wtagFilePathStr); + } catch (MalformedURLException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + state = new Date(); + } + + public DonatusLemma getLemmaByVariantForm(String language, String variantForm) throws ApplicationException { + DonatusLemma lemma = null; + if (mode == QUERY_MODE) { + lemma = readVariantLemma(null, language, variantForm); + } else { + if (donatusMorphologyDocument != null) { + DonatusVariant v = donatusMorphologyDocument.getVariant(variantForm); + if (v != null) { + DonatusLemma l = v.getLemma(); + lemma = donatusMorphologyDocument.getLemma(l.getForm()); + } + } + } + return lemma; + } + + public DonatusLemma getLemmaByLemmaForm(String language, String lemmaForm) throws ApplicationException { + DonatusLemma lemma = null; + if (mode == QUERY_MODE) { + lemma = readLemma(null, language, lemmaForm); + } else { + if (donatusMorphologyDocument != null) { + lemma = donatusMorphologyDocument.getLemma(lemmaForm); + } + } + return lemma; + } + + public ArrayList getQueryVariants(String language, String luceneQueryString) throws ApplicationException { + ArrayList result = new ArrayList(); + ArrayList variantsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); + if (! 
(variantsFromQuery == null || variantsFromQuery.isEmpty())) { + for (int i=0; i lemmaVariants = lemma.getVariants(); + result.addAll(lemmaVariants); + } + } + } + return result; + } + + private void writeLemmas(DonatusMorphologyDocument donatusMorphologyDocument) throws ApplicationException { + Transaction txn = null; // without txn + // Transaction txn = berkeleyDBEnv.getEnv().beginTransaction(null, null); + // delivers all variants of all lemmas - so for example more than one variant with the same form name but in different lemmas + ArrayList variants = donatusMorphologyDocument.getVariants(); + for (int i=0; i newVariantLemmaVariants = newVariantLemma.getVariants(); + for (int j=0; j lemmas = donatusMorphologyDocument.getLemmas(); + for (int i=0; i getVariantsFromLuceneQuery(String queryString) { + ArrayList variants = new ArrayList(); + String[] variantTokens = queryString.split(" "); // TODO throw the phrases away (e.g.: "bla bla bla") + for (int i = 0; i < variantTokens.length; i++) { + String token = variantTokens[i]; + if (! 
(token.contains("*") || token.contains("?") || token.contains("~") || token.contains("-") || token.contains("+") || token.contains("^") || token.contains("OR") || token.contains("AND") || token.contains("NOT"))) { + variants.add(token); + } + } + return variants; + } + + private void writeLemmaByVariantKey(Transaction txn, DonatusVariant variantKey, DonatusLemma lemma) throws ApplicationException { + try { + String variantKeyStr = variantKey.getLemma().getLanguage() + "###" + variantKey.getForm(); + DatabaseEntry dbEntryKey = new DatabaseEntry(variantKeyStr.getBytes("UTF-8")); + String lemmaXmlValue = lemma.getXmlString(); + DatabaseEntry dbEntryValue = new DatabaseEntry(lemmaXmlValue.getBytes("UTF-8")); + Database variantDB = berkeleyDBEnv.getVariantDB(); + variantDB.put(txn, dbEntryKey, dbEntryValue); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private void writeLemmaByLemmaKey(Transaction txn, DonatusLemma lemma) throws ApplicationException { + try { + String lemmaKeyStr = lemma.getLanguage() + "###" + lemma.getForm(); + DatabaseEntry dbEntryKey = new DatabaseEntry(lemmaKeyStr.getBytes("UTF-8")); + String lemmaXmlValue = lemma.getXmlString(); + DatabaseEntry dbEntryValue = new DatabaseEntry(lemmaXmlValue.getBytes("UTF-8")); + Database lemmaDB = berkeleyDBEnv.getLemmaDB(); + lemmaDB.put(txn, dbEntryKey, dbEntryValue); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private DonatusLemma readVariantLemma(Transaction txn, String language, String variantForm) throws ApplicationException { + DonatusLemma lemma = null; + String hashKey = language + "###" + variantForm; + try { + Database variantDB = berkeleyDBEnv.getVariantDB(); + Cursor cursor = variantDB.openCursor(txn, null); + byte[] bHashKey = hashKey.getBytes("UTF-8"); + 
DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundXmlLemmaValue = new DatabaseEntry(); + OperationStatus operationStatus = variantDB.get(null, dbEntryKey, foundXmlLemmaValue, LockMode.DEFAULT); + if (operationStatus == OperationStatus.SUCCESS) { + byte[] foundXmlLemmaValueBytes = foundXmlLemmaValue.getData(); + String foundXmlLemmaStr = new String(foundXmlLemmaValueBytes, "UTF-8"); + lemma = parseXmlLemmaString(language, foundXmlLemmaStr); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return lemma; + } + + private DonatusLemma readLemma(Transaction txn, String language, String lemmaForm) throws ApplicationException { + DonatusLemma lemma = null; + String hashKey = language + "###" + lemmaForm; + try { + Database lemmaDB = berkeleyDBEnv.getLemmaDB(); + Cursor cursor = lemmaDB.openCursor(txn, null); + byte[] bHashKey = hashKey.getBytes("UTF-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundXmlLemmaValue = new DatabaseEntry(); + OperationStatus operationStatus = lemmaDB.get(null, dbEntryKey, foundXmlLemmaValue, LockMode.DEFAULT); + if (operationStatus == OperationStatus.SUCCESS) { + byte[] foundXmlLemmaValueBytes = foundXmlLemmaValue.getData(); + String foundXmlLemmaStr = new String(foundXmlLemmaValueBytes, "UTF-8"); + lemma = parseXmlLemmaString(language, foundXmlLemmaStr); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return lemma; + } + + private DonatusLemma parseXmlLemmaString(String language, String xmlLemmaString) throws ApplicationException { + DonatusLemma lemma = null; + DonatusMorphologyDocument morphologyDoc = parseDonatusMorphDoc(language, xmlLemmaString); + ArrayList lemmas = morphologyDoc.getLemmas(); + if (lemmas.size() > 0) + 
lemma = lemmas.get(0); + return lemma; + } + + private DonatusMorphologyDocument parseDonatusMorphDoc(String language, String xmlString) throws ApplicationException { + DonatusMorphologyDocument morphologyDoc = null; + try { + XMLReader xmlParser = new SAXParser(); + DonatusMorphologyDocumentContentHandler donatusMorphContentHandler = new DonatusMorphologyDocumentContentHandler("tempDummyUri", language); + xmlParser.setContentHandler(donatusMorphContentHandler); + String morphDocDefXml = getDonatusMorphDocDefXml(); + String morphDocMorphStartXml = "\n"; + String morphDocMorphEndXml = ""; + String morphDocXml = morphDocDefXml + morphDocMorphStartXml + xmlString + morphDocMorphEndXml; + Reader reader = new StringReader(morphDocXml); + InputSource input = new InputSource(reader); + xmlParser.parse(input); + morphologyDoc = donatusMorphContentHandler.getResult(); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return morphologyDoc; + } + + private static String getDonatusMorphDocDefXml() { + String defXml = + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "]>\n"; + return defXml; + } +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusCacheOld.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusCacheOld.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,327 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Date; + +import 
org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; +import com.sleepycat.je.Transaction; +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.util.FileUtil; +import de.mpg.mpiwg.berlin.mpdl.util.Util; + +public class DonatusCacheOld { + private static DonatusCacheOld instance; + private DonatusBerkeleyDbEnv berkeleyDBEnv = null; + private Date state = null; // last time the cache is written + + // for performance reasons these variables are needed + public static int QUERY_MODE = 0; + public static int DOCUMENT_MODE = 1; + protected int mode = QUERY_MODE; + // for performance reasons the cache contains a donatusMorphologyDocument which + // caches all lemmas for one document (in DOCUMENT_MODE) + private DonatusMorphologyDocument donatusMorphologyDocument = null; + + public static DonatusCacheOld getInstance() throws ApplicationException { + if (instance == null) { + instance = new DonatusCacheOld(); + instance.init(); + } + return instance; + } + + private void init() throws ApplicationException { + try { + berkeleyDBEnv = new DonatusBerkeleyDbEnv(); + berkeleyDBEnv.setup(false); // open databases in read/write mode + state = new Date(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public int getMode() { + return mode; + } + + public void setMode(int newMode) { + this.mode = newMode; + if (newMode == QUERY_MODE) + donatusMorphologyDocument = null; // reset the morphology document + } + + public void close() { + berkeleyDBEnv.close(); + } + + // TODO Aufruf über RPC-API: execute(String path, 
HashMap parameters); spez. MPDL-Funktion zum Administrieren von BerkeleyDB: org.exist.xquery.modules.mpdldb.BerkeleyDBAdmin + public void deleteCache() { + berkeleyDBEnv.removeDatabases(); + state = new Date(); + } + + public void cacheLemmas(DonatusAnalyzer analyzer, String docUri, ArrayList sentences) throws ApplicationException { + try { + Date beginOfOperation1 = new Date(); + URL url = new URL(docUri); + String path = url.getPath(); + System.out.print("Indexing: " + path + " Donatus-Analyze ... "); + DonatusHandler donatusHandler = new DonatusHandler(analyzer); + donatusMorphologyDocument = donatusHandler.analyze(docUri, sentences); + Date endOfOperation1 = new Date(); + Double elapsedTime1 = new Util().getSecondWithMillisecondsBetween(beginOfOperation1, endOfOperation1); + System.out.print(elapsedTime1 + " sec ... Writing lemmas to BerkeleyDB ... "); + Date beginOfOperation2 = new Date(); + writeLemmas(donatusMorphologyDocument); + Date endOfOperation2 = new Date(); + Double elapsedTime2 = new Util().getSecondWithMillisecondsBetween(beginOfOperation2, endOfOperation2); + System.out.print(elapsedTime2 + " sec ... Stemming ... 
"); + String donMorphPath = path.replaceFirst("\\.xml$", "-donatus-morph-v" + endOfOperation2.getTime() + ".xml"); // first argument is a regex: escape the dot and anchor it so only the trailing extension is rewritten + String morphDocFilePathStr = DonatusConstants.BERKELEY_DB_DIR + "/donatusAnalyzedFiles" + donMorphPath; + FileUtil fileUtil = new FileUtil(); + byte[] morphDocBytes = donatusMorphologyDocument.getDocumentBytes(); + fileUtil.saveFile(morphDocBytes, morphDocFilePathStr); + String donWtagPath = path.replaceFirst("\\.xml$", "-donatus-wtag-v" + endOfOperation2.getTime() + ".xml"); // same versioned-name rule as for the morphology file above + String wtagFilePathStr = DonatusConstants.BERKELEY_DB_DIR + "/donatusAnalyzedFiles" + donWtagPath; + byte[] wtagBytes = donatusMorphologyDocument.getWtagBytes(); + fileUtil.saveFile(wtagBytes, wtagFilePathStr); + } catch (MalformedURLException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + state = new Date(); + } + + public DonatusLemma getLemma(String language, String variantForm) throws ApplicationException { + DonatusLemma lemma = null; + if (mode == QUERY_MODE) { + lemma = readVariantLemma(null, language, variantForm); + } else { + if (donatusMorphologyDocument != null) { + DonatusVariant v = donatusMorphologyDocument.getVariant(variantForm); + if (v != null) { + DonatusLemma l = v.getLemma(); + lemma = donatusMorphologyDocument.getLemma(l.getForm()); + } + } + } + return lemma; + } + + public ArrayList getQueryVariants(String language, String luceneQueryString) throws ApplicationException { + ArrayList result = new ArrayList(); + ArrayList variantsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); + if (! 
(variantsFromQuery == null || variantsFromQuery.isEmpty())) { + for (int i=0; i lemmaVariants = lemma.getVariants(); + result.addAll(lemmaVariants); + } + } + } + return result; + } + + private void writeLemmas(DonatusMorphologyDocument donatusMorphologyDocument) throws ApplicationException { + Transaction txn = null; // without txn + // Transaction txn = berkeleyDBEnv.getEnv().beginTransaction(null, null); + // delivers all variants of all lemmas - so for example more than one variant with the same form name but in different lemmas + ArrayList variants = donatusMorphologyDocument.getVariants(); + for (int i=0; i newVariantLemmaVariants = newVariantLemma.getVariants(); + for (int j=0; j lemmas = donatusMorphologyDocument.getLemmas(); + for (int i=0; i getVariantsFromLuceneQuery(String queryString) { + ArrayList variants = new ArrayList(); + String[] variantTokens = queryString.split(" "); // TODO throw the phrases away (e.g.: "bla bla bla") + for (int i = 0; i < variantTokens.length; i++) { + String token = variantTokens[i]; + if (! 
(token.length() == 0 || token.contains("*") || token.contains("?") || token.contains("~") || token.contains("-") || token.contains("+") || token.contains("^") || token.equals("OR") || token.equals("AND") || token.equals("NOT"))) { // skip empty tokens (repeated spaces) and the boolean operator keywords themselves; contains() also dropped ordinary upper-case words such as FORM or ANDES + variants.add(token); + } + } + return variants; + } + + private void writeLemmaByVariantKey(Transaction txn, DonatusVariant variantKey, DonatusLemma lemma) throws ApplicationException { + try { + String variantKeyStr = variantKey.getLemma().getLanguage() + "###" + variantKey.getForm(); + DatabaseEntry dbEntryKey = new DatabaseEntry(variantKeyStr.getBytes("UTF-8")); + String lemmaXmlValue = lemma.getXmlString(); + DatabaseEntry dbEntryValue = new DatabaseEntry(lemmaXmlValue.getBytes("UTF-8")); + Database variantDB = berkeleyDBEnv.getVariantDB(); + variantDB.put(txn, dbEntryKey, dbEntryValue); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private void writeLemmaByLemmaKey(Transaction txn, DonatusLemma lemma) throws ApplicationException { + try { + String lemmaKeyStr = lemma.getLanguage() + "###" + lemma.getForm(); + DatabaseEntry dbEntryKey = new DatabaseEntry(lemmaKeyStr.getBytes("UTF-8")); + String lemmaXmlValue = lemma.getXmlString(); + DatabaseEntry dbEntryValue = new DatabaseEntry(lemmaXmlValue.getBytes("UTF-8")); + Database lemmaDB = berkeleyDBEnv.getLemmaDB(); + lemmaDB.put(txn, dbEntryKey, dbEntryValue); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private DonatusLemma readVariantLemma(Transaction txn, String language, String variantForm) throws ApplicationException { + DonatusLemma lemma = null; + String hashKey = language + "###" + variantForm; + try { + Database variantDB = berkeleyDBEnv.getVariantDB(); + Cursor cursor = variantDB.openCursor(txn, null); + byte[] bHashKey = hashKey.getBytes("UTF-8"); + 
DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundXmlLemmaValue = new DatabaseEntry(); + OperationStatus operationStatus = variantDB.get(null, dbEntryKey, foundXmlLemmaValue, LockMode.DEFAULT); + if (operationStatus == OperationStatus.SUCCESS) { + byte[] foundXmlLemmaValueBytes = foundXmlLemmaValue.getData(); + String foundXmlLemmaStr = new String(foundXmlLemmaValueBytes, "UTF-8"); + lemma = parseXmlLemmaString(language, foundXmlLemmaStr); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return lemma; + } + + private DonatusLemma parseXmlLemmaString(String language, String xmlLemmaString) throws ApplicationException { + DonatusLemma lemma = null; + DonatusMorphologyDocument morphologyDoc = parseDonatusMorphDoc(language, xmlLemmaString); + ArrayList lemmas = morphologyDoc.getLemmas(); + if (lemmas.size() > 0) + lemma = lemmas.get(0); + return lemma; + } + + private DonatusMorphologyDocument parseDonatusMorphDoc(String language, String xmlString) throws ApplicationException { + DonatusMorphologyDocument morphologyDoc = null; + try { + XMLReader xmlParser = new SAXParser(); + DonatusMorphologyDocumentContentHandler donatusMorphContentHandler = new DonatusMorphologyDocumentContentHandler("tempDummyUri", language); + xmlParser.setContentHandler(donatusMorphContentHandler); + String morphDocDefXml = getDonatusMorphDocDefXml(); + String morphDocMorphStartXml = "\n"; + String morphDocMorphEndXml = ""; + String morphDocXml = morphDocDefXml + morphDocMorphStartXml + xmlString + morphDocMorphEndXml; + Reader reader = new StringReader(morphDocXml); + InputSource input = new InputSource(reader); + xmlParser.parse(input); + morphologyDoc = donatusMorphContentHandler.getResult(); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return 
morphologyDoc; + } + + private static String getDonatusMorphDocDefXml() { + String defXml = + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "]>\n"; + return defXml; + } +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusConstants.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusConstants.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,11 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc; + +public class DonatusConstants { + public static String DEFAULT_LANGUAGE = "en"; + public static String SERVER_NAME = "archimedes.fas.harvard.edu"; + public static int PORT = 80; + public static String URI_RPC_CALL = "/cgi-bin/donatus-rpc"; + public static String BERKELEY_DB_DIR = System.getProperty("exist.home") + "/webapp/WEB-INF/data/berkeleyDB"; // other call would be: ConfigurationHelper.getExistHome() + public static String TYPE_DONATUS = "donatus"; + public static String TYPE_SNOWBALL = "snowball"; +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusContextForm.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusContextForm.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,25 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc; + +import java.util.ArrayList; + +public class DonatusContextForm { + private String language; + private String xlinkHref; + private ArrayList tokens; + private DonatusAnalysis analysis; + + public DonatusContextForm(String language) { + this.language = language; // the argument was previously discarded, so getLanguage() always returned null + } + + public String getLanguage() { + return language; + } + + // TODO rest + public String getXmlString() { + String xmlString = ""; + return xmlString; + } + +} diff -r 000000000000 -r 
408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusHandler.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,31 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc; + +import java.util.ArrayList; + +import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer; + +public class DonatusHandler { + private DonatusXmlRpcClient xmlClient = null; + private DonatusAnalyzer donatusAnalyzer; + + public DonatusHandler(DonatusAnalyzer donatusAnalyzer) { + this.xmlClient = new DonatusXmlRpcClient(); + this.donatusAnalyzer = donatusAnalyzer; + } + + public DonatusMorphologyDocument analyze(String docUri, ArrayList sentences) { + DonatusWtagDocument donatusWtagDoc = new DonatusWtagDocument(docUri); + String language = donatusAnalyzer.getLanguage(); + DonatusWtagSection donatusWtagSection = new DonatusWtagSection(language); + donatusWtagDoc.addSection(donatusWtagSection); + for (int i = 0; i < sentences.size(); i++) { + String sentence = sentences.get(i); + ArrayList token = donatusAnalyzer.getToken(sentence); + DonatusWtagContainer donatusWtagContainer = new DonatusWtagContainer("s", Integer.valueOf(i).toString(), token); // a sentence container + donatusWtagSection.addContainer(donatusWtagContainer); + } + DonatusMorphologyDocument donatusMorpDocument = xmlClient.analyze(donatusWtagDoc); + return donatusMorpDocument; + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusLemma.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusLemma.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,83 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc; + +import java.util.ArrayList; + +public class DonatusLemma { + private DonatusMorphologyDocument morphDocument; + 
private String language; + private String type = DonatusConstants.TYPE_DONATUS; + private String form; + private String definition = ""; + private ArrayList variants; + + public DonatusLemma(DonatusMorphologyDocument morphDoc, String language, String type, String form) { + this.morphDocument = morphDoc; + this.language = language; + this.type = type; + this.form = form; + this.variants = new ArrayList(); + // always the Lemma form is variant itself + DonatusVariant variant = new DonatusVariant(this, type, form); + variants.add(variant); + } + + public String getLanguage() { + return language; + } + + public String getForm() { + return form; + } + + public ArrayList getVariants() { + return variants; + } + + public ArrayList getVariants(String type) { + ArrayList result = new ArrayList(); + for (int i=0; i"; + xmlString += "" + definition + ""; + for (int i=0; i variants; // hold this variable for performance reasons: a key on each lemma variant + private HashMap lemmas; + private ArrayList contextForms; + private byte[] documentBytes; + private byte[] wtagBytes; + + public DonatusMorphologyDocument(String docUri) { + this.docUri = docUri; + this.variants = new HashMap(); + this.lemmas = new HashMap(); + this.contextForms = new ArrayList(); + } + + public ArrayList getLemmas() { + Collection values = lemmas.values(); + ArrayList retArrayList = new ArrayList(); + Iterator iter = values.iterator(); + while(iter.hasNext()) { + DonatusLemma lemma = iter.next(); + retArrayList.add(lemma); + } + return retArrayList; + } + + public String getDocUri() { + return docUri; + } + + public ArrayList getContextForms() { + return contextForms; + } + + public void setWtagBytes(byte[] wtagBytes) { + this.wtagBytes = wtagBytes; + } + + public byte[] getWtagBytes() { + return wtagBytes; + } + + public byte[] getDocumentBytes() throws ApplicationException { + String contentXml = getXmlString(); + try { + documentBytes = contentXml.getBytes("UTF-8"); + } catch 
(UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return documentBytes; + } + + public String getLemmaFormsString() { + String result = ""; + ArrayList lemmas = getLemmas(); + for (int i=0; i getVariants() { + ArrayList result = new ArrayList(); + ArrayList lemmas = getLemmas(); + for (int i=0; i lemmaVariants = lemma.getVariants(); + result.addAll(lemmaVariants); + } + return result; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public DonatusVariant getVariant(String variantForm) { + String hashKey = language + "###" + variantForm; + DonatusVariant variant = variants.get(hashKey); + return variant; + } + + public DonatusVariant putVariant(DonatusVariant variant) { + String variantForm = variant.getForm(); + String hashKey = language + "###" + variantForm; + DonatusVariant putReturn = this.variants.put(hashKey, variant); + return putReturn; + } + + public DonatusLemma getLemma(String lemmaForm) { + String hashKey = language + "###" + lemmaForm; + DonatusLemma lemma = lemmas.get(hashKey); + return lemma; + } + + public void putLemma(DonatusLemma lemma) { + String lemmaForm = lemma.getForm(); + String lang = lemma.getLanguage(); + String hashKey = lang + "###" + lemmaForm; + lemmas.put(hashKey, lemma); + } + + public void addContextForm(DonatusContextForm form) { + contextForms.add(form); + } + + public String getXmlString() { + StringBuffer xmlString = new StringBuffer(""); + ArrayList lemmas = getLemmas(); + for (int i=0; i"); + return xmlString.toString(); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusMorphologyDocumentContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusMorphologyDocumentContentHandler.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,132 @@ +package 
de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc; + +import org.xml.sax.*; + +public class DonatusMorphologyDocumentContentHandler implements ContentHandler { + private String docUri; + private DonatusMorphologyDocument result; + private String language; + private Element currentElement; + private DonatusLemma currentLemma; + private DonatusVariant currentVariant; + + public DonatusMorphologyDocumentContentHandler(String docUri, String language) { + this.docUri = docUri; + this.language = language; + } + + public DonatusMorphologyDocument getResult() { + return result; + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + } + + public void characters(char[] c, int start, int length) throws SAXException { + if (currentElement != null && currentElement.name.equals("definition")) { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + currentLemma.setDefinition(String.valueOf(cCopy)); + } + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(org.xml.sax.Locator arg1) { + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void endElement(String uri, String localName, String name) throws SAXException { + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + currentElement = new Element(name); + if (name.equals("morphology")) { + result = new DonatusMorphologyDocument(docUri); + result.setLanguage(language); + } else if (name.equals("lemma")) { + String language = ""; + String form = ""; + if (attrs != null) { + int length = attrs.getLength(); + for (int i = 0; i < length; i++) { + String attrName = attrs.getLocalName(i); + if (attrName.equals("form")) { + 
form = attrs.getValue(i); + if (form.matches(".*#\\d*")) + form = form.replaceFirst("#\\d*", ""); // remove #number in the lemma form + } else if (attrName.equals("lang")) { + language = attrs.getValue(i); + } + } + } + DonatusLemma morphDocLemma = result.getLemma(form); + if (morphDocLemma == null) { + DonatusLemma newLemma = new DonatusLemma(result, language, DonatusConstants.TYPE_DONATUS, form); + currentLemma = newLemma; + result.putLemma(currentLemma); + } else { + currentLemma = morphDocLemma; // lemma with same form exists already in morphology document e.g. a lemma with a different #number in its form + } + } else if (name.equals("variant")) { + String form = ""; + if (attrs != null) { + int length = attrs.getLength(); + for (int i = 0; i < length; i++) { + String attrName = attrs.getLocalName(i); + if (attrName.equals("form")) + form = attrs.getValue(i); + } + } + DonatusVariant variant = new DonatusVariant(currentLemma, DonatusConstants.TYPE_DONATUS, form); + currentVariant = variant; + if (currentLemma != null) + currentLemma.addVariant(variant); + } else if (name.equals("analysis")) { + String desc = ""; + String xlink = ""; + if (attrs != null) { + int length = attrs.getLength(); + for (int i = 0; i < length; i++) { + String attrName = attrs.getQName(i); + if (attrName.equals("desc")) + desc = attrs.getValue(i); + else if (attrName.equals("xlink:type")) + xlink = attrs.getValue(i); + } + } + DonatusAnalysis analysis = new DonatusAnalysis(desc, xlink); + if(currentVariant != null) + currentVariant.addAnalysis(analysis); + } + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + + private class Element { + String name; + String value; + + Element(String name) { + this.name = name; + } + + Element(String name, String value) { + this.name = name; + this.value = value; + } + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusToken.java --- /dev/null 
Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusToken.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,19 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc; + +public class DonatusToken { + private int count; + private String form; + + public DonatusToken(int count, String form) { + this.count = count; + this.form = form; + } + + public String getForm() { + return form; + } + + public int getCount() { + return count; + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusVariant.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusVariant.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,45 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc; + +import java.util.ArrayList; + +public class DonatusVariant { + private DonatusLemma lemma; // lemma to which this variant belongs + private String form; + private String type = DonatusConstants.TYPE_DONATUS; + private ArrayList analysis; + + public DonatusVariant(DonatusLemma lemma, String type, String form) { + this.lemma = lemma; + this.type = type; + this.form = form; + this.analysis = new ArrayList(); + } + + public String getForm() { + return form; + } + + public DonatusLemma getLemma() { + return lemma; + } + + public String getType() { + return type; + } + + public void addAnalysis(DonatusAnalysis an) { + analysis.add(an); + } + + public String getXmlString() { + String xmlString = ""; + for (int i=0; i words; + + public DonatusWtagContainer(String name, String id, ArrayList words) { + this.name = name; + this.id = id; + this.words = words; + } + + public String getName() { + return name; + } + + public String getId() { + return id; + } + + public ArrayList getWords() { + return words; + } + + public String getXmlString() { + StringBuffer xmlString = new StringBuffer("<" + name + " id=\"" + id + "\">"); + 
for (int i=0; i" + w + ""); + } + xmlString.append(""); + return xmlString.toString(); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusWtagDocument.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusWtagDocument.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,72 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc; + +import java.util.ArrayList; + +public class DonatusWtagDocument { + private String locator; // e.g. http://archimedes/037.xml + private ArrayList donatusWtagSections; + + public DonatusWtagDocument(String locator) { + this.locator = locator; + donatusWtagSections = new ArrayList(); + } + + public DonatusWtagDocument(String locator, ArrayList donatusWtagSections) { + this.locator = locator; + this.donatusWtagSections = donatusWtagSections; + } + + public String getLocator() { + return locator; + } + + public ArrayList getSections() { + return donatusWtagSections; + } + + public String getLanguage() { + if (donatusWtagSections != null && donatusWtagSections.size() > 0) + return donatusWtagSections.get(0).getLanguage(); + else + return DonatusConstants.DEFAULT_LANGUAGE; + } + + public DonatusWtagContainer getContainer(String language, int position) { + DonatusWtagContainer retContainer = null; + DonatusWtagSection s = getSection(language); + if (s != null) { + ArrayList containerArray = s.getContainer(); + if (containerArray != null) + retContainer = containerArray.get(position); + } + return retContainer; + } + + public DonatusWtagSection getSection(String language) { + if (donatusWtagSections == null) + return null; + for (int i=0; i"); + ArrayList sections = getSections(); + for (int i=0; i"); + return xmlString.toString(); + } + + public void addSection(DonatusWtagSection section) { + donatusWtagSections.add(section); + } + +} diff -r 000000000000 -r 408254cf2f1d 
software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusWtagSection.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusWtagSection.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,45 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc; + +import java.util.ArrayList; + +public class DonatusWtagSection { + private String language = "en"; + private ArrayList container; + + public DonatusWtagSection() { + this.container = new ArrayList(); + } + + public DonatusWtagSection(String language) { + this.language = language; + this.container = new ArrayList(); + } + + public void addContainer(DonatusWtagContainer c) { + container.add(c); + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public ArrayList getContainer() { + return container; + } + + public String getXmlString() { + StringBuffer xmlString = new StringBuffer("
"); + for (int i=0; i"); + return xmlString.toString(); + } + + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusXmlRpcClient.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusXmlRpcClient.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,106 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc; + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.HashMap; +import java.util.Vector; + +import org.apache.xmlrpc.XmlRpcException; +import org.apache.xmlrpc.client.XmlRpcClient; +import org.apache.xmlrpc.client.XmlRpcClientConfigImpl; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +public class DonatusXmlRpcClient { + private String serverName = DonatusConstants.SERVER_NAME; + private int port = DonatusConstants.PORT; + private String uriRpcCall = DonatusConstants.URI_RPC_CALL; + private String xmlRpcUri = "http://" + serverName + ":" + port + uriRpcCall; + private XmlRpcClient xmlClient = null; + + public DonatusXmlRpcClient() { + this.xmlRpcUri = "http://" + serverName + uriRpcCall; + init(); + } + + public DonatusXmlRpcClient(String serverName) { + this.serverName = serverName; + this.xmlRpcUri = "http://" + serverName + uriRpcCall; + init(); + } + + public DonatusXmlRpcClient(String serverName, int port) { + this.serverName = serverName; + this.port = port; + this.xmlRpcUri = "http://" + serverName + ":" + port + uriRpcCall; + init(); + } + + private void init() { + try { + XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl(); + config.setServerURL(new URL(xmlRpcUri)); + xmlClient = new XmlRpcClient(); + xmlClient.setConfig(config); + } catch (MalformedURLException e) { + 
e.printStackTrace(); + } + } + + public DonatusMorphologyDocument analyze(DonatusWtagDocument donatusWtagDocument) { + DonatusMorphologyDocument morphologyDoc = null; + try { + String language = donatusWtagDocument.getLanguage(); + String wtagString = donatusWtagDocument.getXmlString(); + Vector params = new Vector(); + byte[] wtagBytes = wtagString.getBytes("UTF-8"); + params.add(wtagBytes); + HashMap donatusReturn = (HashMap) xmlClient.execute("donatus.analyze", params); + Object morphologyDocTypeObject = donatusReturn.get("morphData"); + byte[] bytesMorphologyDocTypeObject = (byte[]) morphologyDocTypeObject; + String morphologyDocTypeString = new String(bytesMorphologyDocTypeObject, "UTF-8"); + XMLReader xmlParser = new SAXParser(); + String docUri = donatusWtagDocument.getLocator(); + DonatusMorphologyDocumentContentHandler donatusMorphContentHandler = new DonatusMorphologyDocumentContentHandler(docUri, language); + xmlParser.setContentHandler(donatusMorphContentHandler); + Reader reader = new StringReader(morphologyDocTypeString); + InputSource input = new InputSource(reader); + xmlParser.parse(input); + morphologyDoc = donatusMorphContentHandler.getResult(); + morphologyDoc.setWtagBytes(wtagBytes); + } catch (SAXException e) { + e.printStackTrace(); + } catch (XmlRpcException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + return morphologyDoc; + } + + public String analyze(String locator, String language, String documentString) { + String morphologyDocTypeString = null; + try { + String wtagStart = "
"; + String wtagEnd = "
"; + String wtagString = wtagStart + documentString + wtagEnd; + Vector params = new Vector(); + params.add(wtagString.getBytes("UTF8")); + HashMap donatusReturn = (HashMap) xmlClient.execute("donatus.analyze", params); + Object morphologyDocTypeObject = donatusReturn.get("morphData"); + morphologyDocTypeString = new String((byte[])morphologyDocTypeObject, "UTF-8"); // decode explicitly as UTF-8 like the other analyze(); the platform default charset would garble non-ASCII forms + } catch (XmlRpcException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + return morphologyDocTypeString; + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/Component.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/Component.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,39 @@ +package de.mpg.mpiwg.berlin.mpdl.escidoc; + +public class Component { + private String validStatus; // e.g. valid + private String visibility; // e.g. public + // private String fileName; // original file name: e.g. blabla.xml + private String contentCategory; // e.g. "pre-print" or "JPEG_DEFAULT" or ... + private String mimeType; // e.g. "text/xml" + private String url; + private String storage; // e.g. 
"internal-managed" or "external-url" or "external-managed" + + public Component(String validStatus, String visibility, String contentCategory, String mimeType, String url, String storage) { + this.validStatus = validStatus; + this.visibility = visibility; + this.contentCategory = contentCategory; + this.mimeType = mimeType; + this.url = url; + this.storage = storage; + } + + public String toXmlString() { + StringBuilder str = new StringBuilder(); + str.append(" \n"); + str.append(" \n"); + str.append(" " + validStatus + "\n"); + str.append(" " + visibility + "\n"); + // if (componentFileName != null) + // str.append(" " + componentFileName + "\n"); + if (contentCategory != null) + str.append(" " + contentCategory + "\n"); + if (mimeType != null) + str.append(" " + mimeType + "\n"); + str.append(" \n"); + str.append(" \n"); + str.append(" \n"); + + return str.toString(); + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/Container.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/Container.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,74 @@ +package de.mpg.mpiwg.berlin.mpdl.escidoc; + +import java.util.Date; + +public class Container { + private String id; + private String contentModelId; + private String contextId; + private String pid; + private MetadataRecord mdRecord; + private Date lastModificationDate; + + public Container(String id, Date lastModificationDate) { + this.id = id; + this.lastModificationDate = lastModificationDate; + } + + public Container(String contentModelId, String contextId, String pid, MetadataRecord mdRecord) { + this.contentModelId = contentModelId; + this.contextId = contextId; + this.pid = pid; + this.mdRecord = mdRecord; + } + + public String getId() { + return id; + } + + public Date getLastModificationDate() { + return lastModificationDate; + } + + public String toXmlString() { + StringBuilder str = new 
StringBuilder(); + str.append("\n"); + str.append("\n"); + + str.append(" \n"); + str.append(" \n"); + str.append(" \n"); + if (pid != null) { + str.append(" " + pid + "\n"); + } + str.append(" \n"); + str.append(" \n"); + str.append(" \n"); + str.append(" \n"); + + str.append(" \n"); + str.append(" \n"); + str.append(" \n"); + str.append(" \n"); + if (mdRecord != null) { + str.append(mdRecord.toXmlString()); + } else { + str.append(" \n"); + str.append(" \n"); + } + str.append(" \n"); + + str.append(" \n"); + + str.append("\n"); + return str.toString(); + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/Context.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/Context.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,40 @@
+package de.mpg.mpiwg.berlin.mpdl.escidoc;
+
+/**
+ * Value holder for an eSciDoc context that can render itself as the text
+ * sent to the eSciDoc server.
+ *
+ * NOTE(review): the string literals below look like the remains of XML
+ * markup that was lost in this copy of the patch - confirm against the
+ * repository before relying on the rendered output.
+ */
+public class Context {
+  // The initial values are always overwritten by the only constructor.
+  private String organizationalUnit = "/oum/organizational-unit/escidoc:ex3";
+  private String name = "MPDL-XML-Test";
+  private String description = "MPDL-XML-Test";
+  private String type = "MpdlType";
+
+  /**
+   * @param organizationalUnit organizational unit reference; may be null
+   * @param name               context name
+   * @param description        context description
+   * @param type               context type
+   */
+  public Context(String organizationalUnit, String name, String description, String type) {
+    this.organizationalUnit = organizationalUnit;
+    this.name = name;
+    this.description = description;
+    this.type = type;
+  }
+
+  /**
+   * Renders this context as text.
+   *
+   * @return the rendered context; the organizational unit section is only
+   *         emitted when an organizational unit is set
+   */
+  public String toXmlString() {
+    StringBuilder xml = new StringBuilder();
+    xml.append("\n").append("\n").append(" \n");
+    xml.append(" ").append(name).append("\n");
+    xml.append(" ").append(description).append("\n");
+    xml.append(" ").append(type).append("\n");
+    boolean hasOrganizationalUnit = organizationalUnit != null;
+    if (hasOrganizationalUnit) {
+      xml.append(" \n").append(" \n").append(" \n");
+    }
+    xml.append(" \n").append("\n");
+    return xml.toString();
+  }
+}
diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/ESciDocIngestor.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/ESciDocIngestor.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,213 @@ +package de.mpg.mpiwg.berlin.mpdl.escidoc; + +import java.util.ArrayList; +import java.util.Date; + +import javax.xml.namespace.NamespaceContext; + +import org.w3c.dom.Node; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; +import de.mpg.mpiwg.berlin.mpdl.schedule.MpdlDocOperation; +import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars; +import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil; +import de.mpg.mpiwg.berlin.mpdl.xml.SchemaHandler; + +public class ESciDocIngestor { + private ESciDocRestSession eSciDocSession; + + public ESciDocIngestor(ESciDocRestSession eSciDocSession) { + this.eSciDocSession = eSciDocSession; + } + + public String execute(MpdlDocOperation docOperation) throws ApplicationException { + String performedContainerId = null; + String operationName = docOperation.getName(); + if (operationName.equals("create")) { + performedContainerId = createDocument(docOperation); + } else if (operationName.equals("update")) { + performedContainerId = updateDocument(docOperation); + } else if (operationName.equals("delete")) { + performedContainerId = deleteDocument(docOperation); + } + String performedESciDocUrl = "http://" + MpdlConstants.MPDL_ESCIDOC_HOST_NAME + ":" + MpdlConstants.MPDL_ESCIDOC_PORT + performedContainerId; + docOperation.setESciDocDestUrl(performedESciDocUrl); + return performedESciDocUrl; + } + + private String createDocument(MpdlDocOperation docOperation) throws ApplicationException { + String pid = eSciDocSession.getPid(); + String docBaseContainerId = MpdlConstants.MPDL_ESCIDOC_ECHO_CONTAINER_ID; + String eXistIdentifier = docOperation.getDestUrl(); // e.g. 
/echo/la/Benedetti_1585.xml + String docBase = docOperation.getDocBase(); + if (docBase != null && docBase.equals("archimedes")) + docBaseContainerId = MpdlConstants.MPDL_ESCIDOC_ARCHIMEDES_CONTAINER_ID; + docOperation.setStatus("create document: " + eXistIdentifier + " on eSciDoc server"); + String destFileName = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifier; + // upload the file to the eSciDoc stage area + String eSciDocStageAreaUrl = eSciDocSession.uploadFileToESciDocStageArea(destFileName); + Node docNode = docOperation.getDocNode(); + MetadataRecord mdRecord = docOperation.getMdRecord(); + if (mdRecord != null) { + mdRecord.setMediaType("fulltext"); + } + // create document container for all items + Container newContainer = eSciDocSession.createContainerInContainer(pid, mdRecord, docBaseContainerId); + String newContainerId = newContainer.getId(); + Date lastModificationDate = newContainer.getLastModificationDate(); + eSciDocSession.submitContainer(newContainerId, lastModificationDate, "create document"); + // create the fulltext item + String existViewerUrl = "http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=" + eXistIdentifier + "&mode=text"; + ArrayList components = new ArrayList(); + String contentCategory = "fulltext XML - ECHO"; + if (docBase != null && docBase.equals("archimedes")) + contentCategory = "fulltext XML - Archimedes"; + Component componentXmlFulltext = new Component("valid", "public", contentCategory, "text/xml", eSciDocStageAreaUrl, "internal-managed"); + Component componentExistViewer = new Component("valid", "public", contentCategory, "text/html", existViewerUrl, "external-url"); + components.add(componentXmlFulltext); + components.add(componentExistViewer); + Item fulltextItem = eSciDocSession.createItemInContainer(newContainerId, pid, mdRecord, components); + String fulltextItemId = fulltextItem.getId(); + Date fulltextItemLastModificationDate = fulltextItem.getLastModificationDate(); + 
eSciDocSession.submitItem(fulltextItemId, fulltextItemLastModificationDate, "create document"); + // page items: for each page create one item + SchemaHandler schemaHandler = new SchemaHandler(); + ArrayList pbFileNamesArrayStr = schemaHandler.getPBFileNames(docNode, docBase); + createPageItemsInContainer(docOperation, mdRecord, pbFileNamesArrayStr, newContainerId); + return newContainerId; + } + + private String updateDocument(MpdlDocOperation docOperation) throws ApplicationException { + String docBase = docOperation.getDocBase(); + String eXistIdentifier = docOperation.getDestUrl(); + String pid = eSciDocSession.getPid(); + String destFileName = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifier; + String documentContainerId = eSciDocSession.getContainerIdByEXistId(eXistIdentifier); + if (documentContainerId == null) + throw new ApplicationException("Document:" + eXistIdentifier + " does not exist."); + docOperation.setStatus("update document: " + eXistIdentifier + " on eSciDoc server"); + // first: upload file to eSciDoc stage area and validate it + String eSciDocStageAreaUrl = eSciDocSession.uploadFileToESciDocStageArea(destFileName); + // RelaxNG schema validation + Node docNode = docOperation.getDocNode(); + MetadataRecord mdRecord = docOperation.getMdRecord(); + if (mdRecord != null) { + mdRecord.setMediaType("fulltext"); + } + // second: delete all members of the container (page image reference items) + String deleteFilter = "" + "image" + ""; // filter to find items of type image + deleteContainerItems(docOperation, documentContainerId, deleteFilter); + // third: update the fulltext item + String fulltextItemFilter = "" + "fulltext" + ""; // filter to find items of type fulltext + String fulltextItemsXmlStr = eSciDocSession.getMembersByContainerIdAndFilter(documentContainerId, fulltextItemFilter); + String fulltextItemId = eSciDocSession.getFirstItemId(fulltextItemsXmlStr); + if (fulltextItemId == null || fulltextItemId.trim().equals("")) 
+ throw new ApplicationException("Update of document is not possible: there is no fulltext item in the document container."); + Date fulltextItemVersionDate = eSciDocSession.getVersionDate(fulltextItemsXmlStr); + ArrayList components = new ArrayList(); + String existViewerUrl = "http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=" + eXistIdentifier + "&mode=text"; + String contentCategory = "fulltext XML - ECHO"; + if (docBase != null && docBase.equals("archimedes")) + contentCategory = "fulltext XML - Archimedes"; + Component componentXmlFulltext = new Component("valid", "public", contentCategory, "text/xml", eSciDocStageAreaUrl, "internal-managed"); + Component componentExistViewer = new Component("valid", "public", contentCategory, "text/html", existViewerUrl, "external-url"); + components.add(componentXmlFulltext); + components.add(componentExistViewer); + eSciDocSession.updateItem(fulltextItemId, fulltextItemVersionDate, pid, mdRecord, components); + // fourth: page items: for each page create one item + SchemaHandler schemaHandler = new SchemaHandler(); + ArrayList pbFileNamesArrayStr = schemaHandler.getPBFileNames(docNode, docBase); + createPageItemsInContainer(docOperation, mdRecord, pbFileNamesArrayStr, documentContainerId); + return documentContainerId; + } + + private String deleteDocument(MpdlDocOperation docOperation) throws ApplicationException { + String eXistIdentifier = docOperation.getDestUrl(); // e.g. /echo/la/bla.xml + String documentContainerId = eSciDocSession.getContainerIdByEXistId(eXistIdentifier); + if (documentContainerId == null) + throw new ApplicationException("eSciDoc: Deletion of eSciDoc container is not possible. 
There is no eSciDoc container for your document: " + eXistIdentifier); + deleteContainer(docOperation, documentContainerId); + return documentContainerId; + } + + public void deleteContainer(MpdlDocOperation docOperation, String containerId) throws ApplicationException { + String eXistIdentifier = docOperation.getDestUrl(); // e.g. /echo/la/bla.xml + docOperation.setStatus("delete document: " + eXistIdentifier + " on eSciDoc server"); + // first: delete all members + deleteContainerItems(docOperation, containerId, null); + // second: delete container itself + eSciDocSession.deleteContainer(containerId); + } + + public void deleteContainerItems(MpdlDocOperation docOperation, String containerId, String filter) throws ApplicationException { + String operationName = docOperation.getName(); + String eXistIdentifier = docOperation.getDestUrl(); + NamespaceContext nsContext = ESciDocRestSession.getNsContext(); + XmlUtil xmlUtil = XmlUtil.getInstance(); + String containerXmlStr = eSciDocSession.getContainer(containerId); + Date lastModificationDate = eSciDocSession.getLastModificationDate(containerXmlStr); + String membersXmlStr = eSciDocSession.getMembersByContainerIdAndFilter(containerId, filter); + ArrayList itemMemberIds = xmlUtil.evaluateToStringArray(membersXmlStr, "//escidocItem:item/@xlink:href", nsContext); + if (itemMemberIds != null) { + eSciDocSession.removeMembers(containerId, lastModificationDate, itemMemberIds); + for (int i=0; i< itemMemberIds.size(); i++) { + String itemId = itemMemberIds.get(i); + int pageNumber = i + 1; + docOperation.setStatus(operationName + " document: " + eXistIdentifier + " on eSciDoc server (delete " + pageNumber + " of " + itemMemberIds.size() + " fulltext and image reference items)"); + eSciDocSession.deleteItem(itemId); + } + } + } + + private void createPageItemsInContainer(MpdlDocOperation docOperation, MetadataRecord mdRecord, ArrayList pbFileNamesArrayStr, String containerId) throws ApplicationException { + if (mdRecord == 
null || pbFileNamesArrayStr == null || containerId == null) + return; + String operationName = docOperation.getName(); + String eXistIdentifier = docOperation.getDestUrl(); + ArrayList memberIds = new ArrayList(); + SchemaHandler schemaHandler = new SchemaHandler(); + String pageImgDir = schemaHandler.getPageImgDir(mdRecord); + String docBase = docOperation.getDocBase(); + for (int i=0; i< 10; i++) { // TODO + // for (int i=0; i< pbFileNamesArrayStr.size(); i++) { + String pid = eSciDocSession.getPid(); + int pageNumber = i + 1; + docOperation.setStatus(operationName + " document: " + eXistIdentifier + " on eSciDoc server (create " + pageNumber + " of " + pbFileNamesArrayStr.size() + " image references)"); + String fileName = pbFileNamesArrayStr.get(i); + fileName = StringUtilEscapeChars.deresolveXmlEntities(fileName); + MetadataRecord mdRecordImage = new MetadataRecord(); + mdRecordImage.setIdentifier(fileName); + mdRecordImage.setTitle("Page: " + pageNumber); + mdRecordImage.setMediaType("image"); + ArrayList components = new ArrayList(); + String imageEchoViewerUrl = "http://echo.mpiwg-berlin.mpg.de/zogilib?fn=" + pageImgDir + "/" + fileName + "&pn=" + pageNumber; + String imageExistViewerUrl = "http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=" + eXistIdentifier + "&mode=image" + "&pn=" + pageNumber; + String fulltextExistViewerUrl = "http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=" + eXistIdentifier + "&mode=text" + "&pn=" + pageNumber; + Component componentImageEchoViewer = new Component("valid", "public", "JPEG_DEFAULT", "text/html", imageEchoViewerUrl, "external-url"); + Component componentImageExistViewer = new Component("valid", "public", "JPEG_DEFAULT", "text/html", imageExistViewerUrl, "external-url"); + String contentCategory = "fulltext XML - ECHO"; + if (docBase != null && docBase.equals("archimedes")) + contentCategory = "fulltext XML - Archimedes"; + Component componentFulltextExistViewer = new 
Component("valid", "public", contentCategory, "text/html", fulltextExistViewerUrl, "external-url"); + components.add(componentImageEchoViewer); + components.add(componentImageExistViewer); + components.add(componentFulltextExistViewer); + Item item = eSciDocSession.createItem(pid, mdRecordImage, components); + String itemId = item.getId(); + Date lastModificationDate = item.getLastModificationDate(); + eSciDocSession.submitItem(itemId, lastModificationDate, "create document"); + String memberId = null; + if (itemId != null) { + int index = itemId.indexOf(":"); + if (index > 0) { + memberId = itemId.substring(index + 1); + memberIds.add(memberId); + } + } + } + String containerXmlStr = eSciDocSession.getContainer(containerId); + Date lastModificationDate = eSciDocSession.getLastModificationDate(containerXmlStr); + eSciDocSession.addMembers(containerId, lastModificationDate, memberIds); + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/ESciDocRESTServlet.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/ESciDocRESTServlet.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,364 @@ +package de.mpg.mpiwg.berlin.mpdl.escidoc; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import javax.servlet.ServletException; +import javax.servlet.ServletInputStream; +import javax.servlet.http.Cookie; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.apache.commons.fileupload.FileItem; +import org.apache.commons.fileupload.disk.DiskFileItemFactory; +import org.apache.commons.fileupload.servlet.ServletFileUpload; +import org.apache.commons.httpclient.HttpClient; +import 
org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.methods.GetMethod; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; +import de.mpg.mpiwg.berlin.mpdl.util.FileUtil; +import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil; +import de.mpg.mpiwg.berlin.mpdl.xmlrpc.MpdlXmlRpcDocHandler; + +public class ESciDocRESTServlet extends HttpServlet { + private static final long serialVersionUID = -4889427839010526185L; + private static int FILE_UPLOAD_MAX_MEMORY_SIZE = 100000; // 100 KB + private static String FILE_UPLOAD_TEMP_DIRECTORY = MpdlConstants.MPDL_EXIST_DATA_DIR + "/tmpUploadDir"; + + protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { + try { + String escidocUrl = req.getParameter("escidocUrl"); + String query = req.getParameter("query"); + String queryPath = req.getParameter("queryPath"); + String parameters = req.getParameter("parameters"); + String startRecord = req.getParameter("startRecord"); + String maximumRecords = req.getParameter("maximumRecords"); + Cookie[] cookies = req.getCookies(); + String eScidDocCookie = getCookieValue(cookies, "escidocCookie"); + int startRecordInt = -1; + if (startRecord != null && ! startRecord.equals("")) + startRecordInt = Integer.parseInt(startRecord); + int maximumRecordsInt = -1; + if (maximumRecords != null && ! maximumRecords.equals("")) + maximumRecordsInt = Integer.parseInt(maximumRecords); + resp.setContentType("text/xml;charset=\"utf-8\""); // important: otherwise the response is sent as iso-8859-1 + PrintWriter out = resp.getWriter(); + // execute xquery script on eXist server + if (escidocUrl != null && escidocUrl.startsWith("/exist:xquery/execute")) { + if (query != null && ! query.equals("")) { + String xqueryResult = xquery("string", query, null, startRecordInt, maximumRecordsInt); + out.print(xqueryResult); + } else if (queryPath != null && ! 
queryPath.equals("")) { + String xqueryResult = xquery("uri", queryPath, parameters, startRecordInt, maximumRecordsInt); + out.print(xqueryResult); + } + } else if (escidocUrl != null && escidocUrl.startsWith("/ir/item/") && escidocUrl.contains("/exist:xquery/execute")) { + String existDocUrl = getExistDocUrl(eScidDocCookie, escidocUrl); + String xQueryParamDocument = "" + existDocUrl + ""; + String params = ""; + if (parameters != null && ! parameters.equals("")) { + int index = parameters.indexOf(""); + params = parameters.substring(0, index) + xQueryParamDocument + parameters.substring(index); + } else { + params = "" + xQueryParamDocument + ""; + } + if (query != null && ! query.equals("")) { + String xqueryResult = xquery("string", query, params, startRecordInt, maximumRecordsInt); + out.print(xqueryResult); + } else if (queryPath != null && ! queryPath.equals("")) { + String xqueryResult = xquery("uri", queryPath, params, startRecordInt, maximumRecordsInt); + out.print(xqueryResult); + } + } else { + out.println(""); + out.println(""); + out.print("EsciDoc does not support your URL: " + escidocUrl); + out.print(""); + } + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { + try { + String escidocUrl = req.getParameter("escidocUrl"); + Cookie[] cookies = req.getCookies(); + String eScidDocCookie = getCookieValue(cookies, "escidocCookie"); + PrintWriter out = resp.getWriter(); + boolean isMultipart = ServletFileUpload.isMultipartContent(req); + if (! isMultipart) { // if not multipartContent (e.g. 
set by client with setRequestBody or setRequestEntity) + ServletInputStream is = req.getInputStream(); + File tmpUploadItemFile = File.createTempFile("item", ".xml", new File(FILE_UPLOAD_TEMP_DIRECTORY)); + String tmpUploadItemFileName = tmpUploadItemFile.getPath(); + FileUtil.getInstance().saveInputStreamToLocalFile(is, tmpUploadItemFileName); + String containerId = getESciDocContainerId(escidocUrl); + String newItemXmlStr = createItemInContainer(eScidDocCookie, containerId, tmpUploadItemFileName); // create item and eXist document + out.println(newItemXmlStr); + } else { // multipart content (each file item is uploaded) + DiskFileItemFactory factory = new DiskFileItemFactory(); + factory.setSizeThreshold(FILE_UPLOAD_MAX_MEMORY_SIZE); // default is 100 KB + File tmpFileUplaodDir = new File(FILE_UPLOAD_TEMP_DIRECTORY); + factory.setRepository(tmpFileUplaodDir); // for files which are bigger than the threshold; files are deleted, when they are garbage collected + ServletFileUpload upload = new ServletFileUpload(factory); + List items = upload.parseRequest(req); + Iterator iter = items.iterator(); + while (iter.hasNext()) { + /* fetch an item from the iterator above */ + FileItem item = iter.next(); + if (!item.isFormField()) { + String fileName = item.getName(); + long sizeInBytes = item.getSize(); + File uploadedFile = new File(FILE_UPLOAD_TEMP_DIRECTORY + "/" + fileName); + item.write(uploadedFile); + out.println("

File: " + uploadedFile.getAbsolutePath() + " (size: " + sizeInBytes + ") written


"); + } + } + } + } catch (Exception e ) { + throw new ServletException(e); + } + } + + protected void doPut(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { + PrintWriter out = resp.getWriter(); + // TODO + out.println(""); + out.println(""); + out.println("do put"); + out.println(""); + out.println(""); + } + + protected void doDelete(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { + try { + String escidocUrl = req.getParameter("escidocUrl"); + Cookie[] cookies = req.getCookies(); + String cookieId = getCookieValue(cookies, "escidocCookie"); + PrintWriter out = resp.getWriter(); + if (escidocUrl != null && escidocUrl.startsWith("/ir/item/escidoc:")) { + String existId = getExistDocUrl(cookieId, escidocUrl); + ESciDocRestSession escidocRestSession = ESciDocRestSession.getInstance(cookieId); + String itemId = escidocUrl; + escidocRestSession.deleteItem(itemId); + // delete eXist document + MpdlXmlRpcDocHandler docHandler = MpdlXmlRpcDocHandler.getInstance(); + docHandler.deleteDocumentFile(existId); + out.println(""); + out.println("item: "); + out.print(itemId); + out.print(" sucessfully deleted in eSciDoc and eXist"); + } else { + out.println(""); + out.println(""); + out.print("EsciDoc does not support your URL: " + escidocUrl); + out.print(""); + } + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + private void doDeleteOld(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { + try { + String escidocUrl = req.getParameter("escidocUrl"); + Cookie[] cookies = req.getCookies(); + String eScidDocCookie = getCookieValue(cookies, "escidocCookie"); + PrintWriter out = resp.getWriter(); + if (escidocUrl != null && escidocUrl.startsWith("/ir/item")) { + String xQueryPath = "/mpdl/doc/rest-doc-operation.xql"; + String existDocUrl = getExistDocUrl(eScidDocCookie, escidocUrl); + String strTmp = existDocUrl.substring(1); + int 
index = strTmp.indexOf("/"); + String docBase = strTmp.substring(0, index); + strTmp = strTmp.substring(index + 1); + index = strTmp.indexOf("/"); + String language = strTmp.substring(0, index); + String fileName = strTmp.substring(index + 1); + String xQueryParamOperation = "delete"; + String xQueryParamDocBase = "" + docBase + ""; + String xQueryParamLanguage = "" + language + ""; + String xQueryParamFileName = "" + fileName + ""; + String xQueryParams = "" + xQueryParamOperation + xQueryParamDocBase + xQueryParamLanguage + xQueryParamFileName + ""; + String xqueryResult = xqueryByPath(xQueryPath, xQueryParams, 1, 1); + out.print(xqueryResult); + } else { + out.println(""); + out.println(""); + out.print("EsciDoc does not support your URL: " + escidocUrl); + out.print(""); + } + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + private String createItemInContainer(String cookieId, String containerId, String localFileName) throws ApplicationException { + String retItemXmlStr = null; + try { + // create item in eSciDoc + byte[] itemXmlBytes = FileUtil.getInstance().readBytes(localFileName); + String itemXmlStr = new String(itemXmlBytes, "utf-8"); + ESciDocRestSession escidocRestSession = ESciDocRestSession.getInstance(cookieId); + Item retItem = escidocRestSession.createItemInContainer(containerId, itemXmlStr); + retItemXmlStr = retItem.toXmlString(); + // create/update eXist document + MpdlXmlRpcDocHandler docHandler = MpdlXmlRpcDocHandler.getInstance(); + String existId = escidocRestSession.getFirstEXistId(itemXmlStr); + String componentContentId = escidocRestSession.getFirstComponentId(retItemXmlStr); + File tmpUploadExistFile = File.createTempFile("exist", ".xml", new File(FILE_UPLOAD_TEMP_DIRECTORY)); + String tmpUploadExistFileName = tmpUploadExistFile.getPath(); + escidocRestSession.saveComponentContentToLocalFile(componentContentId, tmpUploadExistFileName); + docHandler.saveDocumentFile(tmpUploadExistFileName, existId); + 
FileUtil.getInstance().deleteFile(localFileName); + FileUtil.getInstance().deleteFile(tmpUploadExistFileName); + } catch (Exception e) { + throw new ApplicationException(e); + } + return retItemXmlStr; + } + + private String getExistDocUrl(String cookieId, String escidocUrl) throws ApplicationException { + ESciDocRestSession escidocRestSession = ESciDocRestSession.getInstance(cookieId); + String itemId = getESciDocItemId(escidocUrl); + String itemXmlStr = escidocRestSession.getItem(itemId); + String existDocUrl = getEXistDocUrl(itemXmlStr); + return existDocUrl; + } + + private String xquery(String queryType, String xQuery, String xmlParameters, int startRecord, int maximumRecords) throws ApplicationException { + String xQueryMethod = "/mpdl/interface/xquery.xql"; + String xQueryPath = ""; + String xqueryResult = null; + if (queryType.equals("uri")) { + xQueryPath = xQuery; + xqueryResult = xqueryByPath(xQueryPath, xmlParameters, startRecord, maximumRecords); + } else if (queryType.equals("string")) { + xQueryPath = xQueryMethod; + String paramXQuery = "" + xQuery + ""; + String params = "" + paramXQuery + ""; + if (xmlParameters != null) { + int index = xmlParameters.indexOf(""); + params = xmlParameters.substring(0, index) + paramXQuery + xmlParameters.substring(index); + } + xqueryResult = xqueryByPath(xQueryPath, params, startRecord, maximumRecords); + } + return xqueryResult; + } + + private String xqueryByPath(String xQueryPath, String xmlParameters, int startRecord, int maximumRecords) throws ApplicationException { + String requestName = xQueryPath; + String parameters = getUrlStringByXmlParams(xmlParameters); + if (parameters != null && ! parameters.equals("")) { + if (startRecord != -1) + parameters = parameters + "&startRecord=" + startRecord; + if (maximumRecords != -1) + parameters = parameters + "&maximumRecords=" + maximumRecords; + requestName = requestName + "?" 
+ parameters; + } + String xqueryResult = performGetRequest(requestName, null); + return xqueryResult; + } + + private String getUrlStringByXmlParams(String xmlParameters) throws ApplicationException { + String parameters = ""; + if (xmlParameters != null) { + XmlUtil xmlUtil = XmlUtil.getInstance(); + ArrayList paramNames = xmlUtil.evaluateToStringArray(xmlParameters, "//param/@name", null); + ArrayList paramValues = xmlUtil.evaluateToStringArray(xmlParameters, "//param", null); + int size = paramValues.size(); + for (int i=0; i components = new ArrayList(); + components.add(component); + Item xmlTemplate = new Item(contextId, pid, mdRecord, contentModelId, components); + String itemXmlStr = xmlTemplate.toXmlString(); + String uri = "/validation/rest/validateItemXmlBySchema"; + HttpMethodParams parameter = new HttpMethodParams(); + parameter.setParameter("validation-point", ""); // None (Pick the validation schema from the context provided with the item) + parameter.setParameter("validation-schema", ""); // None (Default) + String valAnswer = performPostRequest(uri, itemXmlStr, parameter); + } + + public String getCookieId() { + return cookieId; + } + + public void openContext(String contextId) throws ApplicationException { + String contextXmlStr = getContextById(contextId); + Date lastModificationDate = getLastModificationDate(contextXmlStr); + String lastModificationDateStr = XmlUtil.getInstance().toXsDate(lastModificationDate); + String bodyContentStr = ""; + String uri = "/ir/context/" + contextId + "/open"; + performPostRequestByBody(uri, bodyContentStr); + } + + public String createContext(String organizationalUnit, String name, String description, String type) throws ApplicationException { + Context xmlTemplate = new Context(organizationalUnit, name, description, type); + String bodyContentXmlStr = xmlTemplate.toXmlString(); + String contextXmlStr = performPutRequestByBody("/ir/context", bodyContentXmlStr); + String contextId = 
getFirstContextId(contextXmlStr); + return contextId; + } + + public String getContextById(String contextId) throws ApplicationException { + String bodyContent = "" + contextId + ""; + String requestUrlStr = "/ir/contexts/filter"; + String resultXmlStr = performPostRequestByBody(requestUrlStr, bodyContent); + return resultXmlStr; + } + + public void grant(String userName, String roleName) throws ApplicationException { + String grantXmlStr = null; + String userId = null; + String internalRoleName = null; + if (roleName != null && roleName.equals("admin")) { + internalRoleName = "escidoc:role-system-administrator"; + userId = getUserId(userName); + Grant grant = new Grant(userName, userId, "System-Administrator", "/aa/role/" + internalRoleName); + grantXmlStr = grant.toXmlString(); + } + String grantHref = getGrantHrefByUserNameAndRoleName(userName, internalRoleName); + if (grantHref == null || grantHref.equals("")) + performPutRequestByBody(userId + "/resources/grants/grant", grantXmlStr); + } + + public String getGrantHrefByUserNameAndRoleName(String userName, String roleName) throws ApplicationException { + String resultXmlStr = null; + String fullUserId = getUserId(userName); // // e.g. userId=/aa/user-account/escidoc:22650 + if (fullUserId != null) { + int userIdIndex = fullUserId.lastIndexOf("/"); + if (userIdIndex != -1) { + String userId = fullUserId.substring(userIdIndex + 1); + String filterUserName = "" + userId + ""; // e.g. userId=escidoc:22650 + String filterRoleName = "" + roleName + ""; // e.g. 
roleName=escidoc:role-system-administrator + String bodyContent = "" + filterUserName + filterRoleName + ""; + String requestUrlStr = "/aa/grants/filter"; + resultXmlStr = performPostRequestByBody(requestUrlStr, bodyContent); + resultXmlStr = getFirstGrantId(resultXmlStr); + } + } + return resultXmlStr; + } + + public String getGrantsByUserName(String userName) throws ApplicationException { + String resultXmlStr = null; + String fullUserId = getUserId(userName); // // e.g. userId=/aa/user-account/escidoc:22650 + if (fullUserId != null) { + int userIdIndex = fullUserId.lastIndexOf("/"); + if (userIdIndex != -1) { + String userId = fullUserId.substring(userIdIndex + 1); + String filterUserName = "" + userId + ""; // e.g. userId=escidoc:22650 + String bodyContent = "" + filterUserName + ""; + String requestUrlStr = "/aa/grants/filter"; + resultXmlStr = performPostRequestByBody(requestUrlStr, bodyContent); + } + } + return resultXmlStr; + } + + public String createContainer(String pid, MetadataRecord mdRecord) throws ApplicationException { + Container xmlTemplate = new Container(contentModelId, contextId, pid, mdRecord); + String bodyContentXmlStr = xmlTemplate.toXmlString(); + String containerXmlStr = performPutRequestByBody("/ir/container", bodyContentXmlStr); + String containerId = getFirstContainerId(containerXmlStr); + return containerId; + } + + public Container createContainerInContainer(String pid, MetadataRecord mdRecord, String containerId) throws ApplicationException { + Container xmlTemplate = new Container(contentModelId, contextId, pid, mdRecord); + String bodyContentXmlStr = xmlTemplate.toXmlString(); + String uri = containerId + "/create-container"; + String containerXmlStr = performPostRequestByBody(uri, bodyContentXmlStr); + String retContainerId = getFirstContainerId(containerXmlStr); + Date lastModificationDate = getLastModificationDate(containerXmlStr); + Container container = new Container(retContainerId, lastModificationDate); + return container; 
+ } + + public Item createItemInContainer(String containerId, String pid, MetadataRecord mdRecord, ArrayList components) throws ApplicationException { + Item xmlTemplate = new Item(contextId, pid, mdRecord, contentModelId, components); + String bodyContentXmlStr = xmlTemplate.toXmlString(); + String uri = containerId + "/create-item"; + String itemXmlStr = performPostRequestByBody(uri, bodyContentXmlStr); + String itemId = getFirstItemId(itemXmlStr); + Date lastModificationDate = getLastModificationDate(itemXmlStr); + Item item = new Item(itemId, lastModificationDate); + return item; + } + + public Item createItemInContainer(String containerId, String itemXmlStr) throws ApplicationException { + String uri = containerId + "/create-item"; + String retItemXmlStr = performPostRequestByBody(uri, itemXmlStr); + String itemId = getFirstItemId(retItemXmlStr); + Date lastModificationDate = getLastModificationDate(retItemXmlStr); + String validStatus = ""; // TODO + String visibility = ""; // TODO + String contentCategory = ""; // TODO + String mimeType = ""; // TODO + String url = getFirstComponentId(retItemXmlStr); + String storage = ""; // TODO + Component component = new Component(validStatus, visibility, contentCategory, mimeType, url, storage); + Item item = new Item(itemId, lastModificationDate); + item.addComponent(component); + return item; + } + + public void submitContainer(String containerId, Date lastModificationDate, String comment) throws ApplicationException { + String uri = containerId + "/submit"; + String dateStr = XmlUtil.getInstance().toXsDate(lastModificationDate); + String xmlStr = "" + comment + ""; + performPostRequestByBody(uri, xmlStr); + } + + public void submitItem(String itemId, Date lastModificationDate, String comment) throws ApplicationException { + String uri = itemId + "/submit"; + String dateStr = XmlUtil.getInstance().toXsDate(lastModificationDate); + String xmlStr = "" + comment + ""; + performPostRequestByBody(uri, xmlStr); + } + + 
public Date addMembers(String containerId, Date lastModificationDate, ArrayList memberIds) throws ApplicationException { + if (containerId == null || lastModificationDate == null || memberIds == null) + return null; + String dateStr = XmlUtil.getInstance().toXsDate(lastModificationDate); + String membersXmlStr = ""; + for (int i=0; i< memberIds.size(); i++) { + String memberId = memberIds.get(i); + membersXmlStr = membersXmlStr + "" + "escidoc:" + memberId +""; + } + membersXmlStr += ""; + String lastModDateXmlStr = performPostRequestByBody(containerId + "/members/add", membersXmlStr); + Date lastModDate = getLastModificationDate(lastModDateXmlStr); + return lastModDate; + } + + public Date removeMembers(String containerId, Date lastModificationDate, ArrayList memberIds) throws ApplicationException { + if (containerId == null || lastModificationDate == null || memberIds == null) + return null; + String dateStr = XmlUtil.getInstance().toXsDate(lastModificationDate); + String membersXmlStr = ""; + for (int i=0; i< memberIds.size(); i++) { + String memberId = memberIds.get(i); + // if memberId is a full id and contains non digits they will be removed: e.g. /ir/item/escidoc:4711 will be replaced by 4711 + if (! 
memberId.matches("[0-9]+")) { + memberId = memberId.replaceAll("[^0-9]+", ""); + } + membersXmlStr = membersXmlStr + "" + "escidoc:" + memberId +""; + } + membersXmlStr += ""; + String lastModDateXmlStr = performPostRequestByBody(containerId + "/members/remove", membersXmlStr); + Date lastModDate = getLastModificationDate(lastModDateXmlStr); + return lastModDate; + } + + public Item createItem(String pid, MetadataRecord mdRecord, ArrayList components) throws ApplicationException { + Item xmlTemplate = new Item(contextId, pid, mdRecord, contentModelId, components); + String xmlStr = xmlTemplate.toXmlString(); + String itemXmlStr = performPutRequestByBody("/ir/item", xmlStr); + String itemId = getFirstItemId(itemXmlStr); + Date lastModificationDate = getLastModificationDate(itemXmlStr); + Item item = new Item(itemId, lastModificationDate); + return item; + } + + public Date updateItem(String itemId, Date lastModificationDate, String pid, MetadataRecord mdRecord, ArrayList components) throws ApplicationException { + if (itemId == null || lastModificationDate == null) + return null; + Item xmlTemplate = new Item(contextId, pid, mdRecord, contentModelId, components); + xmlTemplate.setLastModificationDate(lastModificationDate); + String xmlStr = xmlTemplate.toXmlString(); + String itemXmlStr = performPutRequestByBody(itemId, xmlStr); + Date newVersionDate = getVersionDate(itemXmlStr); + return newVersionDate; + } + + public void deleteItem(String itemId) { + if (itemId != null) { + performDeleteRequest(itemId); + } + } + + public void deleteContainer(String containerId) { + if (containerId != null) { + performDeleteRequest(containerId); + } + } + + public Date getContainerLastModificationDate(String containerId) throws ApplicationException { + Date lastModificationDate = null; + String resultXmlStr = getContainer(containerId); + if (resultXmlStr != null) { + lastModificationDate = getLastModificationDate(resultXmlStr); + } + return lastModificationDate; + } + + public 
String getContainer(String containerId) throws ApplicationException { + String resultXmlStr = null; + if (containerId != null) { + resultXmlStr = performGetRequest(containerId); + } + return resultXmlStr; + } + + public String getItem(String itemId) throws ApplicationException { + String resultXmlStr = null; + if (itemId != null) { + resultXmlStr = performGetRequest(itemId); + } + return resultXmlStr; + } + + public void saveComponentContentToLocalFile(String componentContentId, String localFileName) throws ApplicationException { + if (componentContentId != null) { + performGetRequestToLocalFile(componentContentId, localFileName); + } + } + + public String getUserId(String userName) throws ApplicationException { + String userId = null; + if (userName != null) { + String userNameAccessStr = userName + ",uid=" + userName + ",ou=users,dc=wisges,dc=rz-berlin,dc=mpg,dc=de"; + String resultXmlStr = performGetRequest("/aa/user-account/" + userNameAccessStr); + userId = getFirstUserId(resultXmlStr); + } + return userId; + } + + public String getUser(String userName) throws ApplicationException { + String resultXmlStr = null; + if (userName != null) { + String userNameAccessStr = userName + ",uid=" + userName + ",ou=users,dc=wisges,dc=rz-berlin,dc=mpg,dc=de"; + resultXmlStr = performGetRequest("/aa/user-account/" + userNameAccessStr); + } + return resultXmlStr; + } + + public String getAllUsers() throws ApplicationException { + String bodyContent = ""; + String requestUrlStr = "/aa/user-accounts/filter"; + String resultXmlStr = performPostRequestByBody(requestUrlStr, bodyContent); + return resultXmlStr; + } + + public String getMembersByContainerIdAndFilter(String containerId, String filter) throws ApplicationException { + String bodyContent = ""; + if (filter != null) + bodyContent = "" + filter + ""; + String requestUrlStr = containerId + "/members/filter"; + String resultXmlStr = performPostRequestByBody(requestUrlStr, bodyContent); + return resultXmlStr; + } + + public 
String getAllItems() throws ApplicationException { + String bodyContent = ""; + String requestUrlStr = "/ir/items/filter"; + String resultXmlStr = performPostRequestByBody(requestUrlStr, bodyContent); + return resultXmlStr; + } + + private String getContainerByEXistId(String existId) throws ApplicationException { + String bodyContent = "" + existId + ""; // e.g. existId = /echo/la/alvarus_1509_lat_V40_10.xml + String requestUrlStr = "/ir/containers/filter"; + String resultXmlStr = performPostRequestByBody(requestUrlStr, bodyContent); + return resultXmlStr; + } + + public String getContainerIdByEXistId(String existId) throws ApplicationException { + String containerXmlStr = getContainerByEXistId(existId); + String eScidDocContainerId = null; + if (containerXmlStr != null && containerXmlStr != "") { + eScidDocContainerId = getFirstContainerId(containerXmlStr); + } + return eScidDocContainerId; + } + + public String getItemByPid(String pid) throws ApplicationException { + String bodyContent = "" + pid + ""; + String requestUrlStr = "/ir/items/filter"; + String resultXmlStr = performPostRequestByBody(requestUrlStr, bodyContent); + return resultXmlStr; + } + + public String uploadFileToESciDocStageArea(String filePath) throws ApplicationException { + StringBuffer result = new StringBuffer(); + try { + URL createUrl = new URL(protocol + "://" + host + ":" + port + STAGE_PATH); + HttpURLConnection uploadConnection = (HttpURLConnection) createUrl.openConnection(); + uploadConnection.setRequestProperty("Cookie", "escidocCookie=" + cookieId); + uploadConnection.setRequestMethod("PUT"); + uploadConnection.setDoOutput(true); + // open POST Request + OutputStream out = uploadConnection.getOutputStream(); + // access binary content + InputStream in = new FileInputStream(filePath); + // write template to POST Request + byte[] bytes = new byte[4096]; + int l = in.read(bytes); + while (l > -1) { + out.write(bytes, 0, l); + l = in.read(bytes); + } + in.close(); + out.close(); + 
uploadConnection.connect(); + // connect response reader + BufferedReader createdReader = null; + String contentEncoding = uploadConnection.getContentEncoding(); + if (contentEncoding == null) { + contentEncoding = CONTENT_ENCODING; + } + createdReader = new BufferedReader(new InputStreamReader(uploadConnection.getInputStream(), contentEncoding)); + // read response + String line = createdReader.readLine(); + while (line != null) { + result.append(line); + result.append(LINE_SEPARATOR); + line = createdReader.readLine(); + } + createdReader.close(); + } catch (IOException e) { + throw new ApplicationException(e); + } + String stageUrl = obtainResourceHref(result.toString()); + return stageUrl; + } + + + public String getPid() throws ApplicationException { + return "mpiwg:47114711"; // TODO + /* + try { + XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl(); + XmlRpcClient client = new XmlRpcClient(); + String zopeUrlStr = "http://xserve07.mpiwg-berlin.mpg.de:18080"; + config.setServerURL(new URL(zopeUrlStr + "/idGenerator")); + client.setConfig(config); + Object[] params = new Object[]{}; + String pid = (String) client.execute("generateId", params); + return pid; + } catch (MalformedURLException e) { + throw new ApplicationException(e); + } catch (XmlRpcException e) { + throw new ApplicationException(e); + } + */ + } + + public ArrayList getContainerIds(String xmlStr) throws ApplicationException { + NamespaceContext nsContext = getNsContext(); + XmlUtil xmlUtil = XmlUtil.getInstance(); + ArrayList containerIds = xmlUtil.evaluateToStringArray(xmlStr, "//srel:container/@xlink:href", nsContext); + return containerIds; + } + + public ArrayList getContainerTitles(String xmlStr) throws ApplicationException { + NamespaceContext nsContext = getNsContext(); + XmlUtil xmlUtil = XmlUtil.getInstance(); + ArrayList containerTitles = xmlUtil.evaluateToStringArray(xmlStr, "//srel:container/@xlink:title", nsContext); + return containerTitles; + } + + public String 
getLatestVersionId(String xmlStr) throws ApplicationException { + NamespaceContext nsContext = getNsContext(); + XmlUtil xmlUtil = XmlUtil.getInstance(); + String id = xmlUtil.evaluateToString(xmlStr, "//prop:latest-version/@xlink:href", nsContext); + return id; + } + + public String getFirstUserId(String xmlStr) throws ApplicationException { + NamespaceContext nsContext = getNsContext(); + XmlUtil xmlUtil = XmlUtil.getInstance(); + String id = xmlUtil.evaluateToString(xmlStr, "//user-account:user-account/@xlink:href", nsContext); + return id; + } + + public String getFirstGrantId(String xmlStr) throws ApplicationException { + NamespaceContext nsContext = getNsContext(); + XmlUtil xmlUtil = XmlUtil.getInstance(); + String g = xmlUtil.evaluateToString(xmlStr, "//grants:grant/@xlink:href", nsContext); + return g; + } + + public String getFirstContextId(String xmlStr) throws ApplicationException { + NamespaceContext nsContext = getNsContext(); + XmlUtil xmlUtil = XmlUtil.getInstance(); + String id = xmlUtil.evaluateToString(xmlStr, "//context:context/@xlink:href", nsContext); + return id; + } + + public String getFirstContainerId(String xmlStr) throws ApplicationException { + NamespaceContext nsContext = getNsContext(); + XmlUtil xmlUtil = XmlUtil.getInstance(); + String id = xmlUtil.evaluateToString(xmlStr, "//container:container/@xlink:href", nsContext); + return id; + } + + public String getFirstItemId(String xmlStr) throws ApplicationException { + NamespaceContext nsContext = getNsContext(); + XmlUtil xmlUtil = XmlUtil.getInstance(); + String id = xmlUtil.evaluateToString(xmlStr, "//escidocItem:item/@xlink:href", nsContext); + return id; + } + + public String getFirstStageAreaURL(String xmlStr) throws ApplicationException { + NamespaceContext nsContext = getNsContext(); + XmlUtil xmlUtil = XmlUtil.getInstance(); + String eSciDocStageAreaUrl = xmlUtil.evaluateToString(xmlStr, "//escidocComponents:content/@xlink:href", nsContext); + return eSciDocStageAreaUrl; + } + 
+ public String getFirstComponentId(String xmlStr) throws ApplicationException { + NamespaceContext nsContext = getNsContext(); + XmlUtil xmlUtil = XmlUtil.getInstance(); + String componentId = xmlUtil.evaluateToString(xmlStr, "//escidocComponents:content/@xlink:href", nsContext); + return componentId; + } + + public String getFirstEXistId(String xmlStr) throws ApplicationException { + NamespaceContext nsContext = getNsContext(); + XmlUtil xmlUtil = XmlUtil.getInstance(); + String id = xmlUtil.evaluateToString(xmlStr, "/escidocItem:item/escidocMetadataRecords:md-records/escidocMetadataRecords:md-record/metadata/mpiwg:exist-identifier", nsContext); + return id; + } + + public MetadataRecord getFirstMdRecord(String xmlStr) throws ApplicationException { + NamespaceContext nsContext = getNsContext(); + XmlUtil xmlUtil = XmlUtil.getInstance(); + String id = xmlUtil.evaluateToString(xmlStr, "/escidocItem:item/escidocMetadataRecords:md-records/escidocMetadataRecords:md-record/metadata/dc:identifier", nsContext); + String language = xmlUtil.evaluateToString(xmlStr, "/escidocItem:item/escidocMetadataRecords:md-records/escidocMetadataRecords:md-record/metadata/dc:language", nsContext); + String creator = xmlUtil.evaluateToString(xmlStr, "/escidocItem:item/escidocMetadataRecords:md-records/escidocMetadataRecords:md-record/metadata/dc:creator", nsContext); + String title = xmlUtil.evaluateToString(xmlStr, "/escidocItem:item/escidocMetadataRecords:md-records/escidocMetadataRecords:md-record/metadata/dc:title", nsContext); + String type = xmlUtil.evaluateToString(xmlStr, "/escidocItem:item/escidocMetadataRecords:md-records/escidocMetadataRecords:md-record/metadata/dc:type", nsContext); + String rights = xmlUtil.evaluateToString(xmlStr, "/escidocItem:item/escidocMetadataRecords:md-records/escidocMetadataRecords:md-record/metadata/dc:rights", nsContext); + String dateStr = xmlUtil.evaluateToString(xmlStr, 
"/escidocItem:item/escidocMetadataRecords:md-records/escidocMetadataRecords:md-record/metadata/dc:date", nsContext); + Date date = new Date(dateStr); + String license = xmlUtil.evaluateToString(xmlStr, "/escidocItem:item/escidocMetadataRecords:md-records/escidocMetadataRecords:md-record/metadata/mpiwg:license", nsContext); + String accessRights = xmlUtil.evaluateToString(xmlStr, "/escidocItem:item/escidocMetadataRecords:md-records/escidocMetadataRecords:md-record/metadata/mpiwg:accessRights", nsContext); + String mediaType = xmlUtil.evaluateToString(xmlStr, "/escidocItem:item/escidocMetadataRecords:md-records/escidocMetadataRecords:md-record/metadata/mpiwg:mediaType", nsContext); + String existId = xmlUtil.evaluateToString(xmlStr, "/escidocItem:item/escidocMetadataRecords:md-records/escidocMetadataRecords:md-record/metadata/mpiwg:exist-identifier", nsContext); + MetadataRecord mdRecord = new MetadataRecord(id, language, creator, title, null, null, type, rights, date); // TODO vervollständigen, testen + mdRecord.setLicense(license); + mdRecord.setAccessRights(accessRights); + mdRecord.setMediaType(mediaType); + mdRecord.setEXistIdentifier(existId); + return mdRecord; + } + + public Date getVersionDate(String xmlStr) throws ApplicationException { + NamespaceContext nsContext = getNsContext(); + XmlUtil xmlUtil = XmlUtil.getInstance(); + String dateStr = xmlUtil.evaluateToString(xmlStr, "//version:date", nsContext); + Date lastModificationDate = xmlUtil.toDate(dateStr); + return lastModificationDate; + } + + public Date getLastModificationDate(String xmlStr) throws ApplicationException { + NamespaceContext nsContext = getNsContext(); + XmlUtil xmlUtil = XmlUtil.getInstance(); + String dateStr = xmlUtil.evaluateToString(xmlStr, "//*/@last-modification-date", nsContext); + Date lastModificationDate = xmlUtil.toDate(dateStr); + return lastModificationDate; + } + + private String obtainResourceHref(String xml) { + // base + String base = ""; + Matcher baseMatcher = 
PATTERN_XML_BASE_ATTRIBUTE.matcher(xml); + if (baseMatcher.find()) { + base = baseMatcher.group(1); + } + // href + String href = null; + Matcher hrefMatcher = PATTERN_XLINK_HREF_ATTRIBUTE.matcher(xml); + if (hrefMatcher.find()) { + href = hrefMatcher.group(1); + } else { + throw new UnsupportedOperationException("Can not obtain href for resources without xlink:href attribute."); + } + return base + href; + } + + private String performPostRequestByBody(String requestUrlStr, String bodyContent) throws ApplicationException { + String resultStr = null; + try { + String urlStr = protocol + "://" + host + ":" + port + requestUrlStr; + PostMethod method = new PostMethod(urlStr); + method.setFollowRedirects(false); + method.setRequestHeader("Cookie", "escidocCookie=" + cookieId); + if (bodyContent != null) { + method.setRequestBody(bodyContent); + } + httpClient.executeMethod(method); + byte[] responseBody = method.getResponseBody(); + resultStr = new String(responseBody, "utf-8"); + method.releaseConnection(); + } catch (HttpException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return resultStr; + } + + // TODO + private String performPostRequest(String requestUrlStr, String bodyContent, HttpMethodParams parameter) throws ApplicationException { + String resultStr = null; + try { + String urlStr = protocol + "://" + host + ":" + port + requestUrlStr; + PostMethod method = new PostMethod(urlStr); + method.setFollowRedirects(false); + method.setRequestHeader("Cookie", "escidocCookie=" + cookieId); + if (bodyContent != null) { + method.setRequestBody(bodyContent); + } + if (parameter != null) { + method.setParams(parameter); + } + httpClient.executeMethod(method); + byte[] responseBody = method.getResponseBody(); + resultStr = new String(responseBody, "utf-8"); + method.releaseConnection(); + } catch (HttpException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new 
ApplicationException(e); + } + return resultStr; + } + + private String performPutRequestByBody(String requestName, String bodyContent) { + String resultStr = null; + try { + String urlStr = protocol + "://" + host + ":" + port + requestName; + PutMethod method = new PutMethod(urlStr); + method.setRequestHeader("Cookie", "escidocCookie=" + cookieId); + if (bodyContent != null) { + method.setRequestBody(bodyContent); + } + httpClient.executeMethod(method); + byte[] responseBody = method.getResponseBody(); + resultStr = new String(responseBody, "utf-8"); + method.releaseConnection(); + } catch (HttpException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + return resultStr; + } + + private void performDeleteRequest(String requestName) { + try { + String urlStr = protocol + "://" + host + ":" + port + requestName; + DeleteMethod method = new DeleteMethod(urlStr); + method.setRequestHeader("Cookie", "escidocCookie=" + cookieId); + httpClient.executeMethod(method); + method.releaseConnection(); + } catch (HttpException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + private String performGetRequest(String requestName) { + String resultStr = null; + try { + String urlStr = protocol + "://" + host + ":" + port + requestName; + GetMethod method = new GetMethod(urlStr); + method.setRequestHeader("Cookie", "escidocCookie=" + cookieId); + httpClient.executeMethod(method); + byte[] responseBody = method.getResponseBody(); + resultStr = new String(responseBody, "utf-8"); + method.releaseConnection(); + } catch (HttpException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + return resultStr; + } + + private void performGetRequestToLocalFile(String requestName, String localFileName) throws ApplicationException { + try { + String urlStr = protocol + "://" + host + ":" + port + requestName; + GetMethod method = new GetMethod(urlStr); + method.setRequestHeader("Cookie", 
"escidocCookie=" + cookieId); + httpClient.executeMethod(method); + InputStream responseBodyInputStream = method.getResponseBodyAsStream(); + FileUtil.getInstance().saveInputStreamToLocalFile(responseBodyInputStream, localFileName); + method.releaseConnection(); + } catch (HttpException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + public static NamespaceContext getNsContext() { + NamespaceContext nsContext = new NamespaceContext() { + public String getNamespaceURI(String prefix) { + String uri; + if (prefix.equals("xlink")) + uri = "http://www.w3.org/1999/xlink"; + else if (prefix.equals("escidocItem")) + uri = "http://www.escidoc.de/schemas/item/0.9"; + else if (prefix.equals("user-account")) + uri = "http://www.escidoc.de/schemas/useraccount/0.7"; + else if (prefix.equals("grants")) + uri = "http://www.escidoc.de/schemas/grants/0.5"; + else if (prefix.equals("context")) + uri = "http://www.escidoc.de/schemas/context/0.7"; + else if (prefix.equals("container")) + uri = "http://www.escidoc.de/schemas/container/0.8"; + else if (prefix.equals("escidocMetadataRecords")) + uri = "http://www.escidoc.de/schemas/metadatarecords/0.5"; + else if (prefix.equals("escidocComponents")) + uri = "http://www.escidoc.de/schemas/components/0.9"; + else if (prefix.equals("prop")) + uri = "http://escidoc.de/core/01/properties"; + else if (prefix.equals("struct-map")) + uri = "http://www.escidoc.de/schemas/structmap/0.4"; + else if (prefix.equals("version")) + uri = "http://escidoc.de/core/01/properties/version/"; + else if (prefix.equals("srel")) + uri = "http://escidoc.de/core/01/structural-relations/"; + else if (prefix.equals("xml")) + uri = "http://www.w3.org/XML/1998/namespace"; + else if (prefix.equals("dc")) + uri = "http://purl.org/dc/elements/1.1/"; + else if (prefix.equals("mpiwg")) + uri = "http://www.mpiwg-berlin.mpg.de/ns/mpiwg"; + else + uri = null; + return uri; + } + + public String 
getPrefix(String uri) { + if (uri.equals("http://www.w3.org/1999/xlink")) + return "xlink"; + else if (uri.equals("http://www.escidoc.de/schemas/item/0.9")) + return "escidocItem"; + else if (uri.equals("http://www.escidoc.de/schemas/useraccount/0.7")) + return "user-account"; + else if (uri.equals("http://www.escidoc.de/schemas/grants/0.5")) + return "grants"; + else if (uri.equals("http://www.escidoc.de/schemas/context/0.7")) + return "context"; + else if (uri.equals("http://www.escidoc.de/schemas/container/0.8")) + return "container"; + else if (uri.equals("http://www.escidoc.de/schemas/metadatarecords/0.5")) + return "escidocMetadataRecords"; + else if (uri.equals("http://www.escidoc.de/schemas/components/0.9")) + return "escidocComponents"; + else if (uri.equals("http://escidoc.de/core/01/properties")) + return "prop"; + else if (uri.equals("http://www.escidoc.de/schemas/structmap/0.4")) + return "struct-map"; + else if (uri.equals("http://escidoc.de/core/01/properties/version/")) + return "version"; + else if (uri.equals("http://escidoc.de/core/01/structural-relations/")) + return "srel"; + else if (uri.equals("http://www.w3.org/XML/1998/namespace")) + return "xml"; + else if (uri.equals("http://purl.org/dc/elements/1.1/")) + return "dc"; + else if (uri.equals("http://www.mpiwg-berlin.mpg.de/ns/mpiwg")) + return "mpiwg"; + else + return null; + } + + public Iterator getPrefixes(String namespace) { + return null; + } + }; + return nsContext; + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/Grant.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/Grant.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,29 @@ +package de.mpg.mpiwg.berlin.mpdl.escidoc; + +public class Grant { + private String userName = "jwillenborg"; + private String userId = "/aa/user-account/escidoc:22650"; + private String roleName = "System-Administrator"; + private 
String roleId = "/aa/role/escidoc:role-system-administrator"; + + public Grant(String userName, String userId, String roleName, String roleId) { + this.userName = userName; + this.userId = userId; + this.roleName = roleName; + this.roleId = roleId; + } + + public String toXmlString() { + StringBuilder str = new StringBuilder(); + str.append("\n"); + str.append(""); + str.append(""); + str.append(""); + str.append(""); + str.append(""); + str.append(""); + str.append(""); + return str.toString(); + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/Item.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/Item.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,103 @@ +package de.mpg.mpiwg.berlin.mpdl.escidoc; + +import java.util.ArrayList; +import java.util.Date; + +import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil; + +public class Item { + private String id; + private String contextId; // e.g. 
"/ir/context/escidoc:23002" + private String contentModelId; + private String pid; + private MetadataRecord mdRecord; + private ArrayList components; + private Date lastModificationDate; + + public Item(String id, Date lastModificationDate) { + this.id = id; + this.lastModificationDate = lastModificationDate; + } + + public Item(String contextId, String pid, MetadataRecord mdRecord, String contentModelId, ArrayList components) { + this.contextId = contextId; + this.pid = pid; + this.mdRecord = mdRecord; + this.contentModelId = contentModelId; + this.components = components; + } + + public void addComponent(Component component) { + if (components == null) + components = new ArrayList(); + components.add(component); + } + + public String getId() { + return id; + } + + public Date getLastModificationDate() { + return lastModificationDate; + } + + public void setLastModificationDate(Date lastModificationDate) { + this.lastModificationDate = lastModificationDate; + } + + public String toXmlString() { + StringBuilder str = new StringBuilder(); + str.append("\n"); + str.append("\n"); + str.append(" \n"); + str.append(" \n"); + str.append(" \n"); + if (pid != null) { + str.append(" " + pid + "\n"); + } + str.append(" \n"); + str.append(" \n"); + str.append(" \n"); + str.append(" \n"); + + str.append(" \n"); + str.append(" \n"); + str.append(" \n"); + if (mdRecord != null) { + str.append(mdRecord.toXmlString()); + } else { + str.append(" \n"); + str.append(" \n"); + } + str.append(" \n"); + + str.append(" \n"); + + if (components != null) { + str.append(" \n"); + for (int i=0; i\n"); + } + + str.append("\n"); + return str.toString(); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/MetadataRecord.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/MetadataRecord.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,233 @@ +package 
de.mpg.mpiwg.berlin.mpdl.escidoc; + +import java.util.Calendar; +import java.util.Date; + +public class MetadataRecord { + private String docBase; // e.g. echo or archimedes + private String identifier; // identification filename, id number, etc. + private String language; + private String creator; // author + private String title; + private String description; + private String publisher; // publisher with place: e.g. Springer, New York + private String type; // mime type: e.g. text/xml // TODO ist eigentlich das Feld "format" --> zus. instnace variable "format" definieren + private String rights; // e.g. open access + private Date date; // creation date, modification date, etc. + private String license; // e.g. http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration + private String accessRights; // e.g. free + private String mediaType; // e.g. image or fulltext + private String eXistIdentifier; // e.g. /echo/la/Benedetti_1585.xml + private String echoLink; // e.g. echo.mpiwg-berlin.mpg.de?titleID=163127KK + private String echoDir; // e.g. 
/permanent/echo/china/songy_tiang_zh_1637 + + public MetadataRecord() { + + } + + public MetadataRecord(String identifier, String language, String creator, String title, String description, String publisher, String type, String rights, Date date) { + this.identifier = identifier; + this.language = language; + this.creator = creator; + this.title = title; + this.description = description; + this.publisher = publisher; + this.type = type; + this.rights = rights; + this.date = date; + } + + public String toXmlString() { + String xmlString = "\n"; + if (identifier != null) + xmlString += "" + identifier + "\n"; + if (language != null) + xmlString += "" + language + "\n"; + if (creator != null) + xmlString += "" + creator + "\n"; + if (title != null) + xmlString += "" + title + "\n"; + if (description != null) + xmlString += "" + description + "\n"; + if (publisher != null) + xmlString += "" + publisher + "\n"; + if (type != null) + xmlString += "" + type + "\n"; + if (rights != null) + xmlString += "" + rights + "\n"; + if (date != null) + xmlString += "" + date + "\n"; + if (license != null) + xmlString += "" + license + "\n"; + if (accessRights != null) + xmlString += "" + accessRights + "\n"; + if (mediaType != null) + xmlString += "" + mediaType + "\n"; + if (eXistIdentifier != null) + xmlString += "" + eXistIdentifier + "\n"; + if (echoLink != null) + xmlString += "" + echoLink + "\n"; + if (echoDir != null) + xmlString += "" + echoDir + "\n"; + xmlString += "\n"; + return xmlString; + } + + public boolean hasEchoDocBase() { + boolean retValue = false; + if (docBase != null && docBase.equals("echo")) + return true; + return retValue; + } + + public boolean hasArchimedesDocBase() { + boolean retValue = false; + if (docBase != null && docBase.equals("archimedes")) + return true; + return retValue; + } + + public String getDocBase() { + return docBase; + } + + public void setDocBase(String docBase) { + this.docBase = docBase; + } + + public String getMediaType() { + 
return mediaType; + } + + public void setMediaType(String mediaType) { + this.mediaType = mediaType; + } + + public String getRights() { + return rights; + } + + public void setRights(String rights) { + this.rights = rights; + } + + public String getLicense() { + return license; + } + + public void setLicense(String license) { + this.license = license; + } + + public String getAccessRights() { + return accessRights; + } + + public void setAccessRights(String accessRights) { + this.accessRights = accessRights; + } + + public String getEchoLink() { + return echoLink; + } + + public void setEchoLink(String echoLink) { + this.echoLink = echoLink; + } + + public String getEchoDir() { + return echoDir; + } + + public void setEchoDir(String echoDir) { + this.echoDir = echoDir; + } + + public String toString() { + return toXmlString(); + } + + public String getCreator() { + return creator; + } + + public void setCreator(String creator) { + this.creator = creator; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public Date getDate() { + return date; + } + + public void setDate(Date date) { + this.date = date; + } + + public String getYear() { + String year = null; + if (date != null) { + Calendar cal = Calendar.getInstance(); + cal.setTime(date); + int iYear = cal.get(Calendar.YEAR); + year = "" + iYear; + } + return year; + } + + public String getDescription() { + return description; + } + + public String getEXistIdentifier() { + return eXistIdentifier; + } + + public void setEXistIdentifier(String xistIdentifier) { + eXistIdentifier = xistIdentifier; + } + + public void setDescription(String description) { + this.description = description; + } + + public String getIdentifier() { + return identifier; + } + + public void setIdentifier(String identifier) { + this.identifier = identifier; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + 
this.language = language; + } + + public String getPublisher() { + return publisher; + } + + public void setPublisher(String publisher) { + this.publisher = publisher; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/TestESciDoc.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/TestESciDoc.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,367 @@ +package de.mpg.mpiwg.berlin.mpdl.escidoc; + +import java.io.File; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Date; +import java.util.Scanner; + +import javax.xml.XMLConstants; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.transform.Source; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.sax.SAXSource; +import javax.xml.validation.Schema; +import javax.xml.validation.SchemaFactory; +import javax.xml.validation.Validator; + +import org.quartz.impl.StdSchedulerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; +import de.mpg.mpiwg.berlin.mpdl.schedule.MpdlChainScheduler; +import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil; + + +public class TestESciDoc { + private String cookieId; + private ESciDocRestSession eSciDocRestSession; + private String organizationalUnit = MpdlConstants.MPDL_ESCIDOC_OUM_ID; + private String fullContextId = MpdlConstants.MPDL_ESCIDOC_CONTEXT_ID; + private String archimedesContainerId = MpdlConstants.MPDL_ESCIDOC_ARCHIMEDES_CONTAINER_ID; + private String echoContainerId = 
MpdlConstants.MPDL_ESCIDOC_ECHO_CONTAINER_ID; + + public static void main(String[] args) { + try { + /* + byte[] ligatureBytes = new byte[5]; + ligatureBytes[0] = (byte) Integer.parseInt("61", 16); + ligatureBytes[1] = (byte) Integer.parseInt("74", 16); + ligatureBytes[2] = (byte) Integer.parseInt("EE", 16); + ligatureBytes[3] = (byte) Integer.parseInt("A2", 16); + ligatureBytes[4] = (byte) Integer.parseInt("BF", 16); + + String ligature = new String(ligatureBytes, "utf-8"); + char[] chars = new char[3]; + chars[0] = ligature.charAt(0); + chars[1] = ligature.charAt(1); + chars[2] = ligature.charAt(2); + int codepoint = Character.codePointAt(chars, 2); + int num = Character.getNumericValue(ligature.charAt(2)); + int type = Character.getType(ligature.charAt(2)); + */ + + TestESciDoc test = new TestESciDoc(); + test.init("jwillenborg"); // init eSciDoc-Session with cookie as user jwillenborg + + // test.grant("aeisemann", "admin"); + String uid = test.getUserId("aeisemann"); + String users = test.getAllUsers(); + String grantAdmin = test.getGrantHrefByUserNameAndRoleName("aeisemann", "escidoc:role-system-administrator"); + String grants = test.getGrantsByUserName("aeisemann"); + String bla = ""; + + // test.testSchemaValidation(); + + // test.deleteItem("/ir/item/escidoc:48488"); + // test.deleteContainer("/ir/container/escidoc:48486"); + /* + String containerId = test.createContainer("testJoey1"); + System.out.println("Begin: " + (new Date()).getTime()); + Date successDate = test.addMembersToContainer("/ir/container/escidoc:41646"); + System.out.println("End: " + (new Date()).getTime()); + */ + + // String contextId = test.createContext(test.organizationalUnit, "MPDL-XML-Test", "MPDL-XML-Test", "MpdlType"); + // test.openContext("escidoc:38600"); + + // String containerId = test.createContainer("eXistArchimedesContainer"); + // System.out.println(containerId); + // String containerId = test.createContainer("eXistEchoContainer"); + + /* + for (int i=0; i< 443; i++) { 
+ MetadataRecord mdRecordImage = new MetadataRecord(); + mdRecordImage.setIdentifier("file_" + i); + mdRecordImage.setTitle("ECHO scanned page: " + i); + String fileName = "000" + i; + String srcUrl = "http://echo.mpiwg-berlin.mpg.de/zogilib?fn=/permanent/library/" + "163127KK" + "/pageimg/" + fileName; // TODO + String itemId = test.createItem("mpiwg:47114711", mdRecordImage, fileName, "image/jpeg", "JPEG_DEFAULT", "external-url", srcUrl); + System.out.println(i + ". " + itemId + " created"); + } + */ + /* + String srcUrlAlvarus = "http://mpdl-proto.mpiwg-berlin.mpg.de/exist/rest/db/mpdl/documents/standard/echo/la/alvarus_1509_lat_V40_10.xml"; + Date pubYearAlvarus = XmlUtil.getInstance().toDate("1509-01-01T00:00:00.000Z"); + MetadataRecord mdAlvarus = new MetadataRecord("/echo/la/alvarus_1509_lat_V40_10.xml", "la", "Alvarus, Thomas", "Liber de triplici motu proportionibus annexis magiſtri Aluari Thome Ulixboneñ philoſophicas Suiſeth calculationes ex parte declarans", null, null, "text/xml", pubYearAlvarus); + String result = test.createItemInContainer(test.echoContainerId, mdAlvarus.getIdentifier(), mdAlvarus, srcUrlAlvarus); + + String srcUrlBenedetti = "http://mpdl-proto.mpiwg-berlin.mpg.de/exist/rest/db/mpdl/documents/standard/echo/la/Benedetti_1585.xml"; + Date pubYearBenedetti = XmlUtil.getInstance().toDate("1585-01-01T00:00:00.000Z"); + MetadataRecord mdBenedetti = new MetadataRecord("/echo/la/Benedetti_1585.xml", "la", "Benedetti, Giovanni Battista de", "Diversarum Speculationum mathematicum, & physicarum liber", null, null, "text/xml", pubYearBenedetti); + String result = test.createItemInContainer(test.echoContainerId, mdBenedetti.getIdentifier(), mdBenedetti, srcUrlBenedetti); + + String srcUrlEuclid = "http://mpdl-proto.mpiwg-berlin.mpg.de/exist/rest/db/mpdl/documents/standard/echo/el/Euclid-300.xml"; + Date pubYearEuclid = XmlUtil.getInstance().toDate("0300-01-01T00:00:00.000Z"); + MetadataRecord mdEuclid = new 
MetadataRecord("/echo/el/Euclid-300.xml", "el", "Euclid", "Στοιχεῖα", null, null, "text/xml", pubYearEuclid); + String result = test.createItemInContainer(test.echoContainerId, mdEuclid.getIdentifier(), mdEuclid, srcUrlEuclid); + + String srcUrlEuclid2 = "http://mpdl-proto.mpiwg-berlin.mpg.de/exist/rest/db/mpdl/documents/standard/echo/zh/Euclid_1966_V8.xml"; + Date pubYearEuclid2 = XmlUtil.getInstance().toDate("0300-01-01T00:00:00.000Z"); + MetadataRecord mdEuclid2 = new MetadataRecord("/echo/zh/Euclid_1966_V8.xml", "zh", "Euclid", "Jihe yuanben, 幾何原本", null, null, "text/xml", pubYearEuclid2); + String result = test.createItemInContainer(test.echoContainerId, mdEuclid2.getIdentifier(), mdEuclid2, "/echo/zh/Euclid_1966_V8.xml", "text/xml", "any fulltext", "internal-managed", srcUrlEuclid2); + + String items = test.getItemsByContainerIdAndFilter(test.echoContainerId, null); + */ + + } catch (Exception e) { + e.printStackTrace(); + } + } + + private void init(String userName) throws ApplicationException { + Scanner in = new Scanner(System.in); + System.out.print("Username: " + userName + ", Password: "); + String password = in.nextLine(); + in.close(); + cookieId = ESciDocRestSession.login(userName, password); + eSciDocRestSession = ESciDocRestSession.getInstance(cookieId); + fullContextId = MpdlConstants.MPDL_ESCIDOC_CONTEXT_ID; + } + + private void testSchemaValidation() throws ApplicationException { + String[] rncSchemaFiles = { + "echo/echo.rnc", + "echo/modules/echo-datatype.rnc", "echo/modules/echo-handwritten.rnc", "echo/modules/echo-start.rnc", + "echo/modules/echo-attribute.rnc", "echo/modules/echo-de.rnc", "echo/modules/echo-import-mathml.rnc", "echo/modules/echo-text.rnc", + "echo/modules/echo-block-scholarly.rnc", "echo/modules/echo-div.rnc", "echo/modules/echo-import-xhtml.rnc", "echo/modules/echo-textflows.rnc", + "echo/modules/echo-block.rnc", "echo/modules/echo-figure.rnc", "echo/modules/echo-mathematics.rnc", + "echo/modules/echo-chinese-text.rnc", 
"echo/modules/echo-float.rnc", "echo/modules/echo-metadata.rnc", + "echo/modules/echo-content-scholarly.rnc", "echo/modules/echo-gap.rnc", "echo/modules/echo-milestone.rnc", + "echo/modules/echo-content.rnc", "echo/modules/echo-gis.rnc", "echo/modules/echo-note.rnc", + "xhtml/xhtml-datatypes.rnc", "xhtml/xhtml-list.rnc", "xhtml/xhtml-attribs.rnc", "xhtml/xhtml-basic-table.rnc" + }; + String[] schemas = { + "echo-schema/dcterms.xsd", "echo-schema/echo-datatype.xsd", "echo-schema/echo-handwritten.xsd", "echo-schema/echo-start.xsd", "echo-schema/xhtml-datatypes.xsd", + "echo-schema/echo-attribute.xsd", "echo-schema/echo-de.xsd", "echo-schema/ echo-import-mathml.xsd", "echo-schema/echo-text.xsd", "echo-schema/xhtml-list.xsd", + "echo-schema/echo-block-scholarly.xsd", "echo-schema/echo-div.xsd", "echo-schema/echo-import-xhtml.xsd", "echo-schema/echo-textflows.xsd", "echo-schema/xlink.xsd", + "echo-schema/echo-block.xsd", "echo-schema/echo-figure.xsd", "echo-schema/echo-mathematics.xsd", "echo-schema/echo.xsd", "echo-schema/xml.xsd", + "echo-schema/echo-chinese-text.xsd", "echo-schema/echo-float.xsd", "echo-schema/ echo-metadata.xsd", "echo-schema/local.xsd", "echo-schema/xsi.xsd", + "echo-schema/echo-content-scholarly.xsd", "echo-schema/echo-gap.xsd", "echo-schema/echo-milestone.xsd", "echo-schema/xhtml-attribs.xsd", + "echo-schema/echo-content.xsd", "echo-schema/echo-gis.xsd", "echo-schema/echo-note.xsd", "echo-schema/xhtml-basic-table.xsd" + }; + File xmlFile = new File("/Users/jwillenborg/texts/echo/SongYingxing_1637.xml"); + // validate(xmlFile, schemas); + validateByRelaxNG(xmlFile, rncSchemaFiles); + } + + private void validateDocumentBuilder(File xmlFile, String[] schemaFileNames) throws ApplicationException { + String JAXP_SCHEMA_SOURCE = "http://java.sun.com/xml/jaxp/properties/schemaSource"; + String JAXP_SCHEMA_LANGUAGE = "http://java.sun.com/xml/jaxp/properties/schemaLanguage"; + String W3C_XML_SCHEMA = XMLConstants.W3C_XML_SCHEMA_NS_URI; + Node root = null; 
+ try { + DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); + dbf.setNamespaceAware(true); + dbf.setAttribute(JAXP_SCHEMA_LANGUAGE, W3C_XML_SCHEMA); + dbf.setAttribute(JAXP_SCHEMA_SOURCE, schemaFileNames); + DocumentBuilder db = dbf.newDocumentBuilder(); + Document doc = db.parse(xmlFile); + root = doc.getFirstChild(); + String bla = ""; + } catch (Exception e) { + throw new ApplicationException(e); + } + } + + public void validateByRelaxNG(File xmlFile, String[] schemaFileNames) throws ApplicationException { + // System.setProperty(SchemaFactory.class.getName() + ":" + XMLConstants.RELAXNG_NS_URI, "com.thaiopensource.relaxng.jaxp.XMLSyntaxSchemaFactory"); + System.setProperty(SchemaFactory.class.getName() + ":" + XMLConstants.RELAXNG_NS_URI, "com.thaiopensource.relaxng.jaxp.CompactSyntaxSchemaFactory"); + // RELAX NG factory + SchemaFactory factory = SchemaFactory.newInstance(XMLConstants.RELAXNG_NS_URI); + // Compile the schema. + Schema schema = null; + try { + URL schemaUrl = new URL("http://mpdl-test.mpiwg-berlin.mpg.de:30030/exist/rest/db/mpdl/schema/echo/echo.rnc"); + schema = factory.newSchema(schemaUrl); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (MalformedURLException e) { + throw new ApplicationException(e); + } + // Get a validator from the schema. + Validator validator = schema.newValidator(); + // Check the document + InputSource inputSource = new InputSource(xmlFile.getPath()); + Source source = new SAXSource(inputSource); + try { + validator.validate(source); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void validate(File xmlFile, String[] schemaFileNames) throws ApplicationException { + SchemaFactory factory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI); + // Compile the schema. 
+ Schema schema = null; + Source[] schemaInputSources = new SAXSource[schemaFileNames.length]; + for (int i=0; i components) throws ApplicationException { + Item result = eSciDocRestSession.createItemInContainer(containerId, pid, mdRecord, components); + return result; + } + + private Item createItem(String pid, MetadataRecord mdRecord, ArrayList components) throws ApplicationException { + Item result = eSciDocRestSession.createItem(pid, mdRecord, components); + return result; + } + + private Date addMembersToContainer(String containerId) throws ApplicationException { + String modDateStr = "2010-04-16T15:00:53.409Z"; + Date modDate = XmlUtil.getInstance().toDate(modDateStr); + ArrayList memberIds = new ArrayList(); + for (int i=40761; i<= 41645; i = i + 2) { + String memberId = "" + i; + memberIds.add(memberId); + } + Date result = eSciDocRestSession.addMembers(containerId, modDate, memberIds); + return result; + } + + private String getItemsByContainerIdAndFilter(String containerId, String filter) throws ApplicationException { + String members = eSciDocRestSession.getMembersByContainerIdAndFilter(containerId, filter); + return members; + } + + private void deleteContainer(String containerId) throws ApplicationException { + eSciDocRestSession.deleteContainer(containerId); + } + + private void deleteItem(String itemId) throws ApplicationException { + eSciDocRestSession.deleteItem(itemId); + } + + private void test() { + /* + XmlUtil xmlUtil = XmlUtil.getInstance(); + ClientSession eSciDocClientSession = new ClientSession("http", "xserve07.mpiwg-berlin.mpg.de", 8080, "jwillenborg", "pucki123"); + String contextId = "/ir/context/escidoc:23002"; + String filterDetail = "/id"; + String filter = "" + filterDetail + ""; + String members = eSciDocClientSession.getItemsByContainerIdAndFilter("escidoc:23003", filter); + System.out.println(members); + + String item = eSciDocClientSession.getItem("escidoc:13003"); + String itemId = xmlUtil.getFirstElementAttributeValue(item, 
"prop:latest-version", "xlink:href"); + System.out.println(itemId); + /* + Date pubYearAlvarus = xmlUtil.toDate("1509-01-01T00:00:00.000Z"); + MetadataRecord mdAlvarus = new MetadataRecord("/archimedes/la/alvarus_1509_lat_V40_10.xml", "la", "Alvarus, Thomas", "Liber de triplici motu proportionibus annexis magiſtri Aluari Thome Ulixboneñ philoſophicas Suiſeth calculationes ex parte declarans", null, null, "text/xml", pubYearAlvarus); + String srcUrlAlvarus = "http://mpdl-proto.mpiwg-berlin.mpg.de/exist/rest/db/mpdl/documents/standard/echo/la/alvarus_1509_lat_V40_10.xml"; + String result = eSciDocClientSession.createItemInContainer(contextId, "MPDL-XML-Test", "abcdefg", mdAlvarus, "escidoc:23003", srcUrlAlvarus); + */ + + /* + Date pubYearAgricola = xmlUtil.toDate("1912-01-01T00:00:00.000Z"); + MetadataRecord mdAgricola = new MetadataRecord("/archimedes/en/agric_remet_002_en.xml", "la", "Agricola, Georgius", "De re metallica", null, "London", "text/xml", pubYearAgricola); + String srcUrlAgricola = "http://mpdl-proto.mpiwg-berlin.mpg.de/exist/rest/db/mpdl/documents/standard/archimedes/en/agric_remet_002_en.xml"; + String item = eSciDocClientSession.getItemById("escidoc:23012"); + String lastVersionDateStr = xmlUtil.getFirstElementValue(item, "version:date"); + Date lastModificationDate = xmlUtil.toDate(lastVersionDateStr); + String lastModificationDateStr = xmlUtil.toXsDate(lastModificationDate); + String itemXmlResult = eSciDocClientSession.updateItem(contextId, "MPDL-XML-Test", "abcdefg", mdAgricola, srcUrlAgricola, "escidoc:23012", lastModificationDate); + */ + /* + String itemIdAgricola = xmlUtil.getFirstElementAttributeValue(itemXmlResult, "prop:latest-version", "xlink:href"); + eSciDocClientSession.deleteItem(contextId, "escidoc:23010"); + */ + /* + String containerName = "testEXistEcho"; + MetadataRecord mdRecord = new MetadataRecord("testEXistArchimedes", "Echo document container"); + String containerXmlResult = 
eSciDocClientSession.addContainer("/ir/context/escidoc:23002", "MPDL-XML-Test", containerPid, mdRecord); + String containerId = xmlUtil.getFirstElementAttributeValue(containerXmlResult, "prop:latest-version", "xlink:href"); + System.out.println(containerId); + */ + // eSciDocClientSession.deleteItem(contextId, "escidoc:23012"); + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/TestESciDocEXist.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/TestESciDocEXist.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,305 @@ +package de.mpg.mpiwg.berlin.mpdl.escidoc; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; +import java.util.ArrayList; +import java.util.Date; +import java.util.Scanner; + +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.methods.DeleteMethod; +import org.apache.commons.httpclient.methods.GetMethod; +import org.apache.commons.httpclient.methods.InputStreamRequestEntity; +import org.apache.commons.httpclient.methods.PostMethod; +import org.apache.commons.httpclient.methods.PutMethod; +import org.apache.commons.httpclient.params.HttpMethodParams; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil; + +public class TestESciDocEXist { + private String protocol = "http"; + private String host = "mpdl-test.mpiwg-berlin.mpg.de"; + private int port = 30030; + private String userName = "jwillenborg"; + private HttpClient httpClient; + private String eSciDocCookieId; + + public static void main(String[] args) throws ApplicationException { + try { + TestESciDocEXist test = new TestESciDocEXist(); + test.init(); + 
test.testCalls(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + private void init() throws ApplicationException { + httpClient = new HttpClient(); + Scanner in = new Scanner(System.in); + System.out.print("Username: " + userName + ", Type your password: "); + String password = in.nextLine(); + in.close(); + eSciDocCookieId = ESciDocRestSession.login(userName, password); + } + + private void testCalls() throws ApplicationException { + String result = ""; + xQuery(); + // xQueryPath(); + // xQueryByEscidocItemId(); + // result = createItem(); + // result = deleteItem(); + } + + private void xQuery() throws ApplicationException { + String result = ""; + try { + String xquery = + "xquery version \"1.0\"; \n" + + "declare namespace echo=\"http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/\"; \n" + + "let $doc := doc(\"/db/mpdl/documents/standard/echo/zh/SongYingxing_1637.xml\") \n" + + "let $sentences := $doc//echo:s \n" + + "return $sentences \n"; + String xQueryEncoded = URLEncoder.encode(xquery, "utf-8"); + String request = "/mpdl/escidoc/exist:xquery/execute?query=" + xQueryEncoded + "&startRecord=1&maximumRecords=50"; + result = performGetRequest(request); + System.out.println(result); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + + private void xQueryPath() throws ApplicationException { + String result = ""; + try { + String xqueryPath = "/mpdl/interface/doc-query.xql"; + String xQueryPathEncoded = URLEncoder.encode(xqueryPath, "utf-8"); + String xqueryPathParams = + "" + + "fulltextMorph" + + "/echo/la/Benedetti_1585.xml" + + "text" + + "multiplicare" + + "1" + + "10" + + ""; + String xqueryPathParamsEncoded = URLEncoder.encode(xqueryPathParams, "utf-8"); + String request = "/mpdl/escidoc/exist:xquery/execute?queryPath=" + xQueryPathEncoded + "¶meters=" + xqueryPathParamsEncoded; + result = performGetRequest(request); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + 
System.out.println(result); + } + + private void xQueryByEscidocItemId() throws ApplicationException { + // /ir/item/escidoc:47344 is fulltext item of "/echo/la/Benedetti_1585" + String result = ""; + try { + String xquery = "//echo:s"; + String xqueryEncoded = URLEncoder.encode(xquery, "utf-8"); + String request = "/mpdl/escidoc/ir/item/escidoc:47344/exist:xquery/execute?query=" + xqueryEncoded + "&startRecord=1&maximumRecords=50"; + result = performGetRequest(request); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + System.out.println(result); + } + + private String createItem() throws ApplicationException { + String xmlDocumentFileName = "/Users/jwillenborg/tmp/echo/la/Archimedes_1565.xml"; + ESciDocRestSession escidocRestSession = ESciDocRestSession.getInstance(eSciDocCookieId); + String eSciDocStageAreaUrl = escidocRestSession.uploadFileToESciDocStageArea(xmlDocumentFileName); + String contentModelId = "/cmm/content-model/escidoc:persistent4"; // TODO take final contentModelId: "/cmm/content-model/escidoc:exist-xml" + String contextId = "/ir/context/escidoc:38600"; // TODO take final contextId + String pid = escidocRestSession.getPid(); + MetadataRecord mdRecord = new MetadataRecord(); + String docBase = "echo"; + mdRecord.setCreator("Archimedes"); + mdRecord.setTitle("Archimedis De iis quae ve huntur in aqua libri duo"); + Date py = XmlUtil.getInstance().toDate("1565" + "-01-01T00:00:00.000Z"); + mdRecord.setDate(py); + mdRecord.setMediaType("fulltext"); + mdRecord.setDocBase(docBase); + mdRecord.setLanguage("la"); + mdRecord.setEXistIdentifier("/echo/la/Archimedes_1565.xml"); + ArrayList components = new ArrayList(); + String contentCategory = "fulltext XML - ECHO"; + if (docBase != null && docBase.equals("archimedes")) + contentCategory = "fulltext XML - Archimedes"; + Component componentXmlFulltext = new Component("valid", "public", contentCategory, "text/xml", eSciDocStageAreaUrl, "internal-managed"); + 
components.add(componentXmlFulltext); + Item xmlFulltextItem = new Item(contextId, pid, mdRecord, contentModelId, components); + String xmlFulltextItemStr = xmlFulltextItem.toXmlString(); + String containerIdOfFulltextItem = "/ir/container/escidoc:51122"; + String newItemXmlStr = performPostRequest("/mpdl/escidoc" + containerIdOfFulltextItem + "/create-item", xmlFulltextItemStr, null); + return newItemXmlStr; + } + + private String deleteItem() { + String itemId = "/ir/item/escidoc:73012"; + String retStr = performDeleteRequest("/mpdl/escidoc" + itemId); + return retStr; + } + + private String createContainer() throws ApplicationException { + String containerIdOfEchoDocBase = "/ir/container/escidoc:51122"; // TODO + String contentModelId = "/cmm/content-model/escidoc:persistent4"; // TODO take final contentModelId: "/cmm/content-model/escidoc:exist-xml" + String contextId = "/ir/context/escidoc:38600"; // TODO take final contextId + ESciDocRestSession escidocRestSession = ESciDocRestSession.getInstance(eSciDocCookieId); + String pid = escidocRestSession.getPid(); + MetadataRecord mdRecord = new MetadataRecord(); + mdRecord.setEXistIdentifier("/echo"); + Container container = new Container(contentModelId, contextId, pid, mdRecord); + String xmlContainerStr = container.toXmlString(); + String newContainerXmlStr = performPostRequest("/mpdl/escidoc" + containerIdOfEchoDocBase + "/create-container", xmlContainerStr, null); + return newContainerXmlStr; + } + + private String deleteContainer() { + String containerId = "/ir/container/escidoc:71010"; // TODO + String retStr = performDeleteRequest("/mpdl/escidoc" + containerId); + return retStr; + } + + private String performPostRequest(String requestUrlStr, String requestInputStreamStr, HttpMethodParams parameter) throws ApplicationException { + String resultStr = null; + try { + String urlStr = protocol + "://" + host + ":" + port + requestUrlStr; + PostMethod method = new PostMethod(urlStr); + 
method.setRequestHeader("Cookie", "escidocCookie=" + eSciDocCookieId); + method.setFollowRedirects(false); + if (requestInputStreamStr != null) { + InputStream requestInputStream = new ByteArrayInputStream(requestInputStreamStr.getBytes("UTF-8")); + InputStreamRequestEntity inputStreamRequestEntity = new InputStreamRequestEntity(requestInputStream); + method.setRequestEntity(inputStreamRequestEntity); + } + if (parameter != null) { + method.setParams(parameter); + } + httpClient.executeMethod(method); + resultStr = method.getResponseBodyAsString(); + method.releaseConnection(); + } catch (HttpException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return resultStr; + } + + private String performPostRequest(String requestUrlStr, File requestFile, HttpMethodParams parameter) throws ApplicationException { + String resultStr = null; + try { + String urlStr = protocol + "://" + host + ":" + port + requestUrlStr; + PostMethod method = new PostMethod(urlStr); + method.setRequestHeader("Cookie", "escidocCookie=" + eSciDocCookieId); + method.setFollowRedirects(false); + if (requestFile != null) { + /** + FilePart requestFilePart = new FilePart(requestFile.getName(), requestFile); + Part[] parts = { requestFilePart }; + MultipartRequestEntity multipartRequestEntity = new MultipartRequestEntity(parts, method.getParams()); + method.setRequestEntity(multipartRequestEntity); + **/ + FileInputStream requestFileInputStream = new FileInputStream(requestFile); + InputStreamRequestEntity inputStreamRequestEntity = new InputStreamRequestEntity(requestFileInputStream); + method.setRequestEntity(inputStreamRequestEntity); + } + if (parameter != null) { + method.setParams(parameter); + } + httpClient.executeMethod(method); + resultStr = method.getResponseBodyAsString(); + method.releaseConnection(); + } catch (HttpException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new 
ApplicationException(e); + } + return resultStr; + } + + private String performPutRequestByBody(String requestName, String bodyContent) { + String resultStr = null; + try { + String urlStr = protocol + "://" + host + ":" + port + requestName; + PutMethod method = new PutMethod(urlStr); + method.setRequestHeader("Cookie", "escidocCookie=" + eSciDocCookieId); + if (bodyContent != null) { + method.setRequestBody(bodyContent); + } + httpClient.executeMethod(method); + resultStr = method.getResponseBodyAsString(); + method.releaseConnection(); + } catch (HttpException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + return resultStr; + } + + private String performDeleteRequest(String requestName) { + String resultStr = null; + try { + String urlStr = protocol + "://" + host + ":" + port + requestName; + DeleteMethod method = new DeleteMethod(urlStr); + method.setRequestHeader("Cookie", "escidocCookie=" + eSciDocCookieId); + httpClient.executeMethod(method); + resultStr = method.getResponseBodyAsString(); + method.releaseConnection(); + } catch (HttpException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + return resultStr; + } + + private String performGetRequest(String requestName) { + String resultStr = null; + try { + String urlStr = protocol + "://" + host + ":" + port + requestName; + GetMethod method = new GetMethod(urlStr); + method.setRequestHeader("Cookie", "escidocCookie=" + eSciDocCookieId); + httpClient.executeMethod(method); + resultStr = method.getResponseBodyAsString(); + method.releaseConnection(); + } catch (HttpException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + return resultStr; + } + + private String performPostRequestByBody(String requestUrlStr, String bodyContent) throws ApplicationException { + String resultStr = null; + try { + String urlStr = protocol + "://" + host + ":" + port + requestUrlStr; + PostMethod method = new 
PostMethod(urlStr); + method.setFollowRedirects(false); + if (bodyContent != null) { + method.setRequestBody(bodyContent); + } + httpClient.executeMethod(method); + resultStr = method.getResponseBodyAsString(); + method.releaseConnection(); + } catch (HttpException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return resultStr; + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/exception/ApplicationException.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/exception/ApplicationException.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,14 @@ +package de.mpg.mpiwg.berlin.mpdl.exception; + +public class ApplicationException extends Exception { + + public ApplicationException(Exception e) { + super(e); + } + + public ApplicationException(String str) { + super(str); + } + +} + diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/general/MpdlConstants.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/general/MpdlConstants.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,47 @@ +package de.mpg.mpiwg.berlin.mpdl.general; + +import java.util.Properties; + +import de.mpg.mpiwg.berlin.mpdl.util.Util; + +public class MpdlConstants { + public static String EXIST_HOME = System.getProperty("exist.home"); + public static String MPDL_SYSTEM_PROPERTIES_FILENAME = EXIST_HOME + "/mpdl/extensions/mpdl-modules/mpdl-system.properties"; + public static Properties MPDL_SYSTEM_PROPERTIES = new Util().getProperties(MPDL_SYSTEM_PROPERTIES_FILENAME); + + // Mpdl: general settings + public static String MPDL_PROJECT_NAME = "mpdl"; + public static String TYPE_STATIC = "static"; + public static String TYPE_DYNAMIC = "dynamic"; + + // eXist settings: data + public static String MPDL_EXIST_DATA_DIR = EXIST_HOME + 
"/webapp/WEB-INF/dataMpdl"; // other call would be: ConfigurationHelper.getExistHome() + + // eXist settings: XML-RPC-Interface, doc-interface + public static String MPDL_FULL_EXIST_HOST_NAME = MPDL_SYSTEM_PROPERTIES.getProperty("exist.fullHostname"); // official eXist server name; used for user displays etc. + public static String MPDL_EXIST_HOST_NAME = MPDL_SYSTEM_PROPERTIES.getProperty("exist.hostname"); // used in XML-RPC-Interface etc. + public static int MPDL_EXIST_PORT = new Integer(MPDL_SYSTEM_PROPERTIES.getProperty("exist.port")); // other call could but does not work: System.getProperty("jetty.port") + public static String MPDL_EXIST_ADMIN_USER_NAME = MPDL_SYSTEM_PROPERTIES.getProperty("exist.adminUserName"); + public static String MPDL_EXIST_ADMIN_USER_PW = MPDL_SYSTEM_PROPERTIES.getProperty("exist.adminUserPW"); + public static String MPDL_ECHO_RELAXNG_PATH = MPDL_SYSTEM_PROPERTIES.getProperty("exist.echoRelaxNGPath"); + + // eSciDoc settings + public static String MPDL_ESCIDOC_HOST_NAME = MPDL_SYSTEM_PROPERTIES.getProperty("escidoc.hostname"); + public static int MPDL_ESCIDOC_PORT = new Integer(MPDL_SYSTEM_PROPERTIES.getProperty("escidoc.port")); + public static String MPDL_ESCIDOC_OUM_ID = MPDL_SYSTEM_PROPERTIES.getProperty("escidoc.oumId"); // organizational unit + public static String MPDL_ESCIDOC_CMM_ID = MPDL_SYSTEM_PROPERTIES.getProperty("escidoc.cmmId"); // content model + public static String MPDL_ESCIDOC_CONTEXT_ID = MPDL_SYSTEM_PROPERTIES.getProperty("escidoc.contextId"); // context id + public static String MPDL_ESCIDOC_ARCHIMEDES_CONTAINER_ID = MPDL_SYSTEM_PROPERTIES.getProperty("escidoc.archimedesContainerId"); // archimedes container id + public static String MPDL_ESCIDOC_ECHO_CONTAINER_ID = MPDL_SYSTEM_PROPERTIES.getProperty("escidoc.echoContainerId"); // echo container id + public static String MPDL_ESCIDOC_ADMIN_USER_NAME = MPDL_SYSTEM_PROPERTIES.getProperty("escidoc.adminUserName"); + public static String MPDL_ESCIDOC_ADMIN_USER_PW = 
MPDL_SYSTEM_PROPERTIES.getProperty("escidoc.adminUserPW"); + + // Mpdl: language technology + public static String DEFAULT_LANGUAGE = "en"; + public static int MORPHOLOGY_CACHE_SIZE = 1000000; + + // Mpdl: language technology: static data management (BerkeleyDB etc.) + public static String MPDL_DATA_DIR = "/Users/jwillenborg/java/exist1.4/webapp/WEB-INF/dataMpdl"; // TODO + public static String MPDL_DOC_DIR = "/Users/jwillenborg/texts/mpdl"; // TODO + public static String MPDL_TEST_DATA_DIR = "/Users/jwillenborg/data/mpdl"; // TODO +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlFilter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlFilter.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,48 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; + +public class MpdlFilter extends TokenFilter { + + public MpdlFilter(TokenStream in) { + super(in); + } + + public Token next() throws IOException { + return getNext(null); + } + + public Token next(Token reusableToken) throws IOException { + return getNext(reusableToken); + } + + private Token getNext(Token reusableToken) throws IOException { + Token nextToken = null; + if (reusableToken == null) + nextToken = input.next(); + else + nextToken = input.next(reusableToken); + if (nextToken == null) + return null; + char[] termBuffer = nextToken.termBuffer(); + int termBufferLength = nextToken.termLength(); + int newTermBufferLength = 0; + // if a hyphen or a newlineChar or tabChar is in the token it is removed + for(int i=0;iGERMAN_STOP_WORDS). 
+ */ + public MpdlMorphAnalyzer() { + String[] stopWords = getStopWords(language); // stopwords for the language + stopSet = StopFilter.makeStopSet(stopWords); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlMorphAnalyzer(String[] stopwords) { + stopSet = StopFilter.makeStopSet(stopwords); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlMorphAnalyzer(Hashtable stopwords) { + stopSet = new HashSet(stopwords.keySet()); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlMorphAnalyzer(File stopwords) throws IOException { + stopSet = WordlistLoader.getWordSet(stopwords); + } + + public String getLanguage() { + return language; + } + + protected void setLanguage(String lang) { + this.language = lang; + } + + /** + * Get stopwords for the language: fetch them from the open language analyzers for some languages +Taken from: http://www.perseus.tufts.edu/hopper/stopwords +# English: a, a's, able, about, above, according, accordingly, across, actually, after, afterwards, again, against, ain't, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, appear, appreciate, appropriate, are, aren't, around, as, aside, ask, asking, associated, at, available, away, awfully, b, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, both, brief, but, by, c, c'mon, c's, came, can, can't, cannot, cant, cause, causes, certain, certainly, changes, clearly, co, com, come, comes, concerning, consequently, consider, considering, contain, containing, contains, corresponding, could, couldn't, course, currently, d, definitely, described, despite, did, didn't, different, do, does, doesn't, doing, don't, done, down, downwards, during, e, each, edu, eg, eight, either, else, 
elsewhere, enough, entirely, especially, et, etc, even, ever, every, everybody, everyone, everything, everywhere, ex, exactly, example, except, f, far, few, fifth, first, five, followed, following, follows, for, former, formerly, forth, four, from, further, furthermore, g, get, gets, getting, given, gives, go, goes, going, gone, got, gotten, greetings, h, had, hadn't, happens, hardly, has, hasn't, have, haven't, having, he, he's, hello, help, hence, her, here, here's, hereafter, hereby, herein, hereupon, hers, herself, hi, him, himself, his, hither, hopefully, how, howbeit, however, i, i'd, i'll, i'm, i've, ie, if, ignored, immediate, in, inasmuch, inc, indeed, indicate, indicated, indicates, inner, insofar, instead, into, inward, is, isn't, it, it'd, it'll, it's, its, itself, j, just, k, keep, keeps, kept, know, known, knows, l, last, lately, later, latter, latterly, least, less, lest, let, let's, like, liked, likely, little, look, looking, looks, ltd, m, mainly, many, may, maybe, me, mean, meanwhile, merely, might, more, moreover, most, mostly, much, must, my, myself, n, name, namely, nd, near, nearly, necessary, need, needs, neither, never, nevertheless, new, next, nine, no, nobody, non, none, noone, nor, normally, not, nothing, novel, now, nowhere, o, obviously, of, off, often, oh, ok, okay, old, on, once, one, ones, only, onto, or, other, others, otherwise, ought, our, ours, ourselves, out, outside, over, overall, own, p, particular, particularly, per, perhaps, placed, please, plus, possible, presumably, probably, provides, q, que, quite, qv, r, rather, rd, re, really, reasonably, regarding, regardless, regards, relatively, respectively, right, s, said, same, saw, say, saying, says, second, secondly, see, seeing, seem, seemed, seeming, seems, seen, self, selves, sensible, sent, serious, seriously, seven, several, shall, she, should, shouldn't, since, six, so, some, somebody, somehow, someone, something, sometime, sometimes, somewhat, somewhere, soon, sorry, 
specified, specify, specifying, still, sub, such, sup, sure, t, t's, take, taken, tell, tends, th, than, thank, thanks, thanx, that, that's, thats, the, their, theirs, them, themselves, then, thence, there, there's, thereafter, thereby, therefore, therein, theres, thereupon, these, they, they'd, they'll, they're, they've, think, third, this, thorough, thoroughly, those, though, three, through, throughout, thru, thus, to, together, too, took, toward, towards, tried, tries, truly, try, trying, twice, two, u, un, under, unfortunately, unless, unlikely, until, unto, up, upon, us, use, used, useful, uses, using, usually, uucp, v, value, various, very, via, viz, vs, w, want, wants, was, wasn't, way, we, we'd, we'll, we're, we've, welcome, well, went, were, weren't, what, what's, whatever, when, whence, whenever, where, where's, whereafter, whereas, whereby, wherein, whereupon, wherever, whether, which, while, whilst, whither, who, who's, whoever, whole, whom, whose, why, will, willing, wish, with, within, without, won't, wonder, would, wouldn't, x, y, yes, yet, you, you'd, you'll, you're, you've, your, yours, yourself, yourselves, z, zero + +# Greek: a)/llos, a)/n, a)/ra, a)ll', a)lla/, a)po/, au)to/s, d', dai/, dai/s, de/, dh/, dia/, e(autou=, e)/ti, e)a/n, e)gw/, e)k, e)mo/s, e)n, e)pi/, ei), ei)/mi, ei)mi/, ei)s, ga/r, ga^, ge, h(, h)/, kai/, kata/, me/n, meta/, mh/, o(, o(/de, o(/s, o(/stis, o(/ti, oi(, ou(/tws, ou(=tos, ou), ou)/te, ou)=n, ou)de/, ou)dei/s, ou)k, para/, peri/, pro/s, so/s, su/, su/n, ta/, te, th/n, th=s, th=|, ti, ti/, ti/s, tis, to/, to/n, toi/, toiou=tos, tou/s, tou=, tw=n, tw=|, u(mo/s, u(pe/r, u(po/, w(/ste, w(s, w)= + +# Latin: ab, ac, ad, adhic, aliqui, aliquis, an, ante, apud, at, atque, aut, autem, cum, cur, de, deinde, dum, ego, enim, ergo, es, est, et, etiam, etsi, ex, fio, haud, hic, iam, idem, igitur, ille, in, infra, inter, interim, ipse, is, ita, magis, modo, mox, nam, ne, nec, necque, neque, nisi, non, nos, o, ob, per, possum, post, 
pro, quae, quam, quare, qui, quia, quicumque, quidem, quilibet, quis, quisnam, quisquam, quisque, quisquis, quo, quoniam, sed, si, sic, sive, sub, sui, sum, super, suus, tam, tamen, trans, tu, tum, ubi, uel, uero, unus, ut + +# Italian: a, ad, agli, al, alcun, alcuno, all', alla, alle, allo, altra, altre, altri, altro, assai, avere, bene, c', ch', che, chi, ci, cio, co', col, come, con, cosi, cosi\, d', da, dal, dall', dalla, dalle, de, de', degli, dei, del, dell', della, delle, dello, di, duo, e, ed, egli, essere, et, gia, gia\, gli, gran, grande, i, il, in, io, l', la, le, li, lo, ma, maggior, maggiore, mai, mio, molto, ne, ne', nel, nell', nella, nelle, non, o, ogn', ogni, oue, ove, per, perche, piu, piu\, poco, poi, puo, qual, qualche, qualcun, qualcuno, quale, quanta, quante, quanti, quanto, quasi, quella, quelle, quelli, quello, questa, queste, questi, questo, qui, s', se, sempre, senza, si, sotto, su, sua, sue, sui, suo, tal, tanta, tante, tanti, tanto, tra, tre, tutta, tutte, tutti, tutto, un, una, uno, vn, vna, vno + +# German: aber, alle, als, also, am, an, andern, auch, auf, aus, bei, bey, bis, da, daher, das, dass, de, dem, den, der, des, die, diese, dieser, dieses, doch, durch, eben, ein, eine, einem, einen, einer, eines, er, es, fur, gegen, haben, hat, ihre, im, in, ist, kan, man, mehr, mit, nach, nicht, noch, nur, oder, ohne, sehr, sei, selbst, sey, sich, sie, sind, so, uber, um, und, unter, vgl, vom, von, weil, welche, wenn, werden, wie, wird, zu, zur + +# French: a, amp, au, auec, aussi, autre, autres, aux, bien, car, ce, ces, cette, ceux, chose, choses, comme, d', dans, de, des, deux, dire, dont, du, elle, elles, en, encore, est, estre, et, faire, fait, faut, force, grande, ie, il, ils, l', la, le, les, leur, leurs, lors, luy, mais, mesme, n', ne, nous, on, ont, or, ou, par, parce, pas, peut, plus, plusieurs, point, pour, pourquoy, puis, qu', quand, que, qui, quoy, sa, sans, se, ses, si, soit, son, sont, sur, tous, tout, toutes, vn, vne, y + * 
@param language + * @return stopwords + */ + public String[] getStopWords(String language) { + String[] stopwords = new String[0]; + if (language != null) { + if (language.equals("en")) + stopwords = StandardAnalyzer.STOP_WORDS; + else if(language.equals("br")) + stopwords = BrazilianAnalyzer.BRAZILIAN_STOP_WORDS; + else if(language.equals("cz")) + stopwords = CzechAnalyzer.CZECH_STOP_WORDS; + else if(language.equals("de")) + stopwords = GermanAnalyzer.GERMAN_STOP_WORDS; + else if(language.equals("fr")) + stopwords = FrenchAnalyzer.FRENCH_STOP_WORDS; + else if(language.equals("nl")) + stopwords = DutchAnalyzer.DUTCH_STOP_WORDS; + } + return stopwords; + } + + /** + * Builds an exclusionlist from an array of Strings. + */ + public void setStemExclusionTable(String[] exclusionlist) { + exclusionSet = StopFilter.makeStopSet(exclusionlist); + } + + /** + * Builds an exclusionlist from a Hashtable. + */ + public void setStemExclusionTable(Hashtable exclusionlist) { + exclusionSet = new HashSet(exclusionlist.keySet()); + } + + /** + * Builds an exclusionlist from the words contained in the given file. + */ + public void setStemExclusionTable(File exclusionlist) throws IOException { + exclusionSet = WordlistLoader.getWordSet(exclusionlist); + } + + /** + * Creates a TokenStream which tokenizes all the text in the provided Reader. + * + * @return A TokenStream build from a StandardTokenizer filtered with + * StandardFilter, LowerCaseFilter, StopFilter, DonatusStemFilter + */ + public TokenStream tokenStream(String fieldName, Reader reader) { + MpdlNormalizer mpdlNormalizer = new MpdlNormalizer(language); + TokenStream result = new MpdlTokenizer(reader, language, mpdlNormalizer); + result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. 
+ result = new LowerCaseFilter(result); + result = new StopFilter(result, stopSet); + result = new MpdlStemFilter(this, result, exclusionSet); + return result; + } + + public ArrayList getToken(String inputString) { + ArrayList token = new ArrayList(); + try { + Reader reader = new StringReader(inputString); + MpdlNormalizer mpdlNormalizer = new MpdlNormalizer(language); + TokenStream result = new MpdlTokenizer(reader, language, mpdlNormalizer); + result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. + result = new LowerCaseFilter(result); + result = new StopFilter(result, stopSet); + Token t = result.next(); + while (t != null) { + String currentToken = String.valueOf(t.termBuffer()); + token.add(currentToken); + t = result.next(); + } + } catch (IOException e) { + e.printStackTrace(); + } + return token; + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,1078 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.Regularization; +import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.RegularizationManager; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; + +public class MpdlNormalizer { + static final private String IT_VOWELS = "AEIOUaeiou" + + "\u00c6\u00e6" + // AE ligatures + "\u0152\u0153"; // OE ligatures + static final private String IT_CONS = "BCDFGHKLMNPQRSTVWXZ" + + "bcdfghklmnpqrstvwxz" + + "\u017f\u00df"; // long/sharp S + private String[] normFunctionsToUse = {"reg", "norm"}; 
// default is to use all of these normalization functions + private String language; + private int[] offsets; + + public MpdlNormalizer(String[] normFunctionsToUse, String lang) { + this.normFunctionsToUse = normFunctionsToUse; + String language = Language.getInstance().getLanguageId(lang); + this.language = language; + } + + public MpdlNormalizer(String language) { + this.language = language; + } + + /** + * Applies the normalization rules in language to + * s, without offset tracking. + * + * @param s source string + * @return normalized string + */ + public String normalize(String s) throws ApplicationException { + String normStr = s; + if (useRegFunction()) { + // try to regularize the string to the norm form over predefined regularizations + RegularizationManager regManager = RegularizationManager.getInstance(); + ArrayList regs = regManager.findRegsByOrig(language, s); + if (regs != null && regs.size() > 0) { + Regularization reg = regs.get(0); // only one: the first one + String regNormStr = reg.getNorm(); + normStr = regNormStr; + } + } + if (useNormFunction()) { + // normalize the string by string replace + normStr = normalize(normStr, null); + } + return normStr; + } + + private boolean useRegFunction() { + boolean useReg = false; + for (int i=0; i< normFunctionsToUse.length; i++) { + String function = normFunctionsToUse[i]; + if (function.equals("reg")) + return true; + } + return useReg; + } + + private boolean useNormFunction() { + boolean useNorm = false; + for (int i=0; i< normFunctionsToUse.length; i++) { + String function = normFunctionsToUse[i]; + if (function.equals("norm")) + return true; + } + return useNorm; + } + + /** + * Applies the normalization rules in language to + * s, with offset tracking.

+ * + * WARNING: + * Arboreal will not work properly if a normalization substitution + * replaces a source character with more than two target characters! + * This is simply a BUG, and should be fixed. Fortunately, however, + * one does not often need such a replacement.

+ * + * @param s source string + * @param offsets character offset table + * @return normalized string + */ + public String normalize(String s, int[] offsets) { + this.offsets = offsets; + if (language.equals("la") || language.equals("lat")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case 'j': replace = "i"; break; + case 'v': replace = "u"; break; + /* + * Linguistic note: /u/ and /v/ are rarely phonemic + * in Latin, as in alui 's/he nourished' vs. + * alvi 'of a belly', volui 's/he wished' or 'it rolled' + * vs. volvi 'to be rolled', (in)seruit 's/he joined + * together' vs. (in)servit 's/he serves'. + */ + case 'q': + if ((i < s.length() - 1) && (s.charAt(i + 1) == ';')) + replace = "qu"; + else + replace = "q"; + break; + case ';': + if ((i > 0) && (s.charAt(i - 1) == 'q')) + replace = "e"; + else if ((i == 0) || ! Character.isLetter(s.charAt(i - 1))) + replace = ";"; + else + replace = ""; + break; + case '\u0300': replace = ""; break; // COMBINING GRAVE ACCENT + case '\u0301': replace = ""; break; // COMBINING ACCUTE ACCENT + case '\u0302': replace = ""; break; // COMBINING CIRCUMFLEX ACCENT + + case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE + case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE + case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX + case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS + case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E + case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA + case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE + case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE + case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX + case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS + case '\u00cc': replace = 
"I"; break; // LATIN CAPITAL LETTER I GRAVE; + case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE + case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX + case '\u00cf': replace = "I"; break; // LATIN CAPITAL LETTER I DIAERESIS + case '\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE + case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE + case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX + case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS + case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE + case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE + case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX + case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS + case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE + case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE + case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX + case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS + case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E + case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA + case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE + case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE + case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX + case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS + case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE + case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE + case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX + case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS + case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE + case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O ACUTE + case '\u00f4': 
replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX + case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS + case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE + case '\u00fa': replace = "u"; break; // LATIN SMALL LETTER U ACUTE + case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX + case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS + case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON + case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON + case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE + case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE + case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON + case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON + case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE + case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE + case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK + case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK + case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON + case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON + case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE + case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE + case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON + case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON + case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE + case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE + case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E + case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E + case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON + case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON + case '\u016c': replace = "U"; 
break; // LATIN CAPITAL LETTER U BREVE + case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE + case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S + case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S + case '\u00ad': break; // soft hyphen + // new in MPDL project by J. Willenborg + case '\u1e14': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e15': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e16': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e17': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e18': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e19': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e1a': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e1b': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e1c': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1e1d': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1eb8': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1eb9': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1eba': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ebb': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ebc': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ebd': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ebe': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ebf': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec0': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec1': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec2': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec3': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec4': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec5': replace = "e"; break; // LATIN ... 
LETTER E WITH ... + case '\u1ec6': replace = "e"; break; // LATIN ... LETTER E WITH ... + case '\u1ec7': replace = "e"; break; // LATIN ... LETTER E WITH ... + // by Malcolm + case '\u2329': break; // BRA + case '\u232a': break; // KET + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("it")) { + // new Mpdl code: added by J. Willenborg: some of Malcolms code did not work without errors so it has to be taken away, also all latin stuff is imported + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE + case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE + case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX + case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS + case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E + case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA + case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE + case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE + case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX + case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS + case '\u00cc': replace = "I"; break; // LATIN CAPITAL LETTER I GRAVE; + case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE + case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX + case '\u00cf': replace = "I"; break; // LATIN CAPITAL LETTER I DIAERESIS + 
case '\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE + case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE + case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX + case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS + case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE + case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE + case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX + case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS + case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE + case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE + case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX + case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS + case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E + case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA + case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE + case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE + case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX + case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS + case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE + case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE + case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX + case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS + case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE + case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O ACUTE + case '\u00f4': replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX + case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS + case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE + case '\u00fa': replace = "u"; break; // LATIN SMALL LETTER U ACUTE 
+ case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX + case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS + case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON + case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON + case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE + case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE + case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON + case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON + case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE + case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE + case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK + case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK + case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON + case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON + case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE + case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE + case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON + case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON + case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE + case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE + case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E + case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E + case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON + case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON + case '\u016c': replace = "U"; break; // LATIN CAPITAL LETTER U BREVE + case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE + case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S + case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S + // new in MPDL project 
by J. Willenborg + case '\u1e8d': replace = "e"; break; // LATIN SMALL LETTER E WITH TILDE + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + // new Mpdl code: added by J. Willenborg: most of the latin replacements also in english + } else if (language.equals("en")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u0300': replace = ""; break; // COMBINING GRAVE ACCENT + case '\u0301': replace = ""; break; // COMBINING ACCUTE ACCENT + case '\u0302': replace = ""; break; // COMBINING CIRCUMFLEX ACCENT + + case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE + case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE + case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX + case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS + case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E + case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA + case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE + case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE + case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX + case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS + case '\u00cc': replace = "I"; break; // LATIN CAPITAL LETTER I GRAVE; + case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE + case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX + case '\u00cf': replace = "I"; break; // LATIN CAPITAL LETTER I DIAERESIS + case 
'\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE + case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE + case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX + case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS + case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE + case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE + case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX + case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS + case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE + case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE + case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX + case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS + case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E + case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA + case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE + case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE + case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX + case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS + case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE + case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE + case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX + case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS + case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE + case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O ACUTE + case '\u00f4': replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX + case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS + case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE + case '\u00fa': replace = "u"; break; // LATIN SMALL LETTER U ACUTE + 
case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX + case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS + case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON + case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON + case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE + case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE + case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON + case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON + case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE + case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE + case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK + case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK + case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON + case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON + case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE + case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE + case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON + case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON + case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE + case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE + case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E + case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E + case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON + case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON + case '\u016c': replace = "U"; break; // LATIN CAPITAL LETTER U BREVE + case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE + case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S + case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S + // new in MPDL project by 
J. Willenborg + case '\u1e8d': replace = "e"; break; // LATIN SMALL LETTER E WITH TILDE + // by Malcolm + case '\u00ad': break; // soft hyphen + case '\u2329': break; // BRA + case '\u232a': break; // KET + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("fr")) { + // new Mpdl code: added by J. Willenborg: some of Malcolms code did not work without errors so it has to be taken away + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E + case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S + case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S + case '\u00ad': break; // soft hyphen + case '-': break; + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("de")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u00c4': replace = "Ae"; break; + case '\u00d6': replace = "Oe"; break; + case '\u00dc': replace = "Ue"; break; + case '\u00df': replace = "ss"; break; + case '\u00e4': replace = "ae"; break; + case '\u00f6': replace = "oe"; break; 
+ case '\u00fc': replace = "ue"; break; + case '\u00e9': replace = "e"; break; + case '\u00ad': break; // soft hyphen + case '-': break; + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("zh")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u00b9': replace = "1"; break; + case '\u00b2': replace = "2"; break; + case '\u00b3': replace = "3"; break; + case '\u2074': replace = "4"; break; + case '\u2075': replace = "5"; break; + // original by Malcolm Hyman: with the following replacements // TODO uncomment these 3 lines + // case '\u3000': replace = " "; break; + // case '\u3001': replace = ","; break; + // case '\u3002': replace = "."; break; + // case '\u200b': break; // BREAKS EVERYTHING! 
+ default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("akk") || + language.equals("qam") || + language.equals("qpc") || + language.equals("elx") || + language.equals("sux") || + language.equals("hit") || + language.equals("qhu") || + language.equals("peo") || + language.equals("uga") || + language.equals("ura") || + language.equals("qcu")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + char last = '\u0000'; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + c = Character.toLowerCase(c); + String replace = new String(); + switch (c) { + case '{': replace += "-"; break; + case '}': replace += "-"; break; + // These are from PSD::ATF::Unicode by Steve Tinney + case '\u0161': replace += "sz"; break; + case '\u1e63': replace += "s,"; break; + case '\u1e6d': replace += "t,"; break; + case '\u014b': replace += "j"; break; + case '\u015b': replace += "s'"; break; + case '\u2080': replace += "0"; break; + case '\u2081': replace += "1"; break; + case '\u2082': replace += "2"; break; + case '\u2083': replace += "3"; break; + case '\u2084': replace += "4"; break; + case '\u2085': replace += "5"; break; + case '\u2086': replace += "6"; break; + case '\u2087': replace += "7"; break; + case '\u2088': replace += "8"; break; + case '\u2089': replace += "9"; break; + + case 'c': // shin (except where used as modifier) + if ((i > 0) && ((last == '~') || (last == '@'))) + replace += "c"; + else replace += "sz"; + break; + default: replace += c; break; + } + // suppress grapheme boundary before or after word boundary + if (replace.equals("-")) { + if ((i + 1 == s.length()) || (s.charAt(i + 1) == ' ') || (i == 
0) || (s.charAt(i - 1) == ' ')) + replace = ""; + } + last = c; + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("el") || language.equals("grc")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u03c2': replace = "\u03c3"; break; // GREEK SMALL LETTER FINAL SIGMA + case '<': break; + case '>': break; + case '[': break; + case ']': break; + case '1': break; + case '2': break; + case '\u03ac': replace = "\u1f71"; break; + case '\u03ad': replace = "\u1f73"; break; + case '\u03ae': replace = "\u1f75"; break; + case '\u03af': replace = "\u1f77"; break; + case '\u03cc': replace = "\u1f79"; break; + case '\u03cd': replace = "\u1f7b"; break; + case '\u03ce': replace = "\u1f7d"; break; + case '-': break; // same treatment as soft hyphen + case '\u00ad': break; // soft hyphen + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else if (language.equals("el_atonic")) { + StringBuffer buf = new StringBuffer(); + int n = 0; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + String replace = new String(); + switch (c) { + case '\u03c2': replace = "\u03c3"; break; // GREEK SMALL LETTER FINAL SIGMA + // map characters with diacritics to their plain equivalent + // cf. 
BetaCode.java + case '\u03aa': replace = "\u0399"; break; + case '\u03ab': replace = "\u03a5"; break; + case '\u03ac': replace = "\u0381"; break; + case '\u03ad': replace = "\u0385"; break; + case '\u03ae': replace = "\u0387"; break; + case '\u03af': replace = "\u0389"; break; + case '\u03ca': replace = "\u03b9"; break; + case '\u03cb': replace = "\u03c5"; break; + case '\u03cc': replace = "\u03bf"; break; + case '\u03cd': replace = "\u03c5"; break; + case '\u03ce': replace = "\u03c9"; break; + case '\u1f00': replace = "\u03b1"; break; + case '\u1f01': replace = "\u03b1"; break; + case '\u1f02': replace = "\u03b1"; break; + case '\u1f03': replace = "\u03b1"; break; + case '\u1f04': replace = "\u03b1"; break; + case '\u1f05': replace = "\u03b1"; break; + case '\u1f06': replace = "\u03b1"; break; + case '\u1f07': replace = "\u03b1"; break; + case '\u1f08': replace = "\u0391"; break; + case '\u1f09': replace = "\u0391"; break; + case '\u1f0a': replace = "\u0391"; break; + case '\u1f0b': replace = "\u0391"; break; + case '\u1f0c': replace = "\u0391"; break; + case '\u1f0d': replace = "\u0391"; break; + case '\u1f0e': replace = "\u0391"; break; + case '\u1f0f': replace = "\u0391"; break; + case '\u1f10': replace = "\u03b5"; break; + case '\u1f11': replace = "\u03b5"; break; + case '\u1f12': replace = "\u03b5"; break; + case '\u1f13': replace = "\u03b5"; break; + case '\u1f14': replace = "\u03b5"; break; + case '\u1f15': replace = "\u03b5"; break; + case '\u1f18': replace = "\u0395"; break; + case '\u1f19': replace = "\u0395"; break; + case '\u1f1a': replace = "\u0395"; break; + case '\u1f1b': replace = "\u0395"; break; + case '\u1f1c': replace = "\u0395"; break; + case '\u1f1d': replace = "\u0395"; break; + case '\u1f20': replace = "\u03b7"; break; + case '\u1f21': replace = "\u03b7"; break; + case '\u1f22': replace = "\u03b7"; break; + case '\u1f23': replace = "\u03b7"; break; + case '\u1f24': replace = "\u03b7"; break; + case '\u1f25': replace = "\u03b7"; break; + 
case '\u1f26': replace = "\u03b7"; break; + case '\u1f27': replace = "\u03b7"; break; + case '\u1f28': replace = "\u0397"; break; + case '\u1f29': replace = "\u0397"; break; + case '\u1f2a': replace = "\u0397"; break; + case '\u1f2b': replace = "\u0397"; break; + case '\u1f2c': replace = "\u0397"; break; + case '\u1f2d': replace = "\u0397"; break; + case '\u1f2e': replace = "\u0397"; break; + case '\u1f2f': replace = "\u0397"; break; + case '\u1f30': replace = "\u03b9"; break; + case '\u1f31': replace = "\u03b9"; break; + case '\u1f32': replace = "\u03b9"; break; + case '\u1f33': replace = "\u03b9"; break; + case '\u1f34': replace = "\u03b9"; break; + case '\u1f35': replace = "\u03b9"; break; + case '\u1f36': replace = "\u03b9"; break; + case '\u1f37': replace = "\u03b9"; break; + case '\u1f38': replace = "\u0399"; break; + case '\u1f39': replace = "\u0399"; break; + case '\u1f3a': replace = "\u0399"; break; + case '\u1f3b': replace = "\u0399"; break; + case '\u1f3c': replace = "\u0399"; break; + case '\u1f3d': replace = "\u0399"; break; + case '\u1f3e': replace = "\u0399"; break; + case '\u1f3f': replace = "\u0399"; break; + case '\u1f40': replace = "\u03bf"; break; + case '\u1f41': replace = "\u03bf"; break; + case '\u1f42': replace = "\u03bf"; break; + case '\u1f43': replace = "\u03bf"; break; + case '\u1f44': replace = "\u03bf"; break; + case '\u1f45': replace = "\u03bf"; break; + case '\u1f48': replace = "\u039f"; break; + case '\u1f49': replace = "\u039f"; break; + case '\u1f4a': replace = "\u039f"; break; + case '\u1f4b': replace = "\u039f"; break; + case '\u1f4c': replace = "\u039f"; break; + case '\u1f4d': replace = "\u039f"; break; + case '\u1f50': replace = "\u03c5"; break; + case '\u1f51': replace = "\u03c5"; break; + case '\u1f52': replace = "\u03c5"; break; + case '\u1f53': replace = "\u03c5"; break; + case '\u1f54': replace = "\u03c5"; break; + case '\u1f55': replace = "\u03c5"; break; + case '\u1f56': replace = "\u03c5"; break; + case '\u1f57': 
replace = "\u03c5"; break; + case '\u1f58': replace = "\u03a5"; break; + case '\u1f59': replace = "\u03a5"; break; + case '\u1f5a': replace = "\u03a5"; break; + case '\u1f5b': replace = "\u03a5"; break; + case '\u1f5c': replace = "\u03a5"; break; + case '\u1f5d': replace = "\u03a5"; break; + case '\u1f5e': replace = "\u03a5"; break; + case '\u1f5f': replace = "\u03a5"; break; + case '\u1f60': replace = "\u03c9"; break; + case '\u1f61': replace = "\u03c9"; break; + case '\u1f62': replace = "\u03c9"; break; + case '\u1f63': replace = "\u03c9"; break; + case '\u1f64': replace = "\u03c9"; break; + case '\u1f65': replace = "\u03c9"; break; + case '\u1f66': replace = "\u03c9"; break; + case '\u1f67': replace = "\u03c9"; break; + case '\u1f68': replace = "\u03a9"; break; + case '\u1f69': replace = "\u03a9"; break; + case '\u1f6a': replace = "\u03a9"; break; + case '\u1f6b': replace = "\u03a9"; break; + case '\u1f6c': replace = "\u03a9"; break; + case '\u1f6d': replace = "\u03a9"; break; + case '\u1f6e': replace = "\u03a9"; break; + case '\u1f6f': replace = "\u03a9"; break; + case '\u1f70': replace = "\u03b1"; break; + case '\u1f71': replace = "\u03b1"; break; + case '\u1f72': replace = "\u03b5"; break; + case '\u1f73': replace = "\u03b5"; break; + case '\u1f74': replace = "\u03b7"; break; + case '\u1f75': replace = "\u03b7"; break; + case '\u1f76': replace = "\u03b9"; break; + case '\u1f77': replace = "\u03b9"; break; + case '\u1f78': replace = "\u03bf"; break; + case '\u1f79': replace = "\u03bf"; break; + case '\u1f7a': replace = "\u03c5"; break; + case '\u1f7b': replace = "\u03c5"; break; + case '\u1f7c': replace = "\u03c9"; break; + case '\u1f7d': replace = "\u03c9"; break; + case '\u1f80': replace = "\u03b1"; break; + case '\u1f81': replace = "\u03b1"; break; + case '\u1f82': replace = "\u03b1"; break; + case '\u1f83': replace = "\u03b1"; break; + case '\u1f84': replace = "\u03b1"; break; + case '\u1f85': replace = "\u03b1"; break; + case '\u1f86': replace = "\u03b1"; 
break; + case '\u1f87': replace = "\u03b1"; break; + case '\u1f88': replace = "\u0391"; break; + case '\u1f89': replace = "\u0391"; break; + case '\u1f8a': replace = "\u0391"; break; + case '\u1f8b': replace = "\u0391"; break; + case '\u1f8c': replace = "\u0391"; break; + case '\u1f8d': replace = "\u0391"; break; + case '\u1f8e': replace = "\u0391"; break; + case '\u1f8f': replace = "\u0391"; break; + case '\u1f90': replace = "\u03b7"; break; + case '\u1f91': replace = "\u03b7"; break; + case '\u1f92': replace = "\u03b7"; break; + case '\u1f93': replace = "\u03b7"; break; + case '\u1f94': replace = "\u03b7"; break; + case '\u1f95': replace = "\u03b7"; break; + case '\u1f96': replace = "\u03b7"; break; + case '\u1f97': replace = "\u03b7"; break; + case '\u1f98': replace = "\u0397"; break; + case '\u1f99': replace = "\u0397"; break; + case '\u1f9a': replace = "\u0397"; break; + case '\u1f9b': replace = "\u0397"; break; + case '\u1f9c': replace = "\u0397"; break; + case '\u1f9d': replace = "\u0397"; break; + case '\u1f9e': replace = "\u0397"; break; + case '\u1f9f': replace = "\u0397"; break; + case '\u1fa0': replace = "\u03c9"; break; + case '\u1fa1': replace = "\u03c9"; break; + case '\u1fa2': replace = "\u03c9"; break; + case '\u1fa3': replace = "\u03c9"; break; + case '\u1fa4': replace = "\u03c9"; break; + case '\u1fa5': replace = "\u03c9"; break; + case '\u1fa6': replace = "\u03c9"; break; + case '\u1fa7': replace = "\u03c9"; break; + case '\u1fa8': replace = "\u03a9"; break; + case '\u1fa9': replace = "\u03a9"; break; + case '\u1faa': replace = "\u03a9"; break; + case '\u1fab': replace = "\u03a9"; break; + case '\u1fac': replace = "\u03a9"; break; + case '\u1fad': replace = "\u03a9"; break; + case '\u1fae': replace = "\u03a9"; break; + case '\u1faf': replace = "\u03a9"; break; + case '\u1fb2': replace = "\u03b1"; break; + case '\u1fb3': replace = "\u03b1"; break; + case '\u1fb4': replace = "\u03b1"; break; + case '\u1fb6': replace = "\u03b1"; break; + case 
'\u1fb7': replace = "\u03b1"; break; + case '\u1fba': replace = "\u0391"; break; + case '\u1fbb': replace = "\u0391"; break; + case '\u1fbc': replace = "\u0391"; break; + case '\u1fc2': replace = "\u03b7"; break; + case '\u1fc3': replace = "\u03b7"; break; + case '\u1fc4': replace = "\u03b7"; break; + case '\u1fc6': replace = "\u03b7"; break; + case '\u1fc7': replace = "\u03b7"; break; + case '\u1fca': replace = "\u0397"; break; + case '\u1fcb': replace = "\u0397"; break; + case '\u1fcc': replace = "\u0397"; break; + case '\u1fd2': replace = "\u03b9"; break; + case '\u1fd3': replace = "\u03b9"; break; + case '\u1fd6': replace = "\u03b9"; break; + case '\u1fd7': replace = "\u03b9"; break; + case '\u1fda': replace = "\u0399"; break; + case '\u1fdb': replace = "\u039f"; break; + case '\u1fe2': replace = "\u03c5"; break; + case '\u1fe3': replace = "\u03c5"; break; + case '\u1fe4': replace = "\u03c1"; break; + case '\u1fe5': replace = "\u03c1"; break; + case '\u1fe6': replace = "\u03c5"; break; + case '\u1fe7': replace = "\u03c5"; break; + case '\u1fea': replace = "\u03a5"; break; + case '\u1feb': replace = "\u03a5"; break; + case '\u1fec': replace = "\u03a1"; break; + case '\u1ff2': replace = "\u03c9"; break; + case '\u1ff3': replace = "\u03c9"; break; + case '\u1ff4': replace = "\u03c9"; break; + case '\u1ff6': replace = "\u03c9"; break; + case '\u1ff7': replace = "\u03c9"; break; + case '\u1ff8': replace = "\u039f"; break; + case '\u1ff9': replace = "\u039f"; break; + case '\u1ffa': replace = "\u03a9"; break; + case '\u1ffb': replace = "\u03a9"; break; + case '\u1ffc': replace = "\u03a9"; break; + + case '<': break; + case '>': break; + case '-': break; // same treatment as soft hyphen + case '\u00ad': break; // soft hyphen + default: replace += c; break; + } + buf.append(replace); + // update offsets if replacement is a different length + if (offsets != null) { + int r = replace.length(); + if (r == 0) + this.offsets = arrayKill(this.offsets, i - n); + else if (r == 
2) + this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); + n += 1 - r; + } + } + return buf.toString(); + } else { // unknown or no language + return s; + } + } + + public String deNormalizeToRegExpr(String s) { + // TODO all characters in all languages + if (language.equals("la") || language.equals("lat")) { + StringBuffer buf = new StringBuffer(); + if (s.indexOf("ae") != -1) { + String str1 = s; + str1 = str1.replaceAll("ae", "\u0119"); + String str2 = s; + str2 = str2.replaceAll("ae", "\u00c6"); + String str3 = s; + str3 = str3.replaceAll("ae", "\u00e6"); + buf.append(str1 + "|" + str2 + "|" + str3 + "|"); + } + if (s.indexOf("oe") != -1) { + String str1 = s; + str1 = str1.replaceAll("oe", "\u0152"); + String str2 = s; + str2 = str2.replaceAll("oe", "\u0153"); + buf.append(str1 + "|" + str2 + "|"); + } + if (s.indexOf("ss") != -1) { + String str1 = s; + str1 = str1.replaceAll("ss", "\u00df"); + buf.append(str1 + "|"); + } + boolean beginWord = true; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + if (! 
beginWord) + c = Character.toLowerCase(c); + beginWord = Character.isWhitespace(c); + String replace = new String(); + switch (c) { + case 'a': replace = "[a\u00c0\u00c1\u00c2\u00c4\u00e0\u00e1\u00e2\u00e4]"; break; + case 'c': replace = "[c\u00c7\u00e7]"; break; + case 'e': replace = "[e\u00c8\u00c9\u00ca\u00cb\u00e8\u00e9\u00ea\u00eb\u0113\u0115\u1ebd]"; break; + case 'i': replace = "[ij\u00cc\u00cd\u00ce\u00cf\u00ec\u00ed\u00ee\u00ef\u012a\u012b\u012c\u012d]"; break; + case 'o': replace = "[o\u00d2\u00d3\u00d4\u00d6\u00f2\u00f3\u00f4\u00f6\u014c\u014d\u014e\u014f]"; break; + case 'u': replace = "[uv\u00d9\u00da\u00db\u00dc\u00f9\u00fa\u00fb\u00fc\u016a\u016b\u016c\u016d]"; break; + case 's': replace = "[s\u017f]"; break; + default: replace += c; break; + } + buf.append(replace); + } + return buf.toString(); + } else if (language.equals("en")) { + StringBuffer buf = new StringBuffer(); + if (s.indexOf("ae") != -1) { + String str1 = s; + str1 = str1.replaceAll("ae", "\u0119"); + String str2 = s; + str2 = str2.replaceAll("ae", "\u00c6"); + String str3 = s; + str3 = str3.replaceAll("ae", "\u00e6"); + buf.append(str1 + "|" + str2 + "|" + str3 + "|"); + } + if (s.indexOf("oe") != -1) { + String str1 = s; + str1 = str1.replaceAll("oe", "\u0152"); + String str2 = s; + str2 = str2.replaceAll("oe", "\u0153"); + buf.append(str1 + "|" + str2 + "|"); + } + if (s.indexOf("ss") != -1) { + String str1 = s; + str1 = str1.replaceAll("ss", "\u00df"); + buf.append(str1 + "|"); + } + boolean beginWord = true; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + if (! 
beginWord) + c = Character.toLowerCase(c); + beginWord = Character.isWhitespace(c); + String replace = new String(); + switch (c) { + case 'a': replace = "[a\u00c0\u00c1\u00c2\u00c4\u00e0\u00e1\u00e2\u00e4]"; break; + case 'c': replace = "[c\u00c7\u00e7]"; break; + case 'e': replace = "[e\u00c8\u00c9\u00ca\u00cb\u00e8\u00e9\u00ea\u00eb\u0113\u0115\u1e8d]"; break; + case 'i': replace = "[i\u00cc\u00cd\u00ce\u00cf\u00ec\u00ed\u00ee\u00ef\u012a\u012b\u012c\u012d]"; break; + case 'o': replace = "[o\u00d2\u00d3\u00d4\u00d6\u00f2\u00f3\u00f4\u00f6\u014c\u014d\u014e\u014f]‚"; break; + case 'u': replace = "[u\u00d9\u00da\u00db\u00dc\u00f9\u00fa\u00fb\u00fc\u016a\u016b\u016c\u016d]"; break; + case 's': replace = "[s\u017f]"; break; + default: replace += c; break; + } + buf.append(replace); + } + return buf.toString(); + } else if (language.equals("de")) { + StringBuffer buf = new StringBuffer(); + if (s.indexOf("ss") != -1) { + String str1 = s; + str1 = str1.replaceAll("ss", "\u00df"); + buf.append(str1 + "|"); + } + if (s.indexOf("ae") != -1) { + String str1 = s; + str1 = str1.replaceAll("ae", "\u00e4"); + buf.append(str1 + "|"); + } + if (s.indexOf("oe") != -1) { + String str1 = s; + str1 = str1.replaceAll("oe", "\u00f6"); + buf.append(str1 + "|"); + } + if (s.indexOf("ue") != -1) { + String str1 = s; + str1 = str1.replaceAll("ue", "\u00fc"); + buf.append(str1 + "|"); + } + boolean beginWord = true; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + if (! beginWord) + c = Character.toLowerCase(c); + beginWord = Character.isWhitespace(c); + String replace = new String(); + switch (c) { + case 'e': replace = "[e\u00e9]"; break; + default: replace += c; break; + } + buf.append(replace); + } + return buf.toString(); + } else { // unknown or no language + return s; + } + } + + /** + * Returns the offset table. 
+ * + * @return offset table + */ + public int[] getOffsetTable() { + return offsets; + } + + /** + * Returns a copy of an integer array with the element at + * index removed ("killed"). + * + * @param array integer array + * @param index index of element to remove + */ + static private int[] arrayKill(int[] array, int index) { + int[] newArray = new int[array.length - 1]; + System.arraycopy(array, 0, newArray, 0, index); + System.arraycopy(array, index + 1, newArray, index, array.length - index - 1); + return newArray; + } + + /** + * Returns a copy of an integer array with count elements + * inserted at index. + * + * @param array integer array + * @param index index to insert new elements + * @param value value to insert into new slots + * @param count number of new slots to insert + */ + static private int[] arrayInsert(int[] array, int index, int value, int count) { + int[] newArray = new int[array.length + count]; + System.arraycopy(array, 0, newArray, 0, index); + for (int i = 0; i < count; i++) newArray[index + i] = value; + System.arraycopy(array, index, newArray, index + count, array.length - index); + return newArray; + } + + /** + * We provide main() so that our services will be available + * outside Java (i.e., so we can run as a Un*x-style filter). 
+ */ + static public void main(String[] argv) throws ApplicationException { + if (argv.length != 1) { + System.err.println("You must specify a language."); + System.exit(1); + } + String rec; + StringBuffer buf = new StringBuffer(); + BufferedReader bin = null; + try { + bin = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); + while ((rec = bin.readLine()) != null) + buf.append(rec + "\n"); + } + catch (UnsupportedEncodingException e) { + System.err.println(e); + System.exit(1); + } catch (IOException e) { + System.err.println(e); + System.exit(1); + } + MpdlNormalizer orth = new MpdlNormalizer(argv[0]); + System.out.print(orth.normalize(buf.toString())); + } +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStandardAnalyzer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStandardAnalyzer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,103 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; + +import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +import java.io.File; +import java.io.IOException; +import java.io.Reader; +import java.util.Set; + +/** + * StandardAnalyzer which is case insensitive (no LowerCaseFilter in method tokenStream + * and reusableTokenStream) + * + */ +public class MpdlStandardAnalyzer extends Analyzer { + private Set stopSet; + /** An array containing some common English words that are usually not + useful for searching. */ + public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS; + + /** Builds an analyzer with the default stop words ({@link #STOP_WORDS}). */ + public MpdlStandardAnalyzer() { + this(STOP_WORDS); + } + + /** Builds an analyzer with the given stop words. 
*/ + public MpdlStandardAnalyzer(Set stopWords) { + stopSet = stopWords; + } + + /** Builds an analyzer with the given stop words. */ + public MpdlStandardAnalyzer(String[] stopWords) { + stopSet = StopFilter.makeStopSet(stopWords); + } + + /** Builds an analyzer with the stop words from the given file. + * @see WordlistLoader#getWordSet(File) + */ + public MpdlStandardAnalyzer(File stopwords) throws IOException { + stopSet = WordlistLoader.getWordSet(stopwords); + } + + /** Builds an analyzer with the stop words from the given reader. + * @see WordlistLoader#getWordSet(Reader) + */ + public MpdlStandardAnalyzer(Reader stopwords) throws IOException { + stopSet = WordlistLoader.getWordSet(stopwords); + } + + /** Constructs a {@link StandardTokenizer} filtered by a {@link + StandardFilter}, not a {@link LowerCaseFilter} and a {@link StopFilter}. */ + public TokenStream tokenStream(String fieldName, Reader reader) { + StandardTokenizer tokenStream = new StandardTokenizer(reader); + tokenStream.setMaxTokenLength(maxTokenLength); + TokenStream result = new StandardFilter(tokenStream); + result = new StopFilter(result, stopSet); + return result; + } + + private static final class SavedStreams { + StandardTokenizer tokenStream; + TokenStream filteredTokenStream; + } + + /** Default maximum allowed token length */ + public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; + + private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; + + /** + * Set maximum allowed token length. If a token is seen + * that exceeds this length then it is discarded. This + * setting only takes effect the next time tokenStream or + * reusableTokenStream is called. 
+ */ + public void setMaxTokenLength(int length) { + maxTokenLength = length; + } + + /** + * @see #setMaxTokenLength + */ + public int getMaxTokenLength() { + return maxTokenLength; + } + + public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + setPreviousTokenStream(streams); + streams.tokenStream = new StandardTokenizer(reader); + streams.filteredTokenStream = new StandardFilter(streams.tokenStream); + streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet); + } else { + streams.tokenStream.reset(reader); + } + streams.tokenStream.setMaxTokenLength(maxTokenLength); + return streams.filteredTokenStream; + } +} \ No newline at end of file
diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStemFilter.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStemFilter.java Wed Nov 24 17:24:23 2010 +0100
@@ -0,0 +1,70 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.IOException;
+import java.util.Set;
+
+/**
+ * Token filter that replaces the text of each token by the stem which
+ * {@link MpdlStemmer} delivers for it. Tokens whose text is contained in the
+ * exclusion set are passed through unchanged.
+ */
+public final class MpdlStemFilter extends TokenFilter {
+  // stored by the three-argument constructor; not read anywhere in this class
+  private MpdlMorphAnalyzer analyzer;
+  // token most recently pulled from the wrapped stream
+  private Token token = null;
+  private MpdlStemmer stemmer = null;
+  // terms that must not be stemmed; null switches the check off
+  private Set exclusionSet = null;
+
+  /** Wraps <code>in</code> and stems with a freshly created {@link MpdlStemmer}. */
+  public MpdlStemFilter(TokenStream in) {
+    super(in);
+    this.stemmer = new MpdlStemmer();
+  }
+
+  /**
+   * Wraps <code>in</code>, stems in the language of <code>analyzer</code> and
+   * leaves the terms in <code>exclusionSet</code> untouched.
+   */
+  public MpdlStemFilter(MpdlMorphAnalyzer analyzer, TokenStream in, Set exclusionSet) {
+    this(in);
+    this.analyzer = analyzer;
+    this.exclusionSet = exclusionSet;
+    this.stemmer.setLanguage(analyzer.getLanguage());
+  }
+
+  /**
+   * Returns the next token of the wrapped stream with its text stemmed, or
+   * null at the end of the stream.
+   */
+  public final Token next() throws IOException {
+    token = input.next();
+    if (token == null)
+      return null;
+    String text = token.termText();
+    if (exclusionSet != null && exclusionSet.contains(text))
+      return token;
+    String stemmed = stemmer.stem(text);
+    // only an actually changed text justifies the cost of a new token
+    if (stemmed.equals(text))
+      return token;
+    return new Token(stemmed, token.startOffset(), token.endOffset(), token.type());
+  }
+
+  /** Replaces the stemmer; a null argument is ignored. */
+  public void setStemmer(MpdlStemmer stemmer) {
+    if (stemmer == null)
+      return;
+    this.stemmer = stemmer;
+  }
+
+  /** Replaces the set of terms that are excluded from stemming. */
+  public void setExclusionSet(Set exclusionSet) {
+    this.exclusionSet = exclusionSet;
+  }
+}
diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStemmer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStemmer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,159 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; + +import java.util.ArrayList; + +import org.apache.log4j.Logger; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache; + +public class MpdlStemmer { + private String language = MpdlConstants.DEFAULT_LANGUAGE; + + protected void setLanguage(String language) { + this.language = language; + } + + /** + * Used for indexing documents and for querying + * @param term + * @return + */ + protected String stem(String term) { + String stem = null; + term = term.toLowerCase(); + // special case: term is already lemma: begins with "lemmalemma" + if (term.startsWith("lemmalemma")) + return term; + // try to find the stem by the MorphologyCache + ArrayList lemmas = null; + try { + MorphologyCache morphologyCache = MorphologyCache.getInstance(); + lemmas = morphologyCache.getLemmasByFormName(language, term, false); // do
not normalize again, already done + } catch (ApplicationException e) { + // nothing, do not disturb + } + if (lemmas != null && ! lemmas.isEmpty()) { + if (lemmas.size() == 1) { + stem = lemmas.get(0).getLemmaName(); + } else { + stem = ""; + for (int i=0; i 2 characters) then add this Snowball form to the dynamic morphology cache + if ((! stem.equals(term)) && stem.length() > 2) { + try { + MorphologyCache morphologyCache = MorphologyCache.getInstance(); + if (morphologyCache.getMode() == MorphologyCache.DOCUMENT_MODE) { + Form newForm = new Form("snowball", language, term); + newForm.setLemmaName(stem); + morphologyCache.insertFormDynamic(newForm); + } + } catch (ApplicationException e) { + Logger.getLogger(MpdlStemmer.class).warn("MorphologyCache: an exception was caught while indexing a document: " + e.getMessage(), e); + } + } + } + return stem; + } + + private String stemBySnowball(String term, String language) { + String stem = null; + if (language.equals("de")) { + net.sf.snowball.ext.GermanStemmer stemmer = new net.sf.snowball.ext.GermanStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("en")) { + net.sf.snowball.ext.EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("nl")) { + net.sf.snowball.ext.DutchStemmer stemmer = new net.sf.snowball.ext.DutchStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("fi")) { + net.sf.snowball.ext.FinnishStemmer stemmer = new net.sf.snowball.ext.FinnishStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("fr")) { + net.sf.snowball.ext.FrenchStemmer stemmer = new net.sf.snowball.ext.FrenchStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("it")) { + 
net.sf.snowball.ext.ItalianStemmer stemmer = new net.sf.snowball.ext.ItalianStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("no")) { + net.sf.snowball.ext.NorwegianStemmer stemmer = new net.sf.snowball.ext.NorwegianStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("pt")) { + net.sf.snowball.ext.PortugueseStemmer stemmer = new net.sf.snowball.ext.PortugueseStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("ru")) { + net.sf.snowball.ext.RussianStemmer stemmer = new net.sf.snowball.ext.RussianStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("es")) { + net.sf.snowball.ext.SpanishStemmer stemmer = new net.sf.snowball.ext.SpanishStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("sv")) { + net.sf.snowball.ext.SwedishStemmer stemmer = new net.sf.snowball.ext.SwedishStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else { + stem = term; // if no language fits deliver the term itself as the stem form + } + return stem; + } + + /* + private String stemByLanguageStemmers(String term, String language) { + // TODO provide other languages + String stem = null; + if (language.equals("br")) { + BrazilianStemmer stemmer = new BrazilianStemmer(); + stem = stemmer.stem(term); + } else if (language.equals("de")) { + GermanStemmer stemmer = new GermanStemmer(); + stem = stemmer.stem(term); + } else if (language.equals("fr")) { + FrenchStemmer stemmer = new FrenchStemmer(); + stem = stemmer.stem(term); + } else if (language.equals("nl")) { + DutchStemmer stemmer = new DutchStemmer(); + stem = stemmer.stem(term); + } else if (language.equals("ru")) { + RussianStemmer stemmer = new RussianStemmer(); + stem = 
stemmer.stem(term); + } else { + stem = term; // if no language fits deliver the term itself as the stem form + } + return stem; + } + */ +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,113 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.Tokenizer; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class MpdlTokenizer extends Tokenizer { + private static final int MAX_WORD_LEN = 255; + private static final int IO_BUFFER_SIZE = 1024; + private String language; // TODO make the tokenizer language dependent + private int offset = 0, bufferIndex = 0, dataLen = 0; + private char[] buffer = new char[MAX_WORD_LEN]; + private char[] ioBuffer = new char[IO_BUFFER_SIZE]; + private MpdlNormalizer normalizer; + + public MpdlTokenizer(Reader input, String language) { + super(input); + this.language = language; + } + + public MpdlTokenizer(Reader input, String language, MpdlNormalizer normalizer) { + super(input); + this.language = language; + this.normalizer = normalizer; + } + + /** Returns true iff a character should be included in a token. This + * tokenizer generates as tokens adjacent sequences of characters which + * satisfy this predicate. Characters for which this is false are used to + * define token boundaries and are not included in tokens. 
*/ + protected boolean isTokenChar(char c) { + boolean isTokenChar = true; + switch (c) { + case ' ': isTokenChar = false; break; + case '.': isTokenChar = false; break; + case ',': isTokenChar = false; break; + case '!': isTokenChar = false; break; + case '?': isTokenChar = false; break; + case ';': isTokenChar = false; break; + case ':': isTokenChar = false; break; + case '(': isTokenChar = false; break; + case ')': isTokenChar = false; break; + case '[': isTokenChar = false; break; + case ']': isTokenChar = false; break; + case '<': isTokenChar = false; break; + case '>': isTokenChar = false; break; + case '&': isTokenChar = false; break; + case '+': isTokenChar = false; break; + case '"': isTokenChar = false; break; + case '\'': isTokenChar = false; break; + // case '\t': isTokenChar = false; break; + // case '\n': isTokenChar = false; break; // do not break words which are on another line + } + return isTokenChar; + } + + /** Called on each token character to normalize it before it is added to the + * token. The default implementation does nothing. Subclasses may use this + * to, e.g., lowercase tokens. */ + protected char normalize(char c) { + return c; + } + + /** Returns the next token in the stream, or null at EOS. */ + public final Token next() throws IOException { + int length = 0; + int start = offset; + while (true) { + final char c; + offset++; + if (bufferIndex >= dataLen) { + dataLen = input.read(ioBuffer); + bufferIndex = 0; + } + if (dataLen == -1) { + if (length > 0) + break; + else + return null; + } else { + c = ioBuffer[bufferIndex++]; + } + if (isTokenChar(c)) { // if it's a token char + if (length == 0) // start of token + start = offset - 1; + buffer[length++] = normalize(c); // buffer it, normalized + if (length == MAX_WORD_LEN) // buffer overflow! 
+ break; + } else if (length > 0) // at non-Letter w/ chars + break; // return 'em + } + Token newToken = new Token(start, start + length); + newToken.setTermBuffer(buffer, 0, length); + if (normalizer != null) { + char[] termBuffer = newToken.termBuffer(); + int termBufferLength = newToken.termLength(); + String tokenText = new String(termBuffer, 0, termBufferLength); + try { + String normalizedTokenText = normalizer.normalize(tokenText); + int normalizedTokenTextLength = normalizedTokenText.length(); + char[] normalizedTokenTextBuffer = normalizedTokenText.toCharArray(); + newToken.setTermBuffer(normalizedTokenTextBuffer, 0, normalizedTokenTextLength); + } catch (ApplicationException e) { + throw new IOException(e); + } + } + return newToken; + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,55 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.ArrayList; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; + +public class MpdlTokenizerAnalyzer extends Analyzer { + protected String language = MpdlConstants.DEFAULT_LANGUAGE; + protected MpdlNormalizer normalizer = null; + + public MpdlTokenizerAnalyzer(String language) { + this.language = language; + this.normalizer = new MpdlNormalizer(language); // default normalizer + } + + public MpdlTokenizerAnalyzer(MpdlNormalizer normalizer, String language) { + this.language = language; + 
this.normalizer = normalizer; + } + + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new MpdlTokenizer(reader, language, normalizer); + result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. + result = new LowerCaseFilter(result); + return result; + } + + public ArrayList getToken(String inputString) throws ApplicationException { + ArrayList token = new ArrayList(); + try { + Reader reader = new StringReader(inputString); + TokenStream result = new MpdlTokenizer(reader, language, normalizer); + result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. + result = new LowerCaseFilter(result); + Token t = result.next(); + while (t != null) { + token.add(t); + t = result.next(); + } + } catch (IOException e) { + throw new ApplicationException(e); + } + return token; + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/BrazilianStemmer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/BrazilianStemmer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,1021 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/**
 * A stemmer for Brazilian Portuguese words.
 *
 * The stemmer first builds a "changed term" (CT): the lower-cased word with
 * accents removed and one leading/trailing punctuation character stripped.
 * It then computes the regions R1, R2 and RV of CT and removes suffixes in
 * five steps. The regions are computed once, before any suffix is removed.
 *
 * An instance keeps the state of the last call in its fields, so one instance
 * must not be used by several threads at the same time.
 */
public class BrazilianStemmer {

  /** Region selectors for the step-1 rule table. */
  private static final int REGION_R1 = 1;
  private static final int REGION_R2 = 2;
  private static final int REGION_RV = 3;

  /** Punctuation stripped (one character each) from both ends of a term. */
  private static final String EDGE_PUNCTUATION = "\"'-,;.?!";

  /** One step-1 rule: a suffix, the region it must lie in, and its replacement. */
  private static final class SuffixRule {
    final String suffix;
    final int region;
    final String replacement;  // "" means: just remove the suffix
    final String precededBy;   // null means: no constraint on the preceding text

    SuffixRule(String suffix, int region, String replacement, String precededBy) {
      this.suffix = suffix;
      this.region = region;
      this.replacement = replacement;
      this.precededBy = precededBy;
    }
  }

  private static SuffixRule remove(String suffix, int region) {
    return new SuffixRule(suffix, region, "", null);
  }

  private static SuffixRule replace(String suffix, int region, String replacement) {
    return new SuffixRule(suffix, region, replacement, null);
  }

  private static SuffixRule replaceAfter(String suffix, int region, String replacement, String precededBy) {
    return new SuffixRule(suffix, region, replacement, precededBy);
  }

  /**
   * Standard suffixes, longest first. The first rule that matches wins, so
   * the order of this table is significant.
   */
  private static final SuffixRule[] STEP1_RULES = {
    replace("uciones", REGION_R2, "u"),

    remove("imentos", REGION_R2),
    remove("amentos", REGION_R2),
    remove("adores", REGION_R2),
    remove("adoras", REGION_R2),
    replace("logias", REGION_R2, "log"),
    replace("encias", REGION_R2, "ente"),
    remove("amente", REGION_R1),
    remove("idades", REGION_R2),

    remove("acoes", REGION_R2),
    remove("imento", REGION_R2),
    remove("amento", REGION_R2),
    remove("adora", REGION_R2),
    remove("ismos", REGION_R2),
    remove("istas", REGION_R2),
    replace("logia", REGION_R2, "log"),
    replace("ucion", REGION_R2, "u"),
    replace("encia", REGION_R2, "ente"),
    remove("mente", REGION_R2),
    remove("idade", REGION_R2),

    remove("acao", REGION_R2),
    remove("ezas", REGION_R2),
    remove("icos", REGION_R2),
    remove("icas", REGION_R2),
    remove("ismo", REGION_R2),
    remove("avel", REGION_R2),
    remove("ivel", REGION_R2),
    remove("ista", REGION_R2),
    remove("osos", REGION_R2),
    remove("osas", REGION_R2),
    remove("ador", REGION_R2),
    remove("ivas", REGION_R2),
    remove("ivos", REGION_R2),
    replaceAfter("iras", REGION_RV, "ir", "e"),

    remove("eza", REGION_R2),
    remove("ico", REGION_R2),
    remove("ica", REGION_R2),
    remove("oso", REGION_R2),
    remove("osa", REGION_R2),
    remove("iva", REGION_R2),
    remove("ivo", REGION_R2),
    replaceAfter("ira", REGION_RV, "ir", "e"),
  };

  /**
   * Verb suffixes, longest first. The first suffix found at the end of RV is
   * removed, so the order of this table is significant.
   */
  private static final String[] STEP2_SUFFIXES = {
    // length 7
    "issemos", "essemos", "assemos", "ariamos", "eriamos", "iriamos",
    // length 6
    "iremos", "eremos", "aremos", "avamos", "iramos", "eramos", "aramos",
    "asseis", "esseis", "isseis", "arieis", "erieis", "irieis",
    // length 5
    "irmos", "iamos", "armos", "ermos", "areis", "ereis", "ireis",
    "asses", "esses", "isses", "astes", "assem", "essem", "issem",
    "ardes", "erdes", "irdes", "ariam", "eriam", "iriam",
    "arias", "erias", "irias", "estes", "istes", "aveis",
    // length 4
    "aria", "eria", "iria", "asse", "esse", "isse", "aste", "este", "iste",
    "arei", "erei", "irei", "aram", "eram", "iram", "avam",
    "arem", "erem", "irem", "ando", "endo", "indo",
    "arao", "erao", "irao", "adas", "idas", "aras", "eras", "iras", "avas",
    "ares", "eres", "ires", "ados", "idos", "amos", "emos", "imos", "ieis",
    // length 3
    "ada", "ida", "ara", "era", "ava", "iam", "ado", "ido",
    "ias", "ais", "eis", "ira", "ear",
    // length 2
    "ia", "ei", "am", "em", "ar", "er", "ir",
    "as", "es", "is", "eu", "iu", "ou",
  };

  /** Original term and changed term of the last call, joined by ';' (for log()). */
  private String TERM;
  /** Changed term: the working copy that the steps shorten. */
  private String CT;
  private String R1;
  private String R2;
  private String RV;

  public BrazilianStemmer() {
  }

  /**
   * Stems the given term.
   *
   * @param term the term that should be stemmed
   * @return the stem; the changed term itself if it contains non-letter
   *         characters; or null if the term is null or its changed form is
   *         shorter than 3 or longer than 29 characters
   */
  public String stem(String term) {
    if (term == null) {
      return null;
    }

    createCT(term);

    if (!isIndexable(CT)) {
      return null;
    }
    if (!isStemmable(CT)) {
      return CT;
    }

    R1 = getR1(CT);
    R2 = getR1(R1);
    RV = getRV(CT);
    TERM = term + ";" + CT;

    // verb suffixes (step 2) are only tried when no standard suffix matched
    boolean altered = step1() || step2();

    if (altered) {
      step3();
    } else {
      step4();
    }

    step5();

    return CT;
  }

  /**
   * Checks whether a term can be processed.
   *
   * @return true if, and only if, the given term consists of letters only
   */
  private boolean isStemmable(String term) {
    for (int c = 0; c < term.length(); c++) {
      if (!Character.isLetter(term.charAt(c))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Checks whether a term has an acceptable length.
   *
   * @return true if the term has more than 2 and fewer than 30 characters
   */
  private boolean isIndexable(String term) {
    return (term.length() < 30) && (term.length() > 2);
  }

  /**
   * @return true if the character is one of 'a', 'e', 'i', 'o', 'u'
   */
  private boolean isVowel(char value) {
    return (value == 'a') || (value == 'e') || (value == 'i')
        || (value == 'o') || (value == 'u');
  }

  /**
   * Gets R1: the region after the first non-vowel following a vowel.
   * The last character of the value is never inspected by the search.
   *
   * @return the region, or null if the value is null or has no such position
   */
  private String getR1(String value) {
    if (value == null) {
      return null;
    }

    int last = value.length() - 1;
    int j;

    // find the first vowel
    for (j = 0; j < last; j++) {
      if (isVowel(value.charAt(j))) {
        break;
      }
    }
    if (!(j < last)) {
      return null;
    }

    // find the first non-vowel after it
    for (; j < last; j++) {
      if (!isVowel(value.charAt(j))) {
        break;
      }
    }
    if (!(j < last)) {
      return null;
    }

    return value.substring(j + 1);
  }

  /**
   * Gets RV:
   * if the second letter is a consonant, the region after the next vowel;
   * or, if the first two letters are vowels, the region after the next
   * consonant; otherwise the region after the third letter.
   *
   * @return the region, or null if the value is null or no rule applies
   */
  private String getRV(String value) {
    if (value == null) {
      return null;
    }

    int last = value.length() - 1;
    int j;

    // second letter is a consonant: RV starts after the next vowel
    if ((last > 0) && !isVowel(value.charAt(1))) {
      for (j = 2; j < last; j++) {
        if (isVowel(value.charAt(j))) {
          break;
        }
      }
      if (j < last) {
        return value.substring(j + 1);
      }
    }

    // first two letters are vowels: RV starts after the next consonant
    if ((last > 1) && isVowel(value.charAt(0)) && isVowel(value.charAt(1))) {
      for (j = 2; j < last; j++) {
        if (!isVowel(value.charAt(j))) {
          break;
        }
      }
      if (j < last) {
        return value.substring(j + 1);
      }
    }

    // otherwise RV is the region after the third letter
    if (last > 2) {
      return value.substring(3);
    }

    return null;
  }

  /**
   * Lower-cases a term and replaces accented letters by their base letter
   * (including a-tilde to a, o-tilde to o, c-cedilla to c, n-tilde to n).
   *
   * @return the transformed string, or null if the value is null
   */
  private String changeTerm(String value) {
    if (value == null) {
      return null;
    }

    // fixed locale: the default locale must not influence the stem
    // (e.g. the Turkish lower-casing of 'I')
    String lower = value.toLowerCase(java.util.Locale.ENGLISH);
    StringBuilder r = new StringBuilder(lower.length());
    for (int j = 0; j < lower.length(); j++) {
      char ch = lower.charAt(j);
      switch (ch) {
        case '\u00e1':  // a acute
        case '\u00e2':  // a circumflex
        case '\u00e3':  // a tilde
          r.append('a');
          break;
        case '\u00e9':  // e acute
        case '\u00ea':  // e circumflex
          r.append('e');
          break;
        case '\u00ed':  // i acute
          r.append('i');
          break;
        case '\u00f3':  // o acute
        case '\u00f4':  // o circumflex
        case '\u00f5':  // o tilde
          r.append('o');
          break;
        case '\u00fa':  // u acute
        case '\u00fc':  // u diaeresis
          r.append('u');
          break;
        case '\u00e7':  // c cedilla
          r.append('c');
          break;
        case '\u00f1':  // n tilde
          r.append('n');
          break;
        default:
          r.append(ch);
      }
    }
    return r.toString();
  }

  /**
   * @return true if the value ends with the given suffix; false if either
   *         argument is null
   */
  private boolean suffix(String value, String suffix) {
    if ((value == null) || (suffix == null)) {
      return false;
    }
    return value.endsWith(suffix);
  }

  /**
   * Replaces a suffix by another string.
   *
   * @return the value with the suffix replaced, or the unchanged value if it
   *         does not end with the suffix or any argument is null
   */
  private String replaceSuffix(String value, String toReplace, String changeTo) {
    if ((value == null) || (toReplace == null) || (changeTo == null)) {
      return value;
    }

    String stripped = removeSuffix(value, toReplace);
    if (value.equals(stripped)) {
      return value;
    }
    return stripped + changeTo;
  }

  /**
   * Removes a suffix.
   *
   * @return the value without the suffix, or the unchanged value if it does
   *         not end with the suffix
   */
  private String removeSuffix(String value, String toRemove) {
    if ((value == null) || (toRemove == null) || !suffix(value, toRemove)) {
      return value;
    }
    return value.substring(0, value.length() - toRemove.length());
  }

  /**
   * @return true if the value ends with the suffix and the text before the
   *         suffix ends with the given string
   */
  private boolean suffixPreceded(String value, String suffix, String preceded) {
    if ((value == null) || (suffix == null) || (preceded == null) || !suffix(value, suffix)) {
      return false;
    }
    return suffix(removeSuffix(value, suffix), preceded);
  }

  /**
   * Creates CT (the changed term): the accent-free lower-case form of the
   * term with at most one punctuation character removed from each end.
   */
  private void createCT(String term) {
    CT = changeTerm(term);

    if (CT.length() < 2) {
      return;
    }
    if (EDGE_PUNCTUATION.indexOf(CT.charAt(0)) >= 0) {
      CT = CT.substring(1);
    }

    if (CT.length() < 2) {
      return;
    }
    if (EDGE_PUNCTUATION.indexOf(CT.charAt(CT.length() - 1)) >= 0) {
      CT = CT.substring(0, CT.length() - 1);
    }
  }

  /** Returns the region string selected by a REGION_* constant. */
  private String region(int which) {
    switch (which) {
      case REGION_R1:
        return R1;
      case REGION_R2:
        return R2;
      default:
        return RV;
    }
  }

  /**
   * Standard suffix removal: applies the first matching rule of STEP1_RULES.
   * A rule matches when CT ends with its suffix, the suffix lies in the
   * rule's region and, if required, the suffix is preceded by a given string.
   *
   * @return false if no rule matched
   */
  private boolean step1() {
    if (CT == null) {
      return false;
    }

    for (int k = 0; k < STEP1_RULES.length; k++) {
      SuffixRule rule = STEP1_RULES[k];
      if (!suffix(CT, rule.suffix) || !suffix(region(rule.region), rule.suffix)) {
        continue;
      }
      if ((rule.precededBy != null) && !suffixPreceded(CT, rule.suffix, rule.precededBy)) {
        continue;
      }
      CT = replaceSuffix(CT, rule.suffix, rule.replacement);
      return true;
    }

    // no ending was removed by step1
    return false;
  }

  /**
   * Verb suffixes: removes the first suffix of STEP2_SUFFIXES found at the
   * end of RV.
   *
   * @return false if no ending was removed
   */
  private boolean step2() {
    if (RV == null) {
      return false;
    }

    for (int k = 0; k < STEP2_SUFFIXES.length; k++) {
      if (suffix(RV, STEP2_SUFFIXES[k])) {
        CT = removeSuffix(CT, STEP2_SUFFIXES[k]);
        return true;
      }
    }

    // no ending was removed by step2
    return false;
  }

  /**
   * Deletes the suffix 'i' if it is in RV and preceded by 'c'.
   */
  private void step3() {
    if (RV == null) {
      return;
    }

    if (suffix(RV, "i") && suffixPreceded(RV, "i", "c")) {
      CT = removeSuffix(CT, "i");
    }
  }

  /**
   * Residual suffix: if RV ends with one of "os", "a", "i", "o", deletes
   * that suffix from CT.
   */
  private void step4() {
    if (RV == null) {
      return;
    }

    if (suffix(RV, "os")) {
      CT = removeSuffix(CT, "os");
      return;
    }
    if (suffix(RV, "a")) {
      CT = removeSuffix(CT, "a");
      return;
    }
    if (suffix(RV, "i")) {
      CT = removeSuffix(CT, "i");
      return;
    }
    if (suffix(RV, "o")) {
      CT = removeSuffix(CT, "o");
      return;
    }
  }

  /**
   * If RV ends with 'e', deletes the 'e' from CT; if the 'e' is preceded by
   * "gu" (or "ci") in RV, also deletes the 'u' (or 'i').
   */
  private void step5() {
    if (RV == null) {
      return;
    }
    if (!suffix(RV, "e")) {
      return;
    }

    if (suffixPreceded(RV, "e", "gu")) {
      CT = removeSuffix(CT, "e");
      CT = removeSuffix(CT, "u");
      return;
    }

    if (suffixPreceded(RV, "e", "ci")) {
      CT = removeSuffix(CT, "e");
      CT = removeSuffix(CT, "i");
      return;
    }

    CT = removeSuffix(CT, "e");
  }

  /**
   * For log and debug purposes.
   *
   * @return TERM, CT, RV, R1 and R2 of the last call to stem()
   */
  public String log() {
    return " (TERM = " + TERM + ")" +
           " (CT = " + CT + ")" +
           " (RV = " + RV + ")" +
           " (R1 = " + R1 + ")" +
           " (R2 = " + R2 + ")";
  }

}
/dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/DutchStemmer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,407 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +/** + * + * A stemmer for Dutch words. The algorithm is an implementation of + * the dutch stemming + * algorithm in Martin Porter's snowball project. + * + * @author Edwin de Jonge (ejne at cbs.nl) + */ + +public class DutchStemmer { + /** + * Buffer for the terms while stemming them. + */ + private StringBuffer sb = new StringBuffer(); + private boolean _removedE; + private Map _stemDict; + + private int _R1; + private int _R2; + + //TODO convert to internal + /* + * Stemms the given term to an unique discriminator. + * + * @param term The term that should be stemmed. + * @return Discriminator for term + */ + public String stem(String term) { + term = term.toLowerCase(); + if (!isStemmable(term)) + return term; + if (_stemDict != null && _stemDict.containsKey(term)) + if (_stemDict.get(term) instanceof String) + return (String) _stemDict.get(term); + else + return null; + + // Reset the StringBuffer. 
+ sb.delete(0, sb.length()); + sb.insert(0, term); + // Stemming starts here... + substitute(sb); + storeYandI(sb); + _R1 = getRIndex(sb, 0); + _R1 = Math.max(3, _R1); + step1(sb); + step2(sb); + _R2 = getRIndex(sb, _R1); + step3a(sb); + step3b(sb); + step4(sb); + reStoreYandI(sb); + return sb.toString(); + } + + private boolean enEnding(StringBuffer sb) { + String[] enend = new String[]{"ene", "en"}; + for (int i = 0; i < enend.length; i++) { + String end = enend[i]; + String s = sb.toString(); + int index = s.length() - end.length(); + if (s.endsWith(end) && + index >= _R1 && + isValidEnEnding(sb, index - 1) + ) { + sb.delete(index, index + end.length()); + unDouble(sb, index); + return true; + } + } + return false; + } + + + private void step1(StringBuffer sb) { + if (_R1 >= sb.length()) + return; + + String s = sb.toString(); + int lengthR1 = sb.length() - _R1; + int index; + + if (s.endsWith("heden")) { + sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid")); + return; + } + + if (enEnding(sb)) + return; + + if (s.endsWith("se") && + (index = s.length() - 2) >= _R1 && + isValidSEnding(sb, index - 1) + ) { + sb.delete(index, index + 2); + return; + } + if (s.endsWith("s") && + (index = s.length() - 1) >= _R1 && + isValidSEnding(sb, index - 1)) { + sb.delete(index, index + 1); + } + } + + /** + * Delete suffix e if in R1 and + * preceded by a non-vowel, and then undouble the ending + * + * @param sb String being stemmed + */ + private void step2(StringBuffer sb) { + _removedE = false; + if (_R1 >= sb.length()) + return; + String s = sb.toString(); + int index = s.length() - 1; + if (index >= _R1 && + s.endsWith("e") && + !isVowel(sb.charAt(index - 1))) { + sb.delete(index, index + 1); + unDouble(sb); + _removedE = true; + } + } + + /** + * Delete "heid" + * + * @param sb String being stemmed + */ + private void step3a(StringBuffer sb) { + if (_R2 >= sb.length()) + return; + String s = sb.toString(); + int index = 
s.length() - 4; + if (s.endsWith("heid") && index >= _R2 && sb.charAt(index - 1) != 'c') { + sb.delete(index, index + 4); //remove heid + enEnding(sb); + } + } + + /** + *

A d-suffix, or derivational suffix, enables a new word, + * often with a different grammatical category, or with a different + * sense, to be built from another word. Whether a d-suffix can be + * attached is discovered not from the rules of grammar, but by + * referring to a dictionary. So in English, ness can be added to + * certain adjectives to form corresponding nouns (littleness, + * kindness, foolishness ...) but not to all adjectives + * (not, for example, to big, cruel, wise ...). D-suffixes can be + * used to change meaning, often in rather exotic ways.

+ * Remove "ing", "end", "ig", "lijk", "baar" and "bar" + * + * @param sb String being stemmed + */ + private void step3b(StringBuffer sb) { + if (_R2 >= sb.length()) + return; + String s = sb.toString(); + int index = 0; + + if ((s.endsWith("end") || s.endsWith("ing")) && + (index = s.length() - 3) >= _R2) { + sb.delete(index, index + 3); + if (sb.charAt(index - 2) == 'i' && + sb.charAt(index - 1) == 'g') { + if (sb.charAt(index - 3) != 'e' & index - 2 >= _R2) { + index -= 2; + sb.delete(index, index + 2); + } + } else { + unDouble(sb, index); + } + return; + } + if (s.endsWith("ig") && + (index = s.length() - 2) >= _R2 + ) { + if (sb.charAt(index - 1) != 'e') + sb.delete(index, index + 2); + return; + } + if (s.endsWith("lijk") && + (index = s.length() - 4) >= _R2 + ) { + sb.delete(index, index + 4); + step2(sb); + return; + } + if (s.endsWith("baar") && + (index = s.length() - 4) >= _R2 + ) { + sb.delete(index, index + 4); + return; + } + if (s.endsWith("bar") && + (index = s.length() - 3) >= _R2 + ) { + if (_removedE) + sb.delete(index, index + 3); + return; + } + } + + /** + * undouble vowel + * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod). + * + * @param sb String being stemmed + */ + private void step4(StringBuffer sb) { + if (sb.length() < 4) + return; + String end = sb.substring(sb.length() - 4, sb.length()); + char c = end.charAt(0); + char v1 = end.charAt(1); + char v2 = end.charAt(2); + char d = end.charAt(3); + if (v1 == v2 && + d != 'I' && + v1 != 'i' && + isVowel(v1) && + !isVowel(d) && + !isVowel(c)) { + sb.delete(sb.length() - 2, sb.length() - 1); + } + } + + /** + * Checks if a term could be stemmed. + * + * @return true if, and only if, the given term consists in letters. 
+   */
+  private boolean isStemmable(String term) {
+    for (int c = 0; c < term.length(); c++) {
+      if (!Character.isLetter(term.charAt(c))) return false;
+    }
+    return true;
+  }
+
+  /**
+   * Replaces accented vowels in the buffer, in place, with their plain
+   * counterparts: ä/á -> a, ë/é -> e, ï/í -> i, ö/ó -> o, ü/ú -> u.
+   * Every other character is left untouched.
+   *
+   * @param buffer the term being stemmed; modified in place
+   */
+  private void substitute(StringBuffer buffer) {
+    for (int i = 0; i < buffer.length(); i++) {
+      switch (buffer.charAt(i)) {
+        case 'ä':
+        case 'á':
+          {
+            buffer.setCharAt(i, 'a');
+            break;
+          }
+        case 'ë':
+        case 'é':
+          {
+            buffer.setCharAt(i, 'e');
+            break;
+          }
+        case 'ü':
+        case 'ú':
+          {
+            buffer.setCharAt(i, 'u');
+            break;
+          }
+        case 'ï':
+        case 'í': // accented í, not plain i: a plain 'i' label would make this branch a no-op
+          {
+            buffer.setCharAt(i, 'i');
+            break;
+          }
+        case 'ö':
+        case 'ó':
+          {
+            buffer.setCharAt(i, 'o');
+            break;
+          }
+      }
+    }
+  }
+
+  /*private boolean isValidSEnding(StringBuffer sb) {
+    return isValidSEnding(sb, sb.length() - 1);
+  }*/
+
+  /**
+   * Tests whether the character at the given position may precede an
+   * "s"/"se" suffix that is about to be removed.
+   *
+   * @param sb    String being stemmed
+   * @param index position of the character to test
+   * @return false if that character is a vowel (see isVowel) or 'j', true otherwise
+   */
+  private boolean isValidSEnding(StringBuffer sb, int index) {
+    char c = sb.charAt(index);
+    if (isVowel(c) || c == 'j')
+      return false;
+    return true;
+  }
+
+  /*private boolean isValidEnEnding(StringBuffer sb) {
+    return isValidEnEnding(sb, sb.length() - 1);
+  }*/
+
+  /**
+   * Tests whether the character at the given position may precede an
+   * "en"/"ene" suffix that is about to be removed.
+   *
+   * @param sb    String being stemmed
+   * @param index position of the character to test
+   * @return false if that character is a vowel, or if it is the 'm' of "gem";
+   *         true otherwise
+   */
+  private boolean isValidEnEnding(StringBuffer sb, int index) {
+    char c = sb.charAt(index);
+    if (isVowel(c))
+      return false;
+    // NOTE(review): c is a char, so this compares its code unit with 3 and can
+    // only match control characters; stem() rejects non-letter terms first, so
+    // the branch looks unreachable. Presumably a length/index check was
+    // intended - confirm before changing, since that would alter the stems.
+    if (c < 3)
+      return false;
+    // ends with "gem"?
+ if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index - 1) == 'e') + return false; + return true; + } + + private void unDouble(StringBuffer sb) { + unDouble(sb, sb.length()); + } + + private void unDouble(StringBuffer sb, int endIndex) { + String s = sb.substring(0, endIndex); + if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn") || s.endsWith("mm") || s.endsWith("ff")) { + sb.delete(endIndex - 1, endIndex); + } + } + + private int getRIndex(StringBuffer sb, int start) { + if (start == 0) + start = 1; + int i = start; + for (; i < sb.length(); i++) { + //first non-vowel preceded by a vowel + if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) { + return i + 1; + } + } + return i + 1; + } + + private void storeYandI(StringBuffer sb) { + if (sb.charAt(0) == 'y') + sb.setCharAt(0, 'Y'); + + int last = sb.length() - 1; + + for (int i = 1; i < last; i++) { + switch (sb.charAt(i)) { + case 'i': + { + if (isVowel(sb.charAt(i - 1)) && + isVowel(sb.charAt(i + 1)) + ) + sb.setCharAt(i, 'I'); + break; + } + case 'y': + { + if (isVowel(sb.charAt(i - 1))) + sb.setCharAt(i, 'Y'); + break; + } + } + } + if (last > 0 && sb.charAt(last) == 'y' && isVowel(sb.charAt(last - 1))) + sb.setCharAt(last, 'Y'); + } + + private void reStoreYandI(StringBuffer sb) { + String tmp = sb.toString(); + sb.delete(0, sb.length()); + sb.insert(0, tmp.replaceAll("I", "i").replaceAll("Y", "y")); + } + + private boolean isVowel(char c) { + switch (c) { + case 'e': + case 'a': + case 'o': + case 'i': + case 'u': + case 'y': + case 'è': + { + return true; + } + } + return false; + } + + void setStemDictionary(Map dict) { + _stemDict = dict; + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/FrenchStemmer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/FrenchStemmer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 
+1,709 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A stemmer for French words. The algorithm is based on the work of + * Dr Martin Porter on his snowball project
+ * refer to http://snowball.sourceforge.net/french/stemmer.html
+ * (French stemming algorithm) for details + * + * @author Patrick Talbot + */ + +public class FrenchStemmer { + + /** + * Buffer for the terms while stemming them. + */ + private StringBuffer sb = new StringBuffer(); + + /** + * A temporary buffer, used to reconstruct R2 + */ + private StringBuffer tb = new StringBuffer(); + + /** + * Region R0 is equal to the whole buffer + */ + private String R0; + + /** + * Region RV + * "If the word begins with two vowels, RV is the region after the third letter, + * otherwise the region after the first vowel not at the beginning of the word, + * or the end of the word if these positions cannot be found." + */ + private String RV; + + /** + * Region R1 + * "R1 is the region after the first non-vowel following a vowel + * or is the null region at the end of the word if there is no such non-vowel" + */ + private String R1; + + /** + * Region R2 + * "R2 is the region after the first non-vowel in R1 following a vowel + * or is the null region at the end of the word if there is no such non-vowel" + */ + private String R2; + + + /** + * Set to true if we need to perform step 2 + */ + private boolean suite; + + /** + * Set to true if the buffer was modified + */ + private boolean modified; + + + /** + * Stemms the given term to a unique discriminator. + * + * @param term java.langString The term that should be stemmed + * @return java.lang.String Discriminator for term + */ + public String stem( String term ) { + if ( !isStemmable( term ) ) { + return term; + } + + // Use lowercase for medium stemming. + term = term.toLowerCase(); + + // Reset the StringBuffer. 
+ sb.delete( 0, sb.length() ); + sb.insert( 0, term ); + + // reset the booleans + modified = false; + suite = false; + + sb = treatVowels( sb ); + + setStrings(); + + step1(); + + if (!modified || suite) + { + if (RV != null) + { + suite = step2a(); + if (!suite) + step2b(); + } + } + + if (modified || suite) + step3(); + else + step4(); + + step5(); + + step6(); + + return sb.toString(); + } + + /** + * Sets the search region Strings
+ * it needs to be done each time the buffer was modified + */ + private void setStrings() { + // set the strings + R0 = sb.toString(); + RV = retrieveRV( sb ); + R1 = retrieveR( sb ); + if ( R1 != null ) + { + tb.delete( 0, tb.length() ); + tb.insert( 0, R1 ); + R2 = retrieveR( tb ); + } + else + R2 = null; + } + + /** + * First step of the Porter Algorithmn
+ * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation + */ + private void step1( ) { + String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" }; + deleteFrom( R2, suffix ); + + replaceFrom( R2, new String[] { "logies", "logie" }, "log" ); + replaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" ); + replaceFrom( R2, new String[] { "ences", "ence" }, "ent" ); + + String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"}; + deleteButSuffixFromElseReplace( R2, search, "ic", true, R0, "iqU" ); + + deleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" ); + deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false ); + deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false ); + deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false ); + deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false ); + + deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 ); + deleteFrom( RV, new String[] { "ements", "ement" } ); + + deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0, "abl" ); + deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU" ); + deleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true ); + + String[] autre = { "ifs", "ives", "if", "ive" }; + deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" ); + deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" ); + + replaceFrom( R0, new String[] { "eaux" }, "eau" ); + + replaceFrom( R1, new String[] { "aux" }, "al" ); + + deleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" ); + + deleteFrom( R2, new String[] { "eux" } ); + + // if one of the next steps is performed, we will need to perform step2a + boolean temp = 
false; + temp = replaceFrom( RV, new String[] { "amment" }, "ant" ); + if (temp == true) + suite = true; + temp = replaceFrom( RV, new String[] { "emment" }, "ent" ); + if (temp == true) + suite = true; + temp = deleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV ); + if (temp == true) + suite = true; + + } + + /** + * Second step (A) of the Porter Algorithmn
+ * Will be performed if nothing changed from the first step + * or changed were done in the amment, emment, ments or ment suffixes
+ * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation + * + * @return boolean - true if something changed in the StringBuffer + */ + private boolean step2a() { + String[] search = { "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira", + "irent", "iriez", "irez", "irions", "irons", "iront", + "issaIent", "issais", "issantes", "issante", "issants", "issant", + "issait", "issais", "issions", "issons", "issiez", "issez", "issent", + "isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" }; + return deleteFromIfTestVowelBeforeIn( RV, search, false, RV ); + } + + /** + * Second step (B) of the Porter Algorithmn
+ * Will be performed if step 2 A was performed unsuccessfully
+ * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation + */ + private void step2b() { + String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez", + "erons", "eront","erez", "èrent", "era", "ées", "iez", + "ée", "és", "er", "ez", "é" }; + deleteFrom( RV, suffix ); + + String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent", + "antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant", + "ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" }; + deleteButSuffixFrom( RV, search, "e", true ); + + deleteFrom( R2, new String[] { "ions" } ); + } + + /** + * Third step of the Porter Algorithmn
+ * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation + */ + private void step3() { + if (sb.length()>0) + { + char ch = sb.charAt( sb.length()-1 ); + if (ch == 'Y') + { + sb.setCharAt( sb.length()-1, 'i' ); + setStrings(); + } + else if (ch == 'ç') + { + sb.setCharAt( sb.length()-1, 'c' ); + setStrings(); + } + } + } + + /** + * Fourth step of the Porter Algorithmn
+ * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation + */ + private void step4() { + if (sb.length() > 1) + { + char ch = sb.charAt( sb.length()-1 ); + if (ch == 's') + { + char b = sb.charAt( sb.length()-2 ); + if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's') + { + sb.delete( sb.length() - 1, sb.length()); + setStrings(); + } + } + } + boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" ); + if (!found) + found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" ); + + replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" ); + deleteFrom( RV, new String[] { "e" } ); + deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" ); + } + + /** + * Fifth step of the Porter Algorithmn
+ * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation + */ + private void step5() { + if (R0 != null) + { + if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill")) + { + sb.delete( sb.length() - 1, sb.length() ); + setStrings(); + } + } + } + + /** + * Sixth (and last!) step of the Porter Algorithmn
+ * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation + */ + private void step6() { + if (R0!=null && R0.length()>0) + { + boolean seenVowel = false; + boolean seenConson = false; + int pos = -1; + for (int i = R0.length()-1; i > -1; i--) + { + char ch = R0.charAt(i); + if (isVowel(ch)) + { + if (!seenVowel) + { + if (ch == 'é' || ch == 'è') + { + pos = i; + break; + } + } + seenVowel = true; + } + else + { + if (seenVowel) + break; + else + seenConson = true; + } + } + if (pos > -1 && seenConson && !seenVowel) + sb.setCharAt(pos, 'e'); + } + } + + /** + * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string + * + * @param source java.lang.String - the primary source zone for search + * @param search java.lang.String[] - the strings to search for suppression + * @param from java.lang.String - the secondary source zone for search + * @param prefix java.lang.String - the prefix to add to the search string to test + * @return boolean - true if modified + */ + private boolean deleteFromIfPrecededIn( String source, String[] search, String from, String prefix ) { + boolean found = false; + if (source!=null ) + { + for (int i = 0; i < search.length; i++) { + if ( source.endsWith( search[i] )) + { + if (from!=null && from.endsWith( prefix + search[i] )) + { + sb.delete( sb.length() - search[i].length(), sb.length()); + found = true; + setStrings(); + break; + } + } + } + } + return found; + } + + /** + * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel + * + * @param source java.lang.String - the primary source zone for search + * @param search java.lang.String[] - the strings to search for suppression + * @param vowel boolean - true if we need a vowel before the search string + * @param from java.lang.String - the secondary source zone for search (where vowel could be) + * @return boolean - true if modified + */ + private boolean deleteFromIfTestVowelBeforeIn( String 
source, String[] search, boolean vowel, String from ) { + boolean found = false; + if (source!=null && from!=null) + { + for (int i = 0; i < search.length; i++) { + if ( source.endsWith( search[i] )) + { + if ((search[i].length() + 1) <= from.length()) + { + boolean test = isVowel(sb.charAt(sb.length()-(search[i].length()+1))); + if (test == vowel) + { + sb.delete( sb.length() - search[i].length(), sb.length()); + modified = true; + found = true; + setStrings(); + break; + } + } + } + } + } + return found; + } + + /** + * Delete a suffix searched in zone "source" if preceded by the prefix + * + * @param source java.lang.String - the primary source zone for search + * @param search java.lang.String[] - the strings to search for suppression + * @param prefix java.lang.String - the prefix to add to the search string to test + * @param without boolean - true if it will be deleted even without prefix found + */ + private void deleteButSuffixFrom( String source, String[] search, String prefix, boolean without ) { + if (source!=null) + { + for (int i = 0; i < search.length; i++) { + if ( source.endsWith( prefix + search[i] )) + { + sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() ); + modified = true; + setStrings(); + break; + } + else if ( without && source.endsWith( search[i] )) + { + sb.delete( sb.length() - search[i].length(), sb.length() ); + modified = true; + setStrings(); + break; + } + } + } + } + + /** + * Delete a suffix searched in zone "source" if preceded by prefix
+ * or replace it with the replace string if preceded by the prefix in the zone "from"
+ * or delete the suffix if specified + * + * @param source java.lang.String - the primary source zone for search + * @param search java.lang.String[] - the strings to search for suppression + * @param prefix java.lang.String - the prefix to add to the search string to test + * @param without boolean - true if it will be deleted even without prefix found + */ + private void deleteButSuffixFromElseReplace( String source, String[] search, String prefix, boolean without, String from, String replace ) { + if (source!=null) + { + for (int i = 0; i < search.length; i++) { + if ( source.endsWith( prefix + search[i] )) + { + sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() ); + modified = true; + setStrings(); + break; + } + else if ( from!=null && from.endsWith( prefix + search[i] )) + { + sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace ); + modified = true; + setStrings(); + break; + } + else if ( without && source.endsWith( search[i] )) + { + sb.delete( sb.length() - search[i].length(), sb.length() ); + modified = true; + setStrings(); + break; + } + } + } + } + + /** + * Replace a search string with another within the source zone + * + * @param source java.lang.String - the source zone for search + * @param search java.lang.String[] - the strings to search for replacement + * @param replace java.lang.String - the replacement string + */ + private boolean replaceFrom( String source, String[] search, String replace ) { + boolean found = false; + if (source!=null) + { + for (int i = 0; i < search.length; i++) { + if ( source.endsWith( search[i] )) + { + sb.replace( sb.length() - search[i].length(), sb.length(), replace ); + modified = true; + found = true; + setStrings(); + break; + } + } + } + return found; + } + + /** + * Delete a search string within the source zone + * + * @param source the source zone for search + * @param suffix the strings to search for suppression + */ + private void 
deleteFrom(String source, String[] suffix ) { + if (source!=null) + { + for (int i = 0; i < suffix.length; i++) { + if (source.endsWith( suffix[i] )) + { + sb.delete( sb.length() - suffix[i].length(), sb.length()); + modified = true; + setStrings(); + break; + } + } + } + } + + /** + * Test if a char is a french vowel, including accentuated ones + * + * @param ch the char to test + * @return boolean - true if the char is a vowel + */ + private boolean isVowel(char ch) { + switch (ch) + { + case 'a': + case 'e': + case 'i': + case 'o': + case 'u': + case 'y': + case 'â': + case 'à': + case 'ë': + case 'é': + case 'ê': + case 'è': + case 'ï': + case 'î': + case 'ô': + case 'ü': + case 'ù': + case 'û': + return true; + default: + return false; + } + } + + /** + * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string
+ * "R is the region after the first non-vowel following a vowel + * or is the null region at the end of the word if there is no such non-vowel"
+ * @param buffer java.lang.StringBuffer - the in buffer + * @return java.lang.String - the resulting string + */ + private String retrieveR( StringBuffer buffer ) { + int len = buffer.length(); + int pos = -1; + for (int c = 0; c < len; c++) { + if (isVowel( buffer.charAt( c ))) + { + pos = c; + break; + } + } + if (pos > -1) + { + int consonne = -1; + for (int c = pos; c < len; c++) { + if (!isVowel(buffer.charAt( c ))) + { + consonne = c; + break; + } + } + if (consonne > -1 && (consonne+1) < len) + return buffer.substring( consonne+1, len ); + else + return null; + } + else + return null; + } + + /** + * Retrieve the "RV zone" from a buffer an return the corresponding string
+ * "If the word begins with two vowels, RV is the region after the third letter, + * otherwise the region after the first vowel not at the beginning of the word, + * or the end of the word if these positions cannot be found."
+ * @param buffer java.lang.StringBuffer - the in buffer + * @return java.lang.String - the resulting string + */ + private String retrieveRV( StringBuffer buffer ) { + int len = buffer.length(); + if ( buffer.length() > 3) + { + if ( isVowel(buffer.charAt( 0 )) && isVowel(buffer.charAt( 1 ))) { + return buffer.substring(3,len); + } + else + { + int pos = 0; + for (int c = 1; c < len; c++) { + if (isVowel( buffer.charAt( c ))) + { + pos = c; + break; + } + } + if ( pos+1 < len ) + return buffer.substring( pos+1, len ); + else + return null; + } + } + else + return null; + } + + + + /** + * Turns u and i preceded AND followed by a vowel to UpperCase
+ * Turns y preceded OR followed by a vowel to UpperCase
+ * Turns u preceded by q to UpperCase
+ * + * @param buffer java.util.StringBuffer - the buffer to treat + * @return java.util.StringBuffer - the treated buffer + */ + private StringBuffer treatVowels( StringBuffer buffer ) { + for ( int c = 0; c < buffer.length(); c++ ) { + char ch = buffer.charAt( c ); + + if (c == 0) // first char + { + if (buffer.length()>1) + { + if (ch == 'y' && isVowel(buffer.charAt( c + 1 ))) + buffer.setCharAt( c, 'Y' ); + } + } + else if (c == buffer.length()-1) // last char + { + if (ch == 'u' && buffer.charAt( c - 1 ) == 'q') + buffer.setCharAt( c, 'U' ); + if (ch == 'y' && isVowel(buffer.charAt( c - 1 ))) + buffer.setCharAt( c, 'Y' ); + } + else // other cases + { + if (ch == 'u') + { + if (buffer.charAt( c - 1) == 'q') + buffer.setCharAt( c, 'U' ); + else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 ))) + buffer.setCharAt( c, 'U' ); + } + if (ch == 'i') + { + if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 ))) + buffer.setCharAt( c, 'I' ); + } + if (ch == 'y') + { + if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 ))) + buffer.setCharAt( c, 'Y' ); + } + } + } + + return buffer; + } + + /** + * Checks a term if it can be processed correctly. + * + * @return boolean - true if, and only if, the given term consists in letters. + */ + private boolean isStemmable( String term ) { + boolean upper = false; + int first = -1; + for ( int c = 0; c < term.length(); c++ ) { + // Discard terms that contain non-letter characters. + if ( !Character.isLetter( term.charAt( c ) ) ) { + return false; + } + // Discard terms that contain multiple uppercase letters. + if ( Character.isUpperCase( term.charAt( c ) ) ) { + if ( upper ) { + return false; + } + // First encountered uppercase letter, set flag and save + // position. + else { + first = c; + upper = true; + } + } + } + // Discard the term if it contains a single uppercase letter that + // is not starting the term. 
+ if ( first > 0 ) { + return false; + } + return true; + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/GermanStemmer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/GermanStemmer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,267 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; +// This file is encoded in UTF-8 + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A stemmer for German words. The algorithm is based on the report + * "A Fast and Simple Stemming Algorithm for German Words" by Jörg + * Caumanns (joerg.caumanns at isst.fhg.de). + * + * + * @version $Id: GermanStemmer.java 564236 2007-08-09 15:21:19Z gsingers $ + */ +public class GermanStemmer +{ + /** + * Buffer for the terms while stemming them. + */ + private StringBuffer sb = new StringBuffer(); + + /** + * Amount of characters that are removed with substitute() while stemming. + */ + private int substCount = 0; + + /** + * Stemms the given term to an unique discriminator. + * + * @param term The term that should be stemmed. 
+ * @return Discriminator for term + */ + public String stem( String term ) + { + // Use lowercase for medium stemming. + term = term.toLowerCase(); + if ( !isStemmable( term ) ) + return term; + // Reset the StringBuffer. + sb.delete( 0, sb.length() ); + sb.insert( 0, term ); + // Stemming starts here... + substitute( sb ); + strip( sb ); + optimize( sb ); + resubstitute( sb ); + removeParticleDenotion( sb ); + return sb.toString(); + } + + /** + * Checks if a term could be stemmed. + * + * @return true if, and only if, the given term consists in letters. + */ + private boolean isStemmable( String term ) + { + for ( int c = 0; c < term.length(); c++ ) { + if ( !Character.isLetter( term.charAt( c ) ) ) + return false; + } + return true; + } + + /** + * suffix stripping (stemming) on the current term. The stripping is reduced + * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd", + * from which all regular suffixes are build of. The simplification causes + * some overstemming, and way more irregular stems, but still provides unique. + * discriminators in the most of those cases. + * The algorithm is context free, except of the length restrictions. 
+ */ + private void strip( StringBuffer buffer ) + { + boolean doMore = true; + while ( doMore && buffer.length() > 3 ) { + if ( ( buffer.length() + substCount > 5 ) && + buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) ) + { + buffer.delete( buffer.length() - 2, buffer.length() ); + } + else if ( ( buffer.length() + substCount > 4 ) && + buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) { + buffer.delete( buffer.length() - 2, buffer.length() ); + } + else if ( ( buffer.length() + substCount > 4 ) && + buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) { + buffer.delete( buffer.length() - 2, buffer.length() ); + } + else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) { + buffer.deleteCharAt( buffer.length() - 1 ); + } + else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) { + buffer.deleteCharAt( buffer.length() - 1 ); + } + else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) { + buffer.deleteCharAt( buffer.length() - 1 ); + } + // "t" occurs only as suffix of verbs. + else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) { + buffer.deleteCharAt( buffer.length() - 1 ); + } + else { + doMore = false; + } + } + } + + /** + * Does some optimizations on the term. This optimisations are + * contextual. + */ + private void optimize( StringBuffer buffer ) + { + // Additional step for female plurals of professions and inhabitants. + if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) { + buffer.deleteCharAt( buffer.length() -1 ); + strip( buffer ); + } + // Additional step for irregular plural nouns like "Matrizen -> Matrix". + if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) { + buffer.setCharAt( buffer.length() - 1, 'x' ); + } + } + + /** + * Removes a particle denotion ("ge") from a term. 
+ */ + private void removeParticleDenotion( StringBuffer buffer ) + { + if ( buffer.length() > 4 ) { + for ( int c = 0; c < buffer.length() - 3; c++ ) { + if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) { + buffer.delete( c, c + 2 ); + return; + } + } + } + } + + /** + * Do some substitutions for the term to reduce overstemming: + * + * - Substitute Umlauts with their corresponding vowel: äöü -> aou, + * "ß" is substituted by "ss" + * - Substitute a second char of a pair of equal characters with + * an asterisk: ?? -> ?* + * - Substitute some common character combinations with a token: + * sch/ch/ei/ie/ig/st -> $/§/%/&/#/! + */ + private void substitute( StringBuffer buffer ) + { + substCount = 0; + for ( int c = 0; c < buffer.length(); c++ ) { + // Replace the second char of a pair of the equal characters with an asterisk + if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) { + buffer.setCharAt( c, '*' ); + } + // Substitute Umlauts. + else if ( buffer.charAt( c ) == 'ä' ) { + buffer.setCharAt( c, 'a' ); + } + else if ( buffer.charAt( c ) == 'ö' ) { + buffer.setCharAt( c, 'o' ); + } + else if ( buffer.charAt( c ) == 'ü' ) { + buffer.setCharAt( c, 'u' ); + } + // Fix bug so that 'ß' at the end of a word is replaced. 
+ else if ( buffer.charAt( c ) == 'ß' ) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, 's' ); + substCount++; + } + // Take care that at least one character is left left side from the current one + if ( c < buffer.length() - 1 ) { + // Masking several common character combinations with an token + if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' && + buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' ) + { + buffer.setCharAt( c, '$' ); + buffer.delete( c + 1, c + 3 ); + substCount =+ 2; + } + else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) { + buffer.setCharAt( c, '§' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) { + buffer.setCharAt( c, '%' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) { + buffer.setCharAt( c, '&' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) { + buffer.setCharAt( c, '#' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) { + buffer.setCharAt( c, '!' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + } + } + } + + /** + * Undoes the changes made by substitute(). That are character pairs and + * character combinations. Umlauts will remain as their corresponding vowel, + * as "ß" remains as "ss". 
+ */ + private void resubstitute( StringBuffer buffer ) + { + for ( int c = 0; c < buffer.length(); c++ ) { + if ( buffer.charAt( c ) == '*' ) { + char x = buffer.charAt( c - 1 ); + buffer.setCharAt( c, x ); + } + else if ( buffer.charAt( c ) == '$' ) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 ); + } + else if ( buffer.charAt( c ) == '§' ) { + buffer.setCharAt( c, 'c' ); + buffer.insert( c + 1, 'h' ); + } + else if ( buffer.charAt( c ) == '%' ) { + buffer.setCharAt( c, 'e' ); + buffer.insert( c + 1, 'i' ); + } + else if ( buffer.charAt( c ) == '&' ) { + buffer.setCharAt( c, 'i' ); + buffer.insert( c + 1, 'e' ); + } + else if ( buffer.charAt( c ) == '#' ) { + buffer.setCharAt( c, 'i' ); + buffer.insert( c + 1, 'g' ); + } + else if ( buffer.charAt( c ) == '!' ) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, 't' ); + } + } + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerAR.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerAR.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlMorphAnalyzer; + +public class MpdlAnalyzerAR extends MpdlMorphAnalyzer { + private static String LANGUAGE = "ar"; + + public MpdlAnalyzerAR() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerAR(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerAR(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. 
+ */ + public MpdlAnalyzerAR(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerDE.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerDE.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlMorphAnalyzer; + +public class MpdlAnalyzerDE extends MpdlMorphAnalyzer { + private static String LANGUAGE = "de"; + + public MpdlAnalyzerDE() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerDE(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerDE(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. 
+ */ + public MpdlAnalyzerDE(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerEL.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerEL.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlMorphAnalyzer; + +public class MpdlAnalyzerEL extends MpdlMorphAnalyzer { + private static String LANGUAGE = "el"; + + public MpdlAnalyzerEL() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerEL(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerEL(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. 
+ */ + public MpdlAnalyzerEL(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerEN.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerEN.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlMorphAnalyzer; + +public class MpdlAnalyzerEN extends MpdlMorphAnalyzer { + private static String LANGUAGE = "en"; + + public MpdlAnalyzerEN() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerEN(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerEN(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. 
+ */ + public MpdlAnalyzerEN(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerFR.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerFR.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlMorphAnalyzer; + +public class MpdlAnalyzerFR extends MpdlMorphAnalyzer { + private static String LANGUAGE = "fr"; + + public MpdlAnalyzerFR() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerFR(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerFR(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. 
+ */ + public MpdlAnalyzerFR(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerIT.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerIT.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlMorphAnalyzer; + +public class MpdlAnalyzerIT extends MpdlMorphAnalyzer { + private static String LANGUAGE = "it"; + + public MpdlAnalyzerIT() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerIT(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerIT(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. 
+ */ + public MpdlAnalyzerIT(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerLA.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerLA.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlMorphAnalyzer; + +public class MpdlAnalyzerLA extends MpdlMorphAnalyzer { + private static String LANGUAGE = "la"; + + public MpdlAnalyzerLA() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerLA(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerLA(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. 
+ */ + public MpdlAnalyzerLA(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerNL.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerNL.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlMorphAnalyzer; + +public class MpdlAnalyzerNL extends MpdlMorphAnalyzer { + private static String LANGUAGE = "nl"; + + public MpdlAnalyzerNL() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerNL(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerNL(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. 
+ */ + public MpdlAnalyzerNL(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerZH.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlAnalyzerZH.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,41 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +import java.io.File; +import java.io.IOException; +import java.util.Hashtable; + +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlMorphAnalyzer; + +public class MpdlAnalyzerZH extends MpdlMorphAnalyzer { + private static String LANGUAGE = "zh"; + + public MpdlAnalyzerZH() { + super(); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerZH(String[] stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerZH(Hashtable stopwords) { + super(stopwords); + setLanguage(LANGUAGE); + } + + /** + * Builds an analyzer with the given stop words. + */ + public MpdlAnalyzerZH(File stopwords) throws IOException { + super(stopwords); + setLanguage(LANGUAGE); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/RussianStemmer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/RussianStemmer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,630 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
+ *
+ * The letter constants below are not characters themselves but indexes into
+ * the charset array; every comparison against the input goes through
+ * charset[...]. A charset must therefore be supplied (constructor or
+ * setCharset) before stemming.
+ *
+ * @version $Id: RussianStemmer.java 564236 2007-08-09 15:21:19Z gsingers $
+ */
+public class RussianStemmer
+{
+  // maps the letter indexes below to the actual characters of the input encoding
+  private char[] charset;
+
+  // positions of RV, R1 and R2 respectively
+  private int RV, R1, R2;
+
+  // letters (currently unused letters are commented out)
+  private final static char A = 0;
+  //private final static char B = 1;
+  private final static char V = 2;
+  private final static char G = 3;
+  //private final static char D = 4;
+  private final static char E = 5;
+  //private final static char ZH = 6;
+  //private final static char Z = 7;
+  private final static char I = 8;
+  private final static char I_ = 9;
+  //private final static char K = 10;
+  private final static char L = 11;
+  private final static char M = 12;
+  private final static char N = 13;
+  private final static char O = 14;
+  //private final static char P = 15;
+  //private final static char R = 16;
+  private final static char S = 17;
+  private final static char T = 18;
+  private final static char U = 19;
+  //private final static char F = 20;
+  private final static char X = 21;
+  //private final static char TS = 22;
+  //private final static char CH = 23;
+  private final static char SH = 24;
+  private final static char SHCH = 25;
+  //private final static char HARD = 26;
+  private final static char Y = 27;
+  private final static char SOFT = 28;
+  private final static char AE = 29;
+  private final static char IU = 30;
+  private final static char IA = 31;
+
+  // stem definitions
+  // NOTE: findEnding() scans each ending table from its last entry to its
+  // first and stops at the first match, so the order of the entries matters.
+  private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
+
+  private static char[][] perfectiveGerundEndings1 = {
+    { V },
+    { V, SH, I },
+    { V, SH, I, S, SOFT }
+  };
+
+  private static char[][] perfectiveGerund1Predessors = {
+    { A },
+    { IA }
+  };
+
+  private static char[][] perfectiveGerundEndings2 = { { I, V }, {
+    Y, V }, {
+    I, V, SH, I }, {
+    Y, V, SH, I }, {
+    I, V, SH, I, S, SOFT }, {
+    Y, V, SH, I, S, SOFT }
+  };
+
+  private static char[][] adjectiveEndings = {
+    { E, E }, { I, E }, { Y, E }, { O, E },
+    { E, I_ }, { I, I_ }, { Y, I_ }, { O, I_ },
+    { E, M }, { I, M }, { Y, M }, { O, M },
+    { I, X }, { Y, X },
+    { U, IU }, { IU, IU }, { A, IA }, { IA, IA },
+    { O, IU }, { E, IU },
+    { I, M, I }, { Y, M, I },
+    { E, G, O }, { O, G, O },
+    { E, M, U }, {O, M, U }
+  };
+
+  private static char[][] participleEndings1 = {
+    { SHCH },
+    { E, M },
+    { N, N },
+    { V, SH },
+    { IU, SHCH }
+  };
+
+  private static char[][] participleEndings2 = {
+    { I, V, SH },
+    { Y, V, SH },
+    { U, IU, SHCH }
+  };
+
+  private static char[][] participle1Predessors = {
+    { A },
+    { IA }
+  };
+
+  private static char[][] reflexiveEndings = {
+    { S, IA },
+    { S, SOFT }
+  };
+
+  private static char[][] verbEndings1 = {
+    { I_ }, { L }, { N },
+    { L, O }, { N, O }, { E, T }, { IU, T },
+    { L, A }, { N, A }, { L, I }, { E, M }, { N, Y },
+    { E, T, E }, { I_, T, E },
+    { T, SOFT }, { E, SH, SOFT },
+    { N, N, O }
+  };
+
+  private static char[][] verbEndings2 = {
+    { IU }, { U, IU },
+    { E, N }, { E, I_ }, { IA, T }, { U, I_ },
+    { I, L }, { Y, L }, { I, M }, { Y, M }, { I, T }, { Y, T },
+    { I, L, A }, { Y, L, A }, { E, N, A }, { I, T, E },
+    { I, L, I }, { Y, L, I }, { I, L, O }, { Y, L, O },
+    { E, N, O }, { U, E, T }, { U, IU, T }, { E, N, Y },
+    { I, T, SOFT }, { Y, T, SOFT }, { I, SH, SOFT },
+    { E, I_, T, E }, { U, I_, T, E }
+  };
+
+  private static char[][] verb1Predessors = {
+    { A },
+    { IA }
+  };
+
+  // NOTE(review): { U } is listed twice below; the second entry is redundant
+  // for matching but is kept as found.
+  private static char[][] nounEndings = {
+    { A }, { U }, { I_ }, { O }, { U }, { E }, { Y }, { I }, { SOFT }, { IA },
+    { E, V }, { O, V }, { I, E }, { SOFT, E }, { IA, X }, { I, IU },
+    { E, I }, { I, I }, { E, I_ }, { O, I_ },
+    { E, M }, { A, M }, { O, M }, { A, X },
+    { SOFT, IU }, { I, IA }, { SOFT, IA }, { I, I_ }, { IA, M },
+    { IA, M, I }, { A, M, I },
+    { I, E, I_ }, { I, IA, M }, { I, E, M }, { I, IA, X },
+    { I, IA, M, I }
+  };
+
+  private static char[][] superlativeEndings = {
+    { E, I_, SH },
+    { E, I_, SH, E }
+  };
+
+  private static char[][] derivationalEndings = {
+    { O, S, T },
+    { O, S, T, SOFT }
+  };
+
+  /**
+   * Creates a stemmer without a charset; setCharset() has to be called
+   * before stem(), because every letter comparison reads the charset array.
+   */
+  public RussianStemmer()
+  {
+    super();
+  }
+
+  /**
+   * Creates a stemmer that uses the given charset.
+   *
+   * @param charset array that maps the letter indexes of this class to characters
+   */
+  public RussianStemmer(char[] charset)
+  {
+    super();
+    this.charset = charset;
+  }
+
+  /**
+   * Adjectival ending is an adjective ending,
+   * optionally preceded by participle ending.
+   * Creation date: (17/03/2002 12:14:58 AM)
+   * @param stemmingZone java.lang.StringBuffer
+   * @return true if an adjective ending was found and removed
+   */
+  private boolean adjectival(StringBuffer stemmingZone)
+  {
+    // look for adjective ending in a stemming zone
+    if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
+      return false;
+    // if adjective ending was found, try for participle ending.
+    // variable r is unused, we are just interested in the side effect of
+    // findAndRemoveEnding():
+    boolean r =
+      findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors)
+      ||
+      findAndRemoveEnding(stemmingZone, participleEndings2);
+    return true;
+  }
+
+  /**
+   * Derivational endings
+   * Creation date: (17/03/2002 12:14:58 AM)
+   * @param stemmingZone java.lang.StringBuffer
+   * @return true if a derivational ending was found inside R2 and removed
+   */
+  private boolean derivational(StringBuffer stemmingZone)
+  {
+    int endingLength = findEnding(stemmingZone, derivationalEndings);
+    if (endingLength == 0)
+      // no derivational ending found
+      return false;
+    else
+    {
+      // Ensure that the ending locates in R2
+      // (R2 - RV is the R2 position relative to the stemming zone, which starts at RV)
+      if (R2 - RV <= stemmingZone.length() - endingLength)
+      {
+        stemmingZone.setLength(stemmingZone.length() - endingLength);
+        return true;
+      }
+      else
+      {
+        return false;
+      }
+    }
+  }
+
+  /**
+   * Finds ending among given ending class and returns the length of ending found(0, if not found).
+   * The zone is compared backwards, starting at startIndex; the ending class
+   * is scanned from its last entry to its first.
+   * Creation date: (17/03/2002 8:18:34 PM)
+   * @param stemmingZone the text to inspect (not modified)
+   * @param startIndex index of the last character that may belong to the ending
+   * @param theEndingClass candidate endings, given as letter indexes
+   */
+  private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
+  {
+    boolean match = false;
+    for (int i = theEndingClass.length - 1; i >= 0; i--)
+    {
+      char[] theEnding = theEndingClass[i];
+      // check if the ending is bigger than stemming zone
+      if (startIndex < theEnding.length - 1)
+      {
+        match = false;
+        continue;
+      }
+      match = true;
+      int stemmingIndex = startIndex;
+      for (int j = theEnding.length - 1; j >= 0; j--)
+      {
+        if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]])
+        {
+          match = false;
+          break;
+        }
+      }
+      // check if ending was found
+      if (match)
+      {
+        return theEndingClass[i].length; // cut ending
+      }
+    }
+    return 0;
+  }
+
+  /**
+   * Same as the three-argument findEnding(), anchored at the last character
+   * of the stemming zone.
+   */
+  private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
+  {
+    return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
+  }
+
+  /**
+   * Finds the ending among the given class of endings and removes it from stemming zone.
+   * Creation date: (17/03/2002 8:18:34 PM)
+   * @return true if an ending was found and cut off
+   */
+  private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass)
+  {
+    int endingLength = findEnding(stemmingZone, theEndingClass);
+    if (endingLength == 0)
+      // not found
+      return false;
+    else {
+      stemmingZone.setLength(stemmingZone.length() - endingLength);
+      // cut the ending found
+      return true;
+    }
+  }
+
+  /**
+   * Finds the ending among the given class of endings, then checks if this ending was
+   * preceded by any of given predessors, and if so, removes it from stemming zone.
+   * Only the ending is removed; the predecessor stays in the zone.
+   * Creation date: (17/03/2002 8:18:34 PM)
+   * @return true if an ending with a matching predecessor was found and cut off
+   */
+  private boolean findAndRemoveEnding(StringBuffer stemmingZone,
+    char[][] theEndingClass, char[][] thePredessors)
+  {
+    int endingLength = findEnding(stemmingZone, theEndingClass);
+    if (endingLength == 0)
+      // not found
+      return false;
+    else
+    {
+      // look for a predecessor that ends directly in front of the ending
+      int predessorLength =
+        findEnding(stemmingZone,
+          stemmingZone.length() - endingLength - 1,
+          thePredessors);
+      if (predessorLength == 0)
+        return false;
+      else {
+        stemmingZone.setLength(stemmingZone.length() - endingLength);
+        // cut the ending found
+        return true;
+      }
+    }
+
+  }
+
+  /**
+   * Marks positions of RV, R1 and R2 in a given word.
+   * All three are reset to 0 first; a region that turns out to be empty
+   * keeps the value 0, and the search stops there.
+   * Creation date: (16/03/2002 3:40:11 PM)
+   */
+  private void markPositions(String word)
+  {
+    RV = 0;
+    R1 = 0;
+    R2 = 0;
+    int i = 0;
+    // find RV
+    // (skip to the first vowel; RV starts right behind it)
+    while (word.length() > i && !isVowel(word.charAt(i)))
+    {
+      i++;
+    }
+    if (word.length() - 1 < ++i)
+      return; // RV zone is empty
+    RV = i;
+    // find R1
+    // (skip the following vowels; R1 starts behind the first non-vowel)
+    while (word.length() > i && isVowel(word.charAt(i)))
+    {
+      i++;
+    }
+    if (word.length() - 1 < ++i)
+      return; // R1 zone is empty
+    R1 = i;
+    // find R2
+    // (the same vowel / non-vowel search once more, starting at R1)
+    while (word.length() > i && !isVowel(word.charAt(i)))
+    {
+      i++;
+    }
+    if (word.length() - 1 < ++i)
+      return; // R2 zone is empty
+    while (word.length() > i && isVowel(word.charAt(i)))
+    {
+      i++;
+    }
+    if (word.length() - 1 < ++i)
+      return; // R2 zone is empty
+    R2 = i;
+  }
+
+  /**
+   * Checks if character is a vowel.
+ * Creation date: (16/03/2002 10:47:03 PM) + * @return boolean + * @param letter char + */ + private boolean isVowel(char letter) + { + for (int i = 0; i < vowels.length; i++) + { + if (letter == charset[vowels[i]]) + return true; + } + return false; + } + + /** + * Noun endings. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean noun(StringBuffer stemmingZone) + { + return findAndRemoveEnding(stemmingZone, nounEndings); + } + + /** + * Perfective gerund endings. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean perfectiveGerund(StringBuffer stemmingZone) + { + return findAndRemoveEnding( + stemmingZone, + perfectiveGerundEndings1, + perfectiveGerund1Predessors) + || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2); + } + + /** + * Reflexive endings. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean reflexive(StringBuffer stemmingZone) + { + return findAndRemoveEnding(stemmingZone, reflexiveEndings); + } + + /** + * Insert the method's description here. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean removeI(StringBuffer stemmingZone) + { + if (stemmingZone.length() > 0 + && stemmingZone.charAt(stemmingZone.length() - 1) == charset[I]) + { + stemmingZone.setLength(stemmingZone.length() - 1); + return true; + } + else + { + return false; + } + } + + /** + * Insert the method's description here. 
+ * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean removeSoft(StringBuffer stemmingZone) + { + if (stemmingZone.length() > 0 + && stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT]) + { + stemmingZone.setLength(stemmingZone.length() - 1); + return true; + } + else + { + return false; + } + } + + /** + * Insert the method's description here. + * Creation date: (16/03/2002 10:58:42 PM) + * @param newCharset char[] + */ + public void setCharset(char[] newCharset) + { + charset = newCharset; + } + + /** + * Finds the stem for given Russian word. + * Creation date: (16/03/2002 3:36:48 PM) + * @return java.lang.String + * @param input java.lang.String + */ + public String stem(String input) + { + markPositions(input); + if (RV == 0) + return input; //RV wasn't detected, nothing to stem + StringBuffer stemmingZone = new StringBuffer(input.substring(RV)); + // stemming goes on in RV + // Step 1 + + if (!perfectiveGerund(stemmingZone)) + { + reflexive(stemmingZone); + // variable r is unused, we are just interested in the flow that gets + // created by logical expression: apply adjectival(); if that fails, + // apply verb() etc + boolean r = + adjectival(stemmingZone) + || verb(stemmingZone) + || noun(stemmingZone); + } + // Step 2 + removeI(stemmingZone); + // Step 3 + derivational(stemmingZone); + // Step 4 + superlative(stemmingZone); + undoubleN(stemmingZone); + removeSoft(stemmingZone); + // return result + return input.substring(0, RV) + stemmingZone.toString(); + } + + /** + * Superlative endings. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean superlative(StringBuffer stemmingZone) + { + return findAndRemoveEnding(stemmingZone, superlativeEndings); + } + + /** + * Undoubles N. 
+ * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean undoubleN(StringBuffer stemmingZone) + { + char[][] doubleN = { + { N, N } + }; + if (findEnding(stemmingZone, doubleN) != 0) + { + stemmingZone.setLength(stemmingZone.length() - 1); + return true; + } + else + { + return false; + } + } + + /** + * Verb endings. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean verb(StringBuffer stemmingZone) + { + return findAndRemoveEnding( + stemmingZone, + verbEndings1, + verb1Predessors) + || findAndRemoveEnding(stemmingZone, verbEndings2); + } + + /** + * Static method for stemming with different charsets + */ + public static String stem(String theWord, char[] charset) + { + RussianStemmer stemmer = new RussianStemmer(); + stemmer.setCharset(charset); + return stemmer.stem(theWord); + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/ArchimedesDocContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/ArchimedesDocContentHandler.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,184 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.doc; + +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; + +import org.xml.sax.*; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder; +import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars; + +public class ArchimedesDocContentHandler implements ContentHandler { + private String xmlnsString = ""; + private File outputFile; + private String language; + private String fromEncoding; + private String toEncoding; + private OutputStream out; + private Element currentElement; + + public 
ArchimedesDocContentHandler(String language, String fromEncoding, String toEncoding, File outputFile) throws ApplicationException {
+    this.language = language;
+    this.outputFile = outputFile;
+    this.fromEncoding = fromEncoding;
+    this.toEncoding = toEncoding;
+  }
+
+  // Opens the output file and writes the document prologue.
+  // NOTE(review): the string written here looks truncated in this copy of the
+  // patch (only "\n" is left) - confirm against the repository.
+  public void startDocument() throws SAXException {
+    try {
+      out = new BufferedOutputStream(new FileOutputStream(outputFile));
+      write("\n");
+    } catch (FileNotFoundException e) {
+      throw new SAXException(e);
+    }
+  }
+
+  // Closes the output stream; a failure while closing is deliberately ignored.
+  public void endDocument() throws SAXException {
+    try {
+      if (out != null)
+        out.close();
+    } catch (Exception e) {
+      // nothing: always close the stream at the end of the method
+    }
+  }
+
+  // Writes character data to the output. The text is transcoded first when it
+  // is not inside one of the metadata elements and either no element is open
+  // or the document language is Greek or Arabic; it is always XML-escaped.
+  public void characters(char[] c, int start, int length) throws SAXException {
+    char[] cCopy = new char[length];
+    System.arraycopy(c, start, cCopy, 0, length);
+    String charactersStr = String.valueOf(cCopy);
+    if (charactersStr != null) {
+      String elemName = null;
+      if (currentElement != null)
+        elemName = currentElement.name;
+      if ((!
isArchMetadata(elemName)) && (currentElement == null || currentElement.isGreek() || currentElement.isArabic())) {
+        try {
+          // only these two encoding pairs are transcoded; everything else passes through
+          if (fromEncoding.equals("betacode") && toEncoding.equals("unicode")) {
+            charactersStr = transcodeFromBetaCode2Unicode(charactersStr);
+          } else if (fromEncoding.equals("buckwalter") && toEncoding.equals("unicode")) {
+            charactersStr = transcodeFromBuckwalter2Unicode(charactersStr);
+          }
+        } catch (ApplicationException e) {
+          throw new SAXException(e);
+        }
+      }
+      charactersStr = StringUtilEscapeChars.forXML(charactersStr);
+      // remembers only the most recent chunk: a later call overwrites the value
+      if (currentElement != null)
+        currentElement.value = charactersStr;
+      write(charactersStr);
+    }
+  }
+
+  // Ignorable whitespace is dropped.
+  public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
+  }
+
+  // Processing instructions are dropped.
+  public void processingInstruction(String target, String data) throws SAXException {
+
+  }
+
+  public void setDocumentLocator(Locator locator) {
+  }
+
+  // Collects namespace declarations; they are written out with the next start tag.
+  public void startPrefixMapping(String prefix, String uri) throws SAXException {
+    xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
+  }
+
+  public void endPrefixMapping(String prefix) throws SAXException {
+  }
+
+  public void skippedEntity(String name) throws SAXException {
+  }
+
+  // Writes the start tag of an element together with its attributes and the
+  // namespace declarations collected since the previous start tag.
+  // NOTE(review): this method is damaged in this copy of the patch - the loop
+  // header, the attribute handling and the first write() branch are missing
+  // between "i" and "\");". Restore it from the repository before changing it.
+  public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {
+    currentElement = new Element(language, name);
+    int attrSize = attrs.getLength();
+    String attrString = "";
+    for (int i=0; i");
+    } else {
+      currentElement.xmlnsString = xmlnsString;
+      write("<" + name + " " + xmlnsString + attrString + ">");
+    }
+    xmlnsString = "";
+  }
+
+  // Forgets the current element.
+  // NOTE(review): the string written here is empty in this copy of the patch;
+  // presumably the end tag was lost - confirm against the repository.
+  public void endElement(String uri, String localName, String name) throws SAXException {
+    currentElement = null;
+    write("");
+  }
+
+  // Encodes the string as UTF-8, writes it to the output stream and flushes
+  // after every call; I/O failures are reported as SAXException.
+  private void write(String outStr) throws SAXException {
+    try {
+      byte[] bytes = outStr.getBytes("utf-8");
+      out.write(bytes, 0, bytes.length);
+      out.flush();
+    } catch (IOException e) {
+      throw new SAXException(e);
+    }
+  }
+
+  // Delegates to the Transcoder obtained from Transcoder.getInstance().
+  private String transcodeFromBetaCode2Unicode(String inputStr)
throws ApplicationException {
+    Transcoder transcoder = Transcoder.getInstance();
+    String encodedUnicodeStr = transcoder.transcodeFromBetaCode2Unicode(inputStr);
+    return encodedUnicodeStr;
+  }
+
+  // Delegates to the Transcoder obtained from Transcoder.getInstance().
+  private String transcodeFromBuckwalter2Unicode(String inputStr) throws ApplicationException {
+    Transcoder transcoder = Transcoder.getInstance();
+    String encodedUnicodeStr = transcoder.transcodeFromBuckwalter2Unicode(inputStr);
+    return encodedUnicodeStr;
+  }
+
+  // Returns true when the element name (compared lower-cased and trimmed) is
+  // one of the metadata element names listed below; null yields false.
+  private boolean isArchMetadata(String elemName) {
+    boolean isArchMetadata = false;
+    if (elemName == null)
+      return false;
+    String elName = elemName.toLowerCase().trim();
+    if (elName.equals("info") || elName.equals("author") || elName.equals("title") || elName.equals("date") || elName.equals("place")
+        || elName.equals("translator") || elName.equals("lang") || elName.equals("cvs_file") || elName.equals("cvs_version") || elName.equals("comments") || elName.equals("locator")) {
+      isArchMetadata = true;
+    }
+    return isArchMetadata;
+  }
+
+  // Record for the element that is currently open. The language is the one
+  // passed to the handler's constructor, not a per-element attribute.
+  private class Element {
+    String name;
+    String language;
+    String xmlnsString;
+    String attrString;
+    String value;
+
+    Element(String language, String name) {
+      this.language = language;
+      this.name = name;
+    }
+
+    // true when the language code is "el"
+    boolean isGreek() {
+      boolean isGreek = false;
+      if (language != null && language.equals("el"))
+        isGreek = true;
+      return isGreek;
+    }
+
+    // true when the language code is "ar"
+    boolean isArabic() {
+      boolean isArabic = false;
+      if (language != null && language.equals("ar"))
+        isArabic = true;
+      return isArabic;
+    }
+  }
+}
diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/ArchimedesDocForeignLangContentHandler.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/ArchimedesDocForeignLangContentHandler.java Wed Nov 24 17:24:23 2010 +0100
@@ -0,0 +1,176 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.doc;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import
java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; + +import org.xml.sax.*; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder; +import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars; + +public class ArchimedesDocForeignLangContentHandler implements ContentHandler { + private String xmlnsString = ""; + private File outputFile; + private OutputStream out; + private Element currentElement; + private boolean currentElementIsForeign = false; + + public ArchimedesDocForeignLangContentHandler(File outputFile) throws ApplicationException { + this.outputFile = outputFile; + } + + public void startDocument() throws SAXException { + try { + out = new BufferedOutputStream(new FileOutputStream(outputFile)); + write("\n"); + } catch (FileNotFoundException e) { + throw new SAXException(e); + } + } + + public void endDocument() throws SAXException { + try { + if (out != null) + out.close(); + } catch (Exception e) { + // nothing: always close the stream at the end of the method + } + } + + public void characters(char[] c, int start, int length) throws SAXException { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (charactersStr != null) { + String elemName = null; + if (currentElement != null) + elemName = currentElement.name; + if ((! 
isArchMetadata(elemName)) && (currentElementIsForeign)) { + try { + charactersStr = transcodeFromBetaCode2Unicode(charactersStr); + } catch (ApplicationException e) { + throw new SAXException(e); + } + } + charactersStr = StringUtilEscapeChars.forXML(charactersStr); + if (currentElement != null) + currentElement.value = charactersStr; + write(charactersStr); + } + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + + } + + public void setDocumentLocator(Locator locator) { + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + Element newElement = new Element(null, name); + if (currentElement != null) { + if (currentElement.language != null) + newElement.language = currentElement.language; // language wird an Kinder vererbt + } + currentElement = newElement; + int attrSize = attrs.getLength(); + String attrString = ""; + for (int i=0; i"); + } else { + currentElement.xmlnsString = xmlnsString; + write("<" + name + " " + xmlnsString + attrString + ">"); + } + xmlnsString = ""; + } + + public void endElement(String uri, String localName, String name) throws SAXException { + if (name.equals("foreign")) + currentElementIsForeign = false; + currentElement = null; + write(""); + } + + private void write(String outStr) throws SAXException { + try { + byte[] bytes = outStr.getBytes("utf-8"); + out.write(bytes, 0, bytes.length); + out.flush(); + } catch (IOException e) { + throw new SAXException(e); + } + } + + private String transcodeFromBetaCode2Unicode(String inputStr) throws 
ApplicationException { + Transcoder transcoder = Transcoder.getInstance(); + String encodedUnicodeStr = transcoder.transcodeFromBetaCode2Unicode(inputStr); + return encodedUnicodeStr; + } + + private boolean isArchMetadata(String elemName) { + boolean isArchMetadata = false; + if (elemName == null) + return false; + String elName = elemName.toLowerCase().trim(); + if (elName.equals("info") || elName.equals("author") || elName.equals("title") || elName.equals("date") || elName.equals("place") + || elName.equals("translator") || elName.equals("lang") || elName.equals("cvs_file") || elName.equals("cvs_version") || elName.equals("comments") || elName.equals("locator")) { + isArchMetadata = true; + } + return isArchMetadata; + } + + private class Element { + String name; + String language; + String xmlnsString; + String attrString; + String value; + + Element(String language, String name) { + this.language = language; + this.name = name; + } + + boolean isGreek() { + boolean isGreek = false; + if (language != null && (language.equals("el") || language.equals("greek") || language.equals("grc"))) + isGreek = true; + return isGreek; + } + + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/ArchimedesDocManager.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/ArchimedesDocManager.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,147 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.doc; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FilenameFilter; +import java.io.IOException; +import java.io.InputStream; +import java.util.Date; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import 
de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.util.FilenameFilterExtension; + +public class ArchimedesDocManager { + private static ArchimedesDocManager instance; + private static String MPDL_DOC_DIR = MpdlConstants.MPDL_DOC_DIR; + private static String ARCH_DOC_DIR = MPDL_DOC_DIR + "/documents/archimedes"; + private static String ARCH_DOC_OUT_DIR = MPDL_DOC_DIR + "/documentsTranscodedToUnicode/archimedes"; + private ArchimedesDocContentHandler archimedesDocContentHandler; + private ArchimedesDocForeignLangContentHandler archimedesDocForeignLangContentHandler; + private Date beginOfOperation; + private Date endOfOperation; + + public static ArchimedesDocManager getInstance() throws ApplicationException { + if (instance == null) { + instance = new ArchimedesDocManager(); + } + return instance; + } + + /** + * + */ + public static void main(String[] args) throws ApplicationException { + getInstance(); + instance.beginOperation(); + System.out.print("Start ..."); + // Greek + String inputDirGreek = ARCH_DOC_DIR + "/el"; + String outputDirGreek = ARCH_DOC_OUT_DIR + "/el"; + // instance.transcodeDirectory("el", "betacode", "unicode", inputDirGreek, outputDirGreek); + // Arabic + String inputDirArabic = ARCH_DOC_DIR + "/ar"; + String outputDirArabic = ARCH_DOC_OUT_DIR + "/ar"; + // instance.transcodeDirectory("ar", "buckwalter", "unicode", inputDirArabic, outputDirArabic); + + // Foreign lang=greek transcoding + instance.transcodeForeignLangFiles(); + + instance.end(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + System.out.println("Needed time: " + elapsedTime + " seconds"); + } + + private void transcodeDirectory(String language, String fromEncoding, String toEncoding, String inputDirName, String outputDirName) throws ApplicationException { + File inputDir = 
new File(inputDirName); + FilenameFilter filter = new FilenameFilterExtension("xml"); + File[] files = inputDir.listFiles(filter); + for (int i=0; i < files.length; i++) { + File inputFile = files[i]; + String outputFileName = inputFile.getName(); + File outputFile = new File(outputDirName + "/" + outputFileName); + File outputDir = new File(outputFile.getParent()); + if (! outputDir.exists()) { + outputDir.mkdirs(); // create the directory including parent directories which do not exist + } + transcodeFile(language, fromEncoding, toEncoding, inputFile, outputFile); + } + } + + private void transcodeFile(String language, String fromEncoding, String toEncoding, File inputFile, File outputFile) throws ApplicationException { + archimedesDocContentHandler = new ArchimedesDocContentHandler(language, fromEncoding, toEncoding, outputFile); + try { + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(archimedesDocContentHandler); + InputStream inputStream = new FileInputStream(inputFile); + BufferedInputStream bufferedInputStream = new BufferedInputStream(inputStream); + InputSource input = new InputSource(bufferedInputStream); + xmlParser.parse(input); + bufferedInputStream.close(); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void transcodeForeignLangFiles() throws ApplicationException { + String[] languages = {"en", "fr", "it", "la"}; + for (int i=0; i elementQueue; + + public DictionarizerContentHandler(String language) throws ApplicationException { + this.language = language; + } + + public String getXmlFragment() { + return outputXmlFragment; + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + String rootElemToStr = rootElement.toXmlString(); + write(rootElemToStr); + write("\n"); + } + + public void characters(char[] c, int start, int length) throws SAXException { + char[] cCopy = 
new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (charactersStr != null && ! charactersStr.equals("")) { + if (currentElement != null) { + Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS); + charElement.value = StringUtilEscapeChars.deresolveXmlEntities(charactersStr); + if (currentElement.composites == null) + currentElement.composites = new ArrayList(); + currentElement.composites.add(charElement); + } + } + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(Locator locator) { + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + if (elementQueue == null) + elementQueue = new ArrayList(); + Element newElement = new Element(name); // element of type: complex + if (currentElement != null) { + if (currentElement.composites == null) + currentElement.composites = new ArrayList(); + if (currentElement.lang != null) + newElement.lang = currentElement.lang; // language wird an Kinder vererbt + currentElement.composites.add(newElement); + } + currentElement = newElement; + int attrSize = attrs.getLength(); + String attrString = ""; + for (int i=0; i 0) { + int lastIndex = elementQueue.size() - 1; + elementQueue.remove(lastIndex); + } + if (elementQueue != null && elementQueue.size() > 0) { + int lastIndex = elementQueue.size() - 1; + currentElement = elementQueue.get(lastIndex); + } else { + currentElement = null; + } + } + + public int getCharIndex(String 
compositesCharsDictionarized, int indexComplexElemCompositesCharsWithMarks) { + if (indexComplexElemCompositesCharsWithMarks == 0) + return -1; + int size = compositesCharsDictionarized.length(); + if (size == 0) + return -1; + int index = 0; /* number of characters seen so far that lie outside of tags */ + int counter = 0; /* absolute position in compositesCharsDictionarized */ + boolean isInTag = false; + boolean success = false; + while (!success) { + if (counter >= size) /* string exhausted before the requested index was reached: charAt(size) would throw, so report "not found" */ + return -1; + char c = compositesCharsDictionarized.charAt(counter); + if (c == '<') + isInTag = true; + if (! isInTag) + index++; + if (index == indexComplexElemCompositesCharsWithMarks) + success = true; + if (c == '>') + isInTag = false; + counter++; + } + return counter + 1; + } + + private void write(String outStr) throws SAXException { + outputXmlFragment += outStr; + } + + private class Element { + private int type; + private String name; + private String xmlnsString; + private String attrString; + private String value; + private String lang; // normalerweise mit dem Wert aus dem Attribut xml:lang belegt bzw. mit dem aus dem Vaterknoten wererbten xml:lang-Wert + private ArrayList composites; + + private Element(String name) { + this.type = ELEMENT_TYPE_COMPLEX; + this.name = name; + } + + private Element(String name, int type) { + this.type = type; + this.name = name; + } + + private boolean isComplex() { + boolean isComplex = false; + if (type == ELEMENT_TYPE_COMPLEX) + isComplex = true; + return isComplex; + } + + private String toXmlString() throws SAXException { + String retString = ""; + String elemLanguage = language; // default value for the document/page + if (lang != null) + elemLanguage = lang; // der Wert des Elements falls vorhanden + // write this element + if (!
isComplex()) { + retString += value; + } else { + String xmlNsString = this.xmlnsString; + if (xmlNsString == null || xmlNsString.equals("")) { + retString = retString + "<" + name + attrString + ">"; + } else { + retString = retString + "<" + name + " " + xmlNsString + attrString + ">"; + } + if (composites != null) { + String compositesChars = ""; + String compositesCharsWithMarks = ""; + ArrayList complexElements = new ArrayList(); + for (int i=0; i 0) { + for (int i=0; i 0) { + firstPiece = compositesCharsDictionarized.substring(0, indexComplexElemCompositesCharsDictionarized - 1); + compositesCharsDictionarized = compositesCharsDictionarized.substring(indexComplexElemCompositesCharsDictionarized - 1); + } + retString = retString + firstPiece + complexElementStr; + compositesCharsWithMarks = compositesCharsWithMarks.substring(indexComplexElemCompositesCharsWithMarks + MARK_SIZE); + } + retString = retString + compositesCharsDictionarized; // last one must also be added + } else { + retString = retString + compositesCharsDictionarized; // last one must also be added + } + } + retString = retString + ""; + } + return retString; + } + + private String characters2DictWords(String charactersStrDeresolved, String language) throws SAXException { + String charactersStr = StringUtilEscapeChars.resolveXmlEntities(charactersStrDeresolved); + String retStr = ""; + try { + MpdlTokenizerAnalyzer dictionarizerAnalyzer = new MpdlTokenizerAnalyzer(language); + ArrayList wordTokens = dictionarizerAnalyzer.getToken(charactersStr); + int endPos = 0; + for (int i=0; i < wordTokens.size(); i++) { + Token wordToken = wordTokens.get(i); + int startPos = wordToken.startOffset(); + String beforeStr = charactersStr.substring(endPos, startPos); + endPos = wordToken.endOffset(); + String wordStr = charactersStr.substring(startPos, endPos); + String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr); + String wordStrDeresolved = 
StringUtilEscapeChars.deresolveXmlEntities(wordStr); + String wordTokenText = wordToken.termText(); + LexHandler lexHandler = LexHandler.getInstance(); + // delivers lex entries by help of the morphology component (lex entry of the stem of the normalized word form) + ArrayList lexEntryKeys = lexHandler.getLexEntryKeys(wordTokenText, language, false); + if (lexEntryKeys != null) { + String lexForms = ""; + for (int j=0; j" + wordStrDeresolved + ""; + } else { + retStr = retStr + beforeStrDeresolved + wordStrDeresolved; + } + } + String lastAfterStr = charactersStr.substring(endPos); + String lastAfterStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(lastAfterStr); + retStr = retStr + lastAfterStrDeresolved; + } catch (ApplicationException e) { + throw new SAXException(e); + } + return retStr; + } + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,145 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.doc; + +import java.util.ArrayList; + +import org.apache.lucene.analysis.Token; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlNormalizer; +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlTokenizerAnalyzer; +import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars; + +public class NormalizeCharsContentHandler implements ContentHandler { + private String xmlnsString = ""; + private String[] normalizeFunctions = {}; // default: without normalize functions + private String language; + private String outputXmlFragment = ""; + private Element currentElement; + + public 
NormalizeCharsContentHandler(String[] normalizeFunctions, String language) throws ApplicationException { + this.normalizeFunctions = normalizeFunctions; + this.language = language; + } + + public String getXmlFragment() { + return outputXmlFragment; + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + } + + public void characters(char[] c, int start, int length) throws SAXException { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (charactersStr != null && ! charactersStr.equals("")) { + charactersStr = normalize(charactersStr); + if (currentElement != null) + currentElement.value = charactersStr; + write(charactersStr); + } + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(Locator locator) { + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + currentElement = new Element(language, name); + int attrSize = attrs.getLength(); + String attrString = ""; + for (int i=0; i"); + } else { + currentElement.xmlnsString = xmlnsString; + write("<" + name + " " + xmlnsString + attrString + ">"); + } + xmlnsString = ""; + } + + public void endElement(String uri, String localName, String name) throws SAXException { + currentElement = null; + write(""); + } + + private void write(String outStr) throws SAXException { + outputXmlFragment += outStr; + } + + private String normalize(String charactersStr) throws 
SAXException { + String retStr = ""; + try { + MpdlTokenizerAnalyzer tokenizerAnalyzer = new MpdlTokenizerAnalyzer(language); + ArrayList wordTokens = tokenizerAnalyzer.getToken(charactersStr); + int endPos = 0; + for (int i=0; i < wordTokens.size(); i++) { + Token wordToken = wordTokens.get(i); + int startPos = wordToken.startOffset(); + String beforeStr = charactersStr.substring(endPos, startPos); + String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr); + endPos = wordToken.endOffset(); + String wordStr = charactersStr.substring(startPos, endPos); + + MpdlNormalizer mpdlNormalizer = new MpdlNormalizer(normalizeFunctions, language); + String normalizedWordStr = mpdlNormalizer.normalize(wordStr); + + String normalizedWordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(normalizedWordStr); + // String wordTokenText = wordToken.termText(); + retStr = retStr + beforeStrDeresolved + normalizedWordStrDeresolved; + } + String lastAfterStr = charactersStr.substring(endPos); + String lastAfterStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(lastAfterStr); + retStr = retStr + lastAfterStrDeresolved; + } catch (ApplicationException e) { + throw new SAXException(e); + } + return retStr; + } + + private class Element { + String name; + String language; + String xmlnsString; + String attrString; + String value; + + Element(String language, String name) { + this.language = language; + this.name = name; + } + + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/regularization/DBRegularizationHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/regularization/DBRegularizationHandler.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,146 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization; + +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; + +import com.sleepycat.je.Cursor; 
+import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; + +public class DBRegularizationHandler { + private String dbDirectory; + private DbEnvRegularization regDbEnv; + + public DBRegularizationHandler(String dbDir) { + this.dbDirectory = dbDir; + } + + public void start() throws ApplicationException { + regDbEnv = new DbEnvRegularization(); + regDbEnv.setDataDir(dbDirectory); + regDbEnv.init(); // open databases in read/write mode + } + + public void openDatabases() throws ApplicationException { + regDbEnv.openDatabases(); + } + + public void closeDatabases() throws ApplicationException { + regDbEnv.close(); + } + + public void deleteData() throws ApplicationException { + regDbEnv.removeDatabases(); + } + + public void writeOrigReg(Regularization reg) throws ApplicationException { + try { + String language = Language.getInstance().getLanguageId(reg.getLanguage()); + String keyStr = language + "###" + reg.getOrig(); + String valueStr = reg.getXmlString(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + DatabaseEntry dbEntryValue = new DatabaseEntry(valueStr.getBytes("utf-8")); + Database origDB = regDbEnv.getOrigDB(); + origDB.put(null, dbEntryKey, dbEntryValue); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + public void writeNormReg(Regularization reg) throws ApplicationException { + try { + String language = Language.getInstance().getLanguageId(reg.getLanguage()); + String keyStr = language + "###" + reg.getNorm(); + String valueStr = reg.getXmlString(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + DatabaseEntry dbEntryValue = new 
DatabaseEntry(valueStr.getBytes("utf-8")); + Database normDB = regDbEnv.getNormDB(); + normDB.put(null, dbEntryKey, dbEntryValue); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + /** Deletes the given regularization from both the orig and the norm database. Both keys are built as languageId + "###" + form, the same way writeOrigReg and writeNormReg build them. Database and encoding errors are rethrown as ApplicationException. */ public void deleteReg(Regularization reg) throws ApplicationException { + try { + String language = Language.getInstance().getLanguageId(reg.getLanguage()); + String keyStrOrig = language + "###" + reg.getOrig(); + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStrOrig.getBytes("utf-8")); + Database origDB = regDbEnv.getOrigDB(); + origDB.delete(null, dbEntryKey); + String keyStrNorm = language + "###" + reg.getNorm(); /* must use the normalized language id: writeNormReg stores under that id, not under the raw reg.getLanguage() */ + dbEntryKey = new DatabaseEntry(keyStrNorm.getBytes("utf-8")); + Database normDB = regDbEnv.getNormDB(); + normDB.delete(null, dbEntryKey); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + public ArrayList readRegsByOrig(String lang, String orig) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList retRegs = new ArrayList(); + String hashKey = language + "###" + orig; + try { + Database origDB = regDbEnv.getOrigDB(); + Cursor cursor = origDB.openCursor(null, null); + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + byte[] foundValueBytes = foundValue.getData(); + String foundValueStr = new String(foundValueBytes, "utf-8"); + Regularization reg = Regularization.getInstance(foundValueStr); + retRegs.add(reg); + operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT); + } + cursor.close(); + } catch
(DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retRegs; + } + + public ArrayList readRegsByNorm(String lang, String norm) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList retRegs = new ArrayList(); + String hashKey = language + "###" + norm; + try { + Database normDB = regDbEnv.getNormDB(); + Cursor cursor = normDB.openCursor(null, null); + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + byte[] foundValueBytes = foundValue.getData(); + String foundValueStr = new String(foundValueBytes, "utf-8"); + Regularization reg = Regularization.getInstance(foundValueStr); + retRegs.add(reg); + operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retRegs; + } + +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/regularization/DbEnvRegularization.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/regularization/DbEnvRegularization.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,100 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization; + +import java.io.File; + +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseConfig; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; + +import 
de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class DbEnvRegularization { + private String dataDir; + private File envPath; + private Environment env; + private EnvironmentConfig envConfig; + private DatabaseConfig dbConfig; + private Database origDB; + private Database normDB; + + public DbEnvRegularization() { + } + + public void setDataDir(String dataDir) { + this.dataDir = dataDir; + } + + public void init() throws ApplicationException { + try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envConfig.setReadOnly(false); + dbConfig.setReadOnly(false); + envConfig.setAllowCreate(true); + dbConfig.setAllowCreate(true); + envConfig.setTransactional(true); + dbConfig.setTransactional(true); + // allow duplicates for keys + dbConfig.setSortedDuplicates(true); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void openDatabases() throws ApplicationException { + try { + // open databases (and create them if they do not exist) + origDB = env.openDatabase(null, "OrigDB", dbConfig); + normDB = env.openDatabase(null, "NormDB", dbConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void removeDatabases() throws ApplicationException { + try { + if (origDB != null) + origDB.close(); + if (normDB != null) + normDB.close(); + env.removeDatabase(null, "OrigDB"); + env.removeDatabase(null, "NormDB"); + origDB = null; + normDB = null; + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public Environment getEnv() { + return env; + } + + public Database getNormDB() { + return normDB; + } + + public Database getOrigDB() { + return origDB; + } + + public void close() throws ApplicationException { + if (env != null) { + try { + if (origDB != null) + origDB.close(); + if (normDB != null) + normDB.close(); + if (env != null) + env.close(); + } 
catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + } +} + diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/regularization/Regularization.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/regularization/Regularization.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,89 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars; +import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil; + +public class Regularization { + private String language; + private String orig; + private String norm; + private String source; + private int sourcePosition; + + public Regularization(String language, String orig, String norm, String source) { + this.language = language; + this.orig = orig; + this.norm = norm; + this.source = source; + } + + /** Builds a Regularization from its XML form by evaluating //language, //orig, //norm, //source and //source/@position on xmlStr. A missing or empty position yields source position 0. */ public static Regularization getInstance(String xmlStr) throws ApplicationException { + XmlUtil xmlUtil = XmlUtil.getInstance(); + String language = xmlUtil.evaluateToString(xmlStr, "//language", null); + String orig = xmlUtil.evaluateToString(xmlStr, "//orig", null); + String norm = xmlUtil.evaluateToString(xmlStr, "//norm", null); + String source = xmlUtil.evaluateToString(xmlStr, "//source", null); + String sourcePosStr = xmlUtil.evaluateToString(xmlStr, "//source/@position", null); + int sourcePos = (sourcePosStr == null || sourcePosStr.trim().length() == 0) ? 0 : Integer.parseInt(sourcePosStr.trim()); /* an entry without a source has no position; do not fail with NumberFormatException on it */ + Regularization reg = new Regularization(language, orig, norm, source); + reg.setSourcePosition(sourcePos); + return reg; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public String getOrig() { + return orig; + } + + public void setOrig(String orig) { + this.orig = orig; + } + + public String getNorm() { + return norm; + } + + public void setNorm(String norm) { +
this.norm = norm; + } + + public String getSource() { + return source; + } + + public void setSource(String source) { + this.source = source; + } + + public int getSourcePosition() { + return sourcePosition; + } + + public void setSourcePosition(int sourcePosition) { + this.sourcePosition = sourcePosition; + } + + public String getXmlString() { + String xmlString = "\n"; + if (language != null) + xmlString += " " + language + "\n"; + if (orig != null) + xmlString += " " + StringUtilEscapeChars.deresolveXmlEntities(orig) + "\n"; + if (norm != null) + xmlString += " " + StringUtilEscapeChars.deresolveXmlEntities(norm) + "\n"; + if (source != null) + xmlString += " " + StringUtilEscapeChars.deresolveXmlEntities(source) + "\n"; + xmlString += "\n"; + return xmlString; + } + + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/regularization/RegularizationManager.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/regularization/RegularizationManager.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,287 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization; + +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FilenameFilter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Date; +import java.util.Enumeration; +import java.util.Hashtable; + +import javax.xml.namespace.NamespaceContext; + +import org.w3c.dom.Node; +import org.xml.sax.InputSource; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; +import de.mpg.mpiwg.berlin.mpdl.lucene.LuceneUtil; +import de.mpg.mpiwg.berlin.mpdl.util.FileUtil; +import de.mpg.mpiwg.berlin.mpdl.util.FilenameFilterExtension; +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil; +import 
de.mpg.mpiwg.berlin.mpdl.xml.SchemaHandler; + +public class RegularizationManager { + private static RegularizationManager instance; + private static String MPDL_DOC_DIR = MpdlConstants.MPDL_DOC_DIR; + private static String MPDL_EXIST_DATA_DIR = MpdlConstants.MPDL_EXIST_DATA_DIR; + private static String ECHO_DOC_DIR = MPDL_DOC_DIR + "/documents/echo"; + private static String REGULARIZATION_DATA_DIR = MPDL_EXIST_DATA_DIR + "/dataFiles/regularization"; + private static String REGULARIZATION_DB_DIR = MPDL_EXIST_DATA_DIR + "/dataBerkeleyDB/regularization"; + private static String[] LANGUAGES = {"ar", "de", "el", "en", "fr", "it", "la", "nl", "zh"}; + private DBRegularizationHandler dbRegHandler; + private Hashtable> regsOrig; + private Hashtable> regsNorm; + private Date beginOfOperation; + private Date endOfOperation; + + public static RegularizationManager getInstance() throws ApplicationException { + if (instance == null) { + instance = new RegularizationManager(); + instance.init(); + } + return instance; + } + + public static void main(String[] args) throws ApplicationException { + getInstance(); + instance.beginOperation(); + System.out.print("Start ..."); + + // instance.writeAllRegs(); + + ArrayList regs = instance.findRegsByNorm("la", "Illiusque"); + Regularization bla = regs.get(0); + + instance.end(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + System.out.println("Needed time: " + elapsedTime + " seconds"); + } + + private void init() throws ApplicationException { + regsOrig = new Hashtable>(); + regsNorm = new Hashtable>(); + dbRegHandler = new DBRegularizationHandler(REGULARIZATION_DB_DIR); + dbRegHandler.start(); + dbRegHandler.openDatabases(); + } + + public ArrayList findRegsByOrig(String language, String orig) throws ApplicationException { + orig = orig.toLowerCase(); + String hashKey = language + "###" + orig; + 
ArrayList regs = regsOrig.get(hashKey); + if (regs == null) { + regs = dbRegHandler.readRegsByOrig(language, orig); + if (regs == null || regs.isEmpty()) + regsOrig.put(hashKey, new ArrayList()); + else + regsOrig.put(hashKey, regs); + } + return regs; + } + + public ArrayList findRegsByNorm(String language, String norm) throws ApplicationException { + norm = norm.toLowerCase(); + String hashKey = language + "###" + norm; + ArrayList regs = regsNorm.get(hashKey); + if (regs == null || regs.isEmpty()) { + regs = dbRegHandler.readRegsByNorm(language, norm); + if (regs == null) + regsNorm.put(hashKey, new ArrayList()); + else + regsNorm.put(hashKey, regs); + } + return regs; + } + + public ArrayList getRegOrigsByNormLuceneQueryString(String language, String luceneQueryString) throws ApplicationException { + ArrayList regForms = new ArrayList(); + LuceneUtil luceneUtil = LuceneUtil.getInstance(); + ArrayList variants = luceneUtil.getVariantsFromLuceneQuery(luceneQueryString); + if (variants != null) { + for (int i=0; i regs = findRegsByNorm(language, variant); + if (regs != null) { + for (int j=0; j hashTableRegOrig = new Hashtable(); + Hashtable hashTableRegNorm = new Hashtable(); + File docFile = new File(docFileName); + // hack: in the two hashTables all regs are hold + getRegs(language, docFile, hashTableRegOrig, hashTableRegNorm); + // write all regs to DB + writeRegsOrigToDb(hashTableRegOrig); + writeRegsNormToDb(hashTableRegNorm); + } + + private void getRegs(String language, File docFile, Hashtable hashTableRegOrig, Hashtable hashTableRegNorm) throws ApplicationException { + XmlUtil xmlUtil = XmlUtil.getInstance(); + InputSource docFileInputSource = new InputSource(docFile.toURI().getPath()); + SchemaHandler echoSchemaHandler = new SchemaHandler(); + NamespaceContext echoNS = echoSchemaHandler.getEchoNsContext(); + ArrayList regsArray = xmlUtil.evaluateToNodeArray(docFileInputSource, "//echo:reg", echoNS); + if (regsArray != null) { + String docFileName = 
docFile.getName(); + for (int i=0; i hashTableRegOrig) throws ApplicationException { + Enumeration regElements = hashTableRegOrig.elements(); + while (regElements.hasMoreElements()) { + Regularization reg = regElements.nextElement(); + boolean regAlreadyExists = false; + String language = reg.getLanguage(); + String orig = reg.getOrig(); + String norm = reg.getNorm(); + ArrayList existingRegs = dbRegHandler.readRegsByOrig(language, orig); + if (existingRegs != null && existingRegs.size() > 0) { + for (int i=0; i hashTableRegNorm) throws ApplicationException { + Enumeration regElements = hashTableRegNorm.elements(); + while (regElements.hasMoreElements()) { + Regularization reg = regElements.nextElement(); + boolean regAlreadyExists = false; + String language = reg.getLanguage(); + String orig = reg.getOrig(); + String norm = reg.getNorm(); + ArrayList existingRegs = dbRegHandler.readRegsByNorm(language, norm); + if (existingRegs != null && existingRegs.size() > 0) { + for (int i=0; i 0) { + Hashtable hashTableRegOrig = new Hashtable(); + Hashtable hashTableRegNorm = new Hashtable(); + for (int j=0; j\n", regOut); + writeRegs(hashTableRegOrig, regOut); + writeRegsToDb(hashTableRegOrig, hashTableRegNorm); + write("", regOut); + } + } + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } finally { + // always close the stream + if (regOut != null) try { regOut.close(); } catch (Exception e) { } + } + } + + private void writeRegs(Hashtable hashTableReg, BufferedOutputStream regOut) throws ApplicationException { + Enumeration regElements = hashTableReg.elements(); + while (regElements.hasMoreElements()) { + Regularization reg = regElements.nextElement(); + String regXmlStr = reg.getXmlString(); + write(regXmlStr, regOut); + } + } + + private void writeRegsToDb(Hashtable hashTableRegOrig, Hashtable hashTableRegNorm) throws ApplicationException { + Enumeration regElements = hashTableRegOrig.elements(); + while (regElements.hasMoreElements()) { + 
Regularization reg = regElements.nextElement(); + dbRegHandler.writeOrigReg(reg); + } + regElements = hashTableRegNorm.elements(); + while (regElements.hasMoreElements()) { + Regularization reg = regElements.nextElement(); + dbRegHandler.writeNormReg(reg); + } + } + + private void deleteDbData() throws ApplicationException { + dbRegHandler.deleteData(); + } + + private void write(String inputString, BufferedOutputStream out) throws ApplicationException { + try { + byte[] bytes = inputString.getBytes("utf-8"); + out.write(bytes, 0, bytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + public void end() throws ApplicationException { + dbRegHandler.closeDatabases(); + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } + +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Betacode2Unicode.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Betacode2Unicode.lex Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,332 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +%% +%{ + /* + * Betacode to Unicode conversion + */ + + private String toUnicode(int in) { + char c = (char) in; + Character ch = new Character(c); + String retString = ch.toString(); + return retString; + } + +%} + +%class Betacode2UnicodeLex +%public +%type java.lang.String +%unicode +%% + + +"<"[^>]+">" { return yytext(); } + +"*j" { return "H"; } +"j" { return "h"; } +"*v" { return "F"; } +"v" { return "f"; } +"*s" { return toUnicode(0x03a3); } + +"!" 
{ return "."; } +":" { return toUnicode(0x00B7); } /* MPDL update */ + +"a)" { return toUnicode(0x1F00); } +"a(" { return toUnicode(0x1F01); } +"a)\\" { return toUnicode(0x1F02); } +"a(\\" { return toUnicode(0x1F03); } +"a)/" { return toUnicode(0x1F04); } +"a(/" { return toUnicode(0x1F05); } +"a)=" { return toUnicode(0x1F06); } +"a(=" { return toUnicode(0x1F07); } +"*)a" { return toUnicode(0x1F08); } +"*(a" { return toUnicode(0x1F09); } +"*)\\a" { return toUnicode(0x1F0A); } +"*(\\a" { return toUnicode(0x1F0B); } +"*)/a" { return toUnicode(0x1F0C); } +"*(/a" { return toUnicode(0x1F0D); } +"*)=a" { return toUnicode(0x1F0E); } +"*(=a" { return toUnicode(0x1F0F); } +"e)" { return toUnicode(0x1F10); } +"e(" { return toUnicode(0x1F11); } +"e)\\" { return toUnicode(0x1F12); } +"e(\\" { return toUnicode(0x1F13); } +"e)/" { return toUnicode(0x1F14); } +"e(/" { return toUnicode(0x1F15); } +"*)e" { return toUnicode(0x1F18); } +"*(e" { return toUnicode(0x1F19); } +"*)\\e" { return toUnicode(0x1F1A); } +"*(\\e" { return toUnicode(0x1F1B); } +"*)/e" { return toUnicode(0x1F1C); } +"*(/e" { return toUnicode(0x1F1D); } +"h)" { return toUnicode(0x1F20); } +"h(" { return toUnicode(0x1F21); } +"h)\\" { return toUnicode(0x1F22); } +"h(\\" { return toUnicode(0x1F23); } +"h)/" { return toUnicode(0x1F24); } +"h(/" { return toUnicode(0x1F25); } +"h)=" { return toUnicode(0x1F26); } +"h(=" { return toUnicode(0x1F27); } +"*)h" { return toUnicode(0x1F28); } +"*(h" { return toUnicode(0x1F29); } +"*)\\h" { return toUnicode(0x1F2A); } +"*(\\h" { return toUnicode(0x1F2B); } +"*)/h" { return toUnicode(0x1F2C); } +"*(/h" { return toUnicode(0x1F2D); } +"*)=h" { return toUnicode(0x1F2E); } +"*(=h" { return toUnicode(0x1F2F); } +"i)" { return toUnicode(0x1F30); } +"i(" { return toUnicode(0x1F31); } +"i)\\" { return toUnicode(0x1F32); } +"i(\\" { return toUnicode(0x1F33); } +"i)/" { return toUnicode(0x1F34); } +"i(/" { return toUnicode(0x1F35); } +"i)=" { return toUnicode(0x1F36); } +"i(=" { return 
toUnicode(0x1F37); } +"*)i" { return toUnicode(0x1F38); } +"*(i" { return toUnicode(0x1F39); } +"*)\\i" { return toUnicode(0x1F3A); } +"*(\\i" { return toUnicode(0x1F3B); } +"*)/i" { return toUnicode(0x1F3C); } +"*(/i" { return toUnicode(0x1F3D); } +"*)=i" { return toUnicode(0x1F3E); } +"*(=i" { return toUnicode(0x1F3F); } +"o)" { return toUnicode(0x1F40); } +"o(" { return toUnicode(0x1F41); } +"o)\\" { return toUnicode(0x1F42); } +"o(\\" { return toUnicode(0x1F43); } +"o)/" { return toUnicode(0x1F44); } +"o(/" { return toUnicode(0x1F45); } +"*)o" { return toUnicode(0x1F48); } +"*(o" { return toUnicode(0x1F49); } +"*)\\o" { return toUnicode(0x1F4A); } +"*(\\o" { return toUnicode(0x1F4B); } +"*)/o" { return toUnicode(0x1F4C); } +"*(/o" { return toUnicode(0x1F4D); } +"u)" { return toUnicode(0x1F50); } +"u(" { return toUnicode(0x1F51); } +"u)\\" { return toUnicode(0x1F52); } +"u(\\" { return toUnicode(0x1F53); } +"u)/" { return toUnicode(0x1F54); } +"u(/" { return toUnicode(0x1F55); } +"u)=" { return toUnicode(0x1F56); } +"u(=" { return toUnicode(0x1F57); } +"*(u" { return toUnicode(0x1F59); } +"*(\\u" { return toUnicode(0x1F5B); } +"*(/u" { return toUnicode(0x1F5D); } +"*(=u" { return toUnicode(0x1F5F); } +"w)" { return toUnicode(0x1F60); } +"w(" { return toUnicode(0x1F61); } +"w)\\" { return toUnicode(0x1F62); } +"w(\\" { return toUnicode(0x1F63); } +"w)/" { return toUnicode(0x1F64); } +"w(/" { return toUnicode(0x1F65); } +"w)=" { return toUnicode(0x1F66); } +"w(=" { return toUnicode(0x1F67); } +"*)w" { return toUnicode(0x1F68); } +"*(w" { return toUnicode(0x1F69); } +"*)\\w" { return toUnicode(0x1F6A); } +"*(\\w" { return toUnicode(0x1F6B); } +"*)/w" { return toUnicode(0x1F6C); } +"*(/w" { return toUnicode(0x1F6D); } +"*)=w" { return toUnicode(0x1F6E); } +"*(=w" { return toUnicode(0x1F6F); } +"a\\" { return toUnicode(0x1F70); } +"a/" { return toUnicode(0x1F71); } +"e\\" { return toUnicode(0x1F72); } +"e/" { return toUnicode(0x1F73); } +"h\\" { return 
toUnicode(0x1F74); } +"h/" { return toUnicode(0x1F75); } +"i\\" { return toUnicode(0x1F76); } +"i/" { return toUnicode(0x1F77); } +"o\\" { return toUnicode(0x1F78); } +"o/" { return toUnicode(0x1F79); } +"u\\" { return toUnicode(0x1F7A); } +"u/" { return toUnicode(0x1F7B); } +"w\\" { return toUnicode(0x1F7C); } +"w/" { return toUnicode(0x1F7D); } +"a)|" { return toUnicode(0x1F80); } +"a(|" { return toUnicode(0x1F81); } +"a)\\|" { return toUnicode(0x1F82); } +"a(\\|" { return toUnicode(0x1F83); } +"a)/|" { return toUnicode(0x1F84); } +"a(/|" { return toUnicode(0x1F85); } +"a)=|" { return toUnicode(0x1F86); } +"a(=|" { return toUnicode(0x1F87); } +"*)|a" { return toUnicode(0x1F88); } +"*(|a" { return toUnicode(0x1F89); } +"*)\\|a" { return toUnicode(0x1F8A); } +"*(\\|a" { return toUnicode(0x1F8B); } +"*)/|a" { return toUnicode(0x1F8C); } +"*(/|a" { return toUnicode(0x1F8D); } +"*)=|a" { return toUnicode(0x1F8E); } +"*(=|a" { return toUnicode(0x1F8F); } +"h)|" { return toUnicode(0x1F90); } +"h(|" { return toUnicode(0x1F91); } +"h)\\|" { return toUnicode(0x1F92); } +"h(\\|" { return toUnicode(0x1F93); } +"h)/|" { return toUnicode(0x1F94); } +"h(/|" { return toUnicode(0x1F95); } +"h)=|" { return toUnicode(0x1F96); } +"h(=|" { return toUnicode(0x1F97); } +"*)|h" { return toUnicode(0x1F98); } +"*(|h" { return toUnicode(0x1F99); } +"*)\\|h" { return toUnicode(0x1F9A); } +"*(\\|h" { return toUnicode(0x1F9B); } +"*)/|h" { return toUnicode(0x1F9C); } +"*(/|h" { return toUnicode(0x1F9D); } +"*)=|h" { return toUnicode(0x1F9E); } +"*(=|h" { return toUnicode(0x1F9F); } +"w)|" { return toUnicode(0x1FA0); } +"w(|" { return toUnicode(0x1FA1); } +"w)\\|" { return toUnicode(0x1FA2); } +"w(\\|" { return toUnicode(0x1FA3); } +"w)/|" { return toUnicode(0x1FA4); } +"w(/|" { return toUnicode(0x1FA5); } +"w)=|" { return toUnicode(0x1FA6); } +"w(=|" { return toUnicode(0x1FA7); } +"*)|w" { return toUnicode(0x1FA8); } +"*(|w" { return toUnicode(0x1FA9); } +"*)\\|w" { return toUnicode(0x1FAA); 
} +"*(\\|w" { return toUnicode(0x1FAB); } +"*)/|w" { return toUnicode(0x1FAC); } +"*(/|w" { return toUnicode(0x1FAD); } +"*)=|w" { return toUnicode(0x1FAE); } +"*(=|w" { return toUnicode(0x1FAF); } +"a^" { return toUnicode(0x1FB0); } +"a_" { return toUnicode(0x1FB1); } +"a\\|" { return toUnicode(0x1FB2); } +"a|" { return toUnicode(0x1FB3); } +"a/|" { return toUnicode(0x1FB4); } +"a=" { return toUnicode(0x1FB6); } +"a=|" { return toUnicode(0x1FB7); } +"*a^" { return toUnicode(0x1FB8); } +"*a_" { return toUnicode(0x1FB9); } +"*a\\" { return toUnicode(0x1FBA); } +"*a/" { return toUnicode(0x1FBB); } +"*a|" { return toUnicode(0x1FBC); } +"h\\|" { return toUnicode(0x1FC2); } +"h|" { return toUnicode(0x1FC3); } +"h/|" { return toUnicode(0x1FC4); } +"h=" { return toUnicode(0x1FC6); } +"h=|" { return toUnicode(0x1FC7); } +"*e\\" { return toUnicode(0x1FC8); } +"*e/" { return toUnicode(0x1FC9); } +"*h\\" { return toUnicode(0x1FCA); } +"*h/" { return toUnicode(0x1FCB); } +"*h|" { return toUnicode(0x1FCC); } +"i^" { return toUnicode(0x1FD0); } +"i_" { return toUnicode(0x1FD1); } +"i+\\" { return toUnicode(0x1FD2); } +"i+/" { return toUnicode(0x1FD3); } +"i=" { return toUnicode(0x1FD6); } +"i+=" { return toUnicode(0x1FD7); } +"*i^" { return toUnicode(0x1FD8); } +"*i_" { return toUnicode(0x1FD9); } +"*i\\" { return toUnicode(0x1FDA); } +"*i/" { return toUnicode(0x1FDB); } +"u^" { return toUnicode(0x1FE0); } +"u_" { return toUnicode(0x1FE1); } +"u+\\" { return toUnicode(0x1FE2); } +"u+/" { return toUnicode(0x1FE3); } +"r)" { return toUnicode(0x1FE4); } +"r(" { return toUnicode(0x1FE5); } +"u=" { return toUnicode(0x1FE6); } +"u+=" { return toUnicode(0x1FE7); } +"*u^" { return toUnicode(0x1FE8); } +"*u_" { return toUnicode(0x1FE9); } +"*u\\" { return toUnicode(0x1FEA); } +"*u/" { return toUnicode(0x1FEB); } +"*(r" { return toUnicode(0x1FEC); } +"w\\|" { return toUnicode(0x1FF2); } +"w|" { return toUnicode(0x1FF3); } +"w/|" { return toUnicode(0x1FF4); } +"*w\\" { return 
toUnicode(0x1FFA); } +"*w/" { return toUnicode(0x1FFB); } +"*w|" { return toUnicode(0x1FFC); } +"w=" { return toUnicode(0x1FF6); } +"w=|" { return toUnicode(0x1FF7); } +"*o\\" { return toUnicode(0x1FF8); } +"*o/" { return toUnicode(0x1FF9); } + +"\\" { return toUnicode(0x0300); } +"/" { return toUnicode(0x0301); } +"_" { return toUnicode(0x0304); } +"^" { return toUnicode(0x0306); } +"+" { return toUnicode(0x0308); } +"=" { return toUnicode(0x0302); } +")" { return toUnicode(0x0313); } +"(" { return toUnicode(0x0314); } +"?" { return toUnicode(0x0323); } +"|" { return toUnicode(0x0345); } + +"a" { return toUnicode(0x03b1); } /* MPDL update */ +"*a" { return toUnicode(0x0391); } /* MPDL update */ +"b" { return toUnicode(0x03b2); } /* MPDL update */ +"*b" { return toUnicode(0x0392); } /* MPDL update */ +"g" { return toUnicode(0x03b3); } /* MPDL update */ +"*g" { return toUnicode(0x0393); } /* MPDL update */ +"d" { return toUnicode(0x03b4); } /* MPDL update */ +"*d" { return toUnicode(0x0394); } /* MPDL update */ +"e" { return toUnicode(0x03b5); } /* MPDL update */ +"*e" { return toUnicode(0x0395); } /* MPDL update */ +"z" { return toUnicode(0x03b6); } /* MPDL update */ +"*z" { return toUnicode(0x0396); } /* MPDL update */ +"h" { return toUnicode(0x03b7); } /* MPDL update */ +"*h" { return toUnicode(0x0397); } /* MPDL update */ +"q" { return toUnicode(0x03b8); } /* MPDL update */ +"*q" { return toUnicode(0x0398); } /* MPDL update */ +"i" { return toUnicode(0x03b9); } /* MPDL update */ +"*i" { return toUnicode(0x0399); } /* MPDL update */ +"k" { return toUnicode(0x03ba); } /* MPDL update */ +"*k" { return toUnicode(0x039a); } /* MPDL update */ +"l" { return toUnicode(0x03bb); } /* MPDL update */ +"*l" { return toUnicode(0x039b); } /* MPDL update */ +"m" { return toUnicode(0x03bc); } /* MPDL update */ +"*m" { return toUnicode(0x039c); } /* MPDL update */ +"n" { return toUnicode(0x03bd); } /* MPDL update */ +"*n" { return toUnicode(0x039d); } /* MPDL update */ +"c" { 
return toUnicode(0x03be); } /* MPDL update */ +"*c" { return toUnicode(0x039e); } /* MPDL update */ +"o" { return toUnicode(0x03bf); } /* MPDL update */ +"*o" { return toUnicode(0x039f); } /* MPDL update */ +"p" { return toUnicode(0x03c0); } /* MPDL update */ +"*p" { return toUnicode(0x03a0); } /* MPDL update */ +"r" { return toUnicode(0x03c1); } /* MPDL update */ +"*r" { return toUnicode(0x03a1); } /* MPDL update */ + +"*s" { return toUnicode(0x03a3); } /* MPDL update */ +"s1" { return toUnicode(0x03c3); } /* mdh 2002-01-07 */ +"s"/\-\- { return toUnicode(0x03c2); } +"s"/\> }[a-z\?\!0-9*=\/()\'\-] { return toUnicode(0x03c3); } /* MPDL update */ +"s"/\< { return toUnicode(0x03c2); } /* MPDL update */ +"s"/[\[\]][a-z\?\!0-9*=\/()\'\-] { return toUnicode(0x03c3); } /* MPDL update */ +"s"/\??[^a-z0-9*=\/()\'\-\[\?] { return toUnicode(0x03c2); } +"s" { return toUnicode(0x03c3); } /* MPDL update */ + +"t" { return toUnicode(0x03c4); } /* MPDL update */ +"*t" { return toUnicode(0x03a4); } /* MPDL update */ +"u" { return toUnicode(0x03c5); } /* MPDL update */ +"*u" { return toUnicode(0x03a5); } /* MPDL update */ +"f" { return toUnicode(0x03c6); } /* MPDL update */ +"*f" { return toUnicode(0x03a6); } /* MPDL update */ +"x" { return toUnicode(0x03c7); } /* MPDL update */ +"*x" { return toUnicode(0x03a7); } /* MPDL update */ +"y" { return toUnicode(0x03c8); } /* MPDL update */ +"*y" { return toUnicode(0x03a8); } /* MPDL update */ +"w" { return toUnicode(0x03c9); } /* MPDL update */ +"*w" { return toUnicode(0x03a9); } /* MPDL update */ + +[\&_]"vert;" { return "|"; } +[\&_]"lpar;" { return "("; } +[\&_]"rpar;" { return ")"; } +[\_\&]"lt;" { return "<"; } +[\_\&]"gt;" { return ">"; } +"'" { return "'"; } /* MPDL update */ + +"&"[a-zA-Z]+";" { return yytext(); } + +. 
{ return yytext(); } +\n { return yytext(); } \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Betacode2Unicode.lex.old --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Betacode2Unicode.lex.old Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,318 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +%% +%{ + /* + * Betacode to Unicode conversion + */ + + private int isUpper = 0; + + private String toUnicodeGreek(int in) { + String retStr = toUnicode(in - (isUpper * 0x0020)); + isUpper = 0; + return retStr; + } + + private String toUnicode(int in) { + char c = (char) in; + Character ch = new Character(c); + String retString = ch.toString(); + return retString; + } + +%} + +%class Betacode2UnicodeLex +%public +%type java.lang.String +%unicode +%% + + +"<"[^>]+">" { return yytext(); } + +"*j" { return "H"; } +"j" { return "h"; } +"*v" { return "F"; } +"v" { return "f"; } +"*s" { return toUnicode(0x03a3); } + +"!" 
{ return "."; } +":" { return toUnicode(0x00B7); } /* MPDL update */ + +"a)" { return toUnicode(0x1F00); } +"a(" { return toUnicode(0x1F01); } +"a)\\" { return toUnicode(0x1F02); } +"a(\\" { return toUnicode(0x1F03); } +"a)/" { return toUnicode(0x1F04); } +"a(/" { return toUnicode(0x1F05); } +"a)=" { return toUnicode(0x1F06); } +"a(=" { return toUnicode(0x1F07); } +"*)a" { return toUnicode(0x1F08); } +"*(a" { return toUnicode(0x1F09); } +"*)\\a" { return toUnicode(0x1F0A); } +"*(\\a" { return toUnicode(0x1F0B); } +"*)/a" { return toUnicode(0x1F0C); } +"*(/a" { return toUnicode(0x1F0D); } +"*)=a" { return toUnicode(0x1F0E); } +"*(=a" { return toUnicode(0x1F0F); } +"e)" { return toUnicode(0x1F10); } +"e(" { return toUnicode(0x1F11); } +"e)\\" { return toUnicode(0x1F12); } +"e(\\" { return toUnicode(0x1F13); } +"e)/" { return toUnicode(0x1F14); } +"e(/" { return toUnicode(0x1F15); } +"*)e" { return toUnicode(0x1F18); } +"*(e" { return toUnicode(0x1F19); } +"*)\\e" { return toUnicode(0x1F1A); } +"*(\\e" { return toUnicode(0x1F1B); } +"*)/e" { return toUnicode(0x1F1C); } +"*(/e" { return toUnicode(0x1F1D); } +"h)" { return toUnicode(0x1F20); } +"h(" { return toUnicode(0x1F21); } +"h)\\" { return toUnicode(0x1F22); } +"h(\\" { return toUnicode(0x1F23); } +"h)/" { return toUnicode(0x1F24); } +"h(/" { return toUnicode(0x1F25); } +"h)=" { return toUnicode(0x1F26); } +"h(=" { return toUnicode(0x1F27); } +"*)h" { return toUnicode(0x1F28); } +"*(h" { return toUnicode(0x1F29); } +"*)\\h" { return toUnicode(0x1F2A); } +"*(\\h" { return toUnicode(0x1F2B); } +"*)/h" { return toUnicode(0x1F2C); } +"*(/h" { return toUnicode(0x1F2D); } +"*)=h" { return toUnicode(0x1F2E); } +"*(=h" { return toUnicode(0x1F2F); } +"i)" { return toUnicode(0x1F30); } +"i(" { return toUnicode(0x1F31); } +"i)\\" { return toUnicode(0x1F32); } +"i(\\" { return toUnicode(0x1F33); } +"i)/" { return toUnicode(0x1F34); } +"i(/" { return toUnicode(0x1F35); } +"i)=" { return toUnicode(0x1F36); } +"i(=" { return 
toUnicode(0x1F37); } +"*)i" { return toUnicode(0x1F38); } +"*(i" { return toUnicode(0x1F39); } +"*)\\i" { return toUnicode(0x1F3A); } +"*(\\i" { return toUnicode(0x1F3B); } +"*)/i" { return toUnicode(0x1F3C); } +"*(/i" { return toUnicode(0x1F3D); } +"*)=i" { return toUnicode(0x1F3E); } +"*(=i" { return toUnicode(0x1F3F); } +"o)" { return toUnicode(0x1F40); } +"o(" { return toUnicode(0x1F41); } +"o)\\" { return toUnicode(0x1F42); } +"o(\\" { return toUnicode(0x1F43); } +"o)/" { return toUnicode(0x1F44); } +"o(/" { return toUnicode(0x1F45); } +"*)o" { return toUnicode(0x1F48); } +"*(o" { return toUnicode(0x1F49); } +"*)\\o" { return toUnicode(0x1F4A); } +"*(\\o" { return toUnicode(0x1F4B); } +"*)/o" { return toUnicode(0x1F4C); } +"*(/o" { return toUnicode(0x1F4D); } +"u)" { return toUnicode(0x1F50); } +"u(" { return toUnicode(0x1F51); } +"u)\\" { return toUnicode(0x1F52); } +"u(\\" { return toUnicode(0x1F53); } +"u)/" { return toUnicode(0x1F54); } +"u(/" { return toUnicode(0x1F55); } +"u)=" { return toUnicode(0x1F56); } +"u(=" { return toUnicode(0x1F57); } +"*(u" { return toUnicode(0x1F59); } +"*(\\u" { return toUnicode(0x1F5B); } +"*(/u" { return toUnicode(0x1F5D); } +"*(=u" { return toUnicode(0x1F5F); } +"w)" { return toUnicode(0x1F60); } +"w(" { return toUnicode(0x1F61); } +"w)\\" { return toUnicode(0x1F62); } +"w(\\" { return toUnicode(0x1F63); } +"w)/" { return toUnicode(0x1F64); } +"w(/" { return toUnicode(0x1F65); } +"w)=" { return toUnicode(0x1F66); } +"w(=" { return toUnicode(0x1F67); } +"*)w" { return toUnicode(0x1F68); } +"*(w" { return toUnicode(0x1F69); } +"*)\\w" { return toUnicode(0x1F6A); } +"*(\\w" { return toUnicode(0x1F6B); } +"*)/w" { return toUnicode(0x1F6C); } +"*(/w" { return toUnicode(0x1F6D); } +"*)=w" { return toUnicode(0x1F6E); } +"*(=w" { return toUnicode(0x1F6F); } +"a\\" { return toUnicode(0x1F70); } +"a/" { return toUnicode(0x1F71); } +"e\\" { return toUnicode(0x1F72); } +"e/" { return toUnicode(0x1F73); } +"h\\" { return 
toUnicode(0x1F74); } +"h/" { return toUnicode(0x1F75); } +"i\\" { return toUnicode(0x1F76); } +"i/" { return toUnicode(0x1F77); } +"o\\" { return toUnicode(0x1F78); } +"o/" { return toUnicode(0x1F79); } +"u\\" { return toUnicode(0x1F7A); } +"u/" { return toUnicode(0x1F7B); } +"w\\" { return toUnicode(0x1F7C); } +"w/" { return toUnicode(0x1F7D); } +"a)|" { return toUnicode(0x1F80); } +"a(|" { return toUnicode(0x1F81); } +"a)\\|" { return toUnicode(0x1F82); } +"a(\\|" { return toUnicode(0x1F83); } +"a)/|" { return toUnicode(0x1F84); } +"a(/|" { return toUnicode(0x1F85); } +"a)=|" { return toUnicode(0x1F86); } +"a(=|" { return toUnicode(0x1F87); } +"*)|a" { return toUnicode(0x1F88); } +"*(|a" { return toUnicode(0x1F89); } +"*)\\|a" { return toUnicode(0x1F8A); } +"*(\\|a" { return toUnicode(0x1F8B); } +"*)/|a" { return toUnicode(0x1F8C); } +"*(/|a" { return toUnicode(0x1F8D); } +"*)=|a" { return toUnicode(0x1F8E); } +"*(=|a" { return toUnicode(0x1F8F); } +"h)|" { return toUnicode(0x1F90); } +"h(|" { return toUnicode(0x1F91); } +"h)\\|" { return toUnicode(0x1F92); } +"h(\\|" { return toUnicode(0x1F93); } +"h)/|" { return toUnicode(0x1F94); } +"h(/|" { return toUnicode(0x1F95); } +"h)=|" { return toUnicode(0x1F96); } +"h(=|" { return toUnicode(0x1F97); } +"*)|h" { return toUnicode(0x1F98); } +"*(|h" { return toUnicode(0x1F99); } +"*)\\|h" { return toUnicode(0x1F9A); } +"*(\\|h" { return toUnicode(0x1F9B); } +"*)/|h" { return toUnicode(0x1F9C); } +"*(/|h" { return toUnicode(0x1F9D); } +"*)=|h" { return toUnicode(0x1F9E); } +"*(=|h" { return toUnicode(0x1F9F); } +"w)|" { return toUnicode(0x1FA0); } +"w(|" { return toUnicode(0x1FA1); } +"w)\\|" { return toUnicode(0x1FA2); } +"w(\\|" { return toUnicode(0x1FA3); } +"w)/|" { return toUnicode(0x1FA4); } +"w(/|" { return toUnicode(0x1FA5); } +"w)=|" { return toUnicode(0x1FA6); } +"w(=|" { return toUnicode(0x1FA7); } +"*)|w" { return toUnicode(0x1FA8); } +"*(|w" { return toUnicode(0x1FA9); } +"*)\\|w" { return toUnicode(0x1FAA); 
} +"*(\\|w" { return toUnicode(0x1FAB); } +"*)/|w" { return toUnicode(0x1FAC); } +"*(/|w" { return toUnicode(0x1FAD); } +"*)=|w" { return toUnicode(0x1FAE); } +"*(=|w" { return toUnicode(0x1FAF); } +"a^" { return toUnicode(0x1FB0); } +"a_" { return toUnicode(0x1FB1); } +"a\\|" { return toUnicode(0x1FB2); } +"a|" { return toUnicode(0x1FB3); } +"a/|" { return toUnicode(0x1FB4); } +"a=" { return toUnicode(0x1FB6); } +"a=|" { return toUnicode(0x1FB7); } +"*a^" { return toUnicode(0x1FB8); } +"*a_" { return toUnicode(0x1FB9); } +"*a\\" { return toUnicode(0x1FBA); } +"*a/" { return toUnicode(0x1FBB); } +"*a|" { return toUnicode(0x1FBC); } +"h\\|" { return toUnicode(0x1FC2); } +"h|" { return toUnicode(0x1FC3); } +"h/|" { return toUnicode(0x1FC4); } +"h=" { return toUnicode(0x1FC6); } +"h=|" { return toUnicode(0x1FC7); } +"*e\\" { return toUnicode(0x1FC8); } +"*e/" { return toUnicode(0x1FC9); } +"*h\\" { return toUnicode(0x1FCA); } +"*h/" { return toUnicode(0x1FCB); } +"*h|" { return toUnicode(0x1FCC); } +"i^" { return toUnicode(0x1FD0); } +"i_" { return toUnicode(0x1FD1); } +"i+\\" { return toUnicode(0x1FD2); } +"i+/" { return toUnicode(0x1FD3); } +"i=" { return toUnicode(0x1FD6); } +"i+=" { return toUnicode(0x1FD7); } +"*i^" { return toUnicode(0x1FD8); } +"*i_" { return toUnicode(0x1FD9); } +"*i\\" { return toUnicode(0x1FDA); } +"*i/" { return toUnicode(0x1FDB); } +"u^" { return toUnicode(0x1FE0); } +"u_" { return toUnicode(0x1FE1); } +"u+\\" { return toUnicode(0x1FE2); } +"u+/" { return toUnicode(0x1FE3); } +"r)" { return toUnicode(0x1FE4); } +"r(" { return toUnicode(0x1FE5); } +"u=" { return toUnicode(0x1FE6); } +"u+=" { return toUnicode(0x1FE7); } +"*u^" { return toUnicode(0x1FE8); } +"*u_" { return toUnicode(0x1FE9); } +"*u\\" { return toUnicode(0x1FEA); } +"*u/" { return toUnicode(0x1FEB); } +"*(r" { return toUnicode(0x1FEC); } +"w\\|" { return toUnicode(0x1FF2); } +"w|" { return toUnicode(0x1FF3); } +"w/|" { return toUnicode(0x1FF4); } +"*w\\" { return 
toUnicode(0x1FFA); } +"*w/" { return toUnicode(0x1FFB); } +"*w|" { return toUnicode(0x1FFC); } +"w=" { return toUnicode(0x1FF6); } +"w=|" { return toUnicode(0x1FF7); } +"*o\\" { return toUnicode(0x1FF8); } +"*o/" { return toUnicode(0x1FF9); } + +"*" isUpper = 1; + +"\\" { return toUnicode(0x0300); } +"/" { return toUnicode(0x0301); } +"_" { return toUnicode(0x0304); } +"^" { return toUnicode(0x0306); } +"+" { return toUnicode(0x0308); } +"=" { return toUnicode(0x0302); } +")" { return toUnicode(0x0313); } +"(" { return toUnicode(0x0314); } +"?" { return toUnicode(0x0323); } +"|" { return toUnicode(0x0345); } + +"a" { return toUnicodeGreek(0x03b1); } +"b" { return toUnicodeGreek(0x03b2); } +"g" { return toUnicodeGreek(0x03b3); } +"d" { return toUnicodeGreek(0x03b4); } +"e" { return toUnicodeGreek(0x03b5); } +"z" { return toUnicodeGreek(0x03b6); } +"h" { return toUnicodeGreek(0x03b7); } +"q" { return toUnicodeGreek(0x03b8); } +"i" { return toUnicodeGreek(0x03b9); } +"k" { return toUnicodeGreek(0x03ba); } +"l" { return toUnicodeGreek(0x03bb); } +"m" { return toUnicodeGreek(0x03bc); } +"n" { return toUnicodeGreek(0x03bd); } +"c" { return toUnicodeGreek(0x03be); } +"o" { return toUnicodeGreek(0x03bf); } +"p" { return toUnicodeGreek(0x03c0); } +"r" { return toUnicodeGreek(0x03c1); } + +"s1" { return toUnicode(0x03c3); } /* mdh 2002-01-07 */ +"s"/\-\- { return toUnicode(0x03c2); } +"s"/\> }[a-z\?\!0-9*=\/()\'\-] { return toUnicodeGreek(0x03c3); } +"s"/\< { return toUnicodeGreek(0x03c2); } /* MPDL update */ +"s"/[\[\]][a-z\?\!0-9*=\/()\'\-] { return toUnicodeGreek(0x03c3); } +"s"/\??[^a-z0-9*=\/()\'\-\[\?] 
{ return toUnicode(0x03c2); } +"s" { return toUnicodeGreek(0x03c3); } + +"t" { return toUnicodeGreek(0x03c4); } +"u" { return toUnicodeGreek(0x03c5); } +"f" { return toUnicodeGreek(0x03c6); } +"x" { return toUnicodeGreek(0x03c7); } +"y" { return toUnicodeGreek(0x03c8); } +"w" { return toUnicodeGreek(0x03c9); } + +[\&_]"vert;" { return "|"; } +[\&_]"lpar;" { return "("; } +[\&_]"rpar;" { return ")"; } +[\_\&]"lt;" { return "<"; } +[\_\&]"gt;" { return ">"; } +"'" { return "'"; } /* MPDL update */ + +"&"[a-zA-Z]+";" { return yytext(); } + +. { return yytext(); } +\n { return yytext(); } \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Betacode2UnicodeLex.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Betacode2UnicodeLex.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,1908 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 19.11.09 20:01 from the specification file + * /Users/jwillenborg/java/existDevMai2009/mpdl/extensions/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Betacode2Unicode.lex + */ +public class Betacode2UnicodeLex { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int YYINITIAL = 0; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\0\26\0\1\7\1\0\1\62\2\0\1\50\1\54\1\13"+ + 
"\1\12\1\3\1\30\1\0\1\47\1\0\1\15\1\63\1\46\1\54"+ + "\1\64\5\54\1\65\1\10\1\52\1\1\1\16\1\2\1\32\1\0"+ + "\32\66\1\56\1\14\1\55\1\26\1\27\1\0\1\11\1\33\1\44"+ + "\1\35\1\17\1\57\1\34\1\20\1\21\1\4\1\40\1\41\1\42"+ + "\1\43\1\22\1\45\1\37\1\31\1\6\1\51\1\23\1\5\1\24"+ + "\1\60\1\61\1\36\1\0\1\25\1\53\uff82\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. + */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\1\0\3\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+ + "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+ + "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37"+ + "\1\40\1\41\1\42\1\43\1\1\1\44\1\45\1\46"+ + "\1\47\1\0\1\50\1\51\1\52\1\53\2\0\1\54"+ + "\1\55\1\56\1\57\1\60\1\61\1\62\1\63\1\64"+ + "\1\65\1\66\1\67\1\70\1\71\1\72\1\73\1\74"+ + "\1\75\1\76\1\77\1\100\1\101\1\102\1\0\1\4"+ + "\1\0\2\102\1\0\1\103\1\104\1\105\1\106\1\107"+ + "\1\110\1\111\1\112\1\113\1\114\1\115\1\116\1\117"+ + "\1\120\1\121\1\122\1\123\1\124\1\125\1\126\1\127"+ + "\1\130\1\131\1\132\1\133\1\0\1\134\1\135\1\136"+ + "\1\137\1\140\1\141\1\142\1\143\1\144\1\145\1\146"+ + "\1\0\1\147\1\150\1\151\1\152\1\153\1\154\4\0"+ + "\1\155\1\156\6\0\1\157\1\160\1\161\1\162\1\163"+ + "\1\164\3\0\1\165\1\166\1\167\1\170\1\171\1\0"+ + "\1\172\3\0\1\173\1\174\1\175\1\176\1\177\1\200"+ + "\1\0\1\201\1\202\1\203\1\204\1\205\1\206\1\207"+ + "\1\210\1\211\1\212\1\213\1\214\1\215\1\216\1\217"+ + "\1\220\1\221\1\222\1\223\2\0\1\224\1\225\1\226"+ + "\1\227\1\230\1\231\1\232\1\233\1\234\1\235\1\236"+ + "\1\237\1\240\1\241\1\242\1\243\1\244\1\245\1\246"+ + "\1\247\1\250\1\251\1\252\1\253\1\254\1\255\1\256"+ + "\1\257\1\260\1\261\1\262\1\263\1\264\1\265\1\266"+ + "\1\267\1\270\1\271\1\272\1\273\1\274\1\275\1\276"+ + "\1\277\1\300\1\301\1\302\1\303\1\304\1\305\1\306"+ + 
"\1\307\1\310\1\311\1\312\1\313\1\314\1\315\1\316"+ + "\1\317\13\0\1\320\1\321\1\322\1\323\1\324\1\325"+ + "\1\0\1\326\1\327\1\330\1\331\1\332\1\333\1\0"+ + "\1\334\1\335\1\336\1\337\1\0\1\340\1\341\1\342"+ + "\1\343\1\344\1\345\1\346\1\347\1\350\1\351\1\0"+ + "\1\352\1\353\1\354\1\355\1\356\1\357\1\360\1\0"+ + "\1\361\1\362\1\363\1\364\1\365\1\0\1\366\1\367"+ + "\1\370\2\0\1\371\1\372\1\373\1\374\1\375\1\376"+ + "\1\377\1\u0100\1\u0101\1\u0102\1\u0103\1\u0104\1\u0105\1\u0106"+ + "\1\u0107\1\u0108\1\u0109\1\u010a\2\0\1\u010b\1\0\1\u010c"+ + "\4\0\1\u010d\1\u010e\1\u010f\1\u0110\1\u0111\1\u0112\1\u0113"+ + "\1\u0114\1\u0115\1\u0116\1\u0117\1\u0118\1\u0119\1\u011a\1\u011b"+ + "\1\u011c\1\u011d\1\u011e\10\0\1\u011f\1\u0120\1\u0121\1\u0122"; + + private static int [] zzUnpackAction() { + int [] result = new int[359]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\67\0\156\0\245\0\67\0\67\0\334\0\67"+ + "\0\67\0\u0113\0\67\0\67\0\67\0\67\0\67\0\u014a"+ + "\0\u0181\0\u01b8\0\u01ef\0\u0226\0\u025d\0\67\0\67\0\u0294"+ + "\0\67\0\u02cb\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\u0302\0\67"+ + "\0\67\0\67\0\67\0\u0339\0\67\0\67\0\67\0\u0370"+ + "\0\u03a7\0\u03de\0\u0415\0\u044c\0\u0483\0\u04ba\0\u04f1\0\u0528"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + 
"\0\67\0\u055f\0\67\0\u0596\0\u05cd\0\u0604\0\u0604\0\u063b"+ + "\0\u0672\0\u06a9\0\u06e0\0\u0717\0\67\0\67\0\67\0\u074e"+ + "\0\u0785\0\67\0\67\0\u07bc\0\u07f3\0\u082a\0\u0861\0\u0898"+ + "\0\67\0\u08cf\0\u0906\0\67\0\67\0\67\0\67\0\67"+ + "\0\u093d\0\u0974\0\u09ab\0\67\0\67\0\u09e2\0\u0a19\0\67"+ + "\0\67\0\67\0\67\0\67\0\u0a50\0\u0a87\0\u0abe\0\u0af5"+ + "\0\u0b2c\0\u0b63\0\67\0\u0b9a\0\u0bd1\0\u0c08\0\u0c3f\0\67"+ + "\0\67\0\u0c76\0\u0cad\0\u0ce4\0\u0d1b\0\u0d52\0\u0d89\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\u0dc0\0\u0df7\0\u0e2e"+ + "\0\67\0\67\0\67\0\67\0\67\0\u0e65\0\67\0\u0e9c"+ + "\0\u0ed3\0\u0f0a\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\u0f41\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\u0f78\0\u0faf\0\67\0\u0fe6"+ + "\0\u101d\0\u1054\0\67\0\u108b\0\u10c2\0\u10f9\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\u1130\0\u1167"+ + "\0\u119e\0\67\0\u11d5\0\u120c\0\u1243\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\u127a"+ + "\0\u12b1\0\u12e8\0\67\0\u131f\0\u1356\0\u138d\0\67\0\67"+ + "\0\67\0\67\0\u13c4\0\u13fb\0\u1432\0\u1469\0\u14a0\0\u14d7"+ + "\0\u150e\0\u1545\0\u157c\0\u15b3\0\u15ea\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\u1621\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\u1658\0\67\0\67\0\67\0\67\0\u168f"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\u16c6\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\u16fd\0\67\0\67\0\67\0\67\0\67"+ + "\0\u1734\0\67\0\67\0\67\0\u176b\0\u17a2\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\u17d9\0\u1810\0\67\0\u1847\0\67\0\u187e\0\u18b5\0\u18ec"+ + "\0\u1923\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\67\0\67\0\67\0\67\0\67"+ + "\0\67\0\67\0\67\0\u195a\0\u1991\0\u19c8\0\u19ff\0\u1a36"+ + "\0\u1a6d\0\u1aa4\0\u1adb\0\67\0\67\0\67\0\67"; + + private static int [] zzUnpackRowMap() { 
+ int [] result = new int[359]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\2\1\3\1\2\1\4\1\5\1\6\1\7\1\10"+ + "\1\11\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+ + "\1\21\1\22\1\23\1\24\1\25\1\26\1\27\1\30"+ + "\1\31\1\32\1\33\1\34\1\35\1\36\1\37\1\40"+ + "\1\41\1\42\1\43\1\44\1\45\1\46\2\2\1\47"+ + "\1\50\5\2\1\51\1\52\1\53\5\2\67\0\2\54"+ + "\1\0\64\54\4\0\1\55\1\56\1\57\2\0\1\60"+ + "\1\61\1\62\3\0\1\63\1\64\1\65\1\66\1\67"+ + "\1\70\4\0\1\71\1\0\1\72\1\73\1\74\1\75"+ + "\1\76\1\77\1\100\1\101\1\102\1\103\1\104\3\0"+ + "\1\105\5\0\1\106\1\107\1\110\5\0\3\111\4\0"+ + "\2\111\3\0\1\111\10\0\4\111\1\0\1\112\13\0"+ + "\1\113\1\114\1\115\1\0\2\111\1\0\1\116\1\117"+ + "\3\0\1\111\3\0\1\111\12\0\1\120\1\121\1\122"+ + "\1\123\1\124\6\0\1\125\1\126\1\127\51\0\1\130"+ + "\1\131\1\132\1\133\63\0\1\134\1\135\1\136\1\137"+ + "\1\140\6\0\1\141\53\0\1\142\1\143\1\144\1\145"+ + "\1\146\7\0\1\147\1\150\1\151\50\0\1\152\1\153"+ + "\1\154\1\155\63\0\1\156\1\157\1\160\1\161\1\162"+ + "\7\0\1\163\1\164\1\165\50\0\1\166\1\167\1\170"+ + "\1\171\1\172\6\0\1\173\46\0\1\174\23\0\1\175"+ + "\2\0\1\176\4\0\1\177\37\0\1\200\1\201\57\0"+ + "\1\202\1\203\1\202\2\0\1\202\5\0\6\202\4\0"+ + "\1\204\1\0\1\202\1\205\4\202\1\206\4\202\3\0"+ + "\1\202\5\0\3\202\1\207\3\0\1\202\2\54\1\2"+ + "\64\54\14\0\1\210\1\211\7\0\1\212\1\213\1\214"+ + "\50\0\1\215\2\0\1\216\1\217\1\220\1\221\1\222"+ + "\1\223\1\224\1\0\1\225\1\226\52\0\1\227\2\0"+ + 
"\1\230\1\231\1\232\1\233\1\234\1\235\1\236\1\237"+ + "\1\240\1\241\3\0\1\242\51\0\1\243\1\244\65\0"+ + "\1\245\1\246\7\0\1\247\55\0\1\250\1\251\10\0"+ + "\1\252\1\253\53\0\1\254\1\255\65\0\1\256\1\257"+ + "\10\0\1\260\1\261\53\0\1\262\1\263\7\0\1\264"+ + "\41\0\3\111\4\0\2\111\3\0\1\111\10\0\4\111"+ + "\17\0\1\111\1\0\2\111\1\0\1\111\4\0\1\111"+ + "\3\0\1\111\47\0\1\111\53\0\1\265\4\0\1\266"+ + "\30\0\5\267\1\0\3\267\1\0\10\267\4\0\17\267"+ + "\1\0\1\267\2\0\1\267\2\0\3\267\1\0\3\267"+ + "\15\0\1\270\1\271\1\272\6\0\1\273\55\0\1\274"+ + "\1\275\1\276\6\0\1\277\66\0\1\300\66\0\1\301"+ + "\66\0\1\302\55\0\1\303\1\304\65\0\1\305\1\306"+ + "\65\0\1\307\1\310\1\311\6\0\1\312\55\0\1\313"+ + "\1\314\1\315\6\0\1\316\66\0\1\317\66\0\1\320"+ + "\66\0\1\321\55\0\1\322\1\323\1\324\64\0\1\325"+ + "\1\326\1\327\64\0\1\330\1\331\1\332\64\0\1\333"+ + "\1\334\65\0\1\335\1\336\65\0\1\337\1\340\1\341"+ + "\64\0\1\342\1\343\1\344\64\0\1\345\1\346\1\347"+ + "\64\0\1\350\1\351\1\352\6\0\1\353\55\0\1\354"+ + "\1\355\1\356\6\0\1\357\66\0\1\360\66\0\1\361"+ + "\66\0\1\362\60\0\1\363\114\0\1\364\72\0\1\365"+ + "\62\0\1\366\3\0\1\367\21\0\3\202\2\0\1\202"+ + "\5\0\6\202\4\0\1\202\1\0\13\202\3\0\1\202"+ + "\1\2\4\0\3\202\4\0\1\202\4\0\3\202\2\0"+ + "\1\202\5\0\1\370\5\202\4\0\1\202\1\0\13\202"+ + "\3\0\1\202\1\2\4\0\3\202\4\0\1\202\4\0"+ + "\3\202\2\0\1\202\5\0\6\202\4\0\1\202\1\0"+ + "\12\202\1\371\3\0\1\202\1\2\4\0\3\202\4\0"+ + "\1\202\4\0\3\202\2\0\1\202\5\0\6\202\4\0"+ + "\1\202\1\0\13\202\3\0\1\372\1\2\4\0\3\202"+ + "\4\0\1\202\4\0\3\202\2\0\1\202\5\0\6\202"+ + "\4\0\1\202\1\0\12\202\1\373\3\0\1\374\1\2"+ + "\4\0\3\202\4\0\1\202\63\0\1\375\14\0\1\376"+ + "\5\0\1\377\1\u0100\1\u0101\1\u0102\1\0\1\u0103\1\u0104"+ + "\52\0\1\u0105\5\0\1\u0106\1\u0107\1\u0108\1\u0109\1\0"+ + "\1\u010a\1\u010b\52\0\1\u010c\6\0\1\u010d\1\u010e\2\0"+ + "\1\u010f\1\u0110\52\0\1\u0111\6\0\1\u0112\3\0\1\u0113"+ + "\53\0\1\u0114\5\0\1\u0115\1\u0116\1\u0117\1\u0118\1\u0119"+ + 
"\1\u011a\1\u011b\52\0\1\u011c\5\0\1\u011d\1\u011e\1\u011f"+ + "\1\u0120\1\u0121\1\u0122\1\u0123\52\0\1\u0124\6\0\1\u0125"+ + "\1\u0126\1\0\1\u0127\1\u0128\1\u0129\52\0\1\u012a\6\0"+ + "\1\u012b\3\0\1\u012c\113\0\1\u012d\66\0\1\u012e\42\0"+ + "\1\u012f\66\0\1\u0130\66\0\1\u0131\66\0\1\u0132\66\0"+ + "\1\u0133\66\0\1\u0134\66\0\1\u0135\66\0\1\u0136\66\0"+ + "\1\u0137\66\0\1\u0138\66\0\1\u0139\66\0\1\u013a\66\0"+ + "\1\u013b\66\0\1\u013c\66\0\1\u013d\66\0\1\u013e\66\0"+ + "\1\u013f\66\0\1\u0140\72\0\1\u0141\46\0\1\u0142\127\0"+ + "\1\u0143\25\0\1\u0144\127\0\1\u0145\20\0\3\202\2\0"+ + "\1\202\5\0\6\202\4\0\1\u0146\1\0\13\202\3\0"+ + "\1\202\1\2\4\0\3\202\4\0\1\202\4\0\3\202"+ + "\2\0\1\u0147\5\0\6\202\4\0\1\202\1\0\13\202"+ + "\3\0\1\202\1\2\4\0\3\202\4\0\1\202\4\0"+ + "\3\202\2\0\1\202\5\0\6\202\4\0\1\202\1\0"+ + "\13\202\3\0\1\202\1\u0143\4\0\3\202\4\0\1\202"+ + "\4\0\3\202\2\0\1\u0148\5\0\6\202\4\0\1\202"+ + "\1\0\13\202\3\0\1\202\1\2\4\0\3\202\4\0"+ + "\1\202\4\0\3\202\2\0\1\202\5\0\6\202\4\0"+ + "\1\202\1\0\13\202\3\0\1\202\1\u0145\4\0\3\202"+ + "\4\0\1\202\64\0\1\u0149\13\0\1\u014a\6\0\1\u014b"+ + "\3\0\1\u014c\53\0\1\u014d\6\0\1\u014e\3\0\1\u014f"+ + "\53\0\1\u0150\6\0\1\u0151\3\0\1\u0152\53\0\1\u0153"+ + "\6\0\1\u0154\3\0\1\u0155\53\0\1\u0156\6\0\1\u0157"+ + "\3\0\1\u0158\53\0\1\u0159\6\0\1\u015a\3\0\1\u015b"+ + "\114\0\1\u015c\66\0\1\111\65\0\1\u015d\46\0\1\u015e"+ + "\66\0\1\u015f\41\0\3\202\2\0\1\202\5\0\6\202"+ + "\4\0\1\202\1\0\13\202\3\0\1\u0160\1\2\4\0"+ + "\3\202\4\0\1\202\4\0\3\202\2\0\1\202\5\0"+ + "\6\202\4\0\1\u0161\1\0\13\202\3\0\1\202\1\2"+ + "\4\0\3\202\4\0\1\202\4\0\3\202\2\0\1\202"+ + "\5\0\6\202\4\0\1\u0162\1\0\13\202\3\0\1\202"+ + "\1\2\4\0\3\202\4\0\1\202\65\0\1\u0163\54\0"+ + "\1\117\65\0\1\u0164\66\0\1\u0165\66\0\1\u0166\20\0"+ + "\3\202\2\0\1\202\5\0\6\202\4\0\1\202\1\0"+ + "\13\202\3\0\1\202\1\u0164\4\0\3\202\4\0\1\202"+ + "\4\0\3\202\2\0\1\202\5\0\6\202\4\0\1\202"+ + "\1\0\13\202\3\0\1\202\1\u0165\4\0\3\202\4\0"+ + 
"\1\202\4\0\3\202\2\0\1\202\5\0\6\202\4\0"+ + "\1\202\1\0\13\202\3\0\1\202\1\u0166\4\0\3\202"+ + "\4\0\1\202\52\0\1\u0167\14\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[6930]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\1\0\1\11\2\1\2\11\1\1\2\11\1\1\5\11"+ + "\6\1\2\11\1\1\1\11\1\1\14\11\1\1\4\11"+ + "\1\0\3\11\1\1\2\0\6\1\21\11\1\0\1\11"+ + "\1\0\2\1\1\0\5\1\3\11\2\1\2\11\5\1"+ + "\1\11\2\1\5\11\1\0\2\1\2\11\2\1\5\11"+ + "\1\0\5\1\1\11\4\0\2\11\6\0\6\11\3\0"+ + "\5\11\1\0\1\11\3\0\6\11\1\0\23\11\2\0"+ + "\1\11\3\1\1\11\3\1\10\11\3\1\1\11\3\1"+ + "\32\11\3\1\1\11\3\1\4\11\13\0\6\11\1\0"+ + "\6\11\1\0\4\11\1\0\12\11\1\0\7\11\1\0"+ + "\5\11\1\0\3\11\2\0\22\11\2\0\1\11\1\0"+ + "\1\11\4\0\22\11\10\0\4\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[359]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int 
[] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + /* + * Betacode to Unicode conversion + */ + + private String toUnicode(int in) { + char c = (char) in; + Character ch = new Character(c); + String retString = ch.toString(); + return retString; + } + + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. 
+ * + * @param in the java.io.Reader to read input from. + */ + public Betacode2UnicodeLex(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also java.io.Reader version of this constructor. + * + * @param in the java.io.Inputstream to read input from. + */ + public Betacode2UnicodeLex(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 134) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input. + * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? 
*/ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to ZZ_INITIAL. + * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. 
+ */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occured while scanning. + * + * In a wellformed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules. + * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by then next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. 
+ * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? 
zzAction : ZZ_ACTION[zzAction]) { + case 139: + { return toUnicode(0x1FF8); + } + case 291: break; + case 85: + { return toUnicode(0x1F30); + } + case 292: break; + case 64: + { return toUnicode(0x03a7); + } + case 293: break; + case 60: + { return toUnicode(0x039e); + } + case 294: break; + case 151: + { return toUnicode(0x1F06); + } + case 295: break; + case 206: + { return toUnicode(0x1FF4); + } + case 296: break; + case 42: + { return toUnicode(0x03a3); + } + case 297: break; + case 56: + { return toUnicode(0x039a); + } + case 298: break; + case 149: + { return toUnicode(0x1F02); + } + case 299: break; + case 254: + { return toUnicode(0x1F87); + } + case 300: break; + case 83: + { return toUnicode(0x1FC6); + } + case 301: break; + case 32: + { return toUnicode(0x03bc); + } + case 302: break; + case 216: + { return toUnicode(0x1F2C); + } + case 303: break; + case 252: + { return toUnicode(0x1F83); + } + case 304: break; + case 172: + { return toUnicode(0x1FC2); + } + case 305: break; + case 127: + { return toUnicode(0x1F59); + } + case 306: break; + case 192: + { return toUnicode(0x1F55); + } + case 307: break; + case 129: + { return toUnicode(0x1FEC); + } + case 308: break; + case 97: + { return toUnicode(0x1F51); + } + case 309: break; + case 39: + { return toUnicode(0x03c8); + } + case 310: break; + case 170: + { return toUnicode(0x1F27); + } + case 311: break; + case 36: + { return toUnicode(0x03c4); + } + case 312: break; + case 168: + { return toUnicode(0x1F23); + } + case 313: break; + case 99: + { return toUnicode(0x1F7B); + } + case 314: break; + case 111: + { return toUnicode(0x1FBA); + } + case 315: break; + case 35: + { return toUnicode(0x03c0); + } + case 316: break; + case 196: + { return toUnicode(0x1FE7); + } + case 317: break; + case 238: + { return toUnicode(0x1F4D); + } + case 318: break; + case 195: + { return toUnicode(0x1FE3); + } + case 319: break; + case 115: + { return toUnicode(0x1FB9); + } + case 320: break; + case 87: + { return 
toUnicode(0x1F76); + } + case 321: break; + case 9: + { return toUnicode(0x0314); + } + case 322: break; + case 228: + { return toUnicode(0x1F1B); + } + case 323: break; + case 77: + { return toUnicode(0x1F72); + } + case 324: break; + case 46: + { return toUnicode(0x0399); + } + case 325: break; + case 74: + { return toUnicode(0x1FB1); + } + case 326: break; + case 120: + { return toUnicode(0x1F48); + } + case 327: break; + case 44: + { return toUnicode(0x0395); + } + case 328: break; + case 185: + { return toUnicode(0x1F44); + } + case 329: break; + case 273: + { return toUnicode(0x1F9C); + } + case 330: break; + case 136: + { return toUnicode(0x1FDB); + } + case 331: break; + case 43: + { return toUnicode(0x0391); + } + case 332: break; + case 92: + { return toUnicode(0x1F40); + } + case 333: break; + case 14: + { return toUnicode(0x03b7); + } + case 334: break; + case 268: + { return "<"; + } + case 335: break; + case 223: + { return toUnicode(0x1F6E); + } + case 336: break; + case 283: + { return toUnicode(0x1FAD); + } + case 337: break; + case 26: + { return toUnicode(0x03b3); + } + case 338: break; + case 160: + { return toUnicode(0x1F12); + } + case 339: break; + case 213: + { return toUnicode(0x1F6A); + } + case 340: break; + case 260: + { return toUnicode(0x1F97); + } + case 341: break; + case 89: + { return toUnicode(0x1FD6); + } + case 342: break; + case 217: + { return toUnicode(0x1F3C); + } + case 343: break; + case 258: + { return toUnicode(0x1F93); + } + case 344: break; + case 181: + { return toUnicode(0x1FD2); + } + case 345: break; + case 128: + { return toUnicode(0x1F69); + } + case 346: break; + case 226: + { return toUnicode(0x1FA8); + } + case 347: break; + case 220: + { return toUnicode(0x1F0E); + } + case 348: break; + case 202: + { return toUnicode(0x1F65); + } + case 349: break; + case 262: + { return toUnicode(0x1FA4); + } + case 350: break; + case 147: + { return toUnicode(0x1FFC); + } + case 351: break; + case 208: + { return 
toUnicode(0x1F0A); + } + case 352: break; + case 104: + { return toUnicode(0x1F61); + } + case 353: break; + case 288: + { return ")"; + } + case 354: break; + case 200: + { return toUnicode(0x1FA0); + } + case 355: break; + case 180: + { return toUnicode(0x1F37); + } + case 356: break; + case 284: + { return toUnicode(0x1F8F); + } + case 357: break; + case 287: + { return "|"; + } + case 358: break; + case 178: + { return toUnicode(0x1F33); + } + case 359: break; + case 278: + { return toUnicode(0x1F8B); + } + case 360: break; + case 132: + { return toUnicode(0x1FCA); + } + case 361: break; + case 122: + { return toUnicode(0x1F09); + } + case 362: break; + case 207: + { return toUnicode(0x1FF7); + } + case 363: break; + case 63: + { return toUnicode(0x03a6); + } + case 364: break; + case 59: + { return toUnicode(0x039d); + } + case 365: break; + case 154: + { return toUnicode(0x1F05); + } + case 366: break; + case 239: + { return toUnicode(0x1F5D); + } + case 367: break; + case 108: + { return toUnicode(0x1FF3); + } + case 368: break; + case 131: + { return toUnicode(0x1FC9); + } + case 369: break; + case 68: + { return toUnicode(0x1F01); + } + case 370: break; + case 16: + { return toUnicode(0x03bf); + } + case 371: break; + case 242: + { return toUnicode(0x1F2F); + } + case 372: break; + case 251: + { return toUnicode(0x1F86); + } + case 373: break; + case 6: + { return toUnicode(0x00B7); + } + case 374: break; + case 31: + { return toUnicode(0x03bb); + } + case 375: break; + case 229: + { return toUnicode(0x1F2B); + } + case 376: break; + case 249: + { return toUnicode(0x1F82); + } + case 377: break; + case 2: + { return "h"; + } + case 378: break; + case 189: + { return toUnicode(0x1F54); + } + case 379: break; + case 142: + { return toUnicode(0x1FEB); + } + case 380: break; + case 96: + { return toUnicode(0x1F50); + } + case 381: break; + case 38: + { return toUnicode(0x03c7); + } + case 382: break; + case 166: + { return toUnicode(0x1F26); + } + case 383: 
break; + case 4: + { return toUnicode(0x03c3); + } + case 384: break; + case 148: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { return toUnicode(0x03c3); + } + case 385: break; + case 164: + { return toUnicode(0x1F22); + } + case 386: break; + case 98: + { return toUnicode(0x1F7A); + } + case 387: break; + case 100: + { return toUnicode(0x1FE6); + } + case 388: break; + case 19: + { return toUnicode(0x0345); + } + case 389: break; + case 218: + { return toUnicode(0x1F4C); + } + case 390: break; + case 194: + { return toUnicode(0x1FE2); + } + case 391: break; + case 95: + { return toUnicode(0x1F79); + } + case 392: break; + case 114: + { return toUnicode(0x1FB8); + } + case 393: break; + case 82: + { return toUnicode(0x1F75); + } + case 394: break; + case 158: + { return toUnicode(0x1FB4); + } + case 395: break; + case 8: + { return toUnicode(0x0313); + } + case 396: break; + case 209: + { return toUnicode(0x1F1A); + } + case 397: break; + case 70: + { return toUnicode(0x1F71); + } + case 398: break; + case 40: + { return "H"; + } + case 399: break; + case 55: + { return toUnicode(0x0398); + } + case 400: break; + case 73: + { return toUnicode(0x1FB0); + } + case 401: break; + case 285: + { return toUnicode(0x1F9F); + } + case 402: break; + case 53: + { return toUnicode(0x0394); + } + case 403: break; + case 186: + { return toUnicode(0x1F43); + } + case 404: break; + case 279: + { return toUnicode(0x1F9B); + } + case 405: break; + case 135: + { return toUnicode(0x1FDA); + } + case 406: break; + case 123: + { return toUnicode(0x1F19); + } + case 407: break; + case 28: + { return toUnicode(0x03b6); + } + case 408: break; + case 163: + { return toUnicode(0x1F15); + } + case 409: break; + case 240: + { return toUnicode(0x1F6D); + } + case 410: break; + case 274: + { return toUnicode(0x1FAC); + } + case 411: break; + case 25: + { return toUnicode(0x03b2); + } + case 412: break; + case 138: + { return toUnicode(0x1FD9); + } + case 
413: break; + case 76: + { return toUnicode(0x1F11); + } + case 414: break; + case 243: + { return toUnicode(0x1F3F); + } + case 415: break; + case 257: + { return toUnicode(0x1F96); + } + case 416: break; + case 230: + { return toUnicode(0x1F3B); + } + case 417: break; + case 255: + { return toUnicode(0x1F92); + } + case 418: break; + case 91: + { return toUnicode(0x1FD1); + } + case 419: break; + case 121: + { return toUnicode(0x1F68); + } + case 420: break; + case 266: + { return toUnicode(0x1FA7); + } + case 421: break; + case 20: + { return toUnicode(0x0306); + } + case 422: break; + case 234: + { return toUnicode(0x1F0D); + } + case 423: break; + case 198: + { return toUnicode(0x1F64); + } + case 424: break; + case 264: + { return toUnicode(0x1FA3); + } + case 425: break; + case 146: + { return toUnicode(0x1FFB); + } + case 426: break; + case 12: + { return toUnicode(0x0302); + } + case 427: break; + case 103: + { return toUnicode(0x1F60); + } + case 428: break; + case 289: + { return "("; + } + case 429: break; + case 177: + { return toUnicode(0x1F36); + } + case 430: break; + case 275: + { return toUnicode(0x1F8E); + } + case 431: break; + case 175: + { return toUnicode(0x1F32); + } + case 432: break; + case 49: + { return toUnicode(0x03a9); + } + case 433: break; + case 269: + { return toUnicode(0x1F8A); + } + case 434: break; + case 116: + { return toUnicode(0x1F08); + } + case 435: break; + case 107: + { return toUnicode(0x1FF6); + } + case 436: break; + case 267: + { return ">"; + } + case 437: break; + case 48: + { return toUnicode(0x03a5); + } + case 438: break; + case 58: + { return toUnicode(0x039c); + } + case 439: break; + case 150: + { return toUnicode(0x1F04); + } + case 440: break; + case 205: + { return toUnicode(0x1FF2); + } + case 441: break; + case 50: + { return toUnicode(0x03a1); + } + case 442: break; + case 246: + { return toUnicode(0x1F89); + } + case 443: break; + case 130: + { return toUnicode(0x1FC8); + } + case 444: break; + case 
67: + { return toUnicode(0x1F00); + } + case 445: break; + case 34: + { return toUnicode(0x03be); + } + case 446: break; + case 221: + { return toUnicode(0x1F2E); + } + case 447: break; + case 253: + { return toUnicode(0x1F85); + } + case 448: break; + case 173: + { return toUnicode(0x1FC4); + } + case 449: break; + case 24: + { return toUnicode(0x0323); + } + case 450: break; + case 30: + { return toUnicode(0x03ba); + } + case 451: break; + case 210: + { return toUnicode(0x1F2A); + } + case 452: break; + case 156: + { return toUnicode(0x1F81); + } + case 453: break; + case 193: + { return toUnicode(0x1F57); + } + case 454: break; + case 191: + { return toUnicode(0x1F53); + } + case 455: break; + case 141: + { return toUnicode(0x1FEA); + } + case 456: break; + case 124: + { return toUnicode(0x1F29); + } + case 457: break; + case 37: + { return toUnicode(0x03c6); + } + case 458: break; + case 169: + { return toUnicode(0x1F25); + } + case 459: break; + case 106: + { return toUnicode(0x1F7D); + } + case 460: break; + case 113: + { return toUnicode(0x1FBC); + } + case 461: break; + case 66: + // lookahead expression with fixed base length + zzMarkedPos = zzStartRead + 1; + { return toUnicode(0x03c2); + } + case 462: break; + case 144: + { return toUnicode(0x1FE9); + } + case 463: break; + case 80: + { return toUnicode(0x1F21); + } + case 464: break; + case 110: + { return toUnicode(0x1FE5); + } + case 465: break; + case 1: + { return yytext(); + } + case 466: break; + case 231: + { return toUnicode(0x1F4B); + } + case 467: break; + case 102: + { return toUnicode(0x1FE1); + } + case 468: break; + case 94: + { return toUnicode(0x1F78); + } + case 469: break; + case 159: + { return toUnicode(0x1FB7); + } + case 470: break; + case 235: + { return toUnicode(0x1F1D); + } + case 471: break; + case 81: + { return toUnicode(0x1F74); + } + case 472: break; + case 72: + { return toUnicode(0x1FB3); + } + case 473: break; + case 69: + { return toUnicode(0x1F70); + } + case 474: 
break; + case 45: + { return toUnicode(0x0397); + } + case 475: break; + case 276: + { return toUnicode(0x1F9E); + } + case 476: break; + case 52: + { return toUnicode(0x0393); + } + case 477: break; + case 184: + { return toUnicode(0x1F42); + } + case 478: break; + case 15: + { return toUnicode(0x03b9); + } + case 479: break; + case 270: + { return toUnicode(0x1F9A); + } + case 480: break; + case 117: + { return toUnicode(0x1F18); + } + case 481: break; + case 286: + { return toUnicode(0x1FAF); + } + case 482: break; + case 13: + { return toUnicode(0x03b5); + } + case 483: break; + case 161: + { return toUnicode(0x1F14); + } + case 484: break; + case 219: + { return toUnicode(0x1F6C); + } + case 485: break; + case 280: + { return toUnicode(0x1FAB); + } + case 486: break; + case 7: + { return toUnicode(0x03b1); + } + case 487: break; + case 247: + { return toUnicode(0x1F99); + } + case 488: break; + case 137: + { return toUnicode(0x1FD8); + } + case 489: break; + case 75: + { return toUnicode(0x1F10); + } + case 490: break; + case 222: + { return toUnicode(0x1F3E); + } + case 491: break; + case 259: + { return toUnicode(0x1F95); + } + case 492: break; + case 211: + { return toUnicode(0x1F3A); + } + case 493: break; + case 171: + { return toUnicode(0x1F91); + } + case 494: break; + case 90: + { return toUnicode(0x1FD0); + } + case 495: break; + case 203: + { return toUnicode(0x1F67); + } + case 496: break; + case 263: + { return toUnicode(0x1FA6); + } + case 497: break; + case 214: + { return toUnicode(0x1F0C); + } + case 498: break; + case 201: + { return toUnicode(0x1F63); + } + case 499: break; + case 261: + { return toUnicode(0x1FA2); + } + case 500: break; + case 145: + { return toUnicode(0x1FFA); + } + case 501: break; + case 125: + { return toUnicode(0x1F39); + } + case 502: break; + case 11: + { return toUnicode(0x0301); + } + case 503: break; + case 290: + { return "'"; + } + case 504: break; + case 179: + { return toUnicode(0x1F35); + } + case 505: break; 
+ case 281: + { return toUnicode(0x1F8D); + } + case 506: break; + case 134: + { return toUnicode(0x1FCC); + } + case 507: break; + case 140: + { return toUnicode(0x1FF9); + } + case 508: break; + case 86: + { return toUnicode(0x1F31); + } + case 509: break; + case 65: + { return toUnicode(0x03a8); + } + case 510: break; + case 47: + { return toUnicode(0x039f); + } + case 511: break; + case 155: + { return toUnicode(0x1F07); + } + case 512: break; + case 244: + { return toUnicode(0x1F5F); + } + case 513: break; + case 62: + { return toUnicode(0x03a4); + } + case 514: break; + case 57: + { return toUnicode(0x039b); + } + case 515: break; + case 153: + { return toUnicode(0x1F03); + } + case 516: break; + case 232: + { return toUnicode(0x1F5B); + } + case 517: break; + case 61: + { return toUnicode(0x03a0); + } + case 518: break; + case 224: + { return toUnicode(0x1F88); + } + case 519: break; + case 174: + { return toUnicode(0x1FC7); + } + case 520: break; + case 33: + { return toUnicode(0x03bd); + } + case 521: break; + case 236: + { return toUnicode(0x1F2D); + } + case 522: break; + case 250: + { return toUnicode(0x1F84); + } + case 523: break; + case 84: + { return toUnicode(0x1FC3); + } + case 524: break; + case 152: + { return toUnicode(0x1F80); + } + case 525: break; + case 3: + { return "f"; + } + case 526: break; + case 190: + { return toUnicode(0x1F56); + } + case 527: break; + case 188: + { return toUnicode(0x1F52); + } + case 528: break; + case 18: + { return toUnicode(0x03c9); + } + case 529: break; + case 118: + { return toUnicode(0x1F28); + } + case 530: break; + case 17: + { return toUnicode(0x03c5); + } + case 531: break; + case 165: + { return toUnicode(0x1F24); + } + case 532: break; + case 105: + { return toUnicode(0x1F7C); + } + case 533: break; + case 112: + { return toUnicode(0x1FBB); + } + case 534: break; + case 23: + { return toUnicode(0x03c1); + } + case 535: break; + case 143: + { return toUnicode(0x1FE8); + } + case 536: break; + case 79: 
+ { return toUnicode(0x1F20); + } + case 537: break; + case 109: + { return toUnicode(0x1FE4); + } + case 538: break; + case 212: + { return toUnicode(0x1F4A); + } + case 539: break; + case 101: + { return toUnicode(0x1FE0); + } + case 540: break; + case 88: + { return toUnicode(0x1F77); + } + case 541: break; + case 71: + { return toUnicode(0x1FB6); + } + case 542: break; + case 215: + { return toUnicode(0x1F1C); + } + case 543: break; + case 78: + { return toUnicode(0x1F73); + } + case 544: break; + case 157: + { return toUnicode(0x1FB2); + } + case 545: break; + case 126: + { return toUnicode(0x1F49); + } + case 546: break; + case 41: + { return "F"; + } + case 547: break; + case 54: + { return toUnicode(0x0396); + } + case 548: break; + case 187: + { return toUnicode(0x1F45); + } + case 549: break; + case 282: + { return toUnicode(0x1F9D); + } + case 550: break; + case 51: + { return toUnicode(0x0392); + } + case 551: break; + case 93: + { return toUnicode(0x1F41); + } + case 552: break; + case 29: + { return toUnicode(0x03b8); + } + case 553: break; + case 245: + { return toUnicode(0x1F6F); + } + case 554: break; + case 277: + { return toUnicode(0x1FAE); + } + case 555: break; + case 27: + { return toUnicode(0x03b4); + } + case 556: break; + case 162: + { return toUnicode(0x1F13); + } + case 557: break; + case 233: + { return toUnicode(0x1F6B); + } + case 558: break; + case 271: + { return toUnicode(0x1FAA); + } + case 559: break; + case 225: + { return toUnicode(0x1F98); + } + case 560: break; + case 183: + { return toUnicode(0x1FD7); + } + case 561: break; + case 237: + { return toUnicode(0x1F3D); + } + case 562: break; + case 256: + { return toUnicode(0x1F94); + } + case 563: break; + case 182: + { return toUnicode(0x1FD3); + } + case 564: break; + case 248: + { return toUnicode(0x1FA9); + } + case 565: break; + case 22: + { return toUnicode(0x0308); + } + case 566: break; + case 167: + { return toUnicode(0x1F90); + } + case 567: break; + case 241: + { 
return toUnicode(0x1F0F); + } + case 568: break; + case 199: + { return toUnicode(0x1F66); + } + case 569: break; + case 5: + { return "."; + } + case 570: break; + case 265: + { return toUnicode(0x1FA5); + } + case 571: break; + case 21: + { return toUnicode(0x0304); + } + case 572: break; + case 227: + { return toUnicode(0x1F0B); + } + case 573: break; + case 197: + { return toUnicode(0x1F62); + } + case 574: break; + case 204: + { return toUnicode(0x1FA1); + } + case 575: break; + case 119: + { return toUnicode(0x1F38); + } + case 576: break; + case 10: + { return toUnicode(0x0300); + } + case 577: break; + case 176: + { return toUnicode(0x1F34); + } + case 578: break; + case 272: + { return toUnicode(0x1F8C); + } + case 579: break; + case 133: + { return toUnicode(0x1FCB); + } + case 580: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Buckwalter2Unicode.lex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Buckwalter2Unicode.lex Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,121 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +%% +%{ + /* + * Buckwalter transliteration to Unicode conversion + */ + +%} + +%class Buckwalter2UnicodeLex +%public +%type java.lang.String +%unicode +%% + + +"<"[^>]+">" { return yytext(); } + +"'" { return "\u0621"; } /* Hamza */ +"|" { return "\u0622"; } /* ALEF WITH MADDA ABOVE from AraMorph */ +">" { return "\u0623"; } /* Alif + HamzaAbove */ +"&" { return "\u0624"; } /* Waw + HamzaAbove */ +"<" { return "\u0625"; } /* Alif + HamzaBelow */ +"}" { return "\u0626"; } /* Ya + HamzaAbove */ +"A" { return "\u0627"; } /* Alif */ +"b" { return "\u0628"; } /* Ba */ +"p" { return "\u0629"; } /* TaMarbuta */ +"t" { return "\u062A"; } /* Ta */ +"v" { return "\u062B"; } /* Tha */ +"j" { return
"\u062C"; } /* Jeem */ +"H" { return "\u062D"; } /* HHa */ +"x" { return "\u062E"; } /* Kha */ +"d" { return "\u062F"; } /* Dal */ +"*" { return "\u0630"; } /* Thal */ +"r" { return "\u0631"; } /* Ra */ +"z" { return "\u0632"; } /* Zain */ +"s" { return "\u0633"; } /* Seen */ +"$" { return "\u0634"; } /* Sheen */ +"S" { return "\u0635"; } /* Sad */ +"D" { return "\u0636"; } /* DDad */ +"T" { return "\u0637"; } /* TTa */ +"Z" { return "\u0638"; } /* DTha */ +"E" { return "\u0639"; } /* Ain */ +"g" { return "\u063A"; } /* Ghain */ + +"_" { return "\u0640"; } /* Tatweel */ +"f" { return "\u0641"; } /* Fa */ +"q" { return "\u0642"; } /* Qaf */ +"k" { return "\u0643"; } /* Kaf */ +"l" { return "\u0644"; } /* Lam */ +"m" { return "\u0645"; } /* Meem */ +"n" { return "\u0646"; } /* Noon */ +"h" { return "\u0647"; } /* Ha */ +"w" { return "\u0648"; } /* Waw */ +"Y" { return "\u0649"; } /* AlifMaksura */ +"y" { return "\u064A"; } /* Ya */ +"F" { return "\u064B"; } /* Fathatan */ +"N" { return "\u064C"; } /* Dammatan */ +"K" { return "\u064D"; } /* Kasratan */ +"a" { return "\u064E"; } /* Fatha */ +"u" { return "\u064F"; } /* Damma */ +"i" { return "\u0650"; } /* Kasra */ +"~" { return "\u0651"; } /* Shadda */ +"o" { return "\u0652"; } /* Sukun */ +"^" { return "\u0653"; } /* Maddah */ +"#" { return "\u0654"; } /* HamzaAbove */ + +"`" { return "\u0670"; } /* AlifKhanjareeya */ +"{" { return "\u0671"; } /* Alif + HamzatWasl */ + +"P" { return "\u067E"; } /* PEH from AraMorph */ +"J" { return "\u0686"; } /* TCHEH from AraMorph */ +"V" { return "\u06A4"; } /* VEH from AraMorph */ +"G" { return "\u06AF"; } /* GAF from AraMorph */ +"R" { return "\u0698"; } /* JEH from AraMorph */ +"?" 
{ return "\u061F"; } /* QUESTION MARK from AraMorph */ + +":" { return "\u06DC"; } /* SmallHighSeen */ +"@" { return "\u06DF"; } /* SmallHighRoundedZero */ + +"[" { return "\u06E2"; } /* SmallHighMeemIsolatedForm */ +";" { return "\u06E3"; } /* SmallLowSeen */ +"," { return "\u06E5"; } /* SmallWaw */ +"." { return "\u06E6"; } /* SmallYa */ +"!" { return "\u06E8"; } /* SmallHighNoon */ +"-" { return "\u06EA"; } /* EmptyCentreLowStop */ +"+" { return "\u06EB"; } /* EmptyCentreHighStop */ +"%" { return "\u06EC"; } /* RoundedHighStopWithFilledCentre */ +"]" { return "\u06ED"; } /* SmallLowMeem */ + +[\&_]"vert;" { return "|"; } +[\&_]"lpar;" { return "("; } +[\&_]"rpar;" { return ")"; } +[\_\&]"lt;" { return "<"; } +[\_\&]"gt;" { return ">"; } +"'" { return "'"; } + +"&"[a-zA-Z]+";" { return yytext(); } + +. { return yytext(); } +\n { return yytext(); } + +/* causes problems */ +/* "\\"" { return "\u06E0"; } SmallHighUprightRectangularZero */ + + +/* duplicate entries */ +/* "," { return "\u060C"; } COMMA from AraMorph */ +/* ";" { return "\u061B"; } SEMICOLON from AraMorph */ + +/* not contained in Buckwalter */ +/* \u0679 : ARABIC LETTER TTEH */ +/* \u0688 : ARABIC LETTER DDAL */ +/* \u06A9 : ARABIC LETTER KEHEH */ +/* \u0691 : ARABIC LETTER RREH */ +/* \u06BA : ARABIC LETTER NOON GHUNNA */ +/* \u06BE : ARABIC LETTER HEH DOACHASHMEE */ +/* \u06C1 : ARABIC LETTER HEH GOAL */ +/* \u06D2 : ARABIC LETTER YEH BARREE */ + diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Buckwalter2UnicodeLex.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Buckwalter2UnicodeLex.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,909 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + + +/** + * This class is a scanner generated by + * JFlex 1.4.3 + * on 20.11.09 17:57 from the specification file + *
/Users/jwillenborg/java/existDevMai2009/mpdl/extensions/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Buckwalter2Unicode.lex + */ +public class Buckwalter2UnicodeLex { + + /** This character denotes the end of file */ + public static final int YYEOF = -1; + + /** initial size of the lookahead buffer */ + private static final int ZZ_BUFFERSIZE = 16384; + + /** lexical states */ + public static final int YYINITIAL = 0; + + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0 + }; + + /** + * Translates characters to character classes + */ + private static final String ZZ_CMAP_PACKED = + "\12\0\1\0\26\0\1\76\1\0\1\57\1\24\1\101\1\5\1\3"+ + "\2\0\1\20\1\100\1\74\1\77\1\75\1\0\1\104\2\0\1\105"+ + "\5\0\1\106\1\70\1\73\1\1\1\0\1\2\1\67\1\71\1\7"+ + "\2\107\1\26\1\31\1\46\1\65\1\15\1\107\1\63\1\50\2\107"+ + "\1\47\1\107\1\62\1\107\1\66\1\25\1\27\1\107\1\64\2\107"+ + "\1\44\1\30\1\72\1\0\1\102\1\56\1\33\1\60\1\51\1\10"+ + "\1\107\1\17\1\103\1\34\1\32\1\42\1\53\1\14\1\36\1\37"+ + "\1\40\1\41\1\55\1\11\1\35\1\21\1\23\1\12\1\52\1\13"+ + "\1\43\1\16\1\45\1\22\1\61\1\4\1\6\1\54\uff81\0"; + + /** + * Translates characters to character classes + */ + private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); + + /** + * Translates DFA states to action switch labels. 
+ */ + private static final int [] ZZ_ACTION = zzUnpackAction(); + + private static final String ZZ_ACTION_PACKED_0 = + "\1\0\1\1\1\2\1\3\1\4\1\5\1\6\1\7"+ + "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+ + "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+ + "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37"+ + "\1\40\1\41\1\42\1\43\1\44\1\45\1\46\1\47"+ + "\1\50\1\51\1\52\1\53\1\54\1\55\1\56\1\57"+ + "\1\60\1\61\1\62\1\63\1\64\1\65\1\66\1\67"+ + "\1\70\1\71\1\72\1\73\1\74\1\75\1\76\1\77"+ + "\1\100\1\101\1\102\1\103\30\0\1\104\1\0\1\105"+ + "\13\0\1\106\1\107\1\110\1\111"; + + private static int [] zzUnpackAction() { + int [] result = new int[110]; + int offset = 0; + offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAction(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + + /** + * Translates a state to a row index in the transition table + */ + private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); + + private static final String ZZ_ROWMAP_PACKED_0 = + "\0\0\0\110\0\220\0\110\0\110\0\110\0\330\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\u0120\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\110\0\110\0\110\0\110"+ + "\0\110\0\110\0\110\0\110\0\u0168\0\u01b0\0\u01f8\0\u0240"+ + "\0\u0288\0\u02d0\0\u0318\0\u0360\0\u03a8\0\u03f0\0\u0438\0\u0480"+ + "\0\u04c8\0\u0510\0\u0558\0\u05a0\0\u05e8\0\u0630\0\u0678\0\u06c0"+ + "\0\u0708\0\u0750\0\u0798\0\u07e0\0\110\0\u0828\0\110\0\u0870"+ + 
"\0\u08b8\0\u0900\0\u0948\0\u0990\0\u09d8\0\u0a20\0\u0a68\0\u0ab0"+ + "\0\u0af8\0\u0b40\0\110\0\110\0\110\0\110"; + + private static int [] zzUnpackRowMap() { + int [] result = new int[110]; + int offset = 0; + offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackRowMap(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int high = packed.charAt(i++) << 16; + result[j++] = high | packed.charAt(i++); + } + return j; + } + + /** + * The transition table of the DFA + */ + private static final int [] ZZ_TRANS = zzUnpackTrans(); + + private static final String ZZ_TRANS_PACKED_0 = + "\1\2\1\3\1\4\1\5\1\6\1\7\1\10\1\11"+ + "\1\12\1\13\1\14\1\15\1\16\1\17\1\20\1\21"+ + "\1\22\1\23\1\24\1\25\1\26\1\27\1\30\1\31"+ + "\1\32\1\33\1\34\1\35\1\36\1\37\1\40\1\41"+ + "\1\42\1\43\1\44\1\45\1\46\1\47\1\50\1\51"+ + "\1\52\1\53\1\54\1\55\1\56\1\57\1\60\1\61"+ + "\1\62\1\63\1\64\1\65\1\66\1\67\1\70\1\71"+ + "\1\72\1\73\1\74\1\75\1\76\1\77\1\100\1\101"+ + "\1\102\1\103\1\104\5\2\110\0\2\105\1\0\105\105"+ + "\7\0\4\106\1\107\4\106\1\0\1\110\2\106\1\0"+ + "\5\106\1\111\1\0\3\106\1\112\14\106\1\0\1\106"+ + "\1\0\1\113\2\0\5\106\14\0\1\106\3\0\1\106"+ + "\13\0\1\114\5\0\1\115\10\0\1\116\4\0\1\117"+ + "\50\0\2\105\1\2\105\105\7\0\11\106\1\0\3\106"+ + "\1\0\6\106\1\0\20\106\1\0\1\106\4\0\5\106"+ + "\4\0\1\2\7\0\1\106\3\0\1\106\7\0\11\106"+ + "\1\0\3\106\1\0\6\106\1\0\20\106\1\0\1\106"+ + "\4\0\5\106\4\0\1\2\7\0\1\120\3\0\1\106"+ + "\7\0\2\106\1\121\6\106\1\0\3\106\1\0\6\106"+ + "\1\0\20\106\1\0\1\106\4\0\5\106\4\0\1\2"+ + "\7\0\1\106\3\0\1\106\7\0\3\106\1\122\5\106"+ + "\1\0\3\106\1\0\6\106\1\0\20\106\1\0\1\106"+ + "\4\0\5\106\4\0\1\2\7\0\1\106\3\0\1\106"+ + "\7\0\2\106\1\123\1\124\5\106\1\0\3\106\1\0"+ + "\6\106\1\0\20\106\1\0\1\106\4\0\5\106\4\0"+ + "\1\2\7\0\1\106\3\0\1\106\104\0\1\125\106\0"+ + 
"\1\126\15\0\1\127\110\0\1\130\106\0\1\131\1\132"+ + "\104\0\11\106\1\0\1\133\2\106\1\0\6\106\1\0"+ + "\20\106\1\0\1\106\4\0\5\106\4\0\1\2\7\0"+ + "\1\106\3\0\1\106\7\0\11\106\1\0\3\106\1\0"+ + "\6\106\1\0\15\106\1\134\2\106\1\0\1\106\4\0"+ + "\5\106\4\0\1\2\7\0\1\106\3\0\1\106\7\0"+ + "\11\106\1\0\3\106\1\0\6\106\1\0\20\106\1\0"+ + "\1\106\4\0\5\106\4\0\1\135\7\0\1\106\3\0"+ + "\1\106\7\0\11\106\1\0\3\106\1\0\6\106\1\0"+ + "\15\106\1\136\2\106\1\0\1\106\4\0\5\106\4\0"+ + "\1\2\7\0\1\106\3\0\1\106\7\0\11\106\1\0"+ + "\3\106\1\0\6\106\1\0\20\106\1\0\1\106\4\0"+ + "\5\106\4\0\1\137\7\0\1\106\3\0\1\106\105\0"+ + "\1\140\23\0\1\141\137\0\1\142\131\0\1\135\65\0"+ + "\1\143\131\0\1\137\23\0\3\106\1\144\5\106\1\0"+ + "\3\106\1\0\6\106\1\0\20\106\1\0\1\106\4\0"+ + "\5\106\4\0\1\2\7\0\1\106\3\0\1\106\7\0"+ + "\11\106\1\0\1\145\2\106\1\0\6\106\1\0\20\106"+ + "\1\0\1\106\4\0\5\106\4\0\1\2\7\0\1\106"+ + "\3\0\1\106\7\0\11\106\1\0\1\146\2\106\1\0"+ + "\6\106\1\0\20\106\1\0\1\106\4\0\5\106\4\0"+ + "\1\2\7\0\1\106\3\0\1\106\106\0\1\147\13\0"+ + "\1\150\116\0\1\151\107\0\1\152\75\0\11\106\1\0"+ + "\3\106\1\0\6\106\1\0\20\106\1\0\1\106\4\0"+ + "\5\106\4\0\1\153\7\0\1\106\3\0\1\106\7\0"+ + "\11\106\1\0\3\106\1\0\6\106\1\0\20\106\1\0"+ + "\1\106\4\0\5\106\4\0\1\154\7\0\1\106\3\0"+ + "\1\106\7\0\11\106\1\0\3\106\1\0\6\106\1\0"+ + "\20\106\1\0\1\106\4\0\5\106\4\0\1\155\7\0"+ + "\1\106\3\0\1\106\73\0\1\156\107\0\1\153\107\0"+ + "\1\154\107\0\1\155\14\0"; + + private static int [] zzUnpackTrans() { + int [] result = new int[2952]; + int offset = 0; + offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackTrans(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + value--; + do result[j++] = value; while (--count > 0); + } + return j; + } + + 
+ /* error codes */ + private static final int ZZ_UNKNOWN_ERROR = 0; + private static final int ZZ_NO_MATCH = 1; + private static final int ZZ_PUSHBACK_2BIG = 2; + + /* error messages for the codes above */ + private static final String ZZ_ERROR_MSG[] = { + "Unkown internal scanner error", + "Error: could not match input", + "Error: pushback value was too large" + }; + + /** + * ZZ_ATTRIBUTE[aState] contains the attributes of state aState + */ + private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); + + private static final String ZZ_ATTRIBUTE_PACKED_0 = + "\1\0\1\11\1\1\3\11\1\1\25\11\1\1\47\11"+ + "\30\0\1\11\1\0\1\11\13\0\4\11"; + + private static int [] zzUnpackAttribute() { + int [] result = new int[110]; + int offset = 0; + offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); + return result; + } + + private static int zzUnpackAttribute(String packed, int offset, int [] result) { + int i = 0; /* index in packed string */ + int j = offset; /* index in unpacked array */ + int l = packed.length(); + while (i < l) { + int count = packed.charAt(i++); + int value = packed.charAt(i++); + do result[j++] = value; while (--count > 0); + } + return j; + } + + /** the input device */ + private java.io.Reader zzReader; + + /** the current state of the DFA */ + private int zzState; + + /** the current lexical state */ + private int zzLexicalState = YYINITIAL; + + /** this buffer contains the current text to be matched and is + the source of the yytext() string */ + private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; + + /** the textposition at the last accepting state */ + private int zzMarkedPos; + + /** the current text position in the buffer */ + private int zzCurrentPos; + + /** startRead marks the beginning of the yytext() string in the buffer */ + private int zzStartRead; + + /** endRead marks the last character in the buffer, that has been read + from input */ + private int zzEndRead; + + /** number of newlines encountered up to the start of 
the matched text */ + private int yyline; + + /** the number of characters up to the start of the matched text */ + private int yychar; + + /** + * the number of characters from the last newline up to the start of the + * matched text + */ + private int yycolumn; + + /** + * zzAtBOL == true <=> the scanner is currently at the beginning of a line + */ + private boolean zzAtBOL = true; + + /** zzAtEOF == true <=> the scanner is at the EOF */ + private boolean zzAtEOF; + + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + + /* user code: */ + /* + * Buckwalter transliteration to Unicode conversion + */ + + + + /** + * Creates a new scanner + * There is also a java.io.InputStream version of this constructor. + * + * @param in the java.io.Reader to read input from. + */ + public Buckwalter2UnicodeLex(java.io.Reader in) { + this.zzReader = in; + } + + /** + * Creates a new scanner. + * There is also a java.io.Reader version of this constructor. + * + * @param in the java.io.InputStream to read input from. + */ + public Buckwalter2UnicodeLex(java.io.InputStream in) { + this(new java.io.InputStreamReader(in)); + } + + /** + * Unpacks the compressed character translation table. + * + * @param packed the packed character translation table + * @return the unpacked character translation table + */ + private static char [] zzUnpackCMap(String packed) { + char [] map = new char[0x10000]; + int i = 0; /* index in packed string */ + int j = 0; /* index in unpacked array */ + while (i < 178) { + int count = packed.charAt(i++); + char value = packed.charAt(i++); + do map[j++] = value; while (--count > 0); + } + return map; + } + + + /** + * Refills the input buffer. + * + * @return false, iff there was new input.
+ * + * @exception java.io.IOException if any I/O-Error occurs + */ + private boolean zzRefill() throws java.io.IOException { + + /* first: make room (if you can) */ + if (zzStartRead > 0) { + System.arraycopy(zzBuffer, zzStartRead, + zzBuffer, 0, + zzEndRead-zzStartRead); + + /* translate stored positions */ + zzEndRead-= zzStartRead; + zzCurrentPos-= zzStartRead; + zzMarkedPos-= zzStartRead; + zzStartRead = 0; + } + + /* is the buffer big enough? */ + if (zzCurrentPos >= zzBuffer.length) { + /* if not: blow it up */ + char newBuffer[] = new char[zzCurrentPos*2]; + System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); + zzBuffer = newBuffer; + } + + /* finally: fill the buffer with new input */ + int numRead = zzReader.read(zzBuffer, zzEndRead, + zzBuffer.length-zzEndRead); + + if (numRead > 0) { + zzEndRead+= numRead; + return false; + } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; + } + + + /** + * Closes the input stream. + */ + public final void yyclose() throws java.io.IOException { + zzAtEOF = true; /* indicate end of file */ + zzEndRead = zzStartRead; /* invalidate buffer */ + + if (zzReader != null) + zzReader.close(); + } + + + /** + * Resets the scanner to read from a new input stream. + * Does not close the old reader. + * + * All internal variables are reset, the old input stream + * cannot be reused (internal buffer is discarded and lost). + * Lexical state is set to YYINITIAL.
+ * + * @param reader the new input stream + */ + public final void yyreset(java.io.Reader reader) { + zzReader = reader; + zzAtBOL = true; + zzAtEOF = false; + zzEOFDone = false; + zzEndRead = zzStartRead = 0; + zzCurrentPos = zzMarkedPos = 0; + yyline = yychar = yycolumn = 0; + zzLexicalState = YYINITIAL; + } + + + /** + * Returns the current lexical state. + */ + public final int yystate() { + return zzLexicalState; + } + + + /** + * Enters a new lexical state + * + * @param newState the new lexical state + */ + public final void yybegin(int newState) { + zzLexicalState = newState; + } + + + /** + * Returns the text matched by the current regular expression. + */ + public final String yytext() { + return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); + } + + + /** + * Returns the character at position pos from the + * matched text. + * + * It is equivalent to yytext().charAt(pos), but faster + * + * @param pos the position of the character to fetch. + * A value from 0 to yylength()-1. + * + * @return the character at position pos + */ + public final char yycharat(int pos) { + return zzBuffer[zzStartRead+pos]; + } + + + /** + * Returns the length of the matched text region. + */ + public final int yylength() { + return zzMarkedPos-zzStartRead; + } + + + /** + * Reports an error that occurred while scanning. + * + * In a well-formed scanner (no or only correct usage of + * yypushback(int) and a match-all fallback rule) this method + * will only be called with things that "Can't Possibly Happen". + * If this method is called, something is seriously wrong + * (e.g. a JFlex bug producing a faulty scanner etc.). + * + * Usual syntax/scanner level error handling should be done + * in error fallback rules.
+ * + * @param errorCode the code of the errormessage to display + */ + private void zzScanError(int errorCode) { + String message; + try { + message = ZZ_ERROR_MSG[errorCode]; + } + catch (ArrayIndexOutOfBoundsException e) { + message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; + } + + throw new Error(message); + } + + + /** + * Pushes the specified amount of characters back into the input stream. + * + * They will be read again by the next call of the scanning method + * + * @param number the number of characters to be read again. + * This number must not be greater than yylength()! + */ + public void yypushback(int number) { + if ( number > yylength() ) + zzScanError(ZZ_PUSHBACK_2BIG); + + zzMarkedPos -= number; + } + + + /** + * Resumes scanning until the next regular expression is matched, + * the end of input is encountered or an I/O-Error occurs. + * + * @return the next token + * @exception java.io.IOException if any I/O-Error occurs + */ + public java.lang.String yylex() throws java.io.IOException { + int zzInput; + int zzAction; + + // cached fields: + int zzCurrentPosL; + int zzMarkedPosL; + int zzEndReadL = zzEndRead; + char [] zzBufferL = zzBuffer; + char [] zzCMapL = ZZ_CMAP; + + int [] zzTransL = ZZ_TRANS; + int [] zzRowMapL = ZZ_ROWMAP; + int [] zzAttrL = ZZ_ATTRIBUTE; + + while (true) { + zzMarkedPosL = zzMarkedPos; + + zzAction = -1; + + zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; + + zzState = ZZ_LEXSTATE[zzLexicalState]; + + + zzForAction: { + while (true) { + + if (zzCurrentPosL < zzEndReadL) + zzInput = zzBufferL[zzCurrentPosL++]; + else if (zzAtEOF) { + zzInput = YYEOF; + break zzForAction; + } + else { + // store back cached positions + zzCurrentPos = zzCurrentPosL; + zzMarkedPos = zzMarkedPosL; + boolean eof = zzRefill(); + // get translated positions and possibly new buffer + zzCurrentPosL = zzCurrentPos; + zzMarkedPosL = zzMarkedPos; + zzBufferL = zzBuffer; + zzEndReadL = zzEndRead; + if (eof) { + zzInput = YYEOF; + break
zzForAction; + } + else { + zzInput = zzBufferL[zzCurrentPosL++]; + } + } + int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; + if (zzNext == -1) break zzForAction; + zzState = zzNext; + + int zzAttributes = zzAttrL[zzState]; + if ( (zzAttributes & 1) == 1 ) { + zzAction = zzState; + zzMarkedPosL = zzCurrentPosL; + if ( (zzAttributes & 8) == 8 ) break zzForAction; + } + + } + } + + // store back cached position + zzMarkedPos = zzMarkedPosL; + + switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { + case 54: + { return "\u06AF"; + } + case 74: break; + case 10: + { return "\u0629"; + } + case 75: break; + case 26: + { return "\u0639"; + } + case 76: break; + case 9: + { return "\u0628"; + } + case 77: break; + case 37: + { return "\u0649"; + } + case 78: break; + case 25: + { return "\u0638"; + } + case 79: break; + case 8: + { return "\u0627"; + } + case 80: break; + case 58: + { return "\u06DF"; + } + case 81: break; + case 36: + { return "\u0648"; + } + case 82: break; + case 68: + { return ">"; + } + case 83: break; + case 24: + { return "\u0637"; + } + case 84: break; + case 7: + { return "\u0626"; + } + case 85: break; + case 35: + { return "\u0647"; + } + case 86: break; + case 23: + { return "\u0636"; + } + case 87: break; + case 2: + { return "\u0625"; + } + case 88: break; + case 69: + { return "<"; + } + case 89: break; + case 34: + { return "\u0646"; + } + case 90: break; + case 67: + { return "\u06ED"; + } + case 91: break; + case 22: + { return "\u0635"; + } + case 92: break; + case 6: + { return "\u0624"; + } + case 93: break; + case 57: + { return "\u06DC"; + } + case 94: break; + case 33: + { return "\u0645"; + } + case 95: break; + case 66: + { return "\u06EC"; + } + case 96: break; + case 21: + { return "\u0634"; + } + case 97: break; + case 3: + { return "\u0623"; + } + case 98: break; + case 32: + { return "\u0644"; + } + case 99: break; + case 70: + { return "|"; + } + case 100: break; + case 65: + { return "\u06EB"; + } + 
case 101: break; + case 20: + { return "\u0633"; + } + case 102: break; + case 55: + { return "\u0698"; + } + case 103: break; + case 5: + { return "\u0622"; + } + case 104: break; + case 48: + { return "\u0654"; + } + case 105: break; + case 31: + { return "\u0643"; + } + case 106: break; + case 19: + { return "\u0632"; + } + case 107: break; + case 64: + { return "\u06EA"; + } + case 108: break; + case 4: + { return "\u0621"; + } + case 109: break; + case 52: + { return "\u0686"; + } + case 110: break; + case 47: + { return "\u0653"; + } + case 111: break; + case 30: + { return "\u0642"; + } + case 112: break; + case 18: + { return "\u0631"; + } + case 113: break; + case 46: + { return "\u0652"; + } + case 114: break; + case 29: + { return "\u0641"; + } + case 115: break; + case 17: + { return "\u0630"; + } + case 116: break; + case 45: + { return "\u0651"; + } + case 117: break; + case 28: + { return "\u0640"; + } + case 118: break; + case 44: + { return "\u0650"; + } + case 119: break; + case 1: + { return yytext(); + } + case 120: break; + case 50: + { return "\u0671"; + } + case 121: break; + case 49: + { return "\u0670"; + } + case 122: break; + case 63: + { return "\u06E8"; + } + case 123: break; + case 53: + { return "\u06A4"; + } + case 124: break; + case 56: + { return "\u061F"; + } + case 125: break; + case 16: + { return "\u062F"; + } + case 126: break; + case 62: + { return "\u06E6"; + } + case 127: break; + case 15: + { return "\u062E"; + } + case 128: break; + case 61: + { return "\u06E5"; + } + case 129: break; + case 43: + { return "\u064F"; + } + case 130: break; + case 14: + { return "\u062D"; + } + case 131: break; + case 42: + { return "\u064E"; + } + case 132: break; + case 60: + { return "\u06E3"; + } + case 133: break; + case 13: + { return "\u062C"; + } + case 134: break; + case 41: + { return "\u064D"; + } + case 135: break; + case 59: + { return "\u06E2"; + } + case 136: break; + case 12: + { return "\u062B"; + } + case 137: break; + 
case 40: + { return "\u064C"; + } + case 138: break; + case 11: + { return "\u062A"; + } + case 139: break; + case 51: + { return "\u067E"; + } + case 140: break; + case 39: + { return "\u064B"; + } + case 141: break; + case 27: + { return "\u063A"; + } + case 142: break; + case 38: + { return "\u064A"; + } + case 143: break; + case 71: + { return ")"; + } + case 144: break; + case 72: + { return "("; + } + case 145: break; + case 73: + { return "'"; + } + case 146: break; + default: + if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { + zzAtEOF = true; + return null; + } + else { + zzScanError(ZZ_NO_MATCH); + } + } + } + } + + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,38 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +import java.util.HashMap; + +public class Language { + private static Language instance; + private static HashMap languageIds = new HashMap(); + + public static Language getInstance() { + if (instance == null) { + instance = new Language(); + instance.init(); + } + return instance; + } + + private void init() { + languageIds.put("ar", "ar"); + languageIds.put("de", "de"); + languageIds.put("el", "el"); + languageIds.put("grc", "el"); + languageIds.put("en", "en"); + languageIds.put("fr", "fr"); + languageIds.put("it", "it"); + languageIds.put("la", "la"); + languageIds.put("lat", "la"); + languageIds.put("nl", "nl"); + languageIds.put("zh", "zh"); + } + + public String getLanguageId(String language) { + if (language == null) + return null; + String retLanguageId = null; + retLanguageId = languageIds.get(language); + return retLanguageId; + } +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Transcoder.java 
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Transcoder.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,163 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.general; + +import java.io.IOException; +import java.io.StringReader; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import edu.unc.epidoc.transcoder.TransCoder; + +public class Transcoder { + private static Transcoder instance; + private TransCoder betaCodeTranscoder; + + public static Transcoder getInstance() { + if (instance == null) { + instance = new Transcoder(); + } + return instance; + } + + public String transcodeFromBetaCode2UnicodeEpidoc(String inputStr) throws ApplicationException { + String encodedUnicodeStr = null; + try { + if (betaCodeTranscoder == null) { + betaCodeTranscoder = new TransCoder(); + betaCodeTranscoder.setParser("BetaCode"); + betaCodeTranscoder.setConverter("UnicodeC"); + } + encodedUnicodeStr = betaCodeTranscoder.getString(inputStr); + } catch (Exception e) { + throw new ApplicationException(e); + } + return encodedUnicodeStr; + } + + public String transcodeFromBetaCode2Unicode(String inputStr) throws ApplicationException { + StringReader strReader = new StringReader(inputStr); + Betacode2UnicodeLex betacode2UnicodeLex = new Betacode2UnicodeLex(strReader); + String retStr = ""; + String token = ""; + while (token != null) { + try { + token = betacode2UnicodeLex.yylex(); + if (token != null) + retStr += token; + } catch (IOException e ) { + throw new ApplicationException(e); + } + } + return retStr; + /* + // alternative to JFlex + String encodedUnicodeStr = null; + if (inputStr.matches("^a)")) + encodedUnicodeStr = inputStr.replaceFirst("^a)", "\u1F00"); + else if (inputStr.matches("^a(")) + encodedUnicodeStr = inputStr.replaceFirst("^a(", "\u1F01"); + else if (inputStr.matches("^a)\\")) + encodedUnicodeStr = inputStr.replaceFirst("^a)\\", "\u1F02"); + + // the longest regular expressions first 
+ + return encodedUnicodeStr; + */ + } + + public String transcodeFromBuckwalter2Unicode(String inputStr) throws ApplicationException { + StringReader strReader = new StringReader(inputStr); + Buckwalter2UnicodeLex buckwalter2UnicodeLex = new Buckwalter2UnicodeLex(strReader); + String retStr = ""; + String token = ""; + while (token != null) { + try { + token = buckwalter2UnicodeLex.yylex(); + if (token != null) + retStr += token; + } catch (IOException e ) { + throw new ApplicationException(e); + } + } + return retStr; + } + + + + public String transcodeFromBuckwalter2UnicodeAraMorph(String inputStr) { + String encodedUnicodeStr = arabizeWord(inputStr); + return encodedUnicodeStr; + } + + /* + * copied from http://www.nongnu.org/aramorph/english/download.html + * Class: AraMorph + */ + private String arabizeWord(String translitered) { + String tmp_word = translitered; + // convert to transliteration + tmp_word = tmp_word.replaceAll("'", "\u0621"); //\u0621 : ARABIC LETTER HAMZA + tmp_word = tmp_word.replaceAll("\\|", "\u0622"); //\u0622 : ARABIC LETTER ALEF WITH MADDA ABOVE + tmp_word = tmp_word.replaceAll(">", "\u0623"); //\u0623 : ARABIC LETTER ALEF WITH HAMZA ABOVE + tmp_word = tmp_word.replaceAll("&", "\u0624"); //\u0624 : ARABIC LETTER WAW WITH HAMZA ABOVE + tmp_word = tmp_word.replaceAll("<", "\u0625"); //\u0625 : ARABIC LETTER ALEF WITH HAMZA BELOW + tmp_word = tmp_word.replaceAll("}", "\u0626"); //\u0626 : ARABIC LETTER YEH WITH HAMZA ABOVE + tmp_word = tmp_word.replaceAll("A", "\u0627"); //\u0627 : ARABIC LETTER ALEF + tmp_word = tmp_word.replaceAll("b", "\u0628"); //\u0628 : ARABIC LETTER BEH + tmp_word = tmp_word.replaceAll("p", "\u0629"); //\u0629 : ARABIC LETTER TEH MARBUTA + tmp_word = tmp_word.replaceAll("t", "\u062A"); //\u062A : ARABIC LETTER TEH + tmp_word = tmp_word.replaceAll("v", "\u062B"); //\u062B : ARABIC LETTER THEH + tmp_word = tmp_word.replaceAll("j", "\u062C"); //\u062C : ARABIC LETTER JEEM + tmp_word = tmp_word.replaceAll("H", 
"\u062D"); //\u062D : ARABIC LETTER HAH + tmp_word = tmp_word.replaceAll("x", "\u062E"); //\u062E : ARABIC LETTER KHAH + tmp_word = tmp_word.replaceAll("d", "\u062F"); //\u062F : ARABIC LETTER DAL + tmp_word = tmp_word.replaceAll("\\*", "\u0630"); //\u0630 : ARABIC LETTER THAL + tmp_word = tmp_word.replaceAll("r", "\u0631"); //\u0631 : ARABIC LETTER REH + tmp_word = tmp_word.replaceAll("z", "\u0632"); //\u0632 : ARABIC LETTER ZAIN + tmp_word = tmp_word.replaceAll("s", "\u0633" ); //\u0633 : ARABIC LETTER SEEN + tmp_word = tmp_word.replaceAll("\\$", "\u0634"); //\u0634 : ARABIC LETTER SHEEN + tmp_word = tmp_word.replaceAll("S", "\u0635"); //\u0635 : ARABIC LETTER SAD + tmp_word = tmp_word.replaceAll("D", "\u0636"); //\u0636 : ARABIC LETTER DAD + tmp_word = tmp_word.replaceAll("T", "\u0637"); //\u0637 : ARABIC LETTER TAH + tmp_word = tmp_word.replaceAll("Z", "\u0638"); //\u0638 : ARABIC LETTER ZAH + tmp_word = tmp_word.replaceAll("E", "\u0639"); //\u0639 : ARABIC LETTER AIN + tmp_word = tmp_word.replaceAll("g", "\u063A"); //\u063A : ARABIC LETTER GHAIN + tmp_word = tmp_word.replaceAll("_", "\u0640"); //\u0640 : ARABIC TATWEEL + tmp_word = tmp_word.replaceAll("f", "\u0641"); //\u0641 : ARABIC LETTER FEH + tmp_word = tmp_word.replaceAll("q", "\u0642"); //\u0642 : ARABIC LETTER QAF + tmp_word = tmp_word.replaceAll("k", "\u0643"); //\u0643 : ARABIC LETTER KAF + tmp_word = tmp_word.replaceAll("l", "\u0644"); //\u0644 : ARABIC LETTER LAM + tmp_word = tmp_word.replaceAll("m", "\u0645"); //\u0645 : ARABIC LETTER MEEM + tmp_word = tmp_word.replaceAll("n", "\u0646"); //\u0646 : ARABIC LETTER NOON + tmp_word = tmp_word.replaceAll("h", "\u0647"); //\u0647 : ARABIC LETTER HEH + tmp_word = tmp_word.replaceAll("w", "\u0648"); //\u0648 : ARABIC LETTER WAW + tmp_word = tmp_word.replaceAll("Y", "\u0649"); //\u0649 : ARABIC LETTER ALEF MAKSURA + tmp_word = tmp_word.replaceAll("y", "\u064A"); //\u064A : ARABIC LETTER YEH + tmp_word = tmp_word.replaceAll("F", "\u064B"); //\u064B : ARABIC 
FATHATAN + tmp_word = tmp_word.replaceAll("N", "\u064C"); //\u064C : ARABIC DAMMATAN + tmp_word = tmp_word.replaceAll("K", "\u064D"); //\u064D : ARABIC KASRATAN + tmp_word = tmp_word.replaceAll("a", "\u064E"); //\u064E : ARABIC FATHA + tmp_word = tmp_word.replaceAll("u", "\u064F"); //\u064F : ARABIC DAMMA + tmp_word = tmp_word.replaceAll("i", "\u0650"); //\u0650 : ARABIC KASRA + tmp_word = tmp_word.replaceAll("~", "\u0651"); //\u0651 : ARABIC SHADDA + tmp_word = tmp_word.replaceAll("o", "\u0652"); //\u0652 : ARABIC SUKUN + tmp_word = tmp_word.replaceAll("`", "\u0670"); //\u0670 : ARABIC LETTER SUPERSCRIPT ALEF + tmp_word = tmp_word.replaceAll("\\{", "\u0671"); //\u0671 : ARABIC LETTER ALEF WASLA + tmp_word = tmp_word.replaceAll("P", "\u067E"); //\u067E : ARABIC LETTER PEH + tmp_word = tmp_word.replaceAll("J", "\u0686"); //\u0686 : ARABIC LETTER TCHEH + tmp_word = tmp_word.replaceAll("V", "\u06A4"); //\u06A4 : ARABIC LETTER VEH + tmp_word = tmp_word.replaceAll("G", "\u06AF"); //\u06AF : ARABIC LETTER GAF + tmp_word = tmp_word.replaceAll("R", "\u0698"); //\u0698 : ARABIC LETTER JEH (no more in Buckwalter system) + //Not in Buckwalter system \u0679 : ARABIC LETTER TTEH + //Not in Buckwalter system \u0688 : ARABIC LETTER DDAL + //Not in Buckwalter system \u06A9 : ARABIC LETTER KEHEH + //Not in Buckwalter system \u0691 : ARABIC LETTER RREH + //Not in Buckwalter system \u06BA : ARABIC LETTER NOON GHUNNA + //Not in Buckwalter system \u06BE : ARABIC LETTER HEH DOACHASHMEE + //Not in Buckwalter system \u06C1 : ARABIC LETTER HEH GOAL + //Not in Buckwalter system \u06D2 : ARABIC LETTER YEH BARREE + tmp_word = tmp_word.replaceAll(",", "\u060C" ); //\u060C : ARABIC COMMA + tmp_word = tmp_word.replaceAll(";", "\u061B"); //\u061B : ARABIC SEMICOLON + tmp_word = tmp_word.replaceAll("\\?", "\u061F"); //\u061F : ARABIC QUESTION MARK + return tmp_word; + } + +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d 
software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/app/Lexica.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/app/Lexica.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,157 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.lex.app; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Set; + +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; + +/* +florio: 70091 records (6 of them are not xml valid) +bonitz: 14648 records (46 of them are not xml valid) +webster: 111733 records (3 of them are not xml valid) +ls: 53500 records (14 of them are not xml valid) +autenrieth: 10158 records (468 of them are not xml valid) +cooper: 33124 records (116 of them are not xml valid) +baretti: 53555 records (0 of them are not xml valid) +salmone: 6360 records (11 of them are not xml valid) +lsj: 112631 records (26922 of them are not xml valid) + */ +public class Lexica { + private static Lexica instance; + private static HashMap lexica = new HashMap(); + + public static Lexica getInstance() { + if (instance == null) { + instance = new Lexica(); + instance.init(); + } + return instance; + } + + private void init() { + Lexicon autenrieth = new Lexicon("autenrieth", "el"); + autenrieth.setDescription("Autenrieth, a Homeric lexicon"); + Lexicon baretti = new Lexicon("baretti", "it"); + baretti.setDescription("Baretti, a dictionary of the English and Italian languages"); + Lexicon bonitz = new Lexicon("bonitz", "el"); + bonitz.setDescription("Bonitz, index Aristotelicus"); + Lexicon cooper = new Lexicon("cooper", "la"); + cooper.setDescription("Cooper, Thesaurus Linguae Romanae et Brittanicae"); + Lexicon florio = new Lexicon("florio", "it"); + florio.setDescription("Florio, a worlde of wordes, or most copious, dictionarie in Italian and English"); + Lexicon ls = new Lexicon("ls", "la"); + ls.setDescription("Lewis and Short, Latin dictionary"); + Lexicon lsj = 
new Lexicon("lsj", "el"); + lsj.setDescription("Liddell-Scott-Jones, a Greek-English lexicon"); + Lexicon salmone = new Lexicon("salmone", "ar"); + salmone.setDescription("Salmoné, an advanced learner's Arabic-English dictionary"); + Lexicon salmoneUnicode = new Lexicon("salmoneUnicode", "ar"); + salmoneUnicode.setDescription("Salmoné, an advanced learner's Arabic-English dictionary"); + Lexicon webster = new Lexicon("webster", "en"); + webster.setDescription("Webster's revised unabridged dictionary (1913)"); + lexica.put("autenrieth", autenrieth); + lexica.put("baretti", baretti); + lexica.put("bonitz", bonitz); + lexica.put("cooper", cooper); + lexica.put("florio", florio); + lexica.put("ls", ls); + lexica.put("lsj", lsj); + lexica.put("salmone", salmone); + lexica.put("webster", webster); + } + + public ArrayList getLexicons(String lang) { + String language = Language.getInstance().getLanguageId(lang); + ArrayList retLexicons = null; + Set keys = lexica.keySet(); + Iterator it = keys.iterator(); + while (it.hasNext()) { + String lexName = it.next(); + Lexicon lexicon = lexica.get(lexName); + String sourceLanguage = lexicon.getSourceLanguage(); + if (sourceLanguage != null && sourceLanguage.equals(language)) { + if (retLexicons == null) + retLexicons = new ArrayList(); + retLexicons.add(lexicon); + } + } + return retLexicons; + } + + public ArrayList getLexicons() { + ArrayList retLexicons = null; + Set keys = lexica.keySet(); + Iterator it = keys.iterator(); + while (it.hasNext()) { + String lexName = it.next(); + Lexicon lexicon = lexica.get(lexName); + if (retLexicons == null) + retLexicons = new ArrayList(); + retLexicons.add(lexicon); + } + return retLexicons; + } + + public ArrayList getBetacodeLexicons() { + ArrayList retLexicons = new ArrayList(); + retLexicons.add(lexica.get("autenrieth")); + retLexicons.add(lexica.get("bonitz")); + retLexicons.add(lexica.get("lsj")); + return retLexicons; + } + + public ArrayList getBuckwalterLexicons() { + ArrayList 
retLexicons = new ArrayList(); + retLexicons.add(lexica.get("salmone")); + return retLexicons; + } + +} + +/* TODO + + + + + + else if (dictname == "dwds") lang="de"; + else if (dictname == "grimm") lang="de"; + else if (dictname == "artfl") lang="fr"; + else of (dictname == "epsd") lang="sux"; + +DWDS: + +Link: http://www.dwds.de/?woerterbuch=1&qu=auto +Logo: http://www.dwds.de/images/dwds_logo.gif +Copyright: Copyright © by Berlin-Brandenburgische Akademie der Wissenschaften, Wörterbuch der deutschen Gegenwartssprache, all rights reserved. + +Grimm: + +Link: http://germa63.uni-trier.de:8080/Projects/WBB/woerterbuecher/dwb/report_lemma?wb=G&word=auto +View: http://germa63.uni-trier.de:8080/Projects/WBB/woerterbuecher/dwb/selectarticles?lemid= +Output: + + +Deutsches Wörterbuch von Jacob und Wilhelm Grimm + + + + + + +ARTFL: + +Name: Dictionnaire de l'Académie francaise, 4e éd. +Vorverarbeitung des Wortes yourWord: $word =~ s/%([0-9A-F]{2})/pack("H2", $1)/ge; +Link: http://colet.uchicago.edu/cgi-bin/dico1look.pl?dicoid=ACAD1762&strippedhw=yourWord + +EPSD: + +Name: ePSD (Pennsylvania Sumerian Dictionary) +Link: http://psd.museum.upenn.edu/cgi-bin/epsd.plx?x=epsd&q=yourWord + + + + */ diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/app/Lexicon.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/app/Lexicon.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,96 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.lex.app; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Enumeration; +import java.util.Hashtable; + +public class Lexicon { + private String name; + private String sourceLang; + private String description; + private Hashtable entries; + + public Lexicon(String name, String sourceLanguage) { + this.name = name; + this.sourceLang = sourceLanguage; + this.entries = new Hashtable(); + } + + public String getName() { + return 
name; + } + + public void setName(String name) { + this.name = name; + } + + public String getSourceLanguage() { + return sourceLang; + } + + public String getDescription() { + return description; + } + + public void setDescription(String description) { + this.description = description; + } + + public boolean isBetacodeLexicon() { + boolean isBetacode = false; + if (name.equals("autenrieth") || name.equals("bonitz") || name.equals("lsj")) + isBetacode = true; + return isBetacode; + } + + public boolean isBuckwalterLexicon() { + boolean isBuckwalter = false; + if (name.equals("salmone")) + isBuckwalter = true; + return isBuckwalter; + } + + public ArrayList getEntries() { + ArrayList result = new ArrayList(); + if (entries != null) { + Enumeration entryKeys = entries.keys(); + while(entryKeys.hasMoreElements()) { + String entryKey = entryKeys.nextElement(); + LexiconEntry le = entries.get(entryKey); + result.add(le); + } + } + Collections.sort(result); + if (result.isEmpty()) + return null; + else + return result; + } + + public boolean isEmpty() { + if (entries == null || entries.isEmpty()) + return true; + else + return false; + } + + public void addEntry(LexiconEntry newEntry) { + if (entries == null) + this.entries = new Hashtable(); + entries.put(newEntry.getFormName(), newEntry); + } + + /* + * without lexicon entries (non-Javadoc) + * @see java.lang.Object#clone() + */ + public Lexicon clone() { + Lexicon lex = new Lexicon(name, sourceLang); + lex.description = description; + lex.entries = new Hashtable(); + return lex; + } + + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/app/LexiconEntry.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/app/LexiconEntry.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,121 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.lex.app; + +public class LexiconEntry implements Comparable { + private String 
lexiconName; + private String formName; + private String content; + private boolean xmlValid = false; + private boolean xmlMadeValid = false; + private String validationCode; + private String validationFailElementName; + + public LexiconEntry(String lexiconName, String formName, String content) { + this.lexiconName = lexiconName; + this.formName = formName; + this.content = content; + if (content != null) { + int begin = content.indexOf(""); + int end = content.indexOf(""); + if (begin != -1 && end != -1) { + String xmlValid = content.substring(begin + 11, end); + if (xmlValid != null) { + if (xmlValid.equals("true")) + this.xmlValid = true; + else if (xmlValid.equals("false")) + this.xmlValid = false; + } + } + } + } + + public String getLexiconName() { + return lexiconName; + } + + public String getFormName() { + return formName; + } + + public void setFormName(String formName) { + this.formName = formName; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } + + public boolean isXmlValid() { + return xmlValid; + } + + public void setXmlValid(boolean xmlValid) { + this.xmlValid = xmlValid; + } + + public String getValidationCode() { + return validationCode; + } + + public void setValidationCode(String validationCode) { + this.validationCode = validationCode; + } + + public String getValidationFailElementName() { + return validationFailElementName; + } + + public void setValidationFailElementName(String validationFailElementName) { + this.validationFailElementName = validationFailElementName; + } + + public boolean isXmlMadeValid() { + return xmlMadeValid; + } + + public void setXmlMadeValid(boolean xmlMadeValid) { + this.xmlMadeValid = xmlMadeValid; + } + + public String getRepairedEntry() { + String retStr = null; + if (content != null) { + int begin = content.indexOf(""); + int end = content.indexOf(""); + if (begin != -1 && end != -1) { + retStr = content.substring(begin, end) + 
""; + } + } + return retStr; + } + + public String getOriginalEntry() { + String retStr = null; + if (content != null) { + int begin = content.indexOf(""); + int end = content.indexOf(""); + if (begin != -1 && end != -1) { + retStr = content.substring(begin, end) + ""; + } + } + return retStr; + } + + public int compareTo(LexiconEntry l) { + if (l.formName == null && this.formName == null) { + return 0; + } + if (this.formName == null) { + return 1; + } + if (l.formName == null) { + return -1; + } + return this.formName.compareTo(l.formName); + } + + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/DBLexWriter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/DBLexWriter.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,630 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.lex.db; + +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; +import com.sleepycat.je.util.DbLoad; +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars; +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import 
de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder; +import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.Lexica; +import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.Lexicon; +import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.LexiconEntry; + +public class DBLexWriter { + private static DBLexWriter instance; + private static String MPDL_DATA_DIR = MpdlConstants.MPDL_DATA_DIR; + private static String DATA_FILES_DIR_LEXICA = MPDL_DATA_DIR + "/dataFiles/pollux"; + private static String DB_DIR_LEXICA = MPDL_DATA_DIR + "/dataBerkeleyDB/pollux"; + private DbEnvLex dbEnvLexica; + private Date beginOfOperation; + private Date endOfOperation; + + public static DBLexWriter getInstance() throws ApplicationException { + if (instance == null) { + instance = new DBLexWriter(); + } + return instance; + } + + public static void main(String[] args) throws ApplicationException { + getInstance(); + instance.beginOperation(); + System.out.print("Start ..."); + // instance.initReadOnly(); + instance.initReadWrite(); + // instance.readSampleData(); + // instance.testTranscoder(); + // instance.printSizeOfAllLexicons(); + instance.writeLexiconsToFiles(); + // instance.loadPolluxDbDumpsToDb(); + // instance.copyAndRepairAndTranscodeDumps(); + instance.end(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + System.out.println("Needed time: " + elapsedTime + " seconds"); + } + + private void initReadWrite() throws ApplicationException { + dbEnvLexica = new DbEnvLex(); + dbEnvLexica.setDataDir(DB_DIR_LEXICA); + dbEnvLexica.initReadWrite(); + } + + private void initReadOnly() throws ApplicationException { + dbEnvLexica = new DbEnvLex(); + dbEnvLexica.setDataDir(DB_DIR_LEXICA); + dbEnvLexica.initReadOnly(); + ArrayList lexicons = Lexica.getInstance().getLexicons(); + for (int 
i=0; i lexicons = Lexica.getInstance().getLexicons(); + for (int i=0; i dbNames = dbEnvLexica.getEnv().getDatabaseNames(); + String l1 = readEntry("autenrieth", "au)to/s"); + String l2 = readEntry("ls", "laudabilis"); + String l3 = readEntry("lsjUnicode", "ἄδρεπτος"); + String l4 = readEntry("salmoneUnicode", "ءرش"); + System.out.println("Autenrieth: autos: " + l1); + System.out.println("Lewis & Short: Laudabilis: " + l2); + System.out.println("LSJ: ἄδρεπτος: " + l3); + System.out.println("Salmone: طب: " + l4); + printSampleEntries("salmoneUnicode", 10); + printSampleEntries("lsjUnicode", 1000); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + private void end() throws ApplicationException { + ArrayList lexicons = Lexica.getInstance().getLexicons(); + for (int i=0; i lexicons = Lexica.getInstance().getLexicons(); + for (int i=0; i"); + int end = dbEntryValueStr.indexOf(""); + dbEntryValueStr = dbEntryValueStr.substring(begin, end) + ""; + LexiconEntry dbLexEntry = new LexiconEntry(lexiconName, dbEntryKeyStr, dbEntryValueStr); + LexiconEntry xmlLexiconEntry = xmlParse(dbLexEntry); + if (! 
xmlLexiconEntry.isXmlValid()) { + sizeXmlNotValidEntries ++; + } + size++; + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + int[] sizes = new int[2]; + sizes[0] = size; + sizes[1] = sizeXmlNotValidEntries; + return sizes; + } + + private void copyAndRepairAndTranscodeDumps() throws ApplicationException { + try { + ArrayList lexicons = Lexica.getInstance().getLexicons(); + for (int i=0; i lexDumpHashMap = getWholeLexiconHashMap(lexiconName + "Dump"); + dbEnvLexica.openDatabase(lexiconName); + Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); + Iterator lexDumpIter = lexDumpHashMap.keySet().iterator(); + while (lexDumpIter.hasNext()) { + String lexDumpKeyStr = lexDumpIter.next(); + DatabaseEntry lexDumpValue = lexDumpHashMap.get(lexDumpKeyStr); + byte[] lexDumpValueBytes = lexDumpValue.getData(); + String lexDumpValueStr = new String(lexDumpValueBytes, "utf-8"); + String newLexValueStr = new String(lexDumpValueBytes, "utf-8"); + // repair lsj + if (lexiconName.equals("lsj")) { + newLexValueStr = newLexValueStr.replaceAll("
", "
"); + newLexValueStr = newLexValueStr.replaceAll("

", "

"); + String elementNameGreek = "G"; + newLexValueStr = deleteNestedTags(elementNameGreek, newLexValueStr); // delete tags and inside + newLexValueStr = newLexValueStr.replaceAll("lang=greek", "lang=\"greek\""); + boolean senseContained = newLexValueStr.matches(".*.*"); + boolean endSenseContained = newLexValueStr.matches(".*.*"); + if (senseContained && ! endSenseContained) + newLexValueStr = newLexValueStr.replaceAll("", ""); + else if (!senseContained && endSenseContained) + newLexValueStr = newLexValueStr.replaceAll("", ""); + boolean refContained = newLexValueStr.matches(".*.*"); + boolean endRefContained = newLexValueStr.matches(".*.*"); + if (refContained && ! endRefContained) + newLexValueStr = newLexValueStr.replaceAll("", ""); + else if (!refContained && endRefContained) + newLexValueStr = newLexValueStr.replaceAll("", ""); + /* + boolean itypeContained = newLexValueStr.matches(".*.*"); + boolean endItypeContained = newLexValueStr.matches(".*.*"); + if (itypeContained && ! endItypeContained) + newLexValueStr = newLexValueStr.replaceAll("", ""); + else if (!itypeContained && endItypeContained) + newLexValueStr = newLexValueStr.replaceAll("", ""); + */ + } + // repair cooper + if (lexiconName.equals("cooper")) { + newLexValueStr = newLexValueStr.replaceAll("", ""); // TODO hack + newLexValueStr = newLexValueStr.replaceAll("

", "

"); // TODO hack + } + // repair baretti + if (lexiconName.equals("baretti")) { + newLexValueStr = newLexValueStr.replaceAll("

  • ", "
  • "); // TODO hack + } + // repair for all lexicons + newLexValueStr = newLexValueStr.replaceAll("type=style", "type=\"style\""); + newLexValueStr = newLexValueStr.replaceAll("type=dom", "type=\"dom\""); + newLexValueStr = newLexValueStr.replaceAll("<\\*>", ""); + newLexValueStr = newLexValueStr.replaceAll("

    ", "

    "); + LexiconEntry newLexEntryTemp = new LexiconEntry(lexiconName, lexDumpKeyStr, newLexValueStr); // lexDumpKeyStr is not transcoded yet but it will not be used in further in the code + LexiconEntry newLexEntry = xmlParseAndRepair(newLexEntryTemp); + String xmlValidString = "true"; + if (! newLexEntry.isXmlValid()) { + xmlValidString = "false"; + } + newLexValueStr = newLexEntry.getContent(); + // transcode the Betacode lexicon entries to Unicode (key and value) + if (lexicon.isBetacodeLexicon()) { + Transcoder transcoder = Transcoder.getInstance(); + lexDumpKeyStr = transcoder.transcodeFromBetaCode2Unicode(lexDumpKeyStr); + String elementName = "G"; + if (newLexEntry.isXmlValid()) { + newLexValueStr = transcodeByElementName("fromBetacode2Unicode", elementName, newLexValueStr); + } + } + // transcode the Buckwalter entries to Unicode (key and value) + if (lexicon.isBuckwalterLexicon()) { + Transcoder transcoder = Transcoder.getInstance(); + lexDumpKeyStr = transcoder.transcodeFromBuckwalter2Unicode(lexDumpKeyStr); + String elementName = "AR"; + if (newLexEntry.isXmlValid()) { + newLexValueStr = transcodeByElementName("fromBuckwalter2Unicode", elementName, newLexValueStr); + } + } + // put the entry into database + newLexValueStr = "" + xmlValidString + "" + lexDumpValueStr + "" + "" + newLexValueStr + "" + ""; + DatabaseEntry newLexDumpKey = new DatabaseEntry(lexDumpKeyStr.getBytes("utf-8")); + DatabaseEntry newLexValue = new DatabaseEntry(newLexValueStr.getBytes("utf-8")); + lexDB.put(null, newLexDumpKey, newLexValue); + } + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private void printSampleEntries(String lexiconName, int count) throws ApplicationException { + try { + int counter = 0; + dbEnvLexica.openDatabase(lexiconName); + Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); + Cursor cursor = lexDB.openCursor(null, null); + 
DatabaseEntry dbEntryKey = new DatabaseEntry(); + DatabaseEntry dbEntryValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS && counter < count) { + int size = dbEntryKey.getSize(); + if (size > 0) { + byte[] dbEntryKeyBytes = dbEntryKey.getData(); + String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8"); + System.out.println(lexiconName + ": key: " + dbEntryKeyStr + " value size: " + dbEntryValue.getSize()); + } + operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + counter++; + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private void testTranscoder() throws ApplicationException { + String testStr = "hfhf fdfdei)mi/ (sum), Aeol. e)/mmi hfhfh Sapph.2.15, Theoc.20.32; Cret. h)mi/ GDI 4959a; 2sg. ei)=, Ep. and Ion. ei)s Od.17.388, al., Aeol. e)/ssi, Ep. and Dor. e)ssi/ Il.1.176, Pi."; + String testStr2 = "aaaaa 1111a 2222a 3333a 1111a aaaaa bbbbb 1111b 2222b 3333b 1111b bbbbb "; + String testStr3 = "e)pano/rqwsin e)/xein, opp a)ni/aton ei)=nai *hi3. 1165 b18. --e)panorqw/seis kai boh/qeiai *rb5. 1383 a20."; + String testStr4 = "suni^hmi Ar.Av.946 (s. v.l.), Strato Com.1.3: with variation of quantity, plei=ston ou)=lon i(/ei [i^], i)/oulon i(/ei [i_] Carm.Pop. 1.]:—" + + ";
    release, let go, h(=ka ..po/das kai\\ xei=re fe/resqai Od.12.442; h(=ke fe/resqai let him float" + + "off, Il.21.120; let fall, ka\\d de\\ ka/rhtos h(=ke ko/mas made his locks flow down from his head, Od.<" + + "/author>6.231; [e)qei/ras] i(/ei lo/fon a)mfi/ .... ggg"; + String testStr5 = "plei=ston ou)=lon i(/ei "; + String testStr6 = "*a as< as as: *)a *s ss "; + Transcoder t = Transcoder.getInstance(); + String transcoded = t.transcodeFromBetaCode2Unicode(testStr4); + transcoded = t.transcodeFromBetaCode2Unicode(testStr5); + transcoded = t.transcodeFromBetaCode2Unicode(testStr6); + + String arabTestStr1 = "^nutaf"; + String arabTestStr2 = "min"; + String arabTestStr3 = "Aal-Hiyal (^qAla ^>arisTwTAlys) yataEaj~aba Aal-nAs minhA <im~A fy Aal->a$yA' Aal~aty taEriDu TabEAF fa-mim~A lA yuElamu Eil~atuhu wa-<im~A fy Aal->a$yA' Aal-muxAlifap li-l-TabE fa-mim~A yuEmalu bi-Aal-SinAEap li-manfaEap Aal-nAs li->an~a Aal-TabyEap tulzimu >abadAF jihap wAHidap wa->am~A manAfiE Aal-nAs fa-<in~ahA taxtalifu <ixtilAfAF kavyrAF."; + transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr1); + transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr2); + transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr3); + + // String deletedNestedTags = deleteNestedTags("G", testStr4); + // String regExpr = "(.*?)(.*?)(.*?)(.*?)(.*?)"; + String regExpr = "(.*?)(.*)(){1,}(.*?)"; + // String regExpr = "(.*?)(.*?)(.*?)(.*?)(.*?)"; + String replaceStr = testStr2.replaceAll(regExpr, "$1$2$4"); + // String replaceStr2 = testStr2.replaceAll("(.*)(.*)(.*)(.*)(.*)", "$2$3$4$5"); + regExpr = ".*?(.*?){1,}.*?"; + regExpr = "(.*?)(.*?)(.*?){1,}(.*?)"; + // String regExpr = "[a-zA-Z0-9]+?\\[.+?\\]/" + "|" + "[a-zA-Z0-9]+?/" + "|" + "[a-zA-Z0-9]+?\\[.+\\]$" + "|" + "[a-zA-Z0-9]+?$"; // pathName example: "/archimedes[@xmlns:xlink eq "http://www.w3.org/1999/xlink"]/text/body/chap/p[@type eq "main"]/s/foreign[@lang eq "en"]" + Pattern p = Pattern.compile(regExpr, Pattern.CASE_INSENSITIVE | 
Pattern.MULTILINE); // both flags enabled + Matcher m = p.matcher(testStr2); + while (m.find()) { + int msBeginPos = m.start(); + int msEndPos = m.end(); + String matchStr = testStr2.substring(msBeginPos, msEndPos); + String bla = ""; + } + + String retStr = transcodeByElementName("fromBetacode2Unicode", "G", testStr); + retStr = transcodeByElementName("fromBetacode2Unicode", "G", "bla"); + retStr = transcodeByElementName("fromBetacode2Unicode", "G", ""); + } + + private String transcodeByElementName(String transcodeDirection, String elementName, String inputStr) throws ApplicationException { + if (inputStr == null || elementName == null) + return null; + String elemBeginTag = "<" + elementName + ">"; + String elemEndTag = ""; + Transcoder transcoder = Transcoder.getInstance(); + String outputStr = ""; + int begin = inputStr.indexOf(elemBeginTag); + int end = inputStr.indexOf(elemEndTag); + while (begin != -1 && end != -1 && begin < end) { + String before = inputStr.substring(0, begin); + String origStr = inputStr.substring(begin + elemBeginTag.length(), end); + origStr = StringUtilEscapeChars.deleteSpecialXmlEntities(origStr); + String transcodedStr = origStr; + if (transcodeDirection.equals("fromBetacode2Unicode")) + transcodedStr = transcoder.transcodeFromBetaCode2Unicode(origStr); + else if (transcodeDirection.equals("fromBuckwalter2Unicode")) + transcodedStr = transcoder.transcodeFromBuckwalter2Unicode(origStr); + outputStr = outputStr + before + new String(elemBeginTag); + outputStr = outputStr + transcodedStr; + outputStr = outputStr + new String(elemEndTag); + inputStr = inputStr.substring(end + elemEndTag.length()); + begin = inputStr.indexOf(elemBeginTag); + end = inputStr.indexOf(elemEndTag); + } + outputStr = outputStr + inputStr; + return outputStr; + } + + private String deleteNestedTags(String elementName, String inputStr) { + String inputStrTmp = new String(inputStr); + String elemBeginTag = "<" + elementName + ">"; + String elemEndTag = ""; + 
String outputStr = ""; + int begin = inputStrTmp.indexOf(elemBeginTag); + int end = inputStrTmp.indexOf(elemEndTag); + while (begin != -1 && end != -1) { + end = getIndexClosedTag(begin, elementName, inputStrTmp); + String before = inputStrTmp.substring(0, begin); + String origStr = null; + if (end == -1) // if no end tag could be found + origStr = inputStrTmp.substring(begin + elemBeginTag.length(), inputStrTmp.length()); + else + origStr = inputStrTmp.substring(begin + elemBeginTag.length(), end); + origStr = origStr.replaceAll(elemBeginTag, ""); + origStr = origStr.replaceAll(elemEndTag, ""); + outputStr = outputStr + before + new String(elemBeginTag); + outputStr = outputStr + origStr; + outputStr = outputStr + new String(elemEndTag); + inputStrTmp = inputStrTmp.substring(end + elemEndTag.length()); + begin = inputStrTmp.indexOf(elemBeginTag); + } + outputStr = outputStr + inputStrTmp; + return outputStr; + } + + private int getIndexClosedTag(int begin, String elementName, String inputStr) { + int beginTmp = begin; + int retIndex = -1; + String elemBeginTag = "<" + elementName + ">"; + String elemEndTag = ""; + int indexEndTag = inputStr.indexOf(elemEndTag); + while (indexEndTag != -1) { + String betweenTmpStr = inputStr.substring(beginTmp + elemBeginTag.length(), indexEndTag); + int indexBeginTag = betweenTmpStr.indexOf(elemBeginTag); + if (indexBeginTag != -1) { + beginTmp = indexEndTag; + } else { + return indexEndTag; + } + indexEndTag = inputStr.indexOf(elemEndTag, indexEndTag + elemEndTag.length()); + } + return retIndex; + } + + private HashMap getWholeLexiconHashMap(String lexiconName) throws ApplicationException { + HashMap lexHashMap = new HashMap(); + try { + dbEnvLexica.openDatabase(lexiconName); + Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); + Cursor cursor = lexDB.openCursor(null, null); + DatabaseEntry dbEntryKey = new DatabaseEntry(); + DatabaseEntry dbEntryValue = new DatabaseEntry(); + OperationStatus operationStatus = 
cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + int size = dbEntryKey.getSize(); + if (size > 0) { + byte[] dbEntryKeyBytes = dbEntryKey.getData(); + String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8"); + DatabaseEntry newDbEntryValue = new DatabaseEntry(dbEntryValue.getData()); + lexHashMap.put(dbEntryKeyStr, newDbEntryValue); + } + operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return lexHashMap; + } + + private LexiconEntry xmlParseAndRepair(LexiconEntry lexEntry) throws ApplicationException { + String origLexEntryContent = lexEntry.getContent(); + String lexEntryContent = new String(origLexEntryContent); + lexEntry.setContent(lexEntryContent); + // parse and repair: try to repair it 3 times through parsing + LexiconEntry retLexiconEntry = xmParseAndRepairLocal(lexEntry); + retLexiconEntry = xmParseAndRepairLocal(retLexiconEntry); + retLexiconEntry = xmParseAndRepairLocal(retLexiconEntry); + // if it could not be repaired the original content (which is not XML valid) is delivered + if (! retLexiconEntry.isXmlValid()) + retLexiconEntry.setContent(origLexEntryContent); + return retLexiconEntry; + } + + private LexiconEntry xmParseAndRepairLocal(LexiconEntry lexEntry) throws ApplicationException { + if (! lexEntry.isXmlValid()) { + lexEntry = xmlParse(lexEntry); + } + if (! 
lexEntry.isXmlValid() && lexEntry.getValidationCode() != null && lexEntry.getValidationCode().equals("elementNotClosed")) { + String elementName = lexEntry.getValidationFailElementName(); + String lexiconEntryContent = lexEntry.getContent(); + lexiconEntryContent = lexiconEntryContent.replaceAll("<" + elementName + " .*?>", ""); + lexiconEntryContent = lexiconEntryContent.replaceAll("", ""); + lexEntry.setContent(lexiconEntryContent); + lexEntry.setXmlMadeValid(true); + } + return lexEntry; + } + + private LexiconEntry xmlParse(LexiconEntry lexEntry) throws ApplicationException { + String lexEntryContent = "" + lexEntry.getContent() + ""; + LexEntryContentHandler lexEntryContentHandler = new LexEntryContentHandler(); + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(lexEntryContentHandler); + LexEntryErrorHandler lexEntryErrorHandler = new LexEntryErrorHandler(); + xmlParser.setErrorHandler(lexEntryErrorHandler); + try { + Reader reader = new StringReader(lexEntryContent); + InputSource input = new InputSource(reader); + xmlParser.parse(input); + lexEntry.setXmlValid(true); + } catch (SAXException e) { + // nothing but following + lexEntry.setXmlValid(false); + String exceptionMessage = e.getMessage(); + if (exceptionMessage.matches("The element type .* must be terminated by the matching end-tag .*")) { + int begin = exceptionMessage.indexOf("\""); + if (begin != -1) { + String subStr = exceptionMessage.substring(begin + 1); + int end = subStr.indexOf("\""); + if (end != -1) { + String elementName = exceptionMessage.substring(begin + 1, begin + 1 + end); + lexEntry.setValidationCode("elementNotClosed"); + lexEntry.setValidationFailElementName(elementName); + } + } + } + } catch (IOException e) { + throw new ApplicationException(e); + } + return lexEntry; + } + + private void writeLexiconsToFiles() throws ApplicationException { + BufferedReader in = null; + BufferedOutputStream out = null; + try { + ArrayList lexicons = 
Lexica.getInstance().getLexicons(); + for (int i=0; i lexHashMap = getWholeLexiconHashMap(lexiconName); + Iterator lexDumpIter = lexHashMap.keySet().iterator(); + File outputFile = new File(DATA_FILES_DIR_LEXICA + "/" + lexiconName + ".xml"); + out = new BufferedOutputStream(new FileOutputStream(outputFile)); + write("\n", out); + write("" + lexiconName + "\n", out); + write("" + lexicon.getDescription() + "\n", out); + write("\n", out); + while (lexDumpIter.hasNext()) { + write("\n", out); + String lexKeyStr = lexDumpIter.next(); + write("

    " + lexKeyStr + "
    \n", out); + DatabaseEntry lexValue = lexHashMap.get(lexKeyStr); + byte[] lexValueBytes = lexValue.getData(); + write(lexValueBytes, out); + write("\n", out); + } + write("\n", out); + write("\n", out); + } + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } finally { + // always close the stream + if (in != null) try { in.close(); } catch (Exception e) { } + if (out != null) try { out.close(); } catch (Exception e) { } + } + } + + private void write(byte[] inputBytes, BufferedOutputStream out) throws ApplicationException { + try { + out.write(inputBytes, 0, inputBytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void write(String outStr, BufferedOutputStream out) throws ApplicationException { + try { + byte[] bytes = outStr.getBytes("utf-8"); + out.write(bytes, 0, bytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } + +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/DbEnvLex.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/DbEnvLex.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,101 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.lex.db; + +import java.io.File; +import java.util.HashMap; + +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseConfig; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class DbEnvLex { + private String dataDir; + private File envPath; + private Environment env; + private EnvironmentConfig envConfig; + private DatabaseConfig dbConfig; + 
private HashMap lexiconDBs = new HashMap(); + + public DbEnvLex() { + } + + public void setDataDir(String dataDir) { + this.dataDir = dataDir; + } + + public void initReadOnly() throws ApplicationException { + try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void initReadWrite() throws ApplicationException { + try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envConfig.setReadOnly(false); + dbConfig.setReadOnly(false); + envConfig.setAllowCreate(true); + dbConfig.setAllowCreate(true); + envConfig.setTransactional(true); + dbConfig.setTransactional(true); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void openDatabase(String lexiconName) throws ApplicationException { + try { + Database lexDB = lexiconDBs.get(lexiconName); + if (lexDB == null) { + Database lexiconDB = env.openDatabase(null, lexiconName + ".db", dbConfig); + lexiconDBs.put(lexiconName, lexiconDB); + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void closeDatabase(String lexiconName) throws ApplicationException { + try { + if (lexiconDBs != null) { + Database lexiconDB = lexiconDBs.get(lexiconName); + if (lexiconDB != null) + lexiconDB.close(); + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public Environment getEnv() { + return env; + } + + public Database getLexiconDB(String lexiconName) { + Database lexiconDB = lexiconDBs.get(lexiconName); + return lexiconDB; + } + + public void close() throws ApplicationException { + if (env != null) { + try { + if (env != null) + env.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + } +} + diff -r 
000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexEntryContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexEntryContentHandler.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,43 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.lex.db; + +import org.xml.sax.*; + +public class LexEntryContentHandler implements ContentHandler { + + public LexEntryContentHandler() { + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + } + + public void characters(char[] c, int start, int length) throws SAXException { + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(org.xml.sax.Locator arg1) { + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + } + + public void endElement(String uri, String localName, String name) throws SAXException { + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexEntryErrorHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexEntryErrorHandler.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,12 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.lex.db; + +import org.xml.sax.*; + +public class LexEntryErrorHandler implements ErrorHandler { + public void warning(SAXParseException exception) throws SAXException { + } + public void error(SAXParseException exception) throws SAXException 
{ + } + public void fatalError(SAXParseException exception) throws SAXException { + } +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexHandler.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,175 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.lex.db; + +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Date; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; + +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder; +import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.Lexica; +import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.Lexicon; +import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.LexiconEntry; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache; + +public class LexHandler { + private static LexHandler instance; + private static String MPDL_DATA_DIR = MpdlConstants.MPDL_EXIST_DATA_DIR; + private static String DB_DIR_LEXICA = MPDL_DATA_DIR + "/dataBerkeleyDB/pollux"; + private DbEnvLex dbEnvLexica; + private Date beginOfOperation; + private Date endOfOperation; + + public static LexHandler getInstance() throws ApplicationException { + if (instance == null) { + instance = new LexHandler(); + instance.initReadOnly(); + } + return instance; + } + + /** + * + * @param formName + * @param language + * @return delivers lexical entries by help of the morphology component (lexical entry of the stem of the normalized word form) + * @throws 
ApplicationException + */ + public ArrayList getLexEntryKeys(String formName, String language, boolean normalize) throws ApplicationException { + ArrayList lexEntryKeys = new ArrayList(); + MorphologyCache morphologyCache = MorphologyCache.getInstance(); + ArrayList formLemmas = morphologyCache.getLemmasByFormName(language, formName, normalize); + boolean hasLexEntry = false; + hasLexEntry = hasLexEntryKey(formName, language); + if (hasLexEntry) + lexEntryKeys.add(formName); + if (formLemmas != null) { + for (int j=0; j statLexicons = Lexica.getInstance().getLexicons(language); + if (statLexicons != null) { + for (int i=0; i lexicons = Lexica.getInstance().getLexicons(); + for (int i=0; i dbNames = dbEnvLexica.getEnv().getDatabaseNames(); + String l1 = readEntry("autenrieth", "au)to/s").getContent(); // greek: see also bonitz and lsj + String l2 = readEntry("ls", "laudabilis").getContent(); // latin + System.out.println("Autenrieth: autos: " + l1); + System.out.println("Lewis & Short: Laudabilis: " + l2); + } + + private void end() throws ApplicationException { + ArrayList lexicons = Lexica.getInstance().getLexicons(); + for (int i=0; i", ">"); + formName = formName.replaceAll("\"", """); + lemmaName = lemmaName.replaceAll("&", "&"); + lemmaName = lemmaName.replaceAll("'", "'"); + lemmaName = lemmaName.replaceAll("<", "<"); + lemmaName = lemmaName.replaceAll(">", ">"); + lemmaName = lemmaName.replaceAll("\"", """); + // unification of lemma names (homographs) TODO do not unificate the homographs + lemmaName = lemmaName.replaceAll("#[0-9]", ""); + if (isArabic()) { + if (lemmaName != null) { + int length = lemmaName.length(); + char lastChar = lemmaName.charAt(length - 1); + boolean isDigit = Character.isDigit(lastChar); + if (isDigit) + lemmaName = lemmaName.substring(0, length - 1); + } + } + // unification of forms and lemmas with hyphens: remove the hyphen + formName = formName.replaceAll("-", ""); + lemmaName = lemmaName.replaceAll("-", ""); + // unification of 
forms and lemmas with blanks (sequence of words): remove the blanks + formName = formName.replaceAll(" ", ""); + lemmaName = lemmaName.replaceAll(" ", ""); + // unification of forms and lemmas with plus symbols: remove the plus symbol + formName = formName.replaceAll("\\+", ""); + lemmaName = lemmaName.replaceAll("\\+", ""); + // TODO call MpdlMorphDataNormalizer (handle Umlauts in german, accents in french, character classes (longs, s, ...) ...) + + } + + public boolean isOk() { + boolean ret = true; + if (formName == null || lemmaName == null) + ret = false; + else if (formName.length() == 0 || lemmaName.length() == 0 || formName.length() == 1 || lemmaName.length() == 1) + ret = false; + return ret; + } + + public boolean isGreek() { + boolean ret = false; + if (language != null && language.equals("el")) + ret = true; + return ret; + } + + public boolean isArabic() { + boolean ret = false; + if (language != null && language.equals("ar")) + ret = true; + return ret; + } + + public boolean isRicherThan(Form otherForm) { + boolean richer = false; + if (! isOk()) + return false; + else if (! otherForm.isOk()) + return true; + String otherFormPos = otherForm.getPos(); + if (pos != null && pos.length() > 0 && (otherFormPos == null || otherFormPos.length() == 0)) + return true; + // TODO all other cases + return richer; + } + + public String getXmlString() { + String xmlString = "
    \n"; + if (provider != null) + xmlString += " " + provider + "\n"; + if (language != null) + xmlString += " " + language + "\n"; + if (formName != null) + xmlString += " " + formName + "\n"; + if (lemmaName != null) + xmlString += " " + lemmaName + "\n"; + if (pos != null) + xmlString += " " + pos + "\n"; + if (tense != null) + xmlString += " " + tense + "\n"; + if (voice != null) + xmlString += " " + voice + "\n"; + if (casus != null) + xmlString += " " + casus + "\n"; + if (number != null) + xmlString += " " + number + "\n"; + if (mood != null) + xmlString += " " + mood + "\n"; + if (person != null) + xmlString += " " + person + "\n"; + if (gender != null) + xmlString += " " + gender + "\n"; + if (definite != null) + xmlString += " " + definite + "\n"; + xmlString += "
    \n"; + return xmlString; + } + + public String toString() { + return getXmlString(); + } + + public String getTense() { + return tense; + } + + public void setTense(String tense) { + this.tense = tense; + } + + public void addTense(String newTense) { + if (tense == null) + this.tense = newTense; + else + tense += newTense; + } + + public String getVoice() { + return voice; + } + + public void setVoice(String voice) { + this.voice = voice; + } + + public void addVoice(String newVoice) { + if (voice == null) + this.voice = newVoice; + else + voice += newVoice; + } + + public String getCasus() { + return casus; + } + + public void setCasus(String casus) { + this.casus = casus; + } + + public void addCasus(String newCasus) { + if (casus == null) + this.casus = newCasus; + else + casus += newCasus; + } + + public String getNumber() { + return number; + } + + public void setNumber(String number) { + this.number = number; + } + + public void addNumber(String newNumber) { + if (number == null) + this.number = newNumber; + else + number += newNumber; + } + + public String getMood() { + return mood; + } + + public void setMood(String mood) { + this.mood = mood; + } + + public void addMood(String newMood) { + if (mood == null) + this.mood = newMood; + else + mood += newMood; + } + + public String getPerson() { + return person; + } + + public void setPerson(String person) { + this.person = person; + } + + public void addPerson(String newPerson) { + if (person == null) + this.person = newPerson; + else + person += newPerson; + } + + public String getGender() { + return gender; + } + + public void setGender(String gender) { + this.gender = gender; + } + + public void addGender(String newGender) { + if (gender == null) + this.gender = newGender; + else + gender += newGender; + } + + public String getDefinite() { + return definite; + } + + public void setDefinite(String definite) { + this.definite = definite; + } + + public void addDefinite(String newDefinite) { + if (definite 
== null) + this.definite = newDefinite; + else + definite += newDefinite; + } + + public String getLemmaName() { + return lemmaName; + } + + public String getPos() { + return pos; + } + + public String getProvider() { + return provider; + } + + public void setProvider(String provider) { + this.provider = provider; + } + + public void addProvider(String newProvider) { + if (provider == null) + this.provider = newProvider; + else + provider += newProvider; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public void addLanguage(String newLanguage) { + if (language == null) + this.language = newLanguage; + else + language += newLanguage; + } + + public String getFormName() { + return formName; + } + + public void setFormName(String formName) { + this.formName = formName; + } + + public void addFormName(String newFormName) { + if (formName == null) + this.formName = newFormName; + else + formName += newFormName; + } + + public void setLemmaName(String lemmaName) { + this.lemmaName = lemmaName; + } + + public void addLemmaName(String newLemmaName) { + if (lemmaName == null) + this.lemmaName = newLemmaName; + else + lemmaName += newLemmaName; + } + + public void setPos(String pos) { + this.pos = pos; + } + + public void addPos(String newPos) { + if (pos == null) + this.pos = newPos; + else + pos += newPos; + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/Lemma.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/Lemma.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,152 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; + +import java.util.ArrayList; +import java.util.Enumeration; +import java.util.Hashtable; + + +public class Lemma implements Comparable { + private String provider; + private String language; + private String lemmaName; + 
private Hashtable forms; + + public Lemma() { + } + + public Lemma(String provider, String language, String lemmaName) { + this.provider = provider; + this.language = language; + this.lemmaName = lemmaName; + this.forms = new Hashtable(); + // always contains the form with the same lemma name + Form form = new Form(provider, language, lemmaName); + addForm(form); + } + + public String getProvider() { + return provider; + } + + public void setProvider(String provider) { + this.provider = provider; + } + + public void addProvider(String newProvider) { + if (provider == null) + this.provider = newProvider; + else + provider += newProvider; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public void addLanguage(String newLanguage) { + if (language == null) + this.language = newLanguage; + else + language += newLanguage; + } + + public String getLemmaName() { + return lemmaName; + } + + public void setLemmaName(String lemmaName) { + this.lemmaName = lemmaName; + } + + public void addLemmaName(String newLemmaName) { + if (lemmaName == null) + this.lemmaName = newLemmaName; + else + lemmaName += newLemmaName; + } + + public Hashtable getForms() { + return forms; + } + + public ArrayList
    getForms(String provider) { + ArrayList result = new ArrayList(); + Enumeration keys = forms.keys(); + while (keys.hasMoreElements()) { + String key = keys.nextElement(); + Form form = forms.get(key); + String prov = form.getProvider(); + if (prov.equals(provider)) + result.add(form); + } + return result; + } + + public ArrayList getFormsList() { + ArrayList result = new ArrayList(); + if(forms != null) { + Enumeration keys = forms.keys(); + while (keys.hasMoreElements()) { + String key = keys.nextElement(); + Form form = forms.get(key); + result.add(form); + } + } + return result; + } + + public void setForms(ArrayList forms) { + for (int i=0; i(); + Form f = forms.get(formKey); + if (f == null) { + forms.put(formKey, newForm); + } else { + if(newForm.isRicherThan(f)) + forms.put(formKey, newForm); + } + } + + public Form getForm(String formKey) { + return forms.get(formKey); + } + + public String getXmlString() { + String xmlString = "\n"; + xmlString += " " + provider + "\n"; + xmlString += " " + language + "\n"; + xmlString += " " + lemmaName + "\n"; + xmlString += ""; + return xmlString; + } + + public String toString() { + return getXmlString(); + } + + public int compareTo(Lemma l) { + if (l.getLemmaName() == null && this.getLemmaName() == null) { + return 0; + } + if (this.getLemmaName() == null) { + return 1; + } + if (l.getLemmaName() == null) { + return -1; + } + return this.getLemmaName().compareTo(l.getLemmaName()); + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphFileReaderContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphFileReaderContentHandler.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,127 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; + +import java.util.Hashtable; + +import org.xml.sax.*; + +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import 
de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; + +public class MorphFileReaderContentHandler implements ContentHandler { + private Hashtable forms; + private Hashtable lemmas; + private Element currentElement; + private Form currentForm; + + public MorphFileReaderContentHandler(Hashtable forms, Hashtable lemmas) { + this.forms = forms; + this.lemmas = lemmas; + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + } + + public void characters(char[] c, int start, int length) throws SAXException { + if (currentElement != null) { + String elemName = currentElement.name; + if (currentForm != null) { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (charactersStr != null && ! (charactersStr.trim().equals(""))) { + if (elemName.equals("provider")) { + currentForm.setProvider(charactersStr); + } else if (elemName.equals("language")) { + currentForm.setLanguage(charactersStr); + } else if (elemName.equals("form-name")) { + currentForm.setFormName(charactersStr); + } else if (elemName.equals("lemma-name")) { + currentForm.setLemmaName(charactersStr); + } else if (elemName.equals("pos")) { + currentForm.setPos(charactersStr); + } else if (elemName.equals("tense")) { + currentForm.setTense(charactersStr); + } else if (elemName.equals("voice")) { + currentForm.setVoice(charactersStr); + } else if (elemName.equals("casus")) { + currentForm.setCasus(charactersStr); + } else if (elemName.equals("number")) { + currentForm.setNumber(charactersStr); + } else if (elemName.equals("mood")) { + currentForm.setMood(charactersStr); + } else if (elemName.equals("person")) { + currentForm.setPerson(charactersStr); + } else if (elemName.equals("gender")) { + currentForm.setGender(charactersStr); + } else if (elemName.equals("definite")) { + currentForm.setDefinite(charactersStr); + } + } + } + } + } + + public void ignorableWhitespace(char[] c, int 
start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(org.xml.sax.Locator arg1) { + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void endElement(String uri, String localName, String name) throws SAXException { + currentElement = null; + if (name.equals("form")) { + String provider = currentForm.getProvider(); + String language = currentForm.getLanguage(); + String formName = currentForm.getFormName(); + String lemmaName = currentForm.getLemmaName(); + String formKey = language + "###" + formName; + forms.put(formKey, currentForm); + String lemmaKey = language + "###" + lemmaName; + Lemma lemma = lemmas.get(lemmaKey); + if(lemma == null) { + Lemma l = new Lemma(provider, language, lemmaName); + l.addForm(currentForm); + lemmas.put(lemmaKey, l); + } else { + lemma.addForm(currentForm); + } + currentForm = null; + } + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + currentElement = new Element(name); + if (name.equals("form")) { + currentForm = new Form(); + } + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + + private class Element { + String name; + String value; + + Element(String name) { + this.name = name; + } + + Element(String name, String value) { + this.name = name; + this.value = value; + } + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,402 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; + +import java.io.BufferedOutputStream; +import java.io.File; 
+import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.Enumeration; +import java.util.Hashtable; + +import org.apache.log4j.Logger; + +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlNormalizer; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.db.DBMorphHandler; +import de.mpg.mpiwg.berlin.mpdl.lucene.LuceneUtil; +import de.mpg.mpiwg.berlin.mpdl.util.FileUtil; +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; + +public class MorphologyCache { + private static MorphologyCache instance; + private static Logger LOGGER = Logger.getLogger(MorphologyCache.class); // Logs to EXIST_HOME/webapp/WEB-INF/logs/exist.log + private static String MPDL_DATA_DIR = MpdlConstants.MPDL_EXIST_DATA_DIR; + private static String DB_DIR_DONATUS = MPDL_DATA_DIR + "/dataBerkeleyDB/donatus"; + private static String DB_DIR_DYNAMIC = MPDL_DATA_DIR + "/dataBerkeleyDB/dynamic"; + private static String DATA_FILES_DIR = MPDL_DATA_DIR + "/dataFiles"; + private static String DATA_FILE_DYNAMIC_FORMS = DATA_FILES_DIR + "/snowball-all-forms.xml"; + public static int QUERY_MODE = 0; + public static int DOCUMENT_MODE = 1; + private static int MAX_HASHTABLE_SIZE = MpdlConstants.MORPHOLOGY_CACHE_SIZE; + protected int mode = QUERY_MODE; + private Hashtable> forms = new Hashtable>(); // cache of forms: hashKey is formName + private Hashtable lemmas = new Hashtable(); // cache of lemmas: hashKey is lemmaName + private DBMorphHandler dbMorphHandlerStatic; // handles static morph data (BerkeleyDB) + private DBMorphHandler dbMorphHandlerDynamic; // handles dynamic morph data (BerkeleyDB) + private OutputStream 
outputStreamDynamicForms; // backup file for all dynamic forms + private Date beginOfOperation; + private Date endOfOperation; + + public static MorphologyCache getInstance() throws ApplicationException { + if (instance == null) { + instance = new MorphologyCache(); + instance.init(); + } + return instance; + } + + private void init() throws ApplicationException { + LOGGER.info("Mpdl: Init morphology cache ..."); + instance.beginOperation(); + dbMorphHandlerStatic = new DBMorphHandler(DB_DIR_DONATUS); + dbMorphHandlerStatic.start(); + dbMorphHandlerStatic.openDatabases(); + dbMorphHandlerDynamic = new DBMorphHandler(DB_DIR_DYNAMIC); + dbMorphHandlerDynamic.start(); + dbMorphHandlerDynamic.openDatabases(); + openDynamicFormsDataFile(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + LOGGER.info(" Needed time: " + elapsedTime + " seconds."); + } + + public int getMode() { + return mode; + } + + public void setMode(int newMode) { + this.mode = newMode; + } + + public void end() throws ApplicationException { + dbMorphHandlerStatic.closeDatabases(); + dbMorphHandlerDynamic.closeDatabases(); + closeDynamicFormsDataFile(); + } + + public ArrayList getLemmasByFormName(String lang, String formNameArg, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList retFormLemmas = null; + String formName = formNameArg; + if (normalize) { + MpdlNormalizer normalizer = new MpdlNormalizer(language); + formName = normalizer.normalize(formNameArg); + } + // first look in local cache + String key = language + "###" + formName; + Hashtable formLemmasHashtable = forms.get(key); + if (formLemmasHashtable == null) { + ArrayList dbFormLemmas = readLemmasByFormName(language, formName); + // put lemmas into local cache + int localHashTableSize = forms.size(); + if (localHashTableSize >= MAX_HASHTABLE_SIZE) { + clearCache(); + } 
+ if (dbFormLemmas != null && ! dbFormLemmas.isEmpty()) { + formLemmasHashtable = new Hashtable(); + for (int i=0; i lemmaForms = readFormsByLemmaName(language, lemmaName); + lemma.setForms(lemmaForms); + lemmas.put(lemmaKey, lemma); + } else { + lemma = localLemma; + } + formLemmasHashtable.put(lemmaKey, lemma); + } + forms.put(key, formLemmasHashtable); + } + } + retFormLemmas = new ArrayList(); + if (formLemmasHashtable != null) { + Enumeration formLemmasKeys = formLemmasHashtable.keys(); + while(formLemmasKeys.hasMoreElements()) { + String lemmaKey = formLemmasKeys.nextElement(); + Lemma l = formLemmasHashtable.get(lemmaKey); + retFormLemmas.add(l); + } + } + Collections.sort(retFormLemmas); + return retFormLemmas; + } + + public Lemma getLemma(String lang, String lemmaNameArg, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + String lemmaName = lemmaNameArg; + if (normalize) { + MpdlNormalizer normalizer = new MpdlNormalizer(language); + lemmaName = normalizer.normalize(lemmaNameArg); + } + // first look in local cache + String key = language + "###" + lemmaName; + Lemma lemma = lemmas.get(key); + if (lemma == null) { + ArrayList dbLemmaForms = readFormsByLemmaName(language, lemmaName); + if (dbLemmaForms != null && dbLemmaForms.size() > 0) { + lemma = new Lemma(); + lemma.setLemmaName(lemmaName); + lemma.setLanguage(language); + lemma.setProvider(dbLemmaForms.get(0).getProvider()); + lemma.setForms(dbLemmaForms); + lemmas.put(lemmaName, lemma); + } + } + return lemma; + } + + public void insertFormDynamic(Form newFlatForm) throws ApplicationException { + if (! 
newFlatForm.isOk()) + return; + String provider = newFlatForm.getProvider(); + String lang = newFlatForm.getLanguage(); + String language = Language.getInstance().getLanguageId(lang); + String lemmaName = newFlatForm.getLemmaName(); + Lemma newFlatLemma = new Lemma(provider, language, lemmaName); + newFlatLemma.addForm(newFlatForm); + // write to berkeley db; there is no test if the form is already contained (has to be done before) + writeFormLemmaDynamic(newFlatForm, newFlatLemma); + // write to backup file + String formsXmlStr = newFlatForm.getXmlString(); + writeToDynamicFile(formsXmlStr); + // fill local cache with new form if it is not too full + int localHashTableSize = forms.size(); + if (localHashTableSize >= MAX_HASHTABLE_SIZE) { + clearCache(); + } + String lemmaKey = language + "###" + lemmaName; + Lemma localLemma = lemmas.get(lemmaKey); + if (localLemma == null) { + lemmas.put(lemmaKey, newFlatLemma); + } else { + localLemma.addForm(newFlatForm); + String formName = newFlatForm.getFormName(); + String formKey = language + "###" + formName; + Hashtable formLemmas = forms.get(formKey); + if (formLemmas == null) { + formLemmas = new Hashtable(); + formLemmas.put(lemmaKey, localLemma); + forms.put(formKey, formLemmas); + } else { + formLemmas.put(formKey, localLemma); + } + } + } + + public ArrayList getFormsByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList result = new ArrayList(); + luceneQueryString = luceneQueryString.toLowerCase(); + ArrayList formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); + if (! 
(formsFromQuery == null || formsFromQuery.isEmpty())) { + for (int i=0; i formLemmas = null; + // lemma mode: if formName contains "lemmalemma" then the lemma itself is fetched + if (formStr.startsWith("lemmalemma")) { + formLemmas = new ArrayList(); + String lemmaName = formStr.substring(10); + Lemma lemma = getLemma(language, lemmaName, false); + formLemmas.add(lemma); + } else { + formLemmas = getLemmasByFormName(language, formStr, false); + } + if (formLemmas != null && ! formLemmas.isEmpty()) { + for (int j=0; j lemmaForms = l.getFormsList(); + result.addAll(lemmaForms); + } + } + } + } + return result; + } + + public ArrayList getLemmasByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + Hashtable lemmas = new Hashtable(); + luceneQueryString = luceneQueryString.toLowerCase(); + ArrayList formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); + if (! (formsFromQuery == null || formsFromQuery.isEmpty())) { + for (int i=0; i formLemmas = null; + // lemma mode: if formName contains "lemmalemma" then the lemma itself is fetched + if (formStr.startsWith("lemmalemma")) { + formLemmas = new ArrayList(); + String lemmaName = formStr.substring(10); + Lemma lemma = getLemma(language, lemmaName, false); + formLemmas.add(lemma); + } else { + formLemmas = getLemmasByFormName(language, formStr, false); + } + if (formLemmas != null) { + for (int j=0; j result = new ArrayList(); + if (lemmas != null) { + Enumeration formLemmasKeys = lemmas.keys(); + while(formLemmasKeys.hasMoreElements()) { + String lemmaKey = formLemmasKeys.nextElement(); + Lemma l = lemmas.get(lemmaKey); + result.add(l); + } + } + Collections.sort(result); + if (result.isEmpty()) + return null; + else + return result; + } + + public ArrayList getIndexKeysByLemmaNames(String lang, ArrayList lemmaNames) throws ApplicationException { + String language = 
Language.getInstance().getLanguageId(lang); + Hashtable indexKeys = new Hashtable(); + for (int j=0; j lemmaForms = lemma.getFormsList(); + for (int k=0; k fLemmas = getLemmasByFormName(language, form.getFormName(), false); + if (fLemmas != null) { + String indexKey = ""; + if (fLemmas.size() == 1) { + indexKey = fLemmas.get(0).getLemmaName(); + } else { + for (int l=0; l result = new ArrayList(); + if (indexKeys != null) { + Enumeration indexKeysKeys = indexKeys.keys(); + while(indexKeysKeys.hasMoreElements()) { + String indexKey = indexKeysKeys.nextElement(); + result.add(indexKey); + } + } + Collections.sort(result); + if (result.isEmpty()) + return null; + else + return result; + } + + private void clearCache() { + forms = null; + lemmas = null; + forms = new Hashtable>(); + lemmas = new Hashtable(); + } + + private ArrayList readLemmasByFormName(String lang, String formName) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList lemmasStatic = dbMorphHandlerStatic.readLemmas(language, formName); + ArrayList lemmasDynamic = dbMorphHandlerDynamic.readLemmas(language, formName); + lemmasStatic.addAll(lemmasDynamic); + return lemmasStatic; + } + + private ArrayList readFormsByLemmaName(String lang, String lemmaName) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList formsStatic = dbMorphHandlerStatic.readForms(language, lemmaName); + ArrayList formsDynamic = dbMorphHandlerDynamic.readForms(language, lemmaName); + formsStatic.addAll(formsDynamic); + return formsStatic; + } + + private void writeFormLemmaDynamic(Form newFlatForm, Lemma newFlatLemma) throws ApplicationException { + dbMorphHandlerDynamic.writeFormLemma(newFlatForm, newFlatLemma); + dbMorphHandlerDynamic.writeLemmaForm(newFlatLemma, newFlatForm); + } + + private void openDynamicFormsDataFile() throws ApplicationException { + try { + File dataFileDynamicForms = new File(DATA_FILE_DYNAMIC_FORMS); + 
if (! dataFileDynamicForms.exists()) { + FileUtil.getInstance().copyFile(DATA_FILE_DYNAMIC_FORMS + ".empty", DATA_FILE_DYNAMIC_FORMS); + } + File dataFileDynamicFormsTmp = new File(DATA_FILE_DYNAMIC_FORMS + ".tmp"); + dataFileDynamicFormsTmp.delete(); + FileUtil.getInstance().copyFile(DATA_FILE_DYNAMIC_FORMS, DATA_FILE_DYNAMIC_FORMS + ".tmp"); + FileUtil.getInstance().deleteLastNBytes(dataFileDynamicFormsTmp, 9); // without last "" entry + FileOutputStream dataFileOutputStreamDynamicForms = new FileOutputStream(dataFileDynamicFormsTmp, true); + outputStreamDynamicForms = new BufferedOutputStream(dataFileOutputStreamDynamicForms); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void closeDynamicFormsDataFile() throws ApplicationException { + try { + writeToDynamicFile("\n"); + if (outputStreamDynamicForms != null) + outputStreamDynamicForms.close(); + File dataFileDynamicForms = new File(DATA_FILE_DYNAMIC_FORMS); + File dataFileDynamicFormsTmp = new File(DATA_FILE_DYNAMIC_FORMS + ".tmp"); + dataFileDynamicForms.delete(); + dataFileDynamicFormsTmp.renameTo(new File(DATA_FILE_DYNAMIC_FORMS)); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void writeToDynamicFile(String outStr) throws ApplicationException { + try { + if (outputStreamDynamicForms != null) { + byte[] bytes = outStr.getBytes("utf-8"); + outputStreamDynamicForms.write(bytes, 0, bytes.length); + outputStreamDynamicForms.flush(); + } + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private ArrayList getVariantsFromLuceneQuery(String queryString) { + LuceneUtil luceneUtil = LuceneUtil.getInstance(); + ArrayList variants = luceneUtil.getVariantsFromLuceneQuery(queryString); + return variants; + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d 
software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/SimpleMorphContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/SimpleMorphContentHandler.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,119 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; + +import org.xml.sax.*; + + +public class SimpleMorphContentHandler implements ContentHandler { + private Element currentElement; + private Lemma lemma; + private Form form; + + public SimpleMorphContentHandler() { + } + + public Form getForm() { + return form; + } + + public Lemma getLemma() { + return lemma; + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + } + + public void characters(char[] c, int start, int length) throws SAXException { + if (currentElement != null) { + String elemName = currentElement.name; + if (form != null) { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (elemName.equals("provider")) + form.setProvider(charactersStr); + else if (elemName.equals("language")) + form.setLanguage(charactersStr); + else if (elemName.equals("form-name")) + form.setFormName(charactersStr); + else if (elemName.equals("lemma-name")) + form.setLemmaName(charactersStr); + else if (elemName.equals("pos")) + form.setPos(charactersStr); + else if (elemName.equals("tense")) + form.setTense(charactersStr); + else if (elemName.equals("voice")) + form.setVoice(charactersStr); + else if (elemName.equals("casus")) + form.setCasus(charactersStr); + else if (elemName.equals("number")) + form.setNumber(charactersStr); + else if (elemName.equals("mood")) + form.setMood(charactersStr); + else if (elemName.equals("person")) + form.setPerson(charactersStr); + else if (elemName.equals("gender")) + form.setGender(charactersStr); + else if (elemName.equals("definite")) + 
form.setDefinite(charactersStr); + } else if (lemma != null) { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (elemName.equals("provider")) + lemma.setProvider(charactersStr); + else if (elemName.equals("language")) + lemma.setLanguage(charactersStr); + else if (elemName.equals("lemma-name")) + lemma.setLemmaName(charactersStr); + } + } + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(org.xml.sax.Locator arg1) { + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void endElement(String uri, String localName, String name) throws SAXException { + currentElement = null; + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + currentElement = new Element(name); + if (name.equals("form")) { + form = new Form(); + } else if (name.equals("lemma")) { + lemma = new Lemma(); + } + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + + private class Element { + String name; + String value; + + Element(String name) { + this.name = name; + } + + Element(String name, String value) { + this.name = name; + this.value = value; + } + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/converter/Converter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/converter/Converter.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,491 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.converter; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import 
java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStream; +import java.util.Date; +import java.util.Hashtable; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.util.Util; + +public class Converter { + private static Converter instance; + private static String MPDL_DATA_DIR = MpdlConstants.MPDL_DATA_DIR; + private static String ORIG_PERSEUS_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/perseus"; + private static String ORIG_CELEX_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/celex"; + private static String ORIG_FRENCH_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/french"; + private static String ORIG_ITALIAN_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/italian"; + private static String ORIG_DONATUS_SUB_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/donatus-sup"; + private static String OUT_DATA_DIR = MPDL_DATA_DIR + "/dataFiles"; + private PerseusContentHandler perseusContentHandler; + private Hashtable> forms = new Hashtable>(); + private Date beginOfOperation; + private Date endOfOperation; + + public static Converter getInstance() throws ApplicationException { + if (instance == null) { + instance = new Converter(); + } + return instance; + } + + /** + * + */ + public static void main(String[] args) throws ApplicationException { + getInstance(); + instance.beginOperation(); + System.out.print("Start ..."); + /* + // Latin + String inputFileNameLatin = ORIG_PERSEUS_DATA_DIR + "/" + "latin.morph.xml"; + String outputFileNameLatin = OUT_DATA_DIR + "/" + 
"perseus-latin-forms.xml"; + instance.perseusConvert("perseus", "la", inputFileNameLatin, outputFileNameLatin); + String inputFileNameDonatusLatinSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-la-forms.csv"; + String outputFileNameDonatusLatinSup = OUT_DATA_DIR + "/" + "donatus-sup-la-forms.xml"; + instance.donatusSupplementsConvert("donatus-sup", "la", inputFileNameDonatusLatinSup, outputFileNameDonatusLatinSup); + instance.forms = new Hashtable>(); + // Greek + String inputFileNameGreek = ORIG_PERSEUS_DATA_DIR + "/" + "greek.morph.xml"; + String outputFileNameGreek = OUT_DATA_DIR + "/" + "perseus-greek-forms.xml"; + instance.perseusConvert("perseus", "el", inputFileNameGreek, outputFileNameGreek); + String inputFileNameDonatusGreekSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-el-forms.csv"; + String outputFileNameDonatusGreekSup = OUT_DATA_DIR + "/" + "donatus-sup-el-forms.xml"; + instance.donatusSupplementsConvert("donatus-sup", "el", inputFileNameDonatusGreekSup, outputFileNameDonatusGreekSup); + instance.forms = new Hashtable>(); + // Arabic + String inputFileNameArabic = ORIG_PERSEUS_DATA_DIR + "/" + "arabic.morph.xml"; + String outputFileNameArabic = OUT_DATA_DIR + "/" + "perseus-arabic-forms.xml"; + instance.perseusConvert("perseus", "ar", inputFileNameArabic, outputFileNameArabic); + String inputFileNameDonatusArabicSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-ar-forms.csv"; + String outputFileNameDonatusArabicSup = OUT_DATA_DIR + "/" + "donatus-sup-ar-forms.xml"; + instance.donatusSupplementsConvert("donatus-sup", "ar", inputFileNameDonatusArabicSup, outputFileNameDonatusArabicSup); + instance.forms = new Hashtable>(); + // Dutch + String inputFileNameDutchWords = ORIG_CELEX_DATA_DIR + "/" + "dmw.cd"; + String inputFileNameDutchLemmas = ORIG_CELEX_DATA_DIR + "/" + "dml.cd"; + String outputFileNameDutch = OUT_DATA_DIR + "/" + "celex-dutch-forms.xml"; + instance.celexConvert("celex", "nl", inputFileNameDutchWords, 
inputFileNameDutchLemmas, outputFileNameDutch); + instance.forms = new Hashtable>(); + // German + String inputFileNameGermanWords = ORIG_CELEX_DATA_DIR + "/" + "gmw.cd"; + String inputFileNameGermanLemmas = ORIG_CELEX_DATA_DIR + "/" + "gml.cd"; + String outputFileNameGerman = OUT_DATA_DIR + "/" + "celex-german-forms.xml"; + instance.celexConvert("celex", "de", inputFileNameGermanWords, inputFileNameGermanLemmas, outputFileNameGerman); + String inputFileNameDonatusGermanSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-de-forms.csv"; + String outputFileNameDonatusGermanSup = OUT_DATA_DIR + "/" + "donatus-sup-de-forms.xml"; + instance.donatusSupplementsConvert("donatus-sup", "de", inputFileNameDonatusGermanSup, outputFileNameDonatusGermanSup); + instance.forms = new Hashtable>(); + // English + String inputFileNameEnglishWords = ORIG_CELEX_DATA_DIR + "/" + "emw.cd"; + String inputFileNameEnglishLemmas = ORIG_CELEX_DATA_DIR + "/" + "eml.cd"; + String outputFileNameEnglish = OUT_DATA_DIR + "/" + "celex-english-forms.xml"; + instance.celexConvert("celex", "en", inputFileNameEnglishWords, inputFileNameEnglishLemmas, outputFileNameEnglish); + String inputFileNameDonatusEnglishSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-en-forms.csv"; + String outputFileNameDonatusEnglishSup = OUT_DATA_DIR + "/" + "donatus-sup-en-forms.xml"; + instance.donatusSupplementsConvert("donatus-sup", "en", inputFileNameDonatusEnglishSup, outputFileNameDonatusEnglishSup); + instance.forms = new Hashtable>(); + // French + String inputFileNameFrench = ORIG_FRENCH_DATA_DIR + "/" + "lexique"; + String outputFileNameFrench = OUT_DATA_DIR + "/" + "lexique-french-forms.xml"; + instance.lexiqueConvert("lexique", "fr", inputFileNameFrench, outputFileNameFrench); + String inputFileNameDonatusFrenchSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-fr-forms.csv"; + String outputFileNameDonatusFrenchSup = OUT_DATA_DIR + "/" + "donatus-sup-fr-forms.xml"; + 
instance.donatusSupplementsConvert("donatus-sup", "fr", inputFileNameDonatusFrenchSup, outputFileNameDonatusFrenchSup); + instance.forms = new Hashtable>(); + */ + // Italian + String inputFileNameItalian = ORIG_ITALIAN_DATA_DIR + "/" + "ital.hash"; + String outputFileNameItalian = OUT_DATA_DIR + "/" + "donatus-italian-forms.xml"; + instance.donatusItalianConvert("donatus", "it", inputFileNameItalian, outputFileNameItalian); + /* + String inputFileNameDonatusItalianSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-it-forms.csv"; + String outputFileNameDonatusItalianSup = OUT_DATA_DIR + "/" + "donatus-sup-it-forms.xml"; + instance.donatusSupplementsConvert("donatus-sup", "it", inputFileNameDonatusItalianSup, outputFileNameDonatusItalianSup); + */ + instance.forms = new Hashtable>(); + + instance.end(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + System.out.println("Needed time: " + elapsedTime + " seconds"); + } + + private void perseusConvert(String provider, String language, String inputFileName, String outputFileName) throws ApplicationException { + File inputFile = new File(inputFileName); + perseusContentHandler = new PerseusContentHandler(provider, language, outputFileName); + try { + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(perseusContentHandler); + InputStream inputStream = new FileInputStream(inputFile); + BufferedInputStream bufferedInputStream = new BufferedInputStream(inputStream); + InputSource input = new InputSource(bufferedInputStream); + xmlParser.parse(input); + bufferedInputStream.close(); + forms = perseusContentHandler.getForms(); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void celexConvert(String provider, String language, String inputFileNameWords, String 
inputFileNameLemmas, String outputFileName) throws ApplicationException { + File inputFileLemmas = new File(inputFileNameLemmas); + Hashtable lemmas = loadLemmas(inputFileLemmas); + File inputFileWords = new File(inputFileNameWords); + File outputFile = new File(outputFileName); + writeCelexForms(provider, language, lemmas, inputFileWords, outputFile); + } + + private void lexiqueConvert(String provider, String language, String inputFileName, String outputFileName) throws ApplicationException { + File inputFile = new File(inputFileName); + File outputFile = new File(outputFileName); + writeLexiqueForms(provider, language, inputFile, outputFile); + } + + private void donatusItalianConvert(String provider, String language, String inputFileName, String outputFileName) throws ApplicationException { + File inputFile = new File(inputFileName); + File outputFile = new File(outputFileName); + writeDonatusItalianForms(provider, language, inputFile, outputFile); + } + + private void donatusSupplementsConvert(String provider, String language, String inputFileName, String outputFileName) throws ApplicationException { + File inputFile = new File(inputFileName); + File outputFile = new File(outputFileName); + writeDonatusSupplementsForms(provider, language, inputFile, outputFile); + } + + private Hashtable loadLemmas(File inputFile) { + Hashtable retLemmas = new Hashtable(); + BufferedReader in = null; + try { + in = new BufferedReader(new FileReader(inputFile)); + String line = null; + while((line = in.readLine()) != null) { + int from = line.indexOf("\\"); + int to = line.indexOf("\\", from + 1); + String idStr = line.substring(0, from); + Integer idInt = new Integer(idStr); + String lemma = line.substring(from + 1, to); + retLemmas.put(idInt, lemma); + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } finally { + // always close the stream + if (in != null) try { in.close(); } catch (Exception e) { } + } + 
return retLemmas; + } + + private void writeCelexForms(String provider, String language, Hashtable lemmas, File inputFileWords, File outputFile) throws ApplicationException { + BufferedReader in = null; + BufferedOutputStream out = null; + forms = new Hashtable>(); + try { + in = new BufferedReader(new FileReader(inputFileWords)); + out = new BufferedOutputStream(new FileOutputStream(outputFile)); + write("\n", out); + String line = null; + while((line = in.readLine()) != null) { + int delim1 = line.indexOf("\\"); + int delim2 = line.indexOf("\\", delim1 + 1); + int delim3 = line.indexOf("\\", delim2 + 1); + int delim4 = line.indexOf("\\", delim3 + 1); + String formName = line.substring(delim1 + 1, delim2); + String lemmaIdStr = line.substring(delim3 + 1, delim4); + Integer lemmaIdInt = null; + try { + lemmaIdInt = new Integer(lemmaIdStr); + } catch (NumberFormatException e) { + System.out.println("Warning: Lemma id: " + lemmaIdStr + " is not correct"); + } + if (lemmaIdInt != null) { + String lemmaName = lemmas.get(lemmaIdInt); + Form form = new Form(); + form.setProvider(provider); + form.setLanguage(language); + form.setFormName(formName); + form.setLemmaName(lemmaName); + form.normalize(); + if (form.isOk()) { + Hashtable formLemmas = forms.get(formName); + if (formLemmas == null) { + formLemmas = new Hashtable(); + formLemmas.put(lemmaName, form); + forms.put(formName, formLemmas); + write(form, out); + } else { + Form formLemma = formLemmas.get(lemmaName); + if (formLemma == null) { + formLemmas.put(lemmaName, form); + write(form, out); + } + } + } + } + } + write("\n", out); + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } finally { + // always close the stream + if (in != null) try { in.close(); } catch (Exception e) { } + if (out != null) try { out.close(); } catch (Exception e) { } + } + } + + private void writeLexiqueForms(String provider, String language, File 
inputFile, File outputFile) throws ApplicationException { + BufferedReader in = null; + BufferedOutputStream out = null; + forms = new Hashtable>(); + try { + in = new BufferedReader(new FileReader(inputFile)); + out = new BufferedOutputStream(new FileOutputStream(outputFile)); + write("\n", out); + String line = null; + while((line = in.readLine()) != null) { + int delim1 = line.indexOf("\t"); + int delim2 = line.indexOf("\t", delim1 + 1); + String formName = line.substring(0, delim1).trim(); + String lemmaName = line.substring(delim1 + 1, delim2).trim(); + if (lemmaName.equals("=")) + lemmaName = formName; + Form form = new Form(); + form.setProvider(provider); + form.setLanguage(language); + form.setFormName(formName); + form.setLemmaName(lemmaName); + form.normalize(); + if (form.isOk()) { + Hashtable formLemmas = forms.get(formName); + if (formLemmas == null) { + formLemmas = new Hashtable(); + formLemmas.put(lemmaName, form); + forms.put(formName, formLemmas); + write(form, out); + } else { + Form formLemma = formLemmas.get(lemmaName); + if (formLemma == null) { + formLemmas.put(lemmaName, form); + write(form, out); + } + } + } + } + write("\n", out); + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } finally { + // always close the stream + if (in != null) try { in.close(); } catch (Exception e) { } + if (out != null) try { out.close(); } catch (Exception e) { } + } + } + + private void writeDonatusItalianForms(String provider, String language, File inputFile, File outputFile) throws ApplicationException { + BufferedReader in = null; + BufferedOutputStream out = null; + forms = new Hashtable>(); + try { + in = new BufferedReader(new FileReader(inputFile)); + out = new BufferedOutputStream(new FileOutputStream(outputFile)); + write("\n", out); + String line = null; + while((line = in.readLine()) != null) { + // one line is of the form: 'risoluino' => 'V 
risolvino,risolvere pres imperat 3rd pl ......', + // or of the form: 'legamenti' => 'N legamento masc pl ......', + // this method only recognize the first lemma TODO recognize all lemmas for the form + int delim1 = line.indexOf("'"); + int delim2 = line.indexOf("'", delim1 + 1); + int delim3 = line.indexOf("'", delim2 + 1); + int delim4 = delim3 + 6; // beginning of the lemma + int delim5 = line.indexOf(" ", delim4 + 1); // end of the first lemma(s) is separated by a blank + String formName = line.substring(delim1 + 1, delim2); + formName = formName.replace("\\", ""); + String lemmaName = line.substring(delim4 + 1, delim5); + int commaInLemma = lemmaName.indexOf(","); // when there are more than one lemma + if (commaInLemma != -1) + lemmaName = lemmaName.substring(0, commaInLemma); + lemmaName = lemmaName.replace("\\", ""); + Form form = new Form(); + form.setProvider(provider); + form.setLanguage(language); + form.setFormName(formName); + form.setLemmaName(lemmaName); + form.normalize(); + boolean lineContainsAp = line.contains("\''"); // some of the form lines contain irregular strings of the form: 'par\'' => 'N pari/^,pari indeclform adverb + if (form.isOk() && ! 
lineContainsAp) { + Hashtable formLemmas = forms.get(formName); + if (formLemmas == null) { + formLemmas = new Hashtable(); + formLemmas.put(lemmaName, form); + forms.put(formName, formLemmas); + write(form, out); + } else { + Form formLemma = formLemmas.get(lemmaName); + if (formLemma == null) { + formLemmas.put(lemmaName, form); + write(form, out); + } + } + } + } + write("\n", out); + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } finally { + // always close the stream + if (in != null) try { in.close(); } catch (Exception e) { } + if (out != null) try { out.close(); } catch (Exception e) { } + } + } + + private void writeDonatusSupplementsForms(String provider, String language, File inputFile, File outputFile) throws ApplicationException { + BufferedReader in = null; + BufferedOutputStream out = null; + try { + in = new BufferedReader(new FileReader(inputFile)); + out = new BufferedOutputStream(new FileOutputStream(outputFile)); + write("\n", out); + String line = null; + String lemmaName = ""; + String formName = ""; + // each line is a form + while((line = in.readLine()) != null) { + if (line.length() == 0) + break; + String firstChar = line.substring(0, 1); + String mode = "lemmaAndForm"; + if (firstChar.equals(",")) + mode = "form"; + if (mode.equals("lemmaAndForm")) { + int quote2 = line.indexOf("\"", 1); + lemmaName = line.substring(1, quote2); + int quote3 = line.indexOf("\"", quote2 + 1); + int quote4 = line.indexOf("\"", quote3 + 1); + formName = line.substring(quote3 + 1, quote4); + } else if (mode.equals("form")) { + int quote2 = line.indexOf("\"", 3); + formName = line.substring(2, quote2); + } + Form form = new Form(); + form.setProvider(provider); + form.setLanguage(language); + form.setFormName(formName); + form.setLemmaName(lemmaName); + if (form.isGreek()) + transcodeFromBetaCode2Unicode(form); + else if (form.isArabic()) + form = 
transcodeFromBuckwalter2Unicode(form); + form.normalize(); + if (form.isOk()) { + Hashtable formLemmas = forms.get(formName); + if (formLemmas == null) { + formLemmas = new Hashtable(); + formLemmas.put(lemmaName, form); + forms.put(formName, formLemmas); + write(form, out); + } else { + Form formLemma = formLemmas.get(lemmaName); + if (formLemma == null) { + formLemmas.put(lemmaName, form); + write(form, out); + } + } + } + } + write("\n", out); + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } finally { + // always close the stream + if (in != null) try { in.close(); } catch (Exception e) { } + if (out != null) try { out.close(); } catch (Exception e) { } + } + } + + private void write(Form form, BufferedOutputStream out) throws ApplicationException { + try { + String xmlFormStr = form.getXmlString(); + byte[] bytes = xmlFormStr.getBytes("utf-8"); + out.write(bytes, 0, bytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void write(String inputString, BufferedOutputStream out) throws ApplicationException { + try { + byte[] bytes = inputString.getBytes("utf-8"); + out.write(bytes, 0, bytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private Form transcodeFromBetaCode2Unicode(Form form) throws ApplicationException { + String formName = form.getFormName(); + String lemmaName = form.getLemmaName(); + Transcoder transcoder = Transcoder.getInstance(); + String encodedUnicodeForm = transcoder.transcodeFromBetaCode2Unicode(formName); + String encodedUnicodeLemma = transcoder.transcodeFromBetaCode2Unicode(lemmaName); + form.setFormName(encodedUnicodeForm); + form.setLemmaName(encodedUnicodeLemma); + return form; + } + + private Form transcodeFromBuckwalter2Unicode(Form form) throws ApplicationException { + String formName = form.getFormName(); + String lemmaName 
= form.getLemmaName(); + Transcoder transcoder = Transcoder.getInstance(); + String encodedUnicodeForm = transcoder.transcodeFromBuckwalter2Unicode(formName); + String encodedUnicodeLemma = transcoder.transcodeFromBuckwalter2Unicode(lemmaName); + form.setFormName(encodedUnicodeForm); + form.setLemmaName(encodedUnicodeLemma); + return form; + } + + private void end() throws ApplicationException { + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } + +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/converter/PerseusContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/converter/PerseusContentHandler.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,220 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.converter; + +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.Hashtable; + +import org.xml.sax.*; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; + +public class PerseusContentHandler implements ContentHandler { + private static String[] XML_FORM_FIELD_NAMES = {"form", "lemma", "pos", "tense", "voice", "case", "number", "mood", "person", "gender", "definite"}; + private Hashtable> forms; + private File outputFile; + private String provider; + private String language; + private OutputStream out; + private Element currentElement; + private Form form; + + public PerseusContentHandler(String provider, String language, String outputFileName) throws ApplicationException { + this.outputFile = new File(outputFileName); + this.provider = 
provider; + this.language = language; + } + + public Hashtable> getForms() { + return forms; + } + + public void startDocument() throws SAXException { + try { + out = new BufferedOutputStream(new FileOutputStream(outputFile)); + forms = new Hashtable>(); + } catch (FileNotFoundException e) { + throw new SAXException(e); + } + write("\n"); + } + + public void endDocument() throws SAXException { + write("\n"); + try { + if (out != null) + out.close(); + } catch (Exception e) { + // nothing: always close the stream at the end of the method + } + } + + public void characters(char[] c, int start, int length) throws SAXException { + if (currentElement != null) { + String elemName = currentElement.name; + if (form != null && isXmlFormField(elemName)) { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (charactersStr != null && ! (charactersStr.trim().equals(""))) { + if (elemName.equals("form")) + form.addFormName(charactersStr); + else if (elemName.equals("lemma")) + form.addLemmaName(charactersStr); + else if (elemName.equals("pos")) + form.addPos(charactersStr); + else if (elemName.equals("tense")) + form.addTense(charactersStr); + else if (elemName.equals("voice")) + form.addVoice(charactersStr); + else if (elemName.equals("case")) + form.addCasus(charactersStr); + else if (elemName.equals("number")) + form.addNumber(charactersStr); + else if (elemName.equals("mood")) + form.addMood(charactersStr); + else if (elemName.equals("person")) + form.addPerson(charactersStr); + else if (elemName.equals("gender")) + form.addGender(charactersStr); + else if (elemName.equals("definite")) + form.addDefinite(charactersStr); + } + } + } + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(org.xml.sax.Locator arg1) { + } + + 
public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void endElement(String uri, String localName, String name) throws SAXException { + currentElement = null; + try { + if (name.equals("analysis")) { + if (form.isGreek()) + form = transcodeFromBetaCode2Unicode(form); + else if (form.isArabic()) + form = transcodeFromBuckwalter2Unicode(form); + form.normalize(); + if (form.isOk()) { + String formName = form.getFormName(); + String lemmaName = form.getLemmaName(); + Hashtable formLemmas = forms.get(formName); + if (formLemmas == null) { + formLemmas = new Hashtable(); + formLemmas.put(lemmaName, form); + forms.put(formName, formLemmas); + write(form); + } else { + Form formLemma = formLemmas.get(lemmaName); + if (formLemma == null) { + formLemmas.put(lemmaName, form); + write(form); + } + } + } + form = null; + } + } catch (ApplicationException e) { + throw new SAXException(e); + } + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + currentElement = new Element(name); + if (name.equals("analysis")) { + form = new Form(); + form.setProvider(provider); + form.setLanguage(language); + } + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + + private boolean isXmlFormField(String fieldName) { + boolean isXmlFormField = false; + for (int i=0; i readForms(String language, String lemmaName) throws ApplicationException { + ArrayList retForms = new ArrayList(); + String lang = Language.getInstance().getLanguageId(language); + String hashKey = lang + "###" + lemmaName; + try { + Database lemmaDB = morphDbEnv.getLemmaDB(); + Cursor cursor = lemmaDB.openCursor(null, null); + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundFormValue = new DatabaseEntry(); + OperationStatus operationStatus = 
cursor.getSearchKey(dbEntryKey, foundFormValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + byte[] foundFormValueBytes = foundFormValue.getData(); + String foundFormValueStr = new String(foundFormValueBytes, "utf-8"); + Form f = parseXmlFormString(foundFormValueStr); + retForms.add(f); + operationStatus = cursor.getNextDup(dbEntryKey, foundFormValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retForms; + } + + // TODO diese Methode wird nicht verwendet bis jetzt + public Hashtable readForms() throws ApplicationException { + Hashtable retForms = new Hashtable(); + try { + Database lemmaDB = morphDbEnv.getLemmaDB(); + Cursor cursor = lemmaDB.openCursor(null, null); + DatabaseEntry dbEntryKey = new DatabaseEntry(); + DatabaseEntry foundFormValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getFirst(dbEntryKey, foundFormValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + byte[] foundFormValueBytes = foundFormValue.getData(); + String foundFormValueStr = new String(foundFormValueBytes, "utf-8"); + Form f = parseXmlFormString(foundFormValueStr); + String formHashKey = f.getLanguage() + "###" + f.getFormName(); + retForms.put(formHashKey, f); + operationStatus = cursor.getNext(dbEntryKey, foundFormValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retForms; + } + + public ArrayList readLemmas(String language, String formName) throws ApplicationException { + ArrayList retForms = new ArrayList(); + String lang = Language.getInstance().getLanguageId(language); + String hashKey = lang + "###" + formName; + try { + Database formDB = morphDbEnv.getFormDB(); + Cursor cursor = 
formDB.openCursor(null, null); + byte[] bHashKey = hashKey.getBytes("utf-8"); + DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); + DatabaseEntry foundLemmaValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundLemmaValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + byte[] foundLemmaValueBytes = foundLemmaValue.getData(); + String foundLemmaValueStr = new String(foundLemmaValueBytes, "utf-8"); + Lemma l = parseXmlLemmaString(foundLemmaValueStr); + retForms.add(l); + operationStatus = cursor.getNextDup(dbEntryKey, foundLemmaValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retForms; + } + + private Form parseXmlFormString(String xmlString) throws ApplicationException { + Form form = null; + try { + XMLReader xmlParser = new SAXParser(); + SimpleMorphContentHandler morphContentHandler = new SimpleMorphContentHandler(); + xmlParser.setContentHandler(morphContentHandler); + Reader reader = new StringReader(xmlString); + InputSource input = new InputSource(reader); + xmlParser.parse(input); + form = morphContentHandler.getForm(); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return form; + } + + private Lemma parseXmlLemmaString(String xmlString) throws ApplicationException { + Lemma lemma = null; + try { + XMLReader xmlParser = new SAXParser(); + SimpleMorphContentHandler morphContentHandler = new SimpleMorphContentHandler(); + xmlParser.setContentHandler(morphContentHandler); + Reader reader = new StringReader(xmlString); + InputSource input = new InputSource(reader); + xmlParser.parse(input); + lemma = morphContentHandler.getLemma(); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw 
new ApplicationException(e); + } + return lemma; + } + +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphSupWriter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphSupWriter.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,265 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.Date; +import java.util.HashMap; +import java.util.Iterator; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; +import com.sleepycat.je.util.DbLoad; + +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder; + +public class DBMorphSupWriter { + private static DBMorphSupWriter instance; + private static String MPDL_DATA_DIR = MpdlConstants.MPDL_DATA_DIR; + private static String DATA_FILES_DIR_DONATUS_ADD_SUP = MPDL_DATA_DIR + "/dataFiles/donatusAdditionalSup"; + private static String DB_DIR_DONATUS_ADD_SUP = MPDL_DATA_DIR + "/dataFiles/donatusAdditionalSup/db"; + private static String[] DONATUS_SUP_DUMPS = {"cache-la", "cache-el", "cache-it"}; + private DbEnvMorphSup dbEnvMorphSup; + private Date beginOfOperation; + private Date endOfOperation; + + public static DBMorphSupWriter getInstance() throws ApplicationException { + if (instance == null) { + instance = new DBMorphSupWriter(); + } + return 
instance; + } + + public static void main(String[] args) throws ApplicationException { + getInstance(); + instance.beginOperation(); + System.out.print("Start ..."); + instance.initReadWrite(); + // instance.loadDonatusSupDbDumpsToDb(); + instance.printSizeOfAllMorphSupDBs(); + // instance.writeDonatusSupsToFiles(); + instance.end(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + System.out.println("Needed time: " + elapsedTime + " seconds"); + } + + private void initReadWrite() throws ApplicationException { + dbEnvMorphSup = new DbEnvMorphSup(); + dbEnvMorphSup.setDataDir(DB_DIR_DONATUS_ADD_SUP); + dbEnvMorphSup.initReadWrite(); + } + + private void loadDonatusSupDbDumpsToDb() throws ApplicationException { + for (int i=0; i getWholeMorphHashMap(String donatusSupName) throws ApplicationException { + HashMap morphHashMap = new HashMap(); + try { + dbEnvMorphSup.openDatabase(donatusSupName + "Dump"); + Database morphDB = dbEnvMorphSup.getMorphSupDB(donatusSupName + "Dump"); + Cursor cursor = morphDB.openCursor(null, null); + DatabaseEntry dbEntryKey = new DatabaseEntry(); + DatabaseEntry dbEntryValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + int size = dbEntryKey.getSize(); + if (size > 0) { + byte[] dbEntryKeyBytes = dbEntryKey.getData(); + String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8"); + DatabaseEntry newDbEntryValue = new DatabaseEntry(dbEntryValue.getData()); + morphHashMap.put(dbEntryKeyStr, newDbEntryValue); + } + operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return 
morphHashMap; + } + + private void writeDonatusSupsToFiles() throws ApplicationException { + BufferedReader in = null; + BufferedOutputStream out = null; + try { + for (int i=0; i morphHashMap = getWholeMorphHashMap(donatusSupName); + Iterator morphDumpIter = morphHashMap.keySet().iterator(); + File outputFile = new File(DATA_FILES_DIR_DONATUS_ADD_SUP + "/donatus-sup-" + donatusSupName + ".xml"); + out = new BufferedOutputStream(new FileOutputStream(outputFile)); + write("\n", out); + while (morphDumpIter.hasNext()) { + write("\n", out); + write("" + "donatus-sup" + "\n", out); + String language = "unknown"; + if (donatusSupName.startsWith("cache-")) + language = donatusSupName.substring(6); + write("" + language + "\n", out); + String morphKeyStr = morphDumpIter.next(); + String formStr = morphKeyStr; + if (language.equals("el")) + formStr = transcodeFromBetaCode2Unicode(formStr); + formStr = formStr.toLowerCase(); + write("" + formStr + "\n", out); + DatabaseEntry morphValue = morphHashMap.get(morphKeyStr); + byte[] morphValueBytes = morphValue.getData(); + String wholeLemmaStr = new String(morphValueBytes, "utf-8"); + // only first lemma is recognized TODO recognize all lemmas for the form + char splitSymbol = '\u0009'; + int firstIndexOfSplitSymbol = wholeLemmaStr.indexOf(splitSymbol); + String lemmaForm = wholeLemmaStr; + if (firstIndexOfSplitSymbol != -1) + lemmaForm = wholeLemmaStr.substring(0, firstIndexOfSplitSymbol); + else + lemmaForm = lemmaForm + "XXXXXX"; + char splitSymbol2 = '\u000B'; + int firstIndexOfSplitSymbol2 = lemmaForm.indexOf(splitSymbol2); + if (firstIndexOfSplitSymbol2 != -1) + lemmaForm = lemmaForm.substring(0, firstIndexOfSplitSymbol2); + if (language.equals("el")) + lemmaForm = transcodeFromBetaCode2Unicode(lemmaForm); + lemmaForm = lemmaForm.replaceAll("#\\d", ""); + lemmaForm = lemmaForm.toLowerCase(); + write("" + lemmaForm + "\n", out); + write("\n", out); + } + write("\n", out); + } + } catch (FileNotFoundException e) { + throw 
new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } finally { + // always close the stream + if (in != null) try { in.close(); } catch (Exception e) { } + if (out != null) try { out.close(); } catch (Exception e) { } + } + } + + private void write(byte[] inputBytes, BufferedOutputStream out) throws ApplicationException { + try { + out.write(inputBytes, 0, inputBytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void write(String outStr, BufferedOutputStream out) throws ApplicationException { + try { + byte[] bytes = outStr.getBytes("utf-8"); + out.write(bytes, 0, bytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private String transcodeFromBetaCode2Unicode(String inputStr) throws ApplicationException { + Transcoder transcoder = Transcoder.getInstance(); + String encodedUnicodeForm = transcoder.transcodeFromBetaCode2Unicode(inputStr); + return encodedUnicodeForm; + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } + +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriter.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,168 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Date; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import 
de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.db.DBMorphHandler; +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; + +public class DBMorphWriter { + private static DBMorphWriter instance; + private static String MPDL_DATA_DIR = MpdlConstants.MPDL_DATA_DIR; + private static String DB_DIR_DONATUS = MPDL_DATA_DIR + "/dataBerkeleyDB/donatus"; + private static String DATA_FILES_DIR = MPDL_DATA_DIR + "/dataFiles"; + private DBMorphHandler dbMorphHandler; + private Date beginOfOperation; + private Date endOfOperation; + + public static DBMorphWriter getInstance() throws ApplicationException { + if (instance == null) { + instance = new DBMorphWriter(); + instance.init(); + } + return instance; + } + + /** + * + */ + public static void main(String[] args) throws ApplicationException { + getInstance(); + instance.beginOperation(); + System.out.println("Start ..."); + instance.init(); + instance.openMorphData(); + // instance.deleteMorphData(); + long size = instance.getSize(); + System.out.println("Count forms: " + size); + // instance.writeMorphData(); + // instance.readSampleData(); + instance.end(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + System.out.println("Needed time: " + elapsedTime + " seconds"); + } + + private void init() throws ApplicationException { + dbMorphHandler = new DBMorphHandler(DB_DIR_DONATUS); + dbMorphHandler.start(); + } + + private void openMorphData() throws ApplicationException { + dbMorphHandler.openDatabases(); + } + + private void deleteMorphData() throws ApplicationException { + dbMorphHandler.deleteMorphData(); + } + + private void writeMorphData() throws ApplicationException { + String 
inputFileNameLatin = DATA_FILES_DIR + "/" + "perseus-latin-forms.xml"; + instance.write(inputFileNameLatin); + String inputFileNameGreek = DATA_FILES_DIR + "/" + "perseus-greek-forms.xml"; + instance.write(inputFileNameGreek); + String inputFileNameArabic = DATA_FILES_DIR + "/" + "perseus-arabic-forms.xml"; + instance.write(inputFileNameArabic); + String inputFileNameDutch = DATA_FILES_DIR + "/" + "celex-dutch-forms.xml"; + instance.write(inputFileNameDutch); + String inputFileNameGerman = DATA_FILES_DIR + "/" + "celex-german-forms.xml"; + instance.write(inputFileNameGerman); + String inputFileNameEnglish = DATA_FILES_DIR + "/" + "celex-english-forms.xml"; + instance.write(inputFileNameEnglish); + String inputFileNameFrench = DATA_FILES_DIR + "/" + "lexique-french-forms.xml"; + instance.write(inputFileNameFrench); + String inputFileNameItalian = DATA_FILES_DIR + "/" + "donatus-italian-forms.xml"; + instance.write(inputFileNameItalian); + String[] languages = {"ar", "de", "en", "el", "fr", "it", "la"}; + for (int i = 0; i < languages.length; i++) { + String language = languages[i]; + String inputFileNameDonatusSup = DATA_FILES_DIR + "/" + "donatus-sup-" + language + "-forms.xml"; + instance.write(inputFileNameDonatusSup); + } + String[] donatusAdditionalSups = {"cache-la", "cache-el", "cache-it"}; + for (int i = 0; i < donatusAdditionalSups.length; i++) { + String donatusAdditionalSupName = donatusAdditionalSups[i]; + String inputFileNameDonatusAddSup = DATA_FILES_DIR + "/donatusAdditionalSup/" + "donatus-sup-" + donatusAdditionalSupName + ".xml"; + instance.write(inputFileNameDonatusAddSup); + } + } + + private void write(String inputFileName) throws ApplicationException { + File inputFile = new File(inputFileName); + if (! 
inputFile.exists()) { + System.out.println("Input file: " + inputFile.getAbsolutePath() + " does not exist."); + return; + } + DBMorphWriterContentHandler morphContentHandler = new DBMorphWriterContentHandler(dbMorphHandler); + try { + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(morphContentHandler); + InputStream inputStream = new FileInputStream(inputFile); + BufferedInputStream bufferedInputStream = new BufferedInputStream(inputStream); + InputSource input = new InputSource(bufferedInputStream); + xmlParser.parse(input); + bufferedInputStream.close(); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private long getSize() throws ApplicationException { + long size = dbMorphHandler.getSize(); + return size; + } + + private void addSampleData() throws ApplicationException { + Lemma l1 = new Lemma("perseus", "la", "abrogo"); + Form f1 = new Form("perseus", "la", "abrogare"); + Form f2 = new Form("perseus", "la", "abroges"); + dbMorphHandler.writeFormLemma(f1, l1); + dbMorphHandler.writeLemmaForm(l1, f1); + dbMorphHandler.writeLemmaForm(l1, f2); + } + + private void readSampleData() throws ApplicationException { + ArrayList
    forms = dbMorphHandler.readForms("la", "abrogo"); + System.out.println("Forms: " + forms); + } + + private void deleteSampleData() throws ApplicationException { + Lemma l1 = new Lemma("perseus", "la", "abrogo"); + Form f1 = new Form("perseus", "la", "abrogare"); + Form f2 = new Form("perseus", "la", "abroges"); + dbMorphHandler.deleteLemma(l1); + dbMorphHandler.deleteForm(f1); + dbMorphHandler.deleteForm(f2); + } + + private void end() throws ApplicationException { + dbMorphHandler.closeDatabases(); + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } + +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriterContentHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphWriterContentHandler.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,133 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.util.Hashtable; + +import org.xml.sax.*; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; + +public class DBMorphWriterContentHandler implements ContentHandler { + private DBMorphHandler dbMorphHandler; + private Element currentElement; + private Form form; + private Lemma lemma; + private Hashtable forms; + + public DBMorphWriterContentHandler(DBMorphHandler dbMorphHandler) { + this.dbMorphHandler = dbMorphHandler; + } + + public void startDocument() throws SAXException { + forms = new Hashtable(); + } + + public void endDocument() throws SAXException { + forms = null; + } + + // TODO setPos etc. ersetzen durch addPos etc. 
+ public void characters(char[] c, int start, int length) throws SAXException { + if (currentElement != null) { + String elemName = currentElement.name; + if (form != null) { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (charactersStr != null && ! (charactersStr.trim().equals(""))) { + if (elemName.equals("provider")) { + form.addProvider(charactersStr); + lemma.addProvider(charactersStr); + } else if (elemName.equals("language")) { + form.addLanguage(charactersStr); + lemma.addLanguage(charactersStr); + } else if (elemName.equals("form-name")) { + form.addFormName(charactersStr); + } else if (elemName.equals("lemma-name")) { + form.addLemmaName(charactersStr); + lemma.addLemmaName(charactersStr); + } else if (elemName.equals("pos")) { + form.addPos(charactersStr); + } else if (elemName.equals("tense")) { + form.addTense(charactersStr); + } else if (elemName.equals("voice")) { + form.addVoice(charactersStr); + } else if (elemName.equals("casus")) { + form.addCasus(charactersStr); + } else if (elemName.equals("number")) { + form.addNumber(charactersStr); + } else if (elemName.equals("mood")) { + form.addMood(charactersStr); + } else if (elemName.equals("person")) { + form.addPerson(charactersStr); + } else if (elemName.equals("gender")) { + form.addGender(charactersStr); + } else if (elemName.equals("definite")) { + form.addDefinite(charactersStr); + } + } + } + } + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(org.xml.sax.Locator arg1) { + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + 
currentElement = new Element(name, ""); + if (localName.equals("form")) { + form = new Form(); + lemma = new Lemma(); + } + } + + public void endElement(String uri, String localName, String name) throws SAXException { + currentElement = null; + if (localName.equals("form")) { + String keyStr = form.getFormName(); + forms.put(keyStr, form); + write(form, lemma); + form = null; + lemma = null; + } + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + + private void write(Form form, Lemma lemma) throws SAXException { + try { + dbMorphHandler.writeFormLemma(form, lemma); + dbMorphHandler.writeLemmaForm(lemma, form); + } catch (ApplicationException e) { + throw new SAXException(e); + } + } + + private class Element { + String name; + String value; + + Element(String name) { + this.name = name; + } + + Element(String name, String value) { + this.name = name; + this.value = value; + } + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DbEnvMorph.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DbEnvMorph.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,105 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.io.File; + +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseConfig; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class DbEnvMorph { + private String dataDir; + private File envPath; + private Environment env; + private EnvironmentConfig envConfig; + private DatabaseConfig dbConfig; + private Database lemmaDB; + private Database formDB; + + public DbEnvMorph() { + } + + public void setDataDir(String dataDir) { + this.dataDir = dataDir; + } + + public void init() throws ApplicationException { + try { + envConfig 
= new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envConfig.setReadOnly(false); + dbConfig.setReadOnly(false); + envConfig.setAllowCreate(true); + dbConfig.setAllowCreate(true); + envConfig.setTransactional(true); + dbConfig.setTransactional(true); + // allow duplicates for keys + dbConfig.setSortedDuplicates(true); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void openDatabases() throws ApplicationException { + try { + // open databases (and create them if they do not exist) + lemmaDB = env.openDatabase(null, "LemmaDB", dbConfig); + formDB = env.openDatabase(null, "FormDB", dbConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void removeDatabases() throws ApplicationException { + try { + if (formDB != null) + formDB.close(); + if (lemmaDB != null) + lemmaDB.close(); + env.removeDatabase(null, "LemmaDB"); + env.removeDatabase(null, "FormDB"); + formDB = null; + lemmaDB = null; + /* + boolean bla = true; + env.truncateDatabase(null, "LemmaDB", bla); + env.truncateDatabase(null, "FormDB", bla); + */ + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public Environment getEnv() { + return env; + } + + public Database getLemmaDB() { + return lemmaDB; + } + + public Database getFormDB() { + return formDB; + } + + public void close() throws ApplicationException { + if (env != null) { + try { + if (formDB != null) + formDB.close(); + if (lemmaDB != null) + lemmaDB.close(); + if (env != null) + env.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + } +} + diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DbEnvMorphSup.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DbEnvMorphSup.java Wed Nov 
24 17:24:23 2010 +0100 @@ -0,0 +1,101 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.io.File; +import java.util.HashMap; + +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseConfig; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class DbEnvMorphSup { + private String dataDir; + private File envPath; + private Environment env; + private EnvironmentConfig envConfig; + private DatabaseConfig dbConfig; + private HashMap morphSupDBs = new HashMap(); + + public DbEnvMorphSup() { + } + + public void setDataDir(String dataDir) { + this.dataDir = dataDir; + } + + public void initReadOnly() throws ApplicationException { + try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void initReadWrite() throws ApplicationException { + try { + envConfig = new EnvironmentConfig(); + dbConfig = new DatabaseConfig(); + envConfig.setReadOnly(false); + dbConfig.setReadOnly(false); + envConfig.setAllowCreate(true); + dbConfig.setAllowCreate(true); + envConfig.setTransactional(true); + dbConfig.setTransactional(true); + envPath = new File(dataDir); + env = new Environment(envPath, envConfig); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void openDatabase(String morphSupName) throws ApplicationException { + try { + Database lexDB = morphSupDBs.get(morphSupName); + if (lexDB == null) { + Database morphSupDB = env.openDatabase(null, morphSupName + ".db", dbConfig); + morphSupDBs.put(morphSupName, morphSupDB); + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public void closeDatabase(String morphSupName) throws ApplicationException { + 
try { + if (morphSupDBs != null) { + Database morphSupDB = morphSupDBs.get(morphSupName); + if (morphSupDB != null) + morphSupDB.close(); + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + public Environment getEnv() { + return env; + } + + public Database getMorphSupDB(String morphSupName) { + Database morphSupDB = morphSupDBs.get(morphSupName); + return morphSupDB; + } + + public void close() throws ApplicationException { + if (env != null) { + try { + if (env != null) + env.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + } +} + diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lucene/LuceneUtil.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lucene/LuceneUtil.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,27 @@ +package de.mpg.mpiwg.berlin.mpdl.lucene; + +import java.util.ArrayList; + +public class LuceneUtil { + private static LuceneUtil instance; + + public static LuceneUtil getInstance() { + if (instance == null) { + instance = new LuceneUtil(); + } + return instance; + } + + public ArrayList getVariantsFromLuceneQuery(String queryString) { + ArrayList variants = new ArrayList(); + String[] variantTokens = queryString.split(" "); // TODO throw the phrases away (e.g.: "bla bla bla") + for (int i = 0; i < variantTokens.length; i++) { + String token = variantTokens[i]; + if (! 
(token.contains("*") || token.contains("?") || token.contains("~") || token.contains("-") || token.contains("+") || token.contains("^") || token.contains("OR") || token.contains("AND") || token.contains("NOT"))) { + variants.add(token); + } + } + return variants; + } + +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lucene/MorphQueryParser.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lucene/MorphQueryParser.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,175 @@ +package de.mpg.mpiwg.berlin.mpdl.lucene; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; +import java.util.Vector; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlMorphAnalyzer; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache; + +public class MorphQueryParser extends QueryParser { + String language; + + public MorphQueryParser(String f, MpdlMorphAnalyzer a) { + super(f, a); + this.language = a.getLanguage(); + } + + /** + * @exception ParseException throw in overridden method to disallow + */ + protected Query getFieldQuery(String field, String queryText) throws ParseException { + // Use the analyzer to get all the tokens, and then build a TermQuery, + // PhraseQuery, or nothing based on the term count + Analyzer 
analyzer = getAnalyzer(); + TokenStream source = analyzer.tokenStream(field, new StringReader(queryText)); + Vector v = new Vector(); + org.apache.lucene.analysis.Token t; + int positionCount = 0; + boolean severalTokensAtSamePosition = false; + + while (true) { + try { + t = source.next(); + } + catch (IOException e) { + t = null; + } + if (t == null) + break; + v.addElement(t); + if (t.getPositionIncrement() != 0) + positionCount += t.getPositionIncrement(); + else + severalTokensAtSamePosition = true; + } + try { + source.close(); + } + catch (IOException e) { + // ignore + } + + if (v.size() == 0) + return null; + else if (v.size() == 1) { + t = (org.apache.lucene.analysis.Token) v.elementAt(0); + // BEGIN MPDL specific extensions + Query retMorphQuery = null; + try { + String termText = t.termText(); + ArrayList lemmaNames = null; + if (termText != null && ! termText.trim().equals("")) { + // lemma mode: if term contains "lemmalemma" then the lemma itself is fetched + if (termText.startsWith("lemmalemma")) { + lemmaNames = new ArrayList(); + String lemmaName = termText.substring(10); + lemmaNames.add(lemmaName); + } else { + String[] lemmasStrArray = termText.split("\\+\\+\\+"); + if (lemmasStrArray != null) + lemmaNames = new ArrayList(); + for (int i=0; i morphIndexKeys = morphologyCache.getIndexKeysByLemmaNames(language, lemmaNames); + if (morphIndexKeys == null) { + return null; + } else if (morphIndexKeys.size() == 1) { + String morphIndexKey = morphIndexKeys.get(0); + retMorphQuery = new TermQuery(new Term(field, morphIndexKey)); + } else if (morphIndexKeys.size() > 1) { + BooleanQuery retMorphQueryBoolean = new BooleanQuery(true); + for (int i=0; i 0 && multiTerms.size() > 0) { + if (getEnablePositionIncrements()) { + mpq.add((Term[])multiTerms.toArray(new Term[0]),position); + } else { + mpq.add((Term[])multiTerms.toArray(new Term[0])); + } + multiTerms.clear(); + } + position += t.getPositionIncrement(); + multiTerms.add(new Term(field, 
t.termText())); + } + if (getEnablePositionIncrements()) { + mpq.add((Term[])multiTerms.toArray(new Term[0]),position); + } else { + mpq.add((Term[])multiTerms.toArray(new Term[0])); + } + return mpq; + } + } + else { + PhraseQuery pq = new PhraseQuery(); + pq.setSlop(getPhraseSlop()); + int position = -1; + for (int i = 0; i < v.size(); i++) { + t = (org.apache.lucene.analysis.Token) v.elementAt(i); + if (getEnablePositionIncrements()) { + position += t.getPositionIncrement(); + pq.add(new Term(field, t.termText()),position); + } else { + pq.add(new Term(field, t.termText())); + } + } + return pq; + } + } + } + +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/schedule/MpdlChainScheduler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/schedule/MpdlChainScheduler.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,243 @@ +package de.mpg.mpiwg.berlin.mpdl.schedule; + +import java.net.URL; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Date; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.PriorityQueue; +import java.util.Queue; + +import org.apache.log4j.Logger; +import org.quartz.JobDataMap; +import org.quartz.JobDetail; +import org.quartz.JobExecutionContext; +import org.quartz.JobListener; +import org.quartz.SchedulerException; +import org.quartz.SimpleTrigger; +import org.quartz.Trigger; +import org.quartz.impl.StdSchedulerFactory; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class MpdlChainScheduler { + private static MpdlChainScheduler instance; + private static String CRUD_JOB = "MPDL_CRUD_JOB"; + private static String CRUD_TRIGGER = "MPDL_CRUD_TRIGGER"; + private static String CRUD_GROUP = "MPDL_CRUD_GROUP"; + private static Logger LOGGER = Logger.getLogger(MpdlChainScheduler.class); // Logs to 
EXIST_HOME/webapp/WEB-INF/logs/exist.log + private org.quartz.Scheduler scheduler; + private JobListener jobListener; + private Queue docOperationQueue = new PriorityQueue(); + private HashMap finishedDocOperations = new HashMap(); + private boolean operationInProgress = false; + private int jobOrderId = 0; + + public static MpdlChainScheduler getInstance() throws ApplicationException { + if (instance == null) { + instance = new MpdlChainScheduler(); + instance.init(); + } + return instance; + } + + public MpdlDocOperation doOperation(MpdlDocOperation docOperation) throws ApplicationException { + jobOrderId++; + docOperation.setOrderId(jobOrderId); + queueOperation(docOperation); + scheduleNextOperation(); + return docOperation; + } + + public void finishOperation(MpdlDocOperation docOperation) throws ApplicationException { + operationInProgress = false; + Date now = new Date(); + docOperation.setEnd(now); + docOperation.setStatus("finished"); + int jobId = new Integer(docOperation.getOrderId()); + finishedDocOperations.put(jobId, docOperation); + log(docOperation); + // schedule next job if there is one + scheduleNextOperation(); + } + + private void log(MpdlDocOperation docOperation) { + Date startTime = docOperation.getStart(); + Date endTime = docOperation.getEnd(); + long executionTime = -1; + if (startTime != null && endTime != null) + executionTime = (endTime.getTime() - startTime.getTime()); + String jobInfo = "MPDL: Document operation " + docOperation.toString() + ": started at: " + startTime + + " and ended at: " + endTime + " (needed time: " + executionTime + " ms)"; + LOGGER.info(jobInfo); + } + + public synchronized void scheduleNextOperation() throws ApplicationException { + if (isOperationInProgress()) { + // nothing, operation has to wait + } else { + MpdlDocOperation docOperation = docOperationQueue.poll(); + if (docOperation == null) { + // if queue is empty then do nothing (there are no more operations to execute) + } else { + Date now = new 
Date(); + operationInProgress = true; + docOperation.setStart(now); + scheduleJob(docOperation, now); + } + } + } + + public ArrayList getDocOperations() throws ApplicationException { + ArrayList docOperations = new ArrayList(); + try { + // first: all finished jobs + Collection finiDocOperations = finishedDocOperations.values(); + docOperations.addAll(finiDocOperations); + // second: all currently executed jobs + if (operationInProgress) { + List currentJobs = (List) scheduler.getCurrentlyExecutingJobs(); + Iterator iter = currentJobs.iterator(); + while (iter.hasNext()) { + JobExecutionContext jobExecutionContext = iter.next(); + MpdlDocOperation docOperation = getDocOperation(jobExecutionContext); + if (docOperation != null) { + docOperations.add(docOperation); + } + } + } + // third: all queued jobs + Iterator iter = docOperationQueue.iterator(); + while (iter.hasNext()) { + MpdlDocOperation docOperation = iter.next(); + docOperations.add(docOperation); + } + } catch (SchedulerException e) { + LOGGER.error(e.getMessage()); + throw new ApplicationException(e); + } + return docOperations; + } + + public MpdlDocOperation getDocOperation(int jobId) throws ApplicationException { + MpdlDocOperation docOperation = null; + try { + // first try: looks into currently executing jobs + if (operationInProgress) { + List currentJobs = (List) scheduler.getCurrentlyExecutingJobs(); + Iterator iter = currentJobs.iterator(); + while (iter.hasNext()) { + JobExecutionContext jobExecutionContext = iter.next(); + docOperation = getDocOperation(jobExecutionContext); + if (docOperation != null) { + int dopOpJobId = docOperation.getOrderId(); + if (jobId == dopOpJobId) + return docOperation; + } + } + } + // second try: look into finished jobs + docOperation = finishedDocOperations.get(new Integer(jobId)); + if (docOperation != null) { + return docOperation; + } + // third try: look into queued jobs + Iterator iter = docOperationQueue.iterator(); + while (iter.hasNext()) { + 
docOperation = iter.next(); + if (docOperation.getOrderId() == jobId) + return docOperation; + } + } catch (SchedulerException e) { + LOGGER.error(e.getMessage()); + throw new ApplicationException(e); + } + // if not found return null + return null; + } + + public MpdlDocOperation getDocOperation(JobExecutionContext jobExecutionContext) { + MpdlDocOperation docOperation = null; + if (jobExecutionContext != null) { + JobDetail job = jobExecutionContext.getJobDetail(); + JobDataMap parameters = job.getJobDataMap(); + docOperation = (MpdlDocOperation) parameters.get("operation"); + } + return docOperation; + } + + private void queueOperation(MpdlDocOperation docOperation) { + int operationsBefore = docOperationQueue.size(); + if (operationsBefore == 0) + docOperation.setStatus("waiting in operation queue"); + else + docOperation.setStatus("waiting in operation queue: " + operationsBefore + " operations heve to be executed before this operation"); + docOperationQueue.offer(docOperation); + } + + private synchronized boolean isOperationInProgress() { + return operationInProgress; + } + + private void scheduleJob(MpdlDocOperation docOperation, Date fireTime) throws ApplicationException { + try { + int jobId = docOperation.getOrderId(); + String jobName = CRUD_JOB + "-id-" + jobId + "-timeId-" + fireTime; + JobDetail job = new JobDetail(jobName, CRUD_GROUP, MpdlDocJob.class); + JobDataMap parameters = new JobDataMap(); + parameters.put("operation", docOperation); + job.setJobDataMap(parameters); + job.addJobListener(jobListener.getName()); + String triggerName = CRUD_TRIGGER + "-id-" + jobId + "-timeId-" + fireTime; + Trigger trigger = new SimpleTrigger(triggerName, CRUD_GROUP, fireTime); + scheduler.scheduleJob(job, trigger); + String jobInfo = "MPDL: Schedule document operation: " + docOperation.toString() + ": done at: " + fireTime.toString(); + LOGGER.info(jobInfo); + } catch (SchedulerException e) { + LOGGER.error(e.getMessage()); + throw new ApplicationException(e); 
+ } + } + + private void init() throws ApplicationException { + try { + if (scheduler == null) { + String quartzPath = getQuartzPath(); + StdSchedulerFactory schedulerFactory = new StdSchedulerFactory(quartzPath); + scheduler = schedulerFactory.getScheduler(); + jobListener = new MpdlChainSchedulerListener(); + scheduler.addJobListener(jobListener); + scheduler.start(); + LOGGER.info("MPDL: Started Quartz scheduler factory: " + quartzPath); + } + } catch (SchedulerException e) { + LOGGER.error(e.getMessage()); + throw new ApplicationException(e); + } + } + + public void end() throws ApplicationException { + try { + if (scheduler != null) { + scheduler.shutdown(); + } + String quartzPath = getQuartzPath(); + LOGGER.info("MPDL: Ended Quartz scheduler factory: " + quartzPath); + } catch (SchedulerException e) { + LOGGER.error(e.getMessage()); + throw new ApplicationException(e); + } + } + + private String getQuartzPath() { + URL quartzUrl = MpdlChainScheduler.class.getResource("quartz.properties"); + String quartzPath = quartzUrl.getPath(); + if (quartzPath.indexOf(".jar!") != -1) { + int beginIndex = quartzPath.indexOf(".jar!") + 6; + quartzPath = quartzPath.substring(beginIndex); + } + return quartzPath; + } +} \ No newline at end of file diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/schedule/MpdlChainSchedulerListener.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/schedule/MpdlChainSchedulerListener.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,53 @@ +package de.mpg.mpiwg.berlin.mpdl.schedule; + +import org.apache.log4j.Logger; +import org.quartz.JobDataMap; +import org.quartz.JobDetail; +import org.quartz.JobExecutionContext; +import org.quartz.JobExecutionException; +import org.quartz.JobListener; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class MpdlChainSchedulerListener implements JobListener { + private static 
Logger LOGGER = Logger.getLogger(MpdlChainSchedulerListener.class); // Logs to EXIST_HOME/webapp/WEB-INF/logs/exist.log + + public String getName() { + return "MpdlJobChainingListener"; + } + + public void jobToBeExecuted(JobExecutionContext inContext) { + } + + public void jobExecutionVetoed(JobExecutionContext inContext) { + String message = "MPDL: JobChainingListener: Job execution was vetoed."; + LOGGER.debug(message); + } + + public void jobWasExecuted(JobExecutionContext inContext, JobExecutionException inException) { + // after finishing his job it tries to schedule the next operation (if there is one in the queue) + MpdlDocOperation docOperation = null; + try { + MpdlChainScheduler mpdlChainScheduler = MpdlChainScheduler.getInstance(); + docOperation = getDocOperation(inContext); + mpdlChainScheduler.finishOperation(docOperation); + } catch (ApplicationException e) { + if (docOperation != null) { + docOperation.setErrorMessage(e.getMessage()); + } + LOGGER.error(e.getMessage()); + } + } + + private MpdlDocOperation getDocOperation(JobExecutionContext context) { + MpdlDocOperation docOperation = null; + if (context != null) { + JobDetail job = context.getJobDetail(); + JobDataMap parameters = job.getJobDataMap(); + docOperation = (MpdlDocOperation) parameters.get("operation"); + } + return docOperation; + } + + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/schedule/MpdlDocJob.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/schedule/MpdlDocJob.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,94 @@ +package de.mpg.mpiwg.berlin.mpdl.schedule; + +import java.util.Date; + +import org.apache.log4j.Logger; +import org.quartz.Job; +import org.quartz.JobDataMap; +import org.quartz.JobDetail; +import org.quartz.JobExecutionContext; +import org.quartz.JobExecutionException; + +import de.mpg.mpiwg.berlin.mpdl.client.DocumentHandler; +import 
de.mpg.mpiwg.berlin.mpdl.escidoc.ESciDocIngestor; +import de.mpg.mpiwg.berlin.mpdl.escidoc.ESciDocRestSession; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.xmlrpc.MpdlXmlRpcDocHandler; + +public class MpdlDocJob implements Job { + public static String STATUS_BEGIN = "started"; + private static Logger LOGGER = Logger.getLogger(MpdlDocJob.class); // Logs to EXIST_HOME/webapp/WEB-INF/logs/exist.log + private JobExecutionContext currentExecutedContext; + + public void execute(JobExecutionContext context) throws JobExecutionException { + this.currentExecutedContext = context; + MpdlDocOperation docOperation = getDocOperation(); + docOperation.setIncludePdf(true); // default is true: handle also Pdf/Html version of the document + try { + docOperation.setStatus(STATUS_BEGIN); + String operationName = docOperation.getName(); + String cookieId = docOperation.getESciDocCookieId(); + MpdlXmlRpcDocHandler mpdlXmlRpcDocHandler = MpdlXmlRpcDocHandler.getInstance(); + ESciDocRestSession eSciDocSession = ESciDocRestSession.getInstance(cookieId); + ESciDocIngestor eSciDocIngestor = new ESciDocIngestor(eSciDocSession); + if (operationName.equals("create") || operationName.equals("update")) { + DocumentHandler docHandler = new DocumentHandler(mpdlXmlRpcDocHandler, eSciDocIngestor); + docHandler.doOperation(docOperation); + } else if (operationName.equals("delete")) { + DocumentHandler docHandler = new DocumentHandler(mpdlXmlRpcDocHandler, eSciDocIngestor); + docHandler.doOperation(docOperation); + } else if (operationName.equals("updateExist")) { + DocumentHandler docHandler = new DocumentHandler(mpdlXmlRpcDocHandler); + docHandler.doOperation(docOperation); + } else if (operationName.equals("deleteExist")) { + DocumentHandler docHandler = new DocumentHandler(mpdlXmlRpcDocHandler); + docHandler.doOperation(docOperation); + } else if (operationName.equals("importAllDocumentsLocallyExist")) { + DocumentHandler docHandler = new 
DocumentHandler(mpdlXmlRpcDocHandler); + docOperation.setIncludePdf(false); // for performance reasons while importing documents: do not generate Pdf/Html-Versions of the document + docHandler.doOperation(docOperation); + } else if (operationName.equals("generatePdfHtmlDocumentFiles")) { + DocumentHandler docHandler = new DocumentHandler(mpdlXmlRpcDocHandler); + docOperation.setIncludePdf(true); + docHandler.doOperation(docOperation); + } + Date startingTime = docOperation.getStart(); + String jobInfo = "MPDL: Document operation " + docOperation.toString() + ": started at: " + startingTime; + LOGGER.info(jobInfo); + this.currentExecutedContext = null; + } catch (Exception e) { + try { + // Quartz will automatically unschedule all triggers associated with this job so that it does not run again + MpdlChainScheduler mpdlChainScheduler = MpdlChainScheduler.getInstance(); + mpdlChainScheduler.finishOperation(docOperation); + String errorMessage = e.getMessage(); + if (errorMessage == null) { + Throwable t = e.getCause(); + if (t == null) { + errorMessage = e.toString(); + } else { + errorMessage = t.getMessage(); + } + } + docOperation.setErrorMessage(errorMessage); + LOGGER.error(errorMessage, e); + JobExecutionException jobExecutionException = new JobExecutionException(e); + jobExecutionException.setUnscheduleAllTriggers(true); + throw jobExecutionException; + } catch (ApplicationException ex) { + // nothing + } + } + } + + private MpdlDocOperation getDocOperation() { + MpdlDocOperation docOperation = null; + if (currentExecutedContext != null) { + JobDetail job = currentExecutedContext.getJobDetail(); + JobDataMap parameters = job.getJobDataMap(); + docOperation = (MpdlDocOperation) parameters.get("operation"); + } + return docOperation; + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/schedule/MpdlDocOperation.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/schedule/MpdlDocOperation.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,195 @@
package de.mpg.mpiwg.berlin.mpdl.schedule;

import java.util.Date;

import org.w3c.dom.Node;

import de.mpg.mpiwg.berlin.mpdl.escidoc.MetadataRecord;

/**
 * Describes a single document operation: its name (e.g. "delete"), the source
 * it is read from, the docBase/language/fileName parts its destination URL is
 * built from, and its status, error message and start/end times.
 * Operations are ordered by their order id (see {@link #compareTo(MpdlDocOperation)}).
 */
public class MpdlDocOperation implements Comparable<MpdlDocOperation> {
  /** Operation name that {@link #toString()} prints without the upload file name. */
  private static final String NAME_DELETE = "delete";
  /** Status value that {@link #isFinished()} tests for. */
  private static final String STATUS_FINISHED = "finished";

  private int id; // order id, see getOrderId/setOrderId
  private Date start;
  private Date end;
  private String name;
  private String status;
  private String errorMessage;
  private String uploadFileName;
  private String srcUrl;
  private String docBase;
  private String language;
  private String fileName;
  private String eSciDocDestUrl;
  private String eSciDocCookieId;
  private MetadataRecord mdRecord;
  private Node docNode;
  private boolean includePdf = false; // default

  /**
   * Creates an operation; order id, status, times and the eSciDoc related
   * values stay unset until the corresponding setters are called.
   *
   * @param name operation name, e.g. "delete"
   * @param srcUrl URL of the source document
   * @param uploadFileName name of the uploaded file
   * @param docBase first path segment of the destination URL
   * @param language second path segment of the destination URL
   * @param fileName last path segment of the destination URL
   */
  public MpdlDocOperation(String name, String srcUrl, String uploadFileName, String docBase, String language, String fileName) {
    this.name = name;
    this.srcUrl = srcUrl;
    this.uploadFileName = uploadFileName;
    this.docBase = docBase;
    this.language = language;
    this.fileName = fileName;
  }

  /**
   * Orders operations by ascending order id.
   *
   * @param op the operation to compare with
   * @return a negative value, zero or a positive value if this order id is
   *         smaller than, equal to or greater than the order id of op
   */
  public int compareTo(MpdlDocOperation op) {
    // plain int comparison: no boxing and no overflow as with a subtraction
    if (id < op.id)
      return -1;
    return (id == op.id) ? 0 : 1;
  }

  /**
   * @return true if the status is "finished"
   */
  public boolean isFinished() {
    return STATUS_FINISHED.equals(status);
  }

  /**
   * @return true if a non empty error message is set
   */
  public boolean isError() {
    return errorMessage != null && errorMessage.length() > 0;
  }

  public int getOrderId() {
    return id;
  }

  public void setOrderId(int orderId) {
    this.id = orderId;
  }

  public String getStatus() {
    return status;
  }

  public void setStatus(String status) {
    this.status = status;
  }

  public Date getStart() {
    return start;
  }

  public void setStart(Date start) {
    this.start = start;
  }

  public Date getEnd() {
    return end;
  }

  public void setEnd(Date end) {
    this.end = end;
  }

  public String getName() {
    return name;
  }

  public void setName(String name) {
    this.name = name;
  }

  public String getErrorMessage() {
    return errorMessage;
  }

  public void setErrorMessage(String errorMessage) {
    this.errorMessage = errorMessage;
  }

  /**
   * @return the destination URL "/docBase/language/fileName"; parts which are
   *         not set appear as the text "null"
   */
  public String getDestUrl() {
    return "/" + docBase + "/" + language + "/" + fileName;
  }

  public String getSrcUrl() {
    return srcUrl;
  }

  public void setSrcUrl(String srcUrl) {
    this.srcUrl = srcUrl;
  }

  public String getUploadFileName() {
    return uploadFileName;
  }

  public void setUploadFileName(String uploadFileName) {
    this.uploadFileName = uploadFileName;
  }

  public String getDocBase() {
    return docBase;
  }

  public void setDocBase(String docBase) {
    this.docBase = docBase;
  }

  public String getLanguage() {
    return language;
  }

  public void setLanguage(String language) {
    this.language = language;
  }

  public String getFileName() {
    return fileName;
  }

  public void setFileName(String fileName) {
    this.fileName = fileName;
  }

  public String getESciDocDestUrl() {
    return eSciDocDestUrl;
  }

  public void setESciDocDestUrl(String sciDocDestUrl) {
    eSciDocDestUrl = sciDocDestUrl;
  }

  public String getESciDocCookieId() {
    return eSciDocCookieId;
  }

  public void setESciDocCookieId(String sciDocCookieId) {
    eSciDocCookieId = sciDocCookieId;
  }

  public MetadataRecord getMdRecord() {
    return mdRecord;
  }

  public void setMdRecord(MetadataRecord mdRecord) {
    this.mdRecord = mdRecord;
  }

  public Node getDocNode() {
    return docNode;
  }

  public void setDocNode(Node docNode) {
    this.docNode = docNode;
  }

  /**
   * @return true if a PDF should be included (false by default)
   */
  public boolean includePdf() {
    return includePdf;
  }

  public void setIncludePdf (boolean includePdf) {
    this.includePdf = includePdf;
  }

  /**
   * @return "name(id, destUrl)" for a delete operation, otherwise
   *         "name(id, uploadFileName, destUrl)"; a missing name is printed as
   *         "null" instead of raising a NullPointerException
   */
  public String toString() {
    if (NAME_DELETE.equals(name))
      return name + "(" + id + ", " + getDestUrl() + ")";
    return name + "(" + id + ", " + uploadFileName + ", " + getDestUrl() + ")";
  }

}
diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/schedule/quartz.properties --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/schedule/quartz.properties Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,9 @@ +org.quartz.scheduler.instanceName = MpdlScheduler +org.quartz.scheduler.instanceId = auto +org.quartz.scheduler.rmi.export = false +org.quartz.scheduler.rmi.proxy = false + +org.quartz.threadPool.class = org.quartz.simpl.SimpleThreadPool +org.quartz.threadPool.threadCount = 3 + +org.quartz.jobStore.class = org.quartz.simpl.RAMJobStore diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/FileUtil.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/FileUtil.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,424 @@ +package de.mpg.mpiwg.berlin.mpdl.util; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FilenameFilter; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.RandomAccessFile; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URL; +import java.net.URLConnection; +import java.util.ArrayList; +import java.util.List; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class FileUtil { + private static FileUtil instance; + + public static FileUtil getInstance() throws ApplicationException { + if (instance == null) { + instance = new FileUtil(); + } + return instance; + } + + public void createDirectory(String dir) { + File destLocalDirectory = new File(dir); + 
destLocalDirectory.mkdirs(); + } + + public void deleteDirectory(String dir) { + File destLocalDirectory = new File(dir); + // directory with all files and subdirectories is deleted + deleteDirectory(destLocalDirectory); + } + + /** + * Deletes all files and subdirectories under dir. No exception is thrown + * if dir (or one of its children) does nor exist. + * @dir dir the directory to be deleted + * @return true if all deletions were successful. If a deletion fails, the method stops attempting to delete and returns false. + */ + public boolean deleteDirectory(File dir) { + if (dir.isDirectory()) { + String[] children = dir.list(); + for (int i=0; i 0) { + out.write(buf, 0, len); + } + out.flush(); // buffered content is flushed to file + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } finally { + try { + if (in != null) + in.close(); + if (out != null) + out.close(); + } catch (Exception e) { + // nothing: always close the stream at the end of the method + } + } + } + + public void saveUrlToLocalFile(URL srcUrl, String destFileName) throws ApplicationException { + BufferedInputStream in = null; + BufferedOutputStream out = null; + try { + /* wenn ein Zugriff mit "http:" gemacht wird, wird die XML Deklaration () nicht ausgelesen + * beim Zugriff mit "file;" ist das anders + * evtl. 
wieder einbauen, um die Deklaration manuell zu schreiben + URLConnection urlConn = srcUrl.openConnection(); + String contentTypeStr = urlConn.getContentType(); + String contentEncodingStr = urlConn.getContentEncoding(); + boolean contentTypeXml = false; + if (contentTypeStr != null) { + contentTypeStr = contentTypeStr.toLowerCase(); + if (contentTypeStr.indexOf("application/xml") != -1 || contentTypeStr.indexOf("text/xml") != -1) + contentTypeXml = true; + } + */ + InputStream inputStream = srcUrl.openStream(); + in = new BufferedInputStream(inputStream); + File outputFile = new File(destFileName); + File outputDir = new File(outputFile.getParent()); + if (! outputDir.exists()) { + outputDir.mkdirs(); // create the directory including parent directories which do not exist + } + out = new BufferedOutputStream(new FileOutputStream(outputFile)); + int bufLen = 1000*1024; + byte[] buf = new byte[bufLen]; + int len = 0; + /* + if (contentTypeXml) { + String xmlDecl = "\n"; + out.write(xmlDecl.getBytes("utf-8")); + } + */ + while ((len = in.read(buf)) > 0) { + out.write(buf, 0, len); + out.flush(); + } + } catch (MalformedURLException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } finally { + try { + if (in != null) + in.close(); + if (out != null) + out.close(); + } catch (Exception e) { + // nothing: always close the stream at the end of the method + } + } + } + + public void saveInputStreamToLocalFile(InputStream srcInputStream, String destFileName) throws ApplicationException { + BufferedInputStream in = null; + BufferedOutputStream out = null; + try { + in = new BufferedInputStream(srcInputStream); + File outputFile = new File(destFileName); + File outputDir = new File(outputFile.getParent()); + if (! 
outputDir.exists()) { + outputDir.mkdirs(); // create the directory including parent directories which do not exist + } + out = new BufferedOutputStream(new FileOutputStream(outputFile)); + int bufLen = 1000*1024; + byte[] buf = new byte[bufLen]; + int len = 0; + while ((len = in.read(buf)) > 0) { + out.write(buf, 0, len); + out.flush(); + } + } catch (MalformedURLException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } finally { + try { + if (in != null) + in.close(); + if (out != null) + out.close(); + } catch (Exception e) { + // nothing: always close the stream at the end of the method + } + } + } + + public void deleteLastNBytes(File file, int countBytes) throws ApplicationException { + try { + RandomAccessFile raf = new RandomAccessFile(file, "rw"); + long length = raf.length(); + raf.setLength(length - countBytes); + raf.close(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + public void testFile(String fileName) throws ApplicationException { + File file = new File(fileName); + boolean fileExists = file.exists(); + if (! fileExists) { + throw new ApplicationException("File: " + fileName + " does not exist"); + } + } + + /** + * Reads a chunk of data of an input stream. 
+ * Does not close the stream until last bytes are read + * @in in the input stream to be read + * @chunkSize chunkSize length of the chunk which is read + * @return byte[] of bytes read + */ + public byte[] readBytes(InputStream in, int chunkSize) throws ApplicationException { + byte[] resultBytes = new byte[chunkSize]; + try { + int len = in.read(resultBytes, 0, chunkSize); + if (len == -1) { + try { in.close(); } catch (Exception e) { } // close the stream if end of file is reached + resultBytes = null; + } else if (len < chunkSize && len != chunkSize) { // if read chunk is last chunk of the file it delivers this chunk + byte[] tmp = new byte[len]; + System.arraycopy(resultBytes, 0, tmp, 0, len); + resultBytes = tmp; + } + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return resultBytes; + } + + /** + * Reads a file storing intermediate data into an array. + * @file file the file to be read + * @return byte[] of file content + */ + public byte[] readBytes(String fileName) throws ApplicationException { + InputStream in = null; + byte[] out = new byte[0]; + try { + in = new BufferedInputStream(new FileInputStream(fileName)); + // the length of a buffer can vary + int bufLen = 20000*1024; + byte[] buf = new byte[bufLen]; + byte[] tmp = null; + int len = 0; + while((len = in.read(buf, 0, bufLen)) != -1) { + // extend array + tmp = new byte[out.length + len]; + System.arraycopy(out, 0, tmp, 0, out.length); + System.arraycopy(buf, 0, tmp, out.length, len); + out = tmp; + tmp = null; + } + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } finally { + // always close the stream + if (in != null) try { in.close(); } catch (Exception e) { } + } + return out; + } + + public String getMimeType(String fileName) throws ApplicationException { + String mimeType = null; + File file = new 
File(fileName); + try { + URI uri = file.toURI(); + URL url = uri.toURL(); + URLConnection urlConnection = url.openConnection(); + mimeType = urlConnection.getContentType(); + } catch (MalformedURLException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return mimeType; + } + + /** + * Reads a file storing intermediate data into an array. + * @file file the file to be read + * @return byte array of the file content + * TODO test this method if it is really faster + */ + private byte[] readBytesFast(String file) throws ApplicationException { + InputStream in = null; + byte[] buf = null; + int bufLen = 20000*1024; + try { + in = new BufferedInputStream(new FileInputStream(file)); + buf = new byte[bufLen]; + byte[] tmp = null; + int len = 0; + List data = new ArrayList(24); // keeps pieces of data + while((len = in.read(buf, 0, bufLen)) != -1){ + tmp = new byte[len]; + System.arraycopy(buf, 0, tmp, 0, len); // still need to do copy + data.add(tmp); + } + /* This part os optional. This method could return a List data + for further processing, etc. 
*/ + len = 0; + if (data.size() == 1) return (byte[]) data.get(0); + for (int i=0;i fontFileNames; + + public static MpdlITextRenderer getInstance() throws ApplicationException { + if (instance == null) { + instance = new MpdlITextRenderer(); + instance.init(); + } + return instance; + } + + public void init() throws ApplicationException { + renderer = new ITextRenderer(); + SharedContext rendererSharedContext = renderer.getSharedContext(); + MpdlITextUserAgent mpdlUserAgent = new MpdlITextUserAgent(); // user agent to get a callback handle to the web access of images (getImageResource(url)) + mpdlUserAgent.setSharedContext(rendererSharedContext); + rendererSharedContext.setUserAgentCallback(mpdlUserAgent); + fontFileNames = new Hashtable(); + String fontJunicodeFileName = MpdlConstants.MPDL_EXIST_DATA_DIR + "/fonts/Junicode-Regular.ttf"; + String fontJunicodeBoldFileName = MpdlConstants.MPDL_EXIST_DATA_DIR + "/fonts/Junicode-Bold.ttf"; + String fontJunicodeItalicFileName = MpdlConstants.MPDL_EXIST_DATA_DIR + "/fonts/Junicode-Italic.ttf"; + String fontJunicodeBoldItalicFileName = MpdlConstants.MPDL_EXIST_DATA_DIR + "/fonts/Junicode-BoldItalic.ttf"; + String fontSunExtAFileName = MpdlConstants.MPDL_EXIST_DATA_DIR + "/fonts/Sun-ExtA.ttf"; // chinese symbols + String fontSunExtBFileName = MpdlConstants.MPDL_EXIST_DATA_DIR + "/fonts/Sun-ExtB.ttf"; // chinese symbols + String fontDejaVuFileName = MpdlConstants.MPDL_EXIST_DATA_DIR + "/fonts/DejaVuSans.ttf"; // arabic symbols + setFont(fontJunicodeFileName); + setFont(fontJunicodeBoldFileName); + setFont(fontJunicodeItalicFileName); + setFont(fontJunicodeBoldItalicFileName); // if set then some not bold italic characters are shown bold (e.g. 
in Benedetti_1585.xml) + setFont(fontSunExtAFileName); + setFont(fontSunExtBFileName); + setFont(fontDejaVuFileName); + } + + public byte[] createPdf(String htmlPageFragment, String language, String topLeftStr, String topRightStr, String bottomLeftStr, String bottomRightStr) throws ApplicationException { + byte[] pdfBytes = null; + try { + String htmlPageDoc = getPageHtmlDoc(htmlPageFragment, language, topLeftStr, topRightStr, bottomLeftStr, bottomRightStr); + renderer.setDocumentFromString(htmlPageDoc); + renderer.layout(); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + renderer.createPDF(baos); + pdfBytes = baos.toByteArray(); + baos.close(); + } catch (Exception e) { + init(); + String message = e.getMessage(); + if (message.indexOf("nausikaa") > 0 && message.indexOf("500") > 0) { + throw new ApplicationException("Could not fetch image from nausikaa2.rz-berlin.mpg.de: please try again later"); + } + throw new ApplicationException(e); + } + return pdfBytes; + } + + public void createFile(boolean pdf, boolean html, String mode, MetadataRecord mdRecord) throws ApplicationException { + OutputStream osPdf = null; + OutputStream osHtml = null; + OutputStream osHtmlPdf = null; + String eXistIdentifier = mdRecord.getEXistIdentifier(); + String language = mdRecord.getLanguage(); + if (eXistIdentifier == null) + throw new ApplicationException("Pdf/Html-Generation failed: no eXist-Identifier given in mdRecord"); + String eXistIdentifierWithoutExtension = eXistIdentifier.substring(0, eXistIdentifier.length() - 4); // without ".xml" + String destFileNamePdf = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + ".pdf"; + String destFileNameHtml = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + ".html"; + String destFileNameHtmlPdfTmp = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + "-4Pdf.html"; + try { + // start document + if (pdf) { + osPdf = new 
FileOutputStream(new File(destFileNamePdf)); + osHtmlPdf = new FileOutputStream(new File(destFileNameHtmlPdfTmp)); + } + if (html) + osHtml = new FileOutputStream(new File(destFileNameHtml)); + int countPages = httpClientGetCountPages(mdRecord); + // style page + String pageStyleHtml = "float:left; clear:both; border: thin solid #808080; width: 21.0cm; margin-top: 0.2cm; margin-bottom: 1cm; margin-left: 0.7cm; margin-right: 0.7cm; padding: 0.2cm;"; + // firstPage + String firstPageHtmlShort = getFirstPageHtml(mdRecord, true); + String firstPageHtmlLong = getFirstPageHtml(mdRecord, false); + String mdRecordStr = getMdRecordString(mdRecord); + String htmlHeadStr = getHtmlHead(null, mdRecordStr); + String fontStyle = getFontStyle(language); + if(pdf) { + write("" + htmlHeadStr + "", osHtmlPdf); + // first page + write(firstPageHtmlLong, osHtmlPdf); + } + if (html) { + write("" + htmlHeadStr + "", osHtml); + // first page + write("
    ", osHtml); + write(firstPageHtmlShort, osHtml); + write("
    ", osHtml); + } + // table of content of document + String htmlToc = getTocHtml(mdRecord); + if (html && htmlToc != null) { + write("
    ", osHtml); + write(htmlToc, osHtml); + write("
    ", osHtml); + } + if(pdf && htmlToc != null) { + write(htmlToc, osHtmlPdf); + } + // all pages of the document + for(int i=1; i<=countPages; i++) { + String htmlPageFragment = httpClientGetPageFragmentHtml(eXistIdentifier, mode, i); + htmlPageFragment = removeXmlStartString(htmlPageFragment); + String pnHrefName = ""; + if (html) { + write("
    ", osHtml); + write(pnHrefName, osHtml); + write("
    ", osHtml); + write("
    ", osHtml); + String htmlPageFragmentWithImageUrl = htmlPageFragment.replaceAll("src=\"images/", "src=\"http://" + MpdlConstants.MPDL_FULL_EXIST_HOST_NAME + "/mpdl/images/"); // to find the camera.png file on webserver mpdl-proto + write(htmlPageFragmentWithImageUrl, osHtml); + write("
    ", osHtml); + } + htmlPageFragment = pnHrefName + htmlPageFragment; + if(pdf) { + String htmlPageFragmentWithImageDir = htmlPageFragment.replaceAll("src=\"images/", "src=\"../../../../../mpdl/images/"); // to find the camera.png file in webbapp/mpdl/image/ directory + write(htmlPageFragmentWithImageDir, osHtmlPdf); + } + } + if (html) { + write("", osHtml); + } + // create PDF document + if(pdf) { + write("", osHtmlPdf); + osHtmlPdf.close(); + renderer.setDocument(new File(destFileNameHtmlPdfTmp)); + renderer.layout(); // takes the most time + renderer.createPDF(osPdf); + } + } catch (Exception e) { + init(); + String message = e.getMessage(); + if (message != null && message.indexOf("nausikaa") > 0 && message.indexOf("500") > 0) { + throw new ApplicationException("fetch image is not possible: " + message); + } + throw new ApplicationException(e); + } finally { + try { + osHtmlPdf.close(); + osPdf.close(); + osHtml.close(); + FileUtil.getInstance().deleteFile(destFileNameHtmlPdfTmp); + } catch (IOException e) { + // nothing + } + } + } + + private String getFirstPageHtml(MetadataRecord mdRecord, boolean shortPage) { + String author = mdRecord.getCreator(); + String title = mdRecord.getTitle(); + String year = mdRecord.getYear(); + String existId = mdRecord.getEXistIdentifier(); + String firstPageHtml = "
    "; + firstPageHtml = firstPageHtml + "

    " + "Max Planck Institute for the History of Science" + "

    "; + firstPageHtml = firstPageHtml + "

    " + "Max-Planck-Institut für Wissenschaftsgeschichte" + "

    "; + firstPageHtml = firstPageHtml + "

    " + "MPDL project" + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + if (! shortPage) { + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + } + if (author != null) { + firstPageHtml = firstPageHtml + "

    " + author + "

    "; + } + if (title != null) { + firstPageHtml = firstPageHtml + "

    " + title + "

    "; + } + if (year != null) { + firstPageHtml = firstPageHtml + "

    " + year + "

    "; + } + if (! shortPage) { + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + } + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + firstPageHtml = firstPageHtml + "

    "; + String urlDocuView = "http://" + MpdlConstants.MPDL_FULL_EXIST_HOST_NAME + "/mpdl/interface/echo/echoDocuView.xql"; + String document = "?document=" + existId; + String urlDoc = urlDocuView + document; + firstPageHtml = firstPageHtml + "

    Document link:

    " + urlDocuView + "

    " + document + "

    "; + firstPageHtml = firstPageHtml + "
    "; + return firstPageHtml; + } + + private String getTocHtml(MetadataRecord mdRecord) throws ApplicationException { + String htmlStr = null; + String eXistIdentifier = mdRecord.getEXistIdentifier(); + String htmlToc = httpClientGetContentListHtml(eXistIdentifier); + String resultSizeStr = XmlUtil.getInstance().evaluateToString(htmlToc, "//div[@class = 'queryResultHits']", null); + int resultSize = 0; + if (resultSizeStr != null) + resultSize = Integer.parseInt(resultSizeStr); + if (resultSize <= 0) + return null; + if (htmlToc != null) { + htmlToc = removeXmlStartString(htmlToc); + htmlToc = htmlToc.replaceAll("page-fragment\\.xql.*pn=", "#pn"); + htmlToc = htmlToc.replaceAll(">Page: ", ">"); + htmlToc = "Content" + htmlToc; + htmlStr = "
    "; + htmlStr = htmlStr + htmlToc; + htmlStr = htmlStr + "
    "; + } + return htmlStr; + } + + private String getPageHtmlDoc(String htmlFragment, String language, String topLeftStr, String topRightStr, String bottomLeftStr, String bottomRightStr) { + String fontStyle = getFontStyle(language); + String stylePage = getStylePage(topLeftStr, topRightStr, bottomLeftStr, bottomRightStr); + String htmlStr = ""; + String htmlHeadStr = getHtmlHead(stylePage, topLeftStr); + htmlStr = htmlStr + htmlHeadStr; + htmlStr = htmlStr + ""; + htmlStr = htmlStr + htmlFragment; + htmlStr = htmlStr + ""; + htmlStr = htmlStr + ""; + return htmlStr; + } + + private String getMdRecordString(MetadataRecord mdRecord) { + String author = mdRecord.getCreator(); + String title = mdRecord.getTitle(); + String year = mdRecord.getYear(); + String mdRecordStr = ""; + if (mdRecord != null) { + if (author != null && ! author.equals("")) + mdRecordStr = mdRecordStr + author; + if (title != null && ! title.equals("")) + mdRecordStr = mdRecordStr + ". " + title; + if (year != null && ! year.equals("")) + mdRecordStr = mdRecordStr + ". 
" + year + "."; + else + mdRecordStr = mdRecordStr + "."; + } + return mdRecordStr; + } + + private String getHtmlHead(String stylePageStr, String titleStr) { + String htmlStr = ""; + if (stylePageStr != null) + htmlStr = htmlStr + ""; + htmlStr = htmlStr + "" + titleStr + ""; + String httpExistHostName = "http" + "://" + MpdlConstants.MPDL_FULL_EXIST_HOST_NAME; + htmlStr = htmlStr + ""; + htmlStr = htmlStr + ""; + return htmlStr; + } + + private String removeXmlStartString(String inputStr) { + String xmlStartStr = ""; + boolean startsWithXmlStartStr = inputStr.startsWith(xmlStartStr); + if (startsWithXmlStartStr) { + int xmlStartStrLength = xmlStartStr.length(); + int xmlStartStrIndex = -1; + xmlStartStrIndex = inputStr.indexOf(xmlStartStr); + if (xmlStartStrIndex != -1) + inputStr = inputStr.substring(xmlStartStrLength); + } + return inputStr; + } + + private String getFontStyle(String language) { + String fontFamily = "Junicode"; + if (language.equals("ar")) + fontFamily = "DejaVu Sans"; + else if (language.equals("zh") || language.equals("zho-Hant")) + fontFamily = "Sun-ExtA, Sun-ExtB"; + return "font-size:11pt; font-family:" + fontFamily + ";"; + } + + private String getStylePage(String topLeftStr, String topRightStr, String bottomLeftStr, String bottomRightStr) { + String fontStylePage = "8pt, sans-serif; "; + String stylePage = "@page {" + "size: A4;" + "margin-top: 1.5cm;" + "margin-bottom: 1cm;" + "margin-left: 0.7cm;" + "margin-right: 0.7cm;" + "border: thin solid #808080;" + "padding: 0.2cm;" + " font-size: 10px;" + + " @top-left { font: " + fontStylePage + " padding-left: 0.2cm; padding-right: 1cm; font-weight:bold; content: " + topLeftStr + ";}" + + " @top-right { font: " + fontStylePage + " white-space: nowrap; font-weight:bold; content: " + topRightStr + ";}" + + " @bottom-left { font: " + fontStylePage + " white-space: nowrap; font-weight:bold; content: " + bottomLeftStr + ";}" + + " @bottom-right { font: " + fontStylePage + " white-space: nowrap; 
font-weight:bold; content: " + bottomRightStr + ";}" + "}"; + return stylePage; + } + + private String httpClientGetPageFragmentHtml(String docName, String mode, int pageNumber) throws ApplicationException { + String retPageFragment = null; + try { + HttpClient httpClient = new HttpClient(); + String requestName = "/mpdl/interface/page-fragment.xql?document=" + docName + "&mode=" + mode + "&pn=" + pageNumber + "&characterNormalization=orig"; + String urlStr = "http" + "://" + MpdlConstants.MPDL_EXIST_HOST_NAME + ":" + MpdlConstants.MPDL_EXIST_PORT + requestName; + GetMethod method = new GetMethod(urlStr); + httpClient.executeMethod(method); + byte[] responseBody = method.getResponseBody(); + retPageFragment = new String(responseBody, "utf-8"); + method.releaseConnection(); + } catch (HttpException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return retPageFragment; + } + + private String httpClientGetContentListHtml(String docName) throws ApplicationException { + String retHtmlFragment = null; + try { + HttpClient httpClient = new HttpClient(); + String requestName = "/mpdl/interface/doc-query.xql?document=" + docName + "&queryType=toc&queryResultPageSize=10000"; + String urlStr = "http" + "://" + MpdlConstants.MPDL_EXIST_HOST_NAME + ":" + MpdlConstants.MPDL_EXIST_PORT + requestName; + GetMethod method = new GetMethod(urlStr); + httpClient.executeMethod(method); + byte[] responseBody = method.getResponseBody(); + retHtmlFragment = new String(responseBody, "utf-8"); + method.releaseConnection(); + } catch (HttpException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return retHtmlFragment; + } + + private int httpClientGetCountPages(MetadataRecord mdRecord) throws ApplicationException { + int count = -1; + String docName = mdRecord.getEXistIdentifier(); + String docBase = mdRecord.getDocBase(); + String pbTag = "echo:pb"; + if 
(docBase != null && docBase.equals("archimedes")) + pbTag = "pb"; + try { + HttpClient httpClient = new HttpClient(); + String requestName = "/mpdl/interface/xquery.xql?document=" + docName + "&xquery=count(//" + pbTag + ")"; + String urlStr = "http" + "://" + MpdlConstants.MPDL_EXIST_HOST_NAME + ":" + MpdlConstants.MPDL_EXIST_PORT + requestName; + GetMethod method = new GetMethod(urlStr); + httpClient.executeMethod(method); + byte[] responseBody = method.getResponseBody(); + String xmlResult = new String(responseBody, "utf-8"); + method.releaseConnection(); + if (xmlResult != null && ! xmlResult.equals("")) { + XmlUtil xmlUtil = XmlUtil.getInstance(); + String countPagesStr = xmlUtil.evaluateToString(xmlResult, "/result/queryResult/records/record/content", null); + count = Integer.parseInt(countPagesStr); + } + if (count == 0) + count = 1; // if no pb tag found then document consists of one page + } catch (HttpException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return count; + } + + private void write(String str, OutputStream out) throws ApplicationException { + try { + byte[] bytes = str.getBytes("utf-8"); + out.write(bytes, 0, bytes.length); + out.flush(); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void setFont(String fontFileName) throws ApplicationException { + try { + String existingFontFileName = fontFileNames.get(fontFileName); + if (existingFontFileName == null) { + fontFileNames.put(fontFileName, fontFileName); + ITextFontResolver fontResolver = renderer.getFontResolver(); + fontResolver.addFont(fontFileName, BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED); // Identy_H is Unicode Horizontal; not_embedded means not embedded in the PDF doc + } + } catch (XRRuntimeException e) { + init(); + String 
message = e.getMessage(); + if (message.indexOf("nausikaa") > 0 && message.indexOf("500") > 0) { + throw new ApplicationException("Could not fetch image from nausikaa2.rz-berlin.mpg.de: please try again later"); + } + throw new ApplicationException(e); + } catch (IOException e) { + init(); + String message = e.getMessage(); + if (message.indexOf("nausikaa") > 0 && message.indexOf("500") > 0) { + throw new ApplicationException("fetch image is not possible: " + message); + } + throw new ApplicationException(e); + } catch (DocumentException e) { + init(); + String message = e.getMessage(); + if (message.indexOf("nausikaa") > 0 && message.indexOf("500") > 0) { + throw new ApplicationException("fetch image is not possible: " + message); + } + throw new ApplicationException(e); + } + } + + // old method: each page is set as an own html page + public void createFileOld(boolean pdf, boolean html, String mode, MetadataRecord mdRecord) throws ApplicationException { + OutputStream osPdf = null; + OutputStream osHtml = null; + OutputStream osHtmlPdf = null; + String eXistIdentifier = mdRecord.getEXistIdentifier(); + String language = mdRecord.getLanguage(); + if (eXistIdentifier == null) + throw new ApplicationException("Pdf/Html-Generation failed: no eXist-Identifier given in mdRecord"); + String eXistIdentifierWithoutExtension = eXistIdentifier.substring(0, eXistIdentifier.length() - 4); // without ".xml" + String destFileNamePdf = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + ".pdf"; + String destFileNameHtml = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + ".html"; + String destFileNameHtmlPdfTmp = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + "-4Pdf.html"; + try { + // start document + if (pdf) { + osPdf = new FileOutputStream(new File(destFileNamePdf)); + osHtmlPdf = new FileOutputStream(new File(destFileNameHtmlPdfTmp)); + } + if (html) + osHtml = new 
FileOutputStream(new File(destFileNameHtml)); + int countPages = httpClientGetCountPages(mdRecord); + // style page + String pageStyleHtml = "float:left; clear:both; border: thin solid #808080; width: 21.0cm; margin-top: 0.2cm; margin-bottom: 1cm; margin-left: 0.7cm; margin-right: 0.7cm; padding: 0.2cm;"; + // firstPage + String firstPageHtmlShort = getFirstPageHtml(mdRecord, true); + String firstPageHtmlLong = getFirstPageHtml(mdRecord, false); + String mdRecordStr = getMdRecordString(mdRecord); + String htmlHeadStr = getHtmlHead(null, mdRecordStr); + String fontStyle = getFontStyle(language); + if(pdf) { + write("" + htmlHeadStr + "", osHtmlPdf); + // first page + write(firstPageHtmlLong, osHtmlPdf); + renderer.setDocumentFromString("" + htmlHeadStr + "" + firstPageHtmlLong + ""); + renderer.layout(); + renderer.createPDF(osPdf, false); + } + if (html) { + write("" + htmlHeadStr + "", osHtml); + // first page + write("
    ", osHtml); + write(firstPageHtmlShort, osHtml); + write("
    ", osHtml); + } + // table of content of document + String htmlToc = getTocHtml(mdRecord); + if (html && htmlToc != null) { + write("
    ", osHtml); + write(htmlToc, osHtml); + write("
    ", osHtml); + } + if(pdf && htmlToc != null) { + write(htmlToc, osHtmlPdf); + renderer.setDocumentFromString("" + htmlHeadStr + "" + htmlToc + ""); + renderer.layout(); + renderer.writeNextDocument(); + } + // all pages of the document + for(int i=1; i<=countPages; i++) { + String htmlPageFragment = httpClientGetPageFragmentHtml(eXistIdentifier, mode, i); + htmlPageFragment = removeXmlStartString(htmlPageFragment); + String pnHrefName = ""; + if (html) { + write("
    ", osHtml); + write(pnHrefName, osHtml); + write("
    ", osHtml); + write("
    ", osHtml); + String htmlPageFragmentWithImageUrl = htmlPageFragment.replaceAll("src=\"images/", "src=\"http://" + MpdlConstants.MPDL_FULL_EXIST_HOST_NAME + "/mpdl/images/"); // to find the camera.png file on webserver mpdl-proto + write(htmlPageFragmentWithImageUrl, osHtml); + write("
    ", osHtml); + } + htmlPageFragment = pnHrefName + htmlPageFragment; + if(pdf) { + String htmlPageFragmentWithImageUrl = htmlPageFragment.replaceAll("src=\"images/", "src=\"http://" + MpdlConstants.MPDL_FULL_EXIST_HOST_NAME + "/mpdl/images/"); // to find the camera.png file on webserver mpdl-proto + String htmlPageFragmentSinglePage = htmlPageFragmentWithImageUrl.replaceAll("class=\"page\">", "class=\"singlePage\">"); + String pnPdf = ""Page " + i + " (" counter(page) ")""; + String htmlPage = getPageHtmlDoc(htmlPageFragmentSinglePage, language, """", pnPdf, """", """"); + write(htmlPage, osHtmlPdf); + renderer.setDocumentFromString(htmlPage); + try { + renderer.layout(); + renderer.writeNextDocument(); + } catch (XRRuntimeException e) { + System.out.println("XXXX: " + e.getMessage()); + } + } + } + if (html) { + write("", osHtml); + } + // create PDF document + if(pdf) { + write("", osHtmlPdf); + osHtmlPdf.close(); + renderer.finishPDF(); + } + } catch (Exception e) { + init(); + String message = e.getMessage(); + if (message.indexOf("nausikaa") > 0 && message.indexOf("500") > 0) { + throw new ApplicationException("Could not fetch image from nausikaa2.rz-berlin.mpg.de: please try again later"); + } + throw new ApplicationException(e); + } finally { + try { + osHtmlPdf.close(); + osPdf.close(); + osHtml.close(); + FileUtil.getInstance().deleteFile(destFileNameHtmlPdfTmp); + } catch (IOException e) { + // nothing + } + } + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/MpdlITextUserAgent.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/MpdlITextUserAgent.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,149 @@ +package de.mpg.mpiwg.berlin.mpdl.util; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; + +import org.apache.log4j.Logger; +import org.xhtmlrenderer.layout.SharedContext; +import 
org.xhtmlrenderer.pdf.ITextFSImage; +import org.xhtmlrenderer.pdf.ITextOutputDevice; +import org.xhtmlrenderer.pdf.PDFAsImage; +import org.xhtmlrenderer.resource.ImageResource; +import org.xhtmlrenderer.swing.NaiveUserAgent; + +import com.lowagie.text.Image; +import com.lowagie.text.Rectangle; +import com.lowagie.text.pdf.PdfReader; + +public class MpdlITextUserAgent extends NaiveUserAgent { + private static final int IMAGE_CACHE_CAPACITY = 32; + private static final float DEFAULT_DOTS_PER_POINT = 20f * 4f / 3f; + private static Logger LOGGER = Logger.getLogger(MpdlITextUserAgent.class); // Logs to EXIST_HOME/webapp/WEB-INF/logs/exist.log + private SharedContext sharedContext; + private ITextOutputDevice outputDevice; + + public MpdlITextUserAgent() { + super(IMAGE_CACHE_CAPACITY); + outputDevice = new ITextOutputDevice(DEFAULT_DOTS_PER_POINT); + } + + @SuppressWarnings("unchecked") + public ImageResource getImageResource(String inputUri) { + ImageResource resource = null; + String uri = resolveURI(inputUri); + resource = (ImageResource) _imageCache.get(uri); + if (resource == null) { + InputStream is = resolveAndOpenStream(uri); + if (is != null) { + try { + URL url = new URL(uri); + if (url.getPath() != null && url.getPath().toLowerCase().endsWith(".pdf")) { + PdfReader reader = outputDevice.getReader(url); + PDFAsImage image = new PDFAsImage(url); + Rectangle rect = reader.getPageSizeWithRotation(1); + image.setInitialWidth(rect.getWidth()*outputDevice.getDotsPerPoint()); + image.setInitialHeight(rect.getHeight()*outputDevice.getDotsPerPoint()); + resource = new ImageResource(image); + } else { + Image image = getImage(url); + if (image == null) + return null; + scaleToOutputResolution(image); + resource = new ImageResource(new ITextFSImage(image)); + } + _imageCache.put(uri, resource); + } catch (IOException e) { + LOGGER.error("Can't get image file: unexpected problem for URI: '" + uri + "': " + e.getMessage(), e); + } finally { + try { + if (is != null) + 
is.close(); + } catch (IOException e) { + // ignore + } + } + } + } + if (resource == null) { + resource = new ImageResource(null); + } + return resource; + } + + private void scaleToOutputResolution(Image image) { + float factor = sharedContext.getDotsPerPixel(); + image.scaleAbsolute(image.getPlainWidth() * factor, image.getPlainHeight() * factor); + } + + public SharedContext getSharedContext() { + return sharedContext; + } + + public void setSharedContext(SharedContext sharedContext) { + this.sharedContext = sharedContext; + } + + private Image getImage(URL url) { + Image image = null; + try { + image = Image.getInstance(url); + } catch (Exception e) { + try { + Thread.sleep(1000); + } catch (InterruptedException ee) { + // nothing + } + LOGGER.error("first retry to get image for URL '" + url.toString() + "': " + e.getMessage(), e); + try { + image = Image.getInstance(url); + } catch (Exception e2) { + try { + Thread.sleep(1000); + } catch (InterruptedException ee) { + // nothing + } + LOGGER.error("second retry to get image for URL '" + url.toString() + "': " + e.getMessage(), e); + try { + image = Image.getInstance(url); + } catch (Exception e3) { + LOGGER.error("third retry to get image for URL '" + url.toString() + "': " + e.getMessage(), e); + return null; + } + } + } + return image; + } + + protected InputStream resolveAndOpenStream(String inputUri) { + InputStream is = null; + String uri = resolveURI(inputUri); + try { + is = new URL(uri).openStream(); + } catch (Exception e) { + try { + Thread.sleep(1000); + } catch (InterruptedException ee) { + // nothing + } + LOGGER.error("first retry to open stream for URL '" + uri + "': " + e.getMessage(), e); + try { + is = new URL(uri).openStream(); + } catch (Exception e2) { + try { + Thread.sleep(1000); + } catch (InterruptedException ee) { + // nothing + } + LOGGER.error("second retry to open stream for URL '" + uri + "': " + e.getMessage(), e); + try { + is = new URL(uri).openStream(); + } catch (Exception 
e3) { + LOGGER.error("third retry to open stream for URL '" + uri + "': " + e.getMessage(), e); + return null; + } + } + } + return is; + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/StringUtilEscapeChars.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/StringUtilEscapeChars.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,475 @@ +package de.mpg.mpiwg.berlin.mpdl.util; + +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; +import java.text.CharacterIterator; +import java.text.StringCharacterIterator; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class StringUtilEscapeChars { + public static String deleteSpecialXmlEntities(String inputStr) { + inputStr = inputStr.replaceAll("<", ""); + inputStr = inputStr.replaceAll(">", ""); + inputStr = inputStr.replaceAll("&lt;", ""); + inputStr = inputStr.replaceAll("&gt;", ""); + return inputStr; + } + + public static String resolveXmlEntities(String inputStr) { + inputStr = inputStr.replaceAll("&", "&"); + inputStr = inputStr.replaceAll("<", "<"); + inputStr = inputStr.replaceAll(">", ">"); + inputStr = inputStr.replaceAll(""", "\""); + inputStr = inputStr.replaceAll("'", "'"); + return inputStr; + } + + public static String deresolveXmlEntities(String inputStr) { + StringBuffer buf = new StringBuffer(); + for (int i = 0; i < inputStr.length(); i++) { + char c = inputStr.charAt(i); + String replace = new String(); + switch (c) { + case '&': replace = "&"; break; + case '<': replace = "<"; break; + case '>': replace = ">"; break; + case '"': replace = """; break; + // case '\'': replace = "'"; break; // causes problems in DictionarizerContentHandler + default: replace += c; break; + } + buf.append(replace); + } + return buf.toString(); + } + + /** + * Escape characters for text appearing in HTML markup. + * + *

    This method exists as a defence against Cross Site Scripting (XSS) hacks. + * The idea is to neutralize control characters commonly used by scripts, such that + * they will not be executed by the browser. This is done by replacing the control + * characters with their escaped equivalents. + * See {@link hirondelle.web4j.security.SafeText} as well. + * + *

    The following characters are replaced with corresponding + * HTML character entities : + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
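+   * <table border='1' cellpadding='3' cellspacing='0'>
+   * <tr><th>Character</th><th>Replacement</th></tr>
+   * <tr><td>{@code <}</td><td>{@code &lt;}</td></tr>
+   * <tr><td>{@code >}</td><td>{@code &gt;}</td></tr>
+   * <tr><td>{@code &}</td><td>{@code &amp;}</td></tr>
+   * <tr><td>{@code "}</td><td>{@code &quot;}</td></tr>
+   * <tr><td>{@code \t}</td><td>{@code &#009;}</td></tr>
+   * <tr><td>{@code !}</td><td>{@code &#033;}</td></tr>
+   * <tr><td>{@code #}</td><td>{@code &#035;}</td></tr>
+   * <tr><td>{@code $}</td><td>{@code &#036;}</td></tr>
+   * <tr><td>{@code %}</td><td>{@code &#037;}</td></tr>
+   * <tr><td>{@code '}</td><td>{@code &#039;}</td></tr>
+   * <tr><td>{@code (}</td><td>{@code &#040;}</td></tr>
+   * <tr><td>{@code )}</td><td>{@code &#041;}</td></tr>
+   * <tr><td>{@code *}</td><td>{@code &#042;}</td></tr>
+   * <tr><td>{@code +}</td><td>{@code &#043;}</td></tr>
+   * <tr><td>{@code ,}</td><td>{@code &#044;}</td></tr>
+   * <tr><td>{@code -}</td><td>{@code &#045;}</td></tr>
+   * <tr><td>{@code .}</td><td>{@code &#046;}</td></tr>
+   * <tr><td>{@code /}</td><td>{@code &#047;}</td></tr>
+   * <tr><td>{@code :}</td><td>{@code &#058;}</td></tr>
+   * <tr><td>{@code ;}</td><td>{@code &#059;}</td></tr>
+   * <tr><td>{@code =}</td><td>{@code &#061;}</td></tr>
+   * <tr><td>{@code ?}</td><td>{@code &#063;}</td></tr>
+   * <tr><td>{@code @}</td><td>{@code &#064;}</td></tr>
+   * <tr><td>{@code [}</td><td>{@code &#091;}</td></tr>
+   * <tr><td>{@code \}</td><td>{@code &#092;}</td></tr>
+   * <tr><td>{@code ]}</td><td>{@code &#093;}</td></tr>
+   * <tr><td>{@code ^}</td><td>{@code &#094;}</td></tr>
+   * <tr><td>{@code _}</td><td>{@code &#095;}</td></tr>
+   * <tr><td>{@code `}</td><td>{@code &#096;}</td></tr>
+   * <tr><td>&#123;</td><td>{@code &#123;}</td></tr>
+   * <tr><td>{@code |}</td><td>{@code &#124;}</td></tr>
+   * <tr><td>&#125;</td><td>{@code &#125;}</td></tr>
+   * <tr><td>{@code ~}</td><td>{@code &#126;}</td></tr>
+   * </table>
+   *
+   *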

    Note that JSTL's {@code } escapes only the first + * five of the above characters. + */ + public static String forHTML(String aText){ + final StringBuilder result = new StringBuilder(); + final StringCharacterIterator iterator = new StringCharacterIterator(aText); + char character = iterator.current(); + while (character != CharacterIterator.DONE ){ + if (character == '<') { + result.append("<"); + } + else if (character == '>') { + result.append(">"); + } + else if (character == '&') { + result.append("&"); + } + else if (character == '\"') { + result.append("""); + } + else if (character == '\t') { + addCharEntity(9, result); + } + else if (character == '!') { + addCharEntity(33, result); + } + else if (character == '#') { + addCharEntity(35, result); + } + else if (character == '$') { + addCharEntity(36, result); + } + else if (character == '%') { + addCharEntity(37, result); + } + else if (character == '\'') { + addCharEntity(39, result); + } + else if (character == '(') { + addCharEntity(40, result); + } + else if (character == ')') { + addCharEntity(41, result); + } + else if (character == '*') { + addCharEntity(42, result); + } + else if (character == '+') { + addCharEntity(43, result); + } + else if (character == ',') { + addCharEntity(44, result); + } + else if (character == '-') { + addCharEntity(45, result); + } + else if (character == '.') { + addCharEntity(46, result); + } + else if (character == '/') { + addCharEntity(47, result); + } + else if (character == ':') { + addCharEntity(58, result); + } + else if (character == ';') { + addCharEntity(59, result); + } + else if (character == '=') { + addCharEntity(61, result); + } + else if (character == '?') { + addCharEntity(63, result); + } + else if (character == '@') { + addCharEntity(64, result); + } + else if (character == '[') { + addCharEntity(91, result); + } + else if (character == '\\') { + addCharEntity(92, result); + } + else if (character == ']') { + addCharEntity(93, result); + } + else if 
(character == '^') { + addCharEntity(94, result); + } + else if (character == '_') { + addCharEntity(95, result); + } + else if (character == '`') { + addCharEntity(96, result); + } + else if (character == '{') { + addCharEntity(123, result); + } + else if (character == '|') { + addCharEntity(124, result); + } + else if (character == '}') { + addCharEntity(125, result); + } + else if (character == '~') { + addCharEntity(126, result); + } + else { + //the char is not a special one + //add it to the result as is + result.append(character); + } + character = iterator.next(); + } + return result.toString(); + } + + + /** + * Escape all ampersand characters in a URL. + * + *

    Replaces every {@code &} character with {@code &amp;}. + * + *

    An ampersand character may appear in the query string of a URL. + * The ampersand character is indeed valid in a URL. + * However, URLs usually appear as an HREF attribute, and + * such attributes have the additional constraint that ampersands + * must be escaped. + * + *

    The JSTL tag does indeed perform proper URL encoding of + * query parameters. But it does not, in general, produce text which + * is valid as an HREF attribute, simply because it does + * not escape the ampersand character. This is a nuisance when + * multiple query parameters appear in the URL, since it requires a little + * extra work. + */ + public static String forHrefAmpersand(String aURL){ + return aURL.replace("&", "&"); + } + + /** + * Synonym for URLEncoder.encode(String, "UTF-8"). + * + *

    Used to ensure that HTTP query strings are in proper form, by escaping + * special characters such as spaces. + * + *

    Keep in mind that a query string placed inside an HREF attribute
+   * raises two separate concerns: the query string has to be valid HTTP
+   * (that is, URL-encoded), and it has to be valid HTML (that is, its
+   * ampersands have to be escaped).
+   */
+  public static String forURL(String aURLFragment){
+    try {
+      // Hand the encoded text straight back; no intermediate holder is needed.
+      return URLEncoder.encode(aURLFragment, "UTF-8");
+    }
+    catch (UnsupportedEncodingException unsupported){
+      // Surfaced unchecked, with the original exception kept as the cause.
+      throw new RuntimeException("UTF-8 not supported", unsupported);
+    }
+  }
+
+  /**
+   * Escape characters for text appearing as XML data, between tags.
+   *
+   *

    The following characters are replaced with corresponding character entities : + * + * + * + * + * + * + * + *
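+   * <table border='1' cellpadding='3' cellspacing='0'>
+   * <tr><th>Character</th><th>Encoding</th></tr>
+   * <tr><td>{@code <}</td><td>{@code &lt;}</td></tr>
+   * <tr><td>{@code >}</td><td>{@code &gt;}</td></tr>
+   * <tr><td>{@code &}</td><td>{@code &amp;}</td></tr>
+   * <tr><td>{@code "}</td><td>{@code &quot;}</td></tr>
+   * <tr><td>{@code '}</td><td>{@code &#039;}</td></tr>
+   * </table>
+   *
+   *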

    Note that JSTL's {@code } escapes the exact same set of + * characters as this method. That is, {@code } + * is good for escaping to produce valid XML, but not for producing safe + * HTML. + */ + public static String forXML(String aText){ + final StringBuilder result = new StringBuilder(); + final StringCharacterIterator iterator = new StringCharacterIterator(aText); + char character = iterator.current(); + while (character != CharacterIterator.DONE ){ + if (character == '<') { + result.append("<"); + } + else if (character == '>') { + result.append(">"); + } + else if (character == '\"') { + result.append("""); + } + else if (character == '\'') { + result.append("'"); + } + else if (character == '&') { + result.append("&"); + } + else { + //the char is not a special one + //add it to the result as is + result.append(character); + } + character = iterator.next(); + } + return result.toString(); + } + + /** + * Return aText with all '<' and '>' characters + * replaced by their escaped equivalents. + */ + public static String toDisableTags(String aText){ + final StringBuilder result = new StringBuilder(); + final StringCharacterIterator iterator = new StringCharacterIterator(aText); + char character = iterator.current(); + while (character != CharacterIterator.DONE ){ + if (character == '<') { + result.append("<"); + } + else if (character == '>') { + result.append(">"); + } + else { + //the char is not a special one + //add it to the result as is + result.append(character); + } + character = iterator.next(); + } + return result.toString(); + } + + + /** + * Replace characters having special meaning in regular expressions + * with their escaped equivalents, preceded by a '\' character. + * + *

    The escaped characters include : + *

      + *
    • . + *
    • \ + *
    • ?, * , and + + *
    • & + *
    • : + *
    • { and } + *
    • [ and ] + *
    • ( and ) + *
    • ^ and $ + *
    + */ + public static String forRegex(String aRegexFragment){ + final StringBuilder result = new StringBuilder(); + + final StringCharacterIterator iterator = + new StringCharacterIterator(aRegexFragment) + ; + char character = iterator.current(); + while (character != CharacterIterator.DONE ){ + /* + * All literals need to have backslashes doubled. + */ + if (character == '.') { + result.append("\\."); + } + else if (character == '\\') { + result.append("\\\\"); + } + else if (character == '?') { + result.append("\\?"); + } + else if (character == '*') { + result.append("\\*"); + } + else if (character == '+') { + result.append("\\+"); + } + else if (character == '&') { + result.append("\\&"); + } + else if (character == ':') { + result.append("\\:"); + } + else if (character == '{') { + result.append("\\{"); + } + else if (character == '}') { + result.append("\\}"); + } + else if (character == '[') { + result.append("\\["); + } + else if (character == ']') { + result.append("\\]"); + } + else if (character == '(') { + result.append("\\("); + } + else if (character == ')') { + result.append("\\)"); + } + else if (character == '^') { + result.append("\\^"); + } + else if (character == '$') { + result.append("\\$"); + } + else { + //the char is not a special one + //add it to the result as is + result.append(character); + } + character = iterator.next(); + } + return result.toString(); + } + + /** + * Escape '$' and '\' characters in replacement strings. + * + *

    Synonym for Matcher.quoteReplacement(String). + * + *

    The following methods use replacement strings which treat + * '$' and '\' as special characters: + *

      + *
    • String.replaceAll(String, String) + *
    • String.replaceFirst(String, String) + *
    • Matcher.appendReplacement(StringBuffer, String) + *
    + * + *

    If replacement text can contain arbitrary characters, then you + * will usually need to escape that text, to ensure special characters + * are interpreted literally. + */ + public static String forReplacementString(String aInput){ + return Matcher.quoteReplacement(aInput); + } + + /** + * Disable all ", Pattern.CASE_INSENSITIVE + ); + + private static void addCharEntity(Integer aIdx, StringBuilder aBuilder){ + String padding = ""; + if( aIdx <= 9 ){ + padding = "00"; + } + else if( aIdx <= 99 ){ + padding = "0"; + } + else { + //no prefix + } + String number = padding + aIdx.toString(); + aBuilder.append("&#" + number + ";"); + } + } diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/Util.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/Util.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,48 @@ +package de.mpg.mpiwg.berlin.mpdl.util; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.Date; +import java.util.Properties; + +public class Util { + + public Properties getProperties(String fullFileName) { + Properties props = new Properties(); + try { + File file = new File(fullFileName); + FileInputStream in = new FileInputStream(file); + props.load(in); + } catch (IOException e) { + } + return props; + } + + public String toYearStr(String inputStr) { + String retYearStr = inputStr.trim(); + int index = inputStr.indexOf("-"); + if (index > 0) { + retYearStr = inputStr.substring(0, index); + retYearStr = retYearStr.trim(); + } + try { + Integer year = new Integer(retYearStr); + if (year < 10000 || year > 2500) + retYearStr = null; + } catch (NumberFormatException e) { + retYearStr = null; + } + return retYearStr; + } + + public Double getSecondWithMillisecondsBetween(Date begin, Date end) { + long beginMS = begin.getTime(); + long endMS = end.getTime(); + long elapsedSeconds = (endMS - beginMS) / 
1000; + long elapsedMilliSecondsAfterSeconds1 = (endMS - beginMS) - (elapsedSeconds * 1000); + Double seconds = new Double(elapsedSeconds + "." + elapsedMilliSecondsAfterSeconds1); + return seconds; + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,384 @@ +package de.mpg.mpiwg.berlin.mpdl.util; + +import java.io.File; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.io.StringWriter; +import java.net.URL; +import java.text.DateFormat; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; + +import javax.xml.XMLConstants; +import javax.xml.namespace.NamespaceContext; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Source; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.sax.SAXSource; +import javax.xml.transform.stream.StreamResult; +import javax.xml.validation.Schema; +import javax.xml.validation.SchemaFactory; +import javax.xml.validation.Validator; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathFactory; + +import net.sf.saxon.om.NodeInfo; + +import org.w3c.dom.Document; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class XmlUtil { + static String 
JAXP_SCHEMA_LANGUAGE = "http://java.sun.com/xml/jaxp/properties/schemaLanguage"; + static String JAXP_SCHEMA_SOURCE = "http://java.sun.com/xml/jaxp/properties/schemaSource"; + static String W3C_XML_SCHEMA = XMLConstants.W3C_XML_SCHEMA_NS_URI; + + public static XmlUtil getInstance() { + return new XmlUtil(); + } + + public Node doc(String url) throws ApplicationException { + Node root = null; + try { + DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); + dbf.setNamespaceAware(true); + DocumentBuilder db = dbf.newDocumentBuilder(); + InputSource inputSource = new InputSource(url); + Document doc = db.parse(inputSource); + root = doc.getFirstChild(); + } catch (Exception e) { + throw new ApplicationException(e); + } + return root; + } + + public Node parse(String xmlFileName) throws ApplicationException { + File xmlFile = new File(xmlFileName); + XmlUtil xmlUtil = XmlUtil.getInstance(); + Node retNode = null; + try { + retNode = xmlUtil.doc(xmlFile); + } catch (ApplicationException e) { + throw new ApplicationException("Your source file is not valid: " + e.getMessage()); + } + return retNode; + } + + public Node doc(File xmlFile) throws ApplicationException { + Node root = null; + try { + DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); + dbf.setNamespaceAware(true); + DocumentBuilder db = dbf.newDocumentBuilder(); + Document doc = db.parse(xmlFile); + root = doc.getFirstChild(); + } catch (Exception e) { + throw new ApplicationException(e); + } + return root; + } + + public void validateByRelaxNG(File xmlFile, URL schemaUrl) throws ApplicationException { + System.setProperty(SchemaFactory.class.getName() + ":" + XMLConstants.RELAXNG_NS_URI, "com.thaiopensource.relaxng.jaxp.CompactSyntaxSchemaFactory"); + SchemaFactory factory = SchemaFactory.newInstance(XMLConstants.RELAXNG_NS_URI); + Schema schema = null; + try { + schema = factory.newSchema(schemaUrl); + } catch (SAXException e) { + throw new ApplicationException(e); + } + 
Validator validator = schema.newValidator(); + InputSource inputSource = new InputSource(xmlFile.getPath()); + Source source = new SAXSource(inputSource); + try { + validator.validate(source); + } catch (SAXException e) { + String message = e.getMessage(); + String text = "Your file is not valid against the RelaxNG schema: " + schemaUrl; + throw new ApplicationException(text + ":\n" + message); + } catch (IOException e) { + String message = e.getMessage(); + String text = "Your file is not valid against the RelaxNG schema: " + schemaUrl; + throw new ApplicationException(text + ": " + message); + } + } + + public String getNodeValue(Node node) { + String nodeValueStr = node.getNodeValue(); + if (nodeValueStr == null) + nodeValueStr = node.getTextContent(); + return nodeValueStr; + } + + public String getNodeAttributeValue(Node node, String attrName) { + NamedNodeMap attrs = node.getAttributes(); + if (attrs == null) { + return null; + } + Node attN = attrs.getNamedItem(attrName); + if (attN == null) { + return null; + } + return attN.getNodeValue(); + } + + public ArrayList toStringArray(NodeList nodes) { + ArrayList nodeValues = null; + for (int i=0; i< nodes.getLength(); i++) { + Node node = nodes.item(i); + if (nodeValues == null) + nodeValues = new ArrayList(); + String nodeValue = node.getNodeValue(); + nodeValues.add(nodeValue); + } + return nodeValues; + } + + public String toXsDate(Date date) { + DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"); + String xsDateStr = dateFormat.format(date); + return xsDateStr; + } + + public Date toDate(String xsDateStr) throws ApplicationException { + Date retDate = null; + if (xsDateStr == null) + return null; + try { + DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"); + retDate = dateFormat.parse(xsDateStr); + } catch (ParseException e) { + throw new ApplicationException(e); + } + return retDate; + } + + public String evaluateToString(String xmlString, String 
xpathExpression, NamespaceContext nsContext) throws ApplicationException { + String resultStr = null; + ArrayList strArray = evaluateToStringArray(xmlString, xpathExpression, nsContext); + if (strArray != null && strArray.size() > 0) + resultStr = strArray.get(0); + return resultStr; + } + + public String evaluateToString(InputSource inputSource, String xpathExpression, NamespaceContext nsContext) throws ApplicationException { + String resultStr = null; + ArrayList strArray = evaluateToStringArray(inputSource, xpathExpression, nsContext); + if (strArray != null && strArray.size() > 0) + resultStr = strArray.get(0); + return resultStr; + } + + public String evaluateToString(Node node, String xpathExpression, NamespaceContext nsContext) throws ApplicationException { + String resultStr = null; + ArrayList strArray = evaluateToStringArray(node, xpathExpression, nsContext); + if (strArray != null && strArray.size() > 0) + resultStr = strArray.get(0); + return resultStr; + } + + public ArrayList evaluateToStringArray(String xmlString, String xpathExpression, NamespaceContext nsContext) throws ApplicationException { + Reader stringReader = new StringReader(xmlString); + InputSource inputSource = new InputSource(stringReader); + ArrayList retStrArray = evaluateToStringArray(inputSource, xpathExpression, nsContext); + return retStrArray; + } + + public ArrayList evaluateToStringArray(InputSource inputSource, String xpathExpression, NamespaceContext nsContext) throws ApplicationException { + ArrayList retStrArray = null; + try { + XPath xpath = XPathFactory.newInstance().newXPath(); + if (nsContext != null) + xpath.setNamespaceContext(nsContext); + Object resultObjects = xpath.evaluate(xpathExpression, inputSource, XPathConstants.NODESET); + if (resultObjects != null) { + retStrArray = nodesetToStringArray(resultObjects); + } + } catch (Exception e) { + throw new ApplicationException(e); + } + return retStrArray; + } + + public ArrayList evaluateToNodeArray(InputSource 
inputSource, String xpathExpression, NamespaceContext nsContext) throws ApplicationException { + ArrayList retArray = null; + try { + XPath xpath = XPathFactory.newInstance().newXPath(); + if (nsContext != null) + xpath.setNamespaceContext(nsContext); + Object resultObjects = xpath.evaluate(xpathExpression, inputSource, XPathConstants.NODESET); + if (resultObjects != null) { + retArray = nodesetToNodeArray(resultObjects); + } + } catch (Exception e) { + throw new ApplicationException(e); + } + return retArray; + } + + public ArrayList evaluateToStringArray(Node node, String xpathExpression, NamespaceContext nsContext) throws ApplicationException { + ArrayList retStrArray = null; + try { + XPath xpath = XPathFactory.newInstance().newXPath(); + if (nsContext != null) + xpath.setNamespaceContext(nsContext); + Object resultObjects = xpath.evaluate(xpathExpression, node, XPathConstants.NODESET); + if (resultObjects != null) { + retStrArray = nodesetToStringArray(resultObjects); + } + } catch (Exception e) { + throw new ApplicationException(e); + } + return retStrArray; + } + + /* + * XPath evaluation: handles both, javax and also Saxon's implementation + * javax XPath evaluation: returns a NodeList + * Saxon's XPath evaluation: returns an ArrayList of TinyTextImpl (which could be casted to NodeInfo which could be handled as if it was a dom node) + */ + private ArrayList nodesetToStringArray(Object nodesetObjects) { + ArrayList retStrArray = null; + if (nodesetObjects instanceof NodeList) { + NodeList resultNodeList = (NodeList) nodesetObjects; + int length = resultNodeList.getLength(); + if (length > 0) { + retStrArray = new ArrayList(); + for (int i=0; i(); + for (int i=0; i nodesetToNodeArray(Object nodesetObjects) { + ArrayList retArray = null; + if (nodesetObjects instanceof NodeList) { + NodeList resultNodeList = (NodeList) nodesetObjects; + int length = resultNodeList.getLength(); + if (length > 0) { + retArray = new ArrayList(); + for (int i=0; i(); + for (int 
i=0; i getPBFileNames(Node documentNode, String docBase) throws ApplicationException { + ArrayList pbFileNamesArrayStr = null; + if (docBase != null && docBase.equals("echo")) { + XmlUtil xmlUtil = XmlUtil.getInstance(); + NamespaceContext nsContext = getEchoNsContext(); + pbFileNamesArrayStr = xmlUtil.evaluateToStringArray(documentNode, "//echo:pb/@file", nsContext); + } else if (docBase != null && docBase.equals("archimedes")) { + XmlUtil xmlUtil = XmlUtil.getInstance(); + ArrayList pbsStrArray = xmlUtil.evaluateToStringArray(documentNode, "//pb", null); + if (pbsStrArray != null) { + pbFileNamesArrayStr = new ArrayList(); + int countPBs = pbsStrArray.size(); + for (int i=1; i<=countPBs; i++) { + pbFileNamesArrayStr.add("" + i); // empty names for each page break + } + } + } + return pbFileNamesArrayStr; + } + + public String getPageImgDir(MetadataRecord mdRecord) throws ApplicationException { + String dcId = mdRecord.getIdentifier(); // dublin core identifier: is used to find the digilib image directory for this document + String id = getIdByDCIdentifier(dcId); + String imagesDocDirectory = "/permanent/library/" + id; + if (mdRecord.hasArchimedesDocBase()) + imagesDocDirectory = "/permanent/archimedes/" + id; + String echoDir = mdRecord.getEchoDir(); + if (echoDir != null) + imagesDocDirectory = echoDir; + String pageImgSubDir = "pageimg"; // default name: if digilib does not answer then this name is used + String indexMetaPageImgDir = getIndexMetaDataPageImg(imagesDocDirectory); + if (indexMetaPageImgDir != null) + pageImgSubDir = indexMetaPageImgDir; + String pageImgDir = imagesDocDirectory + "/" + pageImgSubDir; + return pageImgDir; + } + + private Node parse(File file) throws ApplicationException { + XmlUtil xmlUtil = XmlUtil.getInstance(); + Node retNode = null; + try { + retNode = xmlUtil.doc(file); + } catch (ApplicationException e) { + throw new ApplicationException("Your source file is not valid: " + e.getMessage()); + } + return retNode; + } + + 
private void validate(Node docNode, String docBase) throws ApplicationException { + XmlUtil xmlUtil = XmlUtil.getInstance(); + NamespaceContext nsContext = getEchoNsContext(); + String echoTest = null; + String archimedesTest = null; + try { + echoTest = xmlUtil.evaluateToString(docNode, "/echo:echo/echo:metadata", nsContext); + archimedesTest = xmlUtil.evaluateToString(docNode, "/archimedes/info", null); + } catch (ApplicationException e) { + throw new ApplicationException("Your source file is not an \"echo\" or \"archimedes\" file. Please proof that file."); + } + if (docBase.equals("echo") && archimedesTest != null) + throw new ApplicationException("Your source file is an \"archimedes\" file. " + "Please specify \"archimedes\" in your destination document base."); + if (docBase.equals("archimedes") && echoTest != null) + throw new ApplicationException("Your source file is an \"echo\" file. " + "Please specify \"echo\" in your destination document base."); + } + + private void validateByRelaxNGSchema(File destFile, String docBase) throws ApplicationException { + XmlUtil xmlUtil = XmlUtil.getInstance(); + if (docBase.equals("echo")) { + URL echoSchemaUrl = getEchoRelaxNGSchemaUrl(); + xmlUtil.validateByRelaxNG(destFile, echoSchemaUrl); + } + } + + private URL getEchoRelaxNGSchemaUrl() throws ApplicationException { + String echoSchemaUrlStr = "http://" + MpdlConstants.MPDL_EXIST_HOST_NAME + ":" + MpdlConstants.MPDL_EXIST_PORT + MpdlConstants.MPDL_ECHO_RELAXNG_PATH; + URL echoSchemaUrl = null; + try { + echoSchemaUrl = new URL(echoSchemaUrlStr); + } catch (MalformedURLException e) { + throw new ApplicationException(e); + } + return echoSchemaUrl; + } + + private void validate(MetadataRecord mdRecord) throws ApplicationException { + String identifier = mdRecord.getIdentifier(); + String creator = mdRecord.getCreator(); + String title = mdRecord.getTitle(); + if (identifier == null || identifier.trim().equals("")) + throw new ApplicationException("Your document file 
does not contain the metadata field: " + "identifier"); + if (creator == null || creator.trim().equals("")) + throw new ApplicationException("Your document file does not contain the metadata field: " + "creator"); + if (title == null || title.trim().equals("")) + throw new ApplicationException("Your document file does not contain the metadata field: " + "title"); + } + + private MetadataRecord getMetadataRecordEcho(Node documentNode) throws ApplicationException { + XmlUtil xmlUtil = XmlUtil.getInstance(); + NamespaceContext nsContext = getEchoNsContext(); + String identifier = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:identifier", nsContext); + if (identifier != null) + identifier = StringUtilEscapeChars.deresolveXmlEntities(identifier); + String creator = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:creator", nsContext); + if (creator != null) + creator = StringUtilEscapeChars.deresolveXmlEntities(creator); + String title = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:title", nsContext); + if (title != null) + title = StringUtilEscapeChars.deresolveXmlEntities(title); + String language = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:language", nsContext); + if (language != null) + language = StringUtilEscapeChars.deresolveXmlEntities(language); + String yearStr = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:date", nsContext); + Date date = null; + if (yearStr != null && ! 
yearStr.equals("")) { + yearStr = StringUtilEscapeChars.deresolveXmlEntities(yearStr); + yearStr = new Util().toYearStr(yearStr); // test if possible etc + if (yearStr != null) + date = XmlUtil.getInstance().toDate(yearStr + "-01-01T00:00:00.000Z"); + } + String rights = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:rights", nsContext); + if (rights != null) + rights = StringUtilEscapeChars.deresolveXmlEntities(rights); + String license = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:license", nsContext); + if (license != null) + license = StringUtilEscapeChars.deresolveXmlEntities(license); + String accessRights = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:accessRights", nsContext); + if (accessRights != null) + accessRights = StringUtilEscapeChars.deresolveXmlEntities(accessRights); + String echoDir = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/echo:echodir", nsContext); + if (echoDir != null) + echoDir = StringUtilEscapeChars.deresolveXmlEntities(echoDir); + String echoLink = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/echo:echolink", nsContext); + if (echoLink != null) + echoLink = StringUtilEscapeChars.deresolveXmlEntities(echoLink); + MetadataRecord mdRecord = new MetadataRecord(identifier, language, creator, title, null, null, "text/xml", rights, date); + mdRecord.setDocBase("echo"); + mdRecord.setLicense(license); + mdRecord.setAccessRights(accessRights); + mdRecord.setEchoLink(echoLink); + mdRecord.setEchoDir(echoDir); + return mdRecord; + } + + private MetadataRecord getMetadataRecordArchimedes(Node documentNode) throws ApplicationException { + XmlUtil xmlUtil = XmlUtil.getInstance(); + String identifier = xmlUtil.evaluateToString(documentNode, "/archimedes/info/cvs_file", null); + if (identifier != null) + identifier = StringUtilEscapeChars.deresolveXmlEntities(identifier); + String creator = xmlUtil.evaluateToString(documentNode, 
"/archimedes/info/author", null); + if (creator != null) + creator = StringUtilEscapeChars.deresolveXmlEntities(creator); + String title = xmlUtil.evaluateToString(documentNode, "/archimedes/info/title", null); + if (title != null) + title = StringUtilEscapeChars.deresolveXmlEntities(title); + String language = xmlUtil.evaluateToString(documentNode, "/archimedes/info/lang", null); + if (language != null) + language = StringUtilEscapeChars.deresolveXmlEntities(language); + String yearStr = xmlUtil.evaluateToString(documentNode, "/archimedes/info/date", null); + Date date = null; + if (yearStr != null && ! yearStr.equals("")) { + yearStr = StringUtilEscapeChars.deresolveXmlEntities(yearStr); + yearStr = new Util().toYearStr(yearStr); // test if possible etc + if (yearStr != null) + date = XmlUtil.getInstance().toDate(yearStr + "-01-01T00:00:00.000Z"); + } + String rights = "open access"; + String license = "http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration"; + String accessRights = "free"; + MetadataRecord mdRecord = new MetadataRecord(identifier, language, creator, title, null, null, "text/xml", rights, date); + mdRecord.setDocBase("archimedes"); + mdRecord.setLicense(license); + mdRecord.setAccessRights(accessRights); + return mdRecord; + } + + private String getIndexMetaDataPageImg(String imagesDocDirectory) throws ApplicationException { + String resultStr = null; + String nausikaaURLTexter = "http://nausikaa2.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter"; + XmlUtil xmlUtil = XmlUtil.getInstance(); + String pageImageDirectory = null; + try { + Node imagesDocDirectoryIndexMetaNode = xmlUtil.doc(nausikaaURLTexter + "?fn=" + imagesDocDirectory + "/index.meta"); + pageImageDirectory = xmlUtil.evaluateToString(imagesDocDirectoryIndexMetaNode, "/resource/meta/texttool/image", null); + } catch (Exception e) { + // return null if digilib does not work + } + if (pageImageDirectory != null) { + resultStr = pageImageDirectory; + } + return resultStr; + } + + 
private String getIdByExistId(String eXistIdentifier) { + String id = null; + if (eXistIdentifier == null) + return null; + int firstDelimPos = eXistIdentifier.indexOf("/", 2); + int secondDelimPos = eXistIdentifier.indexOf("/", firstDelimPos + 1); + int thirdDelimPos = eXistIdentifier.indexOf(".xml", secondDelimPos + 1); + if (firstDelimPos == -1 || secondDelimPos == -1 || thirdDelimPos == -1) + id = eXistIdentifier; + else + id = eXistIdentifier.substring(secondDelimPos + 1, thirdDelimPos); + return id; + } + + private String getIdByDCIdentifier(String dcIdentifier) { + if (dcIdentifier == null || dcIdentifier.trim().equals("")) + return null; + // if dcIdentifier starts with "ECHO:" or "ARCHIMEDES:" then delete it + if (dcIdentifier.startsWith("ECHO:")) + dcIdentifier = dcIdentifier.substring(5); + if (dcIdentifier.startsWith("ARCHIMEDES:")) + dcIdentifier = dcIdentifier.substring(11); + // delete the .xml suffix if there is one + if (dcIdentifier.endsWith(".xml")) { + int size = dcIdentifier.length(); + dcIdentifier = dcIdentifier.substring(0, size - 4); + } + return dcIdentifier; + } + + public NamespaceContext getEchoNsContext() { + NamespaceContext nsContext = new NamespaceContext() { + public String getNamespaceURI(String prefix) { + String uri; + if (prefix.equals("de")) + uri = "http://www.mpiwg-berlin.mpg.de/ns/de/1.0/"; + else if (prefix.equals("echo")) + uri = "http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/"; + else if (prefix.equals("dc")) + uri = "http://purl.org/dc/elements/1.1/"; + else if (prefix.equals("dcterms")) + uri = "http://purl.org/dc/terms"; + else if (prefix.equals("dcq")) + uri = "http://purl.org/dc/qualifiers/1.0/"; + else if (prefix.equals("xhtml")) + uri = "http://www.w3.org/1999/xhtml"; + else if (prefix.equals("dct")) + uri = "http://purl.org/dc/terms/1.0/"; + else if (prefix.equals("xlink")) + uri = "http://www.w3.org/1999/xlink"; + else if (prefix.equals("rdf")) + uri = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; + else if 
(prefix.equals("xsi")) + uri = "http://www.w3.org/2001/XMLSchema-instance"; + else if (prefix.equals("mml")) + uri = "http://www.w3.org/1998/Math/MathML"; + else + uri = null; + return uri; + } + + public String getPrefix(String uri) { + if (uri.equals("http://www.mpiwg-berlin.mpg.de/ns/de/1.0/")) + return "de"; + else if (uri.equals("http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/")) + return "echo"; + else if (uri.equals("http://purl.org/dc/elements/1.1/")) + return "dc"; + else if (uri.equals("http://purl.org/dc/terms")) + return "dcterms"; + else if (uri.equals("http://purl.org/dc/qualifiers/1.0/")) + return "dcq"; + else if (uri.equals("http://www.w3.org/1999/xhtml")) + return "xhtml"; + else if (uri.equals("http://purl.org/dc/terms/1.0/")) + return "dct"; + else if (uri.equals("http://www.w3.org/1999/xlink")) + return "xlink"; + else if (uri.equals("http://www.w3.org/1999/02/22-rdf-syntax-ns#")) + return "rdf"; + else if (uri.equals("http://www.w3.org/2001/XMLSchema-instance")) + return "xsi"; + else if (uri.equals("http://www.w3.org/1998/Math/MathML")) + return "mml"; + else + return null; + } + + public Iterator getPrefixes(String namespace) { + return null; + } + }; + return nsContext; + } + + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/xmlrpc/FilenameFilterExtension.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/xmlrpc/FilenameFilterExtension.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,22 @@ +package de.mpg.mpiwg.berlin.mpdl.xmlrpc; + +import java.io.File; +import java.io.FilenameFilter; + +public class FilenameFilterExtension implements FilenameFilter { + + private String fileExtension; + + public FilenameFilterExtension(String fileExtension) { + this.fileExtension = fileExtension; + } + + public boolean accept(File dir, String name) { + String nameToLower = name.toLowerCase(); + String fileExtensionToLower = fileExtension.toLowerCase(); 
+ return nameToLower.endsWith("." + fileExtensionToLower); + // MimeTable mimetab = MimeTable.getInstance(); + // MimeType mime = mimetab.getContentTypeFor(name); + // return mime != null && mime.isXMLType(); + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/xmlrpc/MpdlXmlRpcDocHandler.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/xmlrpc/MpdlXmlRpcDocHandler.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,148 @@ +package de.mpg.mpiwg.berlin.mpdl.xmlrpc; + +import org.apache.log4j.Logger; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; +import de.mpg.mpiwg.berlin.mpdl.schedule.MpdlDocOperation; +import de.mpg.mpiwg.berlin.mpdl.xmlrpc.MpdlXmlRpcInterface; +import de.mpg.mpiwg.berlin.mpdl.xmlrpc.MpdlXmlRpcInterfaceImpl; + +/** + * Handler for eXist collections and documents (singleton). + * It could not be used in a multi threading environment. + * Locally saved documents could be stored (over XML-RPC) into eXist + * collections. Collections could be configured language specific (see + * instance variable "languages" below). + * Your local directory structure must look like this: + * documents + * archimedes + * ar + * yourDoc1.xml + * ... + * ... + * zh + * yourDoc1.xml + * ... + * echo + * ar + * yourDoc1.xml + * ... + * ... + * zh + * yourDoc1.xml + * ... 
+ * + */ +public class MpdlXmlRpcDocHandler { + private static MpdlXmlRpcDocHandler instance; + private static Logger LOGGER = Logger.getLogger(MpdlXmlRpcDocHandler.class); // Logs to EXIST_HOME/webapp/WEB-INF/logs/exist.log + private static String DOC_ROOT_COLLECTION_MORPH = "/db/mpdl/documents/morph"; + private static String DOC_ROOT_COLLECTION_STANDARD = "/db/mpdl/documents/standard"; + private static String LOCAL_DOC_DIR = MpdlConstants.MPDL_EXIST_DATA_DIR + "/" + "documents"; + private static String SERVER_NAME = MpdlConstants.MPDL_EXIST_HOST_NAME; + private static int SERVER_PORT = MpdlConstants.MPDL_EXIST_PORT; + private static String ADMIN_USER_NAME = MpdlConstants.MPDL_EXIST_ADMIN_USER_NAME; + private static String ADMIN_USER_PW = MpdlConstants.MPDL_EXIST_ADMIN_USER_PW; + private MpdlXmlRpcInterface eXistXmlRpcInterface = null; + + public static MpdlXmlRpcDocHandler getInstance() throws ApplicationException { + if (instance == null) { + instance = new MpdlXmlRpcDocHandler(); + instance.init(); + } + return instance; + } + + private void init() throws ApplicationException { + eXistXmlRpcInterface = MpdlXmlRpcInterfaceImpl.getInstance(SERVER_NAME, SERVER_PORT, ADMIN_USER_NAME, ADMIN_USER_PW); + } + + public boolean documentExists(MpdlDocOperation docOperation) throws ApplicationException { + String docFileName = docOperation.getFileName(); + String docBase = docOperation.getDocBase(); + String language = docOperation.getLanguage(); + boolean docExists = documentExists(docBase, language, docFileName); + return docExists; + } + + public void saveDocumentFile(MpdlDocOperation docOperation) throws ApplicationException { + String docFileName = docOperation.getFileName(); + String docBase = docOperation.getDocBase(); + String language = docOperation.getLanguage(); + saveDocumentFile(docBase, language, docFileName); + } + + public void saveDocumentFile(String localFile, String existIdentifier) throws ApplicationException { + int index = 
existIdentifier.lastIndexOf("/"); + String collection = existIdentifier.substring(0, index); + String docFileName = existIdentifier.substring(index); + String documentCollectionMorph = DOC_ROOT_COLLECTION_MORPH + collection; + String documentCollectionStandard = DOC_ROOT_COLLECTION_STANDARD + collection; + eXistXmlRpcInterface.saveDocument(documentCollectionMorph, docFileName, localFile); + LOGGER.info("MPDL: XML-RPC: Document: \"" + localFile + "\" saved to eXist collection: \"" + documentCollectionMorph + "\""); + eXistXmlRpcInterface.saveDocument(documentCollectionStandard, docFileName, localFile); + LOGGER.info("MPDL: XML-RPC: Document: \"" + localFile + "\" saved to eXist collection: \"" + documentCollectionStandard + "\""); + } + + public void deleteDocumentFile(MpdlDocOperation docOperation) throws ApplicationException { + String docFileName = docOperation.getFileName(); + String docBase = docOperation.getDocBase(); + String language = docOperation.getLanguage(); + deleteDocumentFile(docBase, language, docFileName); + } + + public void deleteDocumentFile(String existIdentifier) throws ApplicationException { + int index = existIdentifier.lastIndexOf("/"); + String collection = existIdentifier.substring(0, index); + String docFileName = existIdentifier.substring(index); + String documentCollectionMorph = DOC_ROOT_COLLECTION_MORPH + collection; + String documentCollectionStandard = DOC_ROOT_COLLECTION_STANDARD + collection; + eXistXmlRpcInterface.deleteDocument(documentCollectionMorph, docFileName); + LOGGER.info("MPDL: XML-RPC: Document deleted: \"" + documentCollectionMorph + "/" + docFileName + "\""); + eXistXmlRpcInterface.deleteDocument(documentCollectionStandard, docFileName); + LOGGER.info("MPDL: XML-RPC: Document deleted: \"" + documentCollectionStandard + "/" + docFileName + "\""); + } + + public void createCollection(String fullCollectionName) throws ApplicationException { + eXistXmlRpcInterface.createCollection(fullCollectionName); + } + + public 
void deleteCollection(String fullCollectionName) throws ApplicationException { + eXistXmlRpcInterface.deleteCollection(fullCollectionName); + } + + private boolean documentExists(String docBase, String language, String docFileName) throws ApplicationException { + String documentCollection = DOC_ROOT_COLLECTION_STANDARD + "/" + docBase + "/" + language; + String fullDocName = documentCollection + "/" + docFileName; + String[] fullCollectionDocNames = eXistXmlRpcInterface.getDocumentNames(documentCollection); + boolean isAvailable = false; + if (fullCollectionDocNames != null) { + for (int i=0; i countMs || milestonePositionToInt > countMs+1) { + resultFragment.add(new StringValue("")); + return resultFragment; + } + String msFromPathName = getNodePath(docUriStr, milestoneNameStr, milestonePositionFromInt); + String msToPathName = getNodePath(docUriStr, milestoneNameStr, milestonePositionToInt); + String openElementsOfMsFrom = pathName2XmlTags(msFromPathName, "open"); + String closingElementsOfMsTo = pathName2XmlTags(msToPathName, "close"); + // fetch the fragment between the two milestones + String fragment = getFragmentBetween(docUriStr, milestoneNameStr, milestonePositionFromInt, milestonePositionToInt); + fragment = openElementsOfMsFrom + fragment + closingElementsOfMsTo; + StringValue strValFragment = new StringValue(fragment); + resultFragment.add(strValFragment); + return resultFragment; + } + + /** + * Fetch the fragment between two milestones in an XML document + * bufferSize is important for better performance: each chunk in this size is + * matched against the regular expression, if it is too small or too high then + * performance could be bad + * @param docUriStr document URI (e.g. 
/db/shakespeare/hamlet.xml) + * @param msName milestone name (e.g.: pb) + * @param msPositionFrom first milestone (e.g.: 10) + * @param msPositionTo second milestone (e.g.: 11) + * @return fragment between the two milestones with msPositionFrom and msPositionTo + * @throws XPathException + */ + private String getFragmentBetween(String docUriStr, String msName, int msPositionFrom, int msPositionTo) throws XPathException { + int bufferSize = 16384; // performance: buffer size 4096 is 25% slower + String existHomeFilePath = getExistHomeFilePath(); + String docLocalFileName = existHomeFilePath + FS_DOC_CACHE_PATH + docUriStr; + /* + * find milestones: + * find milestones explicitly closed: blabla + * find milestones in multilines: + * + * find case insensitive and in multilines: Pattern.CASE_INSENSITIVE | Pattern.MULTILINE + */ + String regExprMsInternClosed = "<" + msName + "[^>]*?/>"; + String regExprMsExternClosed = "<" + msName + "[^>]*?>[^>]*?"; + String regExprMilestone = regExprMsInternClosed + "|" + regExprMsExternClosed; + Pattern p = Pattern.compile(regExprMilestone, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); // both flags enabled + String readBufferStr = ""; + char[] readBuffer = new char[bufferSize]; + String msFragmentBuffer = ""; + int msCount = 0; + String result = ""; + boolean eof = false; + String ms = ""; + try { + BufferedReader in = new BufferedReader(new FileReader(docLocalFileName)); + while (!eof && !(msCount >= msPositionTo)) { + int countReadChars = in.read(readBuffer, 0, bufferSize); + // last page: delivers all characters to the end in the document + if (countReadChars == -1) { + eof = true; + in.close(); + return ms + msFragmentBuffer; + } + readBufferStr = new String(readBuffer, 0, countReadChars); + msFragmentBuffer = msFragmentBuffer + readBufferStr; + Matcher m = p.matcher(msFragmentBuffer); + int fragmentBeginPos = 0; + while (m.find()) { + int msBeginPos = m.start(); + int msEndPos = m.end(); + // we have the milestone fragments 
(milestone end could be matched) one by one + // milestone end: cut the part in the last line before the milestone + String msFragment = ms + msFragmentBuffer.substring(fragmentBeginPos, msBeginPos); + // add result milestone fragments which are between msPositionFrom and msPositionTo + // last fragment in document (last page): is not added + if (msCount >= msPositionFrom && msCount < msPositionTo) { + result = result + msFragment; + } + fragmentBeginPos = msEndPos; + ms = msFragmentBuffer.substring(msBeginPos, msEndPos); + msCount++; // each found milestone increments the count of milestones + } + // delivers the portion after the last found milestone; this is used for the next msFragmentBuffer for matching + msFragmentBuffer = msFragmentBuffer.substring(fragmentBeginPos, msFragmentBuffer.length()); + } + in.close(); + } catch (IOException e) { + throw new XPathException(e); + } + return result; + } + + private String getNodePath(String docPath, String msName, int position) throws XPathException { + String query = + "let $ms := doc('" + docPath + "')//" + msName + "[" + position + "]/.. 
\n" + + "let $result := " + + " if ($ms) " + + " then util:node-xpath($ms)" + + " else (\"\") \n" + + "return $result"; + String nodePath = executeXQuery(query); + return nodePath; + } + + private int getCountMs(String docPath, String msName) throws XPathException { + int count = -1; + String query = "let $result := count(doc('" + docPath + "')//" + msName + ")" + "\n" + "return $result"; + String resultStr = executeXQuery(query); + count = new Integer(resultStr); + return count; + } + + /** + * A path name delivered by function xnode-path (with special strings such as + * "@", "[", "]", " eq ") is converted to an XML String with xml tags, + * opened or closed such as the mode says + * @param pathName delivered by function xnode-path: Example: /archimedes[@xmlns:xlink eq "http://www.w3.org/1999/xlink"]/text/body/chap/p[@type eq "main"]/s/foreign[@lang eq "en"] + * @param mode open or close + * @return xml tags opened or closed + */ + private String pathName2XmlTags(String pathName, String mode) { + String result = ""; + ArrayList elements = pathName2ElementsWithAttributes(pathName); + if (mode.equals("open")) { + for (int i=0; i < elements.size(); i++) { + String element = elements.get(i); + element = element.replaceAll("\\[", " "); // opening element: replace open bracket with space + element = element.replaceAll(" eq ", "="); // opening element: remove @ character + element = element.replaceAll("@", ""); // opening element: remove @ character + element = element.replaceAll("\\]", ""); // opening element: remove closing bracket + if (! (element.length() == 0)) + result += "<" + element + ">\n"; + } + } else if (mode.equals("close")) { + for (int i=elements.size()-1; i >= 0; i--) { + String element = elements.get(i); + element = element.replaceAll("\\[[^\\]]*\\]", ""); // closing element: remove brackets with attributes + if (! 
(element.length() == 0)) + result += "\n"; + } + } + return result; + } + + private ArrayList pathName2ElementsWithAttributes(String pathName) { + ArrayList result = new ArrayList(); + String regExpr = "/[^/]+\\[[^\\]]*\\]" + "|" + "/[^/\\[]+"; // pathName example: "/archimedes[@xmlns:xlink eq "http://www.w3.org/1999/xlink"]/text/body/chap/p[@type eq "main"]/s/foreign[@lang eq "en"]" + Pattern p = Pattern.compile(regExpr, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); // both flags enabled + Matcher m = p.matcher(pathName); + while (m.find()) { + int msBeginPos = m.start(); + int msEndPos = m.end(); + String elementName = pathName.substring(msBeginPos+1, msEndPos); // without first "/" character + result.add(elementName); + } + return result; + } + + private String getExistHomeFilePath() throws XPathException { + return context.getBroker().getConfiguration().getExistHome().getAbsolutePath(); + } + + private String executeXQuery(String xQueryStr) throws XPathException { + XQuery xQuery = context.getBroker().getXQueryService(); + CompiledXQuery compiledXQuery = xQuery.compile(context, xQueryStr); + Sequence sequence = compiledXQuery.eval(null); // without context + Item item = sequence.itemAt(0); + String nodeValueStr = item.getStringValue(); + return nodeValueStr; + } + + /** + * not yet used but useful in future + * @param docPath + * @return + * @throws XPathException + */ + private String getNamespaceString(String docPath) throws XPathException { + String query = + "let $elem := doc('" + docPath + "')/*" + "\n" + + "let $prefs := in-scope-prefixes($elem)" + "\n" + + "for $pref in $prefs" + "\n" + + " let $uri := namespace-uri-for-prefix($pref, $elem)" + "\n" + + " let $result := " + + " if ($pref = \"xml\") " + + " then ()" + "\n" + + " else concat(\"xmlns:\", $pref, \"="\", $uri, \""\") \n" + + "return $result"; + String resultStr = executeXQuery(query); + return resultStr; + } +} diff -r 000000000000 -r 408254cf2f1d 
software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/CheckUri.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/CheckUri.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,91 @@ +/* + * eXist Open Source Native XML Database: Extension module + * Copyright (C) 2008 Josef Willenborg + * jwillenborg@mpiwg-berlin.mpg.de + * http://www.mpiwg-berlin.mpg.de + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * $Id: $ + */ +package org.exist.xquery.modules.mpdldoc; + +import java.io.IOException; +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.methods.GetMethod; +import org.apache.commons.httpclient.params.HttpClientParams; +import org.apache.commons.httpclient.params.HttpMethodParams; +import org.exist.dom.QName; +import org.exist.xquery.BasicFunction; +import org.exist.xquery.Cardinality; +import org.exist.xquery.FunctionSignature; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.value.BooleanValue; +import org.exist.xquery.value.NumericValue; +import org.exist.xquery.value.Sequence; +import org.exist.xquery.value.SequenceType; +import org.exist.xquery.value.Type; + +/** + * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de) + */ +public class CheckUri extends BasicFunction { + + public final static FunctionSignature signature = + new FunctionSignature( + new QName("check-uri", MPDLDocModule.NAMESPACE_URI, MPDLDocModule.PREFIX), + "A function which checks the uri-string if it is available within a timeout value (in ms).", + new SequenceType[] { + new SequenceType(Type.STRING, Cardinality.ZERO_OR_ONE), + new SequenceType(Type.INTEGER, Cardinality.ZERO_OR_ONE) + }, + new SequenceType(Type.BOOLEAN, Cardinality.EXACTLY_ONE)); + + public CheckUri(XQueryContext context) { + super(context, signature); + } + + public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException { + Sequence isOk = BooleanValue.TRUE; + HttpClient httpClient = new HttpClient(); + GetMethod method = null; + try { + Sequence firstSeq = args[0]; + Sequence secondSeq = args[1]; + if (firstSeq.isEmpty()) + return isOk; + String uriStr = firstSeq.getStringValue(); + int milliseconds = 2000; // default value + if (! 
secondSeq.isEmpty()) { + NumericValue value = (NumericValue) secondSeq.convertTo(Type.NUMBER); + milliseconds = value.getInt(); + } + httpClient.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, new Integer(milliseconds)); + httpClient.getParams().setParameter(HttpClientParams.CONNECTION_MANAGER_TIMEOUT, new Long(milliseconds)); + method = new GetMethod(uriStr); + method.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, new Integer(milliseconds)); + method.setFollowRedirects(true); + httpClient.executeMethod(method); + } catch (IOException e) { + isOk = BooleanValue.FALSE; // if timeout exception is thrown + } finally { + if (method != null) { + method.releaseConnection(); + } + } + return isOk; + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/ESciDocLogin.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/ESciDocLogin.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,79 @@ +/* + * eXist Open Source Native XML Database: Extension module + * Copyright (C) 2008 Josef Willenborg + * jwillenborg@mpiwg-berlin.mpg.de + * http://www.mpiwg-berlin.mpg.de + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * $Id: TextModule.java $ + */ +package org.exist.xquery.modules.mpdldoc; + +import org.exist.dom.QName; +import org.exist.xquery.BasicFunction; +import org.exist.xquery.Cardinality; +import org.exist.xquery.FunctionSignature; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.value.Sequence; +import org.exist.xquery.value.SequenceType; +import org.exist.xquery.value.StringValue; +import org.exist.xquery.value.Type; +import org.exist.xquery.value.ValueSequence; + +import de.mpg.mpiwg.berlin.mpdl.escidoc.ESciDocRestSession; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +/** + * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de) + */ +public class ESciDocLogin extends BasicFunction { + + public final static FunctionSignature signature = + new FunctionSignature( + new QName("escidoc-login", MPDLDocModule.NAMESPACE_URI, MPDLDocModule.PREFIX), + "A function which delivers an eSciDoc cookie id for the given login name and password", + new SequenceType[] { + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE), + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE) + }, + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE)); + + public ESciDocLogin(XQueryContext context) { + super(context, signature); + } + + public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException { + try { + Sequence firstSeq = args[0]; + Sequence secondSeq = args[1]; + if (firstSeq.isEmpty() || secondSeq.isEmpty()) + return Sequence.EMPTY_SEQUENCE; + String userName = firstSeq.getStringValue(); + String pw = secondSeq.getStringValue(); + String eSciDocCookieId = ESciDocRestSession.login(userName, pw); + ValueSequence resultSequence = new ValueSequence(); + StringValue strValCookieId = new StringValue(""); + if (eSciDocCookieId != null) + strValCookieId = new StringValue(eSciDocCookieId); + resultSequence.add(strValCookieId); + return resultSequence; + } catch 
(ApplicationException e) { + throw new XPathException(e); + } + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/GetESciDocContainerIdByExistId.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/GetESciDocContainerIdByExistId.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,88 @@ +/* + * eXist Open Source Native XML Database: Extension module + * Copyright (C) 2008 Josef Willenborg + * jwillenborg@mpiwg-berlin.mpg.de + * http://www.mpiwg-berlin.mpg.de + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * $Id: TextModule.java $ + */ +package org.exist.xquery.modules.mpdldoc; + +import org.exist.dom.QName; +import org.exist.xquery.BasicFunction; +import org.exist.xquery.Cardinality; +import org.exist.xquery.FunctionSignature; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.value.Sequence; +import org.exist.xquery.value.SequenceType; +import org.exist.xquery.value.StringValue; +import org.exist.xquery.value.Type; +import org.exist.xquery.value.ValueSequence; + +import de.mpg.mpiwg.berlin.mpdl.escidoc.ESciDocRestSession; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +/** + * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de) + */ +public class GetESciDocContainerIdByExistId extends BasicFunction { + + public final static FunctionSignature signature = + new FunctionSignature( + new QName("escidoc-get-containerid", MPDLDocModule.NAMESPACE_URI, MPDLDocModule.PREFIX), + "A function which delivers the container id of the first argument: existId." + + "Second argument is the cookieId.", + new SequenceType[] { + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE), + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE) + }, + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE)); + + public GetESciDocContainerIdByExistId(XQueryContext context) { + super(context, signature); + } + + public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException { + try { + Sequence firstSeq = args[0]; + Sequence secondSeq = args[1]; + if (firstSeq.isEmpty() || secondSeq.isEmpty()) + return Sequence.EMPTY_SEQUENCE; + String firstSeqStrValue = firstSeq.getStringValue(); + String existId = null; + if (! firstSeqStrValue.equals("")) + existId = firstSeqStrValue; + String secondSeqStrValue = secondSeq.getStringValue(); + String eSciDocCookieId = null; + if (! 
secondSeqStrValue.equals("")) + eSciDocCookieId = secondSeqStrValue; + ESciDocRestSession eSciDocSession = ESciDocRestSession.getInstance(eSciDocCookieId); + String eScidDocContainerId = eSciDocSession.getContainerIdByEXistId(existId); + ValueSequence result = new ValueSequence(); + if (eScidDocContainerId != null) { + result.add(new StringValue(eScidDocContainerId)); + } else { + result.add(new StringValue("")); + } + return result; + } catch (ApplicationException e) { + throw new XPathException(e); + } + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/GetESciDocs.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/GetESciDocs.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,119 @@ +/* + * eXist Open Source Native XML Database: Extension module + * Copyright (C) 2008 Josef Willenborg + * jwillenborg@mpiwg-berlin.mpg.de + * http://www.mpiwg-berlin.mpg.de + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * $Id: TextModule.java $ + */ +package org.exist.xquery.modules.mpdldoc; + +import java.util.ArrayList; + +import org.exist.dom.QName; +import org.exist.memtree.DocumentImpl; +import org.exist.memtree.MemTreeBuilder; +import org.exist.xquery.BasicFunction; +import org.exist.xquery.Cardinality; +import org.exist.xquery.FunctionSignature; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.value.Sequence; +import org.exist.xquery.value.SequenceType; +import org.exist.xquery.value.Type; + +import de.mpg.mpiwg.berlin.mpdl.escidoc.ESciDocRestSession; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; + +/** + * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de) + */ +public class GetESciDocs extends BasicFunction { + + public final static FunctionSignature signature = + new FunctionSignature( + new QName("escidoc-get-docs", MPDLDocModule.NAMESPACE_URI, MPDLDocModule.PREFIX), + "A function which delivers all eSciDoc documents restricted to the first argument: docbase." + + "Second argument is the cookieId.", + new SequenceType[] { + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE), + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE) + }, + new SequenceType(Type.NODE, Cardinality.EXACTLY_ONE)); + + public GetESciDocs(XQueryContext context) { + super(context, signature); + } + + public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException { + try { + Sequence firstSeq = args[0]; + Sequence secondSeq = args[1]; + if (firstSeq.isEmpty() || secondSeq.isEmpty()) + return Sequence.EMPTY_SEQUENCE; + String firstSeqStrValue = firstSeq.getStringValue(); + String docBase = null; + if (! 
firstSeqStrValue.equals("")) + docBase = firstSeqStrValue; + String docBaseContainerId = MpdlConstants.MPDL_ESCIDOC_ECHO_CONTAINER_ID; + if (docBase != null && docBase.equals("archimedes")) + docBaseContainerId = MpdlConstants.MPDL_ESCIDOC_ARCHIMEDES_CONTAINER_ID; + String secondSeqStrValue = secondSeq.getStringValue(); + String eSciDocCookieId = null; + if (! secondSeqStrValue.equals("")) + eSciDocCookieId = secondSeqStrValue; + ESciDocRestSession eSciDocSession = ESciDocRestSession.getInstance(eSciDocCookieId); + String containerXmlStr = eSciDocSession.getContainer(docBaseContainerId); + ArrayList containerIdsOfDocBaseContainer = eSciDocSession.getContainerIds(containerXmlStr); + ArrayList containerTitlesOfDocBaseContainer = eSciDocSession.getContainerTitles(containerXmlStr); + DocumentImpl doc = null; + if (containerIdsOfDocBaseContainer != null) { + MemTreeBuilder builder = context.getDocumentBuilder(); + builder.startElement("", "documents", "documents", null); + for (int i=0; i 0 && endIndex > 0) { + existId = containerTitle.substring(beginIndex + 13, endIndex + 4); + builder.characters(existId); + } + } + builder.endElement(); + builder.endElement(); + } + builder.endElement(); + doc = ((DocumentImpl)builder.getDocument()); + } else { + return Sequence.EMPTY_SEQUENCE; + } + return doc; + } catch (ApplicationException e) { + throw new XPathException(e); + } + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/GetJobs.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/GetJobs.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,248 @@ +/* + * eXist Open Source Native XML Database: Extension module + * Copyright (C) 2008 Josef Willenborg + * jwillenborg@mpiwg-berlin.mpg.de + * http://www.mpiwg-berlin.mpg.de + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public 
License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * $Id: TextModule.java $ + */ +package org.exist.xquery.modules.mpdldoc; + +import java.util.ArrayList; +import java.util.Date; + +import org.exist.dom.QName; +import org.exist.http.servlets.RequestWrapper; +import org.exist.memtree.DocumentImpl; +import org.exist.memtree.MemTreeBuilder; +import org.exist.xquery.BasicFunction; +import org.exist.xquery.Cardinality; +import org.exist.xquery.FunctionSignature; +import org.exist.xquery.Variable; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.functions.request.RequestModule; +import org.exist.xquery.value.JavaObjectValue; +import org.exist.xquery.value.Sequence; +import org.exist.xquery.value.SequenceType; +import org.exist.xquery.value.Type; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; +import de.mpg.mpiwg.berlin.mpdl.schedule.MpdlChainScheduler; +import de.mpg.mpiwg.berlin.mpdl.schedule.MpdlDocOperation; + +/** + * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de) + */ +public class GetJobs extends BasicFunction { + + public final static FunctionSignature signature = + new FunctionSignature( + new QName("get-jobs", MPDLDocModule.NAMESPACE_URI, MPDLDocModule.PREFIX), + "A function which delivers all jobs or the job given by an id.", + new SequenceType[] { new 
SequenceType(Type.STRING, Cardinality.EXACTLY_ONE)}, + new SequenceType(Type.NODE, Cardinality.EXACTLY_ONE)); + + public GetJobs(XQueryContext context) { + super(context, signature); + } + + public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException { + try { + Sequence firstSeq = args[0]; + if (firstSeq.isEmpty()) + return Sequence.EMPTY_SEQUENCE; + String firstSeqStrValue = firstSeq.getStringValue(); + boolean getAllJobs = false; + if (firstSeqStrValue.equals("all")) + getAllJobs = true; + MpdlChainScheduler scheduler = MpdlChainScheduler.getInstance(); + ArrayList docOperations = new ArrayList(); + if (getAllJobs) { + docOperations = scheduler.getDocOperations(); + } else { + String jobIdStr = firstSeq.getStringValue(); + int jobId = Integer.parseInt(jobIdStr); + MpdlDocOperation docOperation = scheduler.getDocOperation(jobId); + if (docOperation != null) + docOperations.add(docOperation); + } + DocumentImpl doc = null; + if ((getAllJobs && ! docOperations.isEmpty()) || (! getAllJobs && ! 
docOperations.isEmpty())) { + MemTreeBuilder builder = context.getDocumentBuilder(); + builder.startElement("", "mpdl-doc-operations", "mpdl-doc-operations", null); + for (int i=0; i", "class=\"singlePage\">"); + byte[] pdfBytes = mpdlRenderer.createPdf(singlePageStr, language, topLeftStr, topRightStr, bottomLeftStr, bottomRightStr); + + return new Base64Binary(pdfBytes); + } catch (ApplicationException e) { + throw new XPathException(e.getMessage()); + } catch (SAXException e) { + throw new XPathException(e); + } + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/MPDLDocModule.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/MPDLDocModule.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,64 @@ +/* + * eXist Open Source Native XML Database: Extension module + * Copyright (C) 2008 Josef Willenborg + * jwillenborg@mpiwg-berlin.mpg.de + * http://www.mpiwg-berlin.mpg.de + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * $Id: TextModule.java $ + */ +package org.exist.xquery.modules.mpdldoc; + +import org.exist.xquery.AbstractInternalModule; +import org.exist.xquery.FunctionDef; + +/** + * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de) + */ +public class MPDLDocModule extends AbstractInternalModule { + public final static String NAMESPACE_URI = "http://exist-db.org/xquery/mpdldoc"; + public final static String PREFIX = "mpdldoc"; + + private final static FunctionDef[] functions = { + new FunctionDef(CheckUri.signature, CheckUri.class), + new FunctionDef(MpdlDocOperationStarter.signature, MpdlDocOperationStarter.class), + new FunctionDef(ESciDocLogin.signature, ESciDocLogin.class), + new FunctionDef(GetESciDocs.signature, GetESciDocs.class), + new FunctionDef(GetJobs.signature, GetJobs.class), + new FunctionDef(GetESciDocContainerIdByExistId.signature, GetESciDocContainerIdByExistId.class), + new FunctionDef(Html2Pdf.signature, Html2Pdf.class) + }; + + public MPDLDocModule() { + super(functions); + } + + public String getNamespaceURI() { + return NAMESPACE_URI; + } + + public String getDefaultPrefix() { + return PREFIX; + } + + public String getDescription() { + return "A module for document functions"; + } + + public String getReleaseVersion() { + return "A module for document functions"; + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/MpdlDocOperationStarter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/MpdlDocOperationStarter.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,97 @@ +/* + * eXist Open Source Native XML Database: Extension module + * Copyright (C) 2008 Josef Willenborg + * jwillenborg@mpiwg-berlin.mpg.de + * http://www.mpiwg-berlin.mpg.de + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software 
Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * $Id: TextModule.java $ + */ +package org.exist.xquery.modules.mpdldoc; + +import org.exist.dom.QName; +import org.exist.xquery.BasicFunction; +import org.exist.xquery.Cardinality; +import org.exist.xquery.FunctionSignature; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.value.Sequence; +import org.exist.xquery.value.SequenceType; +import org.exist.xquery.value.StringValue; +import org.exist.xquery.value.Type; +import org.exist.xquery.value.ValueSequence; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.schedule.MpdlChainScheduler; +import de.mpg.mpiwg.berlin.mpdl.schedule.MpdlDocOperation; + +/** + * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de) + */ +public class MpdlDocOperationStarter extends BasicFunction { + + public final static FunctionSignature signature = + new FunctionSignature( + new QName("do", MPDLDocModule.NAMESPACE_URI, MPDLDocModule.PREFIX), + "A function which performs/schedules an operation (insert, update or delete) for the given source file (given by local file path) " + + "to the destination (given by document base, language and fileName) both to eSciDoc and eXist." 
+ + "It returns a node which contains information about the scheduled operation (jobId etc.).", + new SequenceType[] { new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE), + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE), + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE), + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE), + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE), + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE), + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE) }, + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE)); + + public MpdlDocOperationStarter(XQueryContext context) { + super(context, signature); + } + + public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException { + Sequence firstSeq = args[0]; + Sequence secondSeq = args[1]; + Sequence thirdSeq = args[2]; + Sequence fourthSeq = args[3]; + Sequence fifthSeq = args[4]; + Sequence sixthSeq = args[5]; + Sequence seventhSeq = args[6]; + if (firstSeq.isEmpty() || secondSeq.isEmpty() || thirdSeq.isEmpty() || fourthSeq.isEmpty() || fifthSeq.isEmpty() || sixthSeq.isEmpty() || seventhSeq.isEmpty()) + return Sequence.EMPTY_SEQUENCE; + String operationName = firstSeq.getStringValue(); + String srcUrl = secondSeq.getStringValue(); + String uploadFileName = thirdSeq.getStringValue(); + String docBase = fourthSeq.getStringValue(); + String language = fifthSeq.getStringValue(); + String fileName = sixthSeq.getStringValue(); + String eSciDocCookieId = seventhSeq.getStringValue(); + MpdlDocOperation docOperation = new MpdlDocOperation(operationName, srcUrl, uploadFileName, docBase, language, fileName); + docOperation.setESciDocCookieId(eSciDocCookieId); + try { + MpdlChainScheduler scheduler = MpdlChainScheduler.getInstance(); + docOperation = scheduler.doOperation(docOperation); + } catch (ApplicationException e) { + throw new XPathException(e); + } + int jobId = docOperation.getOrderId(); + ValueSequence resultSequence = new 
ValueSequence(); + StringValue strValItems = new StringValue(String.valueOf(jobId)); + resultSequence.add(strValItems); + return resultSequence; + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/Dictionarize.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/Dictionarize.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,97 @@ +/* + * eXist Open Source Native XML Database: Extension module + * Copyright (C) 2008 Josef Willenborg + * jwillenborg@mpiwg-berlin.mpg.de + * http://www.mpiwg-berlin.mpg.de + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * $Id: TextModule.java $ + */ +package org.exist.xquery.modules.mpdltext; + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; + +import org.exist.dom.QName; +import org.exist.xquery.BasicFunction; +import org.exist.xquery.Cardinality; +import org.exist.xquery.FunctionSignature; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.value.Sequence; +import org.exist.xquery.value.SequenceType; +import org.exist.xquery.value.StringValue; +import org.exist.xquery.value.Type; +import org.exist.xquery.value.ValueSequence; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.doc.DictionarizerContentHandler; + +/** + * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de) + */ +public class Dictionarize extends BasicFunction { + + public final static FunctionSignature signature = + new FunctionSignature( + new QName("dictionarize", MPDLTextModule.NAMESPACE_URI, MPDLTextModule.PREFIX), + "A function which dictionarize the given xml fragment string of the given language." 
+ + "Result is xml fragment which contains the original xml fragment enriched by " + + "a word tag for each word which contains attributes for the lemma and pollux dictionary.", + new SequenceType[] { new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE), + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE)}, + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE)); + + public Dictionarize(XQueryContext context) { + super(context, signature); + } + + public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException { + Sequence xmlFragmentSeq = args[0]; + Sequence languageSeq = args[1]; + String xmlFragment = ""; + String language = ""; + if (xmlFragmentSeq.isEmpty() || languageSeq.isEmpty()) + return Sequence.EMPTY_SEQUENCE; + xmlFragment = xmlFragmentSeq.getStringValue(); + language = languageSeq.getStringValue(); + String outputXmlFragment = null; + try { + DictionarizerContentHandler dictContentHandler = new DictionarizerContentHandler(language); + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(dictContentHandler); + Reader stringReaderXmlFragment = new StringReader(xmlFragment); + InputSource input = new InputSource(stringReaderXmlFragment); + xmlParser.parse(input); + outputXmlFragment = dictContentHandler.getXmlFragment(); + } catch (ApplicationException e) { + throw new XPathException(e); + } catch (IOException e) { + throw new XPathException(e); + } catch (SAXException e) { + throw new XPathException(e); + } + ValueSequence result = new ValueSequence(); + result.add(new StringValue(outputXmlFragment)); + return result; + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/EncodeBig5.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/EncodeBig5.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,81 @@ +/* + * eXist Open Source Native XML Database: Extension module + * Copyright (C) 2008 
Josef Willenborg + * jwillenborg@mpiwg-berlin.mpg.de + * http://www.mpiwg-berlin.mpg.de + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * $Id: TextModule.java $ + */ +package org.exist.xquery.modules.mpdltext; + +import java.io.UnsupportedEncodingException; + +import org.exist.dom.QName; +import org.exist.xquery.BasicFunction; +import org.exist.xquery.Cardinality; +import org.exist.xquery.FunctionSignature; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.value.Sequence; +import org.exist.xquery.value.SequenceType; +import org.exist.xquery.value.StringValue; +import org.exist.xquery.value.Type; +import org.exist.xquery.value.ValueSequence; + +public class EncodeBig5 extends BasicFunction { + + public final static FunctionSignature signature = + new FunctionSignature( + new QName("encode-big5", MPDLTextModule.NAMESPACE_URI, MPDLTextModule.PREFIX), + "A function which delivers an encoded translation of the big5 input string", + new SequenceType[] { new SequenceType(Type.STRING, Cardinality.ZERO_OR_MORE) }, + new SequenceType(Type.STRING, Cardinality.ZERO_OR_MORE)); + + public EncodeBig5(XQueryContext context) { + super(context, signature); + } + + public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException 
{ + Sequence seqBig5InputStr = args[0]; + String big5InputStr = ""; + String charset = "big5"; + if (seqBig5InputStr.isEmpty()) + return Sequence.EMPTY_SEQUENCE; + else + big5InputStr = seqBig5InputStr.getStringValue(); + ValueSequence result = null; + String resultStr = ""; + try { + byte[] resultBytes = big5InputStr.getBytes(charset); + for (int i=0; i < resultBytes.length; i++) { + byte b = resultBytes[i]; + int unsigned = unsignedByteToInt(b); + String hexStr = Integer.toHexString(unsigned); + resultStr = resultStr + "%" + hexStr; + } + result = new ValueSequence(); + result.add(new StringValue(resultStr)); + } catch (UnsupportedEncodingException e) { + + } + return result; + } + + private int unsignedByteToInt(byte b) { + return (int) b & 0xFF; + } +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/GetBig5EncodedTerms.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/GetBig5EncodedTerms.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,118 @@ +/* + * eXist Open Source Native XML Database: Extension module + * Copyright (C) 2008 Josef Willenborg + * jwillenborg@mpiwg-berlin.mpg.de + * http://www.mpiwg-berlin.mpg.de + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * $Id: TextModule.java $ + */ +package org.exist.xquery.modules.mpdltext; + +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; + +import org.exist.dom.QName; +import org.exist.memtree.DocumentImpl; +import org.exist.memtree.MemTreeBuilder; +import org.exist.xquery.BasicFunction; +import org.exist.xquery.Cardinality; +import org.exist.xquery.FunctionSignature; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.value.Sequence; +import org.exist.xquery.value.SequenceType; +import org.exist.xquery.value.Type; + +/** + * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de) + */ +public class GetBig5EncodedTerms extends BasicFunction { + + public final static FunctionSignature signature = + new FunctionSignature( + new QName("get-big5-encoded-terms", MPDLTextModule.NAMESPACE_URI, MPDLTextModule.PREFIX), + "bla bla", + new SequenceType[] { new SequenceType(Type.STRING, Cardinality.ZERO_OR_MORE) }, + new SequenceType(Type.NODE, Cardinality.EXACTLY_ONE)); + + public GetBig5EncodedTerms(XQueryContext context) { + super(context, signature); + } + + public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException { + Sequence luceneQueryStringSeq = args[0]; + String luceneQueryString = ""; + if (luceneQueryStringSeq.isEmpty()) + return Sequence.EMPTY_SEQUENCE; + luceneQueryString = luceneQueryStringSeq.getStringValue(); + ArrayList queryTerms = getTermsFromLuceneQuery(luceneQueryString); + int size = queryTerms.size(); + MemTreeBuilder builder = context.getDocumentBuilder(); + builder.startElement("", "big5-mappings", "big5-mappings", null); + for (int i=0; i getTermsFromLuceneQuery(String queryString) { + ArrayList terms = new ArrayList(); + String[] variantTokens = queryString.split(" "); // TODO throw the phrases away (e.g.: "bla bla bla") + for (int i = 0; i < variantTokens.length; i++) { + String token = variantTokens[i]; + if (! 
(token.contains("*") || token.contains("?") || token.contains("~") || token.contains("-") || token.contains("+") || token.contains("^") || token.contains("OR") || token.contains("AND") || token.contains("NOT"))) { + terms.add(token); + } + } + return terms; + } + + private String encodeBig5(String inputStr) { + String resultStr = ""; + String charset = "big5"; + try { + byte[] resultBytes = inputStr.getBytes(charset); + for (int i=0; i < resultBytes.length; i++) { + byte b = resultBytes[i]; + int unsigned = unsignedByteToInt(b); + String hexStr = Integer.toHexString(unsigned); + resultStr = resultStr + "%" + hexStr; + } + } catch (UnsupportedEncodingException e) { + + } + return resultStr; + } + + private int unsignedByteToInt(byte b) { + return (int) b & 0xFF; + } + +} diff -r 000000000000 -r 408254cf2f1d software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/GetDonatusQueryVariants.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/GetDonatusQueryVariants.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,92 @@ +/* + * eXist Open Source Native XML Database: Extension module + * Copyright (C) 2008 Josef Willenborg + * jwillenborg@mpiwg-berlin.mpg.de + * http://www.mpiwg-berlin.mpg.de + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * $Id: TextModule.java $ + */ +package org.exist.xquery.modules.mpdltext; + +import java.util.ArrayList; + +import org.exist.dom.QName; +import org.exist.xquery.BasicFunction; +import org.exist.xquery.Cardinality; +import org.exist.xquery.FunctionSignature; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.value.Sequence; +import org.exist.xquery.value.SequenceType; +import org.exist.xquery.value.StringValue; +import org.exist.xquery.value.Type; +import org.exist.xquery.value.ValueSequence; + +import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusCache; +import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusVariant; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +/** + * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de) + */ +public class GetDonatusQueryVariants extends BasicFunction { + + public final static FunctionSignature signature = + new FunctionSignature( + new QName("get-donatus-query-variants", MPDLTextModule.NAMESPACE_URI, MPDLTextModule.PREFIX), + "A function which delivers morphological variants (seperated by |) of a given Lucene query string of a given " + + "language over the lemma of that variant by the Donatus language technology", + new SequenceType[] { new SequenceType(Type.STRING, Cardinality.ZERO_OR_MORE), + new SequenceType(Type.STRING, Cardinality.ZERO_OR_MORE) }, + new SequenceType(Type.STRING, Cardinality.ZERO_OR_MORE)); + + public GetDonatusQueryVariants(XQueryContext context) { + super(context, signature); + } + + public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException { + Sequence langSeq = args[0]; + Sequence luceneQueryStringSeq = args[1]; + String language = ""; + String luceneQueryString = ""; + if 
(langSeq.isEmpty() || luceneQueryStringSeq.isEmpty()) + return Sequence.EMPTY_SEQUENCE; + language = langSeq.getStringValue(); + luceneQueryString = luceneQueryStringSeq.getStringValue(); + ArrayList resultVariants = null; + try { + DonatusCache donatusCache = DonatusCache.getInstance(); + resultVariants = donatusCache.getQueryVariants(language, luceneQueryString); + } catch (ApplicationException e) { + throw new XPathException(e); + } + ValueSequence result = new ValueSequence(); + String resultStr = ""; + int size = resultVariants.size(); + for (int i=0; i lemmas = null; + try { + MorphologyCache morphologyCache = MorphologyCache.getInstance(); + Lemma lemma = morphologyCache.getLemma(language, formName, true); + if (lemma == null) { + lemma = new Lemma("standard analyzer", language, formName); // delivers at least one lemma with one form (with the formName) + } + lemmas = new ArrayList(); + lemmas.add(lemma); + } catch (ApplicationException e) { + throw new XPathException(e); + } + DocumentImpl doc = null; + if (lemmas != null) { + MemTreeBuilder builder = context.getDocumentBuilder(); + builder.startElement("", "lemmas", "lemmas", null); + for (int i=0; i forms = lemma.getFormsList(); + builder.startElement("", "forms-size", "forms-size", null); + builder.characters(String.valueOf(forms.size())); + builder.endElement(); + builder.startElement("", "forms", "forms", null); + for (int j=0; j lemmas = null; + try { + MorphologyCache morphologyCache = MorphologyCache.getInstance(); + lemmas = morphologyCache.getLemmasByFormName(language, formName, true); + if (lemmas == null || lemmas.isEmpty()) { + Lemma lemma = new Lemma("standard analyzer", language, formName); // delivers at least one lemma with one form (with the formName) + lemmas = new ArrayList(); + lemmas.add(lemma); + } + } catch (ApplicationException e) { + throw new XPathException(e); + } + DocumentImpl doc = null; + if (lemmas != null) { + MemTreeBuilder builder = context.getDocumentBuilder(); + 
builder.startElement("", "lemmas", "lemmas", null); + for (int i=0; i forms = lemma.getFormsList(); + builder.startElement("", "forms-size", "forms-size", null); + builder.characters(String.valueOf(forms.size())); + builder.endElement(); + builder.startElement("", "forms", "forms", null); + for (int j=0; j lemmas = null; + try { + MorphologyCache morphologyCache = MorphologyCache.getInstance(); + lemmas = morphologyCache.getLemmasByLuceneQuery(language, luceneQueryStr, true); + } catch (ApplicationException e) { + throw new XPathException(e); + } + DocumentImpl doc = null; + if (lemmas != null) { + MemTreeBuilder builder = context.getDocumentBuilder(); + builder.startElement("", "lemmas", "lemmas", null); + for (int i=0; i forms = lemma.getFormsList(); + builder.startElement("", "forms-size", "forms-size", null); + builder.characters(String.valueOf(forms.size())); + builder.endElement(); + builder.startElement("", "forms", "forms", null); + for (int j=0; j lemmas = morphologyCache.getLemmasByFormName(language, formName, true); + if (lemmas != null && ! lemmas.isEmpty()) { + lemmasStr = ""; + for (int j=0; j lexicons = null; + try { + ArrayList statLexicons = Lexica.getInstance().getLexicons(language); + if (statLexicons != null) { + LexHandler lexHandler = LexHandler.getInstance(); + for (int i=0; i(); + lexicons.add(lexicon); + } + } + } + } catch (ApplicationException e) { + throw new XPathException(e); + } + DocumentImpl doc = null; + if (lexicons != null) { + MemTreeBuilder builder = context.getDocumentBuilder(); + builder.startElement("", "lexica", "lexica", null); + for (int i=0; i entries = lexicon.getEntries(); + for (int j=0; j lexicons = null; + try { + ArrayList statLexicons = Lexica.getInstance().getLexicons(language); + if (statLexicons != null) { + LexHandler lexHandler = LexHandler.getInstance(); + for (int i=0; i lexQueryVariants = luceneUtil.getVariantsFromLuceneQuery(luceneQueryStr); + for (int j=0; j(); + } + } + if (! 
lexicon.isEmpty()) + lexicons.add(lexicon); + } + } + } catch (ApplicationException e) { + throw new XPathException(e); + } + DocumentImpl doc = null; + if (lexicons != null) { + MemTreeBuilder builder = context.getDocumentBuilder(); + builder.startElement("", "lexica", "lexica", null); + for (int i=0; i entries = lexicon.getEntries(); + for (int j=0; j lexEntryKeys = lexHandler.getLexEntryKeys(formName, language, true); + if (lexEntryKeys != null) { + lexEntryKeysStr = ""; + for (int j=0; j resultVariants = null; + try { + MorphologyCache morphologyCache = MorphologyCache.getInstance(); + resultVariants = morphologyCache.getFormsByLuceneQuery(language, luceneQueryString, true); + } catch (ApplicationException e) { + throw new XPathException(e); + } + ValueSequence result = new ValueSequence(); + String resultStr = ""; + int size = resultVariants.size(); + MpdlNormalizer normalizer = new MpdlNormalizer(language); + for (int i=0; i regOrigForms = regManager.getRegOrigsByNormLuceneQueryString(language, luceneQueryString); + if (regOrigForms != null && regOrigForms.size() > 0) { + regOrigFormsFound = true; + for (int i=0; i resultVariants = morphologyCache.getFormsByLuceneQuery(language, luceneQueryString, true); + for (int i=0; i 0) { + regOrigFormsFound = true; + for (int j=0; j 0) { + builder.startElement("", "toc-entry", "toc-entry", null); + for (int j=0; j nodeLevel) { + if (nodeLevel == 1) { + level2 = 0; level3 = 0; level4 = 0; level5 = 0; level6 = 0; level7 = 0; + } else if (nodeLevel == 2) { + level3 = 0; level4 = 0; level5 = 0; level6 = 0; level7 = 0; + } else if (nodeLevel == 3) { + level4 = 0; level5 = 0; level6 = 0; level7 = 0; + } else if (nodeLevel == 4) { + level5 = 0; level6 = 0; level7 = 0; + } else if (nodeLevel == 5) { + level6 = 0; level7 = 0; + } else if (nodeLevel == 6) { + level7 = 0; + } + } + level = nodeLevel; + if (level == 1) + level1++; + else if (level == 2) + level2++; + else if (level == 3) + level3++; + else if (level == 4) + 
level4++; + else if (level == 5) + level5++; + else if (level == 6) + level6++; + else if (level == 7) + level7++; + } + } + String levelString = ""; + if (level1 != 0) + levelString += level1 + "."; + if (level2 != 0) + levelString += level2 + "."; + if (level3 != 0) + levelString += level3 + "."; + if (level4 != 0) + levelString += level4 + "."; + if (level5 != 0) + levelString += level5 + "."; + if (level6 != 0) + levelString += level6 + "."; + if (level7 != 0) + levelString += level7 + "."; + + builder.startElement("", "level-string", "level-string", null); + builder.characters(levelString); + builder.endElement(); + + String[] splitStr = levelString.split("\\."); + int realLevel = splitStr.length; + builder.startElement("", "real-level", "real-level", null); + builder.characters("" + realLevel); + builder.endElement(); + builder.endElement(); + } + } + builder.endElement(); + doc = ((DocumentImpl)builder.getDocument()); + return doc; + } + + public Sequence evalAsPersistentSeq(Sequence[] args, Sequence contextSequence) throws XPathException { + Sequence tocEntriesSeq = args[0]; + if (tocEntriesSeq.isEmpty()) + return Sequence.EMPTY_SEQUENCE; + int level = 0; + int level1 = 0; + int level2 = 0; + int level3 = 0; + int level4 = 0; + int level5 = 0; + int level6 = 0; + int level7 = 0; + DocumentImpl doc = null; + MemTreeBuilder builder = context.getDocumentBuilder(); + builder.startElement("", "toc-entries", "toc-entries", null); + for (int i=0; i 0) { + builder.startElement("", "toc-entry", "toc-entry", null); + for (int j=0; j nodeLevel) { + if (nodeLevel == 1) { + level2 = 0; level3 = 0; level4 = 0; level5 = 0; level6 = 0; level7 = 0; + } else if (nodeLevel == 2) { + level3 = 0; level4 = 0; level5 = 0; level6 = 0; level7 = 0; + } else if (nodeLevel == 3) { + level4 = 0; level5 = 0; level6 = 0; level7 = 0; + } else if (nodeLevel == 4) { + level5 = 0; level6 = 0; level7 = 0; + } else if (nodeLevel == 5) { + level6 = 0; level7 = 0; + } else if (nodeLevel == 6) { 
+ level7 = 0; + } + } + level = nodeLevel; + if (level == 1) + level1++; + else if (level == 2) + level2++; + else if (level == 3) + level3++; + else if (level == 4) + level4++; + else if (level == 5) + level5++; + else if (level == 6) + level6++; + else if (level == 7) + level7++; + } + } + String levelString = ""; + if (level1 != 0) + levelString += level1 + "."; + if (level2 != 0) + levelString += level2 + "."; + if (level3 != 0) + levelString += level3 + "."; + if (level4 != 0) + levelString += level4 + "."; + if (level5 != 0) + levelString += level5 + "."; + if (level6 != 0) + levelString += level6 + "."; + if (level7 != 0) + levelString += level7 + "."; + + builder.startElement("", "level-string", "level-string", null); + builder.characters(levelString); + builder.endElement(); + + String[] splitStr = levelString.split("\\."); + int realLevel = splitStr.length; + builder.startElement("", "real-level", "real-level", null); + builder.characters("" + realLevel); + builder.endElement(); + builder.endElement(); + } + } + builder.endElement(); + doc = ((DocumentImpl)builder.getDocument()); + return doc; + } + +}