diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusCacheOld.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusCacheOld.java	Wed Nov 24 17:24:23 2010 +0100
@@ -0,0 +1,327 @@
+package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.io.UnsupportedEncodingException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Date;
+
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+import com.sleepycat.je.Cursor;
+import com.sleepycat.je.Database;
+import com.sleepycat.je.DatabaseEntry;
+import com.sleepycat.je.DatabaseException;
+import com.sleepycat.je.LockMode;
+import com.sleepycat.je.OperationStatus;
+import com.sleepycat.je.Transaction;
+import com.sun.org.apache.xerces.internal.parsers.SAXParser;
+
+import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer;
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.util.FileUtil;
+import de.mpg.mpiwg.berlin.mpdl.util.Util;
+
+public class DonatusCacheOld {
+  private static DonatusCacheOld instance;
+  private DonatusBerkeleyDbEnv berkeleyDBEnv = null;
+  private Date state = null;  // last time the cache is written
+
+  // for performance reasons these variables are needed
+  public static int QUERY_MODE = 0;
+  public static int DOCUMENT_MODE = 1;
+  protected int mode = QUERY_MODE;
+  // for performance reasons the cache contains a donatusMorphologyDocument which 
+  // caches all lemmas for one document (in DOCUMENT_MODE) 
+  private DonatusMorphologyDocument donatusMorphologyDocument = null;
+
+  public static DonatusCacheOld getInstance() throws ApplicationException {
+    if (instance == null) {
+      instance = new DonatusCacheOld();
+      instance.init();
+    }
+    return instance;
+  }
+
+  /**
+   * Creates the Berkeley DB environment wrapper, sets it up and records the
+   * current time as the cache state.
+   *
+   * @throws ApplicationException wrapping any DatabaseException raised while
+   *         creating or setting up the environment
+   */
+  private void init() throws ApplicationException {
+    try {
+      this.berkeleyDBEnv = new DonatusBerkeleyDbEnv();
+      // the "false" argument opens the databases in read/write mode
+      this.berkeleyDBEnv.setup(false);
+      this.state = new Date();
+    } catch (DatabaseException dbFailure) {
+      throw new ApplicationException(dbFailure);
+    }
+  }
+  
+  /**
+   * Returns the mode the cache currently works in.
+   *
+   * @return the value last passed to setMode(int), or QUERY_MODE if it was never called
+   */
+  public int getMode() {
+    return this.mode;
+  }
+  
+  /**
+   * Switches the cache mode. Switching to QUERY_MODE also drops the
+   * morphology document held for DOCUMENT_MODE lookups.
+   *
+   * @param newMode the mode to use from now on (QUERY_MODE or DOCUMENT_MODE)
+   */
+  public void setMode(int newMode) {
+    mode = newMode;
+    if (QUERY_MODE == newMode) {
+      // reset the morphology document
+      donatusMorphologyDocument = null;
+    }
+  }
+  
+  /**
+   * Closes the Berkeley DB environment wrapper held by this cache.
+   */
+  public void close() {
+    this.berkeleyDBEnv.close();
+  }
+  
+  // TODO invoke via the RPC API: execute(String path, HashMap parameters); dedicated MPDL function for administering BerkeleyDB: org.exist.xquery.modules.mpdldb.BerkeleyDBAdmin
+  /**
+   * Asks the Berkeley DB environment wrapper to remove its databases and
+   * records the current time as the new cache state.
+   */
+  public void deleteCache() {
+    this.berkeleyDBEnv.removeDatabases();
+    this.state = new Date();
+  }
+  
+  /**
+   * Analyzes the sentences of one document, keeps the resulting morphology document
+   * in this cache, writes its lemmas to BerkeleyDB and saves two versioned XML files
+   * (morphology document and wtag document) below
+   * DonatusConstants.BERKELEY_DB_DIR + "/donatusAnalyzedFiles".
+   *
+   * Progress and timing information is printed to System.out.
+   *
+   * @param analyzer  analyzer handed to the DonatusHandler
+   * @param docUri    URI of the document; must be parseable by java.net.URL, its path
+   *                  part determines the names of the saved files
+   * @param sentences sentences of the document that are to be analyzed
+   * @throws ApplicationException if docUri is malformed, if saving a file fails, or if
+   *         analysis or writing the lemmas fails
+   */
+  public void cacheLemmas(DonatusAnalyzer analyzer, String docUri, ArrayList<String> sentences) throws ApplicationException {
+    try {
+      Util util = new Util();
+      Date beginOfOperation1 = new Date();
+      URL url = new URL(docUri);
+      String path = url.getPath();
+      System.out.print("Indexing: " + path + " Donatus-Analyze ... ");
+      DonatusHandler donatusHandler = new DonatusHandler(analyzer);
+      donatusMorphologyDocument = donatusHandler.analyze(docUri, sentences);
+      Date endOfOperation1 = new Date();
+      Double elapsedTime1 = util.getSecondWithMillisecondsBetween(beginOfOperation1, endOfOperation1);
+      System.out.print(elapsedTime1 + " sec ... Writing lemmas to BerkeleyDB ... ");
+      Date beginOfOperation2 = new Date();
+      writeLemmas(donatusMorphologyDocument);
+      Date endOfOperation2 = new Date();
+      Double elapsedTime2 = util.getSecondWithMillisecondsBetween(beginOfOperation2, endOfOperation2);
+      System.out.print(elapsedTime2 + " sec ... Stemming ... ");
+      // both files carry the same version stamp: the time at which writing the lemmas finished
+      long version = endOfOperation2.getTime();
+      FileUtil fileUtil = new FileUtil();
+      byte[] morphDocBytes = donatusMorphologyDocument.getDocumentBytes();
+      fileUtil.saveFile(morphDocBytes, buildAnalyzedFilePath(path, "morph", version));
+      byte[] wtagBytes = donatusMorphologyDocument.getWtagBytes();
+      fileUtil.saveFile(wtagBytes, buildAnalyzedFilePath(path, "wtag", version));
+    } catch (MalformedURLException e) {
+      throw new ApplicationException(e);
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+    state = new Date();
+  }
+
+  /**
+   * Builds the file path for one analyzed output file of a document.
+   *
+   * Only a trailing ".xml" extension of the document path is replaced. The previous
+   * replaceFirst(".xml", ...) treated the dot as a regex wildcard and replaced the
+   * first match anywhere in the path (for example the "/xml" directory segment of
+   * "/db/xml/doc.xml"). A path without ".xml" extension simply gets the suffix
+   * appended, so the morph and wtag files can no longer end up with the same name.
+   *
+   * @param path    path part of the document URI
+   * @param kind    kind of output file, used in the suffix ("morph" or "wtag")
+   * @param version version stamp appended after "-v"
+   * @return the path of the output file below the donatusAnalyzedFiles directory
+   */
+  private static String buildAnalyzedFilePath(String path, String kind, long version) {
+    String extension = ".xml";
+    String stem = path;
+    if (path.endsWith(extension)) {
+      stem = path.substring(0, path.length() - extension.length());
+    }
+    return DonatusConstants.BERKELEY_DB_DIR + "/donatusAnalyzedFiles" + stem + "-donatus-" + kind + "-v" + version + extension;
+  }
+  
+  /**
+   * Looks up the lemma a variant form belongs to.
+   *
+   * In QUERY_MODE the lemma is read from the variant database. In any other mode it
+   * is taken from the morphology document currently held by this cache; the language
+   * argument is not used in that case.
+   *
+   * @param language    language of the variant (part of the database key)
+   * @param variantForm form of the variant to look up
+   * @return the lemma, or null if no lemma is found (or no morphology document is held)
+   * @throws ApplicationException if reading from the database fails
+   */
+  public DonatusLemma getLemma(String language, String variantForm) throws ApplicationException {
+    if (mode == QUERY_MODE) {
+      return readVariantLemma(null, language, variantForm);
+    }
+    if (donatusMorphologyDocument == null) {
+      return null;
+    }
+    DonatusVariant cachedVariant = donatusMorphologyDocument.getVariant(variantForm);
+    if (cachedVariant == null) {
+      return null;
+    }
+    // resolve the variant's lemma through the document, using the lemma form as key
+    String lemmaForm = cachedVariant.getLemma().getForm();
+    return donatusMorphologyDocument.getLemma(lemmaForm);
+  }
+  
+  /**
+   * Collects the variants of all lemmas that the plain terms of a Lucene query belong to.
+   *
+   * @param language          language used for the lemma lookup
+   * @param luceneQueryString the Lucene query whose terms are looked up
+   * @return the variants of every lemma found, in query term order; empty if no term
+   *         yields a lemma
+   * @throws ApplicationException if a lemma lookup fails
+   */
+  public ArrayList<DonatusVariant> getQueryVariants(String language, String luceneQueryString) throws ApplicationException {
+    ArrayList<DonatusVariant> collected = new ArrayList<DonatusVariant>();
+    ArrayList<String> queryTerms = getVariantsFromLuceneQuery(luceneQueryString);
+    if (queryTerms == null || queryTerms.isEmpty()) {
+      return collected;
+    }
+    for (String queryTerm : queryTerms) {
+      DonatusLemma lemma = getLemma(language, queryTerm);
+      if (lemma == null) {
+        continue;  // term unknown to the cache
+      }
+      collected.addAll(lemma.getVariants());
+    }
+    return collected;
+  }
+
+  private void writeLemmas(DonatusMorphologyDocument donatusMorphologyDocument) throws ApplicationException {
+    Transaction txn = null;  // without txn
+    // Transaction txn = berkeleyDBEnv.getEnv().beginTransaction(null, null);
+    // delivers all variants of all lemmas - so for example more than one variant with the same form name but in different lemmas
+    ArrayList<DonatusVariant> variants = donatusMorphologyDocument.getVariants();  
+    for (int i=0; i<variants.size(); i++) {
+      DonatusVariant newVariant = variants.get(i);
+      String newVariantForm = newVariant.getForm();
+      String language = newVariant.getLemma().getLanguage();
+      if (newVariantForm != null && language != null && ! newVariantForm.equals("") && ! language.equals("")) {
+        DonatusLemma newVariantLemma = newVariant.getLemma();
+        // look if this variant is already contained in variantDB and if so if the lemma there is the same as the new variant lemma
+        DonatusLemma dbVariantLemma = readVariantLemma(txn, language, newVariantForm);
+        if (dbVariantLemma != null) {
+          if (dbVariantLemma.getForm().equals(newVariantLemma.getForm())) {
+            // the variants of newVariantLemma are added to the existing variantLemma and this lemma is saved
+            ArrayList<DonatusVariant> newVariantLemmaVariants = newVariantLemma.getVariants();
+            for (int j=0; j<newVariantLemmaVariants.size(); j++) {
+              DonatusVariant v = newVariantLemmaVariants.get(j);
+              dbVariantLemma.addVariant(v);
+            }
+            writeLemmaByVariantKey(txn, newVariant, dbVariantLemma);
+          } else {
+            // the two lemmas of the new and existing variant are not the same: nothing should be saved
+          }
+        } else {
+          writeLemmaByVariantKey(txn, newVariant, newVariantLemma);
+        }
+      }
+    }
+    // Only filled, not tested and used yet, for future
+    ArrayList<DonatusLemma> lemmas = donatusMorphologyDocument.getLemmas();
+    for (int i=0; i<lemmas.size(); i++) {
+      DonatusLemma lemma = lemmas.get(i);
+      String lemmaForm = lemma.getForm();
+      String language = lemma.getLanguage();
+      if (lemmaForm != null && language != null && ! lemmaForm.equals("") && ! language.equals("")) {
+        writeLemmaByLemmaKey(txn, lemma);
+      }
+    }
+    state = new Date();
+  }
+  
+  // TODO method is only simple: proof all Lucene cases
+  /**
+   * Extracts the plain terms of a Lucene query string.
+   *
+   * The query is split at blanks. A token is dropped if it is empty (repeated blanks),
+   * if it is one of the boolean keywords AND, OR, NOT, or if it contains one of the
+   * Lucene syntax characters * ? ~ - + ^.
+   *
+   * The keywords are compared as whole tokens. The previous substring check also threw
+   * away ordinary upper-case terms that merely contain those letters, for example
+   * "HORA" or "NOTA".
+   *
+   * @param queryString the Lucene query; null is treated like an empty query
+   * @return the plain terms in query order, never null
+   */
+  private ArrayList<String> getVariantsFromLuceneQuery(String queryString) {
+    ArrayList<String> variants = new ArrayList<String>();
+    if (queryString == null) {
+      return variants;
+    }
+    String[] variantTokens = queryString.split(" ");  // TODO throw the phrases away (e.g.: "bla bla bla")
+    for (int i = 0; i < variantTokens.length; i++) {
+      String token = variantTokens[i];
+      if (isPlainLuceneTerm(token)) {
+        variants.add(token);
+      }
+    }
+    return variants;
+  }
+
+  /**
+   * Tells whether a query token is an ordinary term, i.e. non-empty, not a boolean
+   * keyword and free of Lucene wildcard, fuzzy, prefix and boost characters.
+   */
+  private static boolean isPlainLuceneTerm(String token) {
+    if (token.length() == 0) {
+      return false;
+    }
+    if (token.equals("OR") || token.equals("AND") || token.equals("NOT")) {
+      return false;
+    }
+    String syntaxChars = "*?~-+^";
+    for (int i = 0; i < syntaxChars.length(); i++) {
+      if (token.indexOf(syntaxChars.charAt(i)) >= 0) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Stores the XML form of a lemma in the variant database. The key is the language of
+   * the variant's lemma, the separator "###" and the variant form, encoded as UTF-8.
+   *
+   * @param txn        transaction to write in, or null to write without one
+   * @param variantKey variant that supplies the key
+   * @param lemma      lemma whose XML string is stored as value
+   * @throws ApplicationException if the database write or the UTF-8 encoding fails
+   */
+  private void writeLemmaByVariantKey(Transaction txn, DonatusVariant variantKey, DonatusLemma lemma) throws ApplicationException {
+    try {
+      String keyText = variantKey.getLemma().getLanguage() + "###" + variantKey.getForm();
+      DatabaseEntry key = new DatabaseEntry(keyText.getBytes("UTF-8"));
+      DatabaseEntry value = new DatabaseEntry(lemma.getXmlString().getBytes("UTF-8"));
+      berkeleyDBEnv.getVariantDB().put(txn, key, value);
+    } catch (DatabaseException dbFailure) {
+      throw new ApplicationException(dbFailure);
+    } catch (UnsupportedEncodingException encodingFailure) {
+      throw new ApplicationException(encodingFailure);
+    }
+  }
+    
+  /**
+   * Stores the XML form of a lemma in the lemma database. The key is the lemma's
+   * language, the separator "###" and the lemma form, encoded as UTF-8.
+   *
+   * @param txn   transaction to write in, or null to write without one
+   * @param lemma lemma that supplies both key and value
+   * @throws ApplicationException if the database write or the UTF-8 encoding fails
+   */
+  private void writeLemmaByLemmaKey(Transaction txn, DonatusLemma lemma) throws ApplicationException {
+    try {
+      String keyText = lemma.getLanguage() + "###" + lemma.getForm();
+      DatabaseEntry key = new DatabaseEntry(keyText.getBytes("UTF-8"));
+      DatabaseEntry value = new DatabaseEntry(lemma.getXmlString().getBytes("UTF-8"));
+      berkeleyDBEnv.getLemmaDB().put(txn, key, value);
+    } catch (DatabaseException dbFailure) {
+      throw new ApplicationException(dbFailure);
+    } catch (UnsupportedEncodingException encodingFailure) {
+      throw new ApplicationException(encodingFailure);
+    }
+  }
+    
+  /**
+   * Reads the lemma stored in the variant database for a language and variant form.
+   * The key is the language, the separator "###" and the variant form, encoded as UTF-8.
+   *
+   * The lookup is a plain keyed get. The cursor that used to be opened here was never
+   * used for reading and stayed open whenever an exception was thrown before its
+   * close() call, so it has been removed. The get now runs in the transaction passed
+   * by the caller instead of always running without one.
+   *
+   * @param txn         transaction to read in, or null to read without one
+   * @param language    language part of the key; also handed to the XML parser
+   * @param variantForm variant form part of the key
+   * @return the stored lemma, or null if the key is not present or the stored XML
+   *         contains no lemma
+   * @throws ApplicationException if the database read, the UTF-8 conversion or parsing
+   *         the stored XML fails
+   */
+  private DonatusLemma readVariantLemma(Transaction txn, String language, String variantForm) throws ApplicationException {
+    String hashKey = language + "###" + variantForm;
+    try {
+      Database variantDB = berkeleyDBEnv.getVariantDB();
+      DatabaseEntry dbEntryKey = new DatabaseEntry(hashKey.getBytes("UTF-8"));
+      DatabaseEntry foundXmlLemmaValue = new DatabaseEntry();
+      OperationStatus operationStatus = variantDB.get(txn, dbEntryKey, foundXmlLemmaValue, LockMode.DEFAULT);
+      if (operationStatus != OperationStatus.SUCCESS) {
+        return null;  // no entry for this variant
+      }
+      String foundXmlLemmaStr = new String(foundXmlLemmaValue.getData(), "UTF-8");
+      return parseXmlLemmaString(language, foundXmlLemmaStr);
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+  }
+  
+  /**
+   * Parses the XML representation of a lemma and returns the first lemma it contains.
+   *
+   * @param language       language handed to the morphology document parser
+   * @param xmlLemmaString lemma XML as stored in the database
+   * @return the first parsed lemma, or null if the XML contains none
+   * @throws ApplicationException if parsing fails
+   */
+  private DonatusLemma parseXmlLemmaString(String language, String xmlLemmaString) throws ApplicationException {
+    ArrayList<DonatusLemma> parsedLemmas = parseDonatusMorphDoc(language, xmlLemmaString).getLemmas();
+    return parsedLemmas.isEmpty() ? null : parsedLemmas.get(0);
+  }
+
+  /**
+   * Parses an XML fragment as the content of a morphology document.
+   *
+   * The fragment is wrapped into a complete document: XML declaration and inline DTD
+   * from getDonatusMorphDocDefXml(), then a morphology root element around the
+   * fragment. The document is parsed with a SAX parser that feeds a
+   * DonatusMorphologyDocumentContentHandler created with the URI "tempDummyUri".
+   *
+   * @param language  language handed to the content handler
+   * @param xmlString XML fragment to be placed inside the morphology element
+   * @return the result delivered by the content handler
+   * @throws ApplicationException wrapping any SAXException or IOException of the parser
+   */
+  private DonatusMorphologyDocument parseDonatusMorphDoc(String language, String xmlString) throws ApplicationException {
+    try {
+      DonatusMorphologyDocumentContentHandler handler = new DonatusMorphologyDocumentContentHandler("tempDummyUri", language);
+      XMLReader saxReader = new SAXParser();
+      saxReader.setContentHandler(handler);
+      // declaration + DTD, opening root tag, fragment, closing root tag
+      StringBuilder wrappedXml = new StringBuilder(getDonatusMorphDocDefXml());
+      wrappedXml.append("<morphology xmlns=\"http://archimedes.fas.harvard.edu/ns/morphology/3\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n");
+      wrappedXml.append(xmlString);
+      wrappedXml.append("</morphology>");
+      saxReader.parse(new InputSource(new StringReader(wrappedXml.toString())));
+      return handler.getResult();
+    } catch (SAXException parseFailure) {
+      throw new ApplicationException(parseFailure);
+    } catch (IOException ioFailure) {
+      throw new ApplicationException(ioFailure);
+    }
+  }
+
+  /**
+   * Returns the prolog that is put in front of a morphology fragment before parsing:
+   * the XML declaration followed by a DOCTYPE with an inline DTD for the morphology
+   * vocabulary (elements morphology, lemma, context-form, definition, variant,
+   * analysis, tokens and token with their attribute lists).
+   *
+   * @return the prolog text; every line ends with a newline
+   */
+  private static String getDonatusMorphDocDefXml() {
+    String[] prologLines = {
+      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n",
+      "<!DOCTYPE morphology [\n",
+      "<!ELEMENT morphology (lemma*, context-form*)>\n",
+      "<!ELEMENT lemma (definition?, variant*)>\n",
+      "<!ELEMENT context-form (tokens, analysis)>\n",
+      "<!ELEMENT definition (#PCDATA)>\n",
+      "<!ELEMENT variant (analysis)*>\n",
+      "<!ELEMENT analysis EMPTY>\n",
+      "<!ELEMENT tokens (token+)>\n",
+      "<!ELEMENT token EMPTY>\n",
+      "<!ATTLIST morphology\n",
+      "   xmlns           CDATA           #FIXED \"http://archimedes.fas.harvard.edu/ns/morphology/3\"\n",
+      "   xmlns:xlink     CDATA           #FIXED \"http://www.w3.org/1999/xlink\">\n",
+      "<!ATTLIST lemma\n",
+      "   form            CDATA           #REQUIRED\n",
+      "   lang            CDATA           #REQUIRED>\n",
+      "<!ATTLIST definition\n",
+      "   lang            CDATA           #IMPLIED>\n",
+      "<!ATTLIST variant\n",
+      "   form            CDATA           #REQUIRED\n",
+      "   modified        (y|n)           #IMPLIED>\n",
+      "<!ATTLIST analysis\n",
+      "   desc            CDATA           #IMPLIED\n",
+      "   xlink:href      CDATA           #IMPLIED\n",
+      "   xlink:type      (simple)        #FIXED \"simple\"\n",
+      "   form            CDATA           #IMPLIED\n",
+      "   id              ID              #IMPLIED>\n",
+      "<!ATTLIST context-form\n",
+      "   lang            CDATA           #REQUIRED\n",
+      "   xlink:href      CDATA           #REQUIRED\n",
+      "   xlink:type      (simple)        #FIXED \"simple\">\n",
+      "<!ATTLIST token\n",
+      "   form            CDATA           #REQUIRED\n",
+      "   count           CDATA           #REQUIRED>\n",
+      "]>\n"
+    };
+    StringBuilder prolog = new StringBuilder();
+    for (int i = 0; i < prologLines.length; i++) {
+      prolog.append(prologLines[i]);
+    }
+    return prolog.toString();
+  }
+}
\ No newline at end of file