view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusCacheOld.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.je.Transaction;
import com.sun.org.apache.xerces.internal.parsers.SAXParser;

import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.util.FileUtil;
import de.mpg.mpiwg.berlin.mpdl.util.Util;

/**
 * Process-wide cache for Donatus morphology data, backed by BerkeleyDB.
 *
 * <p>Lemmas are stored as XML fragments in two databases obtained from
 * {@link DonatusBerkeleyDbEnv}: one keyed by variant form and one keyed by
 * lemma form. Keys have the shape {@code language###form}, and both keys and
 * values are stored as UTF-8 bytes.
 *
 * <p>The cache works in one of two modes: in {@link #QUERY_MODE} lemmas are
 * read from BerkeleyDB, in {@link #DOCUMENT_MODE} they are served from the
 * in-memory morphology document of the document analyzed last.
 */
public class DonatusCacheOld {
  /** Character encoding of every key and value written to or read from BerkeleyDB. */
  private static final String ENCODING = "UTF-8";
  /** Separator between the language and the form inside a database key. */
  private static final String KEY_SEPARATOR = "###";
  /** File extension that is replaced by a versioned suffix when analysis results are saved. */
  private static final String XML_EXTENSION = ".xml";

  private static DonatusCacheOld instance;
  private DonatusBerkeleyDbEnv berkeleyDBEnv = null;
  private Date state = null;  // last time the cache is written

  // for performance reasons these variables are needed
  public static int QUERY_MODE = 0;
  public static int DOCUMENT_MODE = 1;
  protected int mode = QUERY_MODE;
  // for performance reasons the cache contains a donatusMorphologyDocument which
  // caches all lemmas for one document (in DOCUMENT_MODE)
  private DonatusMorphologyDocument donatusMorphologyDocument = null;

  /**
   * Returns the shared cache instance, creating and initializing it on first use.
   *
   * <p>The instance is published only after {@link #init()} succeeded, so a
   * failed initialization is retried on the next call instead of handing out
   * a half-initialized cache. The method is synchronized so that concurrent
   * first calls open the database environment only once.
   *
   * @return the initialized singleton
   * @throws ApplicationException if the BerkeleyDB environment cannot be set up
   */
  public static synchronized DonatusCacheOld getInstance() throws ApplicationException {
    if (instance == null) {
      DonatusCacheOld newInstance = new DonatusCacheOld();
      newInstance.init();
      instance = newInstance;
    }
    return instance;
  }

  /**
   * Opens the BerkeleyDB environment in read/write mode.
   *
   * @throws ApplicationException wrapping any {@link DatabaseException} raised during setup
   */
  private void init() throws ApplicationException {
    try {
      berkeleyDBEnv = new DonatusBerkeleyDbEnv();
      berkeleyDBEnv.setup(false); // open databases in read/write mode
      state = new Date();
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * @return the current mode, {@link #QUERY_MODE} or {@link #DOCUMENT_MODE}
   */
  public int getMode() {
    return mode;
  }

  /**
   * Switches the cache mode. Switching to {@link #QUERY_MODE} discards the
   * in-memory morphology document.
   *
   * @param newMode the mode to activate
   */
  public void setMode(int newMode) {
    this.mode = newMode;
    if (newMode == QUERY_MODE) {
      donatusMorphologyDocument = null; // reset the morphology document
    }
  }

  /**
   * Closes the underlying BerkeleyDB environment.
   * NOTE(review): the singleton reference is not reset here, so a later
   * getInstance() returns this closed instance - confirm this is intended.
   */
  public void close() {
    berkeleyDBEnv.close();
  }

  // TODO invoke via RPC API: execute(String path, HashMap parameters); dedicated MPDL function for administering BerkeleyDB: org.exist.xquery.modules.mpdldb.BerkeleyDBAdmin
  /**
   * Removes the cache databases and records the time of the change.
   */
  public void deleteCache() {
    berkeleyDBEnv.removeDatabases();
    state = new Date();
  }

  /**
   * Analyzes a document with Donatus, writes the resulting lemmas to
   * BerkeleyDB and saves the morphology document and the wtag document as
   * versioned files below {@code DonatusConstants.BERKELEY_DB_DIR}.
   *
   * <p>The analyzed document is kept in memory afterwards and is the one
   * used by {@link #getLemma(String, String)} in {@link #DOCUMENT_MODE}.
   * Progress and timings are printed to standard output.
   *
   * @param analyzer the analyzer handed to the Donatus handler
   * @param docUri URI of the document; its path determines the output file names
   * @param sentences the sentences to analyze
   * @throws ApplicationException if the URI is malformed, or analysis, database
   *         access or file output fails
   */
  public void cacheLemmas(DonatusAnalyzer analyzer, String docUri, ArrayList<String> sentences) throws ApplicationException {
    try {
      Date beginOfOperation1 = new Date();
      URL url = new URL(docUri);
      String path = url.getPath();
      System.out.print("Indexing: " + path + " Donatus-Analyze ... ");
      DonatusHandler donatusHandler = new DonatusHandler(analyzer);
      donatusMorphologyDocument = donatusHandler.analyze(docUri, sentences);
      Date endOfOperation1 = new Date();
      Double elapsedTime1 = new Util().getSecondWithMillisecondsBetween(beginOfOperation1, endOfOperation1);
      System.out.print(elapsedTime1 + " sec ... Writing lemmas to BerkeleyDB ... ");
      Date beginOfOperation2 = new Date();
      writeLemmas(donatusMorphologyDocument);
      Date endOfOperation2 = new Date();
      Double elapsedTime2 = new Util().getSecondWithMillisecondsBetween(beginOfOperation2, endOfOperation2);
      System.out.print(elapsedTime2 + " sec ... Stemming ... ");
      // both output files share one version stamp: the time the database write finished
      long version = endOfOperation2.getTime();
      String analyzedFilesDir = DonatusConstants.BERKELEY_DB_DIR + "/donatusAnalyzedFiles";
      FileUtil fileUtil = new FileUtil();
      String morphDocFilePathStr = analyzedFilesDir + buildVersionedPath(path, "morph", version);
      byte[] morphDocBytes = donatusMorphologyDocument.getDocumentBytes();
      fileUtil.saveFile(morphDocBytes, morphDocFilePathStr);
      String wtagFilePathStr = analyzedFilesDir + buildVersionedPath(path, "wtag", version);
      byte[] wtagBytes = donatusMorphologyDocument.getWtagBytes();
      fileUtil.saveFile(wtagBytes, wtagFilePathStr);
    } catch (MalformedURLException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    state = new Date();
  }

  /**
   * Builds the path of a versioned analysis file by replacing a trailing
   * {@code .xml} with {@code -donatus-<kind>-v<version>.xml}.
   *
   * <p>Only the extension at the very end of the path is replaced. A path
   * without that extension gets the suffix appended, so the different kinds
   * of output never end up under the same file name.
   *
   * @param path the URL path of the source document
   * @param kind the kind of output, e.g. {@code morph} or {@code wtag}
   * @param version the version stamp to embed in the file name
   * @return the versioned path
   */
  private static String buildVersionedPath(String path, String kind, long version) {
    String suffix = "-donatus-" + kind + "-v" + version + XML_EXTENSION;
    if (path.endsWith(XML_EXTENSION)) {
      return path.substring(0, path.length() - XML_EXTENSION.length()) + suffix;
    }
    return path + suffix;
  }

  /**
   * Looks up the lemma of a variant form.
   *
   * <p>In {@link #QUERY_MODE} the lemma is read from BerkeleyDB. Otherwise it
   * is taken from the in-memory morphology document, if there is one.
   *
   * @param language the language of the variant (used for the database key)
   * @param variantForm the variant form to look up
   * @return the lemma, or {@code null} if none is found
   * @throws ApplicationException if the database lookup or the parsing of the stored lemma fails
   */
  public DonatusLemma getLemma(String language, String variantForm) throws ApplicationException {
    if (mode == QUERY_MODE) {
      return readVariantLemma(null, language, variantForm);
    }
    if (donatusMorphologyDocument == null) {
      return null;
    }
    DonatusVariant variant = donatusMorphologyDocument.getVariant(variantForm);
    if (variant == null) {
      return null;
    }
    DonatusLemma variantLemma = variant.getLemma();
    if (variantLemma == null) {
      return null;  // variant without a lemma: nothing to resolve
    }
    return donatusMorphologyDocument.getLemma(variantLemma.getForm());
  }

  /**
   * Collects, for every plain term of a Lucene query string, all variants of
   * the lemma that term belongs to.
   *
   * @param language the language used for the lemma lookups
   * @param luceneQueryString the query string to take the terms from
   * @return the variants of all lemmas found; empty if there are none
   * @throws ApplicationException if a lemma lookup fails
   */
  public ArrayList<DonatusVariant> getQueryVariants(String language, String luceneQueryString) throws ApplicationException {
    ArrayList<DonatusVariant> result = new ArrayList<DonatusVariant>();
    for (String variantStr : getVariantsFromLuceneQuery(luceneQueryString)) {
      DonatusLemma lemma = getLemma(language, variantStr);
      if (lemma == null) {
        continue;
      }
      ArrayList<DonatusVariant> lemmaVariants = lemma.getVariants();
      if (lemmaVariants != null) {
        result.addAll(lemmaVariants);
      }
    }
    return result;
  }

  /**
   * Writes all variants and lemmas of a morphology document to BerkeleyDB.
   *
   * <p>For each variant with a non-empty form and language: if the variant
   * database has no entry for it, the variant's lemma is stored. If it has
   * an entry whose lemma form equals the new lemma form, the new lemma's
   * variants are added to the stored lemma and it is written back. If the
   * lemma forms differ, nothing is written for that variant.
   *
   * @param donatusMorphologyDocument the analyzed document whose lemmas are persisted
   * @throws ApplicationException if a database read or write fails
   */
  private void writeLemmas(DonatusMorphologyDocument donatusMorphologyDocument) throws ApplicationException {
    Transaction txn = null;  // without txn
    // Transaction txn = berkeleyDBEnv.getEnv().beginTransaction(null, null);
    // delivers all variants of all lemmas - so for example more than one variant with the same form name but in different lemmas
    ArrayList<DonatusVariant> variants = donatusMorphologyDocument.getVariants();
    for (DonatusVariant newVariant : variants) {
      DonatusLemma newVariantLemma = newVariant.getLemma();
      if (newVariantLemma == null) {
        continue;  // no lemma means no language and therefore no database key
      }
      String newVariantForm = newVariant.getForm();
      String language = newVariantLemma.getLanguage();
      if (isNullOrEmpty(newVariantForm) || isNullOrEmpty(language)) {
        continue;
      }
      // look if this variant is already contained in variantDB and if so if the lemma there is the same as the new variant lemma
      DonatusLemma dbVariantLemma = readVariantLemma(txn, language, newVariantForm);
      if (dbVariantLemma == null) {
        writeLemmaByVariantKey(txn, newVariant, newVariantLemma);
      } else if (dbVariantLemma.getForm().equals(newVariantLemma.getForm())) {
        // the variants of newVariantLemma are added to the existing variantLemma and this lemma is saved
        for (DonatusVariant v : newVariantLemma.getVariants()) {
          dbVariantLemma.addVariant(v);
        }
        writeLemmaByVariantKey(txn, newVariant, dbVariantLemma);
      }
      // else: the two lemmas of the new and existing variant are not the same: nothing should be saved
    }
    // Only filled, not tested and used yet, for future
    ArrayList<DonatusLemma> lemmas = donatusMorphologyDocument.getLemmas();
    for (DonatusLemma lemma : lemmas) {
      if (! isNullOrEmpty(lemma.getForm()) && ! isNullOrEmpty(lemma.getLanguage())) {
        writeLemmaByLemmaKey(txn, lemma);
      }
    }
    state = new Date();
  }

  /**
   * @return {@code true} if the string is {@code null} or has length zero
   */
  private static boolean isNullOrEmpty(String s) {
    return s == null || s.length() == 0;
  }

  // TODO method is only simple: proof all Lucene cases
  /**
   * Extracts the plain terms from a Lucene query string.
   *
   * <p>The query is split on whitespace. Tokens that are empty, that are one
   * of the boolean operators {@code OR}, {@code AND}, {@code NOT}, or that
   * contain one of the characters {@code * ? ~ - + ^} are dropped.
   *
   * @param queryString the Lucene query string, may be {@code null}
   * @return the plain terms in query order; empty for a {@code null} or blank query
   */
  private ArrayList<String> getVariantsFromLuceneQuery(String queryString) {
    ArrayList<String> variants = new ArrayList<String>();
    if (queryString == null) {
      return variants;
    }
    String[] variantTokens = queryString.trim().split("\\s+");  // TODO throw the phrases away (e.g.: "bla bla bla")
    for (String token : variantTokens) {
      if (isPlainQueryToken(token)) {
        variants.add(token);
      }
    }
    return variants;
  }

  /**
   * Decides whether a query token is a plain term.
   *
   * <p>Boolean operators are compared as whole tokens, so that ordinary
   * words which merely contain the letters of an operator are kept.
   *
   * @param token a whitespace-free token of the query
   * @return {@code false} for empty tokens, boolean operators and tokens with Lucene syntax characters
   */
  private static boolean isPlainQueryToken(String token) {
    if (token.length() == 0) {
      return false;
    }
    if (token.equals("OR") || token.equals("AND") || token.equals("NOT")) {
      return false;
    }
    String syntaxChars = "*?~-+^";
    for (int i = 0; i < syntaxChars.length(); i++) {
      if (token.indexOf(syntaxChars.charAt(i)) >= 0) {
        return false;
      }
    }
    return true;
  }

  /**
   * Builds a database key of the shape {@code language###form}.
   */
  private static String buildKey(String language, String form) {
    return language + KEY_SEPARATOR + form;
  }

  /**
   * Stores a lemma in the variant database under the key of the given variant.
   *
   * @param txn the transaction to write in, or {@code null} for none
   * @param variantKey the variant whose lemma language and form make up the key
   * @param lemma the lemma whose XML string is stored as the value
   * @throws ApplicationException if the database write or the encoding fails
   */
  private void writeLemmaByVariantKey(Transaction txn, DonatusVariant variantKey, DonatusLemma lemma) throws ApplicationException {
    try {
      String variantKeyStr = buildKey(variantKey.getLemma().getLanguage(), variantKey.getForm());
      DatabaseEntry dbEntryKey = new DatabaseEntry(variantKeyStr.getBytes(ENCODING));
      String lemmaXmlValue = lemma.getXmlString();
      DatabaseEntry dbEntryValue = new DatabaseEntry(lemmaXmlValue.getBytes(ENCODING));
      Database variantDB = berkeleyDBEnv.getVariantDB();
      variantDB.put(txn, dbEntryKey, dbEntryValue);
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Stores a lemma in the lemma database under its own language and form.
   *
   * @param txn the transaction to write in, or {@code null} for none
   * @param lemma the lemma to store; its XML string is the value
   * @throws ApplicationException if the database write or the encoding fails
   */
  private void writeLemmaByLemmaKey(Transaction txn, DonatusLemma lemma) throws ApplicationException {
    try {
      String lemmaKeyStr = buildKey(lemma.getLanguage(), lemma.getForm());
      DatabaseEntry dbEntryKey = new DatabaseEntry(lemmaKeyStr.getBytes(ENCODING));
      String lemmaXmlValue = lemma.getXmlString();
      DatabaseEntry dbEntryValue = new DatabaseEntry(lemmaXmlValue.getBytes(ENCODING));
      Database lemmaDB = berkeleyDBEnv.getLemmaDB();
      lemmaDB.put(txn, dbEntryKey, dbEntryValue);
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Reads the lemma stored for a variant form from the variant database.
   *
   * <p>The lookup is a single keyed read, so no cursor is opened.
   *
   * @param txn the transaction to read in, or {@code null} for none
   * @param language the language part of the key
   * @param variantForm the form part of the key
   * @return the stored lemma, or {@code null} if the key is not present
   * @throws ApplicationException if the database read, the decoding or the XML parsing fails
   */
  private DonatusLemma readVariantLemma(Transaction txn, String language, String variantForm) throws ApplicationException {
    DonatusLemma lemma = null;
    String hashKey = buildKey(language, variantForm);
    try {
      Database variantDB = berkeleyDBEnv.getVariantDB();
      DatabaseEntry dbEntryKey = new DatabaseEntry(hashKey.getBytes(ENCODING));
      DatabaseEntry foundXmlLemmaValue = new DatabaseEntry();
      OperationStatus operationStatus = variantDB.get(txn, dbEntryKey, foundXmlLemmaValue, LockMode.DEFAULT);
      if (operationStatus == OperationStatus.SUCCESS) {
        String foundXmlLemmaStr = new String(foundXmlLemmaValue.getData(), ENCODING);
        lemma = parseXmlLemmaString(language, foundXmlLemmaStr);
      }
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
    return lemma;
  }

  /**
   * Parses a stored lemma XML fragment and returns the first lemma in it.
   *
   * @param language the language handed to the content handler
   * @param xmlLemmaString the lemma XML fragment as stored in the database
   * @return the first parsed lemma, or {@code null} if the fragment yields none
   * @throws ApplicationException if parsing fails
   */
  private DonatusLemma parseXmlLemmaString(String language, String xmlLemmaString) throws ApplicationException {
    DonatusMorphologyDocument morphologyDoc = parseDonatusMorphDoc(language, xmlLemmaString);
    if (morphologyDoc == null) {
      return null;
    }
    ArrayList<DonatusLemma> lemmas = morphologyDoc.getLemmas();
    if (lemmas == null || lemmas.isEmpty()) {
      return null;
    }
    return lemmas.get(0);
  }

  /**
   * Wraps an XML fragment into a complete morphology document (XML
   * declaration, inline DTD and {@code morphology} root element) and parses it.
   *
   * @param language the language handed to the content handler
   * @param xmlString the content to place inside the {@code morphology} element
   * @return the result delivered by the content handler
   * @throws ApplicationException wrapping any {@link SAXException} or {@link IOException}
   */
  private DonatusMorphologyDocument parseDonatusMorphDoc(String language, String xmlString) throws ApplicationException {
    DonatusMorphologyDocument morphologyDoc = null;
    try {
      // NOTE(review): SAXParser here is the JDK-internal Xerces class from the file's imports
      XMLReader xmlParser = new SAXParser();
      DonatusMorphologyDocumentContentHandler donatusMorphContentHandler = new DonatusMorphologyDocumentContentHandler("tempDummyUri", language);
      xmlParser.setContentHandler(donatusMorphContentHandler);
      String morphDocDefXml = getDonatusMorphDocDefXml();
      String morphDocMorphStartXml = "<morphology xmlns=\"http://archimedes.fas.harvard.edu/ns/morphology/3\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n";
      String morphDocMorphEndXml = "</morphology>";
      String morphDocXml = morphDocDefXml + morphDocMorphStartXml + xmlString + morphDocMorphEndXml;
      Reader reader = new StringReader(morphDocXml);
      InputSource input = new InputSource(reader);
      xmlParser.parse(input);
      morphologyDoc = donatusMorphContentHandler.getResult();
    } catch (SAXException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return morphologyDoc;
  }

  /**
   * Returns the XML declaration and the inline DTD that precede the
   * {@code morphology} root element of a Donatus morphology document.
   */
  private static String getDonatusMorphDocDefXml() {
    String defXml = 
    "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
    "<!DOCTYPE morphology [\n" +
    "<!ELEMENT morphology (lemma*, context-form*)>\n" +
    "<!ELEMENT lemma (definition?, variant*)>\n" +
    "<!ELEMENT context-form (tokens, analysis)>\n" +
    "<!ELEMENT definition (#PCDATA)>\n" +
    "<!ELEMENT variant (analysis)*>\n" +
    "<!ELEMENT analysis EMPTY>\n" +
    "<!ELEMENT tokens (token+)>\n" +
    "<!ELEMENT token EMPTY>\n" +
    "<!ATTLIST morphology\n" +
    "   xmlns           CDATA           #FIXED \"http://archimedes.fas.harvard.edu/ns/morphology/3\"\n" +
    "   xmlns:xlink     CDATA           #FIXED \"http://www.w3.org/1999/xlink\">\n" +
    "<!ATTLIST lemma\n" +
    "   form            CDATA           #REQUIRED\n" +
    "   lang            CDATA           #REQUIRED>\n" +
    "<!ATTLIST definition\n" +
    "   lang            CDATA           #IMPLIED>\n" +
    "<!ATTLIST variant\n" +
    "   form            CDATA           #REQUIRED\n" +
    "   modified        (y|n)           #IMPLIED>\n" +
    "<!ATTLIST analysis\n" +
    "   desc            CDATA           #IMPLIED\n" +
    "   xlink:href      CDATA           #IMPLIED\n" +
    "   xlink:type      (simple)        #FIXED \"simple\"\n" +
    "   form            CDATA           #IMPLIED\n" +
    "   id              ID              #IMPLIED>\n" +
    "<!ATTLIST context-form\n" +
    "   lang            CDATA           #REQUIRED\n" +
    "   xlink:href      CDATA           #REQUIRED\n" +
    "   xlink:type      (simple)        #FIXED \"simple\">\n" +
    "<!ATTLIST token\n" +
    "   form            CDATA           #REQUIRED\n" +
    "   count           CDATA           #REQUIRED>\n" +
    "]>\n";
    return defXml;
  }
}