Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusCacheOld.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
line wrap: on
line diff
package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.je.Transaction;
import com.sun.org.apache.xerces.internal.parsers.SAXParser;

import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.util.FileUtil;
import de.mpg.mpiwg.berlin.mpdl.util.Util;

/**
 * Lemma cache that stores Donatus morphology results in Berkeley DB.
 *
 * <p>Lemmas are written to a "variant" database (key: language + "###" + variant form)
 * and to a "lemma" database (key: language + "###" + lemma form). Values are the XML
 * serialization of the lemma, encoded as UTF-8.
 *
 * <p>The cache works in one of two modes: in {@link #QUERY_MODE} lemmas are looked up in
 * the variant database; in {@link #DOCUMENT_MODE} they are looked up in the in-memory
 * morphology document produced by the last {@link #cacheLemmas} call.
 *
 * <p>NOTE(review): the lazy singleton and the mutable fields are not synchronized;
 * confirm that callers access this class from a single thread.
 */
public class DonatusCacheOld {
  /** Separator between the language and the form inside a database key. */
  private static final String KEY_SEPARATOR = "###";
  /** Character encoding of database keys and values. */
  private static final String ENCODING = "UTF-8";
  /** File extension of the analyzed documents. */
  private static final String XML_EXTENSION = ".xml";
  /** Sub directory (below the Berkeley DB directory) that receives the analysis files. */
  private static final String ANALYZED_FILES_DIR = "/donatusAnalyzedFiles";

  private static DonatusCacheOld instance;
  private DonatusBerkeleyDbEnv berkeleyDBEnv = null;
  private Date state = null;  // last time the cache is written

  // for performance reasons these variables are needed
  public static final int QUERY_MODE = 0;
  public static final int DOCUMENT_MODE = 1;
  protected int mode = QUERY_MODE;
  // for performance reasons the cache contains a donatusMorphologyDocument which
  // caches all lemmas for one document (in DOCUMENT_MODE)
  private DonatusMorphologyDocument donatusMorphologyDocument = null;

  /**
   * Returns the shared cache, creating and initializing it on first use.
   *
   * <p>The singleton is only published after initialization succeeded, so a failed
   * initialization is retried by the next call instead of handing out a cache without
   * a usable database environment.
   *
   * @return the shared cache instance
   * @throws ApplicationException if the Berkeley DB environment cannot be set up
   */
  public static DonatusCacheOld getInstance() throws ApplicationException {
    if (instance == null) {
      DonatusCacheOld cache = new DonatusCacheOld();
      cache.init();
      instance = cache;
    }
    return instance;
  }

  /**
   * Opens the Berkeley DB environment and records the current time as cache state.
   *
   * @throws ApplicationException if the environment cannot be opened
   */
  private void init() throws ApplicationException {
    try {
      berkeleyDBEnv = new DonatusBerkeleyDbEnv();
      berkeleyDBEnv.setup(false); // open databases in read/write mode
      state = new Date();
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * @return the current lookup mode ({@link #QUERY_MODE} or {@link #DOCUMENT_MODE})
   */
  public int getMode() {
    return mode;
  }

  /**
   * Switches the lookup mode. Switching to {@link #QUERY_MODE} drops the in-memory
   * morphology document.
   *
   * @param newMode the new mode
   */
  public void setMode(int newMode) {
    this.mode = newMode;
    if (newMode == QUERY_MODE) {
      donatusMorphologyDocument = null;  // reset the morphology document
    }
  }

  /**
   * Closes the Berkeley DB environment.
   *
   * <p>NOTE(review): the singleton reference is kept, so {@link #getInstance()} keeps
   * returning this (closed) cache afterwards - confirm that this is intended.
   */
  public void close() {
    berkeleyDBEnv.close();
  }

  // TODO call via RPC API: execute(String path, HashMap parameters); dedicated MPDL function for administering BerkeleyDB: org.exist.xquery.modules.mpdldb.BerkeleyDBAdmin
  /**
   * Removes the databases of the environment and updates the cache state time.
   */
  public void deleteCache() {
    berkeleyDBEnv.removeDatabases();
    state = new Date();
  }

  /**
   * Analyzes the given sentences with Donatus, writes the resulting lemmas to Berkeley DB
   * and saves the morphology document and the wtag document as files below
   * {@code DonatusConstants.BERKELEY_DB_DIR + "/donatusAnalyzedFiles"}.
   *
   * <p>The result of the analysis is kept as the in-memory morphology document that is
   * used for lookups in {@link #DOCUMENT_MODE}. Progress is printed to standard out.
   *
   * @param analyzer the analyzer handed to the Donatus handler
   * @param docUri URI of the document; its path determines the names of the saved files
   * @param sentences the sentences to analyze
   * @throws ApplicationException if {@code docUri} is not a valid URL or an I/O error occurs
   */
  public void cacheLemmas(DonatusAnalyzer analyzer, String docUri, ArrayList<String> sentences) throws ApplicationException {
    try {
      Date beginOfAnalysis = new Date();
      URL url = new URL(docUri);
      String path = url.getPath();
      System.out.print("Indexing: " + path + " Donatus-Analyze ... ");
      DonatusHandler donatusHandler = new DonatusHandler(analyzer);
      donatusMorphologyDocument = donatusHandler.analyze(docUri, sentences);
      Date endOfAnalysis = new Date();
      Double elapsedTime1 = new Util().getSecondWithMillisecondsBetween(beginOfAnalysis, endOfAnalysis);
      System.out.print(elapsedTime1 + " sec ... Writing lemmas to BerkeleyDB ... ");
      Date beginOfWriting = new Date();
      writeLemmas(donatusMorphologyDocument);
      Date endOfWriting = new Date();
      Double elapsedTime2 = new Util().getSecondWithMillisecondsBetween(beginOfWriting, endOfWriting);
      System.out.print(elapsedTime2 + " sec ... Stemming ... ");
      // the end time of the write operation is used as version stamp of both files
      long version = endOfWriting.getTime();
      String analyzedFilesDir = DonatusConstants.BERKELEY_DB_DIR + ANALYZED_FILES_DIR;
      FileUtil fileUtil = new FileUtil();
      String morphDocFilePathStr = analyzedFilesDir + insertBeforeXmlExtension(path, "-donatus-morph-v" + version);
      byte[] morphDocBytes = donatusMorphologyDocument.getDocumentBytes();
      fileUtil.saveFile(morphDocBytes, morphDocFilePathStr);
      String wtagFilePathStr = analyzedFilesDir + insertBeforeXmlExtension(path, "-donatus-wtag-v" + version);
      byte[] wtagBytes = donatusMorphologyDocument.getWtagBytes();
      fileUtil.saveFile(wtagBytes, wtagFilePathStr);
    } catch (MalformedURLException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    state = new Date();
  }

  /**
   * Inserts {@code suffix} in front of the trailing ".xml" extension of {@code path}.
   *
   * <p>Only a real ".xml" suffix is treated as the extension (no regular expression
   * matching). If the path has no such extension, the suffix and ".xml" are appended,
   * so that different suffixes always produce different file names.
   *
   * @param path the document path
   * @param suffix the text to insert in front of the extension
   * @return the path with the suffix inserted
   */
  private static String insertBeforeXmlExtension(String path, String suffix) {
    String base = path;
    if (path.endsWith(XML_EXTENSION)) {
      base = path.substring(0, path.length() - XML_EXTENSION.length());
    }
    return base + suffix + XML_EXTENSION;
  }

  /**
   * Looks up the lemma of a variant form.
   *
   * <p>In {@link #QUERY_MODE} the variant database is read. In any other mode the
   * in-memory morphology document is used, if there is one.
   *
   * @param language language of the variant; only used for the database lookup
   * @param variantForm the variant form
   * @return the lemma, or {@code null} if none is found
   * @throws ApplicationException if reading the database fails
   */
  public DonatusLemma getLemma(String language, String variantForm) throws ApplicationException {
    if (mode == QUERY_MODE) {
      return readVariantLemma(null, language, variantForm);
    }
    if (donatusMorphologyDocument == null) {
      return null;
    }
    DonatusVariant variant = donatusMorphologyDocument.getVariant(variantForm);
    if (variant == null) {
      return null;
    }
    DonatusLemma variantLemma = variant.getLemma();
    if (variantLemma == null) {
      return null;  // a variant without a lemma cannot be resolved
    }
    return donatusMorphologyDocument.getLemma(variantLemma.getForm());
  }

  /**
   * Collects all variants of the lemmas that belong to the plain terms of a Lucene query.
   *
   * @param language language used for the lemma lookup
   * @param luceneQueryString the Lucene query
   * @return the variants of all lemmas found; empty if there are none
   * @throws ApplicationException if reading the database fails
   */
  public ArrayList<DonatusVariant> getQueryVariants(String language, String luceneQueryString) throws ApplicationException {
    ArrayList<DonatusVariant> result = new ArrayList<DonatusVariant>();
    for (String variantStr : getVariantsFromLuceneQuery(luceneQueryString)) {
      DonatusLemma lemma = getLemma(language, variantStr);
      if (lemma != null) {
        result.addAll(lemma.getVariants());
      }
    }
    return result;
  }

  /**
   * Writes the variants and the lemmas of a morphology document to the databases.
   *
   * <p>A variant that is already stored with the same lemma form gets the variants of the
   * new lemma added; a variant that is stored with a different lemma form is left as is.
   *
   * @param donatusMorphologyDocument the document to write
   * @throws ApplicationException if a database operation fails
   */
  private void writeLemmas(DonatusMorphologyDocument donatusMorphologyDocument) throws ApplicationException {
    Transaction txn = null;  // without txn
    // Transaction txn = berkeleyDBEnv.getEnv().beginTransaction(null, null);
    // delivers all variants of all lemmas - so for example more than one variant with the same form name but in different lemmas
    ArrayList<DonatusVariant> variants = donatusMorphologyDocument.getVariants();
    for (DonatusVariant newVariant : variants) {
      DonatusLemma newVariantLemma = newVariant.getLemma();
      if (newVariantLemma == null) {
        continue;  // neither language nor lemma value are available for this variant
      }
      String newVariantForm = newVariant.getForm();
      String language = newVariantLemma.getLanguage();
      if (!hasText(newVariantForm) || !hasText(language)) {
        continue;
      }
      // look if this variant is already contained in variantDB and if so if the lemma there is the same as the new variant lemma
      DonatusLemma dbVariantLemma = readVariantLemma(txn, language, newVariantForm);
      if (dbVariantLemma == null) {
        writeLemmaByVariantKey(txn, newVariant, newVariantLemma);
      } else if (dbVariantLemma.getForm().equals(newVariantLemma.getForm())) {
        // the variants of newVariantLemma are added to the existing variantLemma and this lemma is saved
        for (DonatusVariant v : newVariantLemma.getVariants()) {
          dbVariantLemma.addVariant(v);
        }
        writeLemmaByVariantKey(txn, newVariant, dbVariantLemma);
      }
      // else: the two lemmas of the new and existing variant are not the same: nothing should be saved
    }
    // Only filled, not tested and used yet, for future
    ArrayList<DonatusLemma> lemmas = donatusMorphologyDocument.getLemmas();
    for (DonatusLemma lemma : lemmas) {
      if (hasText(lemma.getForm()) && hasText(lemma.getLanguage())) {
        writeLemmaByLemmaKey(txn, lemma);
      }
    }
    state = new Date();
  }

  /**
   * @param text the text to check
   * @return {@code true} if {@code text} is neither {@code null} nor empty
   */
  private static boolean hasText(String text) {
    return text != null && text.length() > 0;
  }

  // TODO method is only simple: check all Lucene cases
  /**
   * Extracts the plain terms of a Lucene query string.
   *
   * <p>The query is split at whitespace. Tokens that contain one of the characters
   * {@code * ? ~ - + ^} and tokens that are one of the operators {@code OR},
   * {@code AND}, {@code NOT} are skipped. Operators are compared as whole tokens, so a
   * term that merely contains such a letter sequence (e.g. "ORATIO") is kept.
   *
   * @param queryString the Lucene query; may be {@code null}
   * @return the plain terms; never {@code null}
   */
  private ArrayList<String> getVariantsFromLuceneQuery(String queryString) {
    ArrayList<String> variants = new ArrayList<String>();
    if (queryString == null) {
      return variants;
    }
    String[] variantTokens = queryString.trim().split("\\s+");  // TODO throw the phrases away (e.g.: "bla bla bla")
    for (String token : variantTokens) {
      // an empty query yields one empty token
      if (token.length() == 0 || isLuceneOperator(token) || containsLuceneModifier(token)) {
        continue;
      }
      variants.add(token);
    }
    return variants;
  }

  /**
   * @param token a whitespace separated token of a Lucene query
   * @return {@code true} if the token is one of the operators OR, AND, NOT
   */
  private static boolean isLuceneOperator(String token) {
    return token.equals("OR") || token.equals("AND") || token.equals("NOT");
  }

  /**
   * @param token a whitespace separated token of a Lucene query
   * @return {@code true} if the token contains one of the characters * ? ~ - + ^
   */
  private static boolean containsLuceneModifier(String token) {
    return token.contains("*") || token.contains("?") || token.contains("~")
        || token.contains("-") || token.contains("+") || token.contains("^");
  }

  /**
   * Builds a database key from a language and a (variant or lemma) form.
   */
  private static String buildKey(String language, String form) {
    return language + KEY_SEPARATOR + form;
  }

  /**
   * Wraps the UTF-8 bytes of a text into a database entry.
   */
  private static DatabaseEntry toEntry(String text) throws UnsupportedEncodingException {
    return new DatabaseEntry(text.getBytes(ENCODING));
  }

  /**
   * Stores the XML string of a lemma in the variant database under the key of the variant.
   *
   * @param txn the transaction to use, or {@code null}
   * @param variantKey the variant whose lemma language and form build the key
   * @param lemma the lemma to store
   * @throws ApplicationException if the database operation fails
   */
  private void writeLemmaByVariantKey(Transaction txn, DonatusVariant variantKey, DonatusLemma lemma) throws ApplicationException {
    try {
      String variantKeyStr = buildKey(variantKey.getLemma().getLanguage(), variantKey.getForm());
      DatabaseEntry dbEntryKey = toEntry(variantKeyStr);
      DatabaseEntry dbEntryValue = toEntry(lemma.getXmlString());
      Database variantDB = berkeleyDBEnv.getVariantDB();
      variantDB.put(txn, dbEntryKey, dbEntryValue);
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Stores the XML string of a lemma in the lemma database under the key of the lemma.
   *
   * @param txn the transaction to use, or {@code null}
   * @param lemma the lemma to store; its language and form build the key
   * @throws ApplicationException if the database operation fails
   */
  private void writeLemmaByLemmaKey(Transaction txn, DonatusLemma lemma) throws ApplicationException {
    try {
      String lemmaKeyStr = buildKey(lemma.getLanguage(), lemma.getForm());
      DatabaseEntry dbEntryKey = toEntry(lemmaKeyStr);
      DatabaseEntry dbEntryValue = toEntry(lemma.getXmlString());
      Database lemmaDB = berkeleyDBEnv.getLemmaDB();
      lemmaDB.put(txn, dbEntryKey, dbEntryValue);
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Reads the lemma that is stored for a variant form in the variant database.
   *
   * <p>The lookup is a plain keyed read within the given transaction; no cursor is
   * opened, so there is nothing that has to be closed if the read or the parsing fails.
   *
   * @param txn the transaction to read in, or {@code null}
   * @param language language of the variant
   * @param variantForm the variant form
   * @return the stored lemma, or {@code null} if there is no entry for the variant
   * @throws ApplicationException if the read fails or the stored XML cannot be parsed
   */
  private DonatusLemma readVariantLemma(Transaction txn, String language, String variantForm) throws ApplicationException {
    DonatusLemma lemma = null;
    String hashKey = buildKey(language, variantForm);
    try {
      Database variantDB = berkeleyDBEnv.getVariantDB();
      DatabaseEntry dbEntryKey = toEntry(hashKey);
      DatabaseEntry foundXmlLemmaValue = new DatabaseEntry();
      OperationStatus operationStatus = variantDB.get(txn, dbEntryKey, foundXmlLemmaValue, LockMode.DEFAULT);
      if (operationStatus == OperationStatus.SUCCESS) {
        String foundXmlLemmaStr = new String(foundXmlLemmaValue.getData(), ENCODING);
        lemma = parseXmlLemmaString(language, foundXmlLemmaStr);
      }
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
    return lemma;
  }

  /**
   * Parses the XML string of one lemma.
   *
   * @param language the language handed to the content handler
   * @param xmlLemmaString the XML string of the lemma
   * @return the first lemma of the parsed document, or {@code null} if there is none
   * @throws ApplicationException if the XML cannot be parsed
   */
  private DonatusLemma parseXmlLemmaString(String language, String xmlLemmaString) throws ApplicationException {
    DonatusMorphologyDocument morphologyDoc = parseDonatusMorphDoc(language, xmlLemmaString);
    if (morphologyDoc == null) {
      return null;
    }
    ArrayList<DonatusLemma> lemmas = morphologyDoc.getLemmas();
    if (lemmas == null || lemmas.isEmpty()) {
      return null;
    }
    return lemmas.get(0);
  }

  /**
   * Wraps an XML fragment into a complete morphology document (XML declaration, DTD and
   * {@code morphology} root element) and parses it.
   *
   * <p>NOTE(review): the parser is the JDK internal Xerces class; confirm that it is
   * accessible on the target Java runtime.
   *
   * @param language the language handed to the content handler
   * @param xmlString the XML fragment that becomes the content of the root element
   * @return the result delivered by the content handler
   * @throws ApplicationException if parsing fails
   */
  private DonatusMorphologyDocument parseDonatusMorphDoc(String language, String xmlString) throws ApplicationException {
    DonatusMorphologyDocument morphologyDoc = null;
    try {
      XMLReader xmlParser = new SAXParser();
      DonatusMorphologyDocumentContentHandler donatusMorphContentHandler = new DonatusMorphologyDocumentContentHandler("tempDummyUri", language);
      xmlParser.setContentHandler(donatusMorphContentHandler);
      String morphDocDefXml = getDonatusMorphDocDefXml();
      String morphDocMorphStartXml = "<morphology xmlns=\"http://archimedes.fas.harvard.edu/ns/morphology/3\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n";
      String morphDocMorphEndXml = "</morphology>";
      String morphDocXml = morphDocDefXml + morphDocMorphStartXml + xmlString + morphDocMorphEndXml;
      Reader reader = new StringReader(morphDocXml);
      InputSource input = new InputSource(reader);
      xmlParser.parse(input);
      morphologyDoc = donatusMorphContentHandler.getResult();
    } catch (SAXException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return morphologyDoc;
  }

  /**
   * @return the XML declaration and the internal DTD of a morphology document
   */
  private static String getDonatusMorphDocDefXml() {
    String defXml =
      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
      "<!DOCTYPE morphology [\n" +
      "<!ELEMENT morphology (lemma*, context-form*)>\n" +
      "<!ELEMENT lemma (definition?, variant*)>\n" +
      "<!ELEMENT context-form (tokens, analysis)>\n" +
      "<!ELEMENT definition (#PCDATA)>\n" +
      "<!ELEMENT variant (analysis)*>\n" +
      "<!ELEMENT analysis EMPTY>\n" +
      "<!ELEMENT tokens (token+)>\n" +
      "<!ELEMENT token EMPTY>\n" +
      "<!ATTLIST morphology\n" +
      " xmlns CDATA #FIXED \"http://archimedes.fas.harvard.edu/ns/morphology/3\"\n" +
      " xmlns:xlink CDATA #FIXED \"http://www.w3.org/1999/xlink\">\n" +
      "<!ATTLIST lemma\n" +
      " form CDATA #REQUIRED\n" +
      " lang CDATA #REQUIRED>\n" +
      "<!ATTLIST definition\n" +
      " lang CDATA #IMPLIED>\n" +
      "<!ATTLIST variant\n" +
      " form CDATA #REQUIRED\n" +
      " modified (y|n) #IMPLIED>\n" +
      "<!ATTLIST analysis\n" +
      " desc CDATA #IMPLIED\n" +
      " xlink:href CDATA #IMPLIED\n" +
      " xlink:type (simple) #FIXED \"simple\"\n" +
      " form CDATA #IMPLIED\n" +
      " id ID #IMPLIED>\n" +
      "<!ATTLIST context-form\n" +
      " lang CDATA #REQUIRED\n" +
      " xlink:href CDATA #REQUIRED\n" +
      " xlink:type (simple) #FIXED \"simple\">\n" +
      "<!ATTLIST token\n" +
      " form CDATA #REQUIRED\n" +
      " count CDATA #REQUIRED>\n" +
      "]>\n";
    return defXml;
  }
}