Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusCache.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.je.Transaction;
import com.sun.org.apache.xerces.internal.parsers.SAXParser;

import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.util.FileUtil;

/**
 * Cache for Donatus lemmas and their variants, backed by two Berkeley DB
 * databases (one keyed by variant form, one keyed by lemma form).
 *
 * <p>The cache works in one of two modes: in {@link #QUERY_MODE} lookups go to
 * the Berkeley DB databases, in {@link #DOCUMENT_MODE} lookups are answered from
 * the in-memory morphology document produced by the last call to
 * {@link #analyze(DonatusAnalyzer, String, ArrayList)}.
 *
 * <p>Access to the singleton is not synchronized.
 */
public class DonatusCache {
  /** Separator between language and form inside a database key. */
  private static final String KEY_SEPARATOR = "###";
  /** Character encoding used for database keys and values. */
  private static final String ENCODING = "UTF-8";
  /** File extension that is replaced when building the names of the saved analysis files. */
  private static final String XML_SUFFIX = ".xml";

  private static DonatusCache instance;
  private DonatusBerkeleyDbEnv berkeleyDBEnv = null;
  private Date state = null;  // last time the cache is written

  // for performance reasons these variables are needed
  public static int QUERY_MODE = 0;
  public static int DOCUMENT_MODE = 1;
  protected int mode = QUERY_MODE;
  // for performance reasons the cache contains a donatusMorphologyDocument which
  // caches all lemmas for one document (in DOCUMENT_MODE)
  private DonatusMorphologyDocument donatusMorphologyDocument = null;

  /**
   * Returns the shared cache instance, creating and initialising it on first use.
   *
   * <p>The instance is only published after {@link #init()} succeeded, so a failed
   * initialisation is retried on the next call instead of handing out a cache
   * without a usable database environment.
   *
   * @return the singleton cache
   * @throws ApplicationException if the Berkeley DB environment cannot be set up
   */
  public static DonatusCache getInstance() throws ApplicationException {
    if (instance == null) {
      DonatusCache cache = new DonatusCache();
      cache.init();
      instance = cache;
    }
    return instance;
  }

  /**
   * Opens the Berkeley DB environment and records the current time as cache state.
   *
   * @throws ApplicationException if the environment setup raises a {@link DatabaseException}
   */
  private void init() throws ApplicationException {
    try {
      berkeleyDBEnv = new DonatusBerkeleyDbEnv();
      berkeleyDBEnv.setup(false); // open databases in read/write mode
      state = new Date();
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * @return the current mode ({@link #QUERY_MODE} or {@link #DOCUMENT_MODE})
   */
  public int getMode() {
    return mode;
  }

  /**
   * Switches the cache mode. Switching to {@link #QUERY_MODE} discards the
   * in-memory morphology document.
   *
   * @param newMode the mode to use from now on
   */
  public void setMode(int newMode) {
    this.mode = newMode;
    if (newMode == QUERY_MODE) {
      donatusMorphologyDocument = null;  // reset the morphology document
    }
  }

  /** Closes the underlying Berkeley DB environment. */
  public void close() {
    berkeleyDBEnv.close();
  }

  // TODO call via the RPC API: execute(String path, HashMap parameters); dedicated MPDL
  // function for administering BerkeleyDB: org.exist.xquery.modules.mpdldb.BerkeleyDBAdmin
  /** Removes the cache databases and records the current time as cache state. */
  public void deleteCache() {
    berkeleyDBEnv.removeDatabases();
    state = new Date();
  }

  /**
   * Runs the Donatus analysis for the given sentences and keeps the resulting
   * morphology document in memory.
   *
   * @param analyzer the analyzer handed to the {@code DonatusHandler}
   * @param docUri URI of the analysed document
   * @param sentences the sentences to analyse
   * @throws ApplicationException if the analysis fails
   */
  public void analyze(DonatusAnalyzer analyzer, String docUri, ArrayList<String> sentences) throws ApplicationException {
    DonatusHandler donatusHandler = new DonatusHandler(analyzer);
    donatusMorphologyDocument = donatusHandler.analyze(docUri, sentences);
  }

  /**
   * Adds a variant to the lemma it belongs to.
   *
   * <p>The lemma is looked up by variant form first, then by lemma form; if neither
   * lookup finds one, a new lemma is created and put into the in-memory morphology
   * document. The variant is then added to whichever lemma was found or created.
   *
   * @param language language of the lemma
   * @param lemmaForm form of the lemma
   * @param type type passed on to the new lemma and variant
   * @param variantForm form of the variant
   * @throws ApplicationException if a cache lookup fails
   */
  public void addVariant(String language, String lemmaForm, String type, String variantForm) throws ApplicationException {
    DonatusLemma lemma = getLemmaByVariantForm(language, variantForm);
    if (lemma == null) {
      // variant is unknown: reuse the lemma if it is cached, otherwise build a new one
      lemma = getLemmaByLemmaForm(language, lemmaForm);
      if (lemma == null) {
        lemma = new DonatusLemma(donatusMorphologyDocument, language, type, lemmaForm);
        donatusMorphologyDocument.putLemma(lemma);
      }
    }
    DonatusVariant variant = new DonatusVariant(lemma, type, variantForm);
    lemma.addVariant(variant);
  }

  /**
   * Writes the lemmas of the in-memory morphology document to the databases and
   * saves the morphology document and the word-tagged document as files below
   * {@code DonatusConstants.BERKELEY_DB_DIR + "/donatusAnalyzedFiles"}.
   *
   * <p>The file names are derived from the path of the document URI: a trailing
   * {@code .xml} is replaced by {@code -donatus-morph-v<millis>.xml} respectively
   * {@code -donatus-wtag-v<millis>.xml}.
   *
   * @throws ApplicationException if the document URI is malformed or writing fails
   */
  public void saveLemmas() throws ApplicationException {
    try {
      String docUri = donatusMorphologyDocument.getDocUri();
      URL url = new URL(docUri);
      String path = url.getPath();
      writeLemmas(donatusMorphologyDocument);
      long version = new Date().getTime();
      String baseDir = DonatusConstants.BERKELEY_DB_DIR + "/donatusAnalyzedFiles";
      FileUtil fileUtil = new FileUtil();

      String morphDocFilePathStr = baseDir + versionedPath(path, "-donatus-morph-v", version);
      byte[] morphDocBytes = donatusMorphologyDocument.getDocumentBytes();
      fileUtil.saveFile(morphDocBytes, morphDocFilePathStr);

      String wtagFilePathStr = baseDir + versionedPath(path, "-donatus-wtag-v", version);
      byte[] wtagBytes = donatusMorphologyDocument.getWtagBytes();
      fileUtil.saveFile(wtagBytes, wtagFilePathStr);
    } catch (MalformedURLException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    state = new Date();
  }

  /**
   * Inserts {@code infix + version} in front of the {@code .xml} extension of a path.
   *
   * <p>The extension is matched literally at the end of the path (a regular
   * expression such as {@code ".xml"} would also match e.g. {@code "/xml"} earlier in
   * the path). If the path does not end in {@code .xml} the suffix is appended.
   */
  private static String versionedPath(String path, String infix, long version) {
    String base = path;
    if (path.endsWith(XML_SUFFIX)) {
      base = path.substring(0, path.length() - XML_SUFFIX.length());
    }
    return base + infix + version + XML_SUFFIX;
  }

  /**
   * Finds the lemma a variant form belongs to.
   *
   * @param language language of the variant
   * @param variantForm form of the variant
   * @return the lemma, or {@code null} if none is cached
   * @throws ApplicationException if the database lookup fails
   */
  public DonatusLemma getLemmaByVariantForm(String language, String variantForm) throws ApplicationException {
    if (mode == QUERY_MODE) {
      return readVariantLemma(null, language, variantForm);
    }
    if (donatusMorphologyDocument == null) {
      return null;
    }
    DonatusVariant variant = donatusMorphologyDocument.getVariant(variantForm);
    if (variant == null) {
      return null;
    }
    String lemmaForm = variant.getLemma().getForm();
    return donatusMorphologyDocument.getLemma(lemmaForm);
  }

  /**
   * Finds a lemma by its own form.
   *
   * @param language language of the lemma
   * @param lemmaForm form of the lemma
   * @return the lemma, or {@code null} if none is cached
   * @throws ApplicationException if the database lookup fails
   */
  public DonatusLemma getLemmaByLemmaForm(String language, String lemmaForm) throws ApplicationException {
    if (mode == QUERY_MODE) {
      return readLemma(null, language, lemmaForm);
    }
    if (donatusMorphologyDocument == null) {
      return null;
    }
    return donatusMorphologyDocument.getLemma(lemmaForm);
  }

  /**
   * Collects, for every plain term of a Lucene query, all variants of the lemma
   * that term belongs to.
   *
   * @param language language used for the lemma lookup
   * @param luceneQueryString the query string
   * @return the variants found; empty if no term resolves to a lemma
   * @throws ApplicationException if a cache lookup fails
   */
  public ArrayList<DonatusVariant> getQueryVariants(String language, String luceneQueryString) throws ApplicationException {
    ArrayList<DonatusVariant> result = new ArrayList<DonatusVariant>();
    for (String variantStr : getVariantsFromLuceneQuery(luceneQueryString)) {
      DonatusLemma lemma = getLemmaByVariantForm(language, variantStr);
      if (lemma != null) {
        result.addAll(lemma.getVariants());
      }
    }
    return result;
  }

  /**
   * Writes all variants and lemmas of a morphology document to the databases.
   *
   * <p>A variant that is already stored under the same lemma form gets the variants
   * of the new lemma merged into the stored lemma; a variant stored under a
   * different lemma form is left untouched; an unknown variant is stored with its
   * new lemma.
   *
   * @param morphologyDoc the document whose content is written
   * @throws ApplicationException if reading or writing a database entry fails
   */
  private void writeLemmas(DonatusMorphologyDocument morphologyDoc) throws ApplicationException {
    Transaction txn = null;  // without txn
    // Transaction txn = berkeleyDBEnv.getEnv().beginTransaction(null, null);
    // delivers all variants of all lemmas - so for example more than one variant with the same form name but in different lemmas
    ArrayList<DonatusVariant> variants = morphologyDoc.getVariants();
    for (DonatusVariant newVariant : variants) {
      String newVariantForm = newVariant.getForm();
      String language = newVariant.getLemma().getLanguage();
      if (!hasText(newVariantForm) || !hasText(language)) {
        continue;
      }
      DonatusLemma newVariantLemma = newVariant.getLemma();
      // look if this variant is already contained in variantDB and if so if the lemma there is the same as the new variant lemma
      DonatusLemma dbVariantLemma = readVariantLemma(txn, language, newVariantForm);
      if (dbVariantLemma == null) {
        writeLemmaByVariantKey(txn, newVariant, newVariantLemma);
      } else if (dbVariantLemma.getForm().equals(newVariantLemma.getForm())) {
        // the variants of newVariantLemma are added to the existing variantLemma and this lemma is saved
        for (DonatusVariant v : newVariantLemma.getVariants()) {
          dbVariantLemma.addVariant(v);
        }
        writeLemmaByVariantKey(txn, newVariant, dbVariantLemma);
      }
      // else: the two lemmas of the new and existing variant are not the same: nothing should be saved
    }
    // Only filled, not tested and used yet, for future
    ArrayList<DonatusLemma> lemmas = morphologyDoc.getLemmas();
    for (DonatusLemma lemma : lemmas) {
      if (hasText(lemma.getForm()) && hasText(lemma.getLanguage())) {
        writeLemmaByLemmaKey(txn, lemma);
      }
    }
    state = new Date();
  }

  /** Returns true if the string is neither {@code null} nor empty. */
  private static boolean hasText(String s) {
    return s != null && !s.equals("");
  }

  // TODO method is only simple: proof all Lucene cases
  /**
   * Extracts the plain terms of a Lucene query string.
   *
   * <p>The query is split at single blanks; tokens containing one of
   * {@code * ? ~ - + ^ OR AND NOT} are dropped, as are empty tokens produced by
   * consecutive blanks.
   *
   * @param queryString the query string, may be {@code null}
   * @return the remaining tokens; empty for a {@code null} query
   */
  private ArrayList<String> getVariantsFromLuceneQuery(String queryString) {
    ArrayList<String> variants = new ArrayList<String>();
    if (queryString == null) {
      return variants;
    }
    String[] variantTokens = queryString.split(" ");  // TODO throw the phrases away (e.g.: "bla bla bla")
    for (String token : variantTokens) {
      if (token.length() > 0 && !isLuceneSyntaxToken(token)) {
        variants.add(token);
      }
    }
    return variants;
  }

  /** Returns true if the token contains a Lucene operator or wildcard character. */
  private static boolean isLuceneSyntaxToken(String token) {
    return token.contains("*") || token.contains("?") || token.contains("~")
        || token.contains("-") || token.contains("+") || token.contains("^")
        || token.contains("OR") || token.contains("AND") || token.contains("NOT");
  }

  /** Builds the database key {@code <language>###<form>}. */
  private static String buildKey(String language, String form) {
    return language + KEY_SEPARATOR + form;
  }

  /**
   * Stores the XML of a lemma in the variant database under the key of the given variant.
   *
   * @throws ApplicationException if the write fails
   */
  private void writeLemmaByVariantKey(Transaction txn, DonatusVariant variantKey, DonatusLemma lemma) throws ApplicationException {
    try {
      String variantKeyStr = buildKey(variantKey.getLemma().getLanguage(), variantKey.getForm());
      putLemma(berkeleyDBEnv.getVariantDB(), txn, variantKeyStr, lemma);
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Stores the XML of a lemma in the lemma database under the lemma's own key.
   *
   * @throws ApplicationException if the write fails
   */
  private void writeLemmaByLemmaKey(Transaction txn, DonatusLemma lemma) throws ApplicationException {
    try {
      String lemmaKeyStr = buildKey(lemma.getLanguage(), lemma.getForm());
      putLemma(berkeleyDBEnv.getLemmaDB(), txn, lemmaKeyStr, lemma);
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
  }

  /** Puts the UTF-8 encoded XML string of a lemma into a database under the given key. */
  private static void putLemma(Database db, Transaction txn, String key, DonatusLemma lemma) throws DatabaseException, UnsupportedEncodingException {
    DatabaseEntry dbEntryKey = new DatabaseEntry(key.getBytes(ENCODING));
    DatabaseEntry dbEntryValue = new DatabaseEntry(lemma.getXmlString().getBytes(ENCODING));
    db.put(txn, dbEntryKey, dbEntryValue);
  }

  /**
   * Reads the lemma stored for a variant form from the variant database.
   *
   * @return the lemma, or {@code null} if the key is not present
   * @throws ApplicationException if the read or the XML parsing fails
   */
  private DonatusLemma readVariantLemma(Transaction txn, String language, String variantForm) throws ApplicationException {
    try {
      return readLemmaEntry(berkeleyDBEnv.getVariantDB(), txn, language, variantForm);
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Reads the lemma stored for a lemma form from the lemma database.
   *
   * @return the lemma, or {@code null} if the key is not present
   * @throws ApplicationException if the read or the XML parsing fails
   */
  private DonatusLemma readLemma(Transaction txn, String language, String lemmaForm) throws ApplicationException {
    try {
      return readLemmaEntry(berkeleyDBEnv.getLemmaDB(), txn, language, lemmaForm);
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Looks up {@code <language>###<form>} in a database and parses the stored XML.
   *
   * <p>The lookup uses the caller's transaction (all current callers pass
   * {@code null}). No cursor is opened: a plain {@code get} needs none, and an
   * unused cursor would stay open if the lookup threw.
   */
  private DonatusLemma readLemmaEntry(Database db, Transaction txn, String language, String form) throws DatabaseException, UnsupportedEncodingException, ApplicationException {
    DatabaseEntry dbEntryKey = new DatabaseEntry(buildKey(language, form).getBytes(ENCODING));
    DatabaseEntry foundXmlLemmaValue = new DatabaseEntry();
    OperationStatus operationStatus = db.get(txn, dbEntryKey, foundXmlLemmaValue, LockMode.DEFAULT);
    if (operationStatus != OperationStatus.SUCCESS) {
      return null;
    }
    String foundXmlLemmaStr = new String(foundXmlLemmaValue.getData(), ENCODING);
    return parseXmlLemmaString(language, foundXmlLemmaStr);
  }

  /**
   * Parses the XML of a single lemma.
   *
   * @return the first lemma of the parsed document, or {@code null} if there is none
   * @throws ApplicationException if the XML cannot be parsed
   */
  private DonatusLemma parseXmlLemmaString(String language, String xmlLemmaString) throws ApplicationException {
    DonatusMorphologyDocument morphologyDoc = parseDonatusMorphDoc(language, xmlLemmaString);
    if (morphologyDoc == null) {
      return null;
    }
    ArrayList<DonatusLemma> lemmas = morphologyDoc.getLemmas();
    if (lemmas.size() > 0) {
      return lemmas.get(0);
    }
    return null;
  }

  /**
   * Wraps an XML fragment into a complete morphology document (XML declaration,
   * DTD and {@code <morphology>} root element) and parses it.
   *
   * @param language language handed to the content handler
   * @param xmlString the XML fragment placed inside the root element
   * @return the result delivered by the content handler
   * @throws ApplicationException if parsing fails
   */
  private DonatusMorphologyDocument parseDonatusMorphDoc(String language, String xmlString) throws ApplicationException {
    DonatusMorphologyDocument morphologyDoc = null;
    try {
      // NOTE(review): com.sun.org.apache.xerces.internal is a JDK-internal package; kept
      // because the content handler may rely on this parser's default feature settings.
      XMLReader xmlParser = new SAXParser();
      DonatusMorphologyDocumentContentHandler donatusMorphContentHandler = new DonatusMorphologyDocumentContentHandler("tempDummyUri", language);
      xmlParser.setContentHandler(donatusMorphContentHandler);
      String morphDocDefXml = getDonatusMorphDocDefXml();
      String morphDocMorphStartXml = "<morphology xmlns=\"http://archimedes.fas.harvard.edu/ns/morphology/3\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n";
      String morphDocMorphEndXml = "</morphology>";
      String morphDocXml = morphDocDefXml + morphDocMorphStartXml + xmlString + morphDocMorphEndXml;
      Reader reader = new StringReader(morphDocXml);
      InputSource input = new InputSource(reader);
      xmlParser.parse(input);
      morphologyDoc = donatusMorphContentHandler.getResult();
    } catch (SAXException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return morphologyDoc;
  }

  /**
   * @return the XML declaration and the internal DTD subset that precede the
   *         {@code <morphology>} root element of a morphology document
   */
  private static String getDonatusMorphDocDefXml() {
    String defXml =
      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
      "<!DOCTYPE morphology [\n" +
      "<!ELEMENT morphology (lemma*, context-form*)>\n" +
      "<!ELEMENT lemma (definition?, variant*)>\n" +
      "<!ELEMENT context-form (tokens, analysis)>\n" +
      "<!ELEMENT definition (#PCDATA)>\n" +
      "<!ELEMENT variant (analysis)*>\n" +
      "<!ELEMENT analysis EMPTY>\n" +
      "<!ELEMENT tokens (token+)>\n" +
      "<!ELEMENT token EMPTY>\n" +
      "<!ATTLIST morphology\n" +
      " xmlns CDATA #FIXED \"http://archimedes.fas.harvard.edu/ns/morphology/3\"\n" +
      " xmlns:xlink CDATA #FIXED \"http://www.w3.org/1999/xlink\">\n" +
      "<!ATTLIST lemma\n" +
      " form CDATA #REQUIRED\n" +
      " lang CDATA #REQUIRED>\n" +
      "<!ATTLIST definition\n" +
      " lang CDATA #IMPLIED>\n" +
      "<!ATTLIST variant\n" +
      " form CDATA #REQUIRED\n" +
      " modified (y|n) #IMPLIED>\n" +
      "<!ATTLIST analysis\n" +
      " desc CDATA #IMPLIED\n" +
      " xlink:href CDATA #IMPLIED\n" +
      " xlink:type (simple) #FIXED \"simple\"\n" +
      " form CDATA #IMPLIED\n" +
      " id ID #IMPLIED>\n" +
      "<!ATTLIST context-form\n" +
      " lang CDATA #REQUIRED\n" +
      " xlink:href CDATA #REQUIRED\n" +
      " xlink:type (simple) #FIXED \"simple\">\n" +
      "<!ATTLIST token\n" +
      " form CDATA #REQUIRED\n" +
      " count CDATA #REQUIRED>\n" +
      "]>\n";
    return defXml;
  }
}