comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusCache.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:408254cf2f1d
1 package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc;
2
3 import java.io.FileNotFoundException;
4 import java.io.IOException;
5 import java.io.Reader;
6 import java.io.StringReader;
7 import java.io.UnsupportedEncodingException;
8 import java.net.MalformedURLException;
9 import java.net.URL;
10 import java.util.ArrayList;
11 import java.util.Date;
12
13 import org.xml.sax.InputSource;
14 import org.xml.sax.SAXException;
15 import org.xml.sax.XMLReader;
16
17 import com.sleepycat.je.Cursor;
18 import com.sleepycat.je.Database;
19 import com.sleepycat.je.DatabaseEntry;
20 import com.sleepycat.je.DatabaseException;
21 import com.sleepycat.je.LockMode;
22 import com.sleepycat.je.OperationStatus;
23 import com.sleepycat.je.Transaction;
24 import com.sun.org.apache.xerces.internal.parsers.SAXParser;
25
26 import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer;
27 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
28 import de.mpg.mpiwg.berlin.mpdl.util.FileUtil;
29
30 public class DonatusCache {
31 private static DonatusCache instance;
32 private DonatusBerkeleyDbEnv berkeleyDBEnv = null;
33 private Date state = null; // last time the cache is written
34
35 // for performance reasons these variables are needed
36 public static int QUERY_MODE = 0;
37 public static int DOCUMENT_MODE = 1;
38 protected int mode = QUERY_MODE;
39 // for performance reasons the cache contains a donatusMorphologyDocument which
40 // caches all lemmas for one document (in DOCUMENT_MODE)
41 private DonatusMorphologyDocument donatusMorphologyDocument = null;
42
43 public static DonatusCache getInstance() throws ApplicationException {
44 if (instance == null) {
45 instance = new DonatusCache();
46 instance.init();
47 }
48 return instance;
49 }
50
51 private void init() throws ApplicationException {
52 try {
53 berkeleyDBEnv = new DonatusBerkeleyDbEnv();
54 berkeleyDBEnv.setup(false); // open databases in read/write mode
55 state = new Date();
56 } catch (DatabaseException e) {
57 throw new ApplicationException(e);
58 }
59 }
60
61 public int getMode() {
62 return mode;
63 }
64
65 public void setMode(int newMode) {
66 this.mode = newMode;
67 if (newMode == QUERY_MODE)
68 donatusMorphologyDocument = null; // reset the morphology document
69 }
70
71 public void close() {
72 berkeleyDBEnv.close();
73 }
74
75 // TODO Aufruf über RPC-API: execute(String path, HashMap parameters); spez. MPDL-Funktion zum Administrieren von BerkeleyDB: org.exist.xquery.modules.mpdldb.BerkeleyDBAdmin
76 public void deleteCache() {
77 berkeleyDBEnv.removeDatabases();
78 state = new Date();
79 }
80
81 public void analyze(DonatusAnalyzer analyzer, String docUri, ArrayList<String> sentences) throws ApplicationException {
82 DonatusHandler donatusHandler = new DonatusHandler(analyzer);
83 donatusMorphologyDocument = donatusHandler.analyze(docUri, sentences);
84 }
85
86 public void addVariant(String language, String lemmaForm, String type, String variantForm) throws ApplicationException {
87 DonatusLemma lemma = getLemmaByVariantForm(language, variantForm);
88 // if variantForm is already cached in a lemma then do nothing
89 if (lemma == null) {
90 // if lemmaForm is already cached as a lemma then do nothing else build the new lemma with the variant
91 lemma = getLemmaByLemmaForm(language, lemmaForm);
92 if (lemma == null) {
93 lemma = new DonatusLemma(donatusMorphologyDocument, language, type, lemmaForm);
94 donatusMorphologyDocument.putLemma(lemma);
95 } else {
96 // nothing
97 }
98 }
99 DonatusVariant v = new DonatusVariant(lemma, type, variantForm);
100 lemma.addVariant(v);
101 }
102
103 public void saveLemmas() throws ApplicationException {
104 try {
105 String docUri = donatusMorphologyDocument.getDocUri();
106 URL url = new URL(docUri);
107 String path = url.getPath();
108 writeLemmas(donatusMorphologyDocument);
109 Date endOfOperation2 = new Date();
110 String donMorphPath = path.replaceFirst(".xml", "-donatus-morph-v" + endOfOperation2.getTime() + ".xml");
111 String morphDocFilePathStr = DonatusConstants.BERKELEY_DB_DIR + "/donatusAnalyzedFiles" + donMorphPath;
112 FileUtil fileUtil = new FileUtil();
113 byte[] morphDocBytes = donatusMorphologyDocument.getDocumentBytes();
114 fileUtil.saveFile(morphDocBytes, morphDocFilePathStr);
115 String donWtagPath = path.replaceFirst(".xml", "-donatus-wtag-v" + endOfOperation2.getTime() + ".xml");
116 String wtagFilePathStr = DonatusConstants.BERKELEY_DB_DIR + "/donatusAnalyzedFiles" + donWtagPath;
117 byte[] wtagBytes = donatusMorphologyDocument.getWtagBytes();
118 fileUtil.saveFile(wtagBytes, wtagFilePathStr);
119 } catch (MalformedURLException e) {
120 throw new ApplicationException(e);
121 } catch (IOException e) {
122 throw new ApplicationException(e);
123 }
124 state = new Date();
125 }
126
127 public DonatusLemma getLemmaByVariantForm(String language, String variantForm) throws ApplicationException {
128 DonatusLemma lemma = null;
129 if (mode == QUERY_MODE) {
130 lemma = readVariantLemma(null, language, variantForm);
131 } else {
132 if (donatusMorphologyDocument != null) {
133 DonatusVariant v = donatusMorphologyDocument.getVariant(variantForm);
134 if (v != null) {
135 DonatusLemma l = v.getLemma();
136 lemma = donatusMorphologyDocument.getLemma(l.getForm());
137 }
138 }
139 }
140 return lemma;
141 }
142
143 public DonatusLemma getLemmaByLemmaForm(String language, String lemmaForm) throws ApplicationException {
144 DonatusLemma lemma = null;
145 if (mode == QUERY_MODE) {
146 lemma = readLemma(null, language, lemmaForm);
147 } else {
148 if (donatusMorphologyDocument != null) {
149 lemma = donatusMorphologyDocument.getLemma(lemmaForm);
150 }
151 }
152 return lemma;
153 }
154
155 public ArrayList<DonatusVariant> getQueryVariants(String language, String luceneQueryString) throws ApplicationException {
156 ArrayList<DonatusVariant> result = new ArrayList<DonatusVariant>();
157 ArrayList<String> variantsFromQuery = getVariantsFromLuceneQuery(luceneQueryString);
158 if (! (variantsFromQuery == null || variantsFromQuery.isEmpty())) {
159 for (int i=0; i<variantsFromQuery.size(); i++) {
160 String variantStr = variantsFromQuery.get(i);
161 DonatusLemma lemma = getLemmaByVariantForm(language, variantStr);
162 if (lemma != null) {
163 ArrayList<DonatusVariant> lemmaVariants = lemma.getVariants();
164 result.addAll(lemmaVariants);
165 }
166 }
167 }
168 return result;
169 }
170
171 private void writeLemmas(DonatusMorphologyDocument donatusMorphologyDocument) throws ApplicationException {
172 Transaction txn = null; // without txn
173 // Transaction txn = berkeleyDBEnv.getEnv().beginTransaction(null, null);
174 // delivers all variants of all lemmas - so for example more than one variant with the same form name but in different lemmas
175 ArrayList<DonatusVariant> variants = donatusMorphologyDocument.getVariants();
176 for (int i=0; i<variants.size(); i++) {
177 DonatusVariant newVariant = variants.get(i);
178 String newVariantForm = newVariant.getForm();
179 String language = newVariant.getLemma().getLanguage();
180 if (newVariantForm != null && language != null && ! newVariantForm.equals("") && ! language.equals("")) {
181 DonatusLemma newVariantLemma = newVariant.getLemma();
182 // look if this variant is already contained in variantDB and if so if the lemma there is the same as the new variant lemma
183 DonatusLemma dbVariantLemma = readVariantLemma(txn, language, newVariantForm);
184 if (dbVariantLemma != null) {
185 if (dbVariantLemma.getForm().equals(newVariantLemma.getForm())) {
186 // the variants of newVariantLemma are added to the existing variantLemma and this lemma is saved
187 ArrayList<DonatusVariant> newVariantLemmaVariants = newVariantLemma.getVariants();
188 for (int j=0; j<newVariantLemmaVariants.size(); j++) {
189 DonatusVariant v = newVariantLemmaVariants.get(j);
190 dbVariantLemma.addVariant(v);
191 }
192 writeLemmaByVariantKey(txn, newVariant, dbVariantLemma);
193 } else {
194 // the two lemmas of the new and existing variant are not the same: nothing should be saved
195 }
196 } else {
197 writeLemmaByVariantKey(txn, newVariant, newVariantLemma);
198 }
199 }
200 }
201 // Only filled, not tested and used yet, for future
202 ArrayList<DonatusLemma> lemmas = donatusMorphologyDocument.getLemmas();
203 for (int i=0; i<lemmas.size(); i++) {
204 DonatusLemma lemma = lemmas.get(i);
205 String lemmaForm = lemma.getForm();
206 String language = lemma.getLanguage();
207 if (lemmaForm != null && language != null && ! lemmaForm.equals("") && ! language.equals("")) {
208 writeLemmaByLemmaKey(txn, lemma);
209 }
210 }
211 state = new Date();
212 }
213
214 // TODO method is only simple: proof all Lucene cases
215 private ArrayList<String> getVariantsFromLuceneQuery(String queryString) {
216 ArrayList<String> variants = new ArrayList<String>();
217 String[] variantTokens = queryString.split(" "); // TODO throw the phrases away (e.g.: "bla bla bla")
218 for (int i = 0; i < variantTokens.length; i++) {
219 String token = variantTokens[i];
220 if (! (token.contains("*") || token.contains("?") || token.contains("~") || token.contains("-") || token.contains("+") || token.contains("^") || token.contains("OR") || token.contains("AND") || token.contains("NOT"))) {
221 variants.add(token);
222 }
223 }
224 return variants;
225 }
226
227 private void writeLemmaByVariantKey(Transaction txn, DonatusVariant variantKey, DonatusLemma lemma) throws ApplicationException {
228 try {
229 String variantKeyStr = variantKey.getLemma().getLanguage() + "###" + variantKey.getForm();
230 DatabaseEntry dbEntryKey = new DatabaseEntry(variantKeyStr.getBytes("UTF-8"));
231 String lemmaXmlValue = lemma.getXmlString();
232 DatabaseEntry dbEntryValue = new DatabaseEntry(lemmaXmlValue.getBytes("UTF-8"));
233 Database variantDB = berkeleyDBEnv.getVariantDB();
234 variantDB.put(txn, dbEntryKey, dbEntryValue);
235 } catch (DatabaseException e) {
236 throw new ApplicationException(e);
237 } catch (UnsupportedEncodingException e) {
238 throw new ApplicationException(e);
239 }
240 }
241
242 private void writeLemmaByLemmaKey(Transaction txn, DonatusLemma lemma) throws ApplicationException {
243 try {
244 String lemmaKeyStr = lemma.getLanguage() + "###" + lemma.getForm();
245 DatabaseEntry dbEntryKey = new DatabaseEntry(lemmaKeyStr.getBytes("UTF-8"));
246 String lemmaXmlValue = lemma.getXmlString();
247 DatabaseEntry dbEntryValue = new DatabaseEntry(lemmaXmlValue.getBytes("UTF-8"));
248 Database lemmaDB = berkeleyDBEnv.getLemmaDB();
249 lemmaDB.put(txn, dbEntryKey, dbEntryValue);
250 } catch (DatabaseException e) {
251 throw new ApplicationException(e);
252 } catch (UnsupportedEncodingException e) {
253 throw new ApplicationException(e);
254 }
255 }
256
257 private DonatusLemma readVariantLemma(Transaction txn, String language, String variantForm) throws ApplicationException {
258 DonatusLemma lemma = null;
259 String hashKey = language + "###" + variantForm;
260 try {
261 Database variantDB = berkeleyDBEnv.getVariantDB();
262 Cursor cursor = variantDB.openCursor(txn, null);
263 byte[] bHashKey = hashKey.getBytes("UTF-8");
264 DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey);
265 DatabaseEntry foundXmlLemmaValue = new DatabaseEntry();
266 OperationStatus operationStatus = variantDB.get(null, dbEntryKey, foundXmlLemmaValue, LockMode.DEFAULT);
267 if (operationStatus == OperationStatus.SUCCESS) {
268 byte[] foundXmlLemmaValueBytes = foundXmlLemmaValue.getData();
269 String foundXmlLemmaStr = new String(foundXmlLemmaValueBytes, "UTF-8");
270 lemma = parseXmlLemmaString(language, foundXmlLemmaStr);
271 }
272 cursor.close();
273 } catch (DatabaseException e) {
274 throw new ApplicationException(e);
275 } catch (UnsupportedEncodingException e) {
276 throw new ApplicationException(e);
277 }
278 return lemma;
279 }
280
281 private DonatusLemma readLemma(Transaction txn, String language, String lemmaForm) throws ApplicationException {
282 DonatusLemma lemma = null;
283 String hashKey = language + "###" + lemmaForm;
284 try {
285 Database lemmaDB = berkeleyDBEnv.getLemmaDB();
286 Cursor cursor = lemmaDB.openCursor(txn, null);
287 byte[] bHashKey = hashKey.getBytes("UTF-8");
288 DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey);
289 DatabaseEntry foundXmlLemmaValue = new DatabaseEntry();
290 OperationStatus operationStatus = lemmaDB.get(null, dbEntryKey, foundXmlLemmaValue, LockMode.DEFAULT);
291 if (operationStatus == OperationStatus.SUCCESS) {
292 byte[] foundXmlLemmaValueBytes = foundXmlLemmaValue.getData();
293 String foundXmlLemmaStr = new String(foundXmlLemmaValueBytes, "UTF-8");
294 lemma = parseXmlLemmaString(language, foundXmlLemmaStr);
295 }
296 cursor.close();
297 } catch (DatabaseException e) {
298 throw new ApplicationException(e);
299 } catch (UnsupportedEncodingException e) {
300 throw new ApplicationException(e);
301 }
302 return lemma;
303 }
304
305 private DonatusLemma parseXmlLemmaString(String language, String xmlLemmaString) throws ApplicationException {
306 DonatusLemma lemma = null;
307 DonatusMorphologyDocument morphologyDoc = parseDonatusMorphDoc(language, xmlLemmaString);
308 ArrayList<DonatusLemma> lemmas = morphologyDoc.getLemmas();
309 if (lemmas.size() > 0)
310 lemma = lemmas.get(0);
311 return lemma;
312 }
313
314 private DonatusMorphologyDocument parseDonatusMorphDoc(String language, String xmlString) throws ApplicationException {
315 DonatusMorphologyDocument morphologyDoc = null;
316 try {
317 XMLReader xmlParser = new SAXParser();
318 DonatusMorphologyDocumentContentHandler donatusMorphContentHandler = new DonatusMorphologyDocumentContentHandler("tempDummyUri", language);
319 xmlParser.setContentHandler(donatusMorphContentHandler);
320 String morphDocDefXml = getDonatusMorphDocDefXml();
321 String morphDocMorphStartXml = "<morphology xmlns=\"http://archimedes.fas.harvard.edu/ns/morphology/3\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n";
322 String morphDocMorphEndXml = "</morphology>";
323 String morphDocXml = morphDocDefXml + morphDocMorphStartXml + xmlString + morphDocMorphEndXml;
324 Reader reader = new StringReader(morphDocXml);
325 InputSource input = new InputSource(reader);
326 xmlParser.parse(input);
327 morphologyDoc = donatusMorphContentHandler.getResult();
328 } catch (SAXException e) {
329 throw new ApplicationException(e);
330 } catch (IOException e) {
331 throw new ApplicationException(e);
332 }
333 return morphologyDoc;
334 }
335
336 private static String getDonatusMorphDocDefXml() {
337 String defXml =
338 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
339 "<!DOCTYPE morphology [\n" +
340 "<!ELEMENT morphology (lemma*, context-form*)>\n" +
341 "<!ELEMENT lemma (definition?, variant*)>\n" +
342 "<!ELEMENT context-form (tokens, analysis)>\n" +
343 "<!ELEMENT definition (#PCDATA)>\n" +
344 "<!ELEMENT variant (analysis)*>\n" +
345 "<!ELEMENT analysis EMPTY>\n" +
346 "<!ELEMENT tokens (token+)>\n" +
347 "<!ELEMENT token EMPTY>\n" +
348 "<!ATTLIST morphology\n" +
349 " xmlns CDATA #FIXED \"http://archimedes.fas.harvard.edu/ns/morphology/3\"\n" +
350 " xmlns:xlink CDATA #FIXED \"http://www.w3.org/1999/xlink\">\n" +
351 "<!ATTLIST lemma\n" +
352 " form CDATA #REQUIRED\n" +
353 " lang CDATA #REQUIRED>\n" +
354 "<!ATTLIST definition\n" +
355 " lang CDATA #IMPLIED>\n" +
356 "<!ATTLIST variant\n" +
357 " form CDATA #REQUIRED\n" +
358 " modified (y|n) #IMPLIED>\n" +
359 "<!ATTLIST analysis\n" +
360 " desc CDATA #IMPLIED\n" +
361 " xlink:href CDATA #IMPLIED\n" +
362 " xlink:type (simple) #FIXED \"simple\"\n" +
363 " form CDATA #IMPLIED\n" +
364 " id ID #IMPLIED>\n" +
365 "<!ATTLIST context-form\n" +
366 " lang CDATA #REQUIRED\n" +
367 " xlink:href CDATA #REQUIRED\n" +
368 " xlink:type (simple) #FIXED \"simple\">\n" +
369 "<!ATTLIST token\n" +
370 " form CDATA #REQUIRED\n" +
371 " count CDATA #REQUIRED>\n" +
372 "]>\n";
373 return defXml;
374 }
375 }