Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/xmlrpc/DonatusCache.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:408254cf2f1d |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc; | |
2 | |
3 import java.io.FileNotFoundException; | |
4 import java.io.IOException; | |
5 import java.io.Reader; | |
6 import java.io.StringReader; | |
7 import java.io.UnsupportedEncodingException; | |
8 import java.net.MalformedURLException; | |
9 import java.net.URL; | |
10 import java.util.ArrayList; | |
11 import java.util.Date; | |
12 | |
13 import org.xml.sax.InputSource; | |
14 import org.xml.sax.SAXException; | |
15 import org.xml.sax.XMLReader; | |
16 | |
17 import com.sleepycat.je.Cursor; | |
18 import com.sleepycat.je.Database; | |
19 import com.sleepycat.je.DatabaseEntry; | |
20 import com.sleepycat.je.DatabaseException; | |
21 import com.sleepycat.je.LockMode; | |
22 import com.sleepycat.je.OperationStatus; | |
23 import com.sleepycat.je.Transaction; | |
24 import com.sun.org.apache.xerces.internal.parsers.SAXParser; | |
25 | |
26 import de.mpg.mpiwg.berlin.mpdl.donatus.analysis.DonatusAnalyzer; | |
27 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
28 import de.mpg.mpiwg.berlin.mpdl.util.FileUtil; | |
29 | |
30 public class DonatusCache { | |
// Lazily created shared instance handed out by getInstance().
31 private static DonatusCache instance; | |
// Berkeley DB environment wrapper; created in init(), released in close().
32 private DonatusBerkeleyDbEnv berkeleyDBEnv = null; | |
// Refreshed by init(), deleteCache(), saveLemmas() and writeLemmas().
33 private Date state = null; // last time the cache is written | |
34 | |
35 // for performance reasons these variables are needed | |
// QUERY_MODE: lemma lookups read from the Berkeley DB databases.
36 public static int QUERY_MODE = 0; | |
// DOCUMENT_MODE: lemma lookups read from the in-memory donatusMorphologyDocument.
37 public static int DOCUMENT_MODE = 1; | |
// Current lookup mode; changed through setMode(int).
38 protected int mode = QUERY_MODE; | |
39 // for performance reasons the cache contains a donatusMorphologyDocument which | |
40 // caches all lemmas for one document (in DOCUMENT_MODE) | |
// Set by analyze(...) and reset to null when setMode(QUERY_MODE) is called.
41 private DonatusMorphologyDocument donatusMorphologyDocument = null; | |
42 | |
43 public static DonatusCache getInstance() throws ApplicationException { | |
44 if (instance == null) { | |
45 instance = new DonatusCache(); | |
46 instance.init(); | |
47 } | |
48 return instance; | |
49 } | |
50 | |
51 private void init() throws ApplicationException { | |
52 try { | |
53 berkeleyDBEnv = new DonatusBerkeleyDbEnv(); | |
54 berkeleyDBEnv.setup(false); // open databases in read/write mode | |
55 state = new Date(); | |
56 } catch (DatabaseException e) { | |
57 throw new ApplicationException(e); | |
58 } | |
59 } | |
60 | |
61 public int getMode() { | |
62 return mode; | |
63 } | |
64 | |
65 public void setMode(int newMode) { | |
66 this.mode = newMode; | |
67 if (newMode == QUERY_MODE) | |
68 donatusMorphologyDocument = null; // reset the morphology document | |
69 } | |
70 | |
71 public void close() { | |
72 berkeleyDBEnv.close(); | |
73 } | |
74 | |
75 // TODO Aufruf über RPC-API: execute(String path, HashMap parameters); spez. MPDL-Funktion zum Administrieren von BerkeleyDB: org.exist.xquery.modules.mpdldb.BerkeleyDBAdmin | |
76 public void deleteCache() { | |
77 berkeleyDBEnv.removeDatabases(); | |
78 state = new Date(); | |
79 } | |
80 | |
81 public void analyze(DonatusAnalyzer analyzer, String docUri, ArrayList<String> sentences) throws ApplicationException { | |
82 DonatusHandler donatusHandler = new DonatusHandler(analyzer); | |
83 donatusMorphologyDocument = donatusHandler.analyze(docUri, sentences); | |
84 } | |
85 | |
86 public void addVariant(String language, String lemmaForm, String type, String variantForm) throws ApplicationException { | |
87 DonatusLemma lemma = getLemmaByVariantForm(language, variantForm); | |
88 // if variantForm is already cached in a lemma then do nothing | |
89 if (lemma == null) { | |
90 // if lemmaForm is already cached as a lemma then do nothing else build the new lemma with the variant | |
91 lemma = getLemmaByLemmaForm(language, lemmaForm); | |
92 if (lemma == null) { | |
93 lemma = new DonatusLemma(donatusMorphologyDocument, language, type, lemmaForm); | |
94 donatusMorphologyDocument.putLemma(lemma); | |
95 } else { | |
96 // nothing | |
97 } | |
98 } | |
99 DonatusVariant v = new DonatusVariant(lemma, type, variantForm); | |
100 lemma.addVariant(v); | |
101 } | |
102 | |
103 public void saveLemmas() throws ApplicationException { | |
104 try { | |
105 String docUri = donatusMorphologyDocument.getDocUri(); | |
106 URL url = new URL(docUri); | |
107 String path = url.getPath(); | |
108 writeLemmas(donatusMorphologyDocument); | |
109 Date endOfOperation2 = new Date(); | |
110 String donMorphPath = path.replaceFirst(".xml", "-donatus-morph-v" + endOfOperation2.getTime() + ".xml"); | |
111 String morphDocFilePathStr = DonatusConstants.BERKELEY_DB_DIR + "/donatusAnalyzedFiles" + donMorphPath; | |
112 FileUtil fileUtil = new FileUtil(); | |
113 byte[] morphDocBytes = donatusMorphologyDocument.getDocumentBytes(); | |
114 fileUtil.saveFile(morphDocBytes, morphDocFilePathStr); | |
115 String donWtagPath = path.replaceFirst(".xml", "-donatus-wtag-v" + endOfOperation2.getTime() + ".xml"); | |
116 String wtagFilePathStr = DonatusConstants.BERKELEY_DB_DIR + "/donatusAnalyzedFiles" + donWtagPath; | |
117 byte[] wtagBytes = donatusMorphologyDocument.getWtagBytes(); | |
118 fileUtil.saveFile(wtagBytes, wtagFilePathStr); | |
119 } catch (MalformedURLException e) { | |
120 throw new ApplicationException(e); | |
121 } catch (IOException e) { | |
122 throw new ApplicationException(e); | |
123 } | |
124 state = new Date(); | |
125 } | |
126 | |
127 public DonatusLemma getLemmaByVariantForm(String language, String variantForm) throws ApplicationException { | |
128 DonatusLemma lemma = null; | |
129 if (mode == QUERY_MODE) { | |
130 lemma = readVariantLemma(null, language, variantForm); | |
131 } else { | |
132 if (donatusMorphologyDocument != null) { | |
133 DonatusVariant v = donatusMorphologyDocument.getVariant(variantForm); | |
134 if (v != null) { | |
135 DonatusLemma l = v.getLemma(); | |
136 lemma = donatusMorphologyDocument.getLemma(l.getForm()); | |
137 } | |
138 } | |
139 } | |
140 return lemma; | |
141 } | |
142 | |
143 public DonatusLemma getLemmaByLemmaForm(String language, String lemmaForm) throws ApplicationException { | |
144 DonatusLemma lemma = null; | |
145 if (mode == QUERY_MODE) { | |
146 lemma = readLemma(null, language, lemmaForm); | |
147 } else { | |
148 if (donatusMorphologyDocument != null) { | |
149 lemma = donatusMorphologyDocument.getLemma(lemmaForm); | |
150 } | |
151 } | |
152 return lemma; | |
153 } | |
154 | |
155 public ArrayList<DonatusVariant> getQueryVariants(String language, String luceneQueryString) throws ApplicationException { | |
156 ArrayList<DonatusVariant> result = new ArrayList<DonatusVariant>(); | |
157 ArrayList<String> variantsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); | |
158 if (! (variantsFromQuery == null || variantsFromQuery.isEmpty())) { | |
159 for (int i=0; i<variantsFromQuery.size(); i++) { | |
160 String variantStr = variantsFromQuery.get(i); | |
161 DonatusLemma lemma = getLemmaByVariantForm(language, variantStr); | |
162 if (lemma != null) { | |
163 ArrayList<DonatusVariant> lemmaVariants = lemma.getVariants(); | |
164 result.addAll(lemmaVariants); | |
165 } | |
166 } | |
167 } | |
168 return result; | |
169 } | |
170 | |
171 private void writeLemmas(DonatusMorphologyDocument donatusMorphologyDocument) throws ApplicationException { | |
172 Transaction txn = null; // without txn | |
173 // Transaction txn = berkeleyDBEnv.getEnv().beginTransaction(null, null); | |
174 // delivers all variants of all lemmas - so for example more than one variant with the same form name but in different lemmas | |
175 ArrayList<DonatusVariant> variants = donatusMorphologyDocument.getVariants(); | |
176 for (int i=0; i<variants.size(); i++) { | |
177 DonatusVariant newVariant = variants.get(i); | |
178 String newVariantForm = newVariant.getForm(); | |
179 String language = newVariant.getLemma().getLanguage(); | |
180 if (newVariantForm != null && language != null && ! newVariantForm.equals("") && ! language.equals("")) { | |
181 DonatusLemma newVariantLemma = newVariant.getLemma(); | |
182 // look if this variant is already contained in variantDB and if so if the lemma there is the same as the new variant lemma | |
183 DonatusLemma dbVariantLemma = readVariantLemma(txn, language, newVariantForm); | |
184 if (dbVariantLemma != null) { | |
185 if (dbVariantLemma.getForm().equals(newVariantLemma.getForm())) { | |
186 // the variants of newVariantLemma are added to the existing variantLemma and this lemma is saved | |
187 ArrayList<DonatusVariant> newVariantLemmaVariants = newVariantLemma.getVariants(); | |
188 for (int j=0; j<newVariantLemmaVariants.size(); j++) { | |
189 DonatusVariant v = newVariantLemmaVariants.get(j); | |
190 dbVariantLemma.addVariant(v); | |
191 } | |
192 writeLemmaByVariantKey(txn, newVariant, dbVariantLemma); | |
193 } else { | |
194 // the two lemmas of the new and existing variant are not the same: nothing should be saved | |
195 } | |
196 } else { | |
197 writeLemmaByVariantKey(txn, newVariant, newVariantLemma); | |
198 } | |
199 } | |
200 } | |
201 // Only filled, not tested and used yet, for future | |
202 ArrayList<DonatusLemma> lemmas = donatusMorphologyDocument.getLemmas(); | |
203 for (int i=0; i<lemmas.size(); i++) { | |
204 DonatusLemma lemma = lemmas.get(i); | |
205 String lemmaForm = lemma.getForm(); | |
206 String language = lemma.getLanguage(); | |
207 if (lemmaForm != null && language != null && ! lemmaForm.equals("") && ! language.equals("")) { | |
208 writeLemmaByLemmaKey(txn, lemma); | |
209 } | |
210 } | |
211 state = new Date(); | |
212 } | |
213 | |
214 // TODO method is only simple: proof all Lucene cases | |
215 private ArrayList<String> getVariantsFromLuceneQuery(String queryString) { | |
216 ArrayList<String> variants = new ArrayList<String>(); | |
217 String[] variantTokens = queryString.split(" "); // TODO throw the phrases away (e.g.: "bla bla bla") | |
218 for (int i = 0; i < variantTokens.length; i++) { | |
219 String token = variantTokens[i]; | |
220 if (! (token.contains("*") || token.contains("?") || token.contains("~") || token.contains("-") || token.contains("+") || token.contains("^") || token.contains("OR") || token.contains("AND") || token.contains("NOT"))) { | |
221 variants.add(token); | |
222 } | |
223 } | |
224 return variants; | |
225 } | |
226 | |
227 private void writeLemmaByVariantKey(Transaction txn, DonatusVariant variantKey, DonatusLemma lemma) throws ApplicationException { | |
228 try { | |
229 String variantKeyStr = variantKey.getLemma().getLanguage() + "###" + variantKey.getForm(); | |
230 DatabaseEntry dbEntryKey = new DatabaseEntry(variantKeyStr.getBytes("UTF-8")); | |
231 String lemmaXmlValue = lemma.getXmlString(); | |
232 DatabaseEntry dbEntryValue = new DatabaseEntry(lemmaXmlValue.getBytes("UTF-8")); | |
233 Database variantDB = berkeleyDBEnv.getVariantDB(); | |
234 variantDB.put(txn, dbEntryKey, dbEntryValue); | |
235 } catch (DatabaseException e) { | |
236 throw new ApplicationException(e); | |
237 } catch (UnsupportedEncodingException e) { | |
238 throw new ApplicationException(e); | |
239 } | |
240 } | |
241 | |
242 private void writeLemmaByLemmaKey(Transaction txn, DonatusLemma lemma) throws ApplicationException { | |
243 try { | |
244 String lemmaKeyStr = lemma.getLanguage() + "###" + lemma.getForm(); | |
245 DatabaseEntry dbEntryKey = new DatabaseEntry(lemmaKeyStr.getBytes("UTF-8")); | |
246 String lemmaXmlValue = lemma.getXmlString(); | |
247 DatabaseEntry dbEntryValue = new DatabaseEntry(lemmaXmlValue.getBytes("UTF-8")); | |
248 Database lemmaDB = berkeleyDBEnv.getLemmaDB(); | |
249 lemmaDB.put(txn, dbEntryKey, dbEntryValue); | |
250 } catch (DatabaseException e) { | |
251 throw new ApplicationException(e); | |
252 } catch (UnsupportedEncodingException e) { | |
253 throw new ApplicationException(e); | |
254 } | |
255 } | |
256 | |
257 private DonatusLemma readVariantLemma(Transaction txn, String language, String variantForm) throws ApplicationException { | |
258 DonatusLemma lemma = null; | |
259 String hashKey = language + "###" + variantForm; | |
260 try { | |
261 Database variantDB = berkeleyDBEnv.getVariantDB(); | |
262 Cursor cursor = variantDB.openCursor(txn, null); | |
263 byte[] bHashKey = hashKey.getBytes("UTF-8"); | |
264 DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); | |
265 DatabaseEntry foundXmlLemmaValue = new DatabaseEntry(); | |
266 OperationStatus operationStatus = variantDB.get(null, dbEntryKey, foundXmlLemmaValue, LockMode.DEFAULT); | |
267 if (operationStatus == OperationStatus.SUCCESS) { | |
268 byte[] foundXmlLemmaValueBytes = foundXmlLemmaValue.getData(); | |
269 String foundXmlLemmaStr = new String(foundXmlLemmaValueBytes, "UTF-8"); | |
270 lemma = parseXmlLemmaString(language, foundXmlLemmaStr); | |
271 } | |
272 cursor.close(); | |
273 } catch (DatabaseException e) { | |
274 throw new ApplicationException(e); | |
275 } catch (UnsupportedEncodingException e) { | |
276 throw new ApplicationException(e); | |
277 } | |
278 return lemma; | |
279 } | |
280 | |
281 private DonatusLemma readLemma(Transaction txn, String language, String lemmaForm) throws ApplicationException { | |
282 DonatusLemma lemma = null; | |
283 String hashKey = language + "###" + lemmaForm; | |
284 try { | |
285 Database lemmaDB = berkeleyDBEnv.getLemmaDB(); | |
286 Cursor cursor = lemmaDB.openCursor(txn, null); | |
287 byte[] bHashKey = hashKey.getBytes("UTF-8"); | |
288 DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey); | |
289 DatabaseEntry foundXmlLemmaValue = new DatabaseEntry(); | |
290 OperationStatus operationStatus = lemmaDB.get(null, dbEntryKey, foundXmlLemmaValue, LockMode.DEFAULT); | |
291 if (operationStatus == OperationStatus.SUCCESS) { | |
292 byte[] foundXmlLemmaValueBytes = foundXmlLemmaValue.getData(); | |
293 String foundXmlLemmaStr = new String(foundXmlLemmaValueBytes, "UTF-8"); | |
294 lemma = parseXmlLemmaString(language, foundXmlLemmaStr); | |
295 } | |
296 cursor.close(); | |
297 } catch (DatabaseException e) { | |
298 throw new ApplicationException(e); | |
299 } catch (UnsupportedEncodingException e) { | |
300 throw new ApplicationException(e); | |
301 } | |
302 return lemma; | |
303 } | |
304 | |
305 private DonatusLemma parseXmlLemmaString(String language, String xmlLemmaString) throws ApplicationException { | |
306 DonatusLemma lemma = null; | |
307 DonatusMorphologyDocument morphologyDoc = parseDonatusMorphDoc(language, xmlLemmaString); | |
308 ArrayList<DonatusLemma> lemmas = morphologyDoc.getLemmas(); | |
309 if (lemmas.size() > 0) | |
310 lemma = lemmas.get(0); | |
311 return lemma; | |
312 } | |
313 | |
314 private DonatusMorphologyDocument parseDonatusMorphDoc(String language, String xmlString) throws ApplicationException { | |
315 DonatusMorphologyDocument morphologyDoc = null; | |
316 try { | |
317 XMLReader xmlParser = new SAXParser(); | |
318 DonatusMorphologyDocumentContentHandler donatusMorphContentHandler = new DonatusMorphologyDocumentContentHandler("tempDummyUri", language); | |
319 xmlParser.setContentHandler(donatusMorphContentHandler); | |
320 String morphDocDefXml = getDonatusMorphDocDefXml(); | |
321 String morphDocMorphStartXml = "<morphology xmlns=\"http://archimedes.fas.harvard.edu/ns/morphology/3\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n"; | |
322 String morphDocMorphEndXml = "</morphology>"; | |
323 String morphDocXml = morphDocDefXml + morphDocMorphStartXml + xmlString + morphDocMorphEndXml; | |
324 Reader reader = new StringReader(morphDocXml); | |
325 InputSource input = new InputSource(reader); | |
326 xmlParser.parse(input); | |
327 morphologyDoc = donatusMorphContentHandler.getResult(); | |
328 } catch (SAXException e) { | |
329 throw new ApplicationException(e); | |
330 } catch (IOException e) { | |
331 throw new ApplicationException(e); | |
332 } | |
333 return morphologyDoc; | |
334 } | |
335 | |
336 private static String getDonatusMorphDocDefXml() { | |
337 String defXml = | |
338 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + | |
339 "<!DOCTYPE morphology [\n" + | |
340 "<!ELEMENT morphology (lemma*, context-form*)>\n" + | |
341 "<!ELEMENT lemma (definition?, variant*)>\n" + | |
342 "<!ELEMENT context-form (tokens, analysis)>\n" + | |
343 "<!ELEMENT definition (#PCDATA)>\n" + | |
344 "<!ELEMENT variant (analysis)*>\n" + | |
345 "<!ELEMENT analysis EMPTY>\n" + | |
346 "<!ELEMENT tokens (token+)>\n" + | |
347 "<!ELEMENT token EMPTY>\n" + | |
348 "<!ATTLIST morphology\n" + | |
349 " xmlns CDATA #FIXED \"http://archimedes.fas.harvard.edu/ns/morphology/3\"\n" + | |
350 " xmlns:xlink CDATA #FIXED \"http://www.w3.org/1999/xlink\">\n" + | |
351 "<!ATTLIST lemma\n" + | |
352 " form CDATA #REQUIRED\n" + | |
353 " lang CDATA #REQUIRED>\n" + | |
354 "<!ATTLIST definition\n" + | |
355 " lang CDATA #IMPLIED>\n" + | |
356 "<!ATTLIST variant\n" + | |
357 " form CDATA #REQUIRED\n" + | |
358 " modified (y|n) #IMPLIED>\n" + | |
359 "<!ATTLIST analysis\n" + | |
360 " desc CDATA #IMPLIED\n" + | |
361 " xlink:href CDATA #IMPLIED\n" + | |
362 " xlink:type (simple) #FIXED \"simple\"\n" + | |
363 " form CDATA #IMPLIED\n" + | |
364 " id ID #IMPLIED>\n" + | |
365 "<!ATTLIST context-form\n" + | |
366 " lang CDATA #REQUIRED\n" + | |
367 " xlink:href CDATA #REQUIRED\n" + | |
368 " xlink:type (simple) #FIXED \"simple\">\n" + | |
369 "<!ATTLIST token\n" + | |
370 " form CDATA #REQUIRED\n" + | |
371 " count CDATA #REQUIRED>\n" + | |
372 "]>\n"; | |
373 return defXml; | |
374 } | |
375 } |