Mercurial > hg > mpdl-group
annotate software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | 7d6d969b10cf |
children |
rev | line source |
---|---|
19 | 1 package de.mpg.mpiwg.berlin.mpdl.lt.dict.db; |
2 | |
3 import java.io.UnsupportedEncodingException; | |
4 import java.util.ArrayList; | |
5 import java.util.Collections; | |
6 import java.util.Date; | |
7 import java.util.logging.Logger; | |
8 | |
9 import com.sleepycat.je.Cursor; | |
10 import com.sleepycat.je.Database; | |
11 import com.sleepycat.je.DatabaseEntry; | |
12 import com.sleepycat.je.DatabaseException; | |
13 import com.sleepycat.je.LockMode; | |
14 import com.sleepycat.je.OperationStatus; | |
15 | |
16 import de.mpg.mpiwg.berlin.mpdl.util.Util; | |
17 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
18 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexica; | |
19 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon; | |
20 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry; | |
21 import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; | |
22 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; | |
23 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; | |
24 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache; | |
25 import de.mpg.mpiwg.berlin.mpdl.lt.text.transcode.Transcoder; | |
26 | |
27 public class LexHandler { | |
28 private static LexHandler instance; | |
29 private static Logger LOGGER = Logger.getLogger(LexHandler.class.getName()); | |
30 private static String DATA_DIR = Constants.getInstance().getDataDir(); | |
31 private static String DB_DIR_LEXICA = DATA_DIR + "/dataBerkeleyDB/pollux"; | |
32 private DbEnvLex dbEnvLexica; | |
33 private Date beginOfOperation; | |
34 private Date endOfOperation; | |
35 | |
36 public static LexHandler getInstance() throws ApplicationException { | |
37 if (instance == null) { | |
38 instance = new LexHandler(); | |
39 instance.initReadOnly(); | |
40 } | |
41 return instance; | |
42 } | |
43 | |
44 public void end() throws ApplicationException { | |
45 ArrayList<Lexicon> lexicons = Lexica.getInstance().getLocalLexicons(); | |
46 for (int i=0; i<lexicons.size(); i++) { | |
47 Lexicon lexicon = lexicons.get(i); | |
48 String lexiconName = lexicon.getName(); | |
49 dbEnvLexica.closeDatabase(lexiconName); | |
50 } | |
51 dbEnvLexica.close(); | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
52 LOGGER.info("Lexicon cache: db closed"); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
53 dbEnvLexica = null; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
54 instance = null; |
19 | 55 } |
56 | |
57 /** | |
58 * @param query | |
59 * @param type | |
60 * @param language | |
61 * @param normalization | |
62 * @return lemmas | |
63 * @throws ApplicationException | |
64 */ | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
65 public ArrayList<Lemma> getLemmas(String query, String type, String language, int normMode, boolean atLeastOneLemmaWithWordForm) throws ApplicationException { |
19 | 66 ArrayList<Lemma> lexLemmas = new ArrayList<Lemma>(); |
67 // get lemmas of all forms in query | |
68 MorphologyCache morphologyCache = MorphologyCache.getInstance(); | |
69 String[] queryForms = query.split(" "); | |
70 for (int k=0; k<queryForms.length; k++) { | |
71 String queryForm = queryForms[k]; | |
72 ArrayList<Lemma> lemmas = null; | |
73 if (type.equals("form")) { | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
74 lemmas = morphologyCache.getLemmasByFormName(language, queryForm, normMode); |
19 | 75 } else if (type.equals("lemma")) { |
76 lemmas = new ArrayList<Lemma>(); | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
77 Lemma l = morphologyCache.getLemma(language, queryForm, normMode); |
19 | 78 if (l != null) |
79 lemmas.add(l); | |
80 } | |
81 if (lemmas != null && ! lemmas.isEmpty()) { | |
82 lexLemmas.addAll(lemmas); | |
83 } else { | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
84 if (atLeastOneLemmaWithWordForm) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
85 Lemma l = new Lemma("created dynamically cause no lemma is available", language, queryForm); // at least the word form is added for finding it in the lexicon |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
86 lexLemmas.add(l); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
87 } |
19 | 88 } |
89 } | |
90 Collections.sort(lexLemmas); | |
91 if (lexLemmas.isEmpty()) | |
92 return null; | |
93 else | |
94 return lexLemmas; | |
95 } | |
96 | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
97 public ArrayList<Lexicon> getLexEntries(ArrayList<Lemma> lexLemmas, String language, String lexiconName, String query) throws ApplicationException { |
19 | 98 ArrayList<Lexicon> retLexicons = new ArrayList<Lexicon>(); |
99 ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons(language); | |
100 if (lexiconName != null) { | |
101 lexicons = new ArrayList<Lexicon>(); | |
102 Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName); | |
103 if (lexicon != null) | |
104 lexicons.add(lexicon); | |
105 } | |
106 if (lexicons != null) { | |
107 for (int i=0; i<lexicons.size(); i++) { | |
108 Lexicon lexicon = lexicons.get(i).clone(); // clone without lexicon entries | |
109 for (int j=0; j<lexLemmas.size(); j++) { | |
110 String lemmaName = lexLemmas.get(j).getLemmaName(); | |
111 LexiconEntry lexEntry = getEntry(lexicon, lemmaName); | |
112 if (lexEntry != null) { | |
113 lexicon.addEntry(lexEntry); // add entries to the cloned lexicon | |
114 } | |
115 } | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
116 if (Language.getInstance().isGerman(language) && query != null) { |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
117 String[] lexFormNames = query.split(" "); |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
118 for (int j=0; j<lexFormNames.length; j++) { |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
119 String lexFormName = lexFormNames[j]; |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
120 LexiconEntry lexEntry = lexicon.getEntry(lexFormName); |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
121 if (lexEntry == null) { |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
122 LexiconEntry newLexEntry = new LexiconEntry(lexiconName, lexFormName, null); |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
123 String lexiconQueryUrl = lexicon.getQueryUrl(); |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
124 String remoteUrl = lexiconQueryUrl + lexFormName; |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
125 newLexEntry.setRemoteUrl(remoteUrl); |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
126 lexicon.addEntry(newLexEntry); |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
127 } |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
128 } |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
129 } |
19 | 130 if (! lexicon.isEmpty()) |
131 retLexicons.add(lexicon); | |
132 } | |
133 } | |
134 Collections.sort(retLexicons); | |
135 return retLexicons; | |
136 } | |
137 | |
138 /** | |
139 * | |
140 * @param formName | |
141 * @param language | |
142 * @return delivers lexical entries by the help of the morphology component (lexical entry of the stem of the normalized word form) | |
143 * @throws ApplicationException | |
144 */ | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
145 public ArrayList<String> getLexEntryKeys(String formName, String language, int normMode) throws ApplicationException { |
19 | 146 ArrayList<String> lexEntryKeys = new ArrayList<String>(); |
147 MorphologyCache morphologyCache = MorphologyCache.getInstance(); | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
148 ArrayList<Lemma> formLemmas = morphologyCache.getLemmasByFormName(language, formName, normMode); |
19 | 149 boolean hasLexEntry = false; |
150 hasLexEntry = hasLexEntryKey(formName, language); | |
151 if (hasLexEntry) | |
152 lexEntryKeys.add(formName); | |
153 if (formLemmas != null) { | |
154 for (int j=0; j<formLemmas.size(); j++) { | |
155 Lemma l = formLemmas.get(j); | |
156 String lName = l.getLemmaName(); | |
157 if (! hasLexEntry) { | |
158 hasLexEntry = hasLexEntryKey(lName, language); | |
159 } | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
160 if (language.equals("de") || language.equals("fr") || language.equals("nl")) // TODO Lexika für diese Sprachen in BerkeleyDB einbringen (für nl auch eine bessere Morph.) |
19 | 161 lexEntryKeys.add(lName); |
162 if (! lName.equals(formName) && hasLexEntry) { | |
163 lexEntryKeys.add(lName); | |
164 } | |
165 } | |
166 } | |
167 if(lexEntryKeys.isEmpty()) | |
168 return null; | |
169 else | |
170 return lexEntryKeys; | |
171 } | |
172 | |
173 public boolean hasLexEntryKey(String formName, String language) throws ApplicationException { | |
174 boolean hasLexEntry = false; | |
175 if (language.equals("zh")) // each chinese character always has a lexicon entry | |
176 return true; | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
177 String[] lexiconNames = Lexica.getInstance().getLocalLexiconNames(language); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
178 if (lexiconNames != null) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
179 for (int i=0; i<lexiconNames.length; i++) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
180 String lexiconName = lexiconNames[i]; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
181 hasLexEntry = hasKey(lexiconName, formName); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
182 if (hasLexEntry) { |
19 | 183 return true; |
184 } | |
185 } | |
186 } | |
187 return hasLexEntry; | |
188 } | |
189 | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
190 public ArrayList<Lexicon> getLexEntriesBeginningWith(String language, String formPrefix, int pageNumber, int pageSize) throws ApplicationException { |
19 | 191 int from = (pageNumber * pageSize) - pageSize + 1; |
192 int to = pageNumber * pageSize; | |
193 ArrayList<Lexicon> statLexicons = Lexica.getInstance().getLocalLexicons(language); | |
194 ArrayList<Lexicon> retLexicons = null; | |
195 if (statLexicons != null) { | |
196 for (int i=0; i<statLexicons.size(); i++) { | |
197 Lexicon lexicon = statLexicons.get(i).clone(); // clone without lexicon entries | |
198 String lexiconName = lexicon.getName(); | |
199 ArrayList<LexiconEntry> lexEntries = readEntriesBeginningWith(lexiconName, formPrefix, from, to); | |
200 // TODO merge the entries and remove duplicates | |
201 if (lexEntries != null) { | |
202 lexicon.addEntries(lexEntries); | |
203 if (retLexicons == null) | |
204 retLexicons = new ArrayList<Lexicon>(); | |
205 retLexicons.add(lexicon); | |
206 } | |
207 } | |
208 } | |
209 return retLexicons; | |
210 } | |
211 | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
212 public ArrayList<Lexicon> getLexEntriesByLexiconBeginningWith(String lexiconName, String formPrefix, int pageNumber, int pageSize) throws ApplicationException { |
19 | 213 int from = (pageNumber * pageSize) - pageSize + 1; |
214 int to = pageNumber * pageSize; | |
215 Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName).clone(); | |
216 ArrayList<Lexicon> retLexicons = null; | |
217 if (lexicon != null) { | |
218 ArrayList<LexiconEntry> lexEntries = readEntriesBeginningWith(lexiconName, formPrefix, from, to); | |
219 if (lexEntries != null) { | |
220 lexicon.addEntries(lexEntries); | |
221 retLexicons = new ArrayList<Lexicon>(); | |
222 retLexicons.add(lexicon); | |
223 } | |
224 } | |
225 return retLexicons; | |
226 } | |
227 | |
228 private LexiconEntry getEntry(Lexicon lexicon, String formName) throws ApplicationException { | |
229 LexiconEntry lexEntry = null; | |
230 if (lexicon.isLocalLexicon()) { | |
231 lexEntry = readEntry(lexicon.getName(), formName); | |
232 String lexiconQueryUrl = lexicon.getQueryUrl(); | |
233 if (lexEntry != null && lexicon.getQueryUrl() != null) { | |
234 String language = lexicon.getSourceLanguage(); | |
235 if (Language.getInstance().isGreek(language)) { | |
236 formName = Transcoder.getInstance().transcodeFromUnicode2BetaCode(formName); | |
237 } else if (Language.getInstance().isArabic(language)) { | |
238 formName = Transcoder.getInstance().transcodeFromUnicode2Buckwalter(formName); | |
239 } | |
240 lexEntry.setRemoteUrl(lexiconQueryUrl + formName); | |
241 } | |
242 } else { | |
243 lexEntry = lexicon.getDynamicEntry(formName); | |
244 } | |
245 return lexEntry; | |
246 } | |
247 | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
248 private boolean hasKey(String lexiconName, String formName) throws ApplicationException { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
249 boolean hasKey = false; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
250 try { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
251 String keyStr = formName; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
252 DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
253 Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
254 Cursor cursor = lexDB.openCursor(null, null); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
255 DatabaseEntry foundValue = new DatabaseEntry(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
256 foundValue.setPartial(0, 0, true); // more performance: the value is not fetched: only the key is fetched |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
257 OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
258 if (operationStatus == OperationStatus.SUCCESS) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
259 hasKey = true; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
260 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
261 cursor.close(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
262 } catch (DatabaseException e) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
263 throw new ApplicationException(e); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
264 } catch (UnsupportedEncodingException e) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
265 throw new ApplicationException(e); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
266 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
267 return hasKey; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
268 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
269 |
19 | 270 private LexiconEntry readEntry(String lexiconName, String formName) throws ApplicationException { |
271 LexiconEntry retLexEntry = null; | |
272 try { | |
273 String dbFoundValueStr = null; | |
274 String keyStr = formName; | |
275 DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); | |
276 Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); | |
277 Cursor cursor = lexDB.openCursor(null, null); | |
278 DatabaseEntry foundValue = new DatabaseEntry(); | |
279 OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); | |
280 if (operationStatus == OperationStatus.SUCCESS) { | |
281 byte[] foundValueBytes = foundValue.getData(); | |
282 dbFoundValueStr = new String(foundValueBytes, "utf-8"); | |
283 } | |
284 cursor.close(); | |
285 if (dbFoundValueStr != null) { | |
286 retLexEntry = new LexiconEntry(lexiconName, formName, dbFoundValueStr); | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
287 retLexEntry = correct(retLexEntry); // correct errors: e.g. in lsj some html entities are not correct |
19 | 288 } |
289 } catch (DatabaseException e) { | |
290 throw new ApplicationException(e); | |
291 } catch (UnsupportedEncodingException e) { | |
292 throw new ApplicationException(e); | |
293 } | |
294 return retLexEntry; | |
295 } | |
296 | |
297 private ArrayList<LexiconEntry> readEntriesBeginningWith(String lexiconName, String formPrefix, int from, int to) throws ApplicationException { | |
298 ArrayList<LexiconEntry> retLexEntries = new ArrayList<LexiconEntry>();; | |
299 try { | |
300 String dbFoundValueStr = null; | |
301 String keyStr = formPrefix; | |
302 DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); | |
303 Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); | |
304 Cursor cursor = lexDB.openCursor(null, null); | |
305 DatabaseEntry foundValue = new DatabaseEntry(); | |
306 OperationStatus operationStatus = cursor.getSearchKeyRange(dbEntryKey, foundValue, LockMode.DEFAULT); | |
307 int counter = 1; | |
308 while (operationStatus == OperationStatus.SUCCESS && counter <= to) { | |
309 if (counter >= from) { | |
310 byte[] foundValueBytes = foundValue.getData(); | |
311 dbFoundValueStr = new String(foundValueBytes, "utf-8"); | |
312 byte[] foundKeyBytes = dbEntryKey.getData(); | |
313 String dbFoundKeyStr = new String(foundKeyBytes, "utf-8"); | |
314 LexiconEntry lexEntry = new LexiconEntry(lexiconName, dbFoundKeyStr, dbFoundValueStr); | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
315 lexEntry = correct(lexEntry); // correct errors: e.g. in lsj some html entities are not correct |
19 | 316 retLexEntries.add(lexEntry); |
317 } | |
318 operationStatus = cursor.getNext(dbEntryKey, foundValue, LockMode.DEFAULT); | |
319 counter++; | |
320 } | |
321 cursor.close(); | |
322 if (retLexEntries.isEmpty()) { | |
323 return null; | |
324 } | |
325 } catch (DatabaseException e) { | |
326 throw new ApplicationException(e); | |
327 } catch (UnsupportedEncodingException e) { | |
328 throw new ApplicationException(e); | |
329 } | |
330 return retLexEntries; | |
331 } | |
332 | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
333 private LexiconEntry correct(LexiconEntry lexEntry) { |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
334 String lexiconName = lexEntry.getLexiconName(); |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
335 String content = lexEntry.getContent(); |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
336 if (content != null && content.contains("&#") && lexiconName.equals("lsj")) { // errors in greek lexicon lsj |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
337 content = content.replaceAll("&#\u03C7", "&#x"); // html entity: replace greek Minuskel Chi by "x" |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
338 content = content.replaceAll("&#x[^0-9]{4};", ""); // html entity: remove entity if special greek not hex characters appear |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
339 lexEntry.setContent(content); |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
340 } |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
341 return lexEntry; |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
342 } |
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
343 |
19 | 344 public static void main(String[] args) throws ApplicationException { |
345 getInstance(); | |
346 instance.beginOperation(); | |
347 System.out.print("Start ..."); | |
348 instance.readSampleData(); | |
349 instance.end(); | |
350 instance.endOperation(); | |
351 Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); | |
352 System.out.println("End."); | |
353 System.out.println("Needed time: " + elapsedTime + " seconds"); | |
354 } | |
355 | |
356 private void initReadOnly() throws ApplicationException { | |
357 dbEnvLexica = new DbEnvLex(); | |
358 dbEnvLexica.setDataDir(DB_DIR_LEXICA); | |
359 dbEnvLexica.initReadOnly(); | |
360 ArrayList<Lexicon> lexicons = Lexica.getInstance().getLocalLexicons(); | |
361 for (int i=0; i<lexicons.size(); i++) { | |
362 Lexicon lexicon = lexicons.get(i); | |
363 String lexiconName = lexicon.getName(); | |
364 dbEnvLexica.openDatabase(lexiconName); | |
365 } | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
366 LOGGER.info("Lexicon cache: db opened read only"); |
19 | 367 } |
368 | |
369 private void readSampleData() throws ApplicationException { | |
370 // List<String> dbNames = dbEnvLexica.getEnv().getDatabaseNames(); | |
371 String l1 = readEntry("autenrieth", "au)to/s").getContent(); // greek: see also bonitz and lsj | |
372 String l2 = readEntry("ls", "laudabilis").getContent(); // latin | |
373 System.out.println("Autenrieth: autos: " + l1); | |
374 System.out.println("Lewis & Short: Laudabilis: " + l2); | |
375 } | |
376 | |
377 private void beginOperation() { | |
378 beginOfOperation = new Date(); | |
379 } | |
380 | |
381 private void endOperation() { | |
382 endOfOperation = new Date(); | |
383 } | |
384 | |
385 } |