annotate software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents 7d6d969b10cf
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
1 package de.mpg.mpiwg.berlin.mpdl.lt.dict.db;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
2
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
3 import java.io.UnsupportedEncodingException;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
4 import java.util.ArrayList;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
5 import java.util.Collections;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
6 import java.util.Date;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
7 import java.util.logging.Logger;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
8
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
9 import com.sleepycat.je.Cursor;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
10 import com.sleepycat.je.Database;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
11 import com.sleepycat.je.DatabaseEntry;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
12 import com.sleepycat.je.DatabaseException;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
13 import com.sleepycat.je.LockMode;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
14 import com.sleepycat.je.OperationStatus;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
15
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
16 import de.mpg.mpiwg.berlin.mpdl.util.Util;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
17 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
18 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexica;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
19 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
20 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
21 import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
22 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
23 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
24 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
25 import de.mpg.mpiwg.berlin.mpdl.lt.text.transcode.Transcoder;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
26
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
27 public class LexHandler {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
28 private static LexHandler instance;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
29 private static Logger LOGGER = Logger.getLogger(LexHandler.class.getName());
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
30 private static String DATA_DIR = Constants.getInstance().getDataDir();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
31 private static String DB_DIR_LEXICA = DATA_DIR + "/dataBerkeleyDB/pollux";
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
32 private DbEnvLex dbEnvLexica;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
33 private Date beginOfOperation;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
34 private Date endOfOperation;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
35
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
36 public static LexHandler getInstance() throws ApplicationException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
37 if (instance == null) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
38 instance = new LexHandler();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
39 instance.initReadOnly();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
40 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
41 return instance;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
42 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
43
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
44 public void end() throws ApplicationException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
45 ArrayList<Lexicon> lexicons = Lexica.getInstance().getLocalLexicons();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
46 for (int i=0; i<lexicons.size(); i++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
47 Lexicon lexicon = lexicons.get(i);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
48 String lexiconName = lexicon.getName();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
49 dbEnvLexica.closeDatabase(lexiconName);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
50 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
51 dbEnvLexica.close();
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
52 LOGGER.info("Lexicon cache: db closed");
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
53 dbEnvLexica = null;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
54 instance = null;
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
55 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
56
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
57 /**
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
58 * @param query
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
59 * @param type
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
60 * @param language
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
61 * @param normalization
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
62 * @return lemmas
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
63 * @throws ApplicationException
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
64 */
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
65 public ArrayList<Lemma> getLemmas(String query, String type, String language, int normMode, boolean atLeastOneLemmaWithWordForm) throws ApplicationException {
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
66 ArrayList<Lemma> lexLemmas = new ArrayList<Lemma>();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
67 // get lemmas of all forms in query
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
68 MorphologyCache morphologyCache = MorphologyCache.getInstance();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
69 String[] queryForms = query.split(" ");
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
70 for (int k=0; k<queryForms.length; k++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
71 String queryForm = queryForms[k];
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
72 ArrayList<Lemma> lemmas = null;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
73 if (type.equals("form")) {
20
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
74 lemmas = morphologyCache.getLemmasByFormName(language, queryForm, normMode);
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
75 } else if (type.equals("lemma")) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
76 lemmas = new ArrayList<Lemma>();
20
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
77 Lemma l = morphologyCache.getLemma(language, queryForm, normMode);
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
78 if (l != null)
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
79 lemmas.add(l);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
80 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
81 if (lemmas != null && ! lemmas.isEmpty()) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
82 lexLemmas.addAll(lemmas);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
83 } else {
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
84 if (atLeastOneLemmaWithWordForm) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
85 Lemma l = new Lemma("created dynamically cause no lemma is available", language, queryForm); // at least the word form is added for finding it in the lexicon
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
86 lexLemmas.add(l);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
87 }
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
88 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
89 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
90 Collections.sort(lexLemmas);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
91 if (lexLemmas.isEmpty())
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
92 return null;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
93 else
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
94 return lexLemmas;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
95 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
96
20
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
97 public ArrayList<Lexicon> getLexEntries(ArrayList<Lemma> lexLemmas, String language, String lexiconName, String query) throws ApplicationException {
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
98 ArrayList<Lexicon> retLexicons = new ArrayList<Lexicon>();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
99 ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons(language);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
100 if (lexiconName != null) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
101 lexicons = new ArrayList<Lexicon>();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
102 Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
103 if (lexicon != null)
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
104 lexicons.add(lexicon);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
105 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
106 if (lexicons != null) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
107 for (int i=0; i<lexicons.size(); i++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
108 Lexicon lexicon = lexicons.get(i).clone(); // clone without lexicon entries
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
109 for (int j=0; j<lexLemmas.size(); j++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
110 String lemmaName = lexLemmas.get(j).getLemmaName();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
111 LexiconEntry lexEntry = getEntry(lexicon, lemmaName);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
112 if (lexEntry != null) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
113 lexicon.addEntry(lexEntry); // add entries to the cloned lexicon
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
114 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
115 }
20
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
116 if (Language.getInstance().isGerman(language) && query != null) {
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
117 String[] lexFormNames = query.split(" ");
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
118 for (int j=0; j<lexFormNames.length; j++) {
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
119 String lexFormName = lexFormNames[j];
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
120 LexiconEntry lexEntry = lexicon.getEntry(lexFormName);
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
121 if (lexEntry == null) {
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
122 LexiconEntry newLexEntry = new LexiconEntry(lexiconName, lexFormName, null);
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
123 String lexiconQueryUrl = lexicon.getQueryUrl();
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
124 String remoteUrl = lexiconQueryUrl + lexFormName;
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
125 newLexEntry.setRemoteUrl(remoteUrl);
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
126 lexicon.addEntry(newLexEntry);
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
127 }
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
128 }
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
129 }
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
130 if (! lexicon.isEmpty())
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
131 retLexicons.add(lexicon);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
132 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
133 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
134 Collections.sort(retLexicons);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
135 return retLexicons;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
136 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
137
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
138 /**
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
139 *
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
140 * @param formName
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
141 * @param language
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
142 * @return delivers lexical entries by the help of the morphology component (lexical entry of the stem of the normalized word form)
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
143 * @throws ApplicationException
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
144 */
20
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
145 public ArrayList<String> getLexEntryKeys(String formName, String language, int normMode) throws ApplicationException {
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
146 ArrayList<String> lexEntryKeys = new ArrayList<String>();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
147 MorphologyCache morphologyCache = MorphologyCache.getInstance();
20
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
148 ArrayList<Lemma> formLemmas = morphologyCache.getLemmasByFormName(language, formName, normMode);
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
149 boolean hasLexEntry = false;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
150 hasLexEntry = hasLexEntryKey(formName, language);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
151 if (hasLexEntry)
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
152 lexEntryKeys.add(formName);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
153 if (formLemmas != null) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
154 for (int j=0; j<formLemmas.size(); j++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
155 Lemma l = formLemmas.get(j);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
156 String lName = l.getLemmaName();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
157 if (! hasLexEntry) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
158 hasLexEntry = hasLexEntryKey(lName, language);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
159 }
20
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
160 if (language.equals("de") || language.equals("fr") || language.equals("nl")) // TODO Lexika für diese Sprachen in BerkeleyDB einbringen (für nl auch eine bessere Morph.)
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
161 lexEntryKeys.add(lName);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
162 if (! lName.equals(formName) && hasLexEntry) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
163 lexEntryKeys.add(lName);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
164 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
165 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
166 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
167 if(lexEntryKeys.isEmpty())
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
168 return null;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
169 else
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
170 return lexEntryKeys;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
171 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
172
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
173 public boolean hasLexEntryKey(String formName, String language) throws ApplicationException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
174 boolean hasLexEntry = false;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
175 if (language.equals("zh")) // each chinese character always has a lexicon entry
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
176 return true;
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
177 String[] lexiconNames = Lexica.getInstance().getLocalLexiconNames(language);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
178 if (lexiconNames != null) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
179 for (int i=0; i<lexiconNames.length; i++) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
180 String lexiconName = lexiconNames[i];
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
181 hasLexEntry = hasKey(lexiconName, formName);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
182 if (hasLexEntry) {
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
183 return true;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
184 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
185 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
186 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
187 return hasLexEntry;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
188 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
189
20
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
190 public ArrayList<Lexicon> getLexEntriesBeginningWith(String language, String formPrefix, int pageNumber, int pageSize) throws ApplicationException {
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
191 int from = (pageNumber * pageSize) - pageSize + 1;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
192 int to = pageNumber * pageSize;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
193 ArrayList<Lexicon> statLexicons = Lexica.getInstance().getLocalLexicons(language);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
194 ArrayList<Lexicon> retLexicons = null;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
195 if (statLexicons != null) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
196 for (int i=0; i<statLexicons.size(); i++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
197 Lexicon lexicon = statLexicons.get(i).clone(); // clone without lexicon entries
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
198 String lexiconName = lexicon.getName();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
199 ArrayList<LexiconEntry> lexEntries = readEntriesBeginningWith(lexiconName, formPrefix, from, to);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
200 // TODO merge the entries and remove duplicates
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
201 if (lexEntries != null) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
202 lexicon.addEntries(lexEntries);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
203 if (retLexicons == null)
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
204 retLexicons = new ArrayList<Lexicon>();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
205 retLexicons.add(lexicon);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
206 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
207 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
208 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
209 return retLexicons;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
210 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
211
20
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
212 public ArrayList<Lexicon> getLexEntriesByLexiconBeginningWith(String lexiconName, String formPrefix, int pageNumber, int pageSize) throws ApplicationException {
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
213 int from = (pageNumber * pageSize) - pageSize + 1;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
214 int to = pageNumber * pageSize;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
215 Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName).clone();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
216 ArrayList<Lexicon> retLexicons = null;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
217 if (lexicon != null) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
218 ArrayList<LexiconEntry> lexEntries = readEntriesBeginningWith(lexiconName, formPrefix, from, to);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
219 if (lexEntries != null) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
220 lexicon.addEntries(lexEntries);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
221 retLexicons = new ArrayList<Lexicon>();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
222 retLexicons.add(lexicon);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
223 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
224 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
225 return retLexicons;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
226 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
227
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
228 private LexiconEntry getEntry(Lexicon lexicon, String formName) throws ApplicationException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
229 LexiconEntry lexEntry = null;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
230 if (lexicon.isLocalLexicon()) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
231 lexEntry = readEntry(lexicon.getName(), formName);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
232 String lexiconQueryUrl = lexicon.getQueryUrl();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
233 if (lexEntry != null && lexicon.getQueryUrl() != null) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
234 String language = lexicon.getSourceLanguage();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
235 if (Language.getInstance().isGreek(language)) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
236 formName = Transcoder.getInstance().transcodeFromUnicode2BetaCode(formName);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
237 } else if (Language.getInstance().isArabic(language)) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
238 formName = Transcoder.getInstance().transcodeFromUnicode2Buckwalter(formName);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
239 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
240 lexEntry.setRemoteUrl(lexiconQueryUrl + formName);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
241 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
242 } else {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
243 lexEntry = lexicon.getDynamicEntry(formName);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
244 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
245 return lexEntry;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
246 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
247
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
248 private boolean hasKey(String lexiconName, String formName) throws ApplicationException {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
249 boolean hasKey = false;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
250 try {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
251 String keyStr = formName;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
252 DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8"));
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
253 Database lexDB = dbEnvLexica.getLexiconDB(lexiconName);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
254 Cursor cursor = lexDB.openCursor(null, null);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
255 DatabaseEntry foundValue = new DatabaseEntry();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
256 foundValue.setPartial(0, 0, true); // more performance: the value is not fetched: only the key is fetched
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
257 OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
258 if (operationStatus == OperationStatus.SUCCESS) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
259 hasKey = true;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
260 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
261 cursor.close();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
262 } catch (DatabaseException e) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
263 throw new ApplicationException(e);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
264 } catch (UnsupportedEncodingException e) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
265 throw new ApplicationException(e);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
266 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
267 return hasKey;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
268 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
269
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
270 private LexiconEntry readEntry(String lexiconName, String formName) throws ApplicationException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
271 LexiconEntry retLexEntry = null;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
272 try {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
273 String dbFoundValueStr = null;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
274 String keyStr = formName;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
275 DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8"));
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
276 Database lexDB = dbEnvLexica.getLexiconDB(lexiconName);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
277 Cursor cursor = lexDB.openCursor(null, null);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
278 DatabaseEntry foundValue = new DatabaseEntry();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
279 OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
280 if (operationStatus == OperationStatus.SUCCESS) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
281 byte[] foundValueBytes = foundValue.getData();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
282 dbFoundValueStr = new String(foundValueBytes, "utf-8");
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
283 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
284 cursor.close();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
285 if (dbFoundValueStr != null) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
286 retLexEntry = new LexiconEntry(lexiconName, formName, dbFoundValueStr);
20
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
287 retLexEntry = correct(retLexEntry); // correct errors: e.g. in lsj some html entities are not correct
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
288 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
289 } catch (DatabaseException e) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
290 throw new ApplicationException(e);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
291 } catch (UnsupportedEncodingException e) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
292 throw new ApplicationException(e);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
293 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
294 return retLexEntry;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
295 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
296
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
297 private ArrayList<LexiconEntry> readEntriesBeginningWith(String lexiconName, String formPrefix, int from, int to) throws ApplicationException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
298 ArrayList<LexiconEntry> retLexEntries = new ArrayList<LexiconEntry>();;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
299 try {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
300 String dbFoundValueStr = null;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
301 String keyStr = formPrefix;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
302 DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8"));
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
303 Database lexDB = dbEnvLexica.getLexiconDB(lexiconName);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
304 Cursor cursor = lexDB.openCursor(null, null);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
305 DatabaseEntry foundValue = new DatabaseEntry();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
306 OperationStatus operationStatus = cursor.getSearchKeyRange(dbEntryKey, foundValue, LockMode.DEFAULT);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
307 int counter = 1;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
308 while (operationStatus == OperationStatus.SUCCESS && counter <= to) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
309 if (counter >= from) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
310 byte[] foundValueBytes = foundValue.getData();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
311 dbFoundValueStr = new String(foundValueBytes, "utf-8");
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
312 byte[] foundKeyBytes = dbEntryKey.getData();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
313 String dbFoundKeyStr = new String(foundKeyBytes, "utf-8");
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
314 LexiconEntry lexEntry = new LexiconEntry(lexiconName, dbFoundKeyStr, dbFoundValueStr);
20
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
315 lexEntry = correct(lexEntry); // correct errors: e.g. in lsj some html entities are not correct
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
316 retLexEntries.add(lexEntry);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
317 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
318 operationStatus = cursor.getNext(dbEntryKey, foundValue, LockMode.DEFAULT);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
319 counter++;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
320 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
321 cursor.close();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
322 if (retLexEntries.isEmpty()) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
323 return null;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
324 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
325 } catch (DatabaseException e) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
326 throw new ApplicationException(e);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
327 } catch (UnsupportedEncodingException e) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
328 throw new ApplicationException(e);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
329 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
330 return retLexEntries;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
331 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
332
20
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
333 private LexiconEntry correct(LexiconEntry lexEntry) {
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
334 String lexiconName = lexEntry.getLexiconName();
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
335 String content = lexEntry.getContent();
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
336 if (content != null && content.contains("&#") && lexiconName.equals("lsj")) { // errors in greek lexicon lsj
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
337 content = content.replaceAll("&#\u03C7", "&#x"); // html entity: replace greek Minuskel Chi by "x"
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
338 content = content.replaceAll("&#x[^0-9]{4};", ""); // html entity: remove entity if special greek not hex characters appear
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
339 lexEntry.setContent(content);
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
340 }
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
341 return lexEntry;
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
342 }
7d6d969b10cf little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
343
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
344 public static void main(String[] args) throws ApplicationException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
345 getInstance();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
346 instance.beginOperation();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
347 System.out.print("Start ...");
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
348 instance.readSampleData();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
349 instance.end();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
350 instance.endOperation();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
351 Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
352 System.out.println("End.");
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
353 System.out.println("Needed time: " + elapsedTime + " seconds");
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
354 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
355
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
356 private void initReadOnly() throws ApplicationException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
357 dbEnvLexica = new DbEnvLex();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
358 dbEnvLexica.setDataDir(DB_DIR_LEXICA);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
359 dbEnvLexica.initReadOnly();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
360 ArrayList<Lexicon> lexicons = Lexica.getInstance().getLocalLexicons();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
361 for (int i=0; i<lexicons.size(); i++) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
362 Lexicon lexicon = lexicons.get(i);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
363 String lexiconName = lexicon.getName();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
364 dbEnvLexica.openDatabase(lexiconName);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
365 }
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 20
diff changeset
366 LOGGER.info("Lexicon cache: db opened read only");
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
367 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
368
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
369 private void readSampleData() throws ApplicationException {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
370 // List<String> dbNames = dbEnvLexica.getEnv().getDatabaseNames();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
371 String l1 = readEntry("autenrieth", "au)to/s").getContent(); // greek: see also bonitz and lsj
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
372 String l2 = readEntry("ls", "laudabilis").getContent(); // latin
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
373 System.out.println("Autenrieth: autos: " + l1);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
374 System.out.println("Lewis & Short: Laudabilis: " + l2);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
375 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
376
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
377 private void beginOperation() {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
378 beginOfOperation = new Date();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
379 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
380
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
381 private void endOperation() {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
382 endOfOperation = new Date();
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
383 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
384
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
385 }