Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java @ 19:4a3641ae14d2
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 09 Nov 2011 15:32:05 +0100 |
parents | |
children | 7d6d969b10cf |
comparison
equal
deleted
inserted
replaced
18:dc5e9fcb3fdc | 19:4a3641ae14d2 |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; | |
2 | |
3 import java.util.ArrayList; | |
4 import java.util.Collections; | |
5 import java.util.Date; | |
6 import java.util.Enumeration; | |
7 import java.util.Hashtable; | |
8 | |
9 import java.util.logging.Logger; | |
10 | |
11 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; | |
12 import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; | |
13 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; | |
14 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; | |
15 import de.mpg.mpiwg.berlin.mpdl.lt.morph.db.DBMorphHandler; | |
16 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; | |
17 import de.mpg.mpiwg.berlin.mpdl.lucene.util.LuceneUtil; | |
18 import de.mpg.mpiwg.berlin.mpdl.util.Util; | |
19 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
20 | |
21 public class MorphologyCache { | |
22 private static MorphologyCache instance; | |
23 private static Logger LOGGER = Logger.getLogger(MorphologyCache.class.getName()); | |
24 private static String DATA_DIR = Constants.getInstance().getDataDir(); | |
25 private static String DB_DIR_DONATUS = DATA_DIR + "/dataBerkeleyDB/donatus"; | |
26 public static int QUERY_MODE = 0; | |
27 public static int DOCUMENT_MODE = 1; | |
28 private static int MAX_HASHTABLE_SIZE = Constants.MORPHOLOGY_CACHE_SIZE; | |
29 protected int mode = QUERY_MODE; | |
30 private Hashtable<String, Hashtable<String, Lemma>> forms = new Hashtable<String, Hashtable<String, Lemma>>(); // cache of forms: hashKey is formName | |
31 private Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>(); // cache of lemmas: hashKey is lemmaName | |
32 private DBMorphHandler dbMorphHandlerStatic; // handles static morph data (BerkeleyDB) | |
33 private Date beginOfOperation; | |
34 private Date endOfOperation; | |
35 | |
36 public static MorphologyCache getInstance() throws ApplicationException { | |
37 if (instance == null) { | |
38 instance = new MorphologyCache(); | |
39 instance.init(); | |
40 } | |
41 return instance; | |
42 } | |
43 | |
44 private void init() throws ApplicationException { | |
45 instance.beginOperation(); | |
46 dbMorphHandlerStatic = new DBMorphHandler(DB_DIR_DONATUS); | |
47 dbMorphHandlerStatic.start(); | |
48 dbMorphHandlerStatic.openDatabases(); | |
49 instance.endOperation(); | |
50 Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); | |
51 LOGGER.info("Morphology db cache: opened (needed " + elapsedTime + " seconds)"); | |
52 } | |
53 | |
54 public int getMode() { | |
55 return mode; | |
56 } | |
57 | |
58 public void setMode(int newMode) { | |
59 this.mode = newMode; | |
60 } | |
61 | |
62 public void end() throws ApplicationException { | |
63 dbMorphHandlerStatic.closeDatabases(); | |
64 LOGGER.info("Morphology db cache: closed"); | |
65 } | |
66 | |
67 public ArrayList<Lemma> getLemmasByFormName(String lang, String formNameArg, boolean normalize) throws ApplicationException { | |
68 String language = Language.getInstance().getLanguageId(lang); | |
69 ArrayList<Lemma> retFormLemmas = null; | |
70 String formName = formNameArg; | |
71 if (normalize) { | |
72 Normalizer normalizer = new Normalizer(language); | |
73 formName = normalizer.normalize(formNameArg); | |
74 } | |
75 // first look in local cache | |
76 String key = language + "###" + formName; | |
77 Hashtable<String, Lemma> formLemmasHashtable = forms.get(key); | |
78 if (formLemmasHashtable == null) { | |
79 ArrayList<Lemma> dbFormLemmas = readLemmasByFormName(language, formName); | |
80 // put lemmas into local cache | |
81 int localHashTableSize = forms.size(); | |
82 if (localHashTableSize >= MAX_HASHTABLE_SIZE) { | |
83 clearCache(); | |
84 } | |
85 if (dbFormLemmas != null && ! dbFormLemmas.isEmpty()) { | |
86 formLemmasHashtable = new Hashtable<String, Lemma>(); | |
87 for (int i=0; i<dbFormLemmas.size(); i++) { | |
88 Lemma lemma = dbFormLemmas.get(i); | |
89 String lemmaName = lemma.getLemmaName(); | |
90 String lemmaKey = language + "###" + lemmaName; | |
91 Lemma localLemma = lemmas.get(lemmaKey); | |
92 if (localLemma == null) { | |
93 ArrayList<Form> lemmaForms = readFormsByLemmaName(language, lemmaName); | |
94 lemma.setForms(lemmaForms); | |
95 lemmas.put(lemmaKey, lemma); | |
96 } else { | |
97 lemma = localLemma; | |
98 } | |
99 formLemmasHashtable.put(lemmaKey, lemma); | |
100 } | |
101 forms.put(key, formLemmasHashtable); | |
102 } | |
103 } | |
104 retFormLemmas = new ArrayList<Lemma>(); | |
105 if (formLemmasHashtable != null) { | |
106 Enumeration<String> formLemmasKeys = formLemmasHashtable.keys(); | |
107 while(formLemmasKeys.hasMoreElements()) { | |
108 String lemmaKey = formLemmasKeys.nextElement(); | |
109 Lemma l = formLemmasHashtable.get(lemmaKey); | |
110 retFormLemmas.add(l); | |
111 } | |
112 } | |
113 Collections.sort(retFormLemmas); | |
114 return retFormLemmas; | |
115 } | |
116 | |
117 public Lemma getLemma(String lang, String lemmaNameArg, boolean normalize) throws ApplicationException { | |
118 String language = Language.getInstance().getLanguageId(lang); | |
119 String lemmaName = lemmaNameArg; | |
120 if (normalize) { | |
121 Normalizer normalizer = new Normalizer(language); | |
122 lemmaName = normalizer.normalize(lemmaNameArg); | |
123 } | |
124 // first look in local cache | |
125 String key = language + "###" + lemmaName; | |
126 Lemma lemma = lemmas.get(key); | |
127 if (lemma == null) { | |
128 ArrayList<Form> dbLemmaForms = readFormsByLemmaName(language, lemmaName); | |
129 if (dbLemmaForms != null && dbLemmaForms.size() > 0) { | |
130 lemma = new Lemma(); | |
131 lemma.setLemmaName(lemmaName); | |
132 lemma.setLanguage(language); | |
133 lemma.setProvider(dbLemmaForms.get(0).getProvider()); | |
134 lemma.setForms(dbLemmaForms); | |
135 lemmas.put(lemmaName, lemma); | |
136 } | |
137 } | |
138 return lemma; | |
139 } | |
140 | |
141 public ArrayList<Form> getFormsByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException { | |
142 String language = Language.getInstance().getLanguageId(lang); | |
143 ArrayList<Form> result = new ArrayList<Form>(); | |
144 luceneQueryString = luceneQueryString.toLowerCase(); | |
145 ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); | |
146 if (! (formsFromQuery == null || formsFromQuery.isEmpty())) { | |
147 for (int i=0; i<formsFromQuery.size(); i++) { | |
148 String formStr = formsFromQuery.get(i); | |
149 if (normalize) { | |
150 Normalizer normalizer = new Normalizer(language); | |
151 formStr = normalizer.normalize(formStr); | |
152 } | |
153 ArrayList<Lemma> formLemmas = null; | |
154 // lemma mode: if formName contains "lemmalemma" then the lemma itself is fetched | |
155 if (formStr.startsWith("lemmalemma")) { | |
156 formLemmas = new ArrayList<Lemma>(); | |
157 String lemmaName = formStr.substring(10); | |
158 Lemma lemma = getLemma(language, lemmaName, false); | |
159 formLemmas.add(lemma); | |
160 } else { | |
161 formLemmas = getLemmasByFormName(language, formStr, false); | |
162 } | |
163 if (formLemmas != null && ! formLemmas.isEmpty()) { | |
164 for (int j=0; j<formLemmas.size(); j++) { | |
165 Lemma l = formLemmas.get(j); | |
166 ArrayList<Form> lemmaForms = l.getFormsList(); | |
167 result.addAll(lemmaForms); | |
168 } | |
169 } | |
170 } | |
171 } | |
172 return result; | |
173 } | |
174 | |
175 public ArrayList<Lemma> getLemmasByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException { | |
176 String language = Language.getInstance().getLanguageId(lang); | |
177 Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>(); | |
178 luceneQueryString = luceneQueryString.toLowerCase(); | |
179 ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); | |
180 if (! (formsFromQuery == null || formsFromQuery.isEmpty())) { | |
181 for (int i=0; i<formsFromQuery.size(); i++) { | |
182 String formStr = formsFromQuery.get(i); | |
183 if (normalize) { | |
184 Normalizer normalizer = new Normalizer(language); | |
185 formStr = normalizer.normalize(formStr); | |
186 } | |
187 ArrayList<Lemma> formLemmas = null; | |
188 // lemma mode: if formName starts with "lemmalemma" then the lemma itself is fetched | |
189 if (formStr.startsWith("lemmalemma")) { | |
190 formLemmas = new ArrayList<Lemma>(); | |
191 String lemmaName = formStr.substring(10); | |
192 Lemma lemma = getLemma(language, lemmaName, false); | |
193 formLemmas.add(lemma); | |
194 } else { | |
195 formLemmas = getLemmasByFormName(language, formStr, false); | |
196 } | |
197 if (formLemmas != null) { | |
198 for (int j=0; j<formLemmas.size(); j++) { | |
199 Lemma lemma = formLemmas.get(j); | |
200 lemmas.put(lemma.getLemmaName(), lemma); | |
201 } | |
202 } | |
203 } | |
204 } | |
205 ArrayList<Lemma> result = new ArrayList<Lemma>(); | |
206 if (lemmas != null) { | |
207 Enumeration<String> formLemmasKeys = lemmas.keys(); | |
208 while(formLemmasKeys.hasMoreElements()) { | |
209 String lemmaKey = formLemmasKeys.nextElement(); | |
210 Lemma l = lemmas.get(lemmaKey); | |
211 result.add(l); | |
212 } | |
213 } | |
214 Collections.sort(result); | |
215 if (result.isEmpty()) | |
216 return null; | |
217 else | |
218 return result; | |
219 } | |
220 | |
221 public ArrayList<String> getIndexKeysByLemmaNames(String lang, ArrayList<String> lemmaNames) throws ApplicationException { | |
222 String language = Language.getInstance().getLanguageId(lang); | |
223 Hashtable<String, String> indexKeys = new Hashtable<String, String>(); | |
224 for (int j=0; j<lemmaNames.size(); j++) { | |
225 String lemmaName = lemmaNames.get(j); | |
226 Lemma lemma = getLemma(language, lemmaName, false); | |
227 indexKeys.put(lemmaName, lemmaName); | |
228 if (lemma != null) { | |
229 ArrayList<Form> lemmaForms = lemma.getFormsList(); | |
230 for (int k=0; k<lemmaForms.size(); k++) { | |
231 Form form = lemmaForms.get(k); | |
232 ArrayList<Lemma> fLemmas = getLemmasByFormName(language, form.getFormName(), false); | |
233 if (fLemmas != null) { | |
234 String indexKey = ""; | |
235 if (fLemmas.size() == 1) { | |
236 indexKey = fLemmas.get(0).getLemmaName(); | |
237 } else { | |
238 for (int l=0; l<fLemmas.size(); l++) { | |
239 Lemma lem = fLemmas.get(l); | |
240 indexKey = indexKey + "+++" + lem.getLemmaName(); | |
241 } | |
242 indexKeys.put(indexKey, indexKey); | |
243 } | |
244 } | |
245 } | |
246 } | |
247 } | |
248 ArrayList<String> result = new ArrayList<String>(); | |
249 if (indexKeys != null) { | |
250 Enumeration<String> indexKeysKeys = indexKeys.keys(); | |
251 while(indexKeysKeys.hasMoreElements()) { | |
252 String indexKey = indexKeysKeys.nextElement(); | |
253 result.add(indexKey); | |
254 } | |
255 } | |
256 Collections.sort(result); | |
257 if (result.isEmpty()) | |
258 return null; | |
259 else | |
260 return result; | |
261 } | |
262 | |
263 private void clearCache() { | |
264 forms = null; | |
265 lemmas = null; | |
266 forms = new Hashtable<String, Hashtable<String, Lemma>>(); | |
267 lemmas = new Hashtable<String, Lemma>(); | |
268 } | |
269 | |
270 private ArrayList<Lemma> readLemmasByFormName(String lang, String formName) throws ApplicationException { | |
271 String language = Language.getInstance().getLanguageId(lang); | |
272 ArrayList<Lemma> lemmasStatic = dbMorphHandlerStatic.readLemmas(language, formName); | |
273 return lemmasStatic; | |
274 } | |
275 | |
276 private ArrayList<Form> readFormsByLemmaName(String lang, String lemmaName) throws ApplicationException { | |
277 String language = Language.getInstance().getLanguageId(lang); | |
278 ArrayList<Form> formsStatic = dbMorphHandlerStatic.readForms(language, lemmaName); | |
279 return formsStatic; | |
280 } | |
281 | |
282 private ArrayList<String> getVariantsFromLuceneQuery(String queryString) { | |
283 LuceneUtil luceneUtil = LuceneUtil.getInstance(); | |
284 ArrayList<String> variants = luceneUtil.getVariantsFromLuceneQuery(queryString); | |
285 return variants; | |
286 } | |
287 | |
288 private void beginOperation() { | |
289 beginOfOperation = new Date(); | |
290 } | |
291 | |
292 private void endOperation() { | |
293 endOfOperation = new Date(); | |
294 } | |
295 } |