comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java @ 19:4a3641ae14d2

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 09 Nov 2011 15:32:05 +0100
parents
children 7d6d969b10cf
comparison
equal deleted inserted replaced
18:dc5e9fcb3fdc 19:4a3641ae14d2
1 package de.mpg.mpiwg.berlin.mpdl.lt.morph.app;
2
3 import java.util.ArrayList;
4 import java.util.Collections;
5 import java.util.Date;
6 import java.util.Enumeration;
7 import java.util.Hashtable;
8
9 import java.util.logging.Logger;
10
11 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
12 import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants;
13 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form;
14 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
15 import de.mpg.mpiwg.berlin.mpdl.lt.morph.db.DBMorphHandler;
16 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;
17 import de.mpg.mpiwg.berlin.mpdl.lucene.util.LuceneUtil;
18 import de.mpg.mpiwg.berlin.mpdl.util.Util;
19 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
20
21 public class MorphologyCache {
22 private static MorphologyCache instance;
23 private static Logger LOGGER = Logger.getLogger(MorphologyCache.class.getName());
24 private static String DATA_DIR = Constants.getInstance().getDataDir();
25 private static String DB_DIR_DONATUS = DATA_DIR + "/dataBerkeleyDB/donatus";
26 public static int QUERY_MODE = 0;
27 public static int DOCUMENT_MODE = 1;
28 private static int MAX_HASHTABLE_SIZE = Constants.MORPHOLOGY_CACHE_SIZE;
29 protected int mode = QUERY_MODE;
30 private Hashtable<String, Hashtable<String, Lemma>> forms = new Hashtable<String, Hashtable<String, Lemma>>(); // cache of forms: hashKey is formName
31 private Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>(); // cache of lemmas: hashKey is lemmaName
32 private DBMorphHandler dbMorphHandlerStatic; // handles static morph data (BerkeleyDB)
33 private Date beginOfOperation;
34 private Date endOfOperation;
35
36 public static MorphologyCache getInstance() throws ApplicationException {
37 if (instance == null) {
38 instance = new MorphologyCache();
39 instance.init();
40 }
41 return instance;
42 }
43
44 private void init() throws ApplicationException {
45 instance.beginOperation();
46 dbMorphHandlerStatic = new DBMorphHandler(DB_DIR_DONATUS);
47 dbMorphHandlerStatic.start();
48 dbMorphHandlerStatic.openDatabases();
49 instance.endOperation();
50 Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation);
51 LOGGER.info("Morphology db cache: opened (needed " + elapsedTime + " seconds)");
52 }
53
54 public int getMode() {
55 return mode;
56 }
57
58 public void setMode(int newMode) {
59 this.mode = newMode;
60 }
61
62 public void end() throws ApplicationException {
63 dbMorphHandlerStatic.closeDatabases();
64 LOGGER.info("Morphology db cache: closed");
65 }
66
67 public ArrayList<Lemma> getLemmasByFormName(String lang, String formNameArg, boolean normalize) throws ApplicationException {
68 String language = Language.getInstance().getLanguageId(lang);
69 ArrayList<Lemma> retFormLemmas = null;
70 String formName = formNameArg;
71 if (normalize) {
72 Normalizer normalizer = new Normalizer(language);
73 formName = normalizer.normalize(formNameArg);
74 }
75 // first look in local cache
76 String key = language + "###" + formName;
77 Hashtable<String, Lemma> formLemmasHashtable = forms.get(key);
78 if (formLemmasHashtable == null) {
79 ArrayList<Lemma> dbFormLemmas = readLemmasByFormName(language, formName);
80 // put lemmas into local cache
81 int localHashTableSize = forms.size();
82 if (localHashTableSize >= MAX_HASHTABLE_SIZE) {
83 clearCache();
84 }
85 if (dbFormLemmas != null && ! dbFormLemmas.isEmpty()) {
86 formLemmasHashtable = new Hashtable<String, Lemma>();
87 for (int i=0; i<dbFormLemmas.size(); i++) {
88 Lemma lemma = dbFormLemmas.get(i);
89 String lemmaName = lemma.getLemmaName();
90 String lemmaKey = language + "###" + lemmaName;
91 Lemma localLemma = lemmas.get(lemmaKey);
92 if (localLemma == null) {
93 ArrayList<Form> lemmaForms = readFormsByLemmaName(language, lemmaName);
94 lemma.setForms(lemmaForms);
95 lemmas.put(lemmaKey, lemma);
96 } else {
97 lemma = localLemma;
98 }
99 formLemmasHashtable.put(lemmaKey, lemma);
100 }
101 forms.put(key, formLemmasHashtable);
102 }
103 }
104 retFormLemmas = new ArrayList<Lemma>();
105 if (formLemmasHashtable != null) {
106 Enumeration<String> formLemmasKeys = formLemmasHashtable.keys();
107 while(formLemmasKeys.hasMoreElements()) {
108 String lemmaKey = formLemmasKeys.nextElement();
109 Lemma l = formLemmasHashtable.get(lemmaKey);
110 retFormLemmas.add(l);
111 }
112 }
113 Collections.sort(retFormLemmas);
114 return retFormLemmas;
115 }
116
117 public Lemma getLemma(String lang, String lemmaNameArg, boolean normalize) throws ApplicationException {
118 String language = Language.getInstance().getLanguageId(lang);
119 String lemmaName = lemmaNameArg;
120 if (normalize) {
121 Normalizer normalizer = new Normalizer(language);
122 lemmaName = normalizer.normalize(lemmaNameArg);
123 }
124 // first look in local cache
125 String key = language + "###" + lemmaName;
126 Lemma lemma = lemmas.get(key);
127 if (lemma == null) {
128 ArrayList<Form> dbLemmaForms = readFormsByLemmaName(language, lemmaName);
129 if (dbLemmaForms != null && dbLemmaForms.size() > 0) {
130 lemma = new Lemma();
131 lemma.setLemmaName(lemmaName);
132 lemma.setLanguage(language);
133 lemma.setProvider(dbLemmaForms.get(0).getProvider());
134 lemma.setForms(dbLemmaForms);
135 lemmas.put(lemmaName, lemma);
136 }
137 }
138 return lemma;
139 }
140
141 public ArrayList<Form> getFormsByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException {
142 String language = Language.getInstance().getLanguageId(lang);
143 ArrayList<Form> result = new ArrayList<Form>();
144 luceneQueryString = luceneQueryString.toLowerCase();
145 ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString);
146 if (! (formsFromQuery == null || formsFromQuery.isEmpty())) {
147 for (int i=0; i<formsFromQuery.size(); i++) {
148 String formStr = formsFromQuery.get(i);
149 if (normalize) {
150 Normalizer normalizer = new Normalizer(language);
151 formStr = normalizer.normalize(formStr);
152 }
153 ArrayList<Lemma> formLemmas = null;
154 // lemma mode: if formName contains "lemmalemma" then the lemma itself is fetched
155 if (formStr.startsWith("lemmalemma")) {
156 formLemmas = new ArrayList<Lemma>();
157 String lemmaName = formStr.substring(10);
158 Lemma lemma = getLemma(language, lemmaName, false);
159 formLemmas.add(lemma);
160 } else {
161 formLemmas = getLemmasByFormName(language, formStr, false);
162 }
163 if (formLemmas != null && ! formLemmas.isEmpty()) {
164 for (int j=0; j<formLemmas.size(); j++) {
165 Lemma l = formLemmas.get(j);
166 ArrayList<Form> lemmaForms = l.getFormsList();
167 result.addAll(lemmaForms);
168 }
169 }
170 }
171 }
172 return result;
173 }
174
175 public ArrayList<Lemma> getLemmasByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException {
176 String language = Language.getInstance().getLanguageId(lang);
177 Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>();
178 luceneQueryString = luceneQueryString.toLowerCase();
179 ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString);
180 if (! (formsFromQuery == null || formsFromQuery.isEmpty())) {
181 for (int i=0; i<formsFromQuery.size(); i++) {
182 String formStr = formsFromQuery.get(i);
183 if (normalize) {
184 Normalizer normalizer = new Normalizer(language);
185 formStr = normalizer.normalize(formStr);
186 }
187 ArrayList<Lemma> formLemmas = null;
188 // lemma mode: if formName starts with "lemmalemma" then the lemma itself is fetched
189 if (formStr.startsWith("lemmalemma")) {
190 formLemmas = new ArrayList<Lemma>();
191 String lemmaName = formStr.substring(10);
192 Lemma lemma = getLemma(language, lemmaName, false);
193 formLemmas.add(lemma);
194 } else {
195 formLemmas = getLemmasByFormName(language, formStr, false);
196 }
197 if (formLemmas != null) {
198 for (int j=0; j<formLemmas.size(); j++) {
199 Lemma lemma = formLemmas.get(j);
200 lemmas.put(lemma.getLemmaName(), lemma);
201 }
202 }
203 }
204 }
205 ArrayList<Lemma> result = new ArrayList<Lemma>();
206 if (lemmas != null) {
207 Enumeration<String> formLemmasKeys = lemmas.keys();
208 while(formLemmasKeys.hasMoreElements()) {
209 String lemmaKey = formLemmasKeys.nextElement();
210 Lemma l = lemmas.get(lemmaKey);
211 result.add(l);
212 }
213 }
214 Collections.sort(result);
215 if (result.isEmpty())
216 return null;
217 else
218 return result;
219 }
220
221 public ArrayList<String> getIndexKeysByLemmaNames(String lang, ArrayList<String> lemmaNames) throws ApplicationException {
222 String language = Language.getInstance().getLanguageId(lang);
223 Hashtable<String, String> indexKeys = new Hashtable<String, String>();
224 for (int j=0; j<lemmaNames.size(); j++) {
225 String lemmaName = lemmaNames.get(j);
226 Lemma lemma = getLemma(language, lemmaName, false);
227 indexKeys.put(lemmaName, lemmaName);
228 if (lemma != null) {
229 ArrayList<Form> lemmaForms = lemma.getFormsList();
230 for (int k=0; k<lemmaForms.size(); k++) {
231 Form form = lemmaForms.get(k);
232 ArrayList<Lemma> fLemmas = getLemmasByFormName(language, form.getFormName(), false);
233 if (fLemmas != null) {
234 String indexKey = "";
235 if (fLemmas.size() == 1) {
236 indexKey = fLemmas.get(0).getLemmaName();
237 } else {
238 for (int l=0; l<fLemmas.size(); l++) {
239 Lemma lem = fLemmas.get(l);
240 indexKey = indexKey + "+++" + lem.getLemmaName();
241 }
242 indexKeys.put(indexKey, indexKey);
243 }
244 }
245 }
246 }
247 }
248 ArrayList<String> result = new ArrayList<String>();
249 if (indexKeys != null) {
250 Enumeration<String> indexKeysKeys = indexKeys.keys();
251 while(indexKeysKeys.hasMoreElements()) {
252 String indexKey = indexKeysKeys.nextElement();
253 result.add(indexKey);
254 }
255 }
256 Collections.sort(result);
257 if (result.isEmpty())
258 return null;
259 else
260 return result;
261 }
262
263 private void clearCache() {
264 forms = null;
265 lemmas = null;
266 forms = new Hashtable<String, Hashtable<String, Lemma>>();
267 lemmas = new Hashtable<String, Lemma>();
268 }
269
270 private ArrayList<Lemma> readLemmasByFormName(String lang, String formName) throws ApplicationException {
271 String language = Language.getInstance().getLanguageId(lang);
272 ArrayList<Lemma> lemmasStatic = dbMorphHandlerStatic.readLemmas(language, formName);
273 return lemmasStatic;
274 }
275
276 private ArrayList<Form> readFormsByLemmaName(String lang, String lemmaName) throws ApplicationException {
277 String language = Language.getInstance().getLanguageId(lang);
278 ArrayList<Form> formsStatic = dbMorphHandlerStatic.readForms(language, lemmaName);
279 return formsStatic;
280 }
281
282 private ArrayList<String> getVariantsFromLuceneQuery(String queryString) {
283 LuceneUtil luceneUtil = LuceneUtil.getInstance();
284 ArrayList<String> variants = luceneUtil.getVariantsFromLuceneQuery(queryString);
285 return variants;
286 }
287
288 private void beginOperation() {
289 beginOfOperation = new Date();
290 }
291
292 private void endOperation() {
293 endOfOperation = new Date();
294 }
295 }