Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.java @ 20:7d6d969b10cf
little corrections
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 14 Dec 2011 12:48:43 +0100 |
parents | 4a3641ae14d2 |
children | e845310098ba |
comparison
equal
deleted
inserted
replaced
19:4a3641ae14d2 | 20:7d6d969b10cf |
---|---|
58 * @param language | 58 * @param language |
59 * @param normalization | 59 * @param normalization |
60 * @return lemmas | 60 * @return lemmas |
61 * @throws ApplicationException | 61 * @throws ApplicationException |
62 */ | 62 */ |
63 public ArrayList<Lemma> getLemmas(String query, String type, String language, String normalization) throws ApplicationException { | 63 public ArrayList<Lemma> getLemmas(String query, String type, String language, int normMode) throws ApplicationException { |
64 ArrayList<Lemma> lexLemmas = new ArrayList<Lemma>(); | 64 ArrayList<Lemma> lexLemmas = new ArrayList<Lemma>(); |
65 // get lemmas of all forms in query | 65 // get lemmas of all forms in query |
66 MorphologyCache morphologyCache = MorphologyCache.getInstance(); | 66 MorphologyCache morphologyCache = MorphologyCache.getInstance(); |
67 String[] queryForms = query.split(" "); | 67 String[] queryForms = query.split(" "); |
68 for (int k=0; k<queryForms.length; k++) { | 68 for (int k=0; k<queryForms.length; k++) { |
69 String queryForm = queryForms[k]; | 69 String queryForm = queryForms[k]; |
70 ArrayList<Lemma> lemmas = null; | 70 ArrayList<Lemma> lemmas = null; |
71 if (type.equals("form")) { | 71 if (type.equals("form")) { |
72 if (normalization.equals("norm")) | 72 lemmas = morphologyCache.getLemmasByFormName(language, queryForm, normMode); |
73 lemmas = morphologyCache.getLemmasByFormName(language, queryForm, true); | |
74 else if (normalization.equals("none")) | |
75 lemmas = morphologyCache.getLemmasByFormName(language, queryForm, false); | |
76 else | |
77 lemmas = morphologyCache.getLemmasByFormName(language, queryForm, true); // TODO reg and reg+norm | |
78 } else if (type.equals("lemma")) { | 73 } else if (type.equals("lemma")) { |
79 lemmas = new ArrayList<Lemma>(); | 74 lemmas = new ArrayList<Lemma>(); |
80 Lemma l = null; | 75 Lemma l = morphologyCache.getLemma(language, queryForm, normMode); |
81 if (normalization.equals("norm")) | |
82 l = morphologyCache.getLemma(language, queryForm, true); | |
83 else if (normalization.equals("none")) | |
84 l = morphologyCache.getLemma(language, queryForm, false); | |
85 else | |
86 l = morphologyCache.getLemma(language, queryForm, true); | |
87 if (l != null) | 76 if (l != null) |
88 lemmas.add(l); | 77 lemmas.add(l); |
89 } | 78 } |
90 if (lemmas != null && ! lemmas.isEmpty()) { | 79 if (lemmas != null && ! lemmas.isEmpty()) { |
91 lexLemmas.addAll(lemmas); | 80 lexLemmas.addAll(lemmas); |
99 return null; | 88 return null; |
100 else | 89 else |
101 return lexLemmas; | 90 return lexLemmas; |
102 } | 91 } |
103 | 92 |
104 public ArrayList<Lexicon> getLexEntries(ArrayList<Lemma> lexLemmas, String language, String lexiconName) throws ApplicationException { | 93 public ArrayList<Lexicon> getLexEntries(ArrayList<Lemma> lexLemmas, String language, String lexiconName, String query) throws ApplicationException { |
105 ArrayList<Lexicon> retLexicons = new ArrayList<Lexicon>(); | 94 ArrayList<Lexicon> retLexicons = new ArrayList<Lexicon>(); |
106 ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons(language); | 95 ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons(language); |
107 if (lexiconName != null) { | 96 if (lexiconName != null) { |
108 lexicons = new ArrayList<Lexicon>(); | 97 lexicons = new ArrayList<Lexicon>(); |
109 Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName); | 98 Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName); |
113 if (lexicons != null) { | 102 if (lexicons != null) { |
114 for (int i=0; i<lexicons.size(); i++) { | 103 for (int i=0; i<lexicons.size(); i++) { |
115 Lexicon lexicon = lexicons.get(i).clone(); // clone without lexicon entries | 104 Lexicon lexicon = lexicons.get(i).clone(); // clone without lexicon entries |
116 for (int j=0; j<lexLemmas.size(); j++) { | 105 for (int j=0; j<lexLemmas.size(); j++) { |
117 String lemmaName = lexLemmas.get(j).getLemmaName(); | 106 String lemmaName = lexLemmas.get(j).getLemmaName(); |
118 if (Language.getInstance().isGerman(language) && lemmaName.contains("ae")) | |
119 lemmaName = lemmaName.replaceAll("ae", "Š"); | |
120 if (Language.getInstance().isGerman(language) && lemmaName.contains("oe")) | |
121 lemmaName = lemmaName.replaceAll("oe", "š"); | |
122 if (Language.getInstance().isGerman(language) && lemmaName.contains("ue")) | |
123 lemmaName = lemmaName.replaceAll("ue", "Ÿ"); | |
124 if (Language.getInstance().isGerman(language) && lemmaName.contains("ss")) | |
125 lemmaName = lemmaName.replaceAll("ss", "§"); | |
126 LexiconEntry lexEntry = getEntry(lexicon, lemmaName); | 107 LexiconEntry lexEntry = getEntry(lexicon, lemmaName); |
127 if (lexEntry != null) { | 108 if (lexEntry != null) { |
128 lexicon.addEntry(lexEntry); // add entries to the cloned lexicon | 109 lexicon.addEntry(lexEntry); // add entries to the cloned lexicon |
110 } | |
111 } | |
112 if (Language.getInstance().isGerman(language) && query != null) { | |
113 String[] lexFormNames = query.split(" "); | |
114 for (int j=0; j<lexFormNames.length; j++) { | |
115 String lexFormName = lexFormNames[j]; | |
116 LexiconEntry lexEntry = lexicon.getEntry(lexFormName); | |
117 if (lexEntry == null) { | |
118 LexiconEntry newLexEntry = new LexiconEntry(lexiconName, lexFormName, null); | |
119 String lexiconQueryUrl = lexicon.getQueryUrl(); | |
120 String remoteUrl = lexiconQueryUrl + lexFormName; | |
121 newLexEntry.setRemoteUrl(remoteUrl); | |
122 lexicon.addEntry(newLexEntry); | |
123 } | |
129 } | 124 } |
130 } | 125 } |
131 if (! lexicon.isEmpty()) | 126 if (! lexicon.isEmpty()) |
132 retLexicons.add(lexicon); | 127 retLexicons.add(lexicon); |
133 } | 128 } |
141 * @param formName | 136 * @param formName |
142 * @param language | 137 * @param language |
143 * @return delivers lexical entries by the help of the morphology component (lexical entry of the stem of the normalized word form) | 138 * @return delivers lexical entries by the help of the morphology component (lexical entry of the stem of the normalized word form) |
144 * @throws ApplicationException | 139 * @throws ApplicationException |
145 */ | 140 */ |
146 public ArrayList<String> getLexEntryKeys(String formName, String language, boolean normalize) throws ApplicationException { | 141 public ArrayList<String> getLexEntryKeys(String formName, String language, int normMode) throws ApplicationException { |
147 ArrayList<String> lexEntryKeys = new ArrayList<String>(); | 142 ArrayList<String> lexEntryKeys = new ArrayList<String>(); |
148 MorphologyCache morphologyCache = MorphologyCache.getInstance(); | 143 MorphologyCache morphologyCache = MorphologyCache.getInstance(); |
149 ArrayList<Lemma> formLemmas = morphologyCache.getLemmasByFormName(language, formName, normalize); | 144 ArrayList<Lemma> formLemmas = morphologyCache.getLemmasByFormName(language, formName, normMode); |
150 boolean hasLexEntry = false; | 145 boolean hasLexEntry = false; |
151 hasLexEntry = hasLexEntryKey(formName, language); | 146 hasLexEntry = hasLexEntryKey(formName, language); |
152 if (hasLexEntry) | 147 if (hasLexEntry) |
153 lexEntryKeys.add(formName); | 148 lexEntryKeys.add(formName); |
154 if (formLemmas != null) { | 149 if (formLemmas != null) { |
156 Lemma l = formLemmas.get(j); | 151 Lemma l = formLemmas.get(j); |
157 String lName = l.getLemmaName(); | 152 String lName = l.getLemmaName(); |
158 if (! hasLexEntry) { | 153 if (! hasLexEntry) { |
159 hasLexEntry = hasLexEntryKey(lName, language); | 154 hasLexEntry = hasLexEntryKey(lName, language); |
160 } | 155 } |
161 if (language.equals("de") || language.equals("fr") || language.equals("nl")) // TODO Lexika für diese Sprachen in BerkeleyDB einbringen (für nl auch eine bessere Morph.) | 156 if (language.equals("de") || language.equals("fr") || language.equals("nl")) // TODO Lexika für diese Sprachen in BerkeleyDB einbringen (für nl auch eine bessere Morph.) |
162 lexEntryKeys.add(lName); | 157 lexEntryKeys.add(lName); |
163 if (! lName.equals(formName) && hasLexEntry) { | 158 if (! lName.equals(formName) && hasLexEntry) { |
164 lexEntryKeys.add(lName); | 159 lexEntryKeys.add(lName); |
165 } | 160 } |
166 } | 161 } |
186 } | 181 } |
187 } | 182 } |
188 return hasLexEntry; | 183 return hasLexEntry; |
189 } | 184 } |
190 | 185 |
191 public ArrayList<Lexicon> getLexEntriesBeginningWith(String language, String formPrefix, int pageNumber) throws ApplicationException { | 186 public ArrayList<Lexicon> getLexEntriesBeginningWith(String language, String formPrefix, int pageNumber, int pageSize) throws ApplicationException { |
192 int pageSize = 50; | |
193 int from = (pageNumber * pageSize) - pageSize + 1; | 187 int from = (pageNumber * pageSize) - pageSize + 1; |
194 int to = pageNumber * pageSize; | 188 int to = pageNumber * pageSize; |
195 ArrayList<Lexicon> statLexicons = Lexica.getInstance().getLocalLexicons(language); | 189 ArrayList<Lexicon> statLexicons = Lexica.getInstance().getLocalLexicons(language); |
196 ArrayList<Lexicon> retLexicons = null; | 190 ArrayList<Lexicon> retLexicons = null; |
197 if (statLexicons != null) { | 191 if (statLexicons != null) { |
209 } | 203 } |
210 } | 204 } |
211 return retLexicons; | 205 return retLexicons; |
212 } | 206 } |
213 | 207 |
214 public ArrayList<Lexicon> getLexEntriesByLexiconBeginningWith(String lexiconName, String formPrefix, int pageNumber) throws ApplicationException { | 208 public ArrayList<Lexicon> getLexEntriesByLexiconBeginningWith(String lexiconName, String formPrefix, int pageNumber, int pageSize) throws ApplicationException { |
215 int pageSize = 50; | |
216 int from = (pageNumber * pageSize) - pageSize + 1; | 209 int from = (pageNumber * pageSize) - pageSize + 1; |
217 int to = pageNumber * pageSize; | 210 int to = pageNumber * pageSize; |
218 Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName).clone(); | 211 Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName).clone(); |
219 ArrayList<Lexicon> retLexicons = null; | 212 ArrayList<Lexicon> retLexicons = null; |
220 if (lexicon != null) { | 213 if (lexicon != null) { |
263 dbFoundValueStr = new String(foundValueBytes, "utf-8"); | 256 dbFoundValueStr = new String(foundValueBytes, "utf-8"); |
264 } | 257 } |
265 cursor.close(); | 258 cursor.close(); |
266 if (dbFoundValueStr != null) { | 259 if (dbFoundValueStr != null) { |
267 retLexEntry = new LexiconEntry(lexiconName, formName, dbFoundValueStr); | 260 retLexEntry = new LexiconEntry(lexiconName, formName, dbFoundValueStr); |
261 retLexEntry = correct(retLexEntry); // correct errors: e.g. in lsj some html entities are not correct | |
268 } | 262 } |
269 } catch (DatabaseException e) { | 263 } catch (DatabaseException e) { |
270 throw new ApplicationException(e); | 264 throw new ApplicationException(e); |
271 } catch (UnsupportedEncodingException e) { | 265 } catch (UnsupportedEncodingException e) { |
272 throw new ApplicationException(e); | 266 throw new ApplicationException(e); |
290 byte[] foundValueBytes = foundValue.getData(); | 284 byte[] foundValueBytes = foundValue.getData(); |
291 dbFoundValueStr = new String(foundValueBytes, "utf-8"); | 285 dbFoundValueStr = new String(foundValueBytes, "utf-8"); |
292 byte[] foundKeyBytes = dbEntryKey.getData(); | 286 byte[] foundKeyBytes = dbEntryKey.getData(); |
293 String dbFoundKeyStr = new String(foundKeyBytes, "utf-8"); | 287 String dbFoundKeyStr = new String(foundKeyBytes, "utf-8"); |
294 LexiconEntry lexEntry = new LexiconEntry(lexiconName, dbFoundKeyStr, dbFoundValueStr); | 288 LexiconEntry lexEntry = new LexiconEntry(lexiconName, dbFoundKeyStr, dbFoundValueStr); |
289 lexEntry = correct(lexEntry); // correct errors: e.g. in lsj some html entities are not correct | |
295 retLexEntries.add(lexEntry); | 290 retLexEntries.add(lexEntry); |
296 } | 291 } |
297 operationStatus = cursor.getNext(dbEntryKey, foundValue, LockMode.DEFAULT); | 292 operationStatus = cursor.getNext(dbEntryKey, foundValue, LockMode.DEFAULT); |
298 counter++; | 293 counter++; |
299 } | 294 } |
305 throw new ApplicationException(e); | 300 throw new ApplicationException(e); |
306 } catch (UnsupportedEncodingException e) { | 301 } catch (UnsupportedEncodingException e) { |
307 throw new ApplicationException(e); | 302 throw new ApplicationException(e); |
308 } | 303 } |
309 return retLexEntries; | 304 return retLexEntries; |
305 } | |
306 | |
307 private LexiconEntry correct(LexiconEntry lexEntry) { | |
308 String lexiconName = lexEntry.getLexiconName(); | |
309 String content = lexEntry.getContent(); | |
310 if (content != null && content.contains("&#") && lexiconName.equals("lsj")) { // errors in greek lexicon lsj | |
311 content = content.replaceAll("&#\u03C7", "&#x"); // html entity: replace greek Minuskel Chi by "x" | |
312 content = content.replaceAll("&#x[^0-9]{4};", ""); // html entity: remove entity if special greek not hex characters appear | |
313 lexEntry.setContent(content); | |
314 } | |
315 return lexEntry; | |
310 } | 316 } |
311 | 317 |
312 public static void main(String[] args) throws ApplicationException { | 318 public static void main(String[] args) throws ApplicationException { |
313 getInstance(); | 319 getInstance(); |
314 instance.beginOperation(); | 320 instance.beginOperation(); |