comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.java @ 20:7d6d969b10cf

little corrections
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 14 Dec 2011 12:48:43 +0100
parents 4a3641ae14d2
children e845310098ba
comparison
equal deleted inserted replaced
19:4a3641ae14d2 20:7d6d969b10cf
58 * @param language 58 * @param language
59 * @param normalization 59 * @param normalization
60 * @return lemmas 60 * @return lemmas
61 * @throws ApplicationException 61 * @throws ApplicationException
62 */ 62 */
63 public ArrayList<Lemma> getLemmas(String query, String type, String language, String normalization) throws ApplicationException { 63 public ArrayList<Lemma> getLemmas(String query, String type, String language, int normMode) throws ApplicationException {
64 ArrayList<Lemma> lexLemmas = new ArrayList<Lemma>(); 64 ArrayList<Lemma> lexLemmas = new ArrayList<Lemma>();
65 // get lemmas of all forms in query 65 // get lemmas of all forms in query
66 MorphologyCache morphologyCache = MorphologyCache.getInstance(); 66 MorphologyCache morphologyCache = MorphologyCache.getInstance();
67 String[] queryForms = query.split(" "); 67 String[] queryForms = query.split(" ");
68 for (int k=0; k<queryForms.length; k++) { 68 for (int k=0; k<queryForms.length; k++) {
69 String queryForm = queryForms[k]; 69 String queryForm = queryForms[k];
70 ArrayList<Lemma> lemmas = null; 70 ArrayList<Lemma> lemmas = null;
71 if (type.equals("form")) { 71 if (type.equals("form")) {
72 if (normalization.equals("norm")) 72 lemmas = morphologyCache.getLemmasByFormName(language, queryForm, normMode);
73 lemmas = morphologyCache.getLemmasByFormName(language, queryForm, true);
74 else if (normalization.equals("none"))
75 lemmas = morphologyCache.getLemmasByFormName(language, queryForm, false);
76 else
77 lemmas = morphologyCache.getLemmasByFormName(language, queryForm, true); // TODO reg and reg+norm
78 } else if (type.equals("lemma")) { 73 } else if (type.equals("lemma")) {
79 lemmas = new ArrayList<Lemma>(); 74 lemmas = new ArrayList<Lemma>();
80 Lemma l = null; 75 Lemma l = morphologyCache.getLemma(language, queryForm, normMode);
81 if (normalization.equals("norm"))
82 l = morphologyCache.getLemma(language, queryForm, true);
83 else if (normalization.equals("none"))
84 l = morphologyCache.getLemma(language, queryForm, false);
85 else
86 l = morphologyCache.getLemma(language, queryForm, true);
87 if (l != null) 76 if (l != null)
88 lemmas.add(l); 77 lemmas.add(l);
89 } 78 }
90 if (lemmas != null && ! lemmas.isEmpty()) { 79 if (lemmas != null && ! lemmas.isEmpty()) {
91 lexLemmas.addAll(lemmas); 80 lexLemmas.addAll(lemmas);
99 return null; 88 return null;
100 else 89 else
101 return lexLemmas; 90 return lexLemmas;
102 } 91 }
103 92
104 public ArrayList<Lexicon> getLexEntries(ArrayList<Lemma> lexLemmas, String language, String lexiconName) throws ApplicationException { 93 public ArrayList<Lexicon> getLexEntries(ArrayList<Lemma> lexLemmas, String language, String lexiconName, String query) throws ApplicationException {
105 ArrayList<Lexicon> retLexicons = new ArrayList<Lexicon>(); 94 ArrayList<Lexicon> retLexicons = new ArrayList<Lexicon>();
106 ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons(language); 95 ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons(language);
107 if (lexiconName != null) { 96 if (lexiconName != null) {
108 lexicons = new ArrayList<Lexicon>(); 97 lexicons = new ArrayList<Lexicon>();
109 Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName); 98 Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName);
113 if (lexicons != null) { 102 if (lexicons != null) {
114 for (int i=0; i<lexicons.size(); i++) { 103 for (int i=0; i<lexicons.size(); i++) {
115 Lexicon lexicon = lexicons.get(i).clone(); // clone without lexicon entries 104 Lexicon lexicon = lexicons.get(i).clone(); // clone without lexicon entries
116 for (int j=0; j<lexLemmas.size(); j++) { 105 for (int j=0; j<lexLemmas.size(); j++) {
117 String lemmaName = lexLemmas.get(j).getLemmaName(); 106 String lemmaName = lexLemmas.get(j).getLemmaName();
118 if (Language.getInstance().isGerman(language) && lemmaName.contains("ae"))
119 lemmaName = lemmaName.replaceAll("ae", "Š");
120 if (Language.getInstance().isGerman(language) && lemmaName.contains("oe"))
121 lemmaName = lemmaName.replaceAll("oe", "š");
122 if (Language.getInstance().isGerman(language) && lemmaName.contains("ue"))
123 lemmaName = lemmaName.replaceAll("ue", "Ÿ");
124 if (Language.getInstance().isGerman(language) && lemmaName.contains("ss"))
125 lemmaName = lemmaName.replaceAll("ss", "§");
126 LexiconEntry lexEntry = getEntry(lexicon, lemmaName); 107 LexiconEntry lexEntry = getEntry(lexicon, lemmaName);
127 if (lexEntry != null) { 108 if (lexEntry != null) {
128 lexicon.addEntry(lexEntry); // add entries to the cloned lexicon 109 lexicon.addEntry(lexEntry); // add entries to the cloned lexicon
110 }
111 }
112 if (Language.getInstance().isGerman(language) && query != null) {
113 String[] lexFormNames = query.split(" ");
114 for (int j=0; j<lexFormNames.length; j++) {
115 String lexFormName = lexFormNames[j];
116 LexiconEntry lexEntry = lexicon.getEntry(lexFormName);
117 if (lexEntry == null) {
118 LexiconEntry newLexEntry = new LexiconEntry(lexiconName, lexFormName, null);
119 String lexiconQueryUrl = lexicon.getQueryUrl();
120 String remoteUrl = lexiconQueryUrl + lexFormName;
121 newLexEntry.setRemoteUrl(remoteUrl);
122 lexicon.addEntry(newLexEntry);
123 }
129 } 124 }
130 } 125 }
131 if (! lexicon.isEmpty()) 126 if (! lexicon.isEmpty())
132 retLexicons.add(lexicon); 127 retLexicons.add(lexicon);
133 } 128 }
141 * @param formName 136 * @param formName
142 * @param language 137 * @param language
143 * @return lexical entries obtained with the help of the morphology component (lexical entry of the stem of the normalized word form) 138 * @return lexical entries obtained with the help of the morphology component (lexical entry of the stem of the normalized word form)
144 * @throws ApplicationException 139 * @throws ApplicationException
145 */ 140 */
146 public ArrayList<String> getLexEntryKeys(String formName, String language, boolean normalize) throws ApplicationException { 141 public ArrayList<String> getLexEntryKeys(String formName, String language, int normMode) throws ApplicationException {
147 ArrayList<String> lexEntryKeys = new ArrayList<String>(); 142 ArrayList<String> lexEntryKeys = new ArrayList<String>();
148 MorphologyCache morphologyCache = MorphologyCache.getInstance(); 143 MorphologyCache morphologyCache = MorphologyCache.getInstance();
149 ArrayList<Lemma> formLemmas = morphologyCache.getLemmasByFormName(language, formName, normalize); 144 ArrayList<Lemma> formLemmas = morphologyCache.getLemmasByFormName(language, formName, normMode);
150 boolean hasLexEntry = false; 145 boolean hasLexEntry = false;
151 hasLexEntry = hasLexEntryKey(formName, language); 146 hasLexEntry = hasLexEntryKey(formName, language);
152 if (hasLexEntry) 147 if (hasLexEntry)
153 lexEntryKeys.add(formName); 148 lexEntryKeys.add(formName);
154 if (formLemmas != null) { 149 if (formLemmas != null) {
156 Lemma l = formLemmas.get(j); 151 Lemma l = formLemmas.get(j);
157 String lName = l.getLemmaName(); 152 String lName = l.getLemmaName();
158 if (! hasLexEntry) { 153 if (! hasLexEntry) {
159 hasLexEntry = hasLexEntryKey(lName, language); 154 hasLexEntry = hasLexEntryKey(lName, language);
160 } 155 }
161 if (language.equals("de") || language.equals("fr") || language.equals("nl")) // TODO Lexika für diese Sprachen in BerkeleyDB einbringen (für nl auch eine bessere Morph.) 156 if (language.equals("de") || language.equals("fr") || language.equals("nl")) // TODO Lexika für diese Sprachen in BerkeleyDB einbringen (für nl auch eine bessere Morph.)
162 lexEntryKeys.add(lName); 157 lexEntryKeys.add(lName);
163 if (! lName.equals(formName) && hasLexEntry) { 158 if (! lName.equals(formName) && hasLexEntry) {
164 lexEntryKeys.add(lName); 159 lexEntryKeys.add(lName);
165 } 160 }
166 } 161 }
186 } 181 }
187 } 182 }
188 return hasLexEntry; 183 return hasLexEntry;
189 } 184 }
190 185
191 public ArrayList<Lexicon> getLexEntriesBeginningWith(String language, String formPrefix, int pageNumber) throws ApplicationException { 186 public ArrayList<Lexicon> getLexEntriesBeginningWith(String language, String formPrefix, int pageNumber, int pageSize) throws ApplicationException {
192 int pageSize = 50;
193 int from = (pageNumber * pageSize) - pageSize + 1; 187 int from = (pageNumber * pageSize) - pageSize + 1;
194 int to = pageNumber * pageSize; 188 int to = pageNumber * pageSize;
195 ArrayList<Lexicon> statLexicons = Lexica.getInstance().getLocalLexicons(language); 189 ArrayList<Lexicon> statLexicons = Lexica.getInstance().getLocalLexicons(language);
196 ArrayList<Lexicon> retLexicons = null; 190 ArrayList<Lexicon> retLexicons = null;
197 if (statLexicons != null) { 191 if (statLexicons != null) {
209 } 203 }
210 } 204 }
211 return retLexicons; 205 return retLexicons;
212 } 206 }
213 207
214 public ArrayList<Lexicon> getLexEntriesByLexiconBeginningWith(String lexiconName, String formPrefix, int pageNumber) throws ApplicationException { 208 public ArrayList<Lexicon> getLexEntriesByLexiconBeginningWith(String lexiconName, String formPrefix, int pageNumber, int pageSize) throws ApplicationException {
215 int pageSize = 50;
216 int from = (pageNumber * pageSize) - pageSize + 1; 209 int from = (pageNumber * pageSize) - pageSize + 1;
217 int to = pageNumber * pageSize; 210 int to = pageNumber * pageSize;
218 Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName).clone(); 211 Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName).clone();
219 ArrayList<Lexicon> retLexicons = null; 212 ArrayList<Lexicon> retLexicons = null;
220 if (lexicon != null) { 213 if (lexicon != null) {
263 dbFoundValueStr = new String(foundValueBytes, "utf-8"); 256 dbFoundValueStr = new String(foundValueBytes, "utf-8");
264 } 257 }
265 cursor.close(); 258 cursor.close();
266 if (dbFoundValueStr != null) { 259 if (dbFoundValueStr != null) {
267 retLexEntry = new LexiconEntry(lexiconName, formName, dbFoundValueStr); 260 retLexEntry = new LexiconEntry(lexiconName, formName, dbFoundValueStr);
261 retLexEntry = correct(retLexEntry); // correct errors: e.g. in lsj some html entities are not correct
268 } 262 }
269 } catch (DatabaseException e) { 263 } catch (DatabaseException e) {
270 throw new ApplicationException(e); 264 throw new ApplicationException(e);
271 } catch (UnsupportedEncodingException e) { 265 } catch (UnsupportedEncodingException e) {
272 throw new ApplicationException(e); 266 throw new ApplicationException(e);
290 byte[] foundValueBytes = foundValue.getData(); 284 byte[] foundValueBytes = foundValue.getData();
291 dbFoundValueStr = new String(foundValueBytes, "utf-8"); 285 dbFoundValueStr = new String(foundValueBytes, "utf-8");
292 byte[] foundKeyBytes = dbEntryKey.getData(); 286 byte[] foundKeyBytes = dbEntryKey.getData();
293 String dbFoundKeyStr = new String(foundKeyBytes, "utf-8"); 287 String dbFoundKeyStr = new String(foundKeyBytes, "utf-8");
294 LexiconEntry lexEntry = new LexiconEntry(lexiconName, dbFoundKeyStr, dbFoundValueStr); 288 LexiconEntry lexEntry = new LexiconEntry(lexiconName, dbFoundKeyStr, dbFoundValueStr);
289 lexEntry = correct(lexEntry); // correct errors: e.g. in lsj some html entities are not correct
295 retLexEntries.add(lexEntry); 290 retLexEntries.add(lexEntry);
296 } 291 }
297 operationStatus = cursor.getNext(dbEntryKey, foundValue, LockMode.DEFAULT); 292 operationStatus = cursor.getNext(dbEntryKey, foundValue, LockMode.DEFAULT);
298 counter++; 293 counter++;
299 } 294 }
305 throw new ApplicationException(e); 300 throw new ApplicationException(e);
306 } catch (UnsupportedEncodingException e) { 301 } catch (UnsupportedEncodingException e) {
307 throw new ApplicationException(e); 302 throw new ApplicationException(e);
308 } 303 }
309 return retLexEntries; 304 return retLexEntries;
305 }
306
307 private LexiconEntry correct(LexiconEntry lexEntry) {
308 String lexiconName = lexEntry.getLexiconName();
309 String content = lexEntry.getContent();
310 if (content != null && content.contains("&#") && lexiconName.equals("lsj")) { // errors in greek lexicon lsj
311 content = content.replaceAll("&#\u03C7", "&#x"); // html entity: replace greek Minuskel Chi by "x"
312 content = content.replaceAll("&#x[^0-9]{4};", ""); // html entity: remove entity if special greek not hex characters appear
313 lexEntry.setContent(content);
314 }
315 return lexEntry;
310 } 316 }
311 317
312 public static void main(String[] args) throws ApplicationException { 318 public static void main(String[] args) throws ApplicationException {
313 getInstance(); 319 getInstance();
314 instance.beginOperation(); 320 instance.beginOperation();