comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/converter/Converter.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children fba5577e49d9
comparison
equal deleted inserted replaced
-1:000000000000 0:408254cf2f1d
1 package de.mpg.mpiwg.berlin.mpdl.lt.morph.converter;
2
3 import java.io.BufferedInputStream;
4 import java.io.BufferedOutputStream;
5 import java.io.BufferedReader;
6 import java.io.File;
7 import java.io.FileInputStream;
8 import java.io.FileNotFoundException;
9 import java.io.FileOutputStream;
10 import java.io.FileReader;
11 import java.io.IOException;
12 import java.io.InputStream;
13 import java.util.Date;
14 import java.util.Hashtable;
15
16 import org.xml.sax.InputSource;
17 import org.xml.sax.SAXException;
18 import org.xml.sax.XMLReader;
19
20 import com.sun.org.apache.xerces.internal.parsers.SAXParser;
21
22 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
23 import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
24 import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder;
25 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form;
26 import de.mpg.mpiwg.berlin.mpdl.util.Util;
27
28 public class Converter {
29 private static Converter instance;
30 private static String MPDL_DATA_DIR = MpdlConstants.MPDL_DATA_DIR;
31 private static String ORIG_PERSEUS_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/perseus";
32 private static String ORIG_CELEX_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/celex";
33 private static String ORIG_FRENCH_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/french";
34 private static String ORIG_ITALIAN_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/italian";
35 private static String ORIG_DONATUS_SUB_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/donatus-sup";
36 private static String OUT_DATA_DIR = MPDL_DATA_DIR + "/dataFiles";
37 private PerseusContentHandler perseusContentHandler;
38 private Hashtable<String, Hashtable<String, Form>> forms = new Hashtable<String, Hashtable<String, Form>>();
39 private Date beginOfOperation;
40 private Date endOfOperation;
41
42 public static Converter getInstance() throws ApplicationException {
43 if (instance == null) {
44 instance = new Converter();
45 }
46 return instance;
47 }
48
49 /**
50 *
51 */
52 public static void main(String[] args) throws ApplicationException {
53 getInstance();
54 instance.beginOperation();
55 System.out.print("Start ...");
56 /*
57 // Latin
58 String inputFileNameLatin = ORIG_PERSEUS_DATA_DIR + "/" + "latin.morph.xml";
59 String outputFileNameLatin = OUT_DATA_DIR + "/" + "perseus-latin-forms.xml";
60 instance.perseusConvert("perseus", "la", inputFileNameLatin, outputFileNameLatin);
61 String inputFileNameDonatusLatinSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-la-forms.csv";
62 String outputFileNameDonatusLatinSup = OUT_DATA_DIR + "/" + "donatus-sup-la-forms.xml";
63 instance.donatusSupplementsConvert("donatus-sup", "la", inputFileNameDonatusLatinSup, outputFileNameDonatusLatinSup);
64 instance.forms = new Hashtable<String, Hashtable<String, Form>>();
65 // Greek
66 String inputFileNameGreek = ORIG_PERSEUS_DATA_DIR + "/" + "greek.morph.xml";
67 String outputFileNameGreek = OUT_DATA_DIR + "/" + "perseus-greek-forms.xml";
68 instance.perseusConvert("perseus", "el", inputFileNameGreek, outputFileNameGreek);
69 String inputFileNameDonatusGreekSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-el-forms.csv";
70 String outputFileNameDonatusGreekSup = OUT_DATA_DIR + "/" + "donatus-sup-el-forms.xml";
71 instance.donatusSupplementsConvert("donatus-sup", "el", inputFileNameDonatusGreekSup, outputFileNameDonatusGreekSup);
72 instance.forms = new Hashtable<String, Hashtable<String, Form>>();
73 // Arabic
74 String inputFileNameArabic = ORIG_PERSEUS_DATA_DIR + "/" + "arabic.morph.xml";
75 String outputFileNameArabic = OUT_DATA_DIR + "/" + "perseus-arabic-forms.xml";
76 instance.perseusConvert("perseus", "ar", inputFileNameArabic, outputFileNameArabic);
77 String inputFileNameDonatusArabicSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-ar-forms.csv";
78 String outputFileNameDonatusArabicSup = OUT_DATA_DIR + "/" + "donatus-sup-ar-forms.xml";
79 instance.donatusSupplementsConvert("donatus-sup", "ar", inputFileNameDonatusArabicSup, outputFileNameDonatusArabicSup);
80 instance.forms = new Hashtable<String, Hashtable<String, Form>>();
81 // Dutch
82 String inputFileNameDutchWords = ORIG_CELEX_DATA_DIR + "/" + "dmw.cd";
83 String inputFileNameDutchLemmas = ORIG_CELEX_DATA_DIR + "/" + "dml.cd";
84 String outputFileNameDutch = OUT_DATA_DIR + "/" + "celex-dutch-forms.xml";
85 instance.celexConvert("celex", "nl", inputFileNameDutchWords, inputFileNameDutchLemmas, outputFileNameDutch);
86 instance.forms = new Hashtable<String, Hashtable<String, Form>>();
87 // German
88 String inputFileNameGermanWords = ORIG_CELEX_DATA_DIR + "/" + "gmw.cd";
89 String inputFileNameGermanLemmas = ORIG_CELEX_DATA_DIR + "/" + "gml.cd";
90 String outputFileNameGerman = OUT_DATA_DIR + "/" + "celex-german-forms.xml";
91 instance.celexConvert("celex", "de", inputFileNameGermanWords, inputFileNameGermanLemmas, outputFileNameGerman);
92 String inputFileNameDonatusGermanSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-de-forms.csv";
93 String outputFileNameDonatusGermanSup = OUT_DATA_DIR + "/" + "donatus-sup-de-forms.xml";
94 instance.donatusSupplementsConvert("donatus-sup", "de", inputFileNameDonatusGermanSup, outputFileNameDonatusGermanSup);
95 instance.forms = new Hashtable<String, Hashtable<String, Form>>();
96 // English
97 String inputFileNameEnglishWords = ORIG_CELEX_DATA_DIR + "/" + "emw.cd";
98 String inputFileNameEnglishLemmas = ORIG_CELEX_DATA_DIR + "/" + "eml.cd";
99 String outputFileNameEnglish = OUT_DATA_DIR + "/" + "celex-english-forms.xml";
100 instance.celexConvert("celex", "en", inputFileNameEnglishWords, inputFileNameEnglishLemmas, outputFileNameEnglish);
101 String inputFileNameDonatusEnglishSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-en-forms.csv";
102 String outputFileNameDonatusEnglishSup = OUT_DATA_DIR + "/" + "donatus-sup-en-forms.xml";
103 instance.donatusSupplementsConvert("donatus-sup", "en", inputFileNameDonatusEnglishSup, outputFileNameDonatusEnglishSup);
104 instance.forms = new Hashtable<String, Hashtable<String, Form>>();
105 // French
106 String inputFileNameFrench = ORIG_FRENCH_DATA_DIR + "/" + "lexique";
107 String outputFileNameFrench = OUT_DATA_DIR + "/" + "lexique-french-forms.xml";
108 instance.lexiqueConvert("lexique", "fr", inputFileNameFrench, outputFileNameFrench);
109 String inputFileNameDonatusFrenchSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-fr-forms.csv";
110 String outputFileNameDonatusFrenchSup = OUT_DATA_DIR + "/" + "donatus-sup-fr-forms.xml";
111 instance.donatusSupplementsConvert("donatus-sup", "fr", inputFileNameDonatusFrenchSup, outputFileNameDonatusFrenchSup);
112 instance.forms = new Hashtable<String, Hashtable<String, Form>>();
113 */
114 // Italian
115 String inputFileNameItalian = ORIG_ITALIAN_DATA_DIR + "/" + "ital.hash";
116 String outputFileNameItalian = OUT_DATA_DIR + "/" + "donatus-italian-forms.xml";
117 instance.donatusItalianConvert("donatus", "it", inputFileNameItalian, outputFileNameItalian);
118 /*
119 String inputFileNameDonatusItalianSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-it-forms.csv";
120 String outputFileNameDonatusItalianSup = OUT_DATA_DIR + "/" + "donatus-sup-it-forms.xml";
121 instance.donatusSupplementsConvert("donatus-sup", "it", inputFileNameDonatusItalianSup, outputFileNameDonatusItalianSup);
122 */
123 instance.forms = new Hashtable<String, Hashtable<String, Form>>();
124
125 instance.end();
126 instance.endOperation();
127 Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation);
128 System.out.println("End.");
129 System.out.println("Needed time: " + elapsedTime + " seconds");
130 }
131
132 private void perseusConvert(String provider, String language, String inputFileName, String outputFileName) throws ApplicationException {
133 File inputFile = new File(inputFileName);
134 perseusContentHandler = new PerseusContentHandler(provider, language, outputFileName);
135 try {
136 XMLReader xmlParser = new SAXParser();
137 xmlParser.setContentHandler(perseusContentHandler);
138 InputStream inputStream = new FileInputStream(inputFile);
139 BufferedInputStream bufferedInputStream = new BufferedInputStream(inputStream);
140 InputSource input = new InputSource(bufferedInputStream);
141 xmlParser.parse(input);
142 bufferedInputStream.close();
143 forms = perseusContentHandler.getForms();
144 } catch (SAXException e) {
145 throw new ApplicationException(e);
146 } catch (IOException e) {
147 throw new ApplicationException(e);
148 }
149 }
150
151 private void celexConvert(String provider, String language, String inputFileNameWords, String inputFileNameLemmas, String outputFileName) throws ApplicationException {
152 File inputFileLemmas = new File(inputFileNameLemmas);
153 Hashtable<Integer, String> lemmas = loadLemmas(inputFileLemmas);
154 File inputFileWords = new File(inputFileNameWords);
155 File outputFile = new File(outputFileName);
156 writeCelexForms(provider, language, lemmas, inputFileWords, outputFile);
157 }
158
159 private void lexiqueConvert(String provider, String language, String inputFileName, String outputFileName) throws ApplicationException {
160 File inputFile = new File(inputFileName);
161 File outputFile = new File(outputFileName);
162 writeLexiqueForms(provider, language, inputFile, outputFile);
163 }
164
165 private void donatusItalianConvert(String provider, String language, String inputFileName, String outputFileName) throws ApplicationException {
166 File inputFile = new File(inputFileName);
167 File outputFile = new File(outputFileName);
168 writeDonatusItalianForms(provider, language, inputFile, outputFile);
169 }
170
171 private void donatusSupplementsConvert(String provider, String language, String inputFileName, String outputFileName) throws ApplicationException {
172 File inputFile = new File(inputFileName);
173 File outputFile = new File(outputFileName);
174 writeDonatusSupplementsForms(provider, language, inputFile, outputFile);
175 }
176
177 private Hashtable<Integer, String> loadLemmas(File inputFile) {
178 Hashtable<Integer, String> retLemmas = new Hashtable<Integer, String>();
179 BufferedReader in = null;
180 try {
181 in = new BufferedReader(new FileReader(inputFile));
182 String line = null;
183 while((line = in.readLine()) != null) {
184 int from = line.indexOf("\\");
185 int to = line.indexOf("\\", from + 1);
186 String idStr = line.substring(0, from);
187 Integer idInt = new Integer(idStr);
188 String lemma = line.substring(from + 1, to);
189 retLemmas.put(idInt, lemma);
190 }
191 } catch (FileNotFoundException e) {
192 e.printStackTrace();
193 } catch (IOException e) {
194 e.printStackTrace();
195 } finally {
196 // always close the stream
197 if (in != null) try { in.close(); } catch (Exception e) { }
198 }
199 return retLemmas;
200 }
201
202 private void writeCelexForms(String provider, String language, Hashtable<Integer, String> lemmas, File inputFileWords, File outputFile) throws ApplicationException {
203 BufferedReader in = null;
204 BufferedOutputStream out = null;
205 forms = new Hashtable<String, Hashtable<String, Form>>();
206 try {
207 in = new BufferedReader(new FileReader(inputFileWords));
208 out = new BufferedOutputStream(new FileOutputStream(outputFile));
209 write("<forms>\n", out);
210 String line = null;
211 while((line = in.readLine()) != null) {
212 int delim1 = line.indexOf("\\");
213 int delim2 = line.indexOf("\\", delim1 + 1);
214 int delim3 = line.indexOf("\\", delim2 + 1);
215 int delim4 = line.indexOf("\\", delim3 + 1);
216 String formName = line.substring(delim1 + 1, delim2);
217 String lemmaIdStr = line.substring(delim3 + 1, delim4);
218 Integer lemmaIdInt = null;
219 try {
220 lemmaIdInt = new Integer(lemmaIdStr);
221 } catch (NumberFormatException e) {
222 System.out.println("Warning: Lemma id: " + lemmaIdStr + " is not correct");
223 }
224 if (lemmaIdInt != null) {
225 String lemmaName = lemmas.get(lemmaIdInt);
226 Form form = new Form();
227 form.setProvider(provider);
228 form.setLanguage(language);
229 form.setFormName(formName);
230 form.setLemmaName(lemmaName);
231 form.normalize();
232 if (form.isOk()) {
233 Hashtable<String, Form> formLemmas = forms.get(formName);
234 if (formLemmas == null) {
235 formLemmas = new Hashtable<String, Form>();
236 formLemmas.put(lemmaName, form);
237 forms.put(formName, formLemmas);
238 write(form, out);
239 } else {
240 Form formLemma = formLemmas.get(lemmaName);
241 if (formLemma == null) {
242 formLemmas.put(lemmaName, form);
243 write(form, out);
244 }
245 }
246 }
247 }
248 }
249 write("</forms>\n", out);
250 } catch (FileNotFoundException e) {
251 throw new ApplicationException(e);
252 } catch (IOException e) {
253 throw new ApplicationException(e);
254 } finally {
255 // always close the stream
256 if (in != null) try { in.close(); } catch (Exception e) { }
257 if (out != null) try { out.close(); } catch (Exception e) { }
258 }
259 }
260
261 private void writeLexiqueForms(String provider, String language, File inputFile, File outputFile) throws ApplicationException {
262 BufferedReader in = null;
263 BufferedOutputStream out = null;
264 forms = new Hashtable<String, Hashtable<String, Form>>();
265 try {
266 in = new BufferedReader(new FileReader(inputFile));
267 out = new BufferedOutputStream(new FileOutputStream(outputFile));
268 write("<forms>\n", out);
269 String line = null;
270 while((line = in.readLine()) != null) {
271 int delim1 = line.indexOf("\t");
272 int delim2 = line.indexOf("\t", delim1 + 1);
273 String formName = line.substring(0, delim1).trim();
274 String lemmaName = line.substring(delim1 + 1, delim2).trim();
275 if (lemmaName.equals("="))
276 lemmaName = formName;
277 Form form = new Form();
278 form.setProvider(provider);
279 form.setLanguage(language);
280 form.setFormName(formName);
281 form.setLemmaName(lemmaName);
282 form.normalize();
283 if (form.isOk()) {
284 Hashtable<String, Form> formLemmas = forms.get(formName);
285 if (formLemmas == null) {
286 formLemmas = new Hashtable<String, Form>();
287 formLemmas.put(lemmaName, form);
288 forms.put(formName, formLemmas);
289 write(form, out);
290 } else {
291 Form formLemma = formLemmas.get(lemmaName);
292 if (formLemma == null) {
293 formLemmas.put(lemmaName, form);
294 write(form, out);
295 }
296 }
297 }
298 }
299 write("</forms>\n", out);
300 } catch (FileNotFoundException e) {
301 throw new ApplicationException(e);
302 } catch (IOException e) {
303 throw new ApplicationException(e);
304 } finally {
305 // always close the stream
306 if (in != null) try { in.close(); } catch (Exception e) { }
307 if (out != null) try { out.close(); } catch (Exception e) { }
308 }
309 }
310
311 private void writeDonatusItalianForms(String provider, String language, File inputFile, File outputFile) throws ApplicationException {
312 BufferedReader in = null;
313 BufferedOutputStream out = null;
314 forms = new Hashtable<String, Hashtable<String, Form>>();
315 try {
316 in = new BufferedReader(new FileReader(inputFile));
317 out = new BufferedOutputStream(new FileOutputStream(outputFile));
318 write("<forms>\n", out);
319 String line = null;
320 while((line = in.readLine()) != null) {
321 // one line is of the form: 'risoluino' => '<NL>V risolvino,risolvere pres imperat 3rd pl ...</NL><NL>...</NL>',
322 // or of the form: 'legamenti' => '<NL>N legamento masc pl ...</NL><NL>...</NL>',
323 // this method only recognize the first lemma TODO recognize all lemmas for the form
324 int delim1 = line.indexOf("'");
325 int delim2 = line.indexOf("'", delim1 + 1);
326 int delim3 = line.indexOf("'", delim2 + 1);
327 int delim4 = delim3 + 6; // beginning of the lemma
328 int delim5 = line.indexOf(" ", delim4 + 1); // end of the first lemma(s) is separated by a blank
329 String formName = line.substring(delim1 + 1, delim2);
330 formName = formName.replace("\\", "");
331 String lemmaName = line.substring(delim4 + 1, delim5);
332 int commaInLemma = lemmaName.indexOf(","); // when there are more than one lemma
333 if (commaInLemma != -1)
334 lemmaName = lemmaName.substring(0, commaInLemma);
335 lemmaName = lemmaName.replace("\\", "");
336 Form form = new Form();
337 form.setProvider(provider);
338 form.setLanguage(language);
339 form.setFormName(formName);
340 form.setLemmaName(lemmaName);
341 form.normalize();
342 boolean lineContainsAp = line.contains("\''"); // some of the form lines contain irregular strings of the form: 'par\'' => '<NL>N pari/^,pari indeclform adverb</NL>
343 if (form.isOk() && ! lineContainsAp) {
344 Hashtable<String, Form> formLemmas = forms.get(formName);
345 if (formLemmas == null) {
346 formLemmas = new Hashtable<String, Form>();
347 formLemmas.put(lemmaName, form);
348 forms.put(formName, formLemmas);
349 write(form, out);
350 } else {
351 Form formLemma = formLemmas.get(lemmaName);
352 if (formLemma == null) {
353 formLemmas.put(lemmaName, form);
354 write(form, out);
355 }
356 }
357 }
358 }
359 write("</forms>\n", out);
360 } catch (FileNotFoundException e) {
361 throw new ApplicationException(e);
362 } catch (IOException e) {
363 throw new ApplicationException(e);
364 } finally {
365 // always close the stream
366 if (in != null) try { in.close(); } catch (Exception e) { }
367 if (out != null) try { out.close(); } catch (Exception e) { }
368 }
369 }
370
371 private void writeDonatusSupplementsForms(String provider, String language, File inputFile, File outputFile) throws ApplicationException {
372 BufferedReader in = null;
373 BufferedOutputStream out = null;
374 try {
375 in = new BufferedReader(new FileReader(inputFile));
376 out = new BufferedOutputStream(new FileOutputStream(outputFile));
377 write("<forms>\n", out);
378 String line = null;
379 String lemmaName = "";
380 String formName = "";
381 // each line is a form
382 while((line = in.readLine()) != null) {
383 if (line.length() == 0)
384 break;
385 String firstChar = line.substring(0, 1);
386 String mode = "lemmaAndForm";
387 if (firstChar.equals(","))
388 mode = "form";
389 if (mode.equals("lemmaAndForm")) {
390 int quote2 = line.indexOf("\"", 1);
391 lemmaName = line.substring(1, quote2);
392 int quote3 = line.indexOf("\"", quote2 + 1);
393 int quote4 = line.indexOf("\"", quote3 + 1);
394 formName = line.substring(quote3 + 1, quote4);
395 } else if (mode.equals("form")) {
396 int quote2 = line.indexOf("\"", 3);
397 formName = line.substring(2, quote2);
398 }
399 Form form = new Form();
400 form.setProvider(provider);
401 form.setLanguage(language);
402 form.setFormName(formName);
403 form.setLemmaName(lemmaName);
404 if (form.isGreek())
405 transcodeFromBetaCode2Unicode(form);
406 else if (form.isArabic())
407 form = transcodeFromBuckwalter2Unicode(form);
408 form.normalize();
409 if (form.isOk()) {
410 Hashtable<String, Form> formLemmas = forms.get(formName);
411 if (formLemmas == null) {
412 formLemmas = new Hashtable<String, Form>();
413 formLemmas.put(lemmaName, form);
414 forms.put(formName, formLemmas);
415 write(form, out);
416 } else {
417 Form formLemma = formLemmas.get(lemmaName);
418 if (formLemma == null) {
419 formLemmas.put(lemmaName, form);
420 write(form, out);
421 }
422 }
423 }
424 }
425 write("</forms>\n", out);
426 } catch (FileNotFoundException e) {
427 throw new ApplicationException(e);
428 } catch (IOException e) {
429 throw new ApplicationException(e);
430 } finally {
431 // always close the stream
432 if (in != null) try { in.close(); } catch (Exception e) { }
433 if (out != null) try { out.close(); } catch (Exception e) { }
434 }
435 }
436
437 private void write(Form form, BufferedOutputStream out) throws ApplicationException {
438 try {
439 String xmlFormStr = form.getXmlString();
440 byte[] bytes = xmlFormStr.getBytes("utf-8");
441 out.write(bytes, 0, bytes.length);
442 out.flush();
443 } catch (IOException e) {
444 throw new ApplicationException(e);
445 }
446 }
447
448 private void write(String inputString, BufferedOutputStream out) throws ApplicationException {
449 try {
450 byte[] bytes = inputString.getBytes("utf-8");
451 out.write(bytes, 0, bytes.length);
452 out.flush();
453 } catch (IOException e) {
454 throw new ApplicationException(e);
455 }
456 }
457
458 private Form transcodeFromBetaCode2Unicode(Form form) throws ApplicationException {
459 String formName = form.getFormName();
460 String lemmaName = form.getLemmaName();
461 Transcoder transcoder = Transcoder.getInstance();
462 String encodedUnicodeForm = transcoder.transcodeFromBetaCode2Unicode(formName);
463 String encodedUnicodeLemma = transcoder.transcodeFromBetaCode2Unicode(lemmaName);
464 form.setFormName(encodedUnicodeForm);
465 form.setLemmaName(encodedUnicodeLemma);
466 return form;
467 }
468
469 private Form transcodeFromBuckwalter2Unicode(Form form) throws ApplicationException {
470 String formName = form.getFormName();
471 String lemmaName = form.getLemmaName();
472 Transcoder transcoder = Transcoder.getInstance();
473 String encodedUnicodeForm = transcoder.transcodeFromBuckwalter2Unicode(formName);
474 String encodedUnicodeLemma = transcoder.transcodeFromBuckwalter2Unicode(lemmaName);
475 form.setFormName(encodedUnicodeForm);
476 form.setLemmaName(encodedUnicodeLemma);
477 return form;
478 }
479
480 private void end() throws ApplicationException {
481 }
482
483 private void beginOperation() {
484 beginOfOperation = new Date();
485 }
486
487 private void endOperation() {
488 endOfOperation = new Date();
489 }
490
491 }