Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/converter/Converter.java @ 12:fba5577e49d9
diverse Fehlerbehebungen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 19 Apr 2011 16:51:26 +0200 |
parents | 408254cf2f1d |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.morph.converter; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.util.Date; import java.util.Hashtable; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import com.sun.org.apache.xerces.internal.parsers.SAXParser; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder; import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; import de.mpg.mpiwg.berlin.mpdl.util.Util; public class Converter { private static Converter instance; private static String MPDL_DATA_DIR = MpdlConstants.MPDL_DATA_DIR; private static String ORIG_PERSEUS_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/perseus"; private static String ORIG_CELEX_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/celex"; private static String ORIG_FRENCH_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/french"; private static String ORIG_ITALIAN_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/italian"; private static String ORIG_DONATUS_SUB_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/donatus-sup"; private static String OUT_DATA_DIR = MPDL_DATA_DIR + "/dataFiles"; private PerseusContentHandler perseusContentHandler; private Hashtable<String, Hashtable<String, Form>> forms = new Hashtable<String, Hashtable<String, Form>>(); private Date beginOfOperation; private Date endOfOperation; public static Converter getInstance() throws ApplicationException { if (instance == null) { instance = new Converter(); } return instance; } /** * */ public static void main(String[] args) throws ApplicationException { getInstance(); instance.beginOperation(); System.out.print("Start ..."); /* // Latin String inputFileNameLatin = ORIG_PERSEUS_DATA_DIR + "/" + "latin.morph.xml"; String outputFileNameLatin = OUT_DATA_DIR + "/" + "perseus-latin-forms.xml"; instance.perseusConvert("perseus", "la", inputFileNameLatin, outputFileNameLatin); String inputFileNameDonatusLatinSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-la-forms.csv"; String outputFileNameDonatusLatinSup = OUT_DATA_DIR + "/" + "donatus-sup-la-forms.xml"; instance.donatusSupplementsConvert("donatus-sup", "la", inputFileNameDonatusLatinSup, outputFileNameDonatusLatinSup); instance.forms = new Hashtable<String, Hashtable<String, Form>>(); // Greek String inputFileNameGreek = ORIG_PERSEUS_DATA_DIR + "/" + "greek.morph.xml"; String outputFileNameGreek = OUT_DATA_DIR + "/" + "perseus-greek-forms.xml"; instance.perseusConvert("perseus", "el", inputFileNameGreek, outputFileNameGreek); String inputFileNameDonatusGreekSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-el-forms.csv"; String outputFileNameDonatusGreekSup = OUT_DATA_DIR + "/" + "donatus-sup-el-forms.xml"; instance.donatusSupplementsConvert("donatus-sup", "el", inputFileNameDonatusGreekSup, outputFileNameDonatusGreekSup); instance.forms = new Hashtable<String, Hashtable<String, Form>>(); // Arabic String inputFileNameArabic = ORIG_PERSEUS_DATA_DIR + "/" + "arabic.morph.xml"; String outputFileNameArabic = OUT_DATA_DIR + "/" + "perseus-arabic-forms.xml"; instance.perseusConvert("perseus", "ar", inputFileNameArabic, outputFileNameArabic); String inputFileNameDonatusArabicSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-ar-forms.csv"; String outputFileNameDonatusArabicSup = OUT_DATA_DIR + "/" + "donatus-sup-ar-forms.xml"; instance.donatusSupplementsConvert("donatus-sup", "ar", inputFileNameDonatusArabicSup, outputFileNameDonatusArabicSup); instance.forms = new Hashtable<String, Hashtable<String, Form>>(); // Dutch String inputFileNameDutchWords = ORIG_CELEX_DATA_DIR + "/" + "dmw.cd"; String inputFileNameDutchLemmas = ORIG_CELEX_DATA_DIR + "/" + "dml.cd"; String outputFileNameDutch = OUT_DATA_DIR + "/" + "celex-dutch-forms.xml"; instance.celexConvert("celex", "nl", inputFileNameDutchWords, inputFileNameDutchLemmas, outputFileNameDutch); instance.forms = new Hashtable<String, Hashtable<String, Form>>(); // German String inputFileNameGermanWords = ORIG_CELEX_DATA_DIR + "/" + "gmw.cd"; String inputFileNameGermanLemmas = ORIG_CELEX_DATA_DIR + "/" + "gml.cd"; String outputFileNameGerman = OUT_DATA_DIR + "/" + "celex-german-forms.xml"; instance.celexConvert("celex", "de", inputFileNameGermanWords, inputFileNameGermanLemmas, outputFileNameGerman); String inputFileNameDonatusGermanSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-de-forms.csv"; String outputFileNameDonatusGermanSup = OUT_DATA_DIR + "/" + "donatus-sup-de-forms.xml"; instance.donatusSupplementsConvert("donatus-sup", "de", inputFileNameDonatusGermanSup, outputFileNameDonatusGermanSup); instance.forms = new Hashtable<String, Hashtable<String, Form>>(); // English String inputFileNameEnglishWords = ORIG_CELEX_DATA_DIR + "/" + "emw.cd"; String inputFileNameEnglishLemmas = ORIG_CELEX_DATA_DIR + "/" + "eml.cd"; String outputFileNameEnglish = OUT_DATA_DIR + "/" + "celex-english-forms.xml"; instance.celexConvert("celex", "en", inputFileNameEnglishWords, inputFileNameEnglishLemmas, outputFileNameEnglish); String inputFileNameDonatusEnglishSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-en-forms.csv"; String outputFileNameDonatusEnglishSup = OUT_DATA_DIR + "/" + "donatus-sup-en-forms.xml"; instance.donatusSupplementsConvert("donatus-sup", "en", inputFileNameDonatusEnglishSup, outputFileNameDonatusEnglishSup); instance.forms = new Hashtable<String, Hashtable<String, Form>>(); // French String inputFileNameFrench = ORIG_FRENCH_DATA_DIR + "/" + "lexique"; String outputFileNameFrench = OUT_DATA_DIR + "/" + "lexique-french-forms.xml"; instance.lexiqueConvert("lexique", "fr", inputFileNameFrench, outputFileNameFrench); String inputFileNameDonatusFrenchSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-fr-forms.csv"; String outputFileNameDonatusFrenchSup = OUT_DATA_DIR + "/" + "donatus-sup-fr-forms.xml"; instance.donatusSupplementsConvert("donatus-sup", "fr", inputFileNameDonatusFrenchSup, outputFileNameDonatusFrenchSup); instance.forms = new Hashtable<String, Hashtable<String, Form>>(); // Italian String inputFileNameItalian = ORIG_ITALIAN_DATA_DIR + "/" + "ital.hash"; String outputFileNameItalian = OUT_DATA_DIR + "/" + "donatus-italian-forms.xml"; instance.donatusItalianConvert("donatus", "it", inputFileNameItalian, outputFileNameItalian); String inputFileNameDonatusItalianSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-it-forms.csv"; String outputFileNameDonatusItalianSup = OUT_DATA_DIR + "/" + "donatus-sup-it-forms.xml"; instance.donatusSupplementsConvert("donatus-sup", "it", inputFileNameDonatusItalianSup, outputFileNameDonatusItalianSup); */ instance.forms = new Hashtable<String, Hashtable<String, Form>>(); instance.end(); instance.endOperation(); Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); System.out.println("End."); System.out.println("Needed time: " + elapsedTime + " seconds"); } private void perseusConvert(String provider, String language, String inputFileName, String outputFileName) throws ApplicationException { File inputFile = new File(inputFileName); perseusContentHandler = new PerseusContentHandler(provider, language, outputFileName); try { XMLReader xmlParser = new SAXParser(); xmlParser.setContentHandler(perseusContentHandler); InputStream inputStream = new FileInputStream(inputFile); BufferedInputStream bufferedInputStream = new BufferedInputStream(inputStream); InputSource input = new InputSource(bufferedInputStream); xmlParser.parse(input); bufferedInputStream.close(); forms = perseusContentHandler.getForms(); } catch (SAXException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } } private void celexConvert(String provider, String language, String inputFileNameWords, String inputFileNameLemmas, String outputFileName) throws ApplicationException { File inputFileLemmas = new File(inputFileNameLemmas); Hashtable<Integer, String> lemmas = loadLemmas(inputFileLemmas); File inputFileWords = new File(inputFileNameWords); File outputFile = new File(outputFileName); writeCelexForms(provider, language, lemmas, inputFileWords, outputFile); } private void lexiqueConvert(String provider, String language, String inputFileName, String outputFileName) throws ApplicationException { File inputFile = new File(inputFileName); File outputFile = new File(outputFileName); writeLexiqueForms(provider, language, inputFile, outputFile); } private void donatusItalianConvert(String provider, String language, String inputFileName, String outputFileName) throws ApplicationException { File inputFile = new File(inputFileName); File outputFile = new File(outputFileName); writeDonatusItalianForms(provider, language, inputFile, outputFile); } private void donatusSupplementsConvert(String provider, String language, String inputFileName, String outputFileName) throws ApplicationException { File inputFile = new File(inputFileName); File outputFile = new File(outputFileName); writeDonatusSupplementsForms(provider, language, inputFile, outputFile); } private Hashtable<Integer, String> loadLemmas(File inputFile) { Hashtable<Integer, String> retLemmas = new Hashtable<Integer, String>(); BufferedReader in = null; try { in = new BufferedReader(new FileReader(inputFile)); String line = null; while((line = in.readLine()) != null) { int from = line.indexOf("\\"); int to = line.indexOf("\\", from + 1); String idStr = line.substring(0, from); Integer idInt = new Integer(idStr); String lemma = line.substring(from + 1, to); retLemmas.put(idInt, lemma); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { // always close the stream if (in != null) try { in.close(); } catch (Exception e) { } } return retLemmas; } private void writeCelexForms(String provider, String language, Hashtable<Integer, String> lemmas, File inputFileWords, File outputFile) throws ApplicationException { BufferedReader in = null; BufferedOutputStream out = null; forms = new Hashtable<String, Hashtable<String, Form>>(); try { in = new BufferedReader(new FileReader(inputFileWords)); out = new BufferedOutputStream(new FileOutputStream(outputFile)); write("<forms>\n", out); String line = null; while((line = in.readLine()) != null) { int delim1 = line.indexOf("\\"); int delim2 = line.indexOf("\\", delim1 + 1); int delim3 = line.indexOf("\\", delim2 + 1); int delim4 = line.indexOf("\\", delim3 + 1); String formName = line.substring(delim1 + 1, delim2); String lemmaIdStr = line.substring(delim3 + 1, delim4); Integer lemmaIdInt = null; try { lemmaIdInt = new Integer(lemmaIdStr); } catch (NumberFormatException e) { System.out.println("Warning: Lemma id: " + lemmaIdStr + " is not correct"); } if (lemmaIdInt != null) { String lemmaName = lemmas.get(lemmaIdInt); Form form = new Form(); form.setProvider(provider); form.setLanguage(language); form.setFormName(formName); form.setLemmaName(lemmaName); form.normalize(); if (form.isOk()) { Hashtable<String, Form> formLemmas = forms.get(formName); if (formLemmas == null) { formLemmas = new Hashtable<String, Form>(); formLemmas.put(lemmaName, form); forms.put(formName, formLemmas); write(form, out); } else { Form formLemma = formLemmas.get(lemmaName); if (formLemma == null) { formLemmas.put(lemmaName, form); write(form, out); } } } } } write("</forms>\n", out); } catch (FileNotFoundException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } finally { // always close the stream if (in != null) try { in.close(); } catch (Exception e) { } if (out != null) try { out.close(); } catch (Exception e) { } } } private void writeLexiqueForms(String provider, String language, File inputFile, File outputFile) throws ApplicationException { BufferedReader in = null; BufferedOutputStream out = null; forms = new Hashtable<String, Hashtable<String, Form>>(); try { in = new BufferedReader(new FileReader(inputFile)); out = new BufferedOutputStream(new FileOutputStream(outputFile)); write("<forms>\n", out); String line = null; while((line = in.readLine()) != null) { int delim1 = line.indexOf("\t"); int delim2 = line.indexOf("\t", delim1 + 1); String formName = line.substring(0, delim1).trim(); String lemmaName = line.substring(delim1 + 1, delim2).trim(); if (lemmaName.equals("=")) lemmaName = formName; Form form = new Form(); form.setProvider(provider); form.setLanguage(language); form.setFormName(formName); form.setLemmaName(lemmaName); form.normalize(); if (form.isOk()) { Hashtable<String, Form> formLemmas = forms.get(formName); if (formLemmas == null) { formLemmas = new Hashtable<String, Form>(); formLemmas.put(lemmaName, form); forms.put(formName, formLemmas); write(form, out); } else { Form formLemma = formLemmas.get(lemmaName); if (formLemma == null) { formLemmas.put(lemmaName, form); write(form, out); } } } } write("</forms>\n", out); } catch (FileNotFoundException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } finally { // always close the stream if (in != null) try { in.close(); } catch (Exception e) { } if (out != null) try { out.close(); } catch (Exception e) { } } } private void writeDonatusItalianForms(String provider, String language, File inputFile, File outputFile) throws ApplicationException { BufferedReader in = null; BufferedOutputStream out = null; forms = new Hashtable<String, Hashtable<String, Form>>(); try { in = new BufferedReader(new FileReader(inputFile)); out = new BufferedOutputStream(new FileOutputStream(outputFile)); write("<forms>\n", out); String line = null; while((line = in.readLine()) != null) { // one line is of the form: 'risoluino' => '<NL>V risolvino,risolvere pres imperat 3rd pl ...</NL><NL>...</NL>', // or of the form: 'legamenti' => '<NL>N legamento masc pl ...</NL><NL>...</NL>', // this method only recognize the first lemma TODO recognize all lemmas for the form int delim1 = line.indexOf("'"); int delim2 = line.indexOf("'", delim1 + 1); int delim3 = line.indexOf("'", delim2 + 1); int delim4 = delim3 + 6; // beginning of the lemma int delim5 = line.indexOf(" ", delim4 + 1); // end of the first lemma(s) is separated by a blank String formName = line.substring(delim1 + 1, delim2); formName = formName.replace("\\", ""); String lemmaName = line.substring(delim4 + 1, delim5); int commaInLemma = lemmaName.indexOf(","); // when there are more than one lemma if (commaInLemma != -1) lemmaName = lemmaName.substring(0, commaInLemma); lemmaName = lemmaName.replace("\\", ""); Form form = new Form(); form.setProvider(provider); form.setLanguage(language); form.setFormName(formName); form.setLemmaName(lemmaName); form.normalize(); boolean lineContainsAp = line.contains("\''"); // some of the form lines contain irregular strings of the form: 'par\'' => '<NL>N pari/^,pari indeclform adverb</NL> if (form.isOk() && ! lineContainsAp) { Hashtable<String, Form> formLemmas = forms.get(formName); if (formLemmas == null) { formLemmas = new Hashtable<String, Form>(); formLemmas.put(lemmaName, form); forms.put(formName, formLemmas); write(form, out); } else { Form formLemma = formLemmas.get(lemmaName); if (formLemma == null) { formLemmas.put(lemmaName, form); write(form, out); } } } } write("</forms>\n", out); } catch (FileNotFoundException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } finally { // always close the stream if (in != null) try { in.close(); } catch (Exception e) { } if (out != null) try { out.close(); } catch (Exception e) { } } } private void writeDonatusSupplementsForms(String provider, String language, File inputFile, File outputFile) throws ApplicationException { BufferedReader in = null; BufferedOutputStream out = null; try { in = new BufferedReader(new FileReader(inputFile)); out = new BufferedOutputStream(new FileOutputStream(outputFile)); write("<forms>\n", out); String line = null; String lemmaName = ""; String formName = ""; // each line is a form while((line = in.readLine()) != null) { if (line.length() == 0) break; String firstChar = line.substring(0, 1); String mode = "lemmaAndForm"; if (firstChar.equals(",")) mode = "form"; if (mode.equals("lemmaAndForm")) { int quote2 = line.indexOf("\"", 1); lemmaName = line.substring(1, quote2); int quote3 = line.indexOf("\"", quote2 + 1); int quote4 = line.indexOf("\"", quote3 + 1); formName = line.substring(quote3 + 1, quote4); } else if (mode.equals("form")) { int quote2 = line.indexOf("\"", 3); formName = line.substring(2, quote2); } Form form = new Form(); form.setProvider(provider); form.setLanguage(language); form.setFormName(formName); form.setLemmaName(lemmaName); if (form.isGreek()) transcodeFromBetaCode2Unicode(form); else if (form.isArabic()) form = transcodeFromBuckwalter2Unicode(form); form.normalize(); if (form.isOk()) { Hashtable<String, Form> formLemmas = forms.get(formName); if (formLemmas == null) { formLemmas = new Hashtable<String, Form>(); formLemmas.put(lemmaName, form); forms.put(formName, formLemmas); write(form, out); } else { Form formLemma = formLemmas.get(lemmaName); if (formLemma == null) { formLemmas.put(lemmaName, form); write(form, out); } } } } write("</forms>\n", out); } catch (FileNotFoundException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } finally { // always close the stream if (in != null) try { in.close(); } catch (Exception e) { } if (out != null) try { out.close(); } catch (Exception e) { } } } private void write(Form form, BufferedOutputStream out) throws ApplicationException { try { String xmlFormStr = form.getXmlString(); byte[] bytes = xmlFormStr.getBytes("utf-8"); out.write(bytes, 0, bytes.length); out.flush(); } catch (IOException e) { throw new ApplicationException(e); } } private void write(String inputString, BufferedOutputStream out) throws ApplicationException { try { byte[] bytes = inputString.getBytes("utf-8"); out.write(bytes, 0, bytes.length); out.flush(); } catch (IOException e) { throw new ApplicationException(e); } } private Form transcodeFromBetaCode2Unicode(Form form) throws ApplicationException { String formName = form.getFormName(); String lemmaName = form.getLemmaName(); Transcoder transcoder = Transcoder.getInstance(); String encodedUnicodeForm = transcoder.transcodeFromBetaCode2Unicode(formName); String encodedUnicodeLemma = transcoder.transcodeFromBetaCode2Unicode(lemmaName); // replace "small letter sigma" at the end of a word by the "small letter end sigma" if (encodedUnicodeForm != null && encodedUnicodeForm.endsWith("σ")) { int length = encodedUnicodeForm.length(); encodedUnicodeForm = encodedUnicodeForm.substring(0, length - 1) + "ς"; } if (encodedUnicodeLemma != null && encodedUnicodeLemma.endsWith("σ")) { int length = encodedUnicodeLemma.length(); encodedUnicodeLemma = encodedUnicodeLemma.substring(0, length - 1) + "ς"; } form.setFormName(encodedUnicodeForm); form.setLemmaName(encodedUnicodeLemma); return form; } private Form transcodeFromBuckwalter2Unicode(Form form) throws ApplicationException { String formName = form.getFormName(); String lemmaName = form.getLemmaName(); Transcoder transcoder = Transcoder.getInstance(); String encodedUnicodeForm = transcoder.transcodeFromBuckwalter2Unicode(formName); String encodedUnicodeLemma = transcoder.transcodeFromBuckwalter2Unicode(lemmaName); form.setFormName(encodedUnicodeForm); form.setLemmaName(encodedUnicodeLemma); return form; } private void end() throws ApplicationException { } private void beginOperation() { beginOfOperation = new Date(); } private void endOperation() { endOfOperation = new Date(); } }