view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/converter/Converter.java @ 12:fba5577e49d9

diverse Fehlerbehebungen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 19 Apr 2011 16:51:26 +0200
parents 408254cf2f1d
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.morph.converter;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
import java.util.Hashtable;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import com.sun.org.apache.xerces.internal.parsers.SAXParser;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form;
import de.mpg.mpiwg.berlin.mpdl.util.Util;

public class Converter {
  // Lazily created singleton instance, see getInstance().
  private static Converter instance;
  // Root directory of the data files, taken from the project constants.
  private static String MPDL_DATA_DIR = MpdlConstants.MPDL_DATA_DIR;
  // Directories of the original (unconverted) input files, one per source.
  private static String ORIG_PERSEUS_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/perseus";
  private static String ORIG_CELEX_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/celex";
  private static String ORIG_FRENCH_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/french";
  private static String ORIG_ITALIAN_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/italian";
  // NOTE: the constant is named "SUB" although the directory is "donatus-sup".
  private static String ORIG_DONATUS_SUB_DATA_DIR = MPDL_DATA_DIR + "/dataFilesOrig/donatus-sup";
  // Used as the directory of the output files by the conversion calls in main().
  private static String OUT_DATA_DIR = MPDL_DATA_DIR + "/dataFiles";
  // SAX content handler, created anew by each call of perseusConvert().
  private PerseusContentHandler perseusContentHandler;
  // Forms seen so far: form name -> (lemma name -> form); used by the write*Forms
  // methods to emit each form/lemma pair only once.
  private Hashtable<String, Hashtable<String, Form>> forms = new Hashtable<String, Hashtable<String, Form>>();
  // Timestamps set by beginOperation() and endOperation(); main() reports the difference.
  private Date beginOfOperation;
  private Date endOfOperation;
  
  /**
   * Returns the shared converter, creating it on the first call.
   * The creation is not synchronized.
   *
   * @return the single Converter instance
   * @throws ApplicationException declared for callers; not thrown by the visible code
   */
  public static Converter getInstance() throws ApplicationException {
    if (instance != null) {
      return instance;
    }
    instance = new Converter();
    return instance;
  }

  /**
   * Command line entry point. Obtains the converter, runs the conversions that
   * are currently enabled in the body (all of them are commented out at the
   * moment) and prints the elapsed time to standard output.
   *
   * @param args command line arguments (not used)
   * @throws ApplicationException if a conversion fails
   */
  public static void main(String[] args) throws ApplicationException {
    Converter converter = getInstance();
    converter.beginOperation();
    System.out.print("Start ...");
    // The block below lists the available conversions per language. It is kept
    // as a template: enable the parts that should be run. The static field
    // "instance" used there refers to the same object as "converter".
    /*
    // Latin
    String inputFileNameLatin = ORIG_PERSEUS_DATA_DIR + "/" + "latin.morph.xml";
    String outputFileNameLatin = OUT_DATA_DIR + "/" + "perseus-latin-forms.xml";
    instance.perseusConvert("perseus", "la", inputFileNameLatin, outputFileNameLatin);
    String inputFileNameDonatusLatinSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-la-forms.csv";
    String outputFileNameDonatusLatinSup = OUT_DATA_DIR + "/" + "donatus-sup-la-forms.xml";
    instance.donatusSupplementsConvert("donatus-sup", "la", inputFileNameDonatusLatinSup, outputFileNameDonatusLatinSup);
    instance.forms = new Hashtable<String, Hashtable<String, Form>>();
    // Greek
    String inputFileNameGreek = ORIG_PERSEUS_DATA_DIR + "/" + "greek.morph.xml";
    String outputFileNameGreek = OUT_DATA_DIR + "/" + "perseus-greek-forms.xml";
    instance.perseusConvert("perseus", "el", inputFileNameGreek, outputFileNameGreek);
    String inputFileNameDonatusGreekSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-el-forms.csv";
    String outputFileNameDonatusGreekSup = OUT_DATA_DIR + "/" + "donatus-sup-el-forms.xml";
    instance.donatusSupplementsConvert("donatus-sup", "el", inputFileNameDonatusGreekSup, outputFileNameDonatusGreekSup);
    instance.forms = new Hashtable<String, Hashtable<String, Form>>();
    // Arabic
    String inputFileNameArabic = ORIG_PERSEUS_DATA_DIR + "/" + "arabic.morph.xml";
    String outputFileNameArabic = OUT_DATA_DIR + "/" + "perseus-arabic-forms.xml";
    instance.perseusConvert("perseus", "ar", inputFileNameArabic, outputFileNameArabic);
    String inputFileNameDonatusArabicSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-ar-forms.csv";
    String outputFileNameDonatusArabicSup = OUT_DATA_DIR + "/" + "donatus-sup-ar-forms.xml";
    instance.donatusSupplementsConvert("donatus-sup", "ar", inputFileNameDonatusArabicSup, outputFileNameDonatusArabicSup);
    instance.forms = new Hashtable<String, Hashtable<String, Form>>();
    // Dutch
    String inputFileNameDutchWords = ORIG_CELEX_DATA_DIR + "/" + "dmw.cd";
    String inputFileNameDutchLemmas = ORIG_CELEX_DATA_DIR + "/" + "dml.cd";
    String outputFileNameDutch = OUT_DATA_DIR + "/" + "celex-dutch-forms.xml";
    instance.celexConvert("celex", "nl", inputFileNameDutchWords, inputFileNameDutchLemmas, outputFileNameDutch);
    instance.forms = new Hashtable<String, Hashtable<String, Form>>();
    // German
    String inputFileNameGermanWords = ORIG_CELEX_DATA_DIR + "/" + "gmw.cd";
    String inputFileNameGermanLemmas = ORIG_CELEX_DATA_DIR + "/" + "gml.cd";
    String outputFileNameGerman = OUT_DATA_DIR + "/" + "celex-german-forms.xml";
    instance.celexConvert("celex", "de", inputFileNameGermanWords, inputFileNameGermanLemmas, outputFileNameGerman);
    String inputFileNameDonatusGermanSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-de-forms.csv";
    String outputFileNameDonatusGermanSup = OUT_DATA_DIR + "/" + "donatus-sup-de-forms.xml";
    instance.donatusSupplementsConvert("donatus-sup", "de", inputFileNameDonatusGermanSup, outputFileNameDonatusGermanSup);
    instance.forms = new Hashtable<String, Hashtable<String, Form>>();
    // English
    String inputFileNameEnglishWords = ORIG_CELEX_DATA_DIR + "/" + "emw.cd";
    String inputFileNameEnglishLemmas = ORIG_CELEX_DATA_DIR + "/" + "eml.cd";
    String outputFileNameEnglish = OUT_DATA_DIR + "/" + "celex-english-forms.xml";
    instance.celexConvert("celex", "en", inputFileNameEnglishWords, inputFileNameEnglishLemmas, outputFileNameEnglish);
    String inputFileNameDonatusEnglishSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-en-forms.csv";
    String outputFileNameDonatusEnglishSup = OUT_DATA_DIR + "/" + "donatus-sup-en-forms.xml";
    instance.donatusSupplementsConvert("donatus-sup", "en", inputFileNameDonatusEnglishSup, outputFileNameDonatusEnglishSup);
    instance.forms = new Hashtable<String, Hashtable<String, Form>>();
    // French
    String inputFileNameFrench = ORIG_FRENCH_DATA_DIR + "/" + "lexique";
    String outputFileNameFrench = OUT_DATA_DIR + "/" + "lexique-french-forms.xml";
    instance.lexiqueConvert("lexique", "fr", inputFileNameFrench, outputFileNameFrench);
    String inputFileNameDonatusFrenchSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-fr-forms.csv";
    String outputFileNameDonatusFrenchSup = OUT_DATA_DIR + "/" + "donatus-sup-fr-forms.xml";
    instance.donatusSupplementsConvert("donatus-sup", "fr", inputFileNameDonatusFrenchSup, outputFileNameDonatusFrenchSup);
    instance.forms = new Hashtable<String, Hashtable<String, Form>>();
    // Italian
    String inputFileNameItalian = ORIG_ITALIAN_DATA_DIR + "/" + "ital.hash";
    String outputFileNameItalian = OUT_DATA_DIR + "/" + "donatus-italian-forms.xml";
    instance.donatusItalianConvert("donatus", "it", inputFileNameItalian, outputFileNameItalian);
    String inputFileNameDonatusItalianSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-it-forms.csv";
    String outputFileNameDonatusItalianSup = OUT_DATA_DIR + "/" + "donatus-sup-it-forms.xml";
    instance.donatusSupplementsConvert("donatus-sup", "it", inputFileNameDonatusItalianSup, outputFileNameDonatusItalianSup);
    */
    // drop the collected forms of the last conversion
    converter.forms = new Hashtable<String, Hashtable<String, Form>>();

    converter.end();
    converter.endOperation();
    Double neededSeconds = new Util().getSecondWithMillisecondsBetween(converter.beginOfOperation, converter.endOfOperation);
    System.out.println("End.");
    System.out.println("Needed time: " + neededSeconds + " seconds");
  }

  /**
   * Parses a Perseus morphology XML file with a SAX parser and takes over the
   * forms collected by the content handler.
   *
   * @param provider       provider name handed to the content handler
   * @param language       language code handed to the content handler
   * @param inputFileName  path of the XML file to parse
   * @param outputFileName path handed to the content handler
   * @throws ApplicationException if the file cannot be read or parsed
   */
  private void perseusConvert(String provider, String language, String inputFileName, String outputFileName) throws ApplicationException {
    File inputFile = new File(inputFileName);
    perseusContentHandler = new PerseusContentHandler(provider, language, outputFileName);
    BufferedInputStream bufferedInputStream = null;
    try {
      // NOTE(review): SAXParser is the JDK-internal Xerces class imported by this file
      XMLReader xmlParser = new SAXParser();
      xmlParser.setContentHandler(perseusContentHandler);
      bufferedInputStream = new BufferedInputStream(new FileInputStream(inputFile));
      InputSource input = new InputSource(bufferedInputStream);
      xmlParser.parse(input);
      forms = perseusContentHandler.getForms();
    } catch (SAXException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    } finally {
      // always close the stream, also when parsing fails
      if (bufferedInputStream != null) try { bufferedInputStream.close(); } catch (IOException ignored) { }
    }
  }
  
  /**
   * Converts a CELEX word file: loads the lemma table first and then lets
   * writeCelexForms() write the forms of the word file to the output file.
   *
   * @param provider            provider name stored in each form
   * @param language            language code stored in each form
   * @param inputFileNameWords  path of the word (form) file
   * @param inputFileNameLemmas path of the lemma file
   * @param outputFileName      path of the file to write
   * @throws ApplicationException if reading the words or writing the output fails
   */
  private void celexConvert(String provider, String language, String inputFileNameWords, String inputFileNameLemmas, String outputFileName) throws ApplicationException {
    Hashtable<Integer, String> lemmaTable = loadLemmas(new File(inputFileNameLemmas));
    writeCelexForms(provider, language, lemmaTable, new File(inputFileNameWords), new File(outputFileName));
  }

  /**
   * Converts a Lexique file by delegating to writeLexiqueForms().
   *
   * @param provider       provider name stored in each form
   * @param language       language code stored in each form
   * @param inputFileName  path of the file to read
   * @param outputFileName path of the file to write
   * @throws ApplicationException if reading or writing fails
   */
  private void lexiqueConvert(String provider, String language, String inputFileName, String outputFileName) throws ApplicationException {
    writeLexiqueForms(provider, language, new File(inputFileName), new File(outputFileName));
  }

  /**
   * Converts the Donatus Italian file by delegating to writeDonatusItalianForms().
   *
   * @param provider       provider name stored in each form
   * @param language       language code stored in each form
   * @param inputFileName  path of the file to read
   * @param outputFileName path of the file to write
   * @throws ApplicationException if reading or writing fails
   */
  private void donatusItalianConvert(String provider, String language, String inputFileName, String outputFileName) throws ApplicationException {
    writeDonatusItalianForms(provider, language, new File(inputFileName), new File(outputFileName));
  }

  /**
   * Converts a Donatus supplements file by delegating to writeDonatusSupplementsForms().
   *
   * @param provider       provider name stored in each form
   * @param language       language code stored in each form
   * @param inputFileName  path of the file to read
   * @param outputFileName path of the file to write
   * @throws ApplicationException if reading or writing fails
   */
  private void donatusSupplementsConvert(String provider, String language, String inputFileName, String outputFileName) throws ApplicationException {
    writeDonatusSupplementsForms(provider, language, new File(inputFileName), new File(outputFileName));
  }

  /**
   * Reads a lemma file and returns its lemmas indexed by their numeric id.
   * Each line is expected to start with "id\lemma\"; lines which do not
   * contain two backslashes or whose id is not a number are reported on
   * standard output and skipped. I/O errors are printed and the lemmas read
   * so far are returned.
   *
   * @param inputFile the lemma file
   * @return table from lemma id to lemma name (never null, possibly empty)
   */
  private Hashtable<Integer, String> loadLemmas(File inputFile) {
    Hashtable<Integer, String> retLemmas = new Hashtable<Integer, String>();
    BufferedReader in = null;
    try {
      // NOTE(review): FileReader decodes with the platform default charset
      in = new BufferedReader(new FileReader(inputFile));
      String line = null;
      while((line = in.readLine()) != null) {
        int from = line.indexOf("\\");
        int to = (from < 0) ? -1 : line.indexOf("\\", from + 1);
        if (to < 0) {
          // without both delimiters neither id nor lemma can be cut out
          System.out.println("Warning: Lemma line: " + line + " is not correct");
          continue;
        }
        String idStr = line.substring(0, from);
        String lemma = line.substring(from + 1, to);
        try {
          retLemmas.put(Integer.valueOf(idStr), lemma);
        } catch (NumberFormatException e) {
          System.out.println("Warning: Lemma id: " + idStr + " is not correct");
        }
      }
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      // always close the stream 
      if (in != null) try { in.close(); } catch (Exception e) { }
    }
    return retLemmas;
  }
  
  /**
   * Reads a CELEX word file and writes every accepted form/lemma pair once
   * into the output file, enclosed in a "forms" element.
   * The form name is taken from between the first and second backslash of a
   * line, the lemma id from between the third and fourth. Lines without these
   * delimiters, with a non-numeric lemma id or with an id missing in the lemma
   * table are reported on standard output and skipped.
   *
   * @param provider       provider name stored in each form
   * @param language       language code stored in each form
   * @param lemmas         table from lemma id to lemma name
   * @param inputFileWords the word file to read
   * @param outputFile     the file to write
   * @throws ApplicationException if reading or writing fails
   */
  private void writeCelexForms(String provider, String language, Hashtable<Integer, String> lemmas, File inputFileWords, File outputFile) throws ApplicationException {
    BufferedReader in = null;
    BufferedOutputStream out = null;
    forms = new Hashtable<String, Hashtable<String, Form>>();
    try {
      in = new BufferedReader(new FileReader(inputFileWords));
      out = new BufferedOutputStream(new FileOutputStream(outputFile));
      write("<forms>\n", out);
      String line = null;
      while((line = in.readLine()) != null) {
        int delim1 = line.indexOf("\\");
        int delim2 = line.indexOf("\\", delim1 + 1);
        int delim3 = line.indexOf("\\", delim2 + 1);
        int delim4 = line.indexOf("\\", delim3 + 1);
        if (delim1 < 0 || delim2 < 0 || delim3 < 0 || delim4 < 0) {
          // fewer than four delimiters: substring() below would fail
          System.out.println("Warning: Line: " + line + " is not correct");
          continue;
        }
        String formName = line.substring(delim1 + 1, delim2);
        String lemmaIdStr = line.substring(delim3 + 1, delim4);
        Integer lemmaIdInt = null;
        try {
          lemmaIdInt = Integer.valueOf(lemmaIdStr);
        } catch (NumberFormatException e) {
          System.out.println("Warning: Lemma id: " + lemmaIdStr + " is not correct");
          continue;
        }
        String lemmaName = lemmas.get(lemmaIdInt);
        if (lemmaName == null) {
          // a null lemma can not be used as a Hashtable key below
          System.out.println("Warning: Lemma id: " + lemmaIdStr + " has no lemma");
          continue;
        }
        Form form = new Form();
        form.setProvider(provider);
        form.setLanguage(language);
        form.setFormName(formName);
        form.setLemmaName(lemmaName);
        form.normalize();
        if (! form.isOk())
          continue;
        Hashtable<String, Form> formLemmas = forms.get(formName);
        if (formLemmas == null) {
          formLemmas = new Hashtable<String, Form>();
          forms.put(formName, formLemmas);
        }
        // write each form/lemma pair only once
        if (! formLemmas.containsKey(lemmaName)) {
          formLemmas.put(lemmaName, form);
          write(form, out);
        }
      }
      write("</forms>\n", out);
    } catch (FileNotFoundException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    } finally {
      // always close the stream 
      if (in != null) try { in.close(); } catch (Exception e) { }
      if (out != null) try { out.close(); } catch (Exception e) { }
    }
  }

  /**
   * Reads a tab separated Lexique file and writes every accepted form/lemma
   * pair once into the output file, enclosed in a "forms" element.
   * The first column of a line is the form, the second the lemma; a lemma of
   * "=" means that the lemma equals the form. Lines with fewer than two tabs
   * are reported on standard output and skipped.
   *
   * @param provider   provider name stored in each form
   * @param language   language code stored in each form
   * @param inputFile  the file to read
   * @param outputFile the file to write
   * @throws ApplicationException if reading or writing fails
   */
  private void writeLexiqueForms(String provider, String language, File inputFile, File outputFile) throws ApplicationException {
    BufferedReader in = null;
    BufferedOutputStream out = null;
    forms = new Hashtable<String, Hashtable<String, Form>>();
    try {
      in = new BufferedReader(new FileReader(inputFile));
      out = new BufferedOutputStream(new FileOutputStream(outputFile));
      write("<forms>\n", out);
      String line = null;
      while((line = in.readLine()) != null) {
        int delim1 = line.indexOf("\t");
        int delim2 = line.indexOf("\t", delim1 + 1);
        if (delim1 < 0 || delim2 < 0) {
          // without both tabs the columns can not be cut out
          System.out.println("Warning: Line: " + line + " is not correct");
          continue;
        }
        String formName = line.substring(0, delim1).trim();
        String lemmaName = line.substring(delim1 + 1, delim2).trim();
        if (lemmaName.equals("="))
          lemmaName = formName;
        Form form = new Form();
        form.setProvider(provider);
        form.setLanguage(language);
        form.setFormName(formName);
        form.setLemmaName(lemmaName);
        form.normalize();
        if (! form.isOk())
          continue;
        Hashtable<String, Form> formLemmas = forms.get(formName);
        if (formLemmas == null) {
          formLemmas = new Hashtable<String, Form>();
          forms.put(formName, formLemmas);
        }
        // write each form/lemma pair only once
        if (! formLemmas.containsKey(lemmaName)) {
          formLemmas.put(lemmaName, form);
          write(form, out);
        }
      }
      write("</forms>\n", out);
    } catch (FileNotFoundException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    } finally {
      // always close the stream 
      if (in != null) try { in.close(); } catch (Exception e) { }
      if (out != null) try { out.close(); } catch (Exception e) { }
    }
  }

  /**
   * Reads the Donatus Italian hash file and writes every accepted form/lemma
   * pair once into the output file, enclosed in a "forms" element.
   * Only the first lemma of a line is recognized. Lines which do not contain
   * the expected apostrophes or the blank after the lemma are reported on
   * standard output and skipped.
   *
   * @param provider   provider name stored in each form
   * @param language   language code stored in each form
   * @param inputFile  the file to read
   * @param outputFile the file to write
   * @throws ApplicationException if reading or writing fails
   */
  private void writeDonatusItalianForms(String provider, String language, File inputFile, File outputFile) throws ApplicationException {
    BufferedReader in = null;
    BufferedOutputStream out = null;
    forms = new Hashtable<String, Hashtable<String, Form>>();
    try {
      in = new BufferedReader(new FileReader(inputFile));
      out = new BufferedOutputStream(new FileOutputStream(outputFile));
      write("<forms>\n", out);
      String line = null;
      while((line = in.readLine()) != null) {
        // one line is of the form: 'risoluino' => '<NL>V risolvino,risolvere  pres imperat 3rd pl ...</NL><NL>...</NL>',
        // or of the form: 'legamenti' => '<NL>N legamento  masc pl ...</NL><NL>...</NL>',
        // this method only recognize the first lemma TODO recognize all lemmas for the form
        int delim1 = line.indexOf("'");
        int delim2 = line.indexOf("'", delim1 + 1);
        int delim3 = line.indexOf("'", delim2 + 1);
        if (delim1 < 0 || delim2 < 0 || delim3 < 0) {
          // without the three apostrophes the form can not be cut out
          System.out.println("Warning: Line: " + line + " is not correct");
          continue;
        }
        int delim4 = delim3 + 6; // beginning of the lemma
        int delim5 = line.indexOf(" ", delim4 + 1); // end of the first lemma(s) is separated by a blank
        if (delim5 < 0) {
          // no blank after the lemma (or the line is too short)
          System.out.println("Warning: Line: " + line + " is not correct");
          continue;
        }
        String formName = line.substring(delim1 + 1, delim2);
        formName = formName.replace("\\", "");
        String lemmaName = line.substring(delim4 + 1, delim5);
        int commaInLemma = lemmaName.indexOf(","); // when there are more than one lemma
        if (commaInLemma != -1)
          lemmaName = lemmaName.substring(0, commaInLemma);  
        lemmaName = lemmaName.replace("\\", "");
        Form form = new Form();
        form.setProvider(provider);
        form.setLanguage(language);
        form.setFormName(formName);
        form.setLemmaName(lemmaName);
        form.normalize();
        boolean lineContainsAp = line.contains("\''");  // some of the form lines contain irregular strings of the form: 'par\'' => '<NL>N pari/^,pari     indeclform  adverb</NL>
        if (! form.isOk() || lineContainsAp)
          continue;
        Hashtable<String, Form> formLemmas = forms.get(formName);
        if (formLemmas == null) {
          formLemmas = new Hashtable<String, Form>();
          forms.put(formName, formLemmas);
        }
        // write each form/lemma pair only once
        if (! formLemmas.containsKey(lemmaName)) {
          formLemmas.put(lemmaName, form);
          write(form, out);
        }
      }
      write("</forms>\n", out);
    } catch (FileNotFoundException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    } finally {
      // always close the stream 
      if (in != null) try { in.close(); } catch (Exception e) { }
      if (out != null) try { out.close(); } catch (Exception e) { }
    }
  }
  
  /**
   * Reads a Donatus supplements CSV file and writes every accepted form/lemma
   * pair once into the output file, enclosed in a "forms" element.
   * A line starting with a quote holds a quoted lemma followed by a quoted
   * form; a line starting with a comma holds a further quoted form of the
   * lemma of the preceding lines. Reading stops at the first empty line.
   * Unlike the other write methods this one does not reset the collected
   * forms, so pairs already collected are not written again.
   * Lines with missing quotes are reported on standard output and skipped;
   * after a skipped lemma line its continuation lines are skipped as well.
   *
   * @param provider   provider name stored in each form
   * @param language   language code stored in each form
   * @param inputFile  the file to read
   * @param outputFile the file to write
   * @throws ApplicationException if reading, transcoding or writing fails
   */
  private void writeDonatusSupplementsForms(String provider, String language, File inputFile, File outputFile) throws ApplicationException {
    BufferedReader in = null;
    BufferedOutputStream out = null;
    try {
      in = new BufferedReader(new FileReader(inputFile));
      out = new BufferedOutputStream(new FileOutputStream(outputFile));
      write("<forms>\n", out);
      String line = null;
      String lemmaName = "";
      String formName = "";
      boolean lemmaLineRejected = false;
      // each line is a form
      while((line = in.readLine()) != null) {
        if (line.length() == 0)
          break;
        boolean formOnlyLine = line.startsWith(",");
        if (! formOnlyLine) {
          // line with lemma and form: "lemma","form"
          int quote2 = line.indexOf("\"", 1);
          int quote3 = (quote2 < 0) ? -1 : line.indexOf("\"", quote2 + 1);
          int quote4 = (quote3 < 0) ? -1 : line.indexOf("\"", quote3 + 1);
          if (quote4 < 0) {
            System.out.println("Warning: Line: " + line + " is not correct");
            lemmaLineRejected = true;
            continue;
          }
          lemmaName = line.substring(1, quote2);
          formName = line.substring(quote3 + 1, quote4);
          lemmaLineRejected = false;
        } else {
          // line with a further form of the current lemma: ,"form"
          int quote2 = line.indexOf("\"", 3);
          if (quote2 < 0 || lemmaLineRejected) {
            // either the form is not quoted or its lemma line was not usable
            System.out.println("Warning: Line: " + line + " is not correct");
            continue;
          }
          formName = line.substring(2, quote2);
        }
        Form form = new Form();
        form.setProvider(provider);
        form.setLanguage(language);
        form.setFormName(formName);
        form.setLemmaName(lemmaName);
        if (form.isGreek())
          form = transcodeFromBetaCode2Unicode(form);
        else if (form.isArabic())
          form = transcodeFromBuckwalter2Unicode(form);
        form.normalize();
        if (! form.isOk())
          continue;
        Hashtable<String, Form> formLemmas = forms.get(formName);
        if (formLemmas == null) {
          formLemmas = new Hashtable<String, Form>();
          forms.put(formName, formLemmas);
        }
        // write each form/lemma pair only once
        if (! formLemmas.containsKey(lemmaName)) {
          formLemmas.put(lemmaName, form);
          write(form, out);
        }
      }
      write("</forms>\n", out);
    } catch (FileNotFoundException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    } finally {
      // always close the stream 
      if (in != null) try { in.close(); } catch (Exception e) { }
      if (out != null) try { out.close(); } catch (Exception e) { }
    }
  }

  /**
   * Writes the XML string of the given form UTF-8 encoded to the stream and
   * flushes the stream.
   *
   * @param form the form to write
   * @param out  the stream to write to
   * @throws ApplicationException if encoding or writing fails
   */
  private void write(Form form, BufferedOutputStream out) throws ApplicationException {
    try {
      byte[] encodedForm = form.getXmlString().getBytes("utf-8");
      out.write(encodedForm);
      out.flush();
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Writes the given string UTF-8 encoded to the stream and flushes the stream.
   *
   * @param inputString the text to write
   * @param out         the stream to write to
   * @throws ApplicationException if encoding or writing fails
   */
  private void write(String inputString, BufferedOutputStream out) throws ApplicationException {
    try {
      byte[] encodedText = inputString.getBytes("utf-8");
      out.write(encodedText);
      out.flush();
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Replaces form name and lemma name of the given form by their Beta Code to
   * Unicode transcoding; a small letter sigma at the end of either name is
   * replaced by the small letter end sigma.
   *
   * @param form the form to change
   * @return the same form object
   * @throws ApplicationException if the transcoder fails
   */
  private Form transcodeFromBetaCode2Unicode(Form form) throws ApplicationException {
    String betaCodeForm = form.getFormName();
    String betaCodeLemma = form.getLemmaName();
    Transcoder transcoder = Transcoder.getInstance();
    String unicodeForm = transcoder.transcodeFromBetaCode2Unicode(betaCodeForm);
    String unicodeLemma = transcoder.transcodeFromBetaCode2Unicode(betaCodeLemma);
    form.setFormName(withFinalSigma(unicodeForm));
    form.setLemmaName(withFinalSigma(unicodeLemma));
    return form;
  }

  /**
   * Returns the word with a trailing "small letter sigma" replaced by the
   * "small letter end sigma"; null and all other words are returned unchanged.
   */
  private String withFinalSigma(String word) {
    if (word == null || ! word.endsWith("σ")) {
      return word;
    }
    return word.substring(0, word.length() - 1) + "ς";
  }
  
  /**
   * Replaces form name and lemma name of the given form by their Buckwalter
   * to Unicode transcoding.
   *
   * @param form the form to change
   * @return the same form object
   * @throws ApplicationException if the transcoder fails
   */
  private Form transcodeFromBuckwalter2Unicode(Form form) throws ApplicationException {
    String buckwalterForm = form.getFormName();
    String buckwalterLemma = form.getLemmaName();
    Transcoder transcoder = Transcoder.getInstance();
    String unicodeForm = transcoder.transcodeFromBuckwalter2Unicode(buckwalterForm);
    String unicodeLemma = transcoder.transcodeFromBuckwalter2Unicode(buckwalterLemma);
    form.setFormName(unicodeForm);
    form.setLemmaName(unicodeLemma);
    return form;
  }

  /**
   * Called by main() after the conversions and before the end time is taken;
   * currently does nothing.
   */
  private void end() throws ApplicationException {
  }

  /**
   * Remembers the current time as the begin of the operation.
   */
  private void beginOperation() {
    this.beginOfOperation = new Date();
  }

  /**
   * Remembers the current time as the end of the operation.
   */
  private void endOperation() {
    this.endOfOperation = new Date();
  }

}