view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/converter/PerseusContentHandler.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children fba5577e49d9
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.morph.converter;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Hashtable;

import org.xml.sax.*;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form;

/**
 * SAX content handler that collects morphological {@link Form}s from the
 * <code>analysis</code> elements of an XML document (presumably a Perseus
 * morphology dump, judging by the class name) and writes them to an output file.
 *
 * <p>For every <code>analysis</code> element a new {@link Form} is created and
 * filled from the text of its child elements (see {@link #XML_FORM_FIELD_NAMES}).
 * When the element ends, Greek forms are transcoded from Beta Code and Arabic
 * forms from Buckwalter to Unicode, the form is normalized and, if
 * {@link Form#isOk()} holds, it is registered in the in-memory table. Only the
 * first form seen for a given (form name, lemma name) pair is kept and written
 * to the output file; the output is wrapped in a <code>forms</code> root element.</p>
 *
 * <p>Instances hold mutable parsing state and are not designed for concurrent use.</p>
 */
public class PerseusContentHandler implements ContentHandler {
  /** Names of the child elements of <code>analysis</code> whose text is copied into the form. */
  private static final String[] XML_FORM_FIELD_NAMES = {"form", "lemma", "pos", "tense", "voice", "case", "number", "mood", "person", "gender", "definite"};

  /** Collected forms: form name -> (lemma name -> form). Created in {@link #startDocument()}. */
  private Hashtable<String, Hashtable<String, Form>> forms;
  private File outputFile;
  private String provider;
  private String language;
  /** Output stream of the result file; <code>null</code> while no document is being processed. */
  private OutputStream out;
  /** Element whose start tag was seen last and which has not been followed by an end tag yet. */
  private Element currentElement;
  /** Form of the <code>analysis</code> element currently being read, or <code>null</code>. */
  private Form form;

  /**
   * Creates a handler that writes its result to the given file.
   *
   * @param provider       provider set on every created form
   * @param language       language set on every created form
   * @param outputFileName path of the file the forms are written to
   * @throws ApplicationException declared for callers; not thrown by the current implementation
   */
  public PerseusContentHandler(String provider, String language, String outputFileName) throws ApplicationException {
    this.outputFile = new File(outputFileName);
    this.provider = provider;
    this.language = language;
  }

  /**
   * Returns the forms collected so far.
   *
   * @return table form name -> (lemma name -> form); <code>null</code> before
   *         {@link #startDocument()} has been called
   */
  public Hashtable<String, Hashtable<String, Form>> getForms() {
    return forms;
  }

  /**
   * Opens the output file, resets the form table and writes the opening root tag.
   *
   * @throws SAXException if the output file cannot be opened or written
   */
  public void startDocument() throws SAXException {
    // A previous parse may have been aborted without endDocument(): do not leak its stream.
    closeOutputQuietly();
    try {
      out = new BufferedOutputStream(new FileOutputStream(outputFile));
      forms = new Hashtable<String, Hashtable<String, Form>>();
    } catch (FileNotFoundException e) {
      throw new SAXException(e);
    }
    write("<forms>\n");
  }

  /**
   * Writes the closing root tag and closes the output file.
   *
   * @throws SAXException if the closing tag cannot be written
   */
  public void endDocument() throws SAXException {
    try {
      write("</forms>\n");
    } finally {
      // Close even if the final write failed; everything written was already flushed.
      closeOutputQuietly();
    }
  }

  /**
   * Passes non-blank character data of a form field element to the current form.
   *
   * <p>A SAX parser may report the text of one element in several chunks; every
   * non-blank chunk is handed to the matching <code>add...</code> method of the form.</p>
   */
  public void characters(char[] c, int start, int length) throws SAXException {
    if (currentElement == null || form == null) {
      return;
    }
    String elemName = currentElement.name;
    if (! isXmlFormField(elemName)) {
      return;
    }
    String charactersStr = new String(c, start, length);
    if (charactersStr.trim().length() == 0) {
      return;  // whitespace between elements carries no field value
    }
    addFieldValue(elemName, charactersStr);
  }

  public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
    // not needed
  }

  public void processingInstruction(String target, String data) throws SAXException {
    // not needed
  }

  public void setDocumentLocator(org.xml.sax.Locator arg1) {
    // not needed
  }

  public void endPrefixMapping(String prefix) throws SAXException {
    // not needed
  }

  public void skippedEntity(String name) throws SAXException {
    // not needed
  }

  /**
   * Finishes the current element. At the end of an <code>analysis</code> element
   * the collected form is transcoded, normalized and, if it is ok, registered and written.
   *
   * @throws SAXException if transcoding or writing fails
   */
  public void endElement(String uri, String localName, String name) throws SAXException {
    // Text following an end tag belongs to no field element.
    currentElement = null;
    if (! name.equals("analysis") || form == null) {
      return;
    }
    try {
      if (form.isGreek()) {
        form = transcodeFromBetaCode2Unicode(form);
      } else if (form.isArabic()) {
        form = transcodeFromBuckwalter2Unicode(form);
      }
      form.normalize();
      if (form.isOk()) {
        registerAndWrite(form);
      }
      form = null;
    } catch (ApplicationException e) {
      throw new SAXException(e);
    }
  }

  /**
   * Remembers the started element; an <code>analysis</code> element additionally
   * starts a new form with the configured provider and language.
   */
  public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {
    currentElement = new Element(name);
    if (name.equals("analysis")) {
      form = new Form();
      form.setProvider(provider);
      form.setLanguage(language);
    }
  }

  public void startPrefixMapping(String prefix, String uri) throws SAXException {
    // not needed
  }

  /** Hands a field value to the <code>add...</code> method of the current form that matches the element name. */
  private void addFieldValue(String elemName, String value) {
    if (elemName.equals("form")) {
      form.addFormName(value);
    } else if (elemName.equals("lemma")) {
      form.addLemmaName(value);
    } else if (elemName.equals("pos")) {
      form.addPos(value);
    } else if (elemName.equals("tense")) {
      form.addTense(value);
    } else if (elemName.equals("voice")) {
      form.addVoice(value);
    } else if (elemName.equals("case")) {
      form.addCasus(value);
    } else if (elemName.equals("number")) {
      form.addNumber(value);
    } else if (elemName.equals("mood")) {
      form.addMood(value);
    } else if (elemName.equals("person")) {
      form.addPerson(value);
    } else if (elemName.equals("gender")) {
      form.addGender(value);
    } else if (elemName.equals("definite")) {
      form.addDefinite(value);
    }
  }

  /**
   * Stores the form under its form name and lemma name and writes it to the
   * output, unless a form with the same form name and lemma name was stored before.
   */
  private void registerAndWrite(Form newForm) throws SAXException {
    String formName = newForm.getFormName();
    String lemmaName = newForm.getLemmaName();
    Hashtable<String, Form> formLemmas = forms.get(formName);
    if (formLemmas == null) {
      formLemmas = new Hashtable<String, Form>();
      forms.put(formName, formLemmas);
    }
    if (! formLemmas.containsKey(lemmaName)) {
      formLemmas.put(lemmaName, newForm);
      write(newForm);
    }
  }

  /** Tells whether the given element name is one of {@link #XML_FORM_FIELD_NAMES}, ignoring case. */
  private boolean isXmlFormField(String fieldName) {
    for (int i = 0; i < XML_FORM_FIELD_NAMES.length; i++) {
      // equalsIgnoreCase does not depend on the default locale, unlike toLowerCase()
      if (XML_FORM_FIELD_NAMES[i].equalsIgnoreCase(fieldName)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Writes the string UTF-8 encoded to the output file and flushes it.
   *
   * @throws SAXException if no output file is open or writing fails; on a write
   *                      failure the output file is closed
   */
  private void write(String outStr) throws SAXException {
    if (out == null) {
      throw new SAXException("Output file is not open: " + outputFile);
    }
    try {
      byte[] bytes = outStr.getBytes("utf-8");
      out.write(bytes, 0, bytes.length);
      // Flush each record so that the file holds everything written so far even if parsing aborts.
      out.flush();
    } catch (IOException e) {
      // The parse is aborted by the exception, endDocument() will not run: release the file here.
      closeOutputQuietly();
      throw new SAXException(e);
    }
  }

  /** Writes the XML representation of the form ({@link Form#getXmlString()}) to the output file. */
  private void write(Form form) throws SAXException {
    write(form.getXmlString());
  }

  /** Closes the output stream if it is open; close failures are ignored since all data was flushed on write. */
  private void closeOutputQuietly() {
    if (out == null) {
      return;
    }
    try {
      out.close();
    } catch (IOException ignored) {
      // best effort: nothing sensible can be done if closing fails
    } finally {
      out = null;
    }
  }

  /**
   * Replaces form name and lemma name of the form by their Beta Code to Unicode transcoding.
   *
   * @return the same (modified) form instance
   */
  private Form transcodeFromBetaCode2Unicode(Form form) throws ApplicationException {
    Transcoder transcoder = Transcoder.getInstance();
    String encodedUnicodeForm = transcoder.transcodeFromBetaCode2Unicode(form.getFormName());
    String encodedUnicodeLemma = transcoder.transcodeFromBetaCode2Unicode(form.getLemmaName());
    form.setFormName(encodedUnicodeForm);
    form.setLemmaName(encodedUnicodeLemma);
    return form;
  }

  /**
   * Replaces form name and lemma name of the form by their Buckwalter to Unicode transcoding.
   *
   * @return the same (modified) form instance
   */
  private Form transcodeFromBuckwalter2Unicode(Form form) throws ApplicationException {
    Transcoder transcoder = Transcoder.getInstance();
    String encodedUnicodeForm = transcoder.transcodeFromBuckwalter2Unicode(form.getFormName());
    String encodedUnicodeLemma = transcoder.transcodeFromBuckwalter2Unicode(form.getLemmaName());
    form.setFormName(encodedUnicodeForm);
    form.setLemmaName(encodedUnicodeLemma);
    return form;
  }

  /** Name (and optional value) of an XML element; static because it needs no handler state. */
  private static class Element {
    String name;
    String value;

    Element(String name) {
      this.name = name;
    }

    Element(String name, String value) {
      this.name = name;
      this.value = value;
    }
  }
}