view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/ArchimedesDocContentHandler.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.doc;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;

import org.xml.sax.*;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder;
import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars;

/**
 * SAX {@link ContentHandler} that re-serializes the parsed document to a file
 * as UTF-8 XML and, on the way, transcodes the character data of selected
 * elements from Beta Code or Buckwalter transliteration to Unicode.
 *
 * <p>Character data is transcoded when the element it belongs to is not one of
 * the metadata elements (see {@link #isArchMetadata(String)}) and the element's
 * language is {@code "el"} or {@code "ar"} (or when there is no enclosing
 * element at all). Every element starts out with the document language given
 * to the constructor; a {@code lang} attribute overrides it for that element
 * only (child elements do not inherit the override).
 *
 * <p>Ignorable whitespace, processing instructions and skipped entities are
 * not copied to the output.
 */
public class ArchimedesDocContentHandler implements ContentHandler {
  /** Names (compared case-insensitively) of elements whose text is never transcoded. */
  private static final String[] ARCH_METADATA_ELEMENT_NAMES = {
    "info", "author", "title", "date", "place", "translator",
    "lang", "cvs_file", "cvs_version", "comments", "locator"
  };

  /** Namespace declarations collected since the last start tag; each one is followed by a space. */
  private String xmlnsString = "";
  private final File outputFile;
  private final String language;
  private final String fromEncoding;
  private final String toEncoding;
  private OutputStream out;
  /** Innermost element that is currently open, or null outside the root element. */
  private Element currentElement;
  
  /**
   * @param language     default language assigned to every element (e.g. "el", "ar")
   * @param fromEncoding source encoding name; "betacode" and "buckwalter" trigger transcoding
   * @param toEncoding   target encoding name; transcoding only happens for "unicode"
   * @param outputFile   file the serialized document is written to
   * @throws ApplicationException declared for callers; not thrown by this constructor itself
   */
  public ArchimedesDocContentHandler(String language, String fromEncoding, String toEncoding, File outputFile) throws ApplicationException {
    this.language = language;
    this.outputFile = outputFile;
    this.fromEncoding = fromEncoding;
    this.toEncoding = toEncoding;
  }
  
  /**
   * Opens the output file and writes the XML declaration.
   *
   * @throws SAXException if the file cannot be opened or the declaration cannot be written
   */
  public void startDocument() throws SAXException {
    // start from a clean state in case a previous parse was aborted half way
    currentElement = null;
    xmlnsString = "";
    try {
      out = new BufferedOutputStream(new FileOutputStream(outputFile));
    } catch (FileNotFoundException e) {
      throw new SAXException(e);
    }
    try {
      write("<?xml version=\"1.0\"?>\n");
    } catch (SAXException e) {
      // do not leak the freshly opened stream when the very first write fails
      closeQuietly();
      throw e;
    }
  }

  /**
   * Closes the output stream. Failures while closing are ignored on purpose
   * (best effort); all data has already been flushed by {@link #write(String)}.
   */
  public void endDocument() throws SAXException {
    closeQuietly();
  }
  
  /**
   * Writes a run of character data, transcoding it first when the enclosing
   * element qualifies, and always XML-escaping it.
   *
   * @throws SAXException if transcoding or writing fails
   */
  public void characters(char[] c, int start, int length) throws SAXException {
    String charactersStr = new String(c, start, length);
    if (isTranscodingRequired()) {
      try {
        // constant-first comparisons: a null encoding simply means "no transcoding"
        if ("betacode".equals(fromEncoding) && "unicode".equals(toEncoding)) {
          charactersStr = transcodeFromBetaCode2Unicode(charactersStr);
        } else if ("buckwalter".equals(fromEncoding) && "unicode".equals(toEncoding)) {
          charactersStr = transcodeFromBuckwalter2Unicode(charactersStr);
        }
      } catch (ApplicationException e) {
        throw new SAXException(e);
      }
    }
    charactersStr = StringUtilEscapeChars.forXML(charactersStr);
    if (currentElement != null)
      currentElement.value = charactersStr;
    write(charactersStr);
  }

  /** Ignorable whitespace is intentionally not copied to the output. */
  public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
  }

  /** Processing instructions are intentionally not copied to the output. */
  public void processingInstruction(String target, String data) throws SAXException {
  }

  /** The document locator is not used. */
  public void setDocumentLocator(Locator locator) {
  }

  /**
   * Remembers a namespace declaration so that it is emitted on the next start tag.
   * An empty (or null) prefix denotes the default namespace and is written as
   * {@code xmlns="..."}; any other prefix is written as {@code xmlns:prefix="..."}.
   */
  public void startPrefixMapping(String prefix, String uri) throws SAXException {
    String attrName = (prefix == null || prefix.length() == 0) ? "xmlns" : "xmlns:" + prefix;
    // escape the URI the same way attribute values are escaped in startElement()
    String escapedUri = (uri == null) ? "" : StringUtilEscapeChars.forXML(uri);
    xmlnsString += attrName + "=\"" + escapedUri + "\" ";
  }
  
  /** Nothing to do: declarations are written with the start tag and need no closing action. */
  public void endPrefixMapping(String prefix) throws SAXException {
  }

  /** Skipped entities are intentionally not copied to the output. */
  public void skippedEntity(String name) throws SAXException {
  }

  /**
   * Writes the start tag with its pending namespace declarations and its
   * (escaped) attributes, and makes the element the current one.
   *
   * @throws SAXException if writing fails
   */
  public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {
    // link to the enclosing element so that endElement() can restore it
    Element element = new Element(language, name, currentElement);
    StringBuilder attrBuilder = new StringBuilder();
    int attrSize = attrs.getLength();
    for (int i = 0; i < attrSize; i++) {
      String attrQName = attrs.getQName(i);
      String attrValue = StringUtilEscapeChars.forXML(attrs.getValue(i));
      attrBuilder.append(' ').append(attrQName).append("=\"").append(attrValue).append('"');
      if ("lang".equals(attrQName) && attrValue != null) {
        element.language = attrValue;
      }
    }
    String attrString = attrBuilder.toString();
    element.attrString = attrString;
    currentElement = element;
    if (xmlnsString.equals("")) {
      write("<" + name + attrString + ">");
    } else { 
      element.xmlnsString = xmlnsString;
      write("<" + name + " " + xmlnsString + attrString + ">");
    }
    // the declarations belong to this start tag only
    xmlnsString = "";
  }

  /**
   * Writes the end tag and makes the enclosing element current again, so that
   * text following a closed child is judged by its real parent element.
   *
   * @throws SAXException if writing fails
   */
  public void endElement(String uri, String localName, String name) throws SAXException {
    if (currentElement != null)
      currentElement = currentElement.parent;
    write("</" + name + ">");
  }

  /**
   * Tells whether character data at the current position has to be transcoded:
   * not inside a metadata element, and either outside any element or inside a
   * Greek or Arabic one.
   */
  private boolean isTranscodingRequired() {
    if (currentElement == null)
      return true;
    if (isArchMetadata(currentElement.name))
      return false;
    return currentElement.isGreek() || currentElement.isArabic();
  }

  /**
   * Writes the string to the output as UTF-8 and flushes immediately, so the
   * file content is complete even though close failures are ignored later.
   *
   * @throws SAXException if the stream has not been opened or an I/O error occurs
   */
  private void write(String outStr) throws SAXException {
    if (out == null)
      throw new SAXException("Output stream is not open: startDocument() has not been called");
    try {
      byte[] bytes = outStr.getBytes("utf-8");
      out.write(bytes, 0, bytes.length);
      out.flush();
    } catch (IOException e) {
      throw new SAXException(e);
    }
  }

  /** Closes the output stream if it is open, ignoring I/O errors (best effort). */
  private void closeQuietly() {
    if (out == null)
      return;
    try {
      out.close();
    } catch (IOException ignored) {
      // deliberate: a failing close must not abort the parse; data was flushed on every write
    } finally {
      out = null;
    }
  }
  
  /** Delegates to the shared {@link Transcoder} instance for Beta Code input. */
  private String transcodeFromBetaCode2Unicode(String inputStr) throws ApplicationException {
    return Transcoder.getInstance().transcodeFromBetaCode2Unicode(inputStr);
  }
  
  /** Delegates to the shared {@link Transcoder} instance for Buckwalter input. */
  private String transcodeFromBuckwalter2Unicode(String inputStr) throws ApplicationException {
    return Transcoder.getInstance().transcodeFromBuckwalter2Unicode(inputStr);
  }
  
  /**
   * Tells whether the element name is one of the metadata element names.
   * The name is trimmed and compared case-insensitively, independent of the
   * default locale.
   *
   * @param elemName element name, may be null
   * @return true for a metadata element name, false otherwise (also for null)
   */
  private boolean isArchMetadata(String elemName) {
    if (elemName == null)
      return false;
    String elName = elemName.trim();
    for (int i = 0; i < ARCH_METADATA_ELEMENT_NAMES.length; i++) {
      if (ARCH_METADATA_ELEMENT_NAMES[i].equalsIgnoreCase(elName))
        return true;
    }
    return false;
  }

  /** State kept for one open element; the parent links form the stack of open elements. */
  private static final class Element {
    final String name;
    /** Enclosing element, or null for the root element. */
    final Element parent;
    String language;
    String xmlnsString;
    String attrString;
    /** Escaped text of the most recent characters() call made while this element was current. */
    String value;
    
    Element(String language, String name, Element parent) {
      this.language = language;
      this.name = name;
      this.parent = parent;
    }
    
    /** @return true if the element's language is "el" */
    boolean isGreek() {
      return "el".equals(language);
    }

    /** @return true if the element's language is "ar" */
    boolean isArabic() {
      return "ar".equals(language);
    }
  }
}