Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/ArchimedesDocForeignLangContentHandler.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
line wrap: on
line diff
package de.mpg.mpiwg.berlin.mpdl.lt.doc;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Locale;

import org.xml.sax.*;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder;
import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars;

/**
 * SAX {@link ContentHandler} that writes the events it receives back out as XML text
 * to a file, with two modifications:
 * <ul>
 *   <li>on {@code foreign} elements the attribute {@code lang="greek"} is written as
 *       {@code lang="grc"};</li>
 *   <li>character data inside such an element is passed through
 *       {@link Transcoder#transcodeFromBetaCode2Unicode(String)} before it is written,
 *       unless the innermost open element is one of the metadata elements listed in
 *       {@link #isArchMetadata(String)}.</li>
 * </ul>
 * Ignorable whitespace, processing instructions and skipped entities are not written.
 */
public class ArchimedesDocForeignLangContentHandler implements ContentHandler {
  private static final String FOREIGN_ELEMENT = "foreign";
  private static final String LANG_ATTRIBUTE = "lang";
  private static final String[] ARCH_METADATA_ELEMENTS = {
    "info", "author", "title", "date", "place", "translator", "lang",
    "cvs_file", "cvs_version", "comments", "locator"
  };

  /** Namespace declarations collected since the last start tag; emitted with the next one. */
  private String xmlnsString = "";
  private File outputFile;
  private OutputStream out;
  /** Elements that have been started but not yet ended, innermost first. */
  private final Deque<Element> openElements = new ArrayDeque<Element>();
  /** Innermost open element, or {@code null} outside the document element. */
  private Element currentElement;
  /** {@code true} while at least one open element is a {@code foreign} element with {@code lang="greek"}. */
  private boolean currentElementIsForeign = false;

  /**
   * @param outputFile file the serialized document is written to; it is opened in
   *                   {@link #startDocument()}
   * @throws ApplicationException declared for callers; not thrown by this constructor
   */
  public ArchimedesDocForeignLangContentHandler(File outputFile) throws ApplicationException {
    this.outputFile = outputFile;
  }

  /**
   * Opens the output file and writes the XML declaration.
   *
   * @throws SAXException if the file cannot be opened or written
   */
  public void startDocument() throws SAXException {
    // Reset per-document state so a handler reused after an aborted parse starts clean.
    openElements.clear();
    currentElement = null;
    currentElementIsForeign = false;
    xmlnsString = "";
    try {
      out = new BufferedOutputStream(new FileOutputStream(outputFile));
      write("<?xml version=\"1.0\"?>\n");
    } catch (FileNotFoundException e) {
      throw new SAXException(e);
    }
  }

  /**
   * Closes the output stream if one was opened. A failure to close is ignored.
   */
  public void endDocument() throws SAXException {
    if (out == null) {
      return;
    }
    try {
      out.close();
    } catch (IOException ignored) {
      // Best effort: write() flushes after every call, so no data is pending here.
    }
  }

  /**
   * Writes a chunk of character data, XML-escaped. Inside a Greek {@code foreign}
   * element the text is transcoded first, unless the innermost open element is a
   * metadata element.
   *
   * @throws SAXException if transcoding or writing fails
   */
  public void characters(char[] c, int start, int length) throws SAXException {
    String charactersStr = new String(c, start, length);
    String elemName = (currentElement != null) ? currentElement.name : null;
    if (currentElementIsForeign && !isArchMetadata(elemName)) {
      try {
        charactersStr = transcodeFromBetaCode2Unicode(charactersStr);
      } catch (ApplicationException e) {
        throw new SAXException(e);
      }
    }
    charactersStr = StringUtilEscapeChars.forXML(charactersStr);
    if (currentElement != null) {
      // Holds only the most recent chunk; a parser may deliver text in several calls.
      currentElement.value = charactersStr;
    }
    write(charactersStr);
  }

  /** Ignorable whitespace is not written to the output. */
  public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
  }

  /** Processing instructions are not written to the output. */
  public void processingInstruction(String target, String data) throws SAXException {
  }

  /** The locator is not used. */
  public void setDocumentLocator(Locator locator) {
  }

  /**
   * Remembers a namespace declaration so that it is written on the next start tag.
   * The default namespace (empty prefix) is written as {@code xmlns="..."}; writing
   * {@code xmlns:="..."} would not be well-formed.
   */
  public void startPrefixMapping(String prefix, String uri) throws SAXException {
    if (prefix == null || prefix.length() == 0) {
      xmlnsString += "xmlns=\"" + uri + "\" ";
    } else {
      xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
    }
  }

  /** Nothing to do: declarations are emitted on the start tag only. */
  public void endPrefixMapping(String prefix) throws SAXException {
  }

  /** Skipped entities are not written to the output. */
  public void skippedEntity(String name) throws SAXException {
  }

  /**
   * Writes the start tag for {@code name} with any pending namespace declarations and
   * all attributes (values XML-escaped). On a {@code foreign} element the attribute
   * {@code lang="greek"} is written as {@code lang="grc"} and transcoding is switched
   * on for the element's content.
   *
   * @throws SAXException if writing fails
   */
  public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {
    Element newElement = new Element(null, name);
    if (currentElement != null && currentElement.language != null) {
      // The language is inherited by child elements.
      newElement.language = currentElement.language;
    }
    openElements.push(newElement);
    currentElement = newElement;

    StringBuilder attrString = new StringBuilder();
    int attrSize = attrs.getLength();
    for (int i = 0; i < attrSize; i++) {
      String attrQName = attrs.getQName(i);
      String attrValue = StringUtilEscapeChars.forXML(attrs.getValue(i));
      boolean isLangAttr = LANG_ATTRIBUTE.equals(attrQName);
      if (isLangAttr && attrValue != null) {
        // An explicit lang attribute overrides the value inherited from the parent.
        newElement.language = attrValue;
      }
      // Replace lang="greek" by lang="grc" on foreign elements.
      if (isLangAttr && FOREIGN_ELEMENT.equals(name) && "greek".equals(attrValue)) {
        newElement.greekForeign = true;
        currentElementIsForeign = true;
        attrString.append(" ").append(attrQName).append("=\"").append("grc").append("\"");
      } else {
        attrString.append(" ").append(attrQName).append("=\"").append(attrValue).append("\"");
      }
    }
    newElement.attrString = attrString.toString();

    if (xmlnsString.equals("")) {
      write("<" + name + newElement.attrString + ">");
    } else {
      newElement.xmlnsString = xmlnsString;
      write("<" + name + " " + xmlnsString + newElement.attrString + ">");
    }
    xmlnsString = "";
  }

  /**
   * Writes the end tag for {@code name} and makes the parent element current again.
   * When a {@code foreign} element closes, transcoding stays switched on if an
   * enclosing Greek {@code foreign} element is still open.
   *
   * @throws SAXException if writing fails
   */
  public void endElement(String uri, String localName, String name) throws SAXException {
    if (!openElements.isEmpty()) {
      openElements.pop();
    }
    // peek() yields null once the document element has been closed.
    currentElement = openElements.peek();
    if (FOREIGN_ELEMENT.equals(name)) {
      currentElementIsForeign = isInsideGreekForeign();
    }
    write("</" + name + ">");
  }

  /** Returns whether any still-open element is a {@code foreign} element with {@code lang="greek"}. */
  private boolean isInsideGreekForeign() {
    for (Element open : openElements) {
      if (open.greekForeign) {
        return true;
      }
    }
    return false;
  }

  /**
   * Writes {@code outStr} to the output as UTF-8 and flushes immediately.
   *
   * @throws SAXException wrapping any {@link IOException}
   */
  private void write(String outStr) throws SAXException {
    try {
      byte[] bytes = outStr.getBytes("utf-8");
      out.write(bytes, 0, bytes.length);
      out.flush();
    } catch (IOException e) {
      throw new SAXException(e);
    }
  }

  /** Delegates to the shared {@link Transcoder} instance. */
  private String transcodeFromBetaCode2Unicode(String inputStr) throws ApplicationException {
    Transcoder transcoder = Transcoder.getInstance();
    return transcoder.transcodeFromBetaCode2Unicode(inputStr);
  }

  /**
   * Returns whether {@code elemName}, compared case-insensitively and ignoring
   * surrounding whitespace, is one of the metadata element names whose text is never
   * transcoded. {@code null} is not a metadata element.
   */
  private boolean isArchMetadata(String elemName) {
    if (elemName == null) {
      return false;
    }
    // Fixed locale: the default locale could lower-case "I" differently (e.g. Turkish).
    String elName = elemName.toLowerCase(Locale.ENGLISH).trim();
    for (int i = 0; i < ARCH_METADATA_ELEMENTS.length; i++) {
      if (ARCH_METADATA_ELEMENTS[i].equals(elName)) {
        return true;
      }
    }
    return false;
  }

  /** Bookkeeping record for one open element. */
  private static class Element {
    String name;
    String language;
    String xmlnsString;
    String attrString;
    String value;
    /** {@code true} if this is a {@code foreign} element that carried {@code lang="greek"}. */
    boolean greekForeign;

    Element(String language, String name) {
      this.language = language;
      this.name = name;
    }

    /** Returns whether the element's language is {@code el}, {@code greek} or {@code grc}. */
    boolean isGreek() {
      return language != null
          && (language.equals("el") || language.equals("greek") || language.equals("grc"));
    }

  }
}