Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/converter/PerseusContentHandler.java @ 12:fba5577e49d9
diverse Fehlerbehebungen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 19 Apr 2011 16:51:26 +0200 |
parents | 408254cf2f1d |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.morph.converter; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.Hashtable; import org.xml.sax.*; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder; import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; public class PerseusContentHandler implements ContentHandler { private static String[] XML_FORM_FIELD_NAMES = {"form", "lemma", "pos", "tense", "voice", "case", "number", "mood", "person", "gender", "definite"}; private Hashtable<String, Hashtable<String, Form>> forms; private File outputFile; private String provider; private String language; private OutputStream out; private Element currentElement; private Form form; public PerseusContentHandler(String provider, String language, String outputFileName) throws ApplicationException { this.outputFile = new File(outputFileName); this.provider = provider; this.language = language; } public Hashtable<String, Hashtable<String, Form>> getForms() { return forms; } public void startDocument() throws SAXException { try { out = new BufferedOutputStream(new FileOutputStream(outputFile)); forms = new Hashtable<String, Hashtable<String, Form>>(); } catch (FileNotFoundException e) { throw new SAXException(e); } write("<forms>\n"); } public void endDocument() throws SAXException { write("</forms>\n"); try { if (out != null) out.close(); } catch (Exception e) { // nothing: always close the stream at the end of the method } } public void characters(char[] c, int start, int length) throws SAXException { if (currentElement != null) { String elemName = currentElement.name; if (form != null && isXmlFormField(elemName)) { char[] cCopy = new char[length]; System.arraycopy(c, start, cCopy, 0, length); String charactersStr = String.valueOf(cCopy); if (charactersStr != null && ! (charactersStr.trim().equals(""))) { if (elemName.equals("form")) form.addFormName(charactersStr); else if (elemName.equals("lemma")) form.addLemmaName(charactersStr); else if (elemName.equals("pos")) form.addPos(charactersStr); else if (elemName.equals("tense")) form.addTense(charactersStr); else if (elemName.equals("voice")) form.addVoice(charactersStr); else if (elemName.equals("case")) form.addCasus(charactersStr); else if (elemName.equals("number")) form.addNumber(charactersStr); else if (elemName.equals("mood")) form.addMood(charactersStr); else if (elemName.equals("person")) form.addPerson(charactersStr); else if (elemName.equals("gender")) form.addGender(charactersStr); else if (elemName.equals("definite")) form.addDefinite(charactersStr); } } } } public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { } public void processingInstruction(String target, String data) throws SAXException { } public void setDocumentLocator(org.xml.sax.Locator arg1) { } public void endPrefixMapping(String prefix) throws SAXException { } public void skippedEntity(String name) throws SAXException { } public void endElement(String uri, String localName, String name) throws SAXException { currentElement = null; try { if (name.equals("analysis")) { if (form.isGreek()) form = transcodeFromBetaCode2Unicode(form); else if (form.isArabic()) form = transcodeFromBuckwalter2Unicode(form); form.normalize(); if (form.isOk()) { String formName = form.getFormName(); String lemmaName = form.getLemmaName(); Hashtable<String, Form> formLemmas = forms.get(formName); if (formLemmas == null) { formLemmas = new Hashtable<String, Form>(); formLemmas.put(lemmaName, form); forms.put(formName, formLemmas); write(form); } else { Form formLemma = formLemmas.get(lemmaName); if (formLemma == null) { formLemmas.put(lemmaName, form); write(form); } } } form = null; } } catch (ApplicationException e) { throw new SAXException(e); } } public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { currentElement = new Element(name); if (name.equals("analysis")) { form = new Form(); form.setProvider(provider); form.setLanguage(language); } } public void startPrefixMapping(String prefix, String uri) throws SAXException { } private boolean isXmlFormField(String fieldName) { boolean isXmlFormField = false; for (int i=0; i<XML_FORM_FIELD_NAMES.length; i++) { String n = XML_FORM_FIELD_NAMES[i]; if (fieldName.toLowerCase().equals(n)) { isXmlFormField = true; break; } } return isXmlFormField; } private void write(String outStr) throws SAXException { try { byte[] bytes = outStr.getBytes("utf-8"); out.write(bytes, 0, bytes.length); out.flush(); } catch (IOException e) { throw new SAXException(e); } } private void write(Form form) throws SAXException { try { String xmlFormStr = form.getXmlString(); byte[] bytes = xmlFormStr.getBytes("utf-8"); out.write(bytes, 0, bytes.length); out.flush(); } catch (IOException e) { throw new SAXException(e); } } private Form transcodeFromBetaCode2Unicode(Form form) throws ApplicationException { String formName = form.getFormName(); String lemmaName = form.getLemmaName(); Transcoder transcoder = Transcoder.getInstance(); String encodedUnicodeForm = transcoder.transcodeFromBetaCode2Unicode(formName); String encodedUnicodeLemma = transcoder.transcodeFromBetaCode2Unicode(lemmaName); // replace "small letter sigma" at the end of a word by the "small letter end sigma" if (encodedUnicodeForm != null && encodedUnicodeForm.endsWith("σ")) { int length = encodedUnicodeForm.length(); encodedUnicodeForm = encodedUnicodeForm.substring(0, length - 1) + "ς"; } if (encodedUnicodeLemma != null && encodedUnicodeLemma.endsWith("σ")) { int length = encodedUnicodeLemma.length(); encodedUnicodeLemma = encodedUnicodeLemma.substring(0, length - 1) + "ς"; } form.setFormName(encodedUnicodeForm); form.setLemmaName(encodedUnicodeLemma); return form; } private Form transcodeFromBuckwalter2Unicode(Form form) throws ApplicationException { String formName = form.getFormName(); String lemmaName = form.getLemmaName(); Transcoder transcoder = Transcoder.getInstance(); String encodedUnicodeForm = transcoder.transcodeFromBuckwalter2Unicode(formName); String encodedUnicodeLemma = transcoder.transcodeFromBuckwalter2Unicode(lemmaName); form.setFormName(encodedUnicodeForm); form.setLemmaName(encodedUnicodeLemma); return form; } private class Element { String name; String value; Element(String name) { this.name = name; } Element(String name, String value) { this.name = name; this.value = value; } } }