comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/converter/PerseusContentHandler.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children fba5577e49d9
comparison
equal deleted inserted replaced
-1:000000000000 0:408254cf2f1d
1 package de.mpg.mpiwg.berlin.mpdl.lt.morph.converter;
2
3 import java.io.BufferedOutputStream;
4 import java.io.File;
5 import java.io.FileNotFoundException;
6 import java.io.FileOutputStream;
7 import java.io.IOException;
8 import java.io.OutputStream;
9 import java.util.Hashtable;
10
11 import org.xml.sax.*;
12
13 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
14 import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder;
15 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form;
16
17 public class PerseusContentHandler implements ContentHandler {
18 private static String[] XML_FORM_FIELD_NAMES = {"form", "lemma", "pos", "tense", "voice", "case", "number", "mood", "person", "gender", "definite"};
19 private Hashtable<String, Hashtable<String, Form>> forms;
20 private File outputFile;
21 private String provider;
22 private String language;
23 private OutputStream out;
24 private Element currentElement;
25 private Form form;
26
27 public PerseusContentHandler(String provider, String language, String outputFileName) throws ApplicationException {
28 this.outputFile = new File(outputFileName);
29 this.provider = provider;
30 this.language = language;
31 }
32
33 public Hashtable<String, Hashtable<String, Form>> getForms() {
34 return forms;
35 }
36
37 public void startDocument() throws SAXException {
38 try {
39 out = new BufferedOutputStream(new FileOutputStream(outputFile));
40 forms = new Hashtable<String, Hashtable<String, Form>>();
41 } catch (FileNotFoundException e) {
42 throw new SAXException(e);
43 }
44 write("<forms>\n");
45 }
46
47 public void endDocument() throws SAXException {
48 write("</forms>\n");
49 try {
50 if (out != null)
51 out.close();
52 } catch (Exception e) {
53 // nothing: always close the stream at the end of the method
54 }
55 }
56
57 public void characters(char[] c, int start, int length) throws SAXException {
58 if (currentElement != null) {
59 String elemName = currentElement.name;
60 if (form != null && isXmlFormField(elemName)) {
61 char[] cCopy = new char[length];
62 System.arraycopy(c, start, cCopy, 0, length);
63 String charactersStr = String.valueOf(cCopy);
64 if (charactersStr != null && ! (charactersStr.trim().equals(""))) {
65 if (elemName.equals("form"))
66 form.addFormName(charactersStr);
67 else if (elemName.equals("lemma"))
68 form.addLemmaName(charactersStr);
69 else if (elemName.equals("pos"))
70 form.addPos(charactersStr);
71 else if (elemName.equals("tense"))
72 form.addTense(charactersStr);
73 else if (elemName.equals("voice"))
74 form.addVoice(charactersStr);
75 else if (elemName.equals("case"))
76 form.addCasus(charactersStr);
77 else if (elemName.equals("number"))
78 form.addNumber(charactersStr);
79 else if (elemName.equals("mood"))
80 form.addMood(charactersStr);
81 else if (elemName.equals("person"))
82 form.addPerson(charactersStr);
83 else if (elemName.equals("gender"))
84 form.addGender(charactersStr);
85 else if (elemName.equals("definite"))
86 form.addDefinite(charactersStr);
87 }
88 }
89 }
90 }
91
92 public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
93 }
94
95 public void processingInstruction(String target, String data) throws SAXException {
96 }
97
98 public void setDocumentLocator(org.xml.sax.Locator arg1) {
99 }
100
101 public void endPrefixMapping(String prefix) throws SAXException {
102 }
103
104 public void skippedEntity(String name) throws SAXException {
105 }
106
107 public void endElement(String uri, String localName, String name) throws SAXException {
108 currentElement = null;
109 try {
110 if (name.equals("analysis")) {
111 if (form.isGreek())
112 form = transcodeFromBetaCode2Unicode(form);
113 else if (form.isArabic())
114 form = transcodeFromBuckwalter2Unicode(form);
115 form.normalize();
116 if (form.isOk()) {
117 String formName = form.getFormName();
118 String lemmaName = form.getLemmaName();
119 Hashtable<String, Form> formLemmas = forms.get(formName);
120 if (formLemmas == null) {
121 formLemmas = new Hashtable<String, Form>();
122 formLemmas.put(lemmaName, form);
123 forms.put(formName, formLemmas);
124 write(form);
125 } else {
126 Form formLemma = formLemmas.get(lemmaName);
127 if (formLemma == null) {
128 formLemmas.put(lemmaName, form);
129 write(form);
130 }
131 }
132 }
133 form = null;
134 }
135 } catch (ApplicationException e) {
136 throw new SAXException(e);
137 }
138 }
139
140 public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {
141 currentElement = new Element(name);
142 if (name.equals("analysis")) {
143 form = new Form();
144 form.setProvider(provider);
145 form.setLanguage(language);
146 }
147 }
148
149 public void startPrefixMapping(String prefix, String uri) throws SAXException {
150 }
151
152 private boolean isXmlFormField(String fieldName) {
153 boolean isXmlFormField = false;
154 for (int i=0; i<XML_FORM_FIELD_NAMES.length; i++) {
155 String n = XML_FORM_FIELD_NAMES[i];
156 if (fieldName.toLowerCase().equals(n)) {
157 isXmlFormField = true;
158 break;
159 }
160 }
161 return isXmlFormField;
162 }
163
164 private void write(String outStr) throws SAXException {
165 try {
166 byte[] bytes = outStr.getBytes("utf-8");
167 out.write(bytes, 0, bytes.length);
168 out.flush();
169 } catch (IOException e) {
170 throw new SAXException(e);
171 }
172 }
173
174 private void write(Form form) throws SAXException {
175 try {
176 String xmlFormStr = form.getXmlString();
177 byte[] bytes = xmlFormStr.getBytes("utf-8");
178 out.write(bytes, 0, bytes.length);
179 out.flush();
180 } catch (IOException e) {
181 throw new SAXException(e);
182 }
183 }
184
185 private Form transcodeFromBetaCode2Unicode(Form form) throws ApplicationException {
186 String formName = form.getFormName();
187 String lemmaName = form.getLemmaName();
188 Transcoder transcoder = Transcoder.getInstance();
189 String encodedUnicodeForm = transcoder.transcodeFromBetaCode2Unicode(formName);
190 String encodedUnicodeLemma = transcoder.transcodeFromBetaCode2Unicode(lemmaName);
191 form.setFormName(encodedUnicodeForm);
192 form.setLemmaName(encodedUnicodeLemma);
193 return form;
194 }
195
196 private Form transcodeFromBuckwalter2Unicode(Form form) throws ApplicationException {
197 String formName = form.getFormName();
198 String lemmaName = form.getLemmaName();
199 Transcoder transcoder = Transcoder.getInstance();
200 String encodedUnicodeForm = transcoder.transcodeFromBuckwalter2Unicode(formName);
201 String encodedUnicodeLemma = transcoder.transcodeFromBuckwalter2Unicode(lemmaName);
202 form.setFormName(encodedUnicodeForm);
203 form.setLemmaName(encodedUnicodeLemma);
204 return form;
205 }
206
207 private class Element {
208 String name;
209 String value;
210
211 Element(String name) {
212 this.name = name;
213 }
214
215 Element(String name, String value) {
216 this.name = name;
217 this.value = value;
218 }
219 }
220 }