Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/converter/PerseusContentHandler.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children | fba5577e49d9 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:408254cf2f1d |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.lt.morph.converter; | |
2 | |
3 import java.io.BufferedOutputStream; | |
4 import java.io.File; | |
5 import java.io.FileNotFoundException; | |
6 import java.io.FileOutputStream; | |
7 import java.io.IOException; | |
8 import java.io.OutputStream; | |
9 import java.util.Hashtable; | |
10 | |
11 import org.xml.sax.*; | |
12 | |
13 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
14 import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder; | |
15 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; | |
16 | |
17 public class PerseusContentHandler implements ContentHandler { | |
18 private static String[] XML_FORM_FIELD_NAMES = {"form", "lemma", "pos", "tense", "voice", "case", "number", "mood", "person", "gender", "definite"}; | |
19 private Hashtable<String, Hashtable<String, Form>> forms; | |
20 private File outputFile; | |
21 private String provider; | |
22 private String language; | |
23 private OutputStream out; | |
24 private Element currentElement; | |
25 private Form form; | |
26 | |
27 public PerseusContentHandler(String provider, String language, String outputFileName) throws ApplicationException { | |
28 this.outputFile = new File(outputFileName); | |
29 this.provider = provider; | |
30 this.language = language; | |
31 } | |
32 | |
33 public Hashtable<String, Hashtable<String, Form>> getForms() { | |
34 return forms; | |
35 } | |
36 | |
37 public void startDocument() throws SAXException { | |
38 try { | |
39 out = new BufferedOutputStream(new FileOutputStream(outputFile)); | |
40 forms = new Hashtable<String, Hashtable<String, Form>>(); | |
41 } catch (FileNotFoundException e) { | |
42 throw new SAXException(e); | |
43 } | |
44 write("<forms>\n"); | |
45 } | |
46 | |
47 public void endDocument() throws SAXException { | |
48 write("</forms>\n"); | |
49 try { | |
50 if (out != null) | |
51 out.close(); | |
52 } catch (Exception e) { | |
53 // nothing: always close the stream at the end of the method | |
54 } | |
55 } | |
56 | |
57 public void characters(char[] c, int start, int length) throws SAXException { | |
58 if (currentElement != null) { | |
59 String elemName = currentElement.name; | |
60 if (form != null && isXmlFormField(elemName)) { | |
61 char[] cCopy = new char[length]; | |
62 System.arraycopy(c, start, cCopy, 0, length); | |
63 String charactersStr = String.valueOf(cCopy); | |
64 if (charactersStr != null && ! (charactersStr.trim().equals(""))) { | |
65 if (elemName.equals("form")) | |
66 form.addFormName(charactersStr); | |
67 else if (elemName.equals("lemma")) | |
68 form.addLemmaName(charactersStr); | |
69 else if (elemName.equals("pos")) | |
70 form.addPos(charactersStr); | |
71 else if (elemName.equals("tense")) | |
72 form.addTense(charactersStr); | |
73 else if (elemName.equals("voice")) | |
74 form.addVoice(charactersStr); | |
75 else if (elemName.equals("case")) | |
76 form.addCasus(charactersStr); | |
77 else if (elemName.equals("number")) | |
78 form.addNumber(charactersStr); | |
79 else if (elemName.equals("mood")) | |
80 form.addMood(charactersStr); | |
81 else if (elemName.equals("person")) | |
82 form.addPerson(charactersStr); | |
83 else if (elemName.equals("gender")) | |
84 form.addGender(charactersStr); | |
85 else if (elemName.equals("definite")) | |
86 form.addDefinite(charactersStr); | |
87 } | |
88 } | |
89 } | |
90 } | |
91 | |
92 public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { | |
93 } | |
94 | |
95 public void processingInstruction(String target, String data) throws SAXException { | |
96 } | |
97 | |
98 public void setDocumentLocator(org.xml.sax.Locator arg1) { | |
99 } | |
100 | |
101 public void endPrefixMapping(String prefix) throws SAXException { | |
102 } | |
103 | |
104 public void skippedEntity(String name) throws SAXException { | |
105 } | |
106 | |
107 public void endElement(String uri, String localName, String name) throws SAXException { | |
108 currentElement = null; | |
109 try { | |
110 if (name.equals("analysis")) { | |
111 if (form.isGreek()) | |
112 form = transcodeFromBetaCode2Unicode(form); | |
113 else if (form.isArabic()) | |
114 form = transcodeFromBuckwalter2Unicode(form); | |
115 form.normalize(); | |
116 if (form.isOk()) { | |
117 String formName = form.getFormName(); | |
118 String lemmaName = form.getLemmaName(); | |
119 Hashtable<String, Form> formLemmas = forms.get(formName); | |
120 if (formLemmas == null) { | |
121 formLemmas = new Hashtable<String, Form>(); | |
122 formLemmas.put(lemmaName, form); | |
123 forms.put(formName, formLemmas); | |
124 write(form); | |
125 } else { | |
126 Form formLemma = formLemmas.get(lemmaName); | |
127 if (formLemma == null) { | |
128 formLemmas.put(lemmaName, form); | |
129 write(form); | |
130 } | |
131 } | |
132 } | |
133 form = null; | |
134 } | |
135 } catch (ApplicationException e) { | |
136 throw new SAXException(e); | |
137 } | |
138 } | |
139 | |
140 public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { | |
141 currentElement = new Element(name); | |
142 if (name.equals("analysis")) { | |
143 form = new Form(); | |
144 form.setProvider(provider); | |
145 form.setLanguage(language); | |
146 } | |
147 } | |
148 | |
149 public void startPrefixMapping(String prefix, String uri) throws SAXException { | |
150 } | |
151 | |
152 private boolean isXmlFormField(String fieldName) { | |
153 boolean isXmlFormField = false; | |
154 for (int i=0; i<XML_FORM_FIELD_NAMES.length; i++) { | |
155 String n = XML_FORM_FIELD_NAMES[i]; | |
156 if (fieldName.toLowerCase().equals(n)) { | |
157 isXmlFormField = true; | |
158 break; | |
159 } | |
160 } | |
161 return isXmlFormField; | |
162 } | |
163 | |
164 private void write(String outStr) throws SAXException { | |
165 try { | |
166 byte[] bytes = outStr.getBytes("utf-8"); | |
167 out.write(bytes, 0, bytes.length); | |
168 out.flush(); | |
169 } catch (IOException e) { | |
170 throw new SAXException(e); | |
171 } | |
172 } | |
173 | |
174 private void write(Form form) throws SAXException { | |
175 try { | |
176 String xmlFormStr = form.getXmlString(); | |
177 byte[] bytes = xmlFormStr.getBytes("utf-8"); | |
178 out.write(bytes, 0, bytes.length); | |
179 out.flush(); | |
180 } catch (IOException e) { | |
181 throw new SAXException(e); | |
182 } | |
183 } | |
184 | |
185 private Form transcodeFromBetaCode2Unicode(Form form) throws ApplicationException { | |
186 String formName = form.getFormName(); | |
187 String lemmaName = form.getLemmaName(); | |
188 Transcoder transcoder = Transcoder.getInstance(); | |
189 String encodedUnicodeForm = transcoder.transcodeFromBetaCode2Unicode(formName); | |
190 String encodedUnicodeLemma = transcoder.transcodeFromBetaCode2Unicode(lemmaName); | |
191 form.setFormName(encodedUnicodeForm); | |
192 form.setLemmaName(encodedUnicodeLemma); | |
193 return form; | |
194 } | |
195 | |
196 private Form transcodeFromBuckwalter2Unicode(Form form) throws ApplicationException { | |
197 String formName = form.getFormName(); | |
198 String lemmaName = form.getLemmaName(); | |
199 Transcoder transcoder = Transcoder.getInstance(); | |
200 String encodedUnicodeForm = transcoder.transcodeFromBuckwalter2Unicode(formName); | |
201 String encodedUnicodeLemma = transcoder.transcodeFromBuckwalter2Unicode(lemmaName); | |
202 form.setFormName(encodedUnicodeForm); | |
203 form.setLemmaName(encodedUnicodeLemma); | |
204 return form; | |
205 } | |
206 | |
207 private class Element { | |
208 String name; | |
209 String value; | |
210 | |
211 Element(String name) { | |
212 this.name = name; | |
213 } | |
214 | |
215 Element(String name, String value) { | |
216 this.name = name; | |
217 this.value = value; | |
218 } | |
219 } | |
220 } |