Mercurial > hg > openmind
annotate src/main/java/org/mpi/openmind/repository/utils/OM4XmlEventReader.java @ 89:8adfa8679991
new implementation of translit-to-romanization rules in RomanizationLoc with test(!).
author | Robert Casties <casties@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 26 Feb 2018 14:39:49 +0100 |
parents | 90f9a1c45b15 |
children |
rev | line source |
---|---|
31 | 1 /** |
2 * | |
3 */ | |
4 package org.mpi.openmind.repository.utils; | |
5 | |
6 import java.io.InputStream; | |
7 import java.util.ArrayList; | |
8 import java.util.HashMap; | |
9 import java.util.Iterator; | |
10 import java.util.List; | |
11 import java.util.Map; | |
12 | |
13 import javax.xml.namespace.QName; | |
14 import javax.xml.stream.XMLEventReader; | |
15 import javax.xml.stream.XMLInputFactory; | |
16 import javax.xml.stream.XMLStreamException; | |
17 import javax.xml.stream.events.Attribute; | |
18 import javax.xml.stream.events.Characters; | |
19 import javax.xml.stream.events.EndElement; | |
20 import javax.xml.stream.events.StartElement; | |
21 import javax.xml.stream.events.XMLEvent; | |
22 | |
23 import org.apache.log4j.Logger; | |
24 | |
25 /** | |
26 * Class that reads an OM4 XML dump into lists of simple objects. | |
27 * | |
28 * The constructor takes an InputStream. | |
29 * | |
30 * The read() method reads the contents of the file into the members | |
31 * .entities and .relations. | |
32 * | |
32 | 33 * The contents are Lists of OmXmlEntities and OmXmlRelations holding |
34 * Lists of OmXmlAttributes. | |
35 * | |
36 * This implementation uses XMLEventReader. | |
31 | 37 * |
38 * @author casties | |
39 * | |
40 */ | |
41 public class OM4XmlEventReader { | |
42 | |
/** Log4j logger used by this class. */
private static Logger logger = Logger.getLogger(OM4XmlEventReader.class);

/** Stream the XML is read from when {@link #read()} is called. */
InputStream xmlStream;

/** Value of the "count" attribute on the entities tag; remains 0 if the attribute is absent. */
public int numEntities;
/** Entities collected by {@link #read()}; null until an entities tag has been processed. */
public List<OmXmlEntity> entities;
/** Number of entity tags processed so far; only used for progress logging. */
private int entCnt = 0;

/** Value of the "count" attribute on the relations tag; remains 0 if the attribute is absent. */
public int numRelations;
/** Relations collected by {@link #read()}; null until a relations tag has been processed. */
public List<OmXmlRelation> relations;
/** Number of relation tags processed so far; only used for progress logging. */
private int relCnt = 0;

/**
 * Creates a reader for the given stream. The stream is only stored here;
 * nothing is parsed until {@link #read()} is called.
 *
 * @param xmlStream stream containing the OM4 XML dump
 */
public OM4XmlEventReader(InputStream xmlStream) {
    // the implicit Object constructor call replaces the explicit super()
    this.xmlStream = xmlStream;
}
59 | |
60 /** | |
61 * Simple class holding the representation of an OpenMind Attribute from XML. | |
62 * | |
63 * @author casties | |
64 */ | |
32 | 65 public class OmXmlAttribute { |
31 | 66 public Map<String, String> xmlAtts; |
67 public String value; | |
68 | |
69 public String getId() { | |
70 return xmlAtts.get("id"); | |
71 } | |
72 } | |
73 | |
74 /** | |
75 * Simple class holding the representation of an OpenMind Entity from XML. | |
76 * | |
77 * @author casties | |
78 */ | |
32 | 79 public class OmXmlEntity { |
31 | 80 public Map<String, String> xmlAtts; |
81 public String value; | |
32 | 82 public List<OmXmlAttribute> attributes; |
31 | 83 |
84 public String getId() { | |
85 return xmlAtts.get("id"); | |
86 } | |
87 } | |
88 | |
89 /** | |
90 * Simple class holding the representation of an OpenMind Relation from XML. | |
91 * | |
92 * @author casties | |
93 */ | |
32 | 94 public class OmXmlRelation { |
31 | 95 public Map<String, String> xmlAtts; |
96 public String value; | |
32 | 97 public List<OmXmlAttribute> attributes; |
31 | 98 |
99 public String getId() { | |
100 return xmlAtts.get("id"); | |
101 } | |
102 } | |
103 | |
104 /** | |
105 * Reads the XML from xmlStream and populates entities and relations. | |
106 * | |
107 * @throws XMLStreamException | |
108 */ | |
109 public void read() throws XMLStreamException { | |
110 XMLInputFactory inputFactory = XMLInputFactory.newInstance(); | |
111 XMLEventReader reader = inputFactory.createXMLEventReader(xmlStream, "UTF-8"); | |
112 try { | |
113 while (reader.hasNext()) { | |
114 XMLEvent e = reader.nextEvent(); | |
115 if (e.isStartDocument()) { | |
116 continue; | |
117 } else if (e.isStartElement()) { | |
118 StartElement es = e.asStartElement(); | |
119 String lname = es.getName().getLocalPart(); | |
120 if (lname == XMLUtil.ENTITIES) { | |
121 entities = processEntities(es, reader); | |
122 } else if (lname == XMLUtil.RELATIONS) { | |
123 relations = processRelations(es, reader); | |
124 } | |
125 } | |
126 } | |
127 } finally { | |
128 reader.close(); | |
129 } | |
130 } | |
131 | |
132 /** | |
133 * Process the entities tag and its contents. | |
134 * | |
135 * @param elem | |
136 * @param reader | |
137 * @return | |
138 * @throws XMLStreamException | |
139 */ | |
32 | 140 private List<OmXmlEntity> processEntities(StartElement elem, XMLEventReader reader) throws XMLStreamException { |
31 | 141 logger.debug("loading entities..."); |
142 // get number attribute | |
82
90f9a1c45b15
small change to xml format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
32
diff
changeset
|
143 Attribute numa = elem.getAttributeByName(new QName("count")); |
31 | 144 if (numa != null) { |
145 numEntities = Integer.parseInt(numa.getValue()); | |
146 } | |
147 // start reading sub-elements | |
32 | 148 List<OmXmlEntity> entities = new ArrayList<OmXmlEntity>(); |
31 | 149 while (reader.hasNext()) { |
150 XMLEvent e = reader.nextEvent(); | |
151 if (e.isStartElement()) { | |
152 // start of next element | |
153 StartElement es = e.asStartElement(); | |
154 String lname = es.getName().getLocalPart(); | |
155 if (lname == XMLUtil.ENTITY) { | |
156 // process entity tag | |
157 entities.add(processEntity(es, reader)); | |
158 } | |
159 } else if (e.isEndElement()) { | |
160 EndElement ee = e.asEndElement(); | |
161 if (ee.getName().getLocalPart().equals(XMLUtil.ENTITIES)) { | |
162 // end of this element | |
163 break; | |
164 } else { | |
165 logger.warn("Unexpected EndElement: "+ee); | |
166 } | |
167 } | |
168 } | |
169 return entities; | |
170 } | |
171 | |
172 /** | |
173 * Process the entity tag and its contents. | |
174 * | |
175 * @param elem | |
176 * @param reader | |
177 * @return | |
178 * @throws XMLStreamException | |
179 */ | |
32 | 180 private OmXmlEntity processEntity(StartElement elem, XMLEventReader reader) throws XMLStreamException { |
31 | 181 //logger.debug("entity"); |
32 | 182 OmXmlEntity ent = new OmXmlEntity(); |
31 | 183 Map<String, String> xmlAtts = new HashMap<String, String>(); |
184 @SuppressWarnings("unchecked") | |
185 Iterator<Attribute> atts = elem.getAttributes(); | |
186 while (atts.hasNext()) { | |
187 Attribute att = atts.next(); | |
188 xmlAtts.put(att.getName().getLocalPart(), att.getValue()); | |
189 } | |
190 ent.xmlAtts = xmlAtts; | |
191 // start reading sub-elements | |
32 | 192 ent.attributes = new ArrayList<OmXmlAttribute>(); |
31 | 193 while (reader.hasNext()) { |
194 XMLEvent e = reader.nextEvent(); | |
195 if (e.isStartElement()) { | |
196 // start of next element | |
197 StartElement es = e.asStartElement(); | |
198 String lname = es.getName().getLocalPart(); | |
199 if (lname == XMLUtil.ATTRIBUTES) { | |
200 // ignore attributes tag | |
201 continue; | |
202 } | |
203 if (lname == XMLUtil.ATTRIBUTE) { | |
204 // process attribute tag | |
205 ent.attributes.add(processAttribute(es, reader)); | |
206 } | |
207 } else if (e.isCharacters()) { | |
208 // text content | |
209 Characters ec = e.asCharacters(); | |
210 if (ent.value == null) { | |
211 ent.value = ec.getData(); | |
212 } else { | |
213 ent.value += ec.getData(); | |
214 } | |
215 } else if (e.isEndElement()) { | |
216 EndElement ee = e.asEndElement(); | |
217 if (ee.getName().getLocalPart().equals(XMLUtil.ENTITY)) { | |
218 // end of this element | |
219 break; | |
220 } | |
221 } | |
222 } | |
223 if (++entCnt % 500 == 0) { | |
224 logger.debug(""+entCnt+" entities read..."); | |
225 } | |
226 return ent; | |
227 } | |
228 | |
229 /** | |
230 * Process the relations tag and its contents. | |
231 * | |
232 * @param elem | |
233 * @param reader | |
234 * @return | |
235 * @throws XMLStreamException | |
236 */ | |
32 | 237 private List<OmXmlRelation> processRelations(StartElement elem, XMLEventReader reader) throws XMLStreamException { |
31 | 238 logger.debug("loading relations..."); |
239 // get number attribute | |
82
90f9a1c45b15
small change to xml format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
32
diff
changeset
|
240 Attribute numa = elem.getAttributeByName(new QName("count")); |
31 | 241 if (numa != null) { |
242 numRelations = Integer.parseInt(numa.getValue()); | |
243 } | |
244 // start reading sub-elements | |
32 | 245 List<OmXmlRelation> rels = new ArrayList<OmXmlRelation>(); |
31 | 246 while (reader.hasNext()) { |
247 XMLEvent e = reader.nextEvent(); | |
248 if (e.isStartElement()) { | |
249 // start of next element | |
250 StartElement es = e.asStartElement(); | |
251 String lname = es.getName().getLocalPart(); | |
252 if (lname == XMLUtil.RELATION) { | |
253 // process entity tag | |
254 rels.add(processRelation(es, reader)); | |
255 } | |
256 } else if (e.isEndElement()) { | |
257 EndElement ee = e.asEndElement(); | |
258 if (ee.getName().getLocalPart().equals(XMLUtil.RELATIONS)) { | |
259 // end of this element | |
260 break; | |
261 } else { | |
262 logger.warn("Unexpected EndElement: "+ee); | |
263 } | |
264 } | |
265 } | |
266 return rels; | |
267 } | |
268 | |
269 | |
270 /** | |
271 * Process the relation tag and its contents. | |
272 * | |
273 * @param elem | |
274 * @param reader | |
275 * @return | |
276 * @throws XMLStreamException | |
277 */ | |
32 | 278 private OmXmlRelation processRelation(StartElement elem, XMLEventReader reader) throws XMLStreamException { |
31 | 279 //logger.debug("relation"); |
32 | 280 OmXmlRelation rel = new OmXmlRelation(); |
31 | 281 Map<String, String> xmlAtts = new HashMap<String, String>(); |
282 @SuppressWarnings("unchecked") | |
283 Iterator<Attribute> atts = elem.getAttributes(); | |
284 while (atts.hasNext()) { | |
285 Attribute att = atts.next(); | |
286 xmlAtts.put(att.getName().getLocalPart(), att.getValue()); | |
287 } | |
288 rel.xmlAtts = xmlAtts; | |
289 // start reading sub-elements | |
32 | 290 rel.attributes = new ArrayList<OmXmlAttribute>(); |
31 | 291 while (reader.hasNext()) { |
292 XMLEvent e = reader.nextEvent(); | |
293 if (e.isStartElement()) { | |
294 // start of next element | |
295 StartElement es = e.asStartElement(); | |
296 String lname = es.getName().getLocalPart(); | |
297 if (lname == XMLUtil.ATTRIBUTES) { | |
298 // ignore attributes tag | |
299 continue; | |
300 } | |
301 if (lname == XMLUtil.ATTRIBUTE) { | |
302 // process attribute tag | |
303 rel.attributes.add(processAttribute(es, reader)); | |
304 } | |
305 } else if (e.isCharacters()) { | |
306 // text content | |
307 Characters ec = e.asCharacters(); | |
308 if (rel.value == null) { | |
309 rel.value = ec.getData(); | |
310 } else { | |
311 rel.value += ec.getData(); | |
312 } | |
313 } else if (e.isEndElement()) { | |
314 EndElement ee = e.asEndElement(); | |
315 if (ee.getName().getLocalPart().equals(XMLUtil.RELATION)) { | |
316 // end of this element | |
317 break; | |
318 } | |
319 } | |
320 } | |
321 if (++relCnt % 100 == 0) { | |
322 logger.debug(""+relCnt+" relations read..."); | |
323 } | |
324 return rel; | |
325 } | |
326 | |
327 /** | |
328 * Process the attribute tag and its contents. | |
329 * | |
330 * @param elem | |
331 * @param reader | |
332 * @return | |
333 * @throws XMLStreamException | |
334 */ | |
32 | 335 private OmXmlAttribute processAttribute(StartElement elem, XMLEventReader reader) throws XMLStreamException { |
31 | 336 //logger.debug("attribute"); |
32 | 337 OmXmlAttribute oma = new OmXmlAttribute(); |
31 | 338 Map<String, String> xmlAtts = new HashMap<String, String>(); |
339 @SuppressWarnings("unchecked") | |
340 Iterator<Attribute> atts = elem.getAttributes(); | |
341 while (atts.hasNext()) { | |
342 Attribute att = atts.next(); | |
343 xmlAtts.put(att.getName().getLocalPart(), att.getValue()); | |
344 } | |
345 oma.xmlAtts = xmlAtts; | |
346 // start reading sub-elements | |
347 while (reader.hasNext()) { | |
348 XMLEvent e = reader.nextEvent(); | |
349 if (e.isCharacters()) { | |
350 // text content | |
351 Characters ec = e.asCharacters(); | |
352 if (oma.value == null) { | |
353 oma.value = ec.getData(); | |
354 } else { | |
355 oma.value += ec.getData(); | |
356 } | |
357 } else if (e.isEndElement()) { | |
358 EndElement ee = e.asEndElement(); | |
359 if (ee.getName().getLocalPart().equals(XMLUtil.ATTRIBUTE)) { | |
360 // end of this element | |
361 break; | |
362 } else { | |
363 logger.warn("Unexpected EndElement: "+ee); | |
364 } | |
365 } | |
366 } | |
367 return oma; | |
368 } | |
369 | |
370 } |