31
|
1 /**
|
|
2 *
|
|
3 */
|
|
4 package org.mpi.openmind.repository.utils;
|
|
5
|
|
6 import java.io.InputStream;
|
|
7 import java.util.ArrayList;
|
|
8 import java.util.HashMap;
|
|
9 import java.util.Iterator;
|
|
10 import java.util.List;
|
|
11 import java.util.Map;
|
|
12
|
|
13 import javax.xml.namespace.QName;
|
|
14 import javax.xml.stream.XMLEventReader;
|
|
15 import javax.xml.stream.XMLInputFactory;
|
|
16 import javax.xml.stream.XMLStreamException;
|
|
17 import javax.xml.stream.events.Attribute;
|
|
18 import javax.xml.stream.events.Characters;
|
|
19 import javax.xml.stream.events.EndElement;
|
|
20 import javax.xml.stream.events.StartElement;
|
|
21 import javax.xml.stream.events.XMLEvent;
|
|
22
|
|
23 import org.apache.log4j.Logger;
|
|
24
|
|
25 /**
|
|
26 * Class that reads an OM4 XML dump into lists of simple objects.
|
|
27 *
|
|
28 * The constructor takes an InputStream.
|
|
29 *
|
|
30 * The read() method reads the contents of the file into the members
|
|
31 * .entities and .relations.
|
|
32 *
|
|
33 * The contents are Lists of OmEntities and OmRelations also holding
|
|
34 * Lists of omAttributes.
|
|
35 *
|
|
36 * @author casties
|
|
37 *
|
|
38 */
|
|
39 public class OM4XmlEventReader {
|
|
40
|
|
/** Logger shared by all instances of this reader. */
private static Logger logger = Logger.getLogger(OM4XmlEventReader.class);

/** Stream the XML dump is read from; set once by the constructor. */
InputStream xmlStream;

/** Value of the "number" attribute on the entities tag, if one was present. */
public int numEntities;
/** Entities collected by read(); null until an entities tag has been processed. */
public List<OmEntity> entities;
/** Running count of entity tags processed, used for progress logging. */
private int entCnt = 0;

/** Value of the "number" attribute on the relations tag, if one was present. */
public int numRelations;
/** Relations collected by read(); null until a relations tag has been processed. */
public List<OmRelation> relations;
/** Running count of relation tags processed, used for progress logging. */
private int relCnt = 0;

/**
 * Creates a reader for the given XML stream. Nothing is parsed until
 * read() is called.
 *
 * @param xmlStream stream containing the OM4 XML dump
 */
public OM4XmlEventReader(InputStream xmlStream) {
    this.xmlStream = xmlStream;
}
|
|
57
|
|
58 /**
|
|
59 * Simple class holding the representation of an OpenMind Attribute from XML.
|
|
60 *
|
|
61 * @author casties
|
|
62 */
|
|
63 public class OmAttribute {
|
|
64 public Map<String, String> xmlAtts;
|
|
65 public String value;
|
|
66
|
|
67 public String getId() {
|
|
68 return xmlAtts.get("id");
|
|
69 }
|
|
70 }
|
|
71
|
|
72 /**
|
|
73 * Simple class holding the representation of an OpenMind Entity from XML.
|
|
74 *
|
|
75 * @author casties
|
|
76 */
|
|
77 public class OmEntity {
|
|
78 public Map<String, String> xmlAtts;
|
|
79 public String value;
|
|
80 public List<OmAttribute> attributes;
|
|
81
|
|
82 public String getId() {
|
|
83 return xmlAtts.get("id");
|
|
84 }
|
|
85 }
|
|
86
|
|
87 /**
|
|
88 * Simple class holding the representation of an OpenMind Relation from XML.
|
|
89 *
|
|
90 * @author casties
|
|
91 */
|
|
92 public class OmRelation {
|
|
93 public Map<String, String> xmlAtts;
|
|
94 public String value;
|
|
95 public List<OmAttribute> attributes;
|
|
96
|
|
97 public String getId() {
|
|
98 return xmlAtts.get("id");
|
|
99 }
|
|
100 }
|
|
101
|
|
102 /**
|
|
103 * Reads the XML from xmlStream and populates entities and relations.
|
|
104 *
|
|
105 * @throws XMLStreamException
|
|
106 */
|
|
107 public void read() throws XMLStreamException {
|
|
108 XMLInputFactory inputFactory = XMLInputFactory.newInstance();
|
|
109 XMLEventReader reader = inputFactory.createXMLEventReader(xmlStream, "UTF-8");
|
|
110 try {
|
|
111 while (reader.hasNext()) {
|
|
112 XMLEvent e = reader.nextEvent();
|
|
113 if (e.isStartDocument()) {
|
|
114 continue;
|
|
115 } else if (e.isStartElement()) {
|
|
116 StartElement es = e.asStartElement();
|
|
117 String lname = es.getName().getLocalPart();
|
|
118 if (lname == XMLUtil.ENTITIES) {
|
|
119 entities = processEntities(es, reader);
|
|
120 } else if (lname == XMLUtil.RELATIONS) {
|
|
121 relations = processRelations(es, reader);
|
|
122 }
|
|
123 }
|
|
124 }
|
|
125 } finally {
|
|
126 reader.close();
|
|
127 }
|
|
128 }
|
|
129
|
|
130 /**
|
|
131 * Process the entities tag and its contents.
|
|
132 *
|
|
133 * @param elem
|
|
134 * @param reader
|
|
135 * @return
|
|
136 * @throws XMLStreamException
|
|
137 */
|
|
138 private List<OmEntity> processEntities(StartElement elem, XMLEventReader reader) throws XMLStreamException {
|
|
139 logger.debug("loading entities...");
|
|
140 // get number attribute
|
|
141 Attribute numa = elem.getAttributeByName(new QName("number"));
|
|
142 if (numa != null) {
|
|
143 numEntities = Integer.parseInt(numa.getValue());
|
|
144 }
|
|
145 // start reading sub-elements
|
|
146 List<OmEntity> entities = new ArrayList<OmEntity>();
|
|
147 while (reader.hasNext()) {
|
|
148 XMLEvent e = reader.nextEvent();
|
|
149 if (e.isStartElement()) {
|
|
150 // start of next element
|
|
151 StartElement es = e.asStartElement();
|
|
152 String lname = es.getName().getLocalPart();
|
|
153 if (lname == XMLUtil.ENTITY) {
|
|
154 // process entity tag
|
|
155 entities.add(processEntity(es, reader));
|
|
156 }
|
|
157 } else if (e.isEndElement()) {
|
|
158 EndElement ee = e.asEndElement();
|
|
159 if (ee.getName().getLocalPart().equals(XMLUtil.ENTITIES)) {
|
|
160 // end of this element
|
|
161 break;
|
|
162 } else {
|
|
163 logger.warn("Unexpected EndElement: "+ee);
|
|
164 }
|
|
165 }
|
|
166 }
|
|
167 return entities;
|
|
168 }
|
|
169
|
|
170 /**
|
|
171 * Process the entity tag and its contents.
|
|
172 *
|
|
173 * @param elem
|
|
174 * @param reader
|
|
175 * @return
|
|
176 * @throws XMLStreamException
|
|
177 */
|
|
178 private OmEntity processEntity(StartElement elem, XMLEventReader reader) throws XMLStreamException {
|
|
179 //logger.debug("entity");
|
|
180 OmEntity ent = new OmEntity();
|
|
181 Map<String, String> xmlAtts = new HashMap<String, String>();
|
|
182 @SuppressWarnings("unchecked")
|
|
183 Iterator<Attribute> atts = elem.getAttributes();
|
|
184 while (atts.hasNext()) {
|
|
185 Attribute att = atts.next();
|
|
186 xmlAtts.put(att.getName().getLocalPart(), att.getValue());
|
|
187 }
|
|
188 ent.xmlAtts = xmlAtts;
|
|
189 // start reading sub-elements
|
|
190 ent.attributes = new ArrayList<OmAttribute>();
|
|
191 while (reader.hasNext()) {
|
|
192 XMLEvent e = reader.nextEvent();
|
|
193 if (e.isStartElement()) {
|
|
194 // start of next element
|
|
195 StartElement es = e.asStartElement();
|
|
196 String lname = es.getName().getLocalPart();
|
|
197 if (lname == XMLUtil.ATTRIBUTES) {
|
|
198 // ignore attributes tag
|
|
199 continue;
|
|
200 }
|
|
201 if (lname == XMLUtil.ATTRIBUTE) {
|
|
202 // process attribute tag
|
|
203 ent.attributes.add(processAttribute(es, reader));
|
|
204 }
|
|
205 } else if (e.isCharacters()) {
|
|
206 // text content
|
|
207 Characters ec = e.asCharacters();
|
|
208 if (ent.value == null) {
|
|
209 ent.value = ec.getData();
|
|
210 } else {
|
|
211 ent.value += ec.getData();
|
|
212 }
|
|
213 } else if (e.isEndElement()) {
|
|
214 EndElement ee = e.asEndElement();
|
|
215 if (ee.getName().getLocalPart().equals(XMLUtil.ENTITY)) {
|
|
216 // end of this element
|
|
217 break;
|
|
218 }
|
|
219 }
|
|
220 }
|
|
221 if (++entCnt % 500 == 0) {
|
|
222 logger.debug(""+entCnt+" entities read...");
|
|
223 }
|
|
224 return ent;
|
|
225 }
|
|
226
|
|
227 /**
|
|
228 * Process the relations tag and its contents.
|
|
229 *
|
|
230 * @param elem
|
|
231 * @param reader
|
|
232 * @return
|
|
233 * @throws XMLStreamException
|
|
234 */
|
|
235 private List<OmRelation> processRelations(StartElement elem, XMLEventReader reader) throws XMLStreamException {
|
|
236 logger.debug("loading relations...");
|
|
237 // get number attribute
|
|
238 Attribute numa = elem.getAttributeByName(new QName("number"));
|
|
239 if (numa != null) {
|
|
240 numRelations = Integer.parseInt(numa.getValue());
|
|
241 }
|
|
242 // start reading sub-elements
|
|
243 List<OmRelation> rels = new ArrayList<OmRelation>();
|
|
244 while (reader.hasNext()) {
|
|
245 XMLEvent e = reader.nextEvent();
|
|
246 if (e.isStartElement()) {
|
|
247 // start of next element
|
|
248 StartElement es = e.asStartElement();
|
|
249 String lname = es.getName().getLocalPart();
|
|
250 if (lname == XMLUtil.RELATION) {
|
|
251 // process entity tag
|
|
252 rels.add(processRelation(es, reader));
|
|
253 }
|
|
254 } else if (e.isEndElement()) {
|
|
255 EndElement ee = e.asEndElement();
|
|
256 if (ee.getName().getLocalPart().equals(XMLUtil.RELATIONS)) {
|
|
257 // end of this element
|
|
258 break;
|
|
259 } else {
|
|
260 logger.warn("Unexpected EndElement: "+ee);
|
|
261 }
|
|
262 }
|
|
263 }
|
|
264 return rels;
|
|
265 }
|
|
266
|
|
267
|
|
268 /**
|
|
269 * Process the relation tag and its contents.
|
|
270 *
|
|
271 * @param elem
|
|
272 * @param reader
|
|
273 * @return
|
|
274 * @throws XMLStreamException
|
|
275 */
|
|
276 private OmRelation processRelation(StartElement elem, XMLEventReader reader) throws XMLStreamException {
|
|
277 //logger.debug("relation");
|
|
278 OmRelation rel = new OmRelation();
|
|
279 Map<String, String> xmlAtts = new HashMap<String, String>();
|
|
280 @SuppressWarnings("unchecked")
|
|
281 Iterator<Attribute> atts = elem.getAttributes();
|
|
282 while (atts.hasNext()) {
|
|
283 Attribute att = atts.next();
|
|
284 xmlAtts.put(att.getName().getLocalPart(), att.getValue());
|
|
285 }
|
|
286 rel.xmlAtts = xmlAtts;
|
|
287 // start reading sub-elements
|
|
288 rel.attributes = new ArrayList<OmAttribute>();
|
|
289 while (reader.hasNext()) {
|
|
290 XMLEvent e = reader.nextEvent();
|
|
291 if (e.isStartElement()) {
|
|
292 // start of next element
|
|
293 StartElement es = e.asStartElement();
|
|
294 String lname = es.getName().getLocalPart();
|
|
295 if (lname == XMLUtil.ATTRIBUTES) {
|
|
296 // ignore attributes tag
|
|
297 continue;
|
|
298 }
|
|
299 if (lname == XMLUtil.ATTRIBUTE) {
|
|
300 // process attribute tag
|
|
301 rel.attributes.add(processAttribute(es, reader));
|
|
302 }
|
|
303 } else if (e.isCharacters()) {
|
|
304 // text content
|
|
305 Characters ec = e.asCharacters();
|
|
306 if (rel.value == null) {
|
|
307 rel.value = ec.getData();
|
|
308 } else {
|
|
309 rel.value += ec.getData();
|
|
310 }
|
|
311 } else if (e.isEndElement()) {
|
|
312 EndElement ee = e.asEndElement();
|
|
313 if (ee.getName().getLocalPart().equals(XMLUtil.RELATION)) {
|
|
314 // end of this element
|
|
315 break;
|
|
316 }
|
|
317 }
|
|
318 }
|
|
319 if (++relCnt % 100 == 0) {
|
|
320 logger.debug(""+relCnt+" relations read...");
|
|
321 }
|
|
322 return rel;
|
|
323 }
|
|
324
|
|
325 /**
|
|
326 * Process the attribute tag and its contents.
|
|
327 *
|
|
328 * @param elem
|
|
329 * @param reader
|
|
330 * @return
|
|
331 * @throws XMLStreamException
|
|
332 */
|
|
333 private OmAttribute processAttribute(StartElement elem, XMLEventReader reader) throws XMLStreamException {
|
|
334 //logger.debug("attribute");
|
|
335 OmAttribute oma = new OmAttribute();
|
|
336 Map<String, String> xmlAtts = new HashMap<String, String>();
|
|
337 @SuppressWarnings("unchecked")
|
|
338 Iterator<Attribute> atts = elem.getAttributes();
|
|
339 while (atts.hasNext()) {
|
|
340 Attribute att = atts.next();
|
|
341 xmlAtts.put(att.getName().getLocalPart(), att.getValue());
|
|
342 }
|
|
343 oma.xmlAtts = xmlAtts;
|
|
344 // start reading sub-elements
|
|
345 while (reader.hasNext()) {
|
|
346 XMLEvent e = reader.nextEvent();
|
|
347 if (e.isCharacters()) {
|
|
348 // text content
|
|
349 Characters ec = e.asCharacters();
|
|
350 if (oma.value == null) {
|
|
351 oma.value = ec.getData();
|
|
352 } else {
|
|
353 oma.value += ec.getData();
|
|
354 }
|
|
355 } else if (e.isEndElement()) {
|
|
356 EndElement ee = e.asEndElement();
|
|
357 if (ee.getName().getLocalPart().equals(XMLUtil.ATTRIBUTE)) {
|
|
358 // end of this element
|
|
359 break;
|
|
360 } else {
|
|
361 logger.warn("Unexpected EndElement: "+ee);
|
|
362 }
|
|
363 }
|
|
364 }
|
|
365 return oma;
|
|
366 }
|
|
367
|
|
368 }
|