31
|
1 /**
|
|
2 *
|
|
3 */
|
|
4 package org.mpi.openmind.repository.utils;
|
|
5
|
|
6 import java.io.InputStream;
|
|
7 import java.util.ArrayList;
|
|
8 import java.util.HashMap;
|
|
9 import java.util.Iterator;
|
|
10 import java.util.List;
|
|
11 import java.util.Map;
|
|
12
|
|
13 import javax.xml.namespace.QName;
|
|
14 import javax.xml.stream.XMLEventReader;
|
|
15 import javax.xml.stream.XMLInputFactory;
|
|
16 import javax.xml.stream.XMLStreamException;
|
|
17 import javax.xml.stream.events.Attribute;
|
|
18 import javax.xml.stream.events.Characters;
|
|
19 import javax.xml.stream.events.EndElement;
|
|
20 import javax.xml.stream.events.StartElement;
|
|
21 import javax.xml.stream.events.XMLEvent;
|
|
22
|
|
23 import org.apache.log4j.Logger;
|
|
24
|
|
25 /**
|
|
26 * Class that reads an OM4 XML dump into lists of simple objects.
|
|
27 *
|
|
28 * The constructor takes an InputStream.
|
|
29 *
|
|
30 * The read() method reads the contents of the file into the members
|
|
31 * .entities and .relations.
|
|
32 *
|
32
|
33 * The contents are Lists of OmXmlEntities and OmXmlRelations holding
|
|
34 * Lists of omXmlAttributes.
|
|
35 *
|
|
36 * This implementation uses XMLEventReader.
|
31
|
37 *
|
|
38 * @author casties
|
|
39 *
|
|
40 */
|
|
41 public class OM4XmlEventReader {
|
|
42
|
|
43 private static Logger logger = Logger.getLogger(OM4XmlEventReader.class);
|
|
44
|
|
45 public OM4XmlEventReader(InputStream xmlStream) {
|
|
46 super();
|
|
47 this.xmlStream = xmlStream;
|
|
48 }
|
|
49
|
|
50 InputStream xmlStream;
|
|
51
|
|
52 public int numEntities;
|
32
|
53 public List<OmXmlEntity> entities;
|
31
|
54 private int entCnt = 0;
|
|
55
|
|
56 public int numRelations;
|
32
|
57 public List<OmXmlRelation> relations;
|
31
|
58 private int relCnt = 0;
|
|
59
|
|
60 /**
|
|
61 * Simple class holding the representation of an OpenMind Attribute from XML.
|
|
62 *
|
|
63 * @author casties
|
|
64 */
|
32
|
65 public class OmXmlAttribute {
|
31
|
66 public Map<String, String> xmlAtts;
|
|
67 public String value;
|
|
68
|
|
69 public String getId() {
|
|
70 return xmlAtts.get("id");
|
|
71 }
|
|
72 }
|
|
73
|
|
74 /**
|
|
75 * Simple class holding the representation of an OpenMind Entity from XML.
|
|
76 *
|
|
77 * @author casties
|
|
78 */
|
32
|
79 public class OmXmlEntity {
|
31
|
80 public Map<String, String> xmlAtts;
|
|
81 public String value;
|
32
|
82 public List<OmXmlAttribute> attributes;
|
31
|
83
|
|
84 public String getId() {
|
|
85 return xmlAtts.get("id");
|
|
86 }
|
|
87 }
|
|
88
|
|
89 /**
|
|
90 * Simple class holding the representation of an OpenMind Relation from XML.
|
|
91 *
|
|
92 * @author casties
|
|
93 */
|
32
|
94 public class OmXmlRelation {
|
31
|
95 public Map<String, String> xmlAtts;
|
|
96 public String value;
|
32
|
97 public List<OmXmlAttribute> attributes;
|
31
|
98
|
|
99 public String getId() {
|
|
100 return xmlAtts.get("id");
|
|
101 }
|
|
102 }
|
|
103
|
|
104 /**
|
|
105 * Reads the XML from xmlStream and populates entities and relations.
|
|
106 *
|
|
107 * @throws XMLStreamException
|
|
108 */
|
|
109 public void read() throws XMLStreamException {
|
|
110 XMLInputFactory inputFactory = XMLInputFactory.newInstance();
|
|
111 XMLEventReader reader = inputFactory.createXMLEventReader(xmlStream, "UTF-8");
|
|
112 try {
|
|
113 while (reader.hasNext()) {
|
|
114 XMLEvent e = reader.nextEvent();
|
|
115 if (e.isStartDocument()) {
|
|
116 continue;
|
|
117 } else if (e.isStartElement()) {
|
|
118 StartElement es = e.asStartElement();
|
|
119 String lname = es.getName().getLocalPart();
|
|
120 if (lname == XMLUtil.ENTITIES) {
|
|
121 entities = processEntities(es, reader);
|
|
122 } else if (lname == XMLUtil.RELATIONS) {
|
|
123 relations = processRelations(es, reader);
|
|
124 }
|
|
125 }
|
|
126 }
|
|
127 } finally {
|
|
128 reader.close();
|
|
129 }
|
|
130 }
|
|
131
|
|
132 /**
|
|
133 * Process the entities tag and its contents.
|
|
134 *
|
|
135 * @param elem
|
|
136 * @param reader
|
|
137 * @return
|
|
138 * @throws XMLStreamException
|
|
139 */
|
32
|
140 private List<OmXmlEntity> processEntities(StartElement elem, XMLEventReader reader) throws XMLStreamException {
|
31
|
141 logger.debug("loading entities...");
|
|
142 // get number attribute
|
|
143 Attribute numa = elem.getAttributeByName(new QName("number"));
|
|
144 if (numa != null) {
|
|
145 numEntities = Integer.parseInt(numa.getValue());
|
|
146 }
|
|
147 // start reading sub-elements
|
32
|
148 List<OmXmlEntity> entities = new ArrayList<OmXmlEntity>();
|
31
|
149 while (reader.hasNext()) {
|
|
150 XMLEvent e = reader.nextEvent();
|
|
151 if (e.isStartElement()) {
|
|
152 // start of next element
|
|
153 StartElement es = e.asStartElement();
|
|
154 String lname = es.getName().getLocalPart();
|
|
155 if (lname == XMLUtil.ENTITY) {
|
|
156 // process entity tag
|
|
157 entities.add(processEntity(es, reader));
|
|
158 }
|
|
159 } else if (e.isEndElement()) {
|
|
160 EndElement ee = e.asEndElement();
|
|
161 if (ee.getName().getLocalPart().equals(XMLUtil.ENTITIES)) {
|
|
162 // end of this element
|
|
163 break;
|
|
164 } else {
|
|
165 logger.warn("Unexpected EndElement: "+ee);
|
|
166 }
|
|
167 }
|
|
168 }
|
|
169 return entities;
|
|
170 }
|
|
171
|
|
172 /**
|
|
173 * Process the entity tag and its contents.
|
|
174 *
|
|
175 * @param elem
|
|
176 * @param reader
|
|
177 * @return
|
|
178 * @throws XMLStreamException
|
|
179 */
|
32
|
180 private OmXmlEntity processEntity(StartElement elem, XMLEventReader reader) throws XMLStreamException {
|
31
|
181 //logger.debug("entity");
|
32
|
182 OmXmlEntity ent = new OmXmlEntity();
|
31
|
183 Map<String, String> xmlAtts = new HashMap<String, String>();
|
|
184 @SuppressWarnings("unchecked")
|
|
185 Iterator<Attribute> atts = elem.getAttributes();
|
|
186 while (atts.hasNext()) {
|
|
187 Attribute att = atts.next();
|
|
188 xmlAtts.put(att.getName().getLocalPart(), att.getValue());
|
|
189 }
|
|
190 ent.xmlAtts = xmlAtts;
|
|
191 // start reading sub-elements
|
32
|
192 ent.attributes = new ArrayList<OmXmlAttribute>();
|
31
|
193 while (reader.hasNext()) {
|
|
194 XMLEvent e = reader.nextEvent();
|
|
195 if (e.isStartElement()) {
|
|
196 // start of next element
|
|
197 StartElement es = e.asStartElement();
|
|
198 String lname = es.getName().getLocalPart();
|
|
199 if (lname == XMLUtil.ATTRIBUTES) {
|
|
200 // ignore attributes tag
|
|
201 continue;
|
|
202 }
|
|
203 if (lname == XMLUtil.ATTRIBUTE) {
|
|
204 // process attribute tag
|
|
205 ent.attributes.add(processAttribute(es, reader));
|
|
206 }
|
|
207 } else if (e.isCharacters()) {
|
|
208 // text content
|
|
209 Characters ec = e.asCharacters();
|
|
210 if (ent.value == null) {
|
|
211 ent.value = ec.getData();
|
|
212 } else {
|
|
213 ent.value += ec.getData();
|
|
214 }
|
|
215 } else if (e.isEndElement()) {
|
|
216 EndElement ee = e.asEndElement();
|
|
217 if (ee.getName().getLocalPart().equals(XMLUtil.ENTITY)) {
|
|
218 // end of this element
|
|
219 break;
|
|
220 }
|
|
221 }
|
|
222 }
|
|
223 if (++entCnt % 500 == 0) {
|
|
224 logger.debug(""+entCnt+" entities read...");
|
|
225 }
|
|
226 return ent;
|
|
227 }
|
|
228
|
|
229 /**
|
|
230 * Process the relations tag and its contents.
|
|
231 *
|
|
232 * @param elem
|
|
233 * @param reader
|
|
234 * @return
|
|
235 * @throws XMLStreamException
|
|
236 */
|
32
|
237 private List<OmXmlRelation> processRelations(StartElement elem, XMLEventReader reader) throws XMLStreamException {
|
31
|
238 logger.debug("loading relations...");
|
|
239 // get number attribute
|
|
240 Attribute numa = elem.getAttributeByName(new QName("number"));
|
|
241 if (numa != null) {
|
|
242 numRelations = Integer.parseInt(numa.getValue());
|
|
243 }
|
|
244 // start reading sub-elements
|
32
|
245 List<OmXmlRelation> rels = new ArrayList<OmXmlRelation>();
|
31
|
246 while (reader.hasNext()) {
|
|
247 XMLEvent e = reader.nextEvent();
|
|
248 if (e.isStartElement()) {
|
|
249 // start of next element
|
|
250 StartElement es = e.asStartElement();
|
|
251 String lname = es.getName().getLocalPart();
|
|
252 if (lname == XMLUtil.RELATION) {
|
|
253 // process entity tag
|
|
254 rels.add(processRelation(es, reader));
|
|
255 }
|
|
256 } else if (e.isEndElement()) {
|
|
257 EndElement ee = e.asEndElement();
|
|
258 if (ee.getName().getLocalPart().equals(XMLUtil.RELATIONS)) {
|
|
259 // end of this element
|
|
260 break;
|
|
261 } else {
|
|
262 logger.warn("Unexpected EndElement: "+ee);
|
|
263 }
|
|
264 }
|
|
265 }
|
|
266 return rels;
|
|
267 }
|
|
268
|
|
269
|
|
270 /**
|
|
271 * Process the relation tag and its contents.
|
|
272 *
|
|
273 * @param elem
|
|
274 * @param reader
|
|
275 * @return
|
|
276 * @throws XMLStreamException
|
|
277 */
|
32
|
278 private OmXmlRelation processRelation(StartElement elem, XMLEventReader reader) throws XMLStreamException {
|
31
|
279 //logger.debug("relation");
|
32
|
280 OmXmlRelation rel = new OmXmlRelation();
|
31
|
281 Map<String, String> xmlAtts = new HashMap<String, String>();
|
|
282 @SuppressWarnings("unchecked")
|
|
283 Iterator<Attribute> atts = elem.getAttributes();
|
|
284 while (atts.hasNext()) {
|
|
285 Attribute att = atts.next();
|
|
286 xmlAtts.put(att.getName().getLocalPart(), att.getValue());
|
|
287 }
|
|
288 rel.xmlAtts = xmlAtts;
|
|
289 // start reading sub-elements
|
32
|
290 rel.attributes = new ArrayList<OmXmlAttribute>();
|
31
|
291 while (reader.hasNext()) {
|
|
292 XMLEvent e = reader.nextEvent();
|
|
293 if (e.isStartElement()) {
|
|
294 // start of next element
|
|
295 StartElement es = e.asStartElement();
|
|
296 String lname = es.getName().getLocalPart();
|
|
297 if (lname == XMLUtil.ATTRIBUTES) {
|
|
298 // ignore attributes tag
|
|
299 continue;
|
|
300 }
|
|
301 if (lname == XMLUtil.ATTRIBUTE) {
|
|
302 // process attribute tag
|
|
303 rel.attributes.add(processAttribute(es, reader));
|
|
304 }
|
|
305 } else if (e.isCharacters()) {
|
|
306 // text content
|
|
307 Characters ec = e.asCharacters();
|
|
308 if (rel.value == null) {
|
|
309 rel.value = ec.getData();
|
|
310 } else {
|
|
311 rel.value += ec.getData();
|
|
312 }
|
|
313 } else if (e.isEndElement()) {
|
|
314 EndElement ee = e.asEndElement();
|
|
315 if (ee.getName().getLocalPart().equals(XMLUtil.RELATION)) {
|
|
316 // end of this element
|
|
317 break;
|
|
318 }
|
|
319 }
|
|
320 }
|
|
321 if (++relCnt % 100 == 0) {
|
|
322 logger.debug(""+relCnt+" relations read...");
|
|
323 }
|
|
324 return rel;
|
|
325 }
|
|
326
|
|
327 /**
|
|
328 * Process the attribute tag and its contents.
|
|
329 *
|
|
330 * @param elem
|
|
331 * @param reader
|
|
332 * @return
|
|
333 * @throws XMLStreamException
|
|
334 */
|
32
|
335 private OmXmlAttribute processAttribute(StartElement elem, XMLEventReader reader) throws XMLStreamException {
|
31
|
336 //logger.debug("attribute");
|
32
|
337 OmXmlAttribute oma = new OmXmlAttribute();
|
31
|
338 Map<String, String> xmlAtts = new HashMap<String, String>();
|
|
339 @SuppressWarnings("unchecked")
|
|
340 Iterator<Attribute> atts = elem.getAttributes();
|
|
341 while (atts.hasNext()) {
|
|
342 Attribute att = atts.next();
|
|
343 xmlAtts.put(att.getName().getLocalPart(), att.getValue());
|
|
344 }
|
|
345 oma.xmlAtts = xmlAtts;
|
|
346 // start reading sub-elements
|
|
347 while (reader.hasNext()) {
|
|
348 XMLEvent e = reader.nextEvent();
|
|
349 if (e.isCharacters()) {
|
|
350 // text content
|
|
351 Characters ec = e.asCharacters();
|
|
352 if (oma.value == null) {
|
|
353 oma.value = ec.getData();
|
|
354 } else {
|
|
355 oma.value += ec.getData();
|
|
356 }
|
|
357 } else if (e.isEndElement()) {
|
|
358 EndElement ee = e.asEndElement();
|
|
359 if (ee.getName().getLocalPart().equals(XMLUtil.ATTRIBUTE)) {
|
|
360 // end of this element
|
|
361 break;
|
|
362 } else {
|
|
363 logger.warn("Unexpected EndElement: "+ee);
|
|
364 }
|
|
365 }
|
|
366 }
|
|
367 return oma;
|
|
368 }
|
|
369
|
|
370 }
|