Mercurial > hg > openmind
view src/main/java/org/mpi/openmind/repository/utils/OM4XmlEventReader.java @ 32:9c54842f5e86
better names for XML importer sub-classes.
author | casties |
---|---|
date | Thu, 25 Aug 2016 11:29:47 +0200 |
parents | 7d8ebe8ac8a2 |
children | 90f9a1c45b15 |
line wrap: on
line source
/** * */ package org.mpi.openmind.repository.utils; import java.io.InputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import javax.xml.namespace.QName; import javax.xml.stream.XMLEventReader; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamException; import javax.xml.stream.events.Attribute; import javax.xml.stream.events.Characters; import javax.xml.stream.events.EndElement; import javax.xml.stream.events.StartElement; import javax.xml.stream.events.XMLEvent; import org.apache.log4j.Logger; /** * Class that reads an OM4 XML dump into lists of simple objects. * * The constructor takes an InputStream. * * The read() method reads the contents of the file into the members * .entities and .relations. * * The contents are Lists of OmXmlEntities and OmXmlRelations holding * Lists of omXmlAttributes. * * This implementation uses XMLEventReader. * * @author casties * */ public class OM4XmlEventReader { private static Logger logger = Logger.getLogger(OM4XmlEventReader.class); public OM4XmlEventReader(InputStream xmlStream) { super(); this.xmlStream = xmlStream; } InputStream xmlStream; public int numEntities; public List<OmXmlEntity> entities; private int entCnt = 0; public int numRelations; public List<OmXmlRelation> relations; private int relCnt = 0; /** * Simple class holding the representation of an OpenMind Attribute from XML. * * @author casties */ public class OmXmlAttribute { public Map<String, String> xmlAtts; public String value; public String getId() { return xmlAtts.get("id"); } } /** * Simple class holding the representation of an OpenMind Entity from XML. * * @author casties */ public class OmXmlEntity { public Map<String, String> xmlAtts; public String value; public List<OmXmlAttribute> attributes; public String getId() { return xmlAtts.get("id"); } } /** * Simple class holding the representation of an OpenMind Relation from XML. * * @author casties */ public class OmXmlRelation { public Map<String, String> xmlAtts; public String value; public List<OmXmlAttribute> attributes; public String getId() { return xmlAtts.get("id"); } } /** * Reads the XML from xmlStream and populates entities and relations. * * @throws XMLStreamException */ public void read() throws XMLStreamException { XMLInputFactory inputFactory = XMLInputFactory.newInstance(); XMLEventReader reader = inputFactory.createXMLEventReader(xmlStream, "UTF-8"); try { while (reader.hasNext()) { XMLEvent e = reader.nextEvent(); if (e.isStartDocument()) { continue; } else if (e.isStartElement()) { StartElement es = e.asStartElement(); String lname = es.getName().getLocalPart(); if (lname == XMLUtil.ENTITIES) { entities = processEntities(es, reader); } else if (lname == XMLUtil.RELATIONS) { relations = processRelations(es, reader); } } } } finally { reader.close(); } } /** * Process the entities tag and its contents. * * @param elem * @param reader * @return * @throws XMLStreamException */ private List<OmXmlEntity> processEntities(StartElement elem, XMLEventReader reader) throws XMLStreamException { logger.debug("loading entities..."); // get number attribute Attribute numa = elem.getAttributeByName(new QName("number")); if (numa != null) { numEntities = Integer.parseInt(numa.getValue()); } // start reading sub-elements List<OmXmlEntity> entities = new ArrayList<OmXmlEntity>(); while (reader.hasNext()) { XMLEvent e = reader.nextEvent(); if (e.isStartElement()) { // start of next element StartElement es = e.asStartElement(); String lname = es.getName().getLocalPart(); if (lname == XMLUtil.ENTITY) { // process entity tag entities.add(processEntity(es, reader)); } } else if (e.isEndElement()) { EndElement ee = e.asEndElement(); if (ee.getName().getLocalPart().equals(XMLUtil.ENTITIES)) { // end of this element break; } else { logger.warn("Unexpected EndElement: "+ee); } } } return entities; } /** * Process the entity tag and its contents. * * @param elem * @param reader * @return * @throws XMLStreamException */ private OmXmlEntity processEntity(StartElement elem, XMLEventReader reader) throws XMLStreamException { //logger.debug("entity"); OmXmlEntity ent = new OmXmlEntity(); Map<String, String> xmlAtts = new HashMap<String, String>(); @SuppressWarnings("unchecked") Iterator<Attribute> atts = elem.getAttributes(); while (atts.hasNext()) { Attribute att = atts.next(); xmlAtts.put(att.getName().getLocalPart(), att.getValue()); } ent.xmlAtts = xmlAtts; // start reading sub-elements ent.attributes = new ArrayList<OmXmlAttribute>(); while (reader.hasNext()) { XMLEvent e = reader.nextEvent(); if (e.isStartElement()) { // start of next element StartElement es = e.asStartElement(); String lname = es.getName().getLocalPart(); if (lname == XMLUtil.ATTRIBUTES) { // ignore attributes tag continue; } if (lname == XMLUtil.ATTRIBUTE) { // process attribute tag ent.attributes.add(processAttribute(es, reader)); } } else if (e.isCharacters()) { // text content Characters ec = e.asCharacters(); if (ent.value == null) { ent.value = ec.getData(); } else { ent.value += ec.getData(); } } else if (e.isEndElement()) { EndElement ee = e.asEndElement(); if (ee.getName().getLocalPart().equals(XMLUtil.ENTITY)) { // end of this element break; } } } if (++entCnt % 500 == 0) { logger.debug(""+entCnt+" entities read..."); } return ent; } /** * Process the relations tag and its contents. * * @param elem * @param reader * @return * @throws XMLStreamException */ private List<OmXmlRelation> processRelations(StartElement elem, XMLEventReader reader) throws XMLStreamException { logger.debug("loading relations..."); // get number attribute Attribute numa = elem.getAttributeByName(new QName("number")); if (numa != null) { numRelations = Integer.parseInt(numa.getValue()); } // start reading sub-elements List<OmXmlRelation> rels = new ArrayList<OmXmlRelation>(); while (reader.hasNext()) { XMLEvent e = reader.nextEvent(); if (e.isStartElement()) { // start of next element StartElement es = e.asStartElement(); String lname = es.getName().getLocalPart(); if (lname == XMLUtil.RELATION) { // process entity tag rels.add(processRelation(es, reader)); } } else if (e.isEndElement()) { EndElement ee = e.asEndElement(); if (ee.getName().getLocalPart().equals(XMLUtil.RELATIONS)) { // end of this element break; } else { logger.warn("Unexpected EndElement: "+ee); } } } return rels; } /** * Process the relation tag and its contents. * * @param elem * @param reader * @return * @throws XMLStreamException */ private OmXmlRelation processRelation(StartElement elem, XMLEventReader reader) throws XMLStreamException { //logger.debug("relation"); OmXmlRelation rel = new OmXmlRelation(); Map<String, String> xmlAtts = new HashMap<String, String>(); @SuppressWarnings("unchecked") Iterator<Attribute> atts = elem.getAttributes(); while (atts.hasNext()) { Attribute att = atts.next(); xmlAtts.put(att.getName().getLocalPart(), att.getValue()); } rel.xmlAtts = xmlAtts; // start reading sub-elements rel.attributes = new ArrayList<OmXmlAttribute>(); while (reader.hasNext()) { XMLEvent e = reader.nextEvent(); if (e.isStartElement()) { // start of next element StartElement es = e.asStartElement(); String lname = es.getName().getLocalPart(); if (lname == XMLUtil.ATTRIBUTES) { // ignore attributes tag continue; } if (lname == XMLUtil.ATTRIBUTE) { // process attribute tag rel.attributes.add(processAttribute(es, reader)); } } else if (e.isCharacters()) { // text content Characters ec = e.asCharacters(); if (rel.value == null) { rel.value = ec.getData(); } else { rel.value += ec.getData(); } } else if (e.isEndElement()) { EndElement ee = e.asEndElement(); if (ee.getName().getLocalPart().equals(XMLUtil.RELATION)) { // end of this element break; } } } if (++relCnt % 100 == 0) { logger.debug(""+relCnt+" relations read..."); } return rel; } /** * Process the attribute tag and its contents. * * @param elem * @param reader * @return * @throws XMLStreamException */ private OmXmlAttribute processAttribute(StartElement elem, XMLEventReader reader) throws XMLStreamException { //logger.debug("attribute"); OmXmlAttribute oma = new OmXmlAttribute(); Map<String, String> xmlAtts = new HashMap<String, String>(); @SuppressWarnings("unchecked") Iterator<Attribute> atts = elem.getAttributes(); while (atts.hasNext()) { Attribute att = atts.next(); xmlAtts.put(att.getName().getLocalPart(), att.getValue()); } oma.xmlAtts = xmlAtts; // start reading sub-elements while (reader.hasNext()) { XMLEvent e = reader.nextEvent(); if (e.isCharacters()) { // text content Characters ec = e.asCharacters(); if (oma.value == null) { oma.value = ec.getData(); } else { oma.value += ec.getData(); } } else if (e.isEndElement()) { EndElement ee = e.asEndElement(); if (ee.getName().getLocalPart().equals(XMLUtil.ATTRIBUTE)) { // end of this element break; } else { logger.warn("Unexpected EndElement: "+ee); } } } return oma; } }