view src/main/java/org/mpi/openmind/repository/utils/OM4XmlEventReader.java @ 32:9c54842f5e86

better names for XML importer sub-classes.
author casties
date Thu, 25 Aug 2016 11:29:47 +0200
parents 7d8ebe8ac8a2
children 90f9a1c45b15
line wrap: on
line source

/**
 * 
 */
package org.mpi.openmind.repository.utils;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

import org.apache.log4j.Logger;

/**
 * Class that reads an OM4 XML dump into lists of simple objects.
 * 
 * The constructor takes an InputStream.
 * 
 * The read() method reads the contents of the file into the members
 * .entities and .relations.
 * 
 * The contents are Lists of OmXmlEntity and OmXmlRelation objects, each
 * holding a List of OmXmlAttribute objects.
 * 
 * This implementation uses XMLEventReader.
 * 
 * @author casties
 *
 */
public class OM4XmlEventReader {

    private static Logger logger = Logger.getLogger(OM4XmlEventReader.class);
    
    /**
     * Creates a reader for the given stream.
     * 
     * Nothing is read from the stream until read() is called.
     * 
     * @param xmlStream stream delivering the XML document
     */
    public OM4XmlEventReader(InputStream xmlStream) {
        super();
        this.xmlStream = xmlStream;
    }

    /** Stream the XML document is read from by read(). */
    InputStream xmlStream;
    
    /** Value of the "number" attribute of the entities element (stays 0 if the attribute is absent). */
    public int numEntities;
    /** Entities collected by read(); null until an entities element has been processed. */
    public List<OmXmlEntity> entities;
    /** Number of entity elements processed so far (used for progress logging). */
    private int entCnt = 0;

    /** Value of the "number" attribute of the relations element (stays 0 if the attribute is absent). */
    public int numRelations;
    /** Relations collected by read(); null until a relations element has been processed. */
    public List<OmXmlRelation> relations;
    /** Number of relation elements processed so far (used for progress logging). */
    private int relCnt = 0;

    /**
     * Simple class holding the representation of an OpenMind Attribute from XML.
     * 
     * xmlAtts maps the local names of the XML attributes of the tag to their
     * values, value is the concatenated text content of the tag (null if
     * there was no text).
     *  
     * @author casties
     */
    public class OmXmlAttribute {
        public Map<String, String> xmlAtts;
        public String value;
        
        /**
         * @return the value of the XML attribute "id" or null if there is none
         */
        public String getId() {
            return xmlAtts.get("id");
        }
    }

    /**
     * Simple class holding the representation of an OpenMind Entity from XML.
     * 
     * xmlAtts maps the local names of the XML attributes of the tag to their
     * values, value is the concatenated text content found directly inside
     * the tag (null if there was no text) and attributes holds the attribute
     * tags found inside the tag.
     *  
     * @author casties
     */
    public class OmXmlEntity {
        public Map<String, String> xmlAtts;
        public String value;
        public List<OmXmlAttribute> attributes;

        /**
         * @return the value of the XML attribute "id" or null if there is none
         */
        public String getId() {
            return xmlAtts.get("id");
        }
    }

    /**
     * Simple class holding the representation of an OpenMind Relation from XML.
     * 
     * xmlAtts maps the local names of the XML attributes of the tag to their
     * values, value is the concatenated text content found directly inside
     * the tag (null if there was no text) and attributes holds the attribute
     * tags found inside the tag.
     *  
     * @author casties
     */
    public class OmXmlRelation {
        public Map<String, String> xmlAtts;
        public String value;
        public List<OmXmlAttribute> attributes;

        /**
         * @return the value of the XML attribute "id" or null if there is none
         */
        public String getId() {
            return xmlAtts.get("id");
        }
    }

    /**
     * Reads the XML from xmlStream and populates entities and relations.
     * 
     * The event reader is closed when reading ends, whether it ends normally
     * or with an exception.
     * 
     * @throws XMLStreamException if the XML can not be parsed
     */
    public void read() throws XMLStreamException {
        XMLInputFactory inputFactory = XMLInputFactory.newInstance();
        // never resolve external entities referenced by the document (XXE)
        inputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        XMLEventReader reader = inputFactory.createXMLEventReader(xmlStream, "UTF-8");
        try {
            while (reader.hasNext()) {
                XMLEvent e = reader.nextEvent();
                if (!e.isStartElement()) {
                    // only start tags are interesting on the top level
                    continue;
                }
                StartElement es = e.asStartElement();
                String lname = es.getName().getLocalPart();
                // compare names by content, parsers need not return interned Strings
                if (lname.equals(XMLUtil.ENTITIES)) {
                    entities = processEntities(es, reader);
                } else if (lname.equals(XMLUtil.RELATIONS)) {
                    relations = processRelations(es, reader);
                }
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Copies the XML attributes of a start tag into a Map.
     * 
     * @param elem the start tag
     * @return new Map from attribute local name to attribute value
     */
    private static Map<String, String> readXmlAttributes(StartElement elem) {
        Map<String, String> xmlAtts = new HashMap<String, String>();
        @SuppressWarnings("unchecked")
        Iterator<Attribute> atts = elem.getAttributes();
        while (atts.hasNext()) {
            Attribute att = atts.next();
            xmlAtts.put(att.getName().getLocalPart(), att.getValue());
        }
        return xmlAtts;
    }

    /**
     * Appends the text of a Characters event to the text collected so far.
     * 
     * @param current text collected so far or null if there is none yet
     * @param chars the Characters event
     * @return the text of chars if current is null, else current followed by the text of chars
     */
    private static String appendText(String current, Characters chars) {
        String data = chars.getData();
        if (current == null) {
            return data;
        }
        return current + data;
    }

    /**
     * Process the entities tag and its contents.
     * 
     * Sets numEntities if the tag has a "number" attribute and consumes
     * events up to and including the end of the entities tag.
     * 
     * @param elem the start tag of the entities element
     * @param reader the reader positioned after elem
     * @return the entities found inside the element
     * @throws XMLStreamException if the XML can not be parsed
     */
    private List<OmXmlEntity> processEntities(StartElement elem, XMLEventReader reader) throws XMLStreamException {
        logger.debug("loading entities...");
        // get number attribute
        Attribute numa = elem.getAttributeByName(new QName("number"));
        if (numa != null) {
            numEntities = Integer.parseInt(numa.getValue());
        }
        // start reading sub-elements
        List<OmXmlEntity> ents = new ArrayList<OmXmlEntity>();
        while (reader.hasNext()) {
            XMLEvent e = reader.nextEvent();
            if (e.isStartElement()) {
                // start of next element
                StartElement es = e.asStartElement();
                if (es.getName().getLocalPart().equals(XMLUtil.ENTITY)) {
                    // process entity tag (other tags are ignored)
                    ents.add(processEntity(es, reader));
                }
            } else if (e.isEndElement()) {
                EndElement ee = e.asEndElement();
                if (ee.getName().getLocalPart().equals(XMLUtil.ENTITIES)) {
                    // end of this element
                    break;
                } else {
                    logger.warn("Unexpected EndElement: "+ee);
                }
            }
        }
        return ents;
    }

    /**
     * Process the entity tag and its contents.
     * 
     * Consumes events up to and including the end of the entity tag.
     * 
     * @param elem the start tag of the entity element
     * @param reader the reader positioned after elem
     * @return the entity with its XML attributes, text content and attribute tags
     * @throws XMLStreamException if the XML can not be parsed
     */
    private OmXmlEntity processEntity(StartElement elem, XMLEventReader reader) throws XMLStreamException {
        OmXmlEntity ent = new OmXmlEntity();
        ent.xmlAtts = readXmlAttributes(elem);
        // start reading sub-elements
        ent.attributes = new ArrayList<OmXmlAttribute>();
        while (reader.hasNext()) {
            XMLEvent e = reader.nextEvent();
            if (e.isStartElement()) {
                // start of next element
                StartElement es = e.asStartElement();
                String lname = es.getName().getLocalPart();
                if (lname.equals(XMLUtil.ATTRIBUTES)) {
                    // ignore attributes tag
                    continue;
                }
                if (lname.equals(XMLUtil.ATTRIBUTE)) {
                    // process attribute tag
                    ent.attributes.add(processAttribute(es, reader));
                }
            } else if (e.isCharacters()) {
                // text content (may arrive in several events)
                ent.value = appendText(ent.value, e.asCharacters());
            } else if (e.isEndElement()) {
                EndElement ee = e.asEndElement();
                if (ee.getName().getLocalPart().equals(XMLUtil.ENTITY)) {
                    // end of this element
                    break;
                }
            }
        }
        if (++entCnt % 500 == 0) {
            logger.debug(""+entCnt+" entities read...");
        }
        return ent;
    }

    /**
     * Process the relations tag and its contents.
     * 
     * Sets numRelations if the tag has a "number" attribute and consumes
     * events up to and including the end of the relations tag.
     * 
     * @param elem the start tag of the relations element
     * @param reader the reader positioned after elem
     * @return the relations found inside the element
     * @throws XMLStreamException if the XML can not be parsed
     */
    private List<OmXmlRelation> processRelations(StartElement elem, XMLEventReader reader) throws XMLStreamException {
        logger.debug("loading relations...");
        // get number attribute
        Attribute numa = elem.getAttributeByName(new QName("number"));
        if (numa != null) {
            numRelations = Integer.parseInt(numa.getValue());
        }
        // start reading sub-elements
        List<OmXmlRelation> rels = new ArrayList<OmXmlRelation>();
        while (reader.hasNext()) {
            XMLEvent e = reader.nextEvent();
            if (e.isStartElement()) {
                // start of next element
                StartElement es = e.asStartElement();
                if (es.getName().getLocalPart().equals(XMLUtil.RELATION)) {
                    // process relation tag (other tags are ignored)
                    rels.add(processRelation(es, reader));
                }
            } else if (e.isEndElement()) {
                EndElement ee = e.asEndElement();
                if (ee.getName().getLocalPart().equals(XMLUtil.RELATIONS)) {
                    // end of this element
                    break;
                } else {
                    logger.warn("Unexpected EndElement: "+ee);
                }
            }
        }
        return rels;
    }


    /**
     * Process the relation tag and its contents.
     * 
     * Consumes events up to and including the end of the relation tag.
     * 
     * @param elem the start tag of the relation element
     * @param reader the reader positioned after elem
     * @return the relation with its XML attributes, text content and attribute tags
     * @throws XMLStreamException if the XML can not be parsed
     */
    private OmXmlRelation processRelation(StartElement elem, XMLEventReader reader) throws XMLStreamException {
        OmXmlRelation rel = new OmXmlRelation();
        rel.xmlAtts = readXmlAttributes(elem);
        // start reading sub-elements
        rel.attributes = new ArrayList<OmXmlAttribute>();
        while (reader.hasNext()) {
            XMLEvent e = reader.nextEvent();
            if (e.isStartElement()) {
                // start of next element
                StartElement es = e.asStartElement();
                String lname = es.getName().getLocalPart();
                if (lname.equals(XMLUtil.ATTRIBUTES)) {
                    // ignore attributes tag
                    continue;
                }
                if (lname.equals(XMLUtil.ATTRIBUTE)) {
                    // process attribute tag
                    rel.attributes.add(processAttribute(es, reader));
                }
            } else if (e.isCharacters()) {
                // text content (may arrive in several events)
                rel.value = appendText(rel.value, e.asCharacters());
            } else if (e.isEndElement()) {
                EndElement ee = e.asEndElement();
                if (ee.getName().getLocalPart().equals(XMLUtil.RELATION)) {
                    // end of this element
                    break;
                }
            }
        }
        if (++relCnt % 100 == 0) {
            logger.debug(""+relCnt+" relations read...");
        }
        return rel;
    }

    /**
     * Process the attribute tag and its contents.
     * 
     * Consumes events up to and including the end of the attribute tag.
     * Start tags inside the attribute tag are ignored, their end tags are
     * logged as unexpected.
     * 
     * @param elem the start tag of the attribute element
     * @param reader the reader positioned after elem
     * @return the attribute with its XML attributes and text content
     * @throws XMLStreamException if the XML can not be parsed
     */
    private OmXmlAttribute processAttribute(StartElement elem, XMLEventReader reader) throws XMLStreamException {
        OmXmlAttribute oma = new OmXmlAttribute();
        oma.xmlAtts = readXmlAttributes(elem);
        // start reading sub-elements
        while (reader.hasNext()) {
            XMLEvent e = reader.nextEvent();
            if (e.isCharacters()) {
                // text content (may arrive in several events)
                oma.value = appendText(oma.value, e.asCharacters());
            } else if (e.isEndElement()) {
                EndElement ee = e.asEndElement();
                if (ee.getName().getLocalPart().equals(XMLUtil.ATTRIBUTE)) {
                    // end of this element
                    break;
                } else {
                    logger.warn("Unexpected EndElement: "+ee);
                }
            }
        }
        return oma;
    }

}