view src/main/java/org/mpi/openmind/repository/utils/OM4StreamWriter.java @ 106:93c7dbfaf062

add bibid tag to xml export of endnote-id attributes.
author Robert Casties <casties@mpiwg-berlin.mpg.de>
date Fri, 26 Apr 2019 18:12:23 +0200
parents 1149eb948036
children 484be3266e54
line wrap: on
line source

package org.mpi.openmind.repository.utils;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;
import org.json.JSONException;
import org.json.JSONObject;
import org.mpi.openmind.repository.bo.Attribute;
import org.mpi.openmind.repository.bo.Entity;
import org.mpi.openmind.repository.bo.Node;
import org.mpi.openmind.repository.bo.Relation;
import org.mpi.openmind.repository.services.PersistenceService;

/**
 * Export all entities and relations and definitions to XML.
 * 
 * Saves (content) entities and relations (i.e. assertions) and definitions 
 * (i.e. definition entities and relations) in separate files.
 * 
 * @author jurzua, casties
 *
 */
public class OM4StreamWriter {

    /** Value of the "version" XML attribute written on the root element of every export file. */
    protected static final String FORMAT_VERSION = "4.10";

    /** Logger used for progress (debug), completion (info) and error messages of the export. */
    private static Logger logger = Logger.getLogger(OM4StreamWriter.class);

    /** Number of entities requested from the PersistenceService per page while exporting. */
    private static final int itemsPerPage = 500;
    
    /**
     * Include normalized own-values.
     * 
     * Read by backupEntities() when exporting content entities; 
     * backupDefinitions() always exports without normalized values.
     */
    public static boolean includeNormalizations = true;
    
    /** Key for the entity count in the attribute counts map (kept next to the attribute and relation name keys). */
    private static final String ENT_KEY = "<entity-count>";

    /** Formatter for the content of the isodate tag (ISO date without time). */
    public static DateTimeFormatter dateFormatter = ISODateTimeFormat.date();
    
    /** Pattern for the bibid in an endnote-id attribute: a "#" followed by digits; group 1 holds the digits. */
    public static final Pattern bibidPattern = Pattern.compile("#(\\d+)");
    
    /**
     * Null-safe conversion of an arbitrary object to a String.
     * 
     * @param s the object to convert, may be null
     * @return s.toString(), or the literal text "null" if s is null
     */
    private static String defaultString(Object s) {
        // String.valueOf(Object) yields "null" for a null reference and toString() otherwise
        return String.valueOf(s);
    }
    
    
    /**
     * Saves all content Entities with their Attributes and Relations in a XML file with the given fileName.
     * 
     * Delegates to writeEntsAndRels() with type Node.TYPE_ABOX. Normalized values are 
     * included depending on the current value of the static field includeNormalizations.
     * I/O and XML stream errors are logged by writeEntsAndRels() and not propagated to the caller.
     * 
     * @param fileName path of the XML file to write
     * @param ps persistence service used to read the entities
     */
    public static void backupEntities(String fileName, PersistenceService ps) {
        writeEntsAndRels(fileName, ps, Node.TYPE_ABOX, includeNormalizations);
    }

    /**
     * Saves all definitions in a XML file with the given fileName.
     * 
     * Delegates to writeEntsAndRels() with type Node.TYPE_TBOX; normalized values are never included.
     * I/O and XML stream errors are logged by writeEntsAndRels() and not propagated to the caller.
     * 
     * @param fileName path of the XML file to write
     * @param ps persistence service used to read the definitions
     */
    public static void backupDefinitions(String fileName, PersistenceService ps) {
        writeEntsAndRels(fileName, ps, Node.TYPE_TBOX, false);
    }

    /**
     * Writes all entities of the given type and their relations to the XML file at fileName.
     * 
     * Type is either TYPE_TBOX or TYPE_ABOX.
     * 
     * The document contains, in this order: the entities (read page-wise from the 
     * PersistenceService), the source relations collected from these entities, the 
     * entity statistics and the relation statistics. All source relations are kept in 
     * memory until the entities have been written.
     * 
     * IOExceptions and XMLStreamExceptions are logged and not propagated. The output 
     * file is closed in every case, also when the export fails.
     * 
     * @param fileName path of the XML file to write
     * @param ps persistence service used to count and load the entities
     * @param type Node.TYPE_ABOX for content entities or Node.TYPE_TBOX for definitions
     * @param includeNorm if true, normalized values are written in addition to the plain values
     */
    private static void writeEntsAndRels(String fileName, PersistenceService ps, String type, boolean includeNorm) {
        FileOutputStream fileOut = null;
        OutputStreamWriter out = null;
        XMLStreamWriter writer = null;
        try {
            // statistics collection Maps
            Map<String, Map<String, Long>> entStats = new HashMap<String, Map<String, Long>>();
            Map<String, Map<String, Long>> relStats = new HashMap<String, Map<String, Long>>();
            
            // setup xml writer
            fileOut = new FileOutputStream(fileName);
            out = new OutputStreamWriter(fileOut, "UTF-8");
            XMLOutputFactory factory = XMLOutputFactory.newInstance();
            writer = factory.createXMLStreamWriter(out);

            int entitiesCount = 0;

            writer.writeStartDocument("UTF-8", "1.0");

            if (type.equals(Node.TYPE_ABOX)) {
                writer.writeStartElement(XMLUtil.OPENMIND_DATA);
                writer.writeAttribute("version", FORMAT_VERSION);
                // get number of content Entities
                entitiesCount = ps.getEntityCount(null).intValue();
            } else {
                writer.writeStartElement(XMLUtil.META_DATA);
                writer.writeAttribute("version", FORMAT_VERSION);
                // get number of definition Entities
                entitiesCount = ps.getEntityCount(Node.TYPE_TBOX).intValue();
            }

            // integer division: the page loop below runs inclusive of this value 
            // so that the last (partial) page is read as well
            int numberOfPages = entitiesCount / itemsPerPage;
            // debug: int numberOfPages = 1;
            int counter = 0;
            long start = System.currentTimeMillis();
            DecimalFormat df = new DecimalFormat("#.##");

            // list of Relations (filled from Entities)
            List<Relation> relList = new ArrayList<Relation>();

            /*
             * write entities
             */
            writer.writeStartElement((type.equals(Node.TYPE_TBOX)) ? XMLUtil.DEFINITIONS : XMLUtil.ENTITIES);
            writer.writeAttribute("count", Integer.toString(entitiesCount));
            // iterate database by pages
            for (int currentPage = 0; currentPage <= numberOfPages; currentPage++) {
                int startRecord = currentPage * itemsPerPage;
                List<Entity> entities;

                if (type.equals(Node.TYPE_ABOX)) {
                    // get page of content Entities
                    entities = ps.getEntityPage(null, startRecord, itemsPerPage);
                } else {
                    // get page of definition Entities
                    entities = ps.getEntityPage(Node.TYPE_TBOX, startRecord, itemsPerPage);
                }

                // iterate entities
                for (Entity ent : entities) {
                    // write entity to XML
                    writeEntity(ent, writer, ps, includeNorm, entStats);
                    // add (source)relations to list
                    List<Relation> rels = ent.getSourceRelations();
                    relList.addAll(rels);
                    // update stats for relations
                    // (the stats entry for this object class has been created by writeEntity)
                    Map<String, Long> entRelStats = entStats.get(ent.getObjectClass());
                    for (Relation rel: rels) {
                        // update source relations
                        updateRelStats(rel, true, entRelStats);
                    }
                    for (Relation rel: ent.getTargetRelations()) {
                        // update target relations
                        updateRelStats(rel, false, entRelStats);
                    }
                    // count entities
                    counter++;
                }
                
                long runtime = System.currentTimeMillis() - start;
                double percent = ((double) counter / (double) entitiesCount) * 100.0;
                logger.debug("(" + df.format(percent) + "%) \t[" + counter + "/" + entitiesCount + "]\t");
                logger.debug("Speed[ents/s]: " + df.format((double) counter / ((double) runtime / 1000)));
                writer.flush();
            }
            writer.writeEndElement();

            /*
             * write relations (from list)
             */
            writer.writeStartElement(XMLUtil.RELATIONS);
            writer.writeAttribute("count", Integer.toString(relList.size()));
            for (Relation rel : relList) {
                writeRelation(rel, writer, includeNorm, relStats);
            }
            writer.writeEndElement();

            /*
             * write statistics
             */
            // entity stats
            writeStats(XMLUtil.ENTITY_STATS, XMLUtil.ENTITY, entStats, writer);
            // relation stats
            writeStats(XMLUtil.RELATION_STATS, XMLUtil.RELATION, relStats, writer);
            
            // end file.
            writer.writeEndElement();

            writer.flush();

            logger.info("END Stream Writer");
        } catch (IOException e) {
            // pass the exception as Throwable so that the stack trace is logged
            logger.error("Error writing XML export to " + fileName, e);
        } catch (XMLStreamException e) {
            logger.error("Error writing XML export to " + fileName, e);
        } finally {
            // release the XML writer
            if (writer != null) {
                try {
                    writer.close();
                } catch (XMLStreamException e) {
                    logger.error("Error closing XML writer for " + fileName, e);
                }
            }
            // XMLStreamWriter.close() does not close the underlying stream,
            // so the file has to be closed explicitly
            try {
                if (out != null) {
                    out.close();
                } else if (fileOut != null) {
                    fileOut.close();
                }
            } catch (IOException e) {
                logger.error("Error closing file " + fileName, e);
            }
        }
    }

    /**
     * Serializes one OpenMind relation as a relation element.
     * 
     * The element carries the relation's metadata as XML attributes, an optional 
     * attributes element with the relation's OpenMind attributes and the own value 
     * as text content.
     * 
     * @param rel the relation to write
     * @param writer target XML writer
     * @param includeNorm passed on to writeAttribute() for the relation's attributes
     * @param relStats statistics map to update, or null to write without collecting statistics
     * @throws XMLStreamException if the writer fails
     */
    private static void writeRelation(Relation rel, XMLStreamWriter writer, boolean includeNorm, 
            Map<String, Map<String, Long>> relStats) throws XMLStreamException {
        writer.writeStartElement(XMLUtil.RELATION);

        // statistics are optional: without a stats map nothing is counted
        final Map<String, Long> attCounts = (relStats == null) ? null : updateNodeStats(rel, relStats);

        // --- metadata as XML attributes ---
        writer.writeAttribute(XMLUtil.OBJECT_CLASS, defaultString(rel.getObjectClass()));
        writer.writeAttribute(XMLUtil.ID, defaultString(rel.getId()));
        writer.writeAttribute(XMLUtil.ROW_ID, defaultString(rel.getRowId()));
        final String contentType = rel.getContentType();
        if (StringUtils.isNotEmpty(contentType)) {
            writer.writeAttribute(XMLUtil.CONTENT_TYPE, contentType);
        }
        writer.writeAttribute(XMLUtil.RELATION_SOURCE_ID, defaultString(rel.getSourceId()));
        writer.writeAttribute(XMLUtil.RELATION_SOURCE, defaultString(rel.getSourceObjectClass()));
        writer.writeAttribute(XMLUtil.RELATION_TARGET_ID, defaultString(rel.getTargetId()));
        writer.writeAttribute(XMLUtil.RELATION_TARGET, defaultString(rel.getTargetObjectClass()));
        writer.writeAttribute(XMLUtil.VERSION, defaultString(rel.getVersion()));
        writer.writeAttribute(XMLUtil.MODIFICATION_TIME, defaultString(rel.getModificationTime()));
        final String user = rel.getUser();
        if (user != null) {
            writer.writeAttribute(XMLUtil.USER, user);
        }
        // the public flag is only written when it is set
        if (rel.getIsPublic()) {
            writer.writeAttribute(XMLUtil.PUBLIC, "true");
        }

        // --- OpenMind attributes of the relation as child elements ---
        if (rel.getAttributes().size() > 0) {
            writer.writeStartElement(XMLUtil.ATTRIBUTES);
            for (Attribute relAtt : rel.getAttributes()) {
                // count first (if requested), then write
                if (attCounts != null) {
                    updateAttStats(relAtt, attCounts);
                }
                writeAttribute(relAtt, writer, includeNorm);
            }
            writer.writeEndElement();
        }

        // --- own value as text content ---
        final String ownValue = rel.getOwnValue();
        if (StringUtils.isNotEmpty(ownValue)) {
            writer.writeCharacters(ownValue);
        }

        writer.writeEndElement();
    }

    /**
     * Serializes one OpenMind entity as a definition or entity element.
     * 
     * The element carries the entity's metadata as XML attributes, followed by optional 
     * child elements for its OpenMind attributes and its outgoing (source) relations, 
     * the own value as text content and optionally the normalized own value.
     * Lightweight entities are loaded completely through the PersistenceService first. 
     * 
     * @param entity the entity to write
     * @param writer target XML writer
     * @param ps persistence service used to load the content of lightweight entities
     * @param includeNorm if true, normalized values that differ from the plain value are written
     * @param entStats statistics map that is updated with this entity and its attributes
     * @throws XMLStreamException if the writer fails
     */
    private static void writeEntity(Entity entity, XMLStreamWriter writer, PersistenceService ps, boolean includeNorm, 
            Map<String, Map<String, Long>> entStats)
            throws XMLStreamException {

        // definitions and content entities use different element names
        final String elementName;
        if (entity.getType().equals(Node.TYPE_TBOX)) {
            elementName = XMLUtil.DEFINITION;
        } else {
            elementName = XMLUtil.ENTITY;
        }
        writer.writeStartElement(elementName);

        // a lightweight entity has to be loaded with all attributes and relations
        final Entity full = entity.isLightweight() ? ps.getEntityContent(entity) : entity;

        // count this entity; the returned map collects its attribute counts
        final Map<String, Long> attCounts = updateNodeStats(full, entStats);

        // --- metadata as XML attributes ---
        writer.writeAttribute(XMLUtil.OBJECT_CLASS, defaultString(full.getObjectClass()));
        writer.writeAttribute(XMLUtil.ID, defaultString(full.getId()));
        writer.writeAttribute(XMLUtil.ROW_ID, defaultString(full.getRowId()));
        final String contentType = full.getContentType();
        if (StringUtils.isNotEmpty(contentType)) {
            writer.writeAttribute(XMLUtil.CONTENT_TYPE, contentType);
        }
        writer.writeAttribute(XMLUtil.VERSION, defaultString(full.getVersion()));
        writer.writeAttribute(XMLUtil.MODIFICATION_TIME, defaultString(full.getModificationTime()));
        final String user = full.getUser();
        if (user != null) {
            writer.writeAttribute(XMLUtil.USER, user);
        }
        // the public flag is only written when it is set
        if (full.getIsPublic()) {
            writer.writeAttribute(XMLUtil.PUBLIC, "true");
        }

        // --- OpenMind attributes of the entity as child elements ---
        if (full.getAttributes().size() > 0) {
            writer.writeStartElement(XMLUtil.ATTRIBUTES);
            for (Attribute entAtt : full.getAttributes()) {
                updateAttStats(entAtt, attCounts);
                writeAttribute(entAtt, writer, includeNorm);
            }
            writer.writeEndElement();
        }

        // --- outgoing relations as child elements (statistics are not collected here) ---
        final List<Relation> outgoing = full.getSourceRelations();
        if (!outgoing.isEmpty()) {
            writer.writeStartElement(XMLUtil.RELATIONS);
            for (Relation outRel : outgoing) {
                writeRelation(outRel, writer, includeNorm, null);
            }
            writer.writeEndElement();
        }

        // --- own value as text content, normalized value as child element ---
        final String ownValue = full.getOwnValue();
        if (StringUtils.isNotEmpty(ownValue)) {
            writer.writeCharacters(ownValue);
            final String normValue = full.getNormalizedOwnValue();
            // the normalized value is only written if it adds information
            final boolean writeNorm = includeNorm && StringUtils.isNotEmpty(normValue) 
                    && !ownValue.equals(normValue);
            if (writeNorm) {
                writer.writeStartElement(XMLUtil.NORMALIZED);
                writer.writeCharacters(normValue);
                writer.writeEndElement();
            }
        }

        writer.writeEndElement();
    }


	/**
	 * Write OpenMind attribute to XML.
	 * 
	 * The attribute element carries the attribute's metadata as XML attributes and the 
	 * value as text content. If the value is not empty, additional child elements may follow:
	 * the normalized value (if includeNorm is set and it differs from the value), 
	 * a bibid element (for "endnote-id" attributes whose value matches bibidPattern) or 
	 * an isodate element (for JSON values with a "date" or "from" object).
	 * 
	 * Values that look like JSON but can not be parsed, or that do not contain a valid 
	 * date, are written without an isodate element.
	 * 
	 * @param att the attribute to write
	 * @param writer target XML writer
	 * @param includeNorm if true, a normalized value that differs from the value is written
	 * @throws XMLStreamException if the writer fails
	 */
	private static void writeAttribute(Attribute att, XMLStreamWriter writer, boolean includeNorm) throws XMLStreamException {
		writer.writeStartElement(XMLUtil.ATTRIBUTE);

		String name = att.getName();
		/*
		 * write XML attributes
		 */
		writer.writeAttribute(XMLUtil.ATTRIBUTE_NAME, defaultString(name));
		writer.writeAttribute(XMLUtil.ID, defaultString(att.getId()));
		writer.writeAttribute(XMLUtil.ROW_ID, defaultString(att.getRowId()));
		writer.writeAttribute(XMLUtil.CONTENT_TYPE, defaultString(att.getContentType()));
		writer.writeAttribute(XMLUtil.VERSION, defaultString(att.getVersion()));
		writer.writeAttribute(XMLUtil.MODIFICATION_TIME, defaultString(att.getModificationTime()));
		if (att.getUser() != null) {
			writer.writeAttribute(XMLUtil.USER, att.getUser());
		}
		if (att.getIsPublic()) {
			writer.writeAttribute(XMLUtil.PUBLIC, "true");
		}

		/*
		 * write value as content
		 */
		String ov = att.getValue();
		if (StringUtils.isNotEmpty(ov)) {
			writer.writeCharacters(ov);
			String nov = att.getNormalizedOwnValue();
			if (includeNorm && StringUtils.isNotEmpty(nov) && !ov.equals(nov)) {
				// write normalized value
				writer.writeStartElement(XMLUtil.NORMALIZED);
				writer.writeCharacters(nov);
				writer.writeEndElement();
			}
			boolean processed = false;
			// convert endnote-id into additional bibid element
			// (constant-first equals because the name may be null)
			if ("endnote-id".equals(name)) {
				Matcher bibidMatch = bibidPattern.matcher(ov);
				if (bibidMatch.find()) {
					String bibid = bibidMatch.group(1);
					writer.writeStartElement(XMLUtil.BIBID);
					writer.writeCharacters(bibid);
					writer.writeEndElement();
					processed = true;
				}
			}
			// convert any date JSON into additional isodate element
			if (!processed && ov.startsWith("{")) {
				// the date string is built completely before any XML is written 
				// so that a failure can not leave a half-written element behind
				String isodate = null;
				try {
					JSONObject json = new JSONObject(ov);
					JSONObject date = null;
					if (json.has("date")) {
						date = json.getJSONObject("date");
					} else if (json.has("from")) {
						date = json.getJSONObject("from");
					}
					if (date != null) {
						int year = date.getInt("year");
						int month = date.getInt("month");
						int day = date.getInt("dayOfMonth");
						DateTime dt = new DateTime(year, month, day, 0, 0);
						isodate = dateFormatter.print(dt);
					}
				} catch (JSONException e) {
					// maybe not JSON...
				} catch (IllegalArgumentException e) {
					// year, month and day do not form a valid date: 
					// skip the isodate element instead of aborting the export
					logger.warn("Invalid date in attribute " + defaultString(name) + ": " + ov);
				}
				if (isodate != null) {
					writer.writeStartElement(XMLUtil.ISODATE);
					writer.writeCharacters(isodate);
					writer.writeEndElement();
				}
			}
		}

		writer.writeEndElement();
	}

	
    /**
     * Write the collected statistics to XML.
     * 
     * Writes one statsTag element that contains one entryTag element per object class 
     * with the number of nodes of this class. Every entry contains one child element per 
     * counted name: a relation element if the name contains "[" (the form of the keys 
     * created by updateRelStats()) and an attribute element otherwise.
     * 
     * A null object class or attribute name is written as "null".
     * 
     * @param statsTag name of the enclosing statistics element
     * @param entryTag name of the element written per object class
     * @param nodeStats counts per object class; every inner map holds the node count 
     *        under ENT_KEY next to the counts per attribute or relation name
     * @param writer target XML writer
     * @throws XMLStreamException if the writer fails
     */
    private static void writeStats(String statsTag, String entryTag, Map<String, Map<String, Long>> nodeStats,
            XMLStreamWriter writer) throws XMLStreamException {
        // write stats tag
        writer.writeStartElement(statsTag);

        for (Map.Entry<String, Map<String, Long>> nodeEntry : nodeStats.entrySet()) {
            String nodeType = nodeEntry.getKey();
            Map<String, Long> attStats = nodeEntry.getValue();
            Long nodeCnt = attStats.get(ENT_KEY);
            // write tag for entity/attribute
            writer.writeStartElement(entryTag);
            writer.writeAttribute(XMLUtil.OBJECT_CLASS, defaultString(nodeType));
            writer.writeAttribute(XMLUtil.COUNT, nodeCnt.toString());

            // write attributes
            for (Map.Entry<String, Long> attEntry : attStats.entrySet()) {
                // the name may be null because attribute names are counted unchecked
                String attName = attEntry.getKey();
                // skip ENT_KEY
                if (ENT_KEY.equals(attName)) {
                    continue;
                }
                if (attName != null && attName.contains("[")) {
                    // write relation tag
                    writer.writeStartElement(XMLUtil.RELATION);
                } else {
                    // write attribute tag
                    writer.writeStartElement(XMLUtil.ATTRIBUTE);
                }
                writer.writeAttribute(XMLUtil.ATTRIBUTE_NAME, defaultString(attName));
                writer.writeAttribute(XMLUtil.COUNT, attEntry.getValue().toString());
                writer.writeEndElement();
            }
            // end of entity/attribute tag
            writer.writeEndElement();
        }
        // end of stats tag
        writer.writeEndElement();
    }	
	
	/**
	 * Counts one more node of the given node's object class.
	 * 
	 * The count is kept under ENT_KEY in the map belonging to the object class. 
	 * A new map starting at 1 is registered in entStats if the class has not been seen before.
	 * 
	 * @param ent the node (entity or relation) to count
	 * @param entStats statistics maps by object class; modified by this method
	 * @return the statistics map of the node's object class
	 */
	protected static Map<String, Long> updateNodeStats(Node ent, Map<String, Map<String, Long>> entStats) {
		final String nodeClass = ent.getObjectClass();
		Map<String, Long> counts = entStats.get(nodeClass);
		if (counts != null) {
			// known class: one more node
			counts.put(ENT_KEY, counts.get(ENT_KEY) + 1);
			return counts;
		}
		// first node of this class: register a fresh map with a count of one
		counts = new HashMap<String, Long>();
		counts.put(ENT_KEY, 1L);
		entStats.put(nodeClass, counts);
		return counts;
	}

    /**
     * Counts one more occurrence of the given attribute's name.
     * 
     * @param att the attribute whose name is counted
     * @param attStats counts by attribute name; modified by this method
     */
    protected static void updateAttStats(Attribute att, Map<String, Long> attStats) {
        final String key = att.getName();
        final Long seen = attStats.get(key);
        // a name without an entry starts at one
        attStats.put(key, (seen == null) ? 1L : seen + 1);
    }

    /**
     * Counts one more occurrence of a relation seen from one of its ends.
     * 
     * The counts share the map with the attribute counts. The key is the relation's 
     * object class with the object class of the entity at the other end in square brackets:
     * "relation[target-class]" for a source relation and "[source-class]relation" 
     * for a target relation. 
     * 
     * @param rel the relation to count
     * @param isSrcRel true if the relation is counted as source (outgoing) relation, 
     *        false if it is counted as target (incoming) relation
     * @param relStats counts by name; modified by this method
     */
    protected static void updateRelStats(Relation rel, boolean isSrcRel, Map<String, Long> relStats) {
        final String relClass = rel.getObjectClass();
        final String key = isSrcRel 
                ? relClass + "[" + rel.getTargetObjectClass() + "]"
                : "[" + rel.getSourceObjectClass() + "]" + relClass;
        final Long seen = relStats.get(key);
        // a key without an entry starts at one
        relStats.put(key, (seen == null) ? 1L : seen + 1);
    }



}