Mercurial > hg > openmind
view src/main/java/org/mpi/openmind/repository/utils/OM4StreamWriter.java @ 120:3b0ce5e3302d
add Node status field to XML export.
author | Robert Casties <casties@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 08 Jan 2020 17:41:25 +0100 |
parents | 4eac7c57e593 |
children | 8d79021099a4 |
line wrap: on
line source
package org.mpi.openmind.repository.utils; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.text.DecimalFormat; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.stream.XMLOutputFactory; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamWriter; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; import org.joda.time.DateTime; import org.joda.time.format.DateTimeFormatter; import org.joda.time.format.ISODateTimeFormat; import org.json.JSONException; import org.json.JSONObject; import org.mpi.openmind.repository.bo.Attribute; import org.mpi.openmind.repository.bo.Entity; import org.mpi.openmind.repository.bo.Node; import org.mpi.openmind.repository.bo.Relation; import org.mpi.openmind.repository.services.PersistenceService; /** * Export all entities and relations and definitions to XML. * * Saves (content) entities and relations (i.e. assertions) and definitions * (i.e. definition entities and relations) in separate files. * * @author jurzua, casties * */ public class OM4StreamWriter { protected static final String FORMAT_VERSION = "4.12"; private static Logger logger = Logger.getLogger(OM4StreamWriter.class); private static final int itemsPerPage = 500; /** Include normalized own-values. */ public static boolean includeNormalizations = true; /** key for entity count in attribute counts map */ private static final String ENT_KEY = "<entity-count>"; /** formatter for isodate tag */ public static DateTimeFormatter dateFormatter = ISODateTimeFormat.date(); /** pattern for bibid in endnote-id attribute */ public static final Pattern bibidPattern = Pattern.compile("#(\\d+)"); /** * Return the object's string representation or "null" if its null. * * @param s * @return */ private static String defaultString(Object s) { if (s == null) { return "null"; } else { return s.toString(); } } /** * Saves all content Entities with their Attributes and Relations in a XML file with the given fileName. * * @param fileName * @param ps */ public static void backupEntities(String fileName, PersistenceService ps) { writeEntsAndRels(fileName, ps, Node.TYPE_ABOX, includeNormalizations); } /** * Saves all definitions in a XML file with the given fileName. * * @param fileName * @param ps */ public static void backupDefinitions(String fileName, PersistenceService ps) { writeEntsAndRels(fileName, ps, Node.TYPE_TBOX, false); } /** * Writes all entities of the given type and their relations to the XML file at fileName. * * Type is either TYPE_TBOX or TYPE_ABOX. * * @param fileName * @param ps * @param type */ private static void writeEntsAndRels(String fileName, PersistenceService ps, String type, boolean includeNorm) { OutputStreamWriter out; try { // statistics collection Maps Map<String, Map<String, Long>> entStats = new HashMap<String, Map<String, Long>>(); Map<String, Map<String, Long>> relStats = new HashMap<String, Map<String, Long>>(); // setup xml writer FileOutputStream fileOut = new FileOutputStream(fileName); out = new OutputStreamWriter(fileOut, "UTF-8"); XMLOutputFactory factory = XMLOutputFactory.newInstance(); XMLStreamWriter writer = factory.createXMLStreamWriter(out); int entitiesCount = 0; writer.writeStartDocument("UTF-8", "1.0"); if (type.equals(Node.TYPE_ABOX)) { writer.writeStartElement(XMLUtil.OPENMIND_DATA); writer.writeAttribute("version", FORMAT_VERSION); // get number of content Entities entitiesCount = ps.getEntityCount(null).intValue(); } else { writer.writeStartElement(XMLUtil.META_DATA); writer.writeAttribute("version", FORMAT_VERSION); // get number of definition Entities entitiesCount = ps.getEntityCount(Node.TYPE_TBOX).intValue(); } int numberOfPages = entitiesCount / itemsPerPage; // debug: int numberOfPages = 1; int counter = 0; long start = System.currentTimeMillis(); DecimalFormat df = new DecimalFormat("#.##"); // list of Relations (filled from Entities) List<Relation> relList = new ArrayList<Relation>(); /* * write entities */ writer.writeStartElement((type.equals(Node.TYPE_TBOX)) ? XMLUtil.DEFINITIONS : XMLUtil.ENTITIES); writer.writeAttribute("count", Integer.toString(entitiesCount)); // iterate database by pages for (int currentPage = 0; currentPage <= numberOfPages; currentPage++) { int startRecord = currentPage * itemsPerPage; List<Entity> entities; if (type.equals(Node.TYPE_ABOX)) { // get page of content Entities entities = ps.getEntityPage(null, startRecord, itemsPerPage); } else { // get page of definition Entities entities = ps.getEntityPage(Node.TYPE_TBOX, startRecord, itemsPerPage); } // iterate entities for (Entity ent : entities) { // write entity to XML writeEntity(ent, writer, ps, includeNorm, entStats); // add (source)relations to list List<Relation> srcRels = ent.getSourceRelations(); relList.addAll(srcRels); // update stats for relations Map<String, Long> entRelStats = entStats.get(ent.getObjectClass()); for (Relation rel: srcRels) { // update source relations updateRelStats(rel, true, entRelStats); } for (Relation rel: ent.getTargetRelations()) { // update target relations updateRelStats(rel, false, entRelStats); } // count entities counter++; } long runtime = System.currentTimeMillis() - start; double percent = ((double) counter / (double) entitiesCount) * 100.0; logger.debug("(" + df.format(percent) + "%) \t[" + counter + "/" + entitiesCount + "]\t"); logger.debug("Speed[ents/s]: " + df.format((double) counter / ((double) runtime / 1000))); writer.flush(); } writer.writeEndElement(); /* * write relations (from list) */ writer.writeStartElement(XMLUtil.RELATIONS); writer.writeAttribute("count", Integer.toString(relList.size())); for (Relation rel : relList) { writeRelation(rel, writer, includeNorm, relStats); } writer.writeEndElement(); /* * write statistics */ // entity stats writeStats(XMLUtil.ENTITY_STATS, XMLUtil.ENTITY, entStats, writer); // relation stats writeStats(XMLUtil.RELATION_STATS, XMLUtil.RELATION, relStats, writer); // end file. writer.writeEndElement(); writer.flush(); writer.close(); logger.info("END Stream Writer"); } catch (IOException e) { logger.error(e); } catch (XMLStreamException e) { logger.error(e); } } /** * Write OpenMind relation to XML. * * @param rel * @param writer * @param relStats * @throws XMLStreamException */ private static void writeRelation(Relation rel, XMLStreamWriter writer, boolean includeNorm, Map<String, Map<String, Long>> relStats) throws XMLStreamException { writer.writeStartElement(XMLUtil.RELATION); // update stats Map<String, Long> attStats = null; if (relStats != null) { attStats = updateNodeStats(rel, relStats); } /* * write XML-attributes */ writer.writeAttribute(XMLUtil.OBJECT_CLASS, defaultString(rel.getObjectClass())); writer.writeAttribute(XMLUtil.ID, defaultString(rel.getId())); writer.writeAttribute(XMLUtil.ROW_ID, defaultString(rel.getRowId())); if (StringUtils.isNotEmpty(rel.getContentType())) { writer.writeAttribute(XMLUtil.CONTENT_TYPE, rel.getContentType()); } writer.writeAttribute(XMLUtil.RELATION_SOURCE_ID, defaultString(rel.getSourceId())); writer.writeAttribute(XMLUtil.RELATION_SOURCE, defaultString(rel.getSourceObjectClass())); writer.writeAttribute(XMLUtil.RELATION_TARGET_ID, defaultString(rel.getTargetId())); writer.writeAttribute(XMLUtil.RELATION_TARGET, defaultString(rel.getTargetObjectClass())); writer.writeAttribute(XMLUtil.VERSION, defaultString(rel.getVersion())); writer.writeAttribute(XMLUtil.MODIFICATION_TIME, defaultString(rel.getModificationTime())); if (rel.getUser() != null) { writer.writeAttribute(XMLUtil.USER, rel.getUser()); } if (rel.getIsPublic()) { writer.writeAttribute(XMLUtil.PUBLIC, "true"); } /* * write OpenMind attributes of this relation as XML tags */ if (rel.getAttributes().size() > 0) { writer.writeStartElement(XMLUtil.ATTRIBUTES); for (Attribute att : rel.getAttributes()) { if (attStats != null) { // update stats updateAttStats(att, attStats); } // write xml writeAttribute(att, writer, includeNorm); } writer.writeEndElement(); } /* * write own value as content */ if (StringUtils.isNotEmpty(rel.getOwnValue())) { writer.writeCharacters(rel.getOwnValue()); } writer.writeEndElement(); } /** * Write OpenMind entity to XML. * * @param entity * @param writer * @param ps * @param entStats * @throws XMLStreamException */ private static void writeEntity(Entity entity, XMLStreamWriter writer, PersistenceService ps, boolean includeNorm, Map<String, Map<String, Long>> entStats) throws XMLStreamException { writer.writeStartElement((entity.getType().equals(Node.TYPE_TBOX)) ? XMLUtil.DEFINITION : XMLUtil.ENTITY); if (entity.isLightweight()) { // make sure we have all attributes and relations entity = ps.getEntityContent(entity); } // update stats Map<String, Long> attStats = updateNodeStats(entity, entStats); /* * write XML attributes */ writer.writeAttribute(XMLUtil.OBJECT_CLASS, defaultString(entity.getObjectClass())); writer.writeAttribute(XMLUtil.ID, defaultString(entity.getId())); writer.writeAttribute(XMLUtil.ROW_ID, defaultString(entity.getRowId())); if (StringUtils.isNotEmpty(entity.getContentType())) { writer.writeAttribute(XMLUtil.CONTENT_TYPE, entity.getContentType()); } writer.writeAttribute(XMLUtil.VERSION, defaultString(entity.getVersion())); writer.writeAttribute(XMLUtil.MODIFICATION_TIME, defaultString(entity.getModificationTime())); if (entity.getUser() != null) { writer.writeAttribute(XMLUtil.USER, entity.getUser()); } if (entity.getIsPublic()) { writer.writeAttribute(XMLUtil.PUBLIC, "true"); } if (StringUtils.isNotEmpty(entity.getStatus())) { writer.writeAttribute(XMLUtil.STATUS, entity.getStatus()); } /* * write OpenMind attributes of this entity as XML tags */ if (entity.getAttributes().size() > 0) { writer.writeStartElement(XMLUtil.ATTRIBUTES); for (Attribute att : entity.getAttributes()) { // skip empty attributes if (StringUtils.isEmpty(att.getValue())) continue; // skip special attributes if (att.getName().equals("is_autograph")) { // skip non-yes values if (!att.getValue().equals("yes")) continue; } // update stats updateAttStats(att, attStats); // write xml writeAttribute(att, writer, includeNorm); } writer.writeEndElement(); } /* * write outgoing relations of this entity as XML tags */ List<Relation> srcRels = entity.getSourceRelations(); if (srcRels.size() > 0) { writer.writeStartElement(XMLUtil.RELATIONS); for (Relation rel : srcRels) { // write xml (without stats) writeRelation(rel, writer, includeNorm, null); } writer.writeEndElement(); } /* * write incoming relations of this entity as XML tags */ List<Relation> tarRels = entity.getTargetRelations(); if (tarRels.size() > 0) { writer.writeStartElement(XMLUtil.INVRELATIONS); for (Relation rel : tarRels) { // write xml (without stats) writeRelation(rel, writer, includeNorm, null); } writer.writeEndElement(); } /* * write own value */ String ov = entity.getOwnValue(); if (StringUtils.isNotEmpty(ov)) { writer.writeCharacters(ov); String nov = entity.getNormalizedOwnValue(); if (includeNorm && StringUtils.isNotEmpty(nov) && !ov.equals(nov)) { // write normalized value writer.writeStartElement(XMLUtil.NORMALIZED); writer.writeCharacters(nov); writer.writeEndElement(); } } writer.writeEndElement(); } private static void writeAttribute(Attribute att, XMLStreamWriter writer, boolean includeNorm) throws XMLStreamException { writer.writeStartElement(XMLUtil.ATTRIBUTE); String name = att.getName(); if (name.equals("ALIAS")) { name = "alias"; // :-( } /* * write XML attributes */ writer.writeAttribute(XMLUtil.ATTRIBUTE_NAME, defaultString(name)); writer.writeAttribute(XMLUtil.ID, defaultString(att.getId())); writer.writeAttribute(XMLUtil.ROW_ID, defaultString(att.getRowId())); writer.writeAttribute(XMLUtil.CONTENT_TYPE, defaultString(att.getContentType())); writer.writeAttribute(XMLUtil.VERSION, defaultString(att.getVersion())); writer.writeAttribute(XMLUtil.MODIFICATION_TIME, defaultString(att.getModificationTime())); if (att.getUser() != null) { writer.writeAttribute(XMLUtil.USER, att.getUser()); } if (att.getIsPublic()) { writer.writeAttribute(XMLUtil.PUBLIC, "true"); } /* * write value as content */ String ov = att.getValue(); if (StringUtils.isNotEmpty(ov)) { writer.writeCharacters(ov); String nov = att.getNormalizedOwnValue(); if (includeNorm && StringUtils.isNotEmpty(nov) && !ov.equals(nov)) { // write normalized value writer.writeStartElement(XMLUtil.NORMALIZED); writer.writeCharacters(nov); writer.writeEndElement(); } boolean processed = false; // convert endnote-id into additional bibid element if (!processed && name.equals("endnote-id")) { Matcher bibidMatch = bibidPattern.matcher(ov); if (bibidMatch.find()) { String bibid = bibidMatch.group(1); writer.writeStartElement(XMLUtil.BIBID); writer.writeCharacters(bibid); writer.writeEndElement(); processed = true; } } // convert any date JSON into additional isodate element if (!processed && ov.startsWith("{")) { try { JSONObject json = new JSONObject(ov); JSONObject date = null; if (json.has("date")) { date = json.getJSONObject("date"); } else if (json.has("from")) { date = json.getJSONObject("from"); } if (date != null) { int year = date.getInt("year"); int month = date.getInt("month"); int day = date.getInt("dayOfMonth"); DateTime dt = new DateTime(year, month, day, 0, 0); writer.writeStartElement(XMLUtil.ISODATE); writer.writeCharacters(dateFormatter.print(dt)); writer.writeEndElement(); processed = true; } } catch (JSONException e) { // maybe not JSON... } } } writer.writeEndElement(); } private static void writeStats(String statsTag, String entryTag, Map<String, Map<String, Long>> nodeStats, XMLStreamWriter writer) throws XMLStreamException { // write stats tag writer.writeStartElement(statsTag); for (String nodeType : nodeStats.keySet()) { Map<String, Long> attStats = nodeStats.get(nodeType); Long nodeCnt = attStats.get(ENT_KEY); // write tag for entity/attribute writer.writeStartElement(entryTag); writer.writeAttribute(XMLUtil.OBJECT_CLASS, (nodeType == null) ? "null" : nodeType); writer.writeAttribute(XMLUtil.COUNT, nodeCnt.toString()); // write attributes for (String attName : attStats.keySet()) { // skip ENT_KEY if (attName.equals(ENT_KEY)) continue; if (attName.contains("[")) { // write relation tag writer.writeStartElement(XMLUtil.RELATION); } else { // write attribute tag writer.writeStartElement(XMLUtil.ATTRIBUTE); } writer.writeAttribute(XMLUtil.ATTRIBUTE_NAME, attName); Long attCnt = attStats.get(attName); writer.writeAttribute(XMLUtil.COUNT, attCnt.toString()); writer.writeEndElement(); } // end of entity/attribute tag writer.writeEndElement(); } // end of stats tag writer.writeEndElement(); } /** * @param objectClass * @param entStats * @return */ protected static Map<String, Long> updateNodeStats(Node ent, Map<String, Map<String, Long>> entStats) { String objectClass = ent.getObjectClass(); Map<String, Long> attStats = entStats.get(objectClass); if (attStats == null) { // create new attribute stats entry attStats = new HashMap<String, Long>(); // add key to count entities attStats.put(ENT_KEY, 1l); // add to map entStats.put(objectClass, attStats); } else { // increment entity count Long entCnt = attStats.get(ENT_KEY); attStats.put(ENT_KEY, entCnt + 1); } return attStats; } /** * @param att * @param attStats */ protected static void updateAttStats(Attribute att, Map<String, Long> attStats) { String attName = att.getName(); Long cnt = attStats.get(attName); if (cnt == null) { attStats.put(attName, 1l); } else { attStats.put(attName, cnt + 1); } } /** * Update relation statistics. * * Relation stats are saved like attribute stats but with "[entity-type]" before * or after the relation name. * * @param rel * @param relStats */ protected static void updateRelStats(Relation rel, boolean isSrcRel, Map<String, Long> relStats) { String relName = rel.getObjectClass(); if (isSrcRel) { relName = relName + "[" + rel.getTargetObjectClass() + "]"; } else { relName = "[" + rel.getSourceObjectClass() + "]" + relName; } Long cnt = relStats.get(relName); if (cnt == null) { relStats.put(relName, 1l); } else { relStats.put(relName, cnt + 1); } } }