IndexMetaContextualizer: src/main/java/de/mpiwg/indexmeta/AnnotateIndexMeta.java comparison

comparison src/main/java/de/mpiwg/indexmeta/AnnotateIndexMeta.java @ 1:8f6c4dab5d17

First version. Annotates the elements to be contextualized and checks whether some authors already have an ID.

author	Klaus Thoden <kthoden@mpiwg-berlin.mpg.de>
date	Fri, 12 Apr 2013 13:14:33 +0200
parents	dfce13a5f5f9
children	7a2a98655236 bc57f2660b0f

comparison

equal deleted inserted replaced

-:dfce13a5f5f9
+:8f6c4dab5d17
 package de.mpiwg.indexmeta;
 // import stuff
 import java.io.File;
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
-import javax.print.attribute.standard.MediaSize.Other;
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerException;
 import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;
 import org.w3c.dom.Attr;
 import org.w3c.dom.Document;
+import org.w3c.dom.Element;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.xml.sax.SAXException;
 public class AnnotateIndexMeta {
-public static void main(String argv[]) {
+public static void main(String argv[])  {
 System.out.println("in main");
 // Methodenaufruf
-String filepath = "/Users/kthoden/eclipse/workspace/dm2eStuff/data/index.meta";
+String filepath = "/Users/kthoden/eclipse/workspace/IndexMetaContextualization/data/index.meta/index.meta_FQPFR8XP";
 // this is a list of all the elements we want to contextualize
 List<String> contextualizableList = Arrays.asList(new String[]{"author","editor","publisher","city","holding-library","keywords"});
-xmlParse(filepath,contextualizableList);
+try {xmlParse(filepath,contextualizableList);
+}
+catch (Exception e) {
+e.printStackTrace();
+};
 System.out.println("Done");
 }
 /**
-* Parses the XML file given as first argument and writes attributes in elements that are to be contextualized.
+* Parses the XML file given as first argument and writes attributes in elements that are to be contextualized. These serve simply as markers for the next tools that are going to fetch these elements to put them in the database.
 * @param filepath path to the file. It will also be used as the basis for the output file (this adds "-annot").
 * @param contextualizableList contains the elements that shall be given a context identifier which is later used to grab the contents and put them into the database to have it contextualized.
-*
+* @throws Exception if an element to be contextualized in the source index.meta file already carries a context-id attribute.
-*/
+*
-public static void xmlParse(String filepath, List<String> contextualizableList) {
+*/
+public static void xmlParse(String filepath, List<String> contextualizableList) throws Exception {
 try {
 // this is how the outputfile will be called
 String outfilepath = filepath + "-annot";
 // open the file and parse it
 DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
 Integer count = 0;
 for(String contextElement : contextualizableList){
 NodeList nodeList = doc.getElementsByTagName(contextElement);
 for(int i=0; i < nodeList.getLength(); i++){
 Node iter2 = nodeList.item(i);
+String currentNodeValue = iter2.getTextContent();
 NamedNodeMap attr = iter2.getAttributes();
 // make a new attribute
-// DONE would be good if it left existing outputs alone
 if (attr.getNamedItem("context-id") == null){
 Attr attribute = doc.createAttribute ("context-id");
 attribute.setValue (count.toString());
 attr.setNamedItem (attribute);
 }
-else {
+else {throw new Exception("There is already at least one context-id attribute in the source index.meta. This is not allowed. ");
-System.out.println("schon da: " + attr.getNamedItem("context-id"));
+}
-}
 // Just for comfort. Print it out.
 System.out.println(contextElement);
+if (contextElement == "author") {
+findContext(doc, currentNodeValue);
+}
 count++;
 }
 // get the element by name (so they should be unique?)
 //Node iter2 = doc.getElementsByTagName(contextElement).item(0);
 }
 TransformerFactory transformerFactory = TransformerFactory.newInstance();
 Transformer transformer = transformerFactory.newTransformer();
 DOMSource source = new DOMSource(doc);
 StreamResult result = new StreamResult(new File(outfilepath));
 transformer.transform(source, result);
+/*
+* should these really go inside this method?
+*/
 } catch (ParserConfigurationException pce) {
 pce.printStackTrace();
 } catch (TransformerException tfe) {
 tfe.printStackTrace();
 } catch (IOException ioe) {
 ioe.printStackTrace();
 } catch (SAXException sae) {
 sae.printStackTrace();
 }
 }
+/**
+ * Checks whether the person named in an element that is about to be contextualized
+ * already appears, with an identifier, elsewhere in the same index.meta document.
+ * <p>
+ * Newer generations of index.meta (as of 2013) may already contain GND information for
+ * the persons associated with the object in question. For the sake of backwards
+ * compatibility the nearly deprecated "author" element still exists alongside them (as
+ * does "city", which is meant to be replaced by "place", which in turn might be
+ * superseded by "geo-location").
+ * <p>
+ * Every "person" element of the document is inspected: its "role" attribute and the text
+ * of its first "name" and "identifier" descendants are read. If the name equals
+ * {@code currentNodeValue}, a message is printed to standard output. Nothing is returned
+ * or stored.
+ *
+ * @param doc the parsed index.meta document to search
+ * @param currentNodeValue text content of the element being checked
+ */
+public static void findContext(Document doc, String currentNodeValue) {
+    // All "person" elements anywhere in the document.
+    NodeList personList = doc.getElementsByTagName("person");
+    for (int countPerson = 0; countPerson < personList.getLength(); countPerson++) {
+        Node iterPerson = personList.item(countPerson);
+        if (!(iterPerson instanceof Element)) {
+            continue;
+        }
+        Element person = (Element) iterPerson;
+        // Declared per person so that a missing name or identifier is reported as empty
+        // instead of silently reusing the value read from the previous person.
+        String roleOfPerson = person.getAttribute("role");
+        String nameOfPerson = firstDescendantText(person, "name");
+        String idOfPerson = firstDescendantText(person, "identifier");
+        // An empty name cannot identify anybody, so it never counts as a match.
+        if (!nameOfPerson.isEmpty() && nameOfPerson.equals(currentNodeValue)) {
+            System.out.println("This person has already been contextualized: " + nameOfPerson  + " hat die Rolle " + roleOfPerson + " und den Identifier " + idOfPerson + ".");
+        }
+    }
+    System.out.println("printing author");
+}
+
+/**
+ * Returns the text content of the first descendant of {@code parent} with the given tag
+ * name, or an empty string if there is no such descendant or it has no text.
+ */
+private static String firstDescendantText(Element parent, String tagName) {
+    NodeList matches = parent.getElementsByTagName(tagName);
+    if (matches.getLength() == 0) {
+        return "";
+    }
+    // getTextContent() is safe on an empty element, unlike getFirstChild().getNodeValue().
+    String text = matches.item(0).getTextContent();
+    return text == null ? "" : text;
+}
 }

Mercurial > hg > IndexMetaContextualizer

comparison src/main/java/de/mpiwg/indexmeta/AnnotateIndexMeta.java @ 1:8f6c4dab5d17