changeset 1:8f6c4dab5d17

First version. Annotates the elements to be contextualized and checks whether some authors already have an ID.
author Klaus Thoden <kthoden@mpiwg-berlin.mpg.de>
date Fri, 12 Apr 2013 13:14:33 +0200
parents dfce13a5f5f9
children 58583fbe0606
files src/main/java/de/mpiwg/indexmeta/AnnotateIndexMeta.java
diffstat 1 files changed, 96 insertions(+), 20 deletions(-) [+]
line wrap: on
line diff
--- a/src/main/java/de/mpiwg/indexmeta/AnnotateIndexMeta.java	Thu Apr 11 15:25:26 2013 +0200
+++ b/src/main/java/de/mpiwg/indexmeta/AnnotateIndexMeta.java	Fri Apr 12 13:14:33 2013 +0200
@@ -2,10 +2,10 @@
 // import stuff
 import java.io.File;
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
-import javax.print.attribute.standard.MediaSize.Other;
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
@@ -17,6 +17,7 @@
 
 import org.w3c.dom.Attr;
 import org.w3c.dom.Document;
+import org.w3c.dom.Element;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
@@ -24,24 +25,29 @@
 
 public class AnnotateIndexMeta {
 
-    public static void main(String argv[]) {
+    public static void main(String[] argv) {
         System.out.println("in main");
 
         // Methodenaufruf
-        String filepath = "/Users/kthoden/eclipse/workspace/dm2eStuff/data/index.meta";
+        String filepath = argv.length > 0 ? argv[0] : "/Users/kthoden/eclipse/workspace/IndexMetaContextualization/data/index.meta/index.meta_FQPFR8XP"; // first command-line argument, if given, overrides the default input file
         // this is a list of all the elements we want to contextualize
         List<String> contextualizableList = Arrays.asList(new String[]{"author","editor","publisher","city","holding-library","keywords"});
-        xmlParse(filepath,contextualizableList);
+        try {
+            xmlParse(filepath, contextualizableList);
+        } catch (Exception e) { // xmlParse throws when the input already carries context-id attributes
+            e.printStackTrace();
+        }
         System.out.println("Done");
     }
 
-   /**
-    * Parses the XML file given as first argument and writes attributes in elements that are to be contextualized. 
-    * @param filepath path to the file. It will also be used as the basis for the output file (this adds "-annot").
-    * @param contextualizableList contains the elements that shall be given a context identifier which is later used to grab the contents and put them into the database to have it contextualized.
-    * 
-    */
-    public static void xmlParse(String filepath, List<String> contextualizableList) {
+    /**
+     * Parses the XML file given as first argument and writes attributes in elements that are to be contextualized. These serve simply as markers for the next tools that are going to fetch these elements to put them in the database.
+     * @param filepath path to the file. It will also be used as the basis for the output file (this adds "-annot").
+     * @param contextualizableList contains the elements that shall be given a context identifier which is later used to grab the contents and put them into the database to have it contextualized.
+     * @throws Exception if the source index.meta file already contains a context-id attribute on one of the elements to be contextualized.
+     * 
+     */
+    public static void xmlParse(String filepath, List<String> contextualizableList) throws Exception {
         try {
             // this is how the outputfile will be called
             String outfilepath = filepath + "-annot";
@@ -56,19 +62,21 @@
                 NodeList nodeList = doc.getElementsByTagName(contextElement);
                 for(int i=0; i < nodeList.getLength(); i++){
                     Node iter2 = nodeList.item(i);
+                    String currentNodeValue = iter2.getTextContent();
                     NamedNodeMap attr = iter2.getAttributes();
                     // make a new attribute
-                    // DONE would be good if it left existing outputs alone
                     if (attr.getNamedItem("context-id") == null){
-                    Attr attribute = doc.createAttribute ("context-id");
-                    attribute.setValue (count.toString());
-                    attr.setNamedItem (attribute);
+                        Attr attribute = doc.createAttribute ("context-id");
+                        attribute.setValue (count.toString());
+                        attr.setNamedItem (attribute);
                     }
-                    else {
-                        System.out.println("schon da: " + attr.getNamedItem("context-id"));
-                        }
+                    else {throw new Exception("There is already at least one context-id attribute in the source index.meta. This is not allowed. ");
+                    }
                     // Just for comfort. Print it out.
                     System.out.println(contextElement);
+                    if (contextElement == "author") {
+                        findContext(doc, currentNodeValue);
+                    }
                     count++;
                 }
                 // get the element by name (so they should be unique?)
@@ -80,7 +88,9 @@
             DOMSource source = new DOMSource(doc);
             StreamResult result = new StreamResult(new File(outfilepath));
             transformer.transform(source, result);
-
+            /*
+             * should these really go inside this method?
+             */
         } catch (ParserConfigurationException pce) {
             pce.printStackTrace();
         } catch (TransformerException tfe) {
@@ -91,4 +101,70 @@
             sae.printStackTrace();
         }
     }
-}
\ No newline at end of file
+
+    /**
+     * Reports whether the given author is already contextualized in this index.meta document.
+     * Newer generations of index.meta (as of 2013) may already carry GND information for persons
+     * associated with the object in question. For the sake of backwards compatibility, the
+     * nearly-deprecated "author" element still exists alongside it (as does "city", which is meant
+     * to be replaced by "place", which in turn might be superseded by "geo-location").
+     * <p>
+     * Every "person" element of the document is inspected: its "role" attribute and the text of its
+     * first "name" and "identifier" descendants are read. If the name equals
+     * {@code currentNodeValue}, a message with name, role and identifier is printed to standard
+     * output. Nothing is returned or stored.
+     * <p>
+     * A missing "role" attribute and a missing or empty "name" or "identifier" element are each
+     * treated as an empty string; a person with an empty name is never reported.
+     *
+     * @param doc the parsed index.meta document to search for "person" elements
+     * @param currentNodeValue text content of the "author" element that is being annotated
+     */
+    public static void findContext(Document doc, String currentNodeValue) {
+        // Collects every "person" element of the document, wherever it is nested.
+        NodeList personList = doc.getElementsByTagName("person");
+        for (int countPerson = 0; countPerson < personList.getLength(); countPerson++) {
+            Node iterPerson = personList.item(countPerson);
+            // NodeList only promises Node, so check before casting.
+            if (!(iterPerson instanceof Element)) {
+                continue;
+            }
+            Element person = (Element) iterPerson;
+
+            // All three values are looked up afresh for every person. Keeping them in variables
+            // outside the loop would let a person without a "name" or "identifier" child inherit
+            // the values of the previously visited person and be reported under a wrong identity.
+            String roleOfPerson = person.getAttribute("role");
+            String nameOfPerson = firstDescendantText(person, "name");
+            String idOfPerson = firstDescendantText(person, "identifier");
+
+            // An empty name carries no information, so it never counts as a match.
+            if (!nameOfPerson.isEmpty() && nameOfPerson.equals(currentNodeValue)) {
+                System.out.println("This person has already been contextualized: " + nameOfPerson  + " hat die Rolle " + roleOfPerson + " und den Identifier " + idOfPerson + ".");
+            }
+        }
+        // Progress marker, printed once per call.
+        System.out.println("printing author");
+    }
+
+    /**
+     * Returns the text content of the first descendant of {@code parent} with the given tag name.
+     * An empty string is returned when there is no such descendant. Using getTextContent() instead
+     * of getFirstChild().getNodeValue() avoids a NullPointerException on empty elements such as
+     * {@code <name/>} and mirrors how xmlParse obtains the author text it passes to findContext.
+     *
+     * @param parent element whose descendants are searched
+     * @param tagName tag name to look for
+     * @return the text of the first matching descendant, or "" if none exists
+     */
+    private static String firstDescendantText(Element parent, String tagName) {
+        NodeList matches = parent.getElementsByTagName(tagName);
+        if (matches.getLength() == 0) {
+            return "";
+        }
+
+        // For an Element, getTextContent() is the concatenated text of its descendants ("" if none).
+        return matches.item(0).getTextContent();
+    }
+}
+