Mercurial > hg > IndexMetaContextualizer

package de.mpiwg.indexmeta;
// import stuff
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * Marks the elements of an index.meta XML file that are to be contextualized.
 *
 * Every element whose tag name is in a given list receives a running
 * "context-id" attribute, and the annotated document is written next to the
 * input file with the suffix "-annot". For "author" elements the document is
 * additionally searched for "person" entries carrying the same name.
 */
public class AnnotateIndexMeta {

    /** File that is annotated when no path is given on the command line. */
    private static final String DEFAULT_FILEPATH = "/Users/kthoden/eclipse/workspace/IndexMetaContextualization/data/index.meta/index.meta_FQPFR8XP";

    /** Name of the marker attribute written to every contextualizable element. */
    private static final String CONTEXT_ID_ATTRIBUTE = "context-id";

    /**
     * Command line entry point.
     *
     * @param argv optional; the first argument is the path of the index.meta file to annotate.
     *             Without arguments the built-in default path is used.
     */
    public static void main(String[] argv) {
        System.out.println("in main");

        String filepath = (argv != null && argv.length > 0) ? argv[0] : DEFAULT_FILEPATH;
        // this is a list of all the elements we want to contextualize
        List<String> contextualizableList = Arrays.asList("author", "editor", "publisher", "city", "holding-library", "keywords");
        try {
            xmlParse(filepath, contextualizableList);
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println("Done");
    }

    /**
     * Parses the XML file given as first argument and writes attributes in elements that are to be
     * contextualized. These serve simply as markers for the next tools that are going to fetch these
     * elements to put them in the database.
     *
     * The attribute value is a counter that starts at 0 and runs across all matched elements, in the
     * order of contextualizableList and, within one tag name, in the order of the returned node list.
     *
     * Parser, transformer and I/O failures are not rethrown: their stack trace is printed and the
     * method returns normally, possibly without having written the output file.
     *
     * @param filepath path to the file. It will also be used as the basis for the output file (this adds "-annot").
     * @param contextualizableList contains the elements that shall be given a context identifier which is later
     *        used to grab the contents and put them into the database to have it contextualized.
     * @throws Exception which means that in the source index.meta file there are already markers for
     *         contextualization. Nothing is written in that case.
     * @throws IllegalArgumentException if filepath is null.
     */
    public static void xmlParse(String filepath, List<String> contextualizableList) throws Exception {
        if (filepath == null) {
            throw new IllegalArgumentException("filepath must not be null");
        }
        try {
            // this is how the outputfile will be called
            String outfilepath = filepath + "-annot";
            // open the file and parse it. Going through File keeps paths containing
            // characters such as spaces or '#' from being misread as part of a URI.
            DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
            Document doc = docBuilder.parse(new File(filepath));

            // iterate through the document
            int count = 0;
            for (String contextElement : contextualizableList) {
                NodeList nodeList = doc.getElementsByTagName(contextElement);
                for (int i = 0; i < nodeList.getLength(); i++) {
                    Node node = nodeList.item(i);
                    NamedNodeMap attributes = node.getAttributes();
                    // an already annotated source must not be annotated a second time
                    if (attributes.getNamedItem(CONTEXT_ID_ATTRIBUTE) != null) {
                        throw new Exception("There is already at least one context-id attribute in the source index.meta. This is not allowed. ");
                    }
                    // make a new attribute
                    Attr attribute = doc.createAttribute(CONTEXT_ID_ATTRIBUTE);
                    attribute.setValue(String.valueOf(count));
                    attributes.setNamedItem(attribute);

                    // Just for comfort. Print it out.
                    System.out.println(contextElement);
                    // equals(), not ==: the tag names may be strings built at runtime
                    if ("author".equals(contextElement)) {
                        findContext(doc, node.getTextContent());
                    }
                    count++;
                }
            }
            // write the content into xml file
            TransformerFactory transformerFactory = TransformerFactory.newInstance();
            Transformer transformer = transformerFactory.newTransformer();
            DOMSource source = new DOMSource(doc);
            StreamResult result = new StreamResult(new File(outfilepath));
            transformer.transform(source, result);
        } catch (ParserConfigurationException pce) {
            pce.printStackTrace();
        } catch (TransformerException tfe) {
            tfe.printStackTrace();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } catch (SAXException sae) {
            sae.printStackTrace();
        }
    }

    /**
     * Checks the current index.meta document for already existing contextualizations. For example, newer
     * generations of index.meta (as of 2013) already do have GND information for persons associated with
     * the object in question. However, for the sake of backwards compatibility, the nearly-deprecated
     * "author" element also exists (as well as "city", which is meant to be replaced by "place" which in
     * turn might be superseded by "geo-location").
     *
     * Every "person" element of the document is inspected: its "role" attribute and the text of its first
     * "name" and "identifier" descendants are read. If the name equals currentNodeValue, a message with
     * name, role and identifier is printed. Missing or empty values are treated as the empty string and
     * are never taken over from a previously inspected person.
     *
     * @param doc the parsed index.meta document
     * @param currentNodeValue text content of the author element that is currently being annotated
     */
    public static void findContext(Document doc, String currentNodeValue) {
        NodeList personList = doc.getElementsByTagName("person");
        // look at every element in the list of persons
        for (int i = 0; i < personList.getLength(); i++) {
            Node personNode = personList.item(i);
            if (!(personNode instanceof Element)) {
                continue;
            }
            Element person = (Element) personNode;

            // all three values are scoped to this person so that nothing leaks over from the previous one
            String roleOfPerson = person.getAttribute("role");
            String nameOfPerson = firstDescendantText(person, "name");
            String idOfPerson = firstDescendantText(person, "identifier");

            // now the final check and why we did all this:
            if (nameOfPerson.equals(currentNodeValue)) {
                System.out.println("This person has already been contextualized: " + nameOfPerson  + " hat die Rolle " + roleOfPerson + " und den Identifier " + idOfPerson + ".");
            }
        }
        System.out.println("printing author");
    }

    /**
     * Returns the text content of the first descendant of parent with the given tag name,
     * or the empty string if there is no such descendant or it has no text.
     */
    private static String firstDescendantText(Element parent, String tagName) {
        NodeList matches = parent.getElementsByTagName(tagName);
        if (matches.getLength() == 0) {
            return "";
        }
        String text = matches.item(0).getTextContent();
        return text == null ? "" : text;
    }
}
author	Klaus Thoden <kthoden@mpiwg-berlin.mpg.de>
date	Fri, 12 Apr 2013 13:14:33 +0200
parents	dfce13a5f5f9
children	7a2a98655236 bc57f2660b0f