Mercurial > hg > IndexMetaContextualizer

package de.mpiwg.indexmeta;
// import stuff
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import javax.print.attribute.standard.MediaSize.Other;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * Command-line helper that adds a {@code context-id} attribute to selected
 * elements of an XML file and writes the result next to the input file with
 * the suffix {@code -annot}.
 */
public class AnnotateIndexMeta {

    /** Name of the attribute that carries the context identifier. */
    private static final String CONTEXT_ID_ATTRIBUTE = "context-id";

    /** Input file used when no path is passed on the command line. */
    private static final String DEFAULT_FILEPATH = "/Users/kthoden/eclipse/workspace/dm2eStuff/data/index.meta";

    /**
     * Entry point: annotates one file with a fixed list of element names.
     *
     * @param argv optional; if present, {@code argv[0]} is the path of the file to annotate,
     *             otherwise {@link #DEFAULT_FILEPATH} is used.
     */
    public static void main(String argv[]) {
        System.out.println("in main");

        // method call: take the path from the command line if one was given
        String filepath = (argv != null && argv.length > 0) ? argv[0] : DEFAULT_FILEPATH;
        // this is a list of all the elements we want to contextualize
        List<String> contextualizableList = Arrays.asList("author", "editor", "publisher", "city", "holding-library", "keywords");
        xmlParse(filepath, contextualizableList);
        System.out.println("Done");
    }

    /**
     * Parses the XML file given as first argument and writes a {@code context-id} attribute into
     * every element that is to be contextualized and does not have one yet.
     *
     * <p>Elements that already carry a {@code context-id} are left untouched. Newly assigned
     * identifiers never reuse a value that is already present on one of the listed elements, so
     * re-running the tool on a partially annotated file does not produce duplicate identifiers.
     * On a file without any existing identifiers the values are 0, 1, 2, ... in processing order.
     *
     * <p>Parser, transformer and I/O failures are not propagated; their stack trace is printed
     * and the method returns.
     *
     * @param filepath path to the file. It will also be used as the basis for the output file
     *                 (this adds "-annot").
     * @param contextualizableList contains the names of the elements that shall be given a
     *                 context identifier.
     * @throws IllegalArgumentException if {@code filepath} is {@code null}
     */
    public static void xmlParse(String filepath, List<String> contextualizableList) {
        if (filepath == null) {
            throw new IllegalArgumentException("filepath must not be null");
        }
        try {
            // this is how the output file will be called
            String outfilepath = filepath + "-annot";

            // open the file and parse it
            DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
            // parse(String) interprets its argument as a URI; go through File when the argument
            // names an existing file, and keep the URI form as a fallback for other callers.
            File inputFile = new File(filepath);
            Document doc = inputFile.isFile() ? docBuilder.parse(inputFile) : docBuilder.parse(filepath);

            // identifiers that are already taken and must not be handed out again
            Set<String> usedIds = collectExistingIds(doc, contextualizableList);

            // iterate through the document
            int count = 0;
            for (String contextElement : contextualizableList) {
                NodeList nodeList = doc.getElementsByTagName(contextElement);
                for (int i = 0; i < nodeList.getLength(); i++) {
                    NamedNodeMap attr = nodeList.item(i).getAttributes();
                    Node existing = attr.getNamedItem(CONTEXT_ID_ATTRIBUTE);
                    if (existing == null) {
                        // skip values that some other element already uses
                        while (usedIds.contains(Integer.toString(count))) {
                            count++;
                        }
                        String id = Integer.toString(count);
                        Attr attribute = doc.createAttribute(CONTEXT_ID_ATTRIBUTE);
                        attribute.setValue(id);
                        attr.setNamedItem(attribute);
                        usedIds.add(id);
                    } else {
                        // leave existing identifiers alone
                        System.out.println("schon da: " + existing);
                    }
                    // Just for comfort. Print it out.
                    System.out.println(contextElement);
                    // the counter advances for every visited element, annotated or not
                    count++;
                }
            }

            // write the content into xml file
            TransformerFactory transformerFactory = TransformerFactory.newInstance();
            Transformer transformer = transformerFactory.newTransformer();
            DOMSource source = new DOMSource(doc);
            StreamResult result = new StreamResult(new File(outfilepath));
            transformer.transform(source, result);

        } catch (ParserConfigurationException pce) {
            pce.printStackTrace();
        } catch (TransformerException tfe) {
            tfe.printStackTrace();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } catch (SAXException sae) {
            sae.printStackTrace();
        }
    }

    /** Collects the context-id values already present on the elements named in the list. */
    private static Set<String> collectExistingIds(Document doc, List<String> elementNames) {
        Set<String> ids = new HashSet<String>();
        for (String elementName : elementNames) {
            NodeList nodeList = doc.getElementsByTagName(elementName);
            for (int i = 0; i < nodeList.getLength(); i++) {
                Node existing = nodeList.item(i).getAttributes().getNamedItem(CONTEXT_ID_ATTRIBUTE);
                if (existing != null) {
                    ids.add(existing.getNodeValue());
                }
            }
        }
        return ids;
    }
}
author	Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
date	Thu, 11 Apr 2013 15:25:26 +0200
parents
children	8f6c4dab5d17