annotate src/main/java/de/mpiwg/indexmeta/AnnotateIndexMeta.java @ 0:dfce13a5f5f9

nit project!
author Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
date Thu, 11 Apr 2013 15:25:26 +0200
parents
children 8f6c4dab5d17
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
1 package de.mpiwg.indexmeta;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
2 // import stuff
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
3 import java.io.File;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
4 import java.io.IOException;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
5 import java.util.Arrays;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
6 import java.util.List;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
7
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
8 import javax.print.attribute.standard.MediaSize.Other;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
9 import javax.xml.parsers.DocumentBuilder;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
10 import javax.xml.parsers.DocumentBuilderFactory;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
11 import javax.xml.parsers.ParserConfigurationException;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
12 import javax.xml.transform.Transformer;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
13 import javax.xml.transform.TransformerException;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
14 import javax.xml.transform.TransformerFactory;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
15 import javax.xml.transform.dom.DOMSource;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
16 import javax.xml.transform.stream.StreamResult;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
17
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
18 import org.w3c.dom.Attr;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
19 import org.w3c.dom.Document;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
20 import org.w3c.dom.NamedNodeMap;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
21 import org.w3c.dom.Node;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
22 import org.w3c.dom.NodeList;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
23 import org.xml.sax.SAXException;
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
24
dfce13a5f5f9 nit project!
Jorge Urzua <jurzua@mpiwg-berlin.mpg.de>
parents:
diff changeset
/**
 * Command-line helper that marks selected elements of an XML file with a
 * {@code context-id} attribute and writes the result next to the input file.
 */
public class AnnotateIndexMeta {

    /** Input file used by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_FILEPATH = "/Users/kthoden/eclipse/workspace/dm2eStuff/data/index.meta";

    /** Name of the attribute written onto every contextualizable element. */
    private static final String CONTEXT_ID = "context-id";

    /**
     * Annotates one file with the default list of contextualizable elements.
     *
     * @param argv optional; {@code argv[0]} is the path of the file to annotate.
     *             Without arguments the built-in default path is used.
     */
    public static void main(String argv[]) {
        System.out.println("in main");

        String filepath = (argv != null && argv.length > 0) ? argv[0] : DEFAULT_FILEPATH;
        // this is a list of all the elements we want to contextualize
        List<String> contextualizableList =
                Arrays.asList("author", "editor", "publisher", "city", "holding-library", "keywords");
        xmlParse(filepath, contextualizableList);
        System.out.println("Done");
    }

    /**
     * Parses the XML file given as first argument and writes a {@code context-id}
     * attribute into every element that is to be contextualized. Elements that
     * already carry a {@code context-id} are left untouched, and newly assigned
     * ids never reuse a value that is already present anywhere in the document.
     * The annotated document is written to {@code filepath + "-annot"}.
     *
     * <p>Parser, transformer and I/O failures are reported on standard error and
     * are not propagated to the caller.
     *
     * @param filepath path to the file. It will also be used as the basis for the
     *                 output file (this adds "-annot").
     * @param contextualizableList names of the elements that shall be given a
     *                 context identifier.
     * @throws IllegalArgumentException if {@code filepath} is {@code null}
     */
    public static void xmlParse(String filepath, List<String> contextualizableList) {
        if (filepath == null) {
            throw new IllegalArgumentException("filepath must not be null");
        }
        try {
            // this is how the output file will be called
            String outfilepath = filepath + "-annot";

            // Parse via File: the String overload of parse() expects a URI, not a path.
            DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
            Document doc = docBuilder.parse(new File(filepath));

            // Ids that are already taken, so that new ids stay unique.
            java.util.Set<String> usedIds = collectContextIds(doc);

            // The counter advances once per visited element, annotated or not.
            int count = 0;
            for (String contextElement : contextualizableList) {
                NodeList nodeList = doc.getElementsByTagName(contextElement);
                for (int i = 0; i < nodeList.getLength(); i++) {
                    NamedNodeMap attr = nodeList.item(i).getAttributes();
                    Node existing = attr.getNamedItem(CONTEXT_ID);
                    if (existing == null) {
                        // Skip values that some other element already uses.
                        while (usedIds.contains(String.valueOf(count))) {
                            count++;
                        }
                        String id = String.valueOf(count);
                        Attr attribute = doc.createAttribute(CONTEXT_ID);
                        attribute.setValue(id);
                        attr.setNamedItem(attribute);
                        usedIds.add(id);
                    } else {
                        System.out.println("schon da: " + existing);
                    }
                    // Just for comfort. Print it out.
                    System.out.println(contextElement);
                    count++;
                }
            }

            // write the content into xml file
            TransformerFactory transformerFactory = TransformerFactory.newInstance();
            Transformer transformer = transformerFactory.newTransformer();
            DOMSource source = new DOMSource(doc);
            StreamResult result = new StreamResult(new File(outfilepath));
            transformer.transform(source, result);

        } catch (ParserConfigurationException pce) {
            reportFailure(filepath, pce);
        } catch (TransformerException tfe) {
            reportFailure(filepath, tfe);
        } catch (IOException ioe) {
            reportFailure(filepath, ioe);
        } catch (SAXException sae) {
            reportFailure(filepath, sae);
        }
    }

    /** Returns the values of all {@code context-id} attributes present in the document. */
    private static java.util.Set<String> collectContextIds(Document doc) {
        java.util.Set<String> ids = new java.util.HashSet<String>();
        // "*" matches every element in the document.
        NodeList all = doc.getElementsByTagName("*");
        for (int i = 0; i < all.getLength(); i++) {
            Node id = all.item(i).getAttributes().getNamedItem(CONTEXT_ID);
            if (id != null) {
                ids.add(id.getNodeValue());
            }
        }
        return ids;
    }

    /** Prints which file could not be annotated, followed by the stack trace of the cause. */
    private static void reportFailure(String filepath, Exception cause) {
        System.err.println("Could not annotate " + filepath);
        cause.printStackTrace();
    }
}