comparison src/main/java/de/mpiwg/indexmeta/AnnotateIndexMeta.java @ 1:8f6c4dab5d17

First version. Annotates the elements to be contextualized and checks whether some authors already have an ID.
author Klaus Thoden <kthoden@mpiwg-berlin.mpg.de>
date Fri, 12 Apr 2013 13:14:33 +0200
parents dfce13a5f5f9
children 7a2a98655236 bc57f2660b0f
comparison
equal deleted inserted replaced
0:dfce13a5f5f9 1:8f6c4dab5d17
1 package de.mpiwg.indexmeta; 1 package de.mpiwg.indexmeta;
2 // import stuff 2 // import stuff
3 import java.io.File; 3 import java.io.File;
4 import java.io.IOException; 4 import java.io.IOException;
5 import java.util.ArrayList;
5 import java.util.Arrays; 6 import java.util.Arrays;
6 import java.util.List; 7 import java.util.List;
7 8
8 import javax.print.attribute.standard.MediaSize.Other;
9 import javax.xml.parsers.DocumentBuilder; 9 import javax.xml.parsers.DocumentBuilder;
10 import javax.xml.parsers.DocumentBuilderFactory; 10 import javax.xml.parsers.DocumentBuilderFactory;
11 import javax.xml.parsers.ParserConfigurationException; 11 import javax.xml.parsers.ParserConfigurationException;
12 import javax.xml.transform.Transformer; 12 import javax.xml.transform.Transformer;
13 import javax.xml.transform.TransformerException; 13 import javax.xml.transform.TransformerException;
15 import javax.xml.transform.dom.DOMSource; 15 import javax.xml.transform.dom.DOMSource;
16 import javax.xml.transform.stream.StreamResult; 16 import javax.xml.transform.stream.StreamResult;
17 17
18 import org.w3c.dom.Attr; 18 import org.w3c.dom.Attr;
19 import org.w3c.dom.Document; 19 import org.w3c.dom.Document;
20 import org.w3c.dom.Element;
20 import org.w3c.dom.NamedNodeMap; 21 import org.w3c.dom.NamedNodeMap;
21 import org.w3c.dom.Node; 22 import org.w3c.dom.Node;
22 import org.w3c.dom.NodeList; 23 import org.w3c.dom.NodeList;
23 import org.xml.sax.SAXException; 24 import org.xml.sax.SAXException;
24 25
25 public class AnnotateIndexMeta { 26 public class AnnotateIndexMeta {
26 27
27 public static void main(String argv[]) { 28 public static void main(String argv[]) {
28 System.out.println("in main"); 29 System.out.println("in main");
29 30
30 // method call 31 // method call
31 String filepath = "/Users/kthoden/eclipse/workspace/dm2eStuff/data/index.meta"; 32 String filepath = "/Users/kthoden/eclipse/workspace/IndexMetaContextualization/data/index.meta/index.meta_FQPFR8XP";
32 // this is a list of all the elements we want to contextualize 33 // this is a list of all the elements we want to contextualize
33 List<String> contextualizableList = Arrays.asList(new String[]{"author","editor","publisher","city","holding-library","keywords"}); 34 List<String> contextualizableList = Arrays.asList(new String[]{"author","editor","publisher","city","holding-library","keywords"});
34 xmlParse(filepath,contextualizableList); 35 try {xmlParse(filepath,contextualizableList);
36 }
37 catch (Exception e) {
38 e.printStackTrace();
39 };
35 System.out.println("Done"); 40 System.out.println("Done");
36 } 41 }
37 42
38 /** 43 /**
39 * Parses the XML file given as first argument and writes attributes in elements that are to be contextualized. 44 * Parses the XML file given as first argument and writes attributes in elements that are to be contextualized. These serve simply as markers for the next tools that are going to fetch these elements to put them in the database.
40 * @param filepath path to the file. It will also be used as the basis for the output file (this adds "-annot"). 45 * @param filepath path to the file. It will also be used as the basis for the output file (this adds "-annot").
41 * @param contextualizableList contains the elements that shall be given a context identifier which is later used to grab the contents and put them into the database to have it contextualized. 46 * @param contextualizableList contains the elements that shall be given a context identifier which is later used to grab the contents and put them into the database to have it contextualized.
42 * 47 * @throws Exception if the source index.meta file already contains markers for contextualization (context-id attributes).
43 */ 48 *
44 public static void xmlParse(String filepath, List<String> contextualizableList) { 49 */
50 public static void xmlParse(String filepath, List<String> contextualizableList) throws Exception {
45 try { 51 try {
46 // this is how the outputfile will be called 52 // this is how the outputfile will be called
47 String outfilepath = filepath + "-annot"; 53 String outfilepath = filepath + "-annot";
48 // open the file and parse it 54 // open the file and parse it
49 DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); 55 DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
54 Integer count = 0; 60 Integer count = 0;
55 for(String contextElement : contextualizableList){ 61 for(String contextElement : contextualizableList){
56 NodeList nodeList = doc.getElementsByTagName(contextElement); 62 NodeList nodeList = doc.getElementsByTagName(contextElement);
57 for(int i=0; i < nodeList.getLength(); i++){ 63 for(int i=0; i < nodeList.getLength(); i++){
58 Node iter2 = nodeList.item(i); 64 Node iter2 = nodeList.item(i);
65 String currentNodeValue = iter2.getTextContent();
59 NamedNodeMap attr = iter2.getAttributes(); 66 NamedNodeMap attr = iter2.getAttributes();
60 // make a new attribute 67 // make a new attribute
61 // DONE would be good if it left existing outputs alone
62 if (attr.getNamedItem("context-id") == null){ 68 if (attr.getNamedItem("context-id") == null){
63 Attr attribute = doc.createAttribute ("context-id"); 69 Attr attribute = doc.createAttribute ("context-id");
64 attribute.setValue (count.toString()); 70 attribute.setValue (count.toString());
65 attr.setNamedItem (attribute); 71 attr.setNamedItem (attribute);
66 } 72 }
67 else { 73 else {throw new Exception("There is already at least one context-id attribute in the source index.meta. This is not allowed. ");
68 System.out.println("schon da: " + attr.getNamedItem("context-id")); 74 }
69 }
70 // Just for comfort. Print it out. 75 // Just for comfort. Print it out.
71 System.out.println(contextElement); 76 System.out.println(contextElement);
77 if (contextElement == "author") {
78 findContext(doc, currentNodeValue);
79 }
72 count++; 80 count++;
73 } 81 }
74 // get the element by name (so they should be unique?) 82 // get the element by name (so they should be unique?)
75 //Node iter2 = doc.getElementsByTagName(contextElement).item(0); 83 //Node iter2 = doc.getElementsByTagName(contextElement).item(0);
76 } 84 }
78 TransformerFactory transformerFactory = TransformerFactory.newInstance(); 86 TransformerFactory transformerFactory = TransformerFactory.newInstance();
79 Transformer transformer = transformerFactory.newTransformer(); 87 Transformer transformer = transformerFactory.newTransformer();
80 DOMSource source = new DOMSource(doc); 88 DOMSource source = new DOMSource(doc);
81 StreamResult result = new StreamResult(new File(outfilepath)); 89 StreamResult result = new StreamResult(new File(outfilepath));
82 transformer.transform(source, result); 90 transformer.transform(source, result);
83 91 /*
92 * should these really go inside this method?
93 */
84 } catch (ParserConfigurationException pce) { 94 } catch (ParserConfigurationException pce) {
85 pce.printStackTrace(); 95 pce.printStackTrace();
86 } catch (TransformerException tfe) { 96 } catch (TransformerException tfe) {
87 tfe.printStackTrace(); 97 tfe.printStackTrace();
88 } catch (IOException ioe) { 98 } catch (IOException ioe) {
89 ioe.printStackTrace(); 99 ioe.printStackTrace();
90 } catch (SAXException sae) { 100 } catch (SAXException sae) {
91 sae.printStackTrace(); 101 sae.printStackTrace();
92 } 102 }
93 } 103 }
104
105 /**
106 * This method checks the current index.meta file for already existing contextualizations. For example, newer generations of index.meta (as of 2013) already have GND information for persons associated with the object in question.
107 * However, for the sake of backwards compatibility, the nearly-deprecated "author" element still exists as well (as does "city", which is meant to be replaced by "place", which in turn might be superseded by "geo-location").
108 * Technically, we parse the XML and construct a map containing a person's name, remote ID and role.
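109 * @param doc the parsed index.meta document that is searched for "person" elements
110 * @param currentNodeValue text content of the element currently being annotated; it is compared against each person's name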
111 */
112 public static void findContext(Document doc, String currentNodeValue) {
113 // first, define some variables
114 String nameOfPerson = "";
115 String roleOfPerson = "";
116 String idOfPerson= "";
117
118 // next, we try to see if there is already a contextualized author
119 // let us concentrate on that element
120 // then we look for tags called person
121 // if there are any, we take the liberty of querying them. This is a Nodelist
122 NodeList personList = doc.getElementsByTagName("person");
123 // Debug information for the human eye.
124 // System.out.println("The current node value is "+ currentNodeValue + ". Let's do something useful in the findContext method.");
125 // System.out.println("This node list has " + personList.getLength() + " members: " + personList.item(0) + "and" + personList.item(1));
126 // Integer personCounter = 1;
127 // look at every element in the list of persons
128 for(int countPerson=0; countPerson < personList.getLength(); countPerson++){
129 // just some control
130 // System.out.println("This is person number " + personCounter);
131 // drill down a bit further. We now can access the person list
132 Node iterPerson = personList.item(countPerson);
133
134 // this here produces the role of a person
135 if (iterPerson instanceof Element) {
136 Element e = (Element)iterPerson;
137 roleOfPerson = e.getAttribute("role");
138 // System.out.println("Rolle: " + roleOfPerson);
139
140 // there will also be a name attached. It is so written in the index.meta specification. Can we trust that?
141 NodeList l0 = e.getElementsByTagName("name");
142 if(l0.getLength() > 0){
143 Node name = l0.item(0);
144 nameOfPerson = name.getFirstChild().getNodeValue();
145 // System.out.println("Name: " + nameOfPerson);
146 }
147
148 // and the identifier, this should be there, too. Maybe it's not...
149 NodeList l1 = e.getElementsByTagName("identifier");
150 if(l1.getLength() > 0){
151 Node name = l1.item(0);
152 idOfPerson = name.getFirstChild().getNodeValue();
153 //System.out.println("Identifier: " + idOfPerson);
154 }
155 // System.out.println("Current Node Value " + currentNodeValue + ". Name of Person " + nameOfPerson);
156 // now the final check and why we did all this:
157 if (nameOfPerson.equals(currentNodeValue)) {
158 ArrayList<String> authorInfo = new ArrayList<String>();
159 authorInfo.add(nameOfPerson);
160 authorInfo.add(roleOfPerson);
161 authorInfo.add(idOfPerson);
162
163 System.out.println("This person has already been contextualized: " + nameOfPerson + " hat die Rolle " + roleOfPerson + " und den Identifier " + idOfPerson + ".");
164 }}
165 // personCounter ++;
166 }
167 System.out.println("printing author");
168 }
94 } 169 }
170