Mercurial > hg > IndexMetaContextualizer
comparison src/main/java/de/mpiwg/indexmeta/AnnotateIndexMeta.java @ 1:8f6c4dab5d17
First version. Annotates the elements to be contextualized and checks whether some authors already have an ID.
author | Klaus Thoden <kthoden@mpiwg-berlin.mpg.de> |
---|---|
date | Fri, 12 Apr 2013 13:14:33 +0200 |
parents | dfce13a5f5f9 |
children | 7a2a98655236 bc57f2660b0f |
comparison
equal
deleted
inserted
replaced
0:dfce13a5f5f9 | 1:8f6c4dab5d17 |
---|---|
1 package de.mpiwg.indexmeta; | 1 package de.mpiwg.indexmeta; |
2 // import stuff | 2 // import stuff |
3 import java.io.File; | 3 import java.io.File; |
4 import java.io.IOException; | 4 import java.io.IOException; |
5 import java.util.ArrayList; | |
5 import java.util.Arrays; | 6 import java.util.Arrays; |
6 import java.util.List; | 7 import java.util.List; |
7 | 8 |
8 import javax.print.attribute.standard.MediaSize.Other; | |
9 import javax.xml.parsers.DocumentBuilder; | 9 import javax.xml.parsers.DocumentBuilder; |
10 import javax.xml.parsers.DocumentBuilderFactory; | 10 import javax.xml.parsers.DocumentBuilderFactory; |
11 import javax.xml.parsers.ParserConfigurationException; | 11 import javax.xml.parsers.ParserConfigurationException; |
12 import javax.xml.transform.Transformer; | 12 import javax.xml.transform.Transformer; |
13 import javax.xml.transform.TransformerException; | 13 import javax.xml.transform.TransformerException; |
15 import javax.xml.transform.dom.DOMSource; | 15 import javax.xml.transform.dom.DOMSource; |
16 import javax.xml.transform.stream.StreamResult; | 16 import javax.xml.transform.stream.StreamResult; |
17 | 17 |
18 import org.w3c.dom.Attr; | 18 import org.w3c.dom.Attr; |
19 import org.w3c.dom.Document; | 19 import org.w3c.dom.Document; |
20 import org.w3c.dom.Element; | |
20 import org.w3c.dom.NamedNodeMap; | 21 import org.w3c.dom.NamedNodeMap; |
21 import org.w3c.dom.Node; | 22 import org.w3c.dom.Node; |
22 import org.w3c.dom.NodeList; | 23 import org.w3c.dom.NodeList; |
23 import org.xml.sax.SAXException; | 24 import org.xml.sax.SAXException; |
24 | 25 |
25 public class AnnotateIndexMeta { | 26 public class AnnotateIndexMeta { |
26 | 27 |
27 public static void main(String argv[]) { | 28 public static void main(String argv[]) { |
28 System.out.println("in main"); | 29 System.out.println("in main"); |
29 | 30 |
30 // Methodenaufruf | 31 // Methodenaufruf |
31 String filepath = "/Users/kthoden/eclipse/workspace/dm2eStuff/data/index.meta"; | 32 String filepath = "/Users/kthoden/eclipse/workspace/IndexMetaContextualization/data/index.meta/index.meta_FQPFR8XP"; |
32 // this is a list of all the elements we want to contextualize | 33 // this is a list of all the elements we want to contextualize |
33 List<String> contextualizableList = Arrays.asList(new String[]{"author","editor","publisher","city","holding-library","keywords"}); | 34 List<String> contextualizableList = Arrays.asList(new String[]{"author","editor","publisher","city","holding-library","keywords"}); |
34 xmlParse(filepath,contextualizableList); | 35 try {xmlParse(filepath,contextualizableList); |
36 } | |
37 catch (Exception e) { | |
38 e.printStackTrace(); | |
39 }; | |
35 System.out.println("Done"); | 40 System.out.println("Done"); |
36 } | 41 } |
37 | 42 |
38 /** | 43 /** |
39 * Parses the XML file given as first argument and writes attributes in elements that are to be contextualized. | 44 * Parses the XML file given as first argument and writes attributes in elements that are to be contextualized. These serve simply as markers for the next tools that are going to fetch these elements to put them in the database. |
40 * @param filepath path to the file. It will also be used as the basis for the output file (this adds "-annot"). | 45 * @param filepath path to the file. It will also be used as the basis for the output file (this adds "-annot"). |
41 * @param contextualizableList contains the elements that shall be given a context identifier which is later used to grab the contents and put them into the database to have it contextualized. | 46 * @param contextualizableList contains the elements that shall be given a context identifier which is later used to grab the contents and put them into the database to have it contextualized. |
42 * | 47 * @throws Exception if the source index.meta file already contains markers for contextualization (context-id attributes). |
43 */ | 48 * |
44 public static void xmlParse(String filepath, List<String> contextualizableList) { | 49 */ |
50 public static void xmlParse(String filepath, List<String> contextualizableList) throws Exception { | |
45 try { | 51 try { |
46 // this is how the outputfile will be called | 52 // this is how the outputfile will be called |
47 String outfilepath = filepath + "-annot"; | 53 String outfilepath = filepath + "-annot"; |
48 // open the file and parse it | 54 // open the file and parse it |
49 DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); | 55 DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); |
54 Integer count = 0; | 60 Integer count = 0; |
55 for(String contextElement : contextualizableList){ | 61 for(String contextElement : contextualizableList){ |
56 NodeList nodeList = doc.getElementsByTagName(contextElement); | 62 NodeList nodeList = doc.getElementsByTagName(contextElement); |
57 for(int i=0; i < nodeList.getLength(); i++){ | 63 for(int i=0; i < nodeList.getLength(); i++){ |
58 Node iter2 = nodeList.item(i); | 64 Node iter2 = nodeList.item(i); |
65 String currentNodeValue = iter2.getTextContent(); | |
59 NamedNodeMap attr = iter2.getAttributes(); | 66 NamedNodeMap attr = iter2.getAttributes(); |
60 // make a new attribute | 67 // make a new attribute |
61 // DONE would be good if it left existing outputs alone | |
62 if (attr.getNamedItem("context-id") == null){ | 68 if (attr.getNamedItem("context-id") == null){ |
63 Attr attribute = doc.createAttribute ("context-id"); | 69 Attr attribute = doc.createAttribute ("context-id"); |
64 attribute.setValue (count.toString()); | 70 attribute.setValue (count.toString()); |
65 attr.setNamedItem (attribute); | 71 attr.setNamedItem (attribute); |
66 } | 72 } |
67 else { | 73 else {throw new Exception("There is already at least one context-id attribute in the source index.meta. This is not allowed. "); |
68 System.out.println("schon da: " + attr.getNamedItem("context-id")); | 74 } |
69 } | |
70 // Just for comfort. Print it out. | 75 // Just for comfort. Print it out. |
71 System.out.println(contextElement); | 76 System.out.println(contextElement); |
77 if (contextElement == "author") { | |
78 findContext(doc, currentNodeValue); | |
79 } | |
72 count++; | 80 count++; |
73 } | 81 } |
74 // get the element by name (so they should be unique?) | 82 // get the element by name (so they should be unique?) |
75 //Node iter2 = doc.getElementsByTagName(contextElement).item(0); | 83 //Node iter2 = doc.getElementsByTagName(contextElement).item(0); |
76 } | 84 } |
78 TransformerFactory transformerFactory = TransformerFactory.newInstance(); | 86 TransformerFactory transformerFactory = TransformerFactory.newInstance(); |
79 Transformer transformer = transformerFactory.newTransformer(); | 87 Transformer transformer = transformerFactory.newTransformer(); |
80 DOMSource source = new DOMSource(doc); | 88 DOMSource source = new DOMSource(doc); |
81 StreamResult result = new StreamResult(new File(outfilepath)); | 89 StreamResult result = new StreamResult(new File(outfilepath)); |
82 transformer.transform(source, result); | 90 transformer.transform(source, result); |
83 | 91 /* |
92 * should these really go inside this method? | |
93 */ | |
84 } catch (ParserConfigurationException pce) { | 94 } catch (ParserConfigurationException pce) { |
85 pce.printStackTrace(); | 95 pce.printStackTrace(); |
86 } catch (TransformerException tfe) { | 96 } catch (TransformerException tfe) { |
87 tfe.printStackTrace(); | 97 tfe.printStackTrace(); |
88 } catch (IOException ioe) { | 98 } catch (IOException ioe) { |
89 ioe.printStackTrace(); | 99 ioe.printStackTrace(); |
90 } catch (SAXException sae) { | 100 } catch (SAXException sae) { |
91 sae.printStackTrace(); | 101 sae.printStackTrace(); |
92 } | 102 } |
93 } | 103 } |
104 | |
105 /** | |
106 * This method checks the current index.meta file for already existing contextualizations. For example, newer generations of index.meta (as of 2013) already have GND information for persons associated with the object in question. | |
107 * However, for the sake of backwards compatibility, the nearly-deprecated "author" element still exists as well (as does "city", which is meant to be replaced by "place", which in turn might be superseded by "geo-location"). | |
108 * Technically, we parse the XML and construct a list containing a person's name, role and remote ID. | |
109 * @param doc | |
110 * @param currentNodeValue | |
111 */ | |
112 public static void findContext(Document doc, String currentNodeValue) { | |
113 // first, define some variables | |
114 String nameOfPerson = ""; | |
115 String roleOfPerson = ""; | |
116 String idOfPerson= ""; | |
117 | |
118 // next, we try to see if there is already a contextualized author | |
119 // let us concentrate on that element | |
120 // then we look for tags called person | |
121 // if there are any, we take the liberty of querying them. This is a Nodelist | |
122 NodeList personList = doc.getElementsByTagName("person"); | |
123 // Debug information for the human eye. | |
124 // System.out.println("The current node value is "+ currentNodeValue + ". Let's do something useful in the findContext method."); | |
125 // System.out.println("This node list has " + personList.getLength() + " members: " + personList.item(0) + "and" + personList.item(1)); | |
126 // Integer personCounter = 1; | |
127 // look at every element in the list of persons | |
128 for(int countPerson=0; countPerson < personList.getLength(); countPerson++){ | |
129 // just some control | |
130 // System.out.println("This is person number " + personCounter); | |
131 // drill down a bit further. We now can access the person list | |
132 Node iterPerson = personList.item(countPerson); | |
133 | |
134 // this here produces the role of a person | |
135 if (iterPerson instanceof Element) { | |
136 Element e = (Element)iterPerson; | |
137 roleOfPerson = e.getAttribute("role"); | |
138 // System.out.println("Rolle: " + roleOfPerson); | |
139 | |
140 // there will also be a name attached. It is so written in the index.meta specification. Can we trust that? | |
141 NodeList l0 = e.getElementsByTagName("name"); | |
142 if(l0.getLength() > 0){ | |
143 Node name = l0.item(0); | |
144 nameOfPerson = name.getFirstChild().getNodeValue(); | |
145 // System.out.println("Name: " + nameOfPerson); | |
146 } | |
147 | |
148 // and the identifier, this should be there, too. Maybe it's not... | |
149 NodeList l1 = e.getElementsByTagName("identifier"); | |
150 if(l1.getLength() > 0){ | |
151 Node name = l1.item(0); | |
152 idOfPerson = name.getFirstChild().getNodeValue(); | |
153 //System.out.println("Identifier: " + idOfPerson); | |
154 } | |
155 // System.out.println("Current Node Value " + currentNodeValue + ". Name of Person " + nameOfPerson); | |
156 // now the final check and why we did all this: | |
157 if (nameOfPerson.equals(currentNodeValue)) { | |
158 ArrayList<String> authorInfo = new ArrayList<String>(); | |
159 authorInfo.add(nameOfPerson); | |
160 authorInfo.add(roleOfPerson); | |
161 authorInfo.add(idOfPerson); | |
162 | |
163 System.out.println("This person has already been contextualized: " + nameOfPerson + " hat die Rolle " + roleOfPerson + " und den Identifier " + idOfPerson + "."); | |
164 }} | |
165 // personCounter ++; | |
166 } | |
167 System.out.println("printing author"); | |
168 } | |
94 } | 169 } |
170 |