annotate src/plugin/parse-mpiwg/src/java/de/mpiwg/itgroup/mpiwg/parse/MPIWGParser.java @ 0:3b37d71af924 default tip

iniitial
author dwinter
date Tue, 26 Feb 2013 15:50:30 +0100
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3b37d71af924 iniitial
dwinter
parents:
diff changeset
1 package de.mpiwg.itgroup.mpiwg.parse;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
2
3b37d71af924 iniitial
dwinter
parents:
diff changeset
3 import java.io.BufferedReader;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
4 import java.io.ByteArrayInputStream;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
5 import java.io.FileWriter;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
6 import java.io.IOException;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
7 import java.io.InputStreamReader;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
8 import java.io.UnsupportedEncodingException;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
9 import java.util.ArrayList;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
10 import java.util.HashMap;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
11 import java.util.List;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
12 import java.util.Map;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
13 import java.util.regex.Matcher;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
14 import java.util.regex.Pattern;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
15 import java.util.regex.PatternSyntaxException;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
16
3b37d71af924 iniitial
dwinter
parents:
diff changeset
17 import org.apache.nutch.parse.HTMLMetaTags;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
18 import org.apache.nutch.parse.HtmlParseFilter;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
19 import org.apache.nutch.parse.Parse;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
20 import org.apache.nutch.parse.ParseResult;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
21
3b37d71af924 iniitial
dwinter
parents:
diff changeset
22 import org.apache.commons.logging.Log;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
23 import org.apache.commons.logging.LogFactory;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
24 import org.apache.hadoop.conf.Configuration;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
25 import org.apache.nutch.metadata.Metadata;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
26
3b37d71af924 iniitial
dwinter
parents:
diff changeset
27 import org.apache.nutch.protocol.Content;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
28
3b37d71af924 iniitial
dwinter
parents:
diff changeset
29 import org.slf4j.Logger;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
30 import org.slf4j.LoggerFactory;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
31 import org.w3c.dom.Document;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
32 import org.w3c.dom.DocumentFragment;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
33 import org.w3c.dom.Element;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
34 import org.w3c.dom.Node;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
35 import org.w3c.dom.NodeList;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
36 import org.w3c.dom.Text;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
37 import org.xml.sax.InputSource;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
38 import org.xml.sax.SAXException;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
39
3b37d71af924 iniitial
dwinter
parents:
diff changeset
40 import java.io.Reader;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
41
3b37d71af924 iniitial
dwinter
parents:
diff changeset
42 import javax.xml.parsers.DocumentBuilderFactory;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
43 import javax.xml.parsers.ParserConfigurationException;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
44
3b37d71af924 iniitial
dwinter
parents:
diff changeset
45 public class MPIWGParser implements HtmlParseFilter {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
46
3b37d71af924 iniitial
dwinter
parents:
diff changeset
47 public static final Logger LOG = LoggerFactory.getLogger(MPIWGParser.class);
3b37d71af924 iniitial
dwinter
parents:
diff changeset
48
3b37d71af924 iniitial
dwinter
parents:
diff changeset
49 public static final String TAG_KEY = "uploader";
3b37d71af924 iniitial
dwinter
parents:
diff changeset
50
3b37d71af924 iniitial
dwinter
parents:
diff changeset
51 private FileWriter fw;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
52
3b37d71af924 iniitial
dwinter
parents:
diff changeset
53 public MPIWGParser(){
3b37d71af924 iniitial
dwinter
parents:
diff changeset
54 try {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
55 fw = new FileWriter("/tmp/out");
3b37d71af924 iniitial
dwinter
parents:
diff changeset
56 } catch (IOException e) {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
57 // TODO Auto-generated catch block
3b37d71af924 iniitial
dwinter
parents:
diff changeset
58 e.printStackTrace();
3b37d71af924 iniitial
dwinter
parents:
diff changeset
59 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
60 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
61 // private static final Pattern selectInfoPattern =
3b37d71af924 iniitial
dwinter
parents:
diff changeset
62 // Pattern.compile("<span class=\"mpiwg-first_name\">(.*?)</span><span class=\"mpiwg-last_name\">(.*?)</span>");
3b37d71af924 iniitial
dwinter
parents:
diff changeset
63 // private Pattern selectInfoPattern = null;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
64 // private String[] groupNames = null;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
65 // private String lineIdentification=null;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
66
3b37d71af924 iniitial
dwinter
parents:
diff changeset
67 private Map<String,MPIWGFilter> filters = new HashMap<String,MPIWGFilter>();
3b37d71af924 iniitial
dwinter
parents:
diff changeset
68 private Configuration conf;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
69
3b37d71af924 iniitial
dwinter
parents:
diff changeset
70 public void setConf(Configuration conf) {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
71 this.conf = conf;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
72 if (conf == null)
3b37d71af924 iniitial
dwinter
parents:
diff changeset
73 return;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
74 // the default constructor was called
3b37d71af924 iniitial
dwinter
parents:
diff changeset
75
3b37d71af924 iniitial
dwinter
parents:
diff changeset
76 String confName = getConf().get("urlmeta.mpiwg-parser");
3b37d71af924 iniitial
dwinter
parents:
diff changeset
77 Reader reader = getConf().getConfResourceAsReader(confName);
3b37d71af924 iniitial
dwinter
parents:
diff changeset
78
3b37d71af924 iniitial
dwinter
parents:
diff changeset
79 // borrowed heavily from code in Configuration.java
3b37d71af924 iniitial
dwinter
parents:
diff changeset
80 Document doc;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
81 try {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
82 doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
3b37d71af924 iniitial
dwinter
parents:
diff changeset
83 .parse(new InputSource(reader));
3b37d71af924 iniitial
dwinter
parents:
diff changeset
84
3b37d71af924 iniitial
dwinter
parents:
diff changeset
85 Element root = doc.getDocumentElement();
3b37d71af924 iniitial
dwinter
parents:
diff changeset
86 if ((!"mpiwg-parser".equals(root.getTagName()))
3b37d71af924 iniitial
dwinter
parents:
diff changeset
87 && (LOG.isErrorEnabled())) {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
88 LOG.error("bad conf file: top-level element not <mpiwg-parser>");
3b37d71af924 iniitial
dwinter
parents:
diff changeset
89 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
90
3b37d71af924 iniitial
dwinter
parents:
diff changeset
91 // finde all filter
3b37d71af924 iniitial
dwinter
parents:
diff changeset
92 NodeList filters = root.getChildNodes();
3b37d71af924 iniitial
dwinter
parents:
diff changeset
93 for (int i = 0; i < filters.getLength(); i++) {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
94 Node filterNode = filters.item(i);
3b37d71af924 iniitial
dwinter
parents:
diff changeset
95 if (!(filterNode instanceof Element))
3b37d71af924 iniitial
dwinter
parents:
diff changeset
96 continue;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
97 Element filter = (Element) filterNode;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
98 if ((!"filter".equals(filter.getTagName()))
3b37d71af924 iniitial
dwinter
parents:
diff changeset
99 && (LOG.isWarnEnabled())) {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
100 LOG.warn("bad conf file: element not <filter>");
3b37d71af924 iniitial
dwinter
parents:
diff changeset
101 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
102
3b37d71af924 iniitial
dwinter
parents:
diff changeset
103 MPIWGFilter currentFilter = new MPIWGFilter();
3b37d71af924 iniitial
dwinter
parents:
diff changeset
104 // gehe jetzt durch die filter
3b37d71af924 iniitial
dwinter
parents:
diff changeset
105 NodeList fields = filter.getChildNodes();
3b37d71af924 iniitial
dwinter
parents:
diff changeset
106 currentFilter.mutiline=-1;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
107 for (int j = 0; j < fields.getLength(); j++) {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
108 Node fieldNode = fields.item(j);
3b37d71af924 iniitial
dwinter
parents:
diff changeset
109 if (!(fieldNode instanceof Element))
3b37d71af924 iniitial
dwinter
parents:
diff changeset
110 continue;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
111 Element field = (Element) fieldNode;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
112 if ("name".equals(field.getTagName())
3b37d71af924 iniitial
dwinter
parents:
diff changeset
113 && field.hasChildNodes())
3b37d71af924 iniitial
dwinter
parents:
diff changeset
114 currentFilter.name = ((Text) field.getFirstChild())
3b37d71af924 iniitial
dwinter
parents:
diff changeset
115 .getData();
3b37d71af924 iniitial
dwinter
parents:
diff changeset
116
3b37d71af924 iniitial
dwinter
parents:
diff changeset
117 if ("searchPattern".equals(field.getTagName())
3b37d71af924 iniitial
dwinter
parents:
diff changeset
118 && field.hasChildNodes())
3b37d71af924 iniitial
dwinter
parents:
diff changeset
119 currentFilter.searchPattern = ((Text) field
3b37d71af924 iniitial
dwinter
parents:
diff changeset
120 .getFirstChild()).getData();
3b37d71af924 iniitial
dwinter
parents:
diff changeset
121
3b37d71af924 iniitial
dwinter
parents:
diff changeset
122 if ("line-identification".equals(field.getTagName())
3b37d71af924 iniitial
dwinter
parents:
diff changeset
123 && field.hasChildNodes())
3b37d71af924 iniitial
dwinter
parents:
diff changeset
124 currentFilter.lineIdentification = ((Text) field
3b37d71af924 iniitial
dwinter
parents:
diff changeset
125 .getFirstChild()).getData();
3b37d71af924 iniitial
dwinter
parents:
diff changeset
126
3b37d71af924 iniitial
dwinter
parents:
diff changeset
127 if ("multiline".equals(field.getTagName())
3b37d71af924 iniitial
dwinter
parents:
diff changeset
128 && field.hasChildNodes())
3b37d71af924 iniitial
dwinter
parents:
diff changeset
129 currentFilter.mutiline = Integer.valueOf(((Text) field
3b37d71af924 iniitial
dwinter
parents:
diff changeset
130 .getFirstChild()).getData());
3b37d71af924 iniitial
dwinter
parents:
diff changeset
131
3b37d71af924 iniitial
dwinter
parents:
diff changeset
132 if ("group-name".equals(field.getTagName())
3b37d71af924 iniitial
dwinter
parents:
diff changeset
133 && field.hasChildNodes())
3b37d71af924 iniitial
dwinter
parents:
diff changeset
134 currentFilter.groupNames.add(((Text) field
3b37d71af924 iniitial
dwinter
parents:
diff changeset
135 .getFirstChild()).getData());
3b37d71af924 iniitial
dwinter
parents:
diff changeset
136
3b37d71af924 iniitial
dwinter
parents:
diff changeset
137 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
138 this.filters.put(currentFilter.name,currentFilter);
3b37d71af924 iniitial
dwinter
parents:
diff changeset
139 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
140 } catch (Exception e) {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
141 // TODO Auto-generated catch block
3b37d71af924 iniitial
dwinter
parents:
diff changeset
142 e.printStackTrace();
3b37d71af924 iniitial
dwinter
parents:
diff changeset
143 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
144
3b37d71af924 iniitial
dwinter
parents:
diff changeset
145 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
146
3b37d71af924 iniitial
dwinter
parents:
diff changeset
147 public Configuration getConf() {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
148 return this.conf;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
149 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
150
3b37d71af924 iniitial
dwinter
parents:
diff changeset
151 public ParseResult filter(Content content, ParseResult parseResult,
3b37d71af924 iniitial
dwinter
parents:
diff changeset
152 HTMLMetaTags metaTags, DocumentFragment doc) {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
153
3b37d71af924 iniitial
dwinter
parents:
diff changeset
154 if (conf != null)
3b37d71af924 iniitial
dwinter
parents:
diff changeset
155 this.setConf(conf);
3b37d71af924 iniitial
dwinter
parents:
diff changeset
156
3b37d71af924 iniitial
dwinter
parents:
diff changeset
157 for (String currentFilterName : filters.keySet()) {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
158 MPIWGFilter currentFilter = filters.get(currentFilterName);
3b37d71af924 iniitial
dwinter
parents:
diff changeset
159 if (currentFilter.searchPattern == null) // kein pattern gesetzt
3b37d71af924 iniitial
dwinter
parents:
diff changeset
160 return parseResult;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
161
3b37d71af924 iniitial
dwinter
parents:
diff changeset
162 Pattern pattern = Pattern.compile(currentFilter.searchPattern,Pattern.DOTALL);
3b37d71af924 iniitial
dwinter
parents:
diff changeset
163 BufferedReader reader;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
164 try {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
165 reader = new BufferedReader(new InputStreamReader(
3b37d71af924 iniitial
dwinter
parents:
diff changeset
166 new ByteArrayInputStream(content.getContent()),"utf-8"));
3b37d71af924 iniitial
dwinter
parents:
diff changeset
167 } catch (UnsupportedEncodingException e1) {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
168 LOG.debug("unsupported encoding!");
3b37d71af924 iniitial
dwinter
parents:
diff changeset
169 return parseResult;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
170
3b37d71af924 iniitial
dwinter
parents:
diff changeset
171 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
172
3b37d71af924 iniitial
dwinter
parents:
diff changeset
173 String line;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
174
3b37d71af924 iniitial
dwinter
parents:
diff changeset
175 Map<String, String> tags = new HashMap<String, String>();
3b37d71af924 iniitial
dwinter
parents:
diff changeset
176 try {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
177 while ((line = reader.readLine()) != null) {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
178
3b37d71af924 iniitial
dwinter
parents:
diff changeset
179 if (line.contains(currentFilter.lineIdentification)) {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
180
3b37d71af924 iniitial
dwinter
parents:
diff changeset
181 //Multiline matching first collet lines
3b37d71af924 iniitial
dwinter
parents:
diff changeset
182
3b37d71af924 iniitial
dwinter
parents:
diff changeset
183 int count = 0;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
184
3b37d71af924 iniitial
dwinter
parents:
diff changeset
185 String line2;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
186 // gehe durch multiline if multiline >0
3b37d71af924 iniitial
dwinter
parents:
diff changeset
187 while ( ((line2 = reader.readLine()) != null) & (count<currentFilter.mutiline)) {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
188 count++;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
189 line+=line2;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
190 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
191
3b37d71af924 iniitial
dwinter
parents:
diff changeset
192 Matcher m = pattern.matcher(line);
3b37d71af924 iniitial
dwinter
parents:
diff changeset
193 if (m.find()) {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
194 for (int i = 0; i < currentFilter.groupNames.size(); i++)
3b37d71af924 iniitial
dwinter
parents:
diff changeset
195 tags.put(currentFilter.groupNames.get(i), m
3b37d71af924 iniitial
dwinter
parents:
diff changeset
196 .group(i + 1).trim()); // ordne
3b37d71af924 iniitial
dwinter
parents:
diff changeset
197 // groupnamen
3b37d71af924 iniitial
dwinter
parents:
diff changeset
198 // gruppen zu
3b37d71af924 iniitial
dwinter
parents:
diff changeset
199 // LOG.debug(Adding tag: m.group(1));
3b37d71af924 iniitial
dwinter
parents:
diff changeset
200 // tags.put("first_name", m.group(1));
3b37d71af924 iniitial
dwinter
parents:
diff changeset
201 // tags.put("last_name", m.group(2));
3b37d71af924 iniitial
dwinter
parents:
diff changeset
202 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
203 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
204 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
205 reader.close();
3b37d71af924 iniitial
dwinter
parents:
diff changeset
206 } catch (IOException e) {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
207 LOG.warn("IOException encountered parsing file:", e);
3b37d71af924 iniitial
dwinter
parents:
diff changeset
208 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
209 Parse parse = parseResult.get(content.getUrl());
3b37d71af924 iniitial
dwinter
parents:
diff changeset
210 Metadata metadata = parse.getData().getParseMeta();
3b37d71af924 iniitial
dwinter
parents:
diff changeset
211 for (String tag : tags.keySet()) {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
212 try {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
213 fw.write(String.format("%s - %s", tag, tags.get(tag)));
3b37d71af924 iniitial
dwinter
parents:
diff changeset
214 fw.flush();
3b37d71af924 iniitial
dwinter
parents:
diff changeset
215 } catch (IOException e) {
3b37d71af924 iniitial
dwinter
parents:
diff changeset
216 // TODO Auto-generated catch block
3b37d71af924 iniitial
dwinter
parents:
diff changeset
217 e.printStackTrace();
3b37d71af924 iniitial
dwinter
parents:
diff changeset
218 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
219 metadata.add(tag, tags.get(tag));
3b37d71af924 iniitial
dwinter
parents:
diff changeset
220 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
221
3b37d71af924 iniitial
dwinter
parents:
diff changeset
222 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
223 return parseResult;
3b37d71af924 iniitial
dwinter
parents:
diff changeset
224 }
3b37d71af924 iniitial
dwinter
parents:
diff changeset
225 }