0
|
1 package de.mpiwg.itgroup.mpiwg.parse;
|
|
2
|
|
3 import java.io.BufferedReader;
|
|
4 import java.io.ByteArrayInputStream;
|
|
5 import java.io.FileWriter;
|
|
6 import java.io.IOException;
|
|
7 import java.io.InputStreamReader;
|
|
8 import java.io.UnsupportedEncodingException;
|
|
9 import java.util.ArrayList;
|
|
10 import java.util.HashMap;
|
|
11 import java.util.List;
|
|
12 import java.util.Map;
|
|
13 import java.util.regex.Matcher;
|
|
14 import java.util.regex.Pattern;
|
|
15 import java.util.regex.PatternSyntaxException;
|
|
16
|
|
17 import org.apache.nutch.parse.HTMLMetaTags;
|
|
18 import org.apache.nutch.parse.HtmlParseFilter;
|
|
19 import org.apache.nutch.parse.Parse;
|
|
20 import org.apache.nutch.parse.ParseResult;
|
|
21
|
|
22 import org.apache.commons.logging.Log;
|
|
23 import org.apache.commons.logging.LogFactory;
|
|
24 import org.apache.hadoop.conf.Configuration;
|
|
25 import org.apache.nutch.metadata.Metadata;
|
|
26
|
|
27 import org.apache.nutch.protocol.Content;
|
|
28
|
|
29 import org.slf4j.Logger;
|
|
30 import org.slf4j.LoggerFactory;
|
|
31 import org.w3c.dom.Document;
|
|
32 import org.w3c.dom.DocumentFragment;
|
|
33 import org.w3c.dom.Element;
|
|
34 import org.w3c.dom.Node;
|
|
35 import org.w3c.dom.NodeList;
|
|
36 import org.w3c.dom.Text;
|
|
37 import org.xml.sax.InputSource;
|
|
38 import org.xml.sax.SAXException;
|
|
39
|
|
40 import java.io.Reader;
|
|
41
|
|
42 import javax.xml.parsers.DocumentBuilderFactory;
|
|
43 import javax.xml.parsers.ParserConfigurationException;
|
|
44
|
|
45 public class MPIWGParser implements HtmlParseFilter {
|
|
46
|
|
47 public static final Logger LOG = LoggerFactory.getLogger(MPIWGParser.class);
|
|
48
|
|
49 public static final String TAG_KEY = "uploader";
|
|
50
|
|
51 private FileWriter fw;
|
|
52
|
|
53 public MPIWGParser(){
|
|
54 try {
|
|
55 fw = new FileWriter("/tmp/out");
|
|
56 } catch (IOException e) {
|
|
57 // TODO Auto-generated catch block
|
|
58 e.printStackTrace();
|
|
59 }
|
|
60 }
|
|
61 // private static final Pattern selectInfoPattern =
|
|
62 // Pattern.compile("<span class=\"mpiwg-first_name\">(.*?)</span><span class=\"mpiwg-last_name\">(.*?)</span>");
|
|
63 // private Pattern selectInfoPattern = null;
|
|
64 // private String[] groupNames = null;
|
|
65 // private String lineIdentification=null;
|
|
66
|
|
67 private Map<String,MPIWGFilter> filters = new HashMap<String,MPIWGFilter>();
|
|
68 private Configuration conf;
|
|
69
|
|
70 public void setConf(Configuration conf) {
|
|
71 this.conf = conf;
|
|
72 if (conf == null)
|
|
73 return;
|
|
74 // the default constructor was called
|
|
75
|
|
76 String confName = getConf().get("urlmeta.mpiwg-parser");
|
|
77 Reader reader = getConf().getConfResourceAsReader(confName);
|
|
78
|
|
79 // borrowed heavily from code in Configuration.java
|
|
80 Document doc;
|
|
81 try {
|
|
82 doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
|
|
83 .parse(new InputSource(reader));
|
|
84
|
|
85 Element root = doc.getDocumentElement();
|
|
86 if ((!"mpiwg-parser".equals(root.getTagName()))
|
|
87 && (LOG.isErrorEnabled())) {
|
|
88 LOG.error("bad conf file: top-level element not <mpiwg-parser>");
|
|
89 }
|
|
90
|
|
91 // finde all filter
|
|
92 NodeList filters = root.getChildNodes();
|
|
93 for (int i = 0; i < filters.getLength(); i++) {
|
|
94 Node filterNode = filters.item(i);
|
|
95 if (!(filterNode instanceof Element))
|
|
96 continue;
|
|
97 Element filter = (Element) filterNode;
|
|
98 if ((!"filter".equals(filter.getTagName()))
|
|
99 && (LOG.isWarnEnabled())) {
|
|
100 LOG.warn("bad conf file: element not <filter>");
|
|
101 }
|
|
102
|
|
103 MPIWGFilter currentFilter = new MPIWGFilter();
|
|
104 // gehe jetzt durch die filter
|
|
105 NodeList fields = filter.getChildNodes();
|
|
106 currentFilter.mutiline=-1;
|
|
107 for (int j = 0; j < fields.getLength(); j++) {
|
|
108 Node fieldNode = fields.item(j);
|
|
109 if (!(fieldNode instanceof Element))
|
|
110 continue;
|
|
111 Element field = (Element) fieldNode;
|
|
112 if ("name".equals(field.getTagName())
|
|
113 && field.hasChildNodes())
|
|
114 currentFilter.name = ((Text) field.getFirstChild())
|
|
115 .getData();
|
|
116
|
|
117 if ("searchPattern".equals(field.getTagName())
|
|
118 && field.hasChildNodes())
|
|
119 currentFilter.searchPattern = ((Text) field
|
|
120 .getFirstChild()).getData();
|
|
121
|
|
122 if ("line-identification".equals(field.getTagName())
|
|
123 && field.hasChildNodes())
|
|
124 currentFilter.lineIdentification = ((Text) field
|
|
125 .getFirstChild()).getData();
|
|
126
|
|
127 if ("multiline".equals(field.getTagName())
|
|
128 && field.hasChildNodes())
|
|
129 currentFilter.mutiline = Integer.valueOf(((Text) field
|
|
130 .getFirstChild()).getData());
|
|
131
|
|
132 if ("group-name".equals(field.getTagName())
|
|
133 && field.hasChildNodes())
|
|
134 currentFilter.groupNames.add(((Text) field
|
|
135 .getFirstChild()).getData());
|
|
136
|
|
137 }
|
|
138 this.filters.put(currentFilter.name,currentFilter);
|
|
139 }
|
|
140 } catch (Exception e) {
|
|
141 // TODO Auto-generated catch block
|
|
142 e.printStackTrace();
|
|
143 }
|
|
144
|
|
145 }
|
|
146
|
|
147 public Configuration getConf() {
|
|
148 return this.conf;
|
|
149 }
|
|
150
|
|
151 public ParseResult filter(Content content, ParseResult parseResult,
|
|
152 HTMLMetaTags metaTags, DocumentFragment doc) {
|
|
153
|
|
154 if (conf != null)
|
|
155 this.setConf(conf);
|
|
156
|
|
157 for (String currentFilterName : filters.keySet()) {
|
|
158 MPIWGFilter currentFilter = filters.get(currentFilterName);
|
|
159 if (currentFilter.searchPattern == null) // kein pattern gesetzt
|
|
160 return parseResult;
|
|
161
|
|
162 Pattern pattern = Pattern.compile(currentFilter.searchPattern,Pattern.DOTALL);
|
|
163 BufferedReader reader;
|
|
164 try {
|
|
165 reader = new BufferedReader(new InputStreamReader(
|
|
166 new ByteArrayInputStream(content.getContent()),"utf-8"));
|
|
167 } catch (UnsupportedEncodingException e1) {
|
|
168 LOG.debug("unsupported encoding!");
|
|
169 return parseResult;
|
|
170
|
|
171 }
|
|
172
|
|
173 String line;
|
|
174
|
|
175 Map<String, String> tags = new HashMap<String, String>();
|
|
176 try {
|
|
177 while ((line = reader.readLine()) != null) {
|
|
178
|
|
179 if (line.contains(currentFilter.lineIdentification)) {
|
|
180
|
|
181 //Multiline matching first collet lines
|
|
182
|
|
183 int count = 0;
|
|
184
|
|
185 String line2;
|
|
186 // gehe durch multiline if multiline >0
|
|
187 while ( ((line2 = reader.readLine()) != null) & (count<currentFilter.mutiline)) {
|
|
188 count++;
|
|
189 line+=line2;
|
|
190 }
|
|
191
|
|
192 Matcher m = pattern.matcher(line);
|
|
193 if (m.find()) {
|
|
194 for (int i = 0; i < currentFilter.groupNames.size(); i++)
|
|
195 tags.put(currentFilter.groupNames.get(i), m
|
|
196 .group(i + 1).trim()); // ordne
|
|
197 // groupnamen
|
|
198 // gruppen zu
|
|
199 // LOG.debug(Adding tag: m.group(1));
|
|
200 // tags.put("first_name", m.group(1));
|
|
201 // tags.put("last_name", m.group(2));
|
|
202 }
|
|
203 }
|
|
204 }
|
|
205 reader.close();
|
|
206 } catch (IOException e) {
|
|
207 LOG.warn("IOException encountered parsing file:", e);
|
|
208 }
|
|
209 Parse parse = parseResult.get(content.getUrl());
|
|
210 Metadata metadata = parse.getData().getParseMeta();
|
|
211 for (String tag : tags.keySet()) {
|
|
212 try {
|
|
213 fw.write(String.format("%s - %s", tag, tags.get(tag)));
|
|
214 fw.flush();
|
|
215 } catch (IOException e) {
|
|
216 // TODO Auto-generated catch block
|
|
217 e.printStackTrace();
|
|
218 }
|
|
219 metadata.add(tag, tags.get(tag));
|
|
220 }
|
|
221
|
|
222 }
|
|
223 return parseResult;
|
|
224 }
|
|
225 } |