Mercurial > hg > nutch-mpiwg-plugins
diff src/plugin/parse-mpiwg/src/java/de/mpiwg/itgroup/mpiwg/parse/MPIWGParser.java @ 0:3b37d71af924 default tip
initial
author | dwinter |
---|---|
date | Tue, 26 Feb 2013 15:50:30 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/plugin/parse-mpiwg/src/java/de/mpiwg/itgroup/mpiwg/parse/MPIWGParser.java Tue Feb 26 15:50:30 2013 +0100 @@ -0,0 +1,225 @@ +package de.mpiwg.itgroup.mpiwg.parse; + +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.HtmlParseFilter; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseResult; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metadata.Metadata; + +import org.apache.nutch.protocol.Content; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.w3c.dom.Text; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +import java.io.Reader; + +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; + +public class MPIWGParser implements HtmlParseFilter { + + public static final Logger LOG = LoggerFactory.getLogger(MPIWGParser.class); + + public static final String TAG_KEY = "uploader"; + + private FileWriter fw; + + public MPIWGParser(){ + try { + fw = new FileWriter("/tmp/out"); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + // private static final Pattern selectInfoPattern = + // Pattern.compile("<span class=\"mpiwg-first_name\">(.*?)</span><span 
class=\"mpiwg-last_name\">(.*?)</span>"); + // private Pattern selectInfoPattern = null; + // private String[] groupNames = null; + // private String lineIdentification=null; + + private Map<String,MPIWGFilter> filters = new HashMap<String,MPIWGFilter>(); + private Configuration conf; + + public void setConf(Configuration conf) { + this.conf = conf; + if (conf == null) + return; + // the default constructor was called + + String confName = getConf().get("urlmeta.mpiwg-parser"); + Reader reader = getConf().getConfResourceAsReader(confName); + + // borrowed heavily from code in Configuration.java + Document doc; + try { + doc = DocumentBuilderFactory.newInstance().newDocumentBuilder() + .parse(new InputSource(reader)); + + Element root = doc.getDocumentElement(); + if ((!"mpiwg-parser".equals(root.getTagName())) + && (LOG.isErrorEnabled())) { + LOG.error("bad conf file: top-level element not <mpiwg-parser>"); + } + + // finde all filter + NodeList filters = root.getChildNodes(); + for (int i = 0; i < filters.getLength(); i++) { + Node filterNode = filters.item(i); + if (!(filterNode instanceof Element)) + continue; + Element filter = (Element) filterNode; + if ((!"filter".equals(filter.getTagName())) + && (LOG.isWarnEnabled())) { + LOG.warn("bad conf file: element not <filter>"); + } + + MPIWGFilter currentFilter = new MPIWGFilter(); + // gehe jetzt durch die filter + NodeList fields = filter.getChildNodes(); + currentFilter.mutiline=-1; + for (int j = 0; j < fields.getLength(); j++) { + Node fieldNode = fields.item(j); + if (!(fieldNode instanceof Element)) + continue; + Element field = (Element) fieldNode; + if ("name".equals(field.getTagName()) + && field.hasChildNodes()) + currentFilter.name = ((Text) field.getFirstChild()) + .getData(); + + if ("searchPattern".equals(field.getTagName()) + && field.hasChildNodes()) + currentFilter.searchPattern = ((Text) field + .getFirstChild()).getData(); + + if ("line-identification".equals(field.getTagName()) + && 
field.hasChildNodes()) + currentFilter.lineIdentification = ((Text) field + .getFirstChild()).getData(); + + if ("multiline".equals(field.getTagName()) + && field.hasChildNodes()) + currentFilter.mutiline = Integer.valueOf(((Text) field + .getFirstChild()).getData()); + + if ("group-name".equals(field.getTagName()) + && field.hasChildNodes()) + currentFilter.groupNames.add(((Text) field + .getFirstChild()).getData()); + + } + this.filters.put(currentFilter.name,currentFilter); + } + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + } + + public Configuration getConf() { + return this.conf; + } + + public ParseResult filter(Content content, ParseResult parseResult, + HTMLMetaTags metaTags, DocumentFragment doc) { + + if (conf != null) + this.setConf(conf); + + for (String currentFilterName : filters.keySet()) { + MPIWGFilter currentFilter = filters.get(currentFilterName); + if (currentFilter.searchPattern == null) // kein pattern gesetzt + return parseResult; + + Pattern pattern = Pattern.compile(currentFilter.searchPattern,Pattern.DOTALL); + BufferedReader reader; + try { + reader = new BufferedReader(new InputStreamReader( + new ByteArrayInputStream(content.getContent()),"utf-8")); + } catch (UnsupportedEncodingException e1) { + LOG.debug("unsupported encoding!"); + return parseResult; + + } + + String line; + + Map<String, String> tags = new HashMap<String, String>(); + try { + while ((line = reader.readLine()) != null) { + + if (line.contains(currentFilter.lineIdentification)) { + + //Multiline matching first collet lines + + int count = 0; + + String line2; + // gehe durch multiline if multiline >0 + while ( ((line2 = reader.readLine()) != null) & (count<currentFilter.mutiline)) { + count++; + line+=line2; + } + + Matcher m = pattern.matcher(line); + if (m.find()) { + for (int i = 0; i < currentFilter.groupNames.size(); i++) + tags.put(currentFilter.groupNames.get(i), m + .group(i + 1).trim()); // ordne + // groupnamen 
+ // gruppen zu + // LOG.debug(Adding tag: m.group(1)); + // tags.put("first_name", m.group(1)); + // tags.put("last_name", m.group(2)); + } + } + } + reader.close(); + } catch (IOException e) { + LOG.warn("IOException encountered parsing file:", e); + } + Parse parse = parseResult.get(content.getUrl()); + Metadata metadata = parse.getData().getParseMeta(); + for (String tag : tags.keySet()) { + try { + fw.write(String.format("%s - %s", tag, tags.get(tag))); + fw.flush(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + metadata.add(tag, tags.get(tag)); + } + + } + return parseResult; + } +} \ No newline at end of file