Mercurial > hg > nutch-mpiwg-plugins
view src/plugin/parse-mpiwg/src/java/de/mpiwg/itgroup/mpiwg/parse/MPIWGParser.java @ 0:3b37d71af924 default tip
iniitial
author | dwinter |
---|---|
date | Tue, 26 Feb 2013 15:50:30 +0100 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.itgroup.mpiwg.parse; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import org.apache.nutch.parse.HTMLMetaTags; import org.apache.nutch.parse.HtmlParseFilter; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseResult; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.protocol.Content; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.DocumentFragment; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.w3c.dom.Text; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import java.io.Reader; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; public class MPIWGParser implements HtmlParseFilter { public static final Logger LOG = LoggerFactory.getLogger(MPIWGParser.class); public static final String TAG_KEY = "uploader"; private FileWriter fw; public MPIWGParser(){ try { fw = new FileWriter("/tmp/out"); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } // private static final Pattern selectInfoPattern = // Pattern.compile("<span class=\"mpiwg-first_name\">(.*?)</span><span class=\"mpiwg-last_name\">(.*?)</span>"); // private Pattern selectInfoPattern = null; // private String[] groupNames = null; // private String lineIdentification=null; private Map<String,MPIWGFilter> filters = new HashMap<String,MPIWGFilter>(); private Configuration conf; public void setConf(Configuration conf) { this.conf = conf; if (conf == null) return; // the default constructor was called String confName = getConf().get("urlmeta.mpiwg-parser"); Reader reader = getConf().getConfResourceAsReader(confName); // borrowed heavily from code in Configuration.java Document doc; try { doc = DocumentBuilderFactory.newInstance().newDocumentBuilder() .parse(new InputSource(reader)); Element root = doc.getDocumentElement(); if ((!"mpiwg-parser".equals(root.getTagName())) && (LOG.isErrorEnabled())) { LOG.error("bad conf file: top-level element not <mpiwg-parser>"); } // finde all filter NodeList filters = root.getChildNodes(); for (int i = 0; i < filters.getLength(); i++) { Node filterNode = filters.item(i); if (!(filterNode instanceof Element)) continue; Element filter = (Element) filterNode; if ((!"filter".equals(filter.getTagName())) && (LOG.isWarnEnabled())) { LOG.warn("bad conf file: element not <filter>"); } MPIWGFilter currentFilter = new MPIWGFilter(); // gehe jetzt durch die filter NodeList fields = filter.getChildNodes(); currentFilter.mutiline=-1; for (int j = 0; j < fields.getLength(); j++) { Node fieldNode = fields.item(j); if (!(fieldNode instanceof Element)) continue; Element field = (Element) fieldNode; if ("name".equals(field.getTagName()) && field.hasChildNodes()) currentFilter.name = ((Text) field.getFirstChild()) .getData(); if ("searchPattern".equals(field.getTagName()) && field.hasChildNodes()) currentFilter.searchPattern = ((Text) field .getFirstChild()).getData(); if ("line-identification".equals(field.getTagName()) && field.hasChildNodes()) currentFilter.lineIdentification = ((Text) field .getFirstChild()).getData(); if ("multiline".equals(field.getTagName()) && field.hasChildNodes()) currentFilter.mutiline = Integer.valueOf(((Text) field .getFirstChild()).getData()); if ("group-name".equals(field.getTagName()) && field.hasChildNodes()) currentFilter.groupNames.add(((Text) field .getFirstChild()).getData()); } this.filters.put(currentFilter.name,currentFilter); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } public Configuration getConf() { return this.conf; } public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { if (conf != null) this.setConf(conf); for (String currentFilterName : filters.keySet()) { MPIWGFilter currentFilter = filters.get(currentFilterName); if (currentFilter.searchPattern == null) // kein pattern gesetzt return parseResult; Pattern pattern = Pattern.compile(currentFilter.searchPattern,Pattern.DOTALL); BufferedReader reader; try { reader = new BufferedReader(new InputStreamReader( new ByteArrayInputStream(content.getContent()),"utf-8")); } catch (UnsupportedEncodingException e1) { LOG.debug("unsupported encoding!"); return parseResult; } String line; Map<String, String> tags = new HashMap<String, String>(); try { while ((line = reader.readLine()) != null) { if (line.contains(currentFilter.lineIdentification)) { //Multiline matching first collet lines int count = 0; String line2; // gehe durch multiline if multiline >0 while ( ((line2 = reader.readLine()) != null) & (count<currentFilter.mutiline)) { count++; line+=line2; } Matcher m = pattern.matcher(line); if (m.find()) { for (int i = 0; i < currentFilter.groupNames.size(); i++) tags.put(currentFilter.groupNames.get(i), m .group(i + 1).trim()); // ordne // groupnamen // gruppen zu // LOG.debug(Adding tag: m.group(1)); // tags.put("first_name", m.group(1)); // tags.put("last_name", m.group(2)); } } } reader.close(); } catch (IOException e) { LOG.warn("IOException encountered parsing file:", e); } Parse parse = parseResult.get(content.getUrl()); Metadata metadata = parse.getData().getParseMeta(); for (String tag : tags.keySet()) { try { fw.write(String.format("%s - %s", tag, tags.get(tag))); fw.flush(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } metadata.add(tag, tags.get(tag)); } } return parseResult; } }