Mercurial > hg > nutch-mpiwg-plugins
view src/plugin/parse-mpiwg/src/java/de/mpiwg/itgroup/mpiwg/parse/MPIWGDomParser.java @ 0:3b37d71af924 default tip
initial
author | dwinter |
---|---|
date | Tue, 26 Feb 2013 15:50:30 +0100 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.itgroup.mpiwg.parse; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import org.apache.nutch.parse.HTMLMetaTags; import org.apache.nutch.parse.HtmlParseFilter; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseResult; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.protocol.Content; import org.apache.taglibs.standard.tag.common.xml.XPathUtil; import org.apache.xerces.dom.DocumentFragmentImpl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.DocumentFragment; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.w3c.dom.Text; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import java.io.Reader; import javax.servlet.jsp.JspTagException; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; public class MPIWGDomParser implements HtmlParseFilter { public static final Logger LOG = LoggerFactory.getLogger(MPIWGDomParser.class); //public static final String TAG_KEY = "uploader"; private FileWriter fw; public MPIWGDomParser(){ try { fw = new FileWriter("/tmp/out3"); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } // private static final 
Pattern selectInfoPattern = // Pattern.compile("<span class=\"mpiwg-first_name\">(.*?)</span><span class=\"mpiwg-last_name\">(.*?)</span>"); // private Pattern selectInfoPattern = null; // private String[] groupNames = null; // private String lineIdentification=null; private Map<String,MPIWGDomFilter> filters = new HashMap<String,MPIWGDomFilter>(); private Configuration conf; public void setConf(Configuration conf) { this.conf = conf; if (conf == null) return; // the default constructor was called String confName = getConf().get("urlmeta.mpiwg-dom-parser"); Reader reader = getConf().getConfResourceAsReader(confName); // borrowed heavily from code in Configuration.java Document doc; try { doc = DocumentBuilderFactory.newInstance().newDocumentBuilder() .parse(new InputSource(reader)); Element root = doc.getDocumentElement(); if ((!"mpiwg-dom-parser".equals(root.getTagName())) && (LOG.isErrorEnabled())) { LOG.error("bad conf file: top-level element not <mpiwg-parser>"); } // finde all filter NodeList filters = root.getChildNodes(); for (int i = 0; i < filters.getLength(); i++) { Node filterNode = filters.item(i); if (!(filterNode instanceof Element)) continue; Element filter = (Element) filterNode; if ((!"filter".equals(filter.getTagName())) && (LOG.isWarnEnabled())) { LOG.warn("bad conf file: element not <filter>"); } MPIWGDomFilter currentFilter = new MPIWGDomFilter(); // gehe jetzt durch die filter NodeList fields = filter.getChildNodes(); for (int j = 0; j < fields.getLength(); j++) { Node fieldNode = fields.item(j); if (!(fieldNode instanceof Element)) continue; Element field = (Element) fieldNode; if ("name".equals(field.getTagName()) && field.hasChildNodes()) currentFilter.name = ((Text) field.getFirstChild()) .getData(); if ("tagname".equals(field.getTagName()) && field.hasChildNodes()) currentFilter.tagname = ((Text) field .getFirstChild()).getData(); if ("tagclass".equals(field.getTagName()) && field.hasChildNodes()) currentFilter.tagclass = ((Text) field 
.getFirstChild()).getData(); } this.filters.put(currentFilter.name,currentFilter); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } public Configuration getConf() { return this.conf; } public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { if (conf != null) this.setConf(conf); Parse parse2 = parseResult.get(content.getUrl()); Metadata metadata2 = parse2.getData().getParseMeta(); // normalisiere url (index.html und index_html) an Ende werden geloescht String urlNorm = content.getUrl().replace("index.html", "").replace("index_html",""); if (urlNorm.endsWith("/")){ urlNorm=urlNorm.substring(0,urlNorm.length()-1); } metadata2.add("urlNorm", urlNorm); // language if (urlNorm.contains("/en/")){ metadata2.add("lang", "en"); } else { metadata2.add("lang", "de"); } for (String currentFilterName : filters.keySet()) { MPIWGDomFilter currentFilter = filters.get(currentFilterName); if (currentFilter.tagname == null) // kein pattern gesetzt return parseResult; // ddoc.get DocumentFragmentImpl d = (DocumentFragmentImpl)doc; XPath xp; NodeList res; xp = XPathFactory.newInstance().newXPath(); XPathExpression g; try { g = xp.compile("//DIV[@class=\"main\"]"); res = (NodeList)g.evaluate(d,XPathConstants.NODESET); } catch (XPathExpressionException e) { // TODO Auto-generated catch block e.printStackTrace(); return parseResult; } String text = new String(""); for (int i=0;i<res.getLength();i++){ Node n =res.item(i); text = text+new String(n.getTextContent()); } Parse parse = parseResult.get(content.getUrl()); Metadata metadata = parse.getData().getParseMeta(); try { fw.write("-------------\n"); fw.write("URL:"+content.getUrl()); fw.write("-------------\n"); fw.write(text); fw.write("\n"); fw.flush(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } metadata.add(currentFilterName, text); try { fw.write("XXXXXXXXXXXXXX\n"); 
fw.write(metadata.get(currentFilterName)); fw.write("\n"); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } return parseResult; } }