view src/plugin/parse-mpiwg/src/java/de/mpiwg/itgroup/mpiwg/parse/MPIWGDomParser.java @ 0:3b37d71af924 default tip

initial
author dwinter
date Tue, 26 Feb 2013 15:50:30 +0100
parents
children
line wrap: on
line source

package de.mpiwg.itgroup.mpiwg.parse;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;

import org.apache.nutch.protocol.Content;
import org.apache.taglibs.standard.tag.common.xml.XPathUtil;
import org.apache.xerces.dom.DocumentFragmentImpl;


import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import java.io.Reader;

import javax.servlet.jsp.JspTagException;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

/**
 * Nutch {@link HtmlParseFilter} that enriches the parse metadata of a page.
 *
 * <p>For every page it adds:
 * <ul>
 *   <li>{@code urlNorm} - the page URL with {@code index.html} /
 *       {@code index_html} removed and one trailing slash stripped,</li>
 *   <li>{@code lang} - {@code "en"} when the URL has an {@code /en} path
 *       segment, {@code "de"} otherwise,</li>
 *   <li>one entry per configured filter (keyed by the filter name) holding the
 *       text content of all nodes matching {@link #MAIN_CONTENT_XPATH}.</li>
 * </ul>
 *
 * <p>The filters are read in {@link #setConf(Configuration)} from the XML
 * resource named by the configuration property {@link #CONF_KEY}. The expected
 * layout is a {@code <mpiwg-dom-parser>} root with {@code <filter>} children,
 * each carrying {@code <name>}, {@code <tagname>} and {@code <tagclass>}
 * elements.
 *
 * <p>NOTE(review): {@code tagname}/{@code tagclass} are read from the
 * configuration, but the XPath is hard-coded; {@code tagname} is only used as
 * an "is this filter active" flag. Confirm whether the XPath should be built
 * from the filter definition.
 *
 * <p>NOTE(review): the instance writes a debug dump to
 * {@link #DEBUG_DUMP_PATH} that is never closed - presumably left over from
 * development; confirm whether it can be removed.
 */
public class MPIWGDomParser implements HtmlParseFilter {

	public static final Logger LOG = LoggerFactory.getLogger(MPIWGDomParser.class);

	/** Configuration property holding the name of the XML filter definition resource. */
	private static final String CONF_KEY = "urlmeta.mpiwg-dom-parser";

	/** Location of the debug dump written by {@link #filter}. */
	private static final String DEBUG_DUMP_PATH = "/tmp/out3";

	/** XPath selecting the nodes whose text is stored for every active filter. */
	private static final String MAIN_CONTENT_XPATH = "//DIV[@class=\"main\"]";

	/** Debug dump writer; stays {@code null} when the dump file cannot be opened. */
	private FileWriter fw;

	/** Configured filters, keyed by filter name. */
	private Map<String,MPIWGDomFilter> filters = new HashMap<String,MPIWGDomFilter>();
	private Configuration conf;

	/**
	 * Creates the parser and tries to open the debug dump file. A failure to
	 * open the file only disables the dump; it does not prevent filtering.
	 */
	public MPIWGDomParser(){
		try {
			fw = new FileWriter(DEBUG_DUMP_PATH);
		} catch (IOException e) {
			LOG.warn("cannot open debug dump file " + DEBUG_DUMP_PATH
					+ "; debug dump disabled", e);
		}
	}

	/**
	 * Stores the configuration and (re)loads the filter definitions from the
	 * XML resource named by {@link #CONF_KEY}.
	 *
	 * <p>Loading is best effort: a missing property, a missing resource or a
	 * malformed document is logged and leaves the previously loaded filters
	 * untouched. On success the previous filters are replaced.
	 *
	 * @param conf the configuration to use; {@code null} only stores the value
	 */
	public void setConf(Configuration conf) {
		this.conf = conf;
		if (conf == null)
			return;

		String confName = conf.get(CONF_KEY);
		if (confName == null) {
			LOG.warn("property " + CONF_KEY + " is not set; no filters loaded");
			return;
		}

		Reader reader = conf.getConfResourceAsReader(confName);
		if (reader == null) {
			LOG.warn("filter definition resource " + confName
					+ " not found; no filters loaded");
			return;
		}

		// borrowed heavily from code in Configuration.java
		try {
			Document doc = DocumentBuilderFactory.newInstance()
					.newDocumentBuilder().parse(new InputSource(reader));
			this.filters = readFilters(doc.getDocumentElement());
		} catch (Exception e) {
			// Configuration boundary: a broken definition file must not take
			// the whole plugin down, so log and keep the previous filters.
			LOG.error("cannot read filter definitions from " + confName, e);
		} finally {
			try {
				reader.close();
			} catch (IOException e) {
				LOG.debug("cannot close filter definition reader", e);
			}
		}
	}

	/**
	 * Reads all filter definitions below the given root element.
	 *
	 * <p>Unexpected element names are reported but still processed, matching
	 * the lenient behaviour of the code this was modelled on. Filters without
	 * a name are skipped because they cannot be used as a metadata key.
	 *
	 * @param root the document element of the filter definition file
	 * @return the filters found, keyed by name; never {@code null}
	 */
	private static Map<String,MPIWGDomFilter> readFilters(Element root) {
		if (!"mpiwg-dom-parser".equals(root.getTagName())) {
			LOG.error("bad conf file: top-level element not <mpiwg-dom-parser>");
		}

		Map<String,MPIWGDomFilter> result = new HashMap<String,MPIWGDomFilter>();
		NodeList filterNodes = root.getChildNodes();
		for (int i = 0; i < filterNodes.getLength(); i++) {
			Node filterNode = filterNodes.item(i);
			if (!(filterNode instanceof Element))
				continue;
			Element filter = (Element) filterNode;
			if (!"filter".equals(filter.getTagName())) {
				LOG.warn("bad conf file: element not <filter>");
			}

			MPIWGDomFilter currentFilter = readFilter(filter);
			if (currentFilter.name == null) {
				LOG.warn("bad conf file: <filter> without <name> ignored");
				continue;
			}
			result.put(currentFilter.name, currentFilter);
		}
		return result;
	}

	/**
	 * Builds one filter from the {@code <name>}, {@code <tagname>} and
	 * {@code <tagclass>} children of a filter element. Empty elements leave
	 * the corresponding field unset.
	 *
	 * @param filter the element describing the filter
	 * @return the populated filter
	 */
	private static MPIWGDomFilter readFilter(Element filter) {
		MPIWGDomFilter currentFilter = new MPIWGDomFilter();
		NodeList fields = filter.getChildNodes();
		for (int j = 0; j < fields.getLength(); j++) {
			Node fieldNode = fields.item(j);
			if (!(fieldNode instanceof Element))
				continue;
			Element field = (Element) fieldNode;
			if (!field.hasChildNodes())
				continue;

			// getTextContent() instead of casting the first child to Text:
			// a leading comment or other non-text node must not abort loading.
			String tag = field.getTagName();
			String value = field.getTextContent();
			if ("name".equals(tag)) {
				currentFilter.name = value;
			} else if ("tagname".equals(tag)) {
				currentFilter.tagname = value;
			} else if ("tagclass".equals(tag)) {
				currentFilter.tagclass = value;
			}
		}
		return currentFilter;
	}

	public Configuration getConf() {
		return this.conf;
	}

	/**
	 * Adds {@code urlNorm}, {@code lang} and one entry per active filter to the
	 * parse metadata of the page.
	 *
	 * <p>The filter definitions are those loaded by the last successful
	 * {@link #setConf(Configuration)} call; they are not re-read per page.
	 *
	 * @param content     the fetched content; only its URL is used
	 * @param parseResult the parse result whose metadata is extended
	 * @param metaTags    not used
	 * @param doc         the DOM of the page
	 * @return the same {@code parseResult} instance
	 */
	public ParseResult filter(Content content, ParseResult parseResult,
			HTMLMetaTags metaTags, DocumentFragment doc) {

		String url = content.getUrl();
		Parse parse = parseResult.get(url);
		if (parse == null) {
			// presumably happens when an earlier parser produced no entry for
			// this URL; nothing to attach metadata to
			LOG.warn("no parse available for " + url + "; metadata not added");
			return parseResult;
		}
		Metadata metadata = parse.getData().getParseMeta();

		// normalise the url: every occurrence of index.html / index_html is
		// removed, then one trailing slash is stripped
		String urlNorm = url.replace("index.html", "").replace("index_html", "");
		if (urlNorm.endsWith("/")) {
			urlNorm = urlNorm.substring(0, urlNorm.length() - 1);
		}
		metadata.add("urlNorm", urlNorm);

		// language: stripping the trailing slash turns ".../en/" into
		// ".../en", so the final path segment has to be checked as well
		boolean english = urlNorm.contains("/en/") || urlNorm.endsWith("/en");
		metadata.add("lang", english ? "en" : "de");

		// The extracted text is identical for all filters, so it is computed
		// at most once, and only if at least one filter is active.
		String text = null;
		for (Map.Entry<String,MPIWGDomFilter> entry : filters.entrySet()) {
			if (entry.getValue().tagname == null) // no pattern set: skip this filter only
				continue;

			if (text == null) {
				text = extractMainText(doc);
				if (text == null)
					return parseResult; // XPath failure, already logged
			}

			String filterName = entry.getKey();
			metadata.add(filterName, text);
			dumpDebug(url, text, metadata.get(filterName));
		}
		return parseResult;
	}

	/**
	 * Concatenates the text content of all nodes matching
	 * {@link #MAIN_CONTENT_XPATH}.
	 *
	 * @param doc the DOM of the page
	 * @return the concatenated text (empty when nothing matches), or
	 *         {@code null} when the XPath could not be evaluated
	 */
	private static String extractMainText(DocumentFragment doc) {
		NodeList matches;
		try {
			XPath xpath = XPathFactory.newInstance().newXPath();
			XPathExpression expression = xpath.compile(MAIN_CONTENT_XPATH);
			matches = (NodeList) expression.evaluate(doc, XPathConstants.NODESET);
		} catch (XPathExpressionException e) {
			LOG.error("cannot evaluate " + MAIN_CONTENT_XPATH, e);
			return null;
		}

		StringBuilder text = new StringBuilder();
		for (int i = 0; i < matches.getLength(); i++) {
			text.append(matches.item(i).getTextContent());
		}
		return text.toString();
	}

	/**
	 * Appends one record to the debug dump; does nothing when the dump file
	 * could not be opened. Write failures are logged and otherwise ignored.
	 *
	 * @param url    the page URL
	 * @param text   the text extracted from the page
	 * @param stored the value the metadata returns for the filter name
	 */
	private void dumpDebug(String url, String text, String stored) {
		if (fw == null)
			return;
		try {
			fw.write("-------------\n");
			fw.write("URL:"+url);
			fw.write("-------------\n");
			fw.write(text);
			fw.write("\n");
			fw.write("XXXXXXXXXXXXXX\n");
			fw.write(stored);
			fw.write("\n");
			fw.flush();
		} catch (IOException e) {
			LOG.warn("cannot write debug dump to " + DEBUG_DUMP_PATH, e);
		}
	}
}