view src/plugin/parse-mpiwg/src/java/de/mpiwg/itgroup/mpiwg/parse/MPIWGParser.java @ 0:3b37d71af924 default tip

initial
author dwinter
date Tue, 26 Feb 2013 15:50:30 +0100
parents
children
line wrap: on
line source

package de.mpiwg.itgroup.mpiwg.parse;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;

import org.apache.nutch.protocol.Content;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import java.io.Reader;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

/**
 * Nutch {@link HtmlParseFilter} that extracts values from the raw page content
 * with configurable regular expressions and adds them to the parse metadata.
 *
 * <p>
 * The filters are read from the XML resource named by the configuration
 * property {@code urlmeta.mpiwg-parser}. The expected layout, as read by
 * {@link #setConf(Configuration)}, is a {@code <mpiwg-parser>} root element
 * with one {@code <filter>} child per filter. Each filter may contain the
 * elements {@code name}, {@code searchPattern}, {@code line-identification},
 * {@code multiline} and any number of {@code group-name} elements.
 * </p>
 *
 * <p>
 * For every content line that contains the filter's line identification, the
 * line (plus up to {@code multiline} following lines) is matched against the
 * search pattern; capture group <i>n</i> is stored under the <i>n</i>-th
 * configured group name.
 * </p>
 */
public class MPIWGParser implements HtmlParseFilter {

	public static final Logger LOG = LoggerFactory.getLogger(MPIWGParser.class);

	public static final String TAG_KEY = "uploader";

	// NOTE(review): looks like a debugging aid - every extracted tag is also
	// appended to /tmp/out. Stays null when the file cannot be opened.
	private FileWriter fw;

	/** Configured filters, keyed by filter name. */
	private Map<String,MPIWGFilter> filters = new HashMap<String,MPIWGFilter>();
	private Configuration conf;

	/**
	 * Creates the parser and tries to open the tag dump file {@code /tmp/out}.
	 * A failure to open the file is logged and otherwise ignored.
	 */
	public MPIWGParser(){
		try {
			fw = new FileWriter("/tmp/out");
		} catch (IOException e) {
			// The dump file is optional; filter() checks for null before writing.
			LOG.warn("could not open /tmp/out, tag dump disabled", e);
		}
	}

	/**
	 * Stores the configuration and (re)loads all filter definitions from the
	 * resource named by the property {@code urlmeta.mpiwg-parser}.
	 *
	 * <p>
	 * Errors while reading the resource are logged; filters that were read
	 * before the error occurred stay registered.
	 * </p>
	 *
	 * @param conf
	 *            the configuration to use; {@code null} only stores the
	 *            reference and loads nothing
	 */
	public void setConf(Configuration conf) {
		this.conf = conf;
		if (conf == null)
			return;

		String confName = getConf().get("urlmeta.mpiwg-parser");
		Reader reader = null;

		// borrowed heavily from code in Configuration.java
		try {
			reader = getConf().getConfResourceAsReader(confName);
			if (reader == null) {
				LOG.error("mpiwg-parser configuration resource not found: {}",
						confName);
				return;
			}

			Document doc = DocumentBuilderFactory.newInstance()
					.newDocumentBuilder().parse(new InputSource(reader));

			Element root = doc.getDocumentElement();
			if (!"mpiwg-parser".equals(root.getTagName())) {
				LOG.error("bad conf file: top-level element not <mpiwg-parser>");
			}

			// find all filters
			NodeList filterNodes = root.getChildNodes();
			for (int i = 0; i < filterNodes.getLength(); i++) {
				Node filterNode = filterNodes.item(i);
				if (!(filterNode instanceof Element))
					continue;
				Element filterElement = (Element) filterNode;
				if (!"filter".equals(filterElement.getTagName())) {
					// Only a warning: the element is still read as a filter.
					LOG.warn("bad conf file: element not <filter>");
				}

				MPIWGFilter currentFilter = readFilter(filterElement);
				this.filters.put(currentFilter.name, currentFilter);
			}
		} catch (Exception e) {
			// Boundary catch: a broken configuration must not take the
			// whole plugin down.
			LOG.error("could not read mpiwg-parser configuration", e);
		} finally {
			closeQuietly(reader);
		}

	}

	/**
	 * Returns the configuration last passed to {@link #setConf(Configuration)}.
	 *
	 * @return the stored configuration, possibly {@code null}
	 */
	public Configuration getConf() {
		return this.conf;
	}

	/**
	 * Applies every configured filter to the raw content and adds the
	 * extracted values to the parse metadata of the content's URL.
	 *
	 * <p>
	 * Filters without a search pattern or without a line identification, and
	 * filters whose search pattern is not a valid regular expression, are
	 * skipped; the remaining filters are still applied.
	 * </p>
	 *
	 * @param content
	 *            the fetched content; its bytes are decoded as UTF-8
	 * @param parseResult
	 *            the parse result whose metadata is extended
	 * @param metaTags
	 *            not used by this filter
	 * @param doc
	 *            not used by this filter
	 * @return the given {@code parseResult}
	 */
	public ParseResult filter(Content content, ParseResult parseResult,
			HTMLMetaTags metaTags, DocumentFragment doc) {

		// NOTE(review): this re-reads the filter configuration for every
		// document - confirm whether that is intended before removing it.
		if (conf != null)
			this.setConf(conf);

		for (MPIWGFilter currentFilter : filters.values()) {
			if (currentFilter.searchPattern == null
					|| currentFilter.lineIdentification == null) {
				// Incomplete filter: skip it, but keep applying the others.
				continue;
			}

			Pattern pattern;
			try {
				pattern = Pattern.compile(currentFilter.searchPattern,
						Pattern.DOTALL);
			} catch (PatternSyntaxException e) {
				LOG.warn("invalid searchPattern in filter " + currentFilter.name, e);
				continue;
			}

			Map<String, String> tags;
			try {
				tags = extractTags(currentFilter, pattern, content.getContent());
			} catch (UnsupportedEncodingException e1) {
				LOG.debug("unsupported encoding!");
				return parseResult;
			}

			Parse parse = parseResult.get(content.getUrl());
			if (parse == null) {
				// Nothing to attach the metadata to.
				LOG.warn("no parse found for {}", content.getUrl());
				return parseResult;
			}
			Metadata metadata = parse.getData().getParseMeta();
			for (Map.Entry<String, String> tag : tags.entrySet()) {
				if (fw != null) {
					try {
						fw.write(String.format("%s - %s", tag.getKey(),
								tag.getValue()));
						fw.flush();
					} catch (IOException e) {
						LOG.warn("could not write tag dump", e);
					}
				}
				metadata.add(tag.getKey(), tag.getValue());
			}

		}
		return parseResult;
	}

	/**
	 * Builds one filter definition from the child elements of a filter
	 * element. Unknown child elements and elements without content are
	 * ignored. {@code mutiline} defaults to -1 (no additional lines).
	 */
	private MPIWGFilter readFilter(Element filter) {
		MPIWGFilter currentFilter = new MPIWGFilter();
		currentFilter.mutiline = -1;

		// now walk through the fields of the filter
		NodeList fields = filter.getChildNodes();
		for (int j = 0; j < fields.getLength(); j++) {
			Node fieldNode = fields.item(j);
			if (!(fieldNode instanceof Element))
				continue;
			Element field = (Element) fieldNode;
			if (!field.hasChildNodes())
				continue;

			String tagName = field.getTagName();
			if ("name".equals(tagName)) {
				currentFilter.name = firstChildText(field);
			} else if ("searchPattern".equals(tagName)) {
				currentFilter.searchPattern = firstChildText(field);
			} else if ("line-identification".equals(tagName)) {
				currentFilter.lineIdentification = firstChildText(field);
			} else if ("multiline".equals(tagName)) {
				// trim so that surrounding whitespace in the XML is accepted
				currentFilter.mutiline = Integer.valueOf(firstChildText(field)
						.trim());
			} else if ("group-name".equals(tagName)) {
				currentFilter.groupNames.add(firstChildText(field));
			}
		}
		return currentFilter;
	}

	/** Returns the data of the element's first child, which must be a text node. */
	private static String firstChildText(Element field) {
		return ((Text) field.getFirstChild()).getData();
	}

	/**
	 * Scans the content line by line and collects the capture groups of the
	 * first match in every identified line (or line block) under the
	 * configured group names. An {@link IOException} while reading is logged
	 * and the tags collected so far are returned.
	 */
	private Map<String, String> extractTags(MPIWGFilter currentFilter,
			Pattern pattern, byte[] raw) throws UnsupportedEncodingException {
		Map<String, String> tags = new HashMap<String, String>();
		BufferedReader reader = new BufferedReader(new InputStreamReader(
				new ByteArrayInputStream(raw), "utf-8"));
		try {
			String line;
			while ((line = reader.readLine()) != null) {
				if (!line.contains(currentFilter.lineIdentification))
					continue;

				// Multiline matching: first collect the following lines.
				// The counter is checked BEFORE reading, so no line is
				// consumed (and lost) once the limit is reached.
				StringBuilder block = new StringBuilder(line);
				int count = 0;
				String next;
				while (count < currentFilter.mutiline
						&& (next = reader.readLine()) != null) {
					count++;
					block.append(next);
				}

				Matcher m = pattern.matcher(block);
				if (!m.find())
					continue;

				// map group names to groups; never ask for more groups than
				// the pattern defines
				int groups = Math.min(currentFilter.groupNames.size(),
						m.groupCount());
				for (int i = 0; i < groups; i++) {
					String value = m.group(i + 1);
					if (value != null) // group did not take part in the match
						tags.put(currentFilter.groupNames.get(i), value.trim());
				}
			}
		} catch (IOException e) {
			LOG.warn("IOException encountered parsing file:", e);
		} finally {
			closeQuietly(reader);
		}
		return tags;
	}

	/** Closes the reader if it is not null; a failure is only logged. */
	private static void closeQuietly(Reader reader) {
		if (reader == null)
			return;
		try {
			reader.close();
		} catch (IOException e) {
			LOG.debug("could not close reader", e);
		}
	}
}