diff src/plugin/parse-mpiwg/src/java/de/mpiwg/itgroup/mpiwg/parse/MPIWGParser.java @ 0:3b37d71af924 default tip

initial
author dwinter
date Tue, 26 Feb 2013 15:50:30 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugin/parse-mpiwg/src/java/de/mpiwg/itgroup/mpiwg/parse/MPIWGParser.java	Tue Feb 26 15:50:30 2013 +0100
@@ -0,0 +1,225 @@
+package de.mpiwg.itgroup.mpiwg.parse;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+
+import org.apache.nutch.protocol.Content;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Text;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import java.io.Reader;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+public class MPIWGParser implements HtmlParseFilter {
+
+	public static final Logger LOG = LoggerFactory.getLogger(MPIWGParser.class);
+
+	public static final String TAG_KEY = "uploader";
+
+	private FileWriter fw;
+
+	public MPIWGParser(){
+		 try {
+			fw = new FileWriter("/tmp/out");
+		} catch (IOException e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+		}
+	}
+	// private static final Pattern selectInfoPattern =
+	// Pattern.compile("<span class=\"mpiwg-first_name\">(.*?)</span><span class=\"mpiwg-last_name\">(.*?)</span>");
+	// private Pattern selectInfoPattern = null;
+	// private String[] groupNames = null;
+	// private String lineIdentification=null;
+
+	private Map<String,MPIWGFilter> filters = new HashMap<String,MPIWGFilter>();
+	private Configuration conf;
+
+	public void setConf(Configuration conf) {
+		this.conf = conf;
+		if (conf == null)
+			return;
+		// the default constructor was called
+
+		String confName = getConf().get("urlmeta.mpiwg-parser");
+		Reader reader = getConf().getConfResourceAsReader(confName);
+
+		// borrowed heavily from code in Configuration.java
+		Document doc;
+		try {
+			doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
+					.parse(new InputSource(reader));
+
+			Element root = doc.getDocumentElement();
+			if ((!"mpiwg-parser".equals(root.getTagName()))
+					&& (LOG.isErrorEnabled())) {
+				LOG.error("bad conf file: top-level element not <mpiwg-parser>");
+			}
+
+			// finde all filter
+			NodeList filters = root.getChildNodes();
+			for (int i = 0; i < filters.getLength(); i++) {
+				Node filterNode = filters.item(i);
+				if (!(filterNode instanceof Element))
+					continue;
+				Element filter = (Element) filterNode;
+				if ((!"filter".equals(filter.getTagName()))
+						&& (LOG.isWarnEnabled())) {
+					LOG.warn("bad conf file: element not <filter>");
+				}
+
+				MPIWGFilter currentFilter = new MPIWGFilter();
+				// gehe jetzt durch die filter
+				NodeList fields = filter.getChildNodes();
+				currentFilter.mutiline=-1;
+				for (int j = 0; j < fields.getLength(); j++) {
+					Node fieldNode = fields.item(j);
+					if (!(fieldNode instanceof Element))
+						continue;
+					Element field = (Element) fieldNode;
+					if ("name".equals(field.getTagName())
+							&& field.hasChildNodes())
+						currentFilter.name = ((Text) field.getFirstChild())
+								.getData();
+
+					if ("searchPattern".equals(field.getTagName())
+							&& field.hasChildNodes())
+						currentFilter.searchPattern = ((Text) field
+								.getFirstChild()).getData();
+
+					if ("line-identification".equals(field.getTagName())
+							&& field.hasChildNodes())
+						currentFilter.lineIdentification = ((Text) field
+								.getFirstChild()).getData();
+
+					if ("multiline".equals(field.getTagName())
+							&& field.hasChildNodes())
+						currentFilter.mutiline = Integer.valueOf(((Text) field
+								.getFirstChild()).getData());
+
+					if ("group-name".equals(field.getTagName())
+							&& field.hasChildNodes())
+						currentFilter.groupNames.add(((Text) field
+								.getFirstChild()).getData());
+
+				}
+				this.filters.put(currentFilter.name,currentFilter);
+			}
+		} catch (Exception e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+		}
+
+	}
+
+	public Configuration getConf() {
+		return this.conf;
+	}
+
+	public ParseResult filter(Content content, ParseResult parseResult,
+			HTMLMetaTags metaTags, DocumentFragment doc) {
+
+		if (conf != null)
+			this.setConf(conf);
+
+		for (String currentFilterName : filters.keySet()) {
+			MPIWGFilter currentFilter = filters.get(currentFilterName);
+			if (currentFilter.searchPattern == null) // kein pattern gesetzt
+				return parseResult;
+
+			Pattern pattern = Pattern.compile(currentFilter.searchPattern,Pattern.DOTALL);
+			BufferedReader reader;
+			try {
+				reader = new BufferedReader(new InputStreamReader(
+						new ByteArrayInputStream(content.getContent()),"utf-8"));
+			} catch (UnsupportedEncodingException e1) {
+				LOG.debug("unsupported encoding!");
+				return parseResult;
+				
+			}
+
+			String line;
+
+			Map<String, String> tags = new HashMap<String, String>();
+			try {
+				while ((line = reader.readLine()) != null) {
+
+					if (line.contains(currentFilter.lineIdentification)) {
+						
+						//Multiline matching first collet lines
+						
+						int count = 0;
+						
+						String line2;
+						// gehe durch multiline if multiline >0
+						while ( ((line2 = reader.readLine()) != null) & (count<currentFilter.mutiline)) {
+							count++;
+							line+=line2;
+						}
+						
+						Matcher m = pattern.matcher(line);
+						if (m.find()) {
+							for (int i = 0; i < currentFilter.groupNames.size(); i++)
+								tags.put(currentFilter.groupNames.get(i), m
+										.group(i + 1).trim()); // ordne
+																// groupnamen
+																// gruppen zu
+							// LOG.debug(Adding tag: m.group(1));
+							// tags.put("first_name", m.group(1));
+							// tags.put("last_name", m.group(2));
+						}
+					}
+				}
+				reader.close();
+			} catch (IOException e) {
+				LOG.warn("IOException encountered parsing file:", e);
+			}
+			Parse parse = parseResult.get(content.getUrl());
+			Metadata metadata = parse.getData().getParseMeta();
+			for (String tag : tags.keySet()) {
+				try {
+					fw.write(String.format("%s - %s", tag, tags.get(tag)));
+					fw.flush();
+				} catch (IOException e) {
+					// TODO Auto-generated catch block
+					e.printStackTrace();
+				}
+				metadata.add(tag, tags.get(tag));
+			}
+
+		}
+		return parseResult;
+	}
+}
\ No newline at end of file