diff src/de/mpiwg/anteater/persons/impl/StanfordNLPPersonFinder.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/anteater/persons/impl/StanfordNLPPersonFinder.java	Fri Sep 14 10:30:43 2012 +0200
@@ -0,0 +1,74 @@
+package de.mpiwg.anteater.persons.impl;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.lang3.StringEscapeUtils;
+
+import de.mpiwg.anteater.logging.IAnteaterLogger;
+import de.mpiwg.anteater.persons.IPersonFinder;
+import edu.stanford.nlp.ie.AbstractSequenceClassifier;
+import edu.stanford.nlp.ie.crf.CRFClassifier;
+import edu.stanford.nlp.ling.CoreLabel;
+
+public class StanfordNLPPersonFinder implements IPersonFinder {
+
+	public final static String COMPONENT_NAME = StanfordNLPPersonFinder.class
+			.getSimpleName();
+
+	private IAnteaterLogger logger;
+	private AbstractSequenceClassifier<CoreLabel> classifier;
+	private boolean initiated = false;
+
+	public StanfordNLPPersonFinder(IAnteaterLogger logger) {
+		this.logger = logger;
+
+	}
+
+	@SuppressWarnings("unchecked")
+	@Override
+	public void init() {
+		if (!initiated) {
+			String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";
+			classifier = CRFClassifier
+					.getClassifierNoExceptions(serializedClassifier);
+			initiated = true;
+		}
+	}
+
+	@Override
+	public String findPersons(String text) {
+		logger.logMessage(COMPONENT_NAME, "Running Stanford NLP NER Parser...");
+		init();
+
+		String personXML = classifier.classifyWithInlineXML(text);
+
+		Pattern pattern = Pattern.compile("(\\<([A-Z]+)\\>)(.+?)\\</[A-Z]+\\>");
+		Matcher matcher = pattern.matcher(personXML);
+		//int textcounter = 0;
+		int lastTextend = 0;
+		StringBuffer sb = new StringBuffer();
+		StringBuffer textBuffer = new StringBuffer();
+		while (matcher.find()) {
+			int startIdx = matcher.start();
+			textBuffer.append(personXML.substring(lastTextend, startIdx));
+			String name = matcher.group(3);
+			String type = matcher.group(2);
+			//textcounter += startIdx - lastTextend;
+			//int idx = textcounter;
+			//textcounter += name.length();
+			lastTextend = matcher.end();
+
+			sb.append("<" + type.toLowerCase() + " start=\"" + textBuffer.toString().length()
+					+ "\" length=\"" + name.length() + "\">");
+			sb.append(StringEscapeUtils.escapeXml(name));
+			sb.append("</" + type.toLowerCase() + ">");
+			
+			textBuffer.append(name);
+		}
+
+		// return complete xml
+		return "<stanford_ner>" + sb.toString() + "</stanford_ner>";
+	}
+
+}