/*
 * Repository-viewer header (not part of the Java source):
 * view src/de/mpiwg/anteater/persons/impl/StanfordNLPPersonFinder.java @ 0:036535fcd179
 * repository: anteater
 * author: jdamerow
 * date: Fri, 14 Sep 2012 10:30:43 +0200
 * parents: (none listed)
 * children: (none listed)
 */

package de.mpiwg.anteater.persons.impl;

import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringEscapeUtils;

import de.mpiwg.anteater.logging.IAnteaterLogger;
import de.mpiwg.anteater.persons.IPersonFinder;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;

/**
 * {@link IPersonFinder} implementation that runs a Stanford NLP CRF sequence
 * classifier over a text and reports the tagged spans as a small XML
 * document.
 * <p>
 * The classifier is loaded lazily on the first call to {@link #init()} or
 * {@link #findPersons(String)}. The lazy initialisation is not synchronised.
 */
public class StanfordNLPPersonFinder implements IPersonFinder {

	public final static String COMPONENT_NAME = StanfordNLPPersonFinder.class
			.getSimpleName();

	/** Path of the serialized classifier model passed to {@link CRFClassifier}. */
	private static final String SERIALIZED_CLASSIFIER = "classifiers/english.all.3class.distsim.crf.ser.gz";

	/**
	 * Matches one inline element of the classifier output: group 2 is the
	 * upper-case tag name, group 3 the tagged text. Compiled once instead of on
	 * every call. DOTALL lets the tagged text contain line terminators; without
	 * it such an element would not match and its raw tags would be counted into
	 * the offsets of all following entities.
	 * NOTE(review): assumes the classifier can keep a line break inside an
	 * element - confirm against the Stanford NER output format.
	 */
	private static final Pattern ENTITY_PATTERN = Pattern.compile(
			"(\\<([A-Z]+)\\>)(.+?)\\</[A-Z]+\\>", Pattern.DOTALL);

	private final IAnteaterLogger logger;
	private AbstractSequenceClassifier<CoreLabel> classifier;
	private boolean initiated = false;

	/**
	 * Creates a finder that reports its progress to the given logger. The
	 * classifier itself is not loaded here; see {@link #init()}.
	 *
	 * @param logger
	 *            logger used by {@link #findPersons(String)}
	 */
	public StanfordNLPPersonFinder(IAnteaterLogger logger) {
		this.logger = logger;
	}

	/**
	 * Loads the classifier model on the first call; later calls do nothing.
	 */
	@SuppressWarnings("unchecked")
	@Override
	public void init() {
		if (!initiated) {
			classifier = CRFClassifier
					.getClassifierNoExceptions(SERIALIZED_CLASSIFIER);
			initiated = true;
		}
	}

	/**
	 * Runs the classifier over {@code text} and returns every tagged span as
	 * a child of a {@code <stanford_ner>} element.
	 * <p>
	 * Each child is named after the classifier's tag in lower case and carries
	 * two attributes: {@code start}, the offset of the span in the classifier
	 * output with all matched tags removed, and {@code length}, the number of
	 * characters in the span. The element content is the XML-escaped span.
	 * NOTE(review): {@code start} equals the offset in {@code text} only if the
	 * classifier reproduces the input unchanged around its tags - confirm.
	 *
	 * @param text
	 *            the text to analyse; {@code null} or empty yields an empty
	 *            result element without invoking the classifier
	 * @return the result XML, never {@code null}
	 */
	@Override
	public String findPersons(String text) {
		logger.logMessage(COMPONENT_NAME, "Running Stanford NLP NER Parser...");
		if (text == null || text.isEmpty()) {
			// Nothing to classify; avoid handing null to the classifier.
			return "<stanford_ner></stanford_ner>";
		}
		init();

		String taggedText = classifier.classifyWithInlineXML(text);
		Matcher matcher = ENTITY_PATTERN.matcher(taggedText);

		StringBuilder entities = new StringBuilder();
		// Number of characters seen so far once the matched tags are stripped.
		int plainLength = 0;
		int lastMatchEnd = 0;
		while (matcher.find()) {
			// Untagged text between the previous element and this one.
			plainLength += matcher.start() - lastMatchEnd;
			lastMatchEnd = matcher.end();

			// Locale.ROOT keeps the element name stable under any default
			// locale (e.g. the Turkish dotless i).
			String type = matcher.group(2).toLowerCase(Locale.ROOT);
			String name = matcher.group(3);

			entities.append('<').append(type)
					.append(" start=\"").append(plainLength)
					.append("\" length=\"").append(name.length())
					.append("\">")
					.append(StringEscapeUtils.escapeXml(name))
					.append("</").append(type).append('>');

			plainLength += name.length();
		}

		// return complete xml
		return "<stanford_ner>" + entities + "</stanford_ner>";
	}

}