Mercurial > hg > anteater
diff src/de/mpiwg/anteater/persons/impl/StanfordNLPPersonFinder.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line diff
package de.mpiwg.anteater.persons.impl;

import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringEscapeUtils;

import de.mpiwg.anteater.logging.IAnteaterLogger;
import de.mpiwg.anteater.persons.IPersonFinder;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;

/**
 * {@link IPersonFinder} implementation backed by a Stanford NER CRF
 * classifier.
 *
 * <p>
 * The classifier is loaded lazily on first use. The inline-XML output of the
 * classifier is converted into stand-off style markup: one element per tagged
 * entity, carrying the character offset and length of the entity relative to
 * the classifier output with all entity tags removed.
 * </p>
 */
public class StanfordNLPPersonFinder implements IPersonFinder {

	public final static String COMPONENT_NAME = StanfordNLPPersonFinder.class
			.getSimpleName();

	/** Location handed to {@code CRFClassifier.getClassifierNoExceptions}. */
	private static final String SERIALIZED_CLASSIFIER = "classifiers/english.all.3class.distsim.crf.ser.gz";

	/**
	 * Matches one inline entity such as {@code <PERSON>John Smith</PERSON>}.
	 * Group 1 is the upper-case tag name, group 2 the tagged text.
	 *
	 * <p>
	 * DOTALL lets the tagged text contain line breaks; without it such an
	 * entity is skipped and its raw tags get counted as text, which shifts
	 * the offsets of every later entity. The back-reference forces the
	 * closing tag to be the same as the opening one.
	 * </p>
	 */
	private static final Pattern ENTITY_PATTERN = Pattern.compile(
			"<([A-Z]+)>(.+?)</\\1>", Pattern.DOTALL);

	private final IAnteaterLogger logger;
	private AbstractSequenceClassifier<CoreLabel> classifier;
	private boolean initiated = false;

	/**
	 * @param logger
	 *            logger that receives a progress message on each
	 *            {@link #findPersons(String)} call
	 */
	public StanfordNLPPersonFinder(IAnteaterLogger logger) {
		this.logger = logger;
	}

	/**
	 * Loads the serialized classifier the first time it is called; later
	 * calls do nothing.
	 */
	@SuppressWarnings("unchecked")
	@Override
	public void init() {
		if (initiated) {
			return;
		}
		classifier = CRFClassifier
				.getClassifierNoExceptions(SERIALIZED_CLASSIFIER);
		initiated = true;
	}

	/**
	 * Runs the classifier over {@code text} and returns the recognised
	 * entities wrapped in a {@code <stanford_ner>} element.
	 *
	 * <p>
	 * Every entity becomes an element named after the lower-cased tag type
	 * with {@code start} and {@code length} attributes and the XML-escaped
	 * entity text as content.
	 * </p>
	 *
	 * @param text
	 *            the text to analyse; {@code null} or empty yields an empty
	 *            {@code <stanford_ner>} element
	 * @return the entity markup, never {@code null}
	 */
	@Override
	public String findPersons(String text) {
		logger.logMessage(COMPONENT_NAME, "Running Stanford NLP NER Parser...");
		init();

		if (text == null || text.isEmpty()) {
			// Nothing to classify; avoid handing null to the classifier.
			return "<stanford_ner></stanford_ner>";
		}

		String taggedText = classifier.classifyWithInlineXML(text);

		// return complete xml
		return "<stanford_ner>" + buildEntityMarkup(taggedText)
				+ "</stanford_ner>";
	}

	/**
	 * Converts inline entity tags into a sequence of offset-annotated
	 * elements.
	 *
	 * @param taggedText
	 *            classifier output containing inline {@code <TYPE>..</TYPE>}
	 *            tags
	 * @return the concatenated entity elements, empty if nothing matched
	 */
	private static String buildEntityMarkup(String taggedText) {
		StringBuilder markup = new StringBuilder();
		Matcher matcher = ENTITY_PATTERN.matcher(taggedText);

		// Number of characters seen so far once all entity tags are removed.
		int plainOffset = 0;
		// End of the previous match inside the tagged text.
		int previousEnd = 0;

		while (matcher.find()) {
			// Untagged text between the previous entity and this one.
			plainOffset += matcher.start() - previousEnd;
			previousEnd = matcher.end();

			// Locale.ROOT keeps the element name stable under any default
			// locale (e.g. no dotless i for "LOCATION" in Turkish).
			String type = matcher.group(1).toLowerCase(Locale.ROOT);
			String name = matcher.group(2);

			markup.append('<').append(type)
					.append(" start=\"").append(plainOffset)
					.append("\" length=\"").append(name.length())
					.append("\">")
					.append(StringEscapeUtils.escapeXml(name))
					.append("</").append(type).append('>');

			plainOffset += name.length();
		}
		return markup.toString();
	}

}