Mercurial > hg > anteater
view src/de/mpiwg/anteater/persons/impl/StanfordNLPPersonFinder.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.anteater.persons.impl;

import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringEscapeUtils;

import de.mpiwg.anteater.logging.IAnteaterLogger;
import de.mpiwg.anteater.persons.IPersonFinder;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;

/**
 * {@link IPersonFinder} implementation that runs a Stanford NLP CRF classifier
 * over a text and reports the tagged spans as a small XML document.
 *
 * <p>
 * The result has the shape
 * {@code <stanford_ner><type start="N" length="M">name</type>...</stanford_ner>}
 * where {@code type} is the lower-cased tag name emitted by the classifier.
 * Despite the class name, every tag consisting of upper-case letters is
 * reported, not only person tags.
 *
 * <p>
 * Instances lazily load the classifier on first use; the lazy initialisation is
 * not synchronized.
 */
public class StanfordNLPPersonFinder implements IPersonFinder {

    /** Component name passed to the logger for messages of this class. */
    public final static String COMPONENT_NAME = StanfordNLPPersonFinder.class
            .getSimpleName();

    /** Location of the serialized CRF model handed to {@link CRFClassifier}. */
    private static final String SERIALIZED_CLASSIFIER = "classifiers/english.all.3class.distsim.crf.ser.gz";

    /**
     * Matches one inline-XML span {@code <TAG>content</TAG>}: group 2 is the
     * tag name, group 3 the (non-empty, single-line) content. Compiled once
     * because {@link Pattern} instances are immutable and reusable.
     */
    private static final Pattern ENTITY_TAG_PATTERN = Pattern
            .compile("(\\<([A-Z]+)\\>)(.+?)\\</[A-Z]+\\>");

    private final IAnteaterLogger logger;
    private AbstractSequenceClassifier<CoreLabel> classifier;
    private boolean initiated = false;

    /**
     * Creates a finder that reports its activity to the given logger. The
     * classifier itself is not loaded until {@link #init()} runs.
     *
     * @param logger
     *            logger that receives this component's messages
     */
    public StanfordNLPPersonFinder(IAnteaterLogger logger) {
        this.logger = logger;
    }

    /**
     * Loads the CRF classifier on the first call; subsequent calls are no-ops.
     * The {@code initiated} flag is only set after the classifier has been
     * assigned, so a failed load is retried on the next call.
     */
    @SuppressWarnings("unchecked")
    @Override
    public void init() {
        if (initiated) {
            return;
        }
        classifier = CRFClassifier
                .getClassifierNoExceptions(SERIALIZED_CLASSIFIER);
        initiated = true;
    }

    /**
     * Classifies {@code text} and returns the recognised spans as XML.
     *
     * @param text
     *            the text to analyse
     * @return a {@code <stanford_ner>} element containing one child element
     *         per tagged span; the element is empty when nothing was tagged
     */
    @Override
    public String findPersons(String text) {
        logger.logMessage(COMPONENT_NAME, "Running Stanford NLP NER Parser...");
        init();

        String taggedText = classifier.classifyWithInlineXML(text);

        // return complete xml
        return "<stanford_ner>" + toOffsetAnnotations(taggedText)
                + "</stanford_ner>";
    }

    /**
     * Converts inline-XML classifier output into a sequence of elements that
     * carry the position of each span instead of wrapping it in the text.
     *
     * <p>
     * {@code start} is the number of characters that precede the span once all
     * matched tags are removed from {@code taggedText}; presumably this equals
     * the offset in the original input, provided the classifier reproduces the
     * input text verbatim between tags (not verifiable from this class).
     *
     * @param taggedText
     *            classifier output containing {@code <TAG>...</TAG>} spans
     * @return the concatenated annotation elements, possibly empty
     */
    private static String toOffsetAnnotations(String taggedText) {
        Matcher matcher = ENTITY_TAG_PATTERN.matcher(taggedText);
        StringBuilder annotations = new StringBuilder();

        // Length of the tag-free text seen so far; tracked as a counter so the
        // plain text never has to be materialised just to measure it.
        int plainOffset = 0;
        int previousMatchEnd = 0;

        while (matcher.find()) {
            // Untagged text between the previous span and this one.
            plainOffset += matcher.start() - previousMatchEnd;

            // Locale.ROOT keeps tag names stable regardless of the JVM's
            // default locale (e.g. "LOCATION" must not become "locat\u0131on").
            String type = matcher.group(2).toLowerCase(Locale.ROOT);
            String name = matcher.group(3);

            annotations.append('<').append(type)
                    .append(" start=\"").append(plainOffset)
                    .append("\" length=\"").append(name.length())
                    .append("\">")
                    .append(StringEscapeUtils.escapeXml(name))
                    .append("</").append(type).append('>');

            plainOffset += name.length();
            previousMatchEnd = matcher.end();
        }
        return annotations.toString();
    }
}