0
|
1 package de.mpiwg.anteater.persons.impl;
|
|
2
|
|
3 import java.util.regex.Matcher;
|
|
4 import java.util.regex.Pattern;
|
|
5
|
|
6 import org.apache.commons.lang3.StringEscapeUtils;
|
|
7
|
|
8 import de.mpiwg.anteater.logging.IAnteaterLogger;
|
|
9 import de.mpiwg.anteater.persons.IPersonFinder;
|
|
10 import edu.stanford.nlp.ie.AbstractSequenceClassifier;
|
|
11 import edu.stanford.nlp.ie.crf.CRFClassifier;
|
|
12 import edu.stanford.nlp.ling.CoreLabel;
|
|
13
|
|
14 public class StanfordNLPPersonFinder implements IPersonFinder {
|
|
15
|
|
16 public final static String COMPONENT_NAME = StanfordNLPPersonFinder.class
|
|
17 .getSimpleName();
|
|
18
|
|
19 private IAnteaterLogger logger;
|
|
20 private AbstractSequenceClassifier<CoreLabel> classifier;
|
|
21 private boolean initiated = false;
|
|
22
|
|
23 public StanfordNLPPersonFinder(IAnteaterLogger logger) {
|
|
24 this.logger = logger;
|
|
25
|
|
26 }
|
|
27
|
|
28 @SuppressWarnings("unchecked")
|
|
29 @Override
|
|
30 public void init() {
|
|
31 if (!initiated) {
|
|
32 String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";
|
|
33 classifier = CRFClassifier
|
|
34 .getClassifierNoExceptions(serializedClassifier);
|
|
35 initiated = true;
|
|
36 }
|
|
37 }
|
|
38
|
|
39 @Override
|
|
40 public String findPersons(String text) {
|
|
41 logger.logMessage(COMPONENT_NAME, "Running Stanford NLP NER Parser...");
|
|
42 init();
|
|
43
|
|
44 String personXML = classifier.classifyWithInlineXML(text);
|
|
45
|
|
46 Pattern pattern = Pattern.compile("(\\<([A-Z]+)\\>)(.+?)\\</[A-Z]+\\>");
|
|
47 Matcher matcher = pattern.matcher(personXML);
|
|
48 //int textcounter = 0;
|
|
49 int lastTextend = 0;
|
|
50 StringBuffer sb = new StringBuffer();
|
|
51 StringBuffer textBuffer = new StringBuffer();
|
|
52 while (matcher.find()) {
|
|
53 int startIdx = matcher.start();
|
|
54 textBuffer.append(personXML.substring(lastTextend, startIdx));
|
|
55 String name = matcher.group(3);
|
|
56 String type = matcher.group(2);
|
|
57 //textcounter += startIdx - lastTextend;
|
|
58 //int idx = textcounter;
|
|
59 //textcounter += name.length();
|
|
60 lastTextend = matcher.end();
|
|
61
|
|
62 sb.append("<" + type.toLowerCase() + " start=\"" + textBuffer.toString().length()
|
|
63 + "\" length=\"" + name.length() + "\">");
|
|
64 sb.append(StringEscapeUtils.escapeXml(name));
|
|
65 sb.append("</" + type.toLowerCase() + ">");
|
|
66
|
|
67 textBuffer.append(name);
|
|
68 }
|
|
69
|
|
70 // return complete xml
|
|
71 return "<stanford_ner>" + sb.toString() + "</stanford_ner>";
|
|
72 }
|
|
73
|
|
74 }
|