annotate src/de/mpiwg/anteater/persons/impl/StanfordNLPPersonFinder.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
036535fcd179 anteater
jdamerow
parents:
diff changeset
1 package de.mpiwg.anteater.persons.impl;
036535fcd179 anteater
jdamerow
parents:
diff changeset
2
036535fcd179 anteater
jdamerow
parents:
diff changeset
3 import java.util.regex.Matcher;
036535fcd179 anteater
jdamerow
parents:
diff changeset
4 import java.util.regex.Pattern;
036535fcd179 anteater
jdamerow
parents:
diff changeset
5
036535fcd179 anteater
jdamerow
parents:
diff changeset
6 import org.apache.commons.lang3.StringEscapeUtils;
036535fcd179 anteater
jdamerow
parents:
diff changeset
7
036535fcd179 anteater
jdamerow
parents:
diff changeset
8 import de.mpiwg.anteater.logging.IAnteaterLogger;
036535fcd179 anteater
jdamerow
parents:
diff changeset
9 import de.mpiwg.anteater.persons.IPersonFinder;
036535fcd179 anteater
jdamerow
parents:
diff changeset
10 import edu.stanford.nlp.ie.AbstractSequenceClassifier;
036535fcd179 anteater
jdamerow
parents:
diff changeset
11 import edu.stanford.nlp.ie.crf.CRFClassifier;
036535fcd179 anteater
jdamerow
parents:
diff changeset
12 import edu.stanford.nlp.ling.CoreLabel;
036535fcd179 anteater
jdamerow
parents:
diff changeset
13
036535fcd179 anteater
jdamerow
parents:
diff changeset
14 public class StanfordNLPPersonFinder implements IPersonFinder {
036535fcd179 anteater
jdamerow
parents:
diff changeset
15
036535fcd179 anteater
jdamerow
parents:
diff changeset
16 public final static String COMPONENT_NAME = StanfordNLPPersonFinder.class
036535fcd179 anteater
jdamerow
parents:
diff changeset
17 .getSimpleName();
036535fcd179 anteater
jdamerow
parents:
diff changeset
18
036535fcd179 anteater
jdamerow
parents:
diff changeset
19 private IAnteaterLogger logger;
036535fcd179 anteater
jdamerow
parents:
diff changeset
20 private AbstractSequenceClassifier<CoreLabel> classifier;
036535fcd179 anteater
jdamerow
parents:
diff changeset
21 private boolean initiated = false;
036535fcd179 anteater
jdamerow
parents:
diff changeset
22
036535fcd179 anteater
jdamerow
parents:
diff changeset
23 public StanfordNLPPersonFinder(IAnteaterLogger logger) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
24 this.logger = logger;
036535fcd179 anteater
jdamerow
parents:
diff changeset
25
036535fcd179 anteater
jdamerow
parents:
diff changeset
26 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
27
036535fcd179 anteater
jdamerow
parents:
diff changeset
28 @SuppressWarnings("unchecked")
036535fcd179 anteater
jdamerow
parents:
diff changeset
29 @Override
036535fcd179 anteater
jdamerow
parents:
diff changeset
30 public void init() {
036535fcd179 anteater
jdamerow
parents:
diff changeset
31 if (!initiated) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
32 String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";
036535fcd179 anteater
jdamerow
parents:
diff changeset
33 classifier = CRFClassifier
036535fcd179 anteater
jdamerow
parents:
diff changeset
34 .getClassifierNoExceptions(serializedClassifier);
036535fcd179 anteater
jdamerow
parents:
diff changeset
35 initiated = true;
036535fcd179 anteater
jdamerow
parents:
diff changeset
36 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
37 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
38
036535fcd179 anteater
jdamerow
parents:
diff changeset
39 @Override
036535fcd179 anteater
jdamerow
parents:
diff changeset
40 public String findPersons(String text) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
41 logger.logMessage(COMPONENT_NAME, "Running Stanford NLP NER Parser...");
036535fcd179 anteater
jdamerow
parents:
diff changeset
42 init();
036535fcd179 anteater
jdamerow
parents:
diff changeset
43
036535fcd179 anteater
jdamerow
parents:
diff changeset
44 String personXML = classifier.classifyWithInlineXML(text);
036535fcd179 anteater
jdamerow
parents:
diff changeset
45
036535fcd179 anteater
jdamerow
parents:
diff changeset
46 Pattern pattern = Pattern.compile("(\\<([A-Z]+)\\>)(.+?)\\</[A-Z]+\\>");
036535fcd179 anteater
jdamerow
parents:
diff changeset
47 Matcher matcher = pattern.matcher(personXML);
036535fcd179 anteater
jdamerow
parents:
diff changeset
48 //int textcounter = 0;
036535fcd179 anteater
jdamerow
parents:
diff changeset
49 int lastTextend = 0;
036535fcd179 anteater
jdamerow
parents:
diff changeset
50 StringBuffer sb = new StringBuffer();
036535fcd179 anteater
jdamerow
parents:
diff changeset
51 StringBuffer textBuffer = new StringBuffer();
036535fcd179 anteater
jdamerow
parents:
diff changeset
52 while (matcher.find()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
53 int startIdx = matcher.start();
036535fcd179 anteater
jdamerow
parents:
diff changeset
54 textBuffer.append(personXML.substring(lastTextend, startIdx));
036535fcd179 anteater
jdamerow
parents:
diff changeset
55 String name = matcher.group(3);
036535fcd179 anteater
jdamerow
parents:
diff changeset
56 String type = matcher.group(2);
036535fcd179 anteater
jdamerow
parents:
diff changeset
57 //textcounter += startIdx - lastTextend;
036535fcd179 anteater
jdamerow
parents:
diff changeset
58 //int idx = textcounter;
036535fcd179 anteater
jdamerow
parents:
diff changeset
59 //textcounter += name.length();
036535fcd179 anteater
jdamerow
parents:
diff changeset
60 lastTextend = matcher.end();
036535fcd179 anteater
jdamerow
parents:
diff changeset
61
036535fcd179 anteater
jdamerow
parents:
diff changeset
62 sb.append("<" + type.toLowerCase() + " start=\"" + textBuffer.toString().length()
036535fcd179 anteater
jdamerow
parents:
diff changeset
63 + "\" length=\"" + name.length() + "\">");
036535fcd179 anteater
jdamerow
parents:
diff changeset
64 sb.append(StringEscapeUtils.escapeXml(name));
036535fcd179 anteater
jdamerow
parents:
diff changeset
65 sb.append("</" + type.toLowerCase() + ">");
036535fcd179 anteater
jdamerow
parents:
diff changeset
66
036535fcd179 anteater
jdamerow
parents:
diff changeset
67 textBuffer.append(name);
036535fcd179 anteater
jdamerow
parents:
diff changeset
68 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
69
036535fcd179 anteater
jdamerow
parents:
diff changeset
70 // return complete xml
036535fcd179 anteater
jdamerow
parents:
diff changeset
71 return "<stanford_ner>" + sb.toString() + "</stanford_ner>";
036535fcd179 anteater
jdamerow
parents:
diff changeset
72 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
73
036535fcd179 anteater
jdamerow
parents:
diff changeset
74 }