annotate src/de/mpiwg/anteater/ml/impl/StanfordNLPTextParser.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
036535fcd179 anteater
jdamerow
parents:
diff changeset
1 package de.mpiwg.anteater.ml.impl;
036535fcd179 anteater
jdamerow
parents:
diff changeset
2
036535fcd179 anteater
jdamerow
parents:
diff changeset
3 import java.util.ArrayList;
036535fcd179 anteater
jdamerow
parents:
diff changeset
4 import java.util.List;
036535fcd179 anteater
jdamerow
parents:
diff changeset
5 import java.util.Properties;
036535fcd179 anteater
jdamerow
parents:
diff changeset
6 import java.util.Set;
036535fcd179 anteater
jdamerow
parents:
diff changeset
7
036535fcd179 anteater
jdamerow
parents:
diff changeset
8 import de.mpiwg.anteater.ml.ITextParser;
036535fcd179 anteater
jdamerow
parents:
diff changeset
9 import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
036535fcd179 anteater
jdamerow
parents:
diff changeset
10 import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
036535fcd179 anteater
jdamerow
parents:
diff changeset
11 import edu.stanford.nlp.ling.IndexedWord;
036535fcd179 anteater
jdamerow
parents:
diff changeset
12 import edu.stanford.nlp.pipeline.Annotation;
036535fcd179 anteater
jdamerow
parents:
diff changeset
13 import edu.stanford.nlp.pipeline.StanfordCoreNLP;
036535fcd179 anteater
jdamerow
parents:
diff changeset
14 import edu.stanford.nlp.trees.semgraph.SemanticGraph;
036535fcd179 anteater
jdamerow
parents:
diff changeset
15 import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation;
036535fcd179 anteater
jdamerow
parents:
diff changeset
16 import edu.stanford.nlp.trees.semgraph.SemanticGraphEdge;
036535fcd179 anteater
jdamerow
parents:
diff changeset
17 import edu.stanford.nlp.util.CoreMap;
036535fcd179 anteater
jdamerow
parents:
diff changeset
18
036535fcd179 anteater
jdamerow
parents:
diff changeset
19 public class StanfordNLPTextParser implements ITextParser {
036535fcd179 anteater
jdamerow
parents:
diff changeset
20
036535fcd179 anteater
jdamerow
parents:
diff changeset
21 private StanfordCoreNLP pipeline;
036535fcd179 anteater
jdamerow
parents:
diff changeset
22
036535fcd179 anteater
jdamerow
parents:
diff changeset
23 public StanfordNLPTextParser() {
036535fcd179 anteater
jdamerow
parents:
diff changeset
24 Properties props = new Properties();
036535fcd179 anteater
jdamerow
parents:
diff changeset
25 props.put("annotators", "tokenize, ssplit, parse");
036535fcd179 anteater
jdamerow
parents:
diff changeset
26 pipeline = new StanfordCoreNLP(props);
036535fcd179 anteater
jdamerow
parents:
diff changeset
27
036535fcd179 anteater
jdamerow
parents:
diff changeset
28 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
29
036535fcd179 anteater
jdamerow
parents:
diff changeset
30 public List<String> getSentences(String text) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
31 // creates a StanfordCoreNLP object, with POS tagging, lemmatization,
036535fcd179 anteater
jdamerow
parents:
diff changeset
32 // NER, parsing, and coreference resolution
036535fcd179 anteater
jdamerow
parents:
diff changeset
33 Properties props = new Properties();
036535fcd179 anteater
jdamerow
parents:
diff changeset
34 props.put("annotators", "tokenize, ssplit");
036535fcd179 anteater
jdamerow
parents:
diff changeset
35 StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
036535fcd179 anteater
jdamerow
parents:
diff changeset
36
036535fcd179 anteater
jdamerow
parents:
diff changeset
37 // create an empty Annotation just with the given text
036535fcd179 anteater
jdamerow
parents:
diff changeset
38 Annotation document = new Annotation(text);
036535fcd179 anteater
jdamerow
parents:
diff changeset
39
036535fcd179 anteater
jdamerow
parents:
diff changeset
40 // run all Annotators on this text
036535fcd179 anteater
jdamerow
parents:
diff changeset
41 pipeline.annotate(document);
036535fcd179 anteater
jdamerow
parents:
diff changeset
42
036535fcd179 anteater
jdamerow
parents:
diff changeset
43 List<CoreMap> sentenceAnnot = document.get(SentencesAnnotation.class);
036535fcd179 anteater
jdamerow
parents:
diff changeset
44
036535fcd179 anteater
jdamerow
parents:
diff changeset
45 List<String> sentences = new ArrayList<String>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
46 for (CoreMap sentence : sentenceAnnot) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
47 sentences.add(sentence.get(TextAnnotation.class));
036535fcd179 anteater
jdamerow
parents:
diff changeset
48 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
49 return sentences;
036535fcd179 anteater
jdamerow
parents:
diff changeset
50 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
51
036535fcd179 anteater
jdamerow
parents:
diff changeset
52 @Override
036535fcd179 anteater
jdamerow
parents:
diff changeset
53 public List<Word> getSubjects(String sentence) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
54 // create an empty Annotation just with the given text
036535fcd179 anteater
jdamerow
parents:
diff changeset
55 List<Word> words = new ArrayList<Word>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
56
036535fcd179 anteater
jdamerow
parents:
diff changeset
57 if (sentence == null)
036535fcd179 anteater
jdamerow
parents:
diff changeset
58 return words;
036535fcd179 anteater
jdamerow
parents:
diff changeset
59
036535fcd179 anteater
jdamerow
parents:
diff changeset
60 Annotation document = new Annotation(sentence);
036535fcd179 anteater
jdamerow
parents:
diff changeset
61
036535fcd179 anteater
jdamerow
parents:
diff changeset
62 // run all Annotators on this text
036535fcd179 anteater
jdamerow
parents:
diff changeset
63 pipeline.annotate(document);
036535fcd179 anteater
jdamerow
parents:
diff changeset
64
036535fcd179 anteater
jdamerow
parents:
diff changeset
65 List<CoreMap> sentences = document.get(SentencesAnnotation.class);
036535fcd179 anteater
jdamerow
parents:
diff changeset
66
036535fcd179 anteater
jdamerow
parents:
diff changeset
67 for(CoreMap sen: sentences) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
68 SemanticGraph annotations = sen
036535fcd179 anteater
jdamerow
parents:
diff changeset
69 .get(BasicDependenciesAnnotation.class);
036535fcd179 anteater
jdamerow
parents:
diff changeset
70
036535fcd179 anteater
jdamerow
parents:
diff changeset
71 Set<SemanticGraphEdge> edges = annotations.getEdgeSet();
036535fcd179 anteater
jdamerow
parents:
diff changeset
72 for (SemanticGraphEdge edge : edges) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
73 String shortname = edge.getRelation().getShortName();
036535fcd179 anteater
jdamerow
parents:
diff changeset
74 if (shortname.contains("subj")) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
75 Word word = new Word();
036535fcd179 anteater
jdamerow
parents:
diff changeset
76 IndexedWord idxword = edge.getTarget();
036535fcd179 anteater
jdamerow
parents:
diff changeset
77
036535fcd179 anteater
jdamerow
parents:
diff changeset
78 word.setWord(idxword.originalText());
036535fcd179 anteater
jdamerow
parents:
diff changeset
79 word.setIndex(idxword.beginPosition());
036535fcd179 anteater
jdamerow
parents:
diff changeset
80 if (shortname.contains("pass"))
036535fcd179 anteater
jdamerow
parents:
diff changeset
81 word.setPassive(true);
036535fcd179 anteater
jdamerow
parents:
diff changeset
82 words.add(word);
036535fcd179 anteater
jdamerow
parents:
diff changeset
83 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
84 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
85 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
86
036535fcd179 anteater
jdamerow
parents:
diff changeset
87 return words;
036535fcd179 anteater
jdamerow
parents:
diff changeset
88 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
89
036535fcd179 anteater
jdamerow
parents:
diff changeset
90 @Override
036535fcd179 anteater
jdamerow
parents:
diff changeset
91 public List<Word> getAbbreviations(String sentence) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
92 // create an empty Annotation just with the given text
036535fcd179 anteater
jdamerow
parents:
diff changeset
93 Annotation document = new Annotation(sentence);
036535fcd179 anteater
jdamerow
parents:
diff changeset
94
036535fcd179 anteater
jdamerow
parents:
diff changeset
95 // run all Annotators on this text
036535fcd179 anteater
jdamerow
parents:
diff changeset
96 pipeline.annotate(document);
036535fcd179 anteater
jdamerow
parents:
diff changeset
97
036535fcd179 anteater
jdamerow
parents:
diff changeset
98 List<CoreMap> sentences = document.get(SentencesAnnotation.class);
036535fcd179 anteater
jdamerow
parents:
diff changeset
99
036535fcd179 anteater
jdamerow
parents:
diff changeset
100 List<Word> words = new ArrayList<Word>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
101 for(CoreMap sen: sentences) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
102 SemanticGraph annotations = sen
036535fcd179 anteater
jdamerow
parents:
diff changeset
103 .get(BasicDependenciesAnnotation.class);
036535fcd179 anteater
jdamerow
parents:
diff changeset
104
036535fcd179 anteater
jdamerow
parents:
diff changeset
105 Set<SemanticGraphEdge> edges = annotations.getEdgeSet();
036535fcd179 anteater
jdamerow
parents:
diff changeset
106 for (SemanticGraphEdge edge : edges) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
107 String shortname = edge.getRelation().getShortName();
036535fcd179 anteater
jdamerow
parents:
diff changeset
108 if (shortname.equals("abbrev")) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
109 Word word = new Word();
036535fcd179 anteater
jdamerow
parents:
diff changeset
110 IndexedWord idxword = edge.getTarget();
036535fcd179 anteater
jdamerow
parents:
diff changeset
111
036535fcd179 anteater
jdamerow
parents:
diff changeset
112 word.setWord(idxword.originalText());
036535fcd179 anteater
jdamerow
parents:
diff changeset
113 word.setIndex(idxword.beginPosition());
036535fcd179 anteater
jdamerow
parents:
diff changeset
114 words.add(word);
036535fcd179 anteater
jdamerow
parents:
diff changeset
115 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
116 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
117 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
118
036535fcd179 anteater
jdamerow
parents:
diff changeset
119 return words;
036535fcd179 anteater
jdamerow
parents:
diff changeset
120 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
121 }