comparison src/de/mpiwg/anteater/ml/impl/StanfordNLPTextParser.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:036535fcd179
1 package de.mpiwg.anteater.ml.impl;
2
3 import java.util.ArrayList;
4 import java.util.List;
5 import java.util.Properties;
6 import java.util.Set;
7
8 import de.mpiwg.anteater.ml.ITextParser;
9 import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
10 import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
11 import edu.stanford.nlp.ling.IndexedWord;
12 import edu.stanford.nlp.pipeline.Annotation;
13 import edu.stanford.nlp.pipeline.StanfordCoreNLP;
14 import edu.stanford.nlp.trees.semgraph.SemanticGraph;
15 import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation;
16 import edu.stanford.nlp.trees.semgraph.SemanticGraphEdge;
17 import edu.stanford.nlp.util.CoreMap;
18
19 public class StanfordNLPTextParser implements ITextParser {
20
21 private StanfordCoreNLP pipeline;
22
23 public StanfordNLPTextParser() {
24 Properties props = new Properties();
25 props.put("annotators", "tokenize, ssplit, parse");
26 pipeline = new StanfordCoreNLP(props);
27
28 }
29
30 public List<String> getSentences(String text) {
31 // creates a StanfordCoreNLP object, with POS tagging, lemmatization,
32 // NER, parsing, and coreference resolution
33 Properties props = new Properties();
34 props.put("annotators", "tokenize, ssplit");
35 StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
36
37 // create an empty Annotation just with the given text
38 Annotation document = new Annotation(text);
39
40 // run all Annotators on this text
41 pipeline.annotate(document);
42
43 List<CoreMap> sentenceAnnot = document.get(SentencesAnnotation.class);
44
45 List<String> sentences = new ArrayList<String>();
46 for (CoreMap sentence : sentenceAnnot) {
47 sentences.add(sentence.get(TextAnnotation.class));
48 }
49 return sentences;
50 }
51
52 @Override
53 public List<Word> getSubjects(String sentence) {
54 // create an empty Annotation just with the given text
55 List<Word> words = new ArrayList<Word>();
56
57 if (sentence == null)
58 return words;
59
60 Annotation document = new Annotation(sentence);
61
62 // run all Annotators on this text
63 pipeline.annotate(document);
64
65 List<CoreMap> sentences = document.get(SentencesAnnotation.class);
66
67 for(CoreMap sen: sentences) {
68 SemanticGraph annotations = sen
69 .get(BasicDependenciesAnnotation.class);
70
71 Set<SemanticGraphEdge> edges = annotations.getEdgeSet();
72 for (SemanticGraphEdge edge : edges) {
73 String shortname = edge.getRelation().getShortName();
74 if (shortname.contains("subj")) {
75 Word word = new Word();
76 IndexedWord idxword = edge.getTarget();
77
78 word.setWord(idxword.originalText());
79 word.setIndex(idxword.beginPosition());
80 if (shortname.contains("pass"))
81 word.setPassive(true);
82 words.add(word);
83 }
84 }
85 }
86
87 return words;
88 }
89
90 @Override
91 public List<Word> getAbbreviations(String sentence) {
92 // create an empty Annotation just with the given text
93 Annotation document = new Annotation(sentence);
94
95 // run all Annotators on this text
96 pipeline.annotate(document);
97
98 List<CoreMap> sentences = document.get(SentencesAnnotation.class);
99
100 List<Word> words = new ArrayList<Word>();
101 for(CoreMap sen: sentences) {
102 SemanticGraph annotations = sen
103 .get(BasicDependenciesAnnotation.class);
104
105 Set<SemanticGraphEdge> edges = annotations.getEdgeSet();
106 for (SemanticGraphEdge edge : edges) {
107 String shortname = edge.getRelation().getShortName();
108 if (shortname.equals("abbrev")) {
109 Word word = new Word();
110 IndexedWord idxword = edge.getTarget();
111
112 word.setWord(idxword.originalText());
113 word.setIndex(idxword.beginPosition());
114 words.add(word);
115 }
116 }
117 }
118
119 return words;
120 }
121 }