0
|
1 package de.mpiwg.anteater.ml.impl;
|
|
2
|
|
3 import java.util.ArrayList;
|
|
4 import java.util.List;
|
|
5 import java.util.Properties;
|
|
6 import java.util.Set;
|
|
7
|
|
8 import de.mpiwg.anteater.ml.ITextParser;
|
|
9 import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
|
|
10 import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
|
|
11 import edu.stanford.nlp.ling.IndexedWord;
|
|
12 import edu.stanford.nlp.pipeline.Annotation;
|
|
13 import edu.stanford.nlp.pipeline.StanfordCoreNLP;
|
|
14 import edu.stanford.nlp.trees.semgraph.SemanticGraph;
|
|
15 import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation;
|
|
16 import edu.stanford.nlp.trees.semgraph.SemanticGraphEdge;
|
|
17 import edu.stanford.nlp.util.CoreMap;
|
|
18
|
|
19 public class StanfordNLPTextParser implements ITextParser {
|
|
20
|
|
21 private StanfordCoreNLP pipeline;
|
|
22
|
|
23 public StanfordNLPTextParser() {
|
|
24 Properties props = new Properties();
|
|
25 props.put("annotators", "tokenize, ssplit, parse");
|
|
26 pipeline = new StanfordCoreNLP(props);
|
|
27
|
|
28 }
|
|
29
|
|
30 public List<String> getSentences(String text) {
|
|
31 // creates a StanfordCoreNLP object, with POS tagging, lemmatization,
|
|
32 // NER, parsing, and coreference resolution
|
|
33 Properties props = new Properties();
|
|
34 props.put("annotators", "tokenize, ssplit");
|
|
35 StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
|
|
36
|
|
37 // create an empty Annotation just with the given text
|
|
38 Annotation document = new Annotation(text);
|
|
39
|
|
40 // run all Annotators on this text
|
|
41 pipeline.annotate(document);
|
|
42
|
|
43 List<CoreMap> sentenceAnnot = document.get(SentencesAnnotation.class);
|
|
44
|
|
45 List<String> sentences = new ArrayList<String>();
|
|
46 for (CoreMap sentence : sentenceAnnot) {
|
|
47 sentences.add(sentence.get(TextAnnotation.class));
|
|
48 }
|
|
49 return sentences;
|
|
50 }
|
|
51
|
|
52 @Override
|
|
53 public List<Word> getSubjects(String sentence) {
|
|
54 // create an empty Annotation just with the given text
|
|
55 List<Word> words = new ArrayList<Word>();
|
|
56
|
|
57 if (sentence == null)
|
|
58 return words;
|
|
59
|
|
60 Annotation document = new Annotation(sentence);
|
|
61
|
|
62 // run all Annotators on this text
|
|
63 pipeline.annotate(document);
|
|
64
|
|
65 List<CoreMap> sentences = document.get(SentencesAnnotation.class);
|
|
66
|
|
67 for(CoreMap sen: sentences) {
|
|
68 SemanticGraph annotations = sen
|
|
69 .get(BasicDependenciesAnnotation.class);
|
|
70
|
|
71 Set<SemanticGraphEdge> edges = annotations.getEdgeSet();
|
|
72 for (SemanticGraphEdge edge : edges) {
|
|
73 String shortname = edge.getRelation().getShortName();
|
|
74 if (shortname.contains("subj")) {
|
|
75 Word word = new Word();
|
|
76 IndexedWord idxword = edge.getTarget();
|
|
77
|
|
78 word.setWord(idxword.originalText());
|
|
79 word.setIndex(idxword.beginPosition());
|
|
80 if (shortname.contains("pass"))
|
|
81 word.setPassive(true);
|
|
82 words.add(word);
|
|
83 }
|
|
84 }
|
|
85 }
|
|
86
|
|
87 return words;
|
|
88 }
|
|
89
|
|
90 @Override
|
|
91 public List<Word> getAbbreviations(String sentence) {
|
|
92 // create an empty Annotation just with the given text
|
|
93 Annotation document = new Annotation(sentence);
|
|
94
|
|
95 // run all Annotators on this text
|
|
96 pipeline.annotate(document);
|
|
97
|
|
98 List<CoreMap> sentences = document.get(SentencesAnnotation.class);
|
|
99
|
|
100 List<Word> words = new ArrayList<Word>();
|
|
101 for(CoreMap sen: sentences) {
|
|
102 SemanticGraph annotations = sen
|
|
103 .get(BasicDependenciesAnnotation.class);
|
|
104
|
|
105 Set<SemanticGraphEdge> edges = annotations.getEdgeSet();
|
|
106 for (SemanticGraphEdge edge : edges) {
|
|
107 String shortname = edge.getRelation().getShortName();
|
|
108 if (shortname.equals("abbrev")) {
|
|
109 Word word = new Word();
|
|
110 IndexedWord idxword = edge.getTarget();
|
|
111
|
|
112 word.setWord(idxword.originalText());
|
|
113 word.setIndex(idxword.beginPosition());
|
|
114 words.add(word);
|
|
115 }
|
|
116 }
|
|
117 }
|
|
118
|
|
119 return words;
|
|
120 }
|
|
121 }
|