Mercurial > hg > anteater
comparison src/de/mpiwg/anteater/ml/impl/StanfordNLPTextParser.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:036535fcd179 |
---|---|
1 package de.mpiwg.anteater.ml.impl; | |
2 | |
3 import java.util.ArrayList; | |
4 import java.util.List; | |
5 import java.util.Properties; | |
6 import java.util.Set; | |
7 | |
8 import de.mpiwg.anteater.ml.ITextParser; | |
9 import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; | |
10 import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; | |
11 import edu.stanford.nlp.ling.IndexedWord; | |
12 import edu.stanford.nlp.pipeline.Annotation; | |
13 import edu.stanford.nlp.pipeline.StanfordCoreNLP; | |
14 import edu.stanford.nlp.trees.semgraph.SemanticGraph; | |
15 import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation; | |
16 import edu.stanford.nlp.trees.semgraph.SemanticGraphEdge; | |
17 import edu.stanford.nlp.util.CoreMap; | |
18 | |
19 public class StanfordNLPTextParser implements ITextParser { | |
20 | |
21 private StanfordCoreNLP pipeline; | |
22 | |
23 public StanfordNLPTextParser() { | |
24 Properties props = new Properties(); | |
25 props.put("annotators", "tokenize, ssplit, parse"); | |
26 pipeline = new StanfordCoreNLP(props); | |
27 | |
28 } | |
29 | |
30 public List<String> getSentences(String text) { | |
31 // creates a StanfordCoreNLP object, with POS tagging, lemmatization, | |
32 // NER, parsing, and coreference resolution | |
33 Properties props = new Properties(); | |
34 props.put("annotators", "tokenize, ssplit"); | |
35 StanfordCoreNLP pipeline = new StanfordCoreNLP(props); | |
36 | |
37 // create an empty Annotation just with the given text | |
38 Annotation document = new Annotation(text); | |
39 | |
40 // run all Annotators on this text | |
41 pipeline.annotate(document); | |
42 | |
43 List<CoreMap> sentenceAnnot = document.get(SentencesAnnotation.class); | |
44 | |
45 List<String> sentences = new ArrayList<String>(); | |
46 for (CoreMap sentence : sentenceAnnot) { | |
47 sentences.add(sentence.get(TextAnnotation.class)); | |
48 } | |
49 return sentences; | |
50 } | |
51 | |
52 @Override | |
53 public List<Word> getSubjects(String sentence) { | |
54 // create an empty Annotation just with the given text | |
55 List<Word> words = new ArrayList<Word>(); | |
56 | |
57 if (sentence == null) | |
58 return words; | |
59 | |
60 Annotation document = new Annotation(sentence); | |
61 | |
62 // run all Annotators on this text | |
63 pipeline.annotate(document); | |
64 | |
65 List<CoreMap> sentences = document.get(SentencesAnnotation.class); | |
66 | |
67 for(CoreMap sen: sentences) { | |
68 SemanticGraph annotations = sen | |
69 .get(BasicDependenciesAnnotation.class); | |
70 | |
71 Set<SemanticGraphEdge> edges = annotations.getEdgeSet(); | |
72 for (SemanticGraphEdge edge : edges) { | |
73 String shortname = edge.getRelation().getShortName(); | |
74 if (shortname.contains("subj")) { | |
75 Word word = new Word(); | |
76 IndexedWord idxword = edge.getTarget(); | |
77 | |
78 word.setWord(idxword.originalText()); | |
79 word.setIndex(idxword.beginPosition()); | |
80 if (shortname.contains("pass")) | |
81 word.setPassive(true); | |
82 words.add(word); | |
83 } | |
84 } | |
85 } | |
86 | |
87 return words; | |
88 } | |
89 | |
90 @Override | |
91 public List<Word> getAbbreviations(String sentence) { | |
92 // create an empty Annotation just with the given text | |
93 Annotation document = new Annotation(sentence); | |
94 | |
95 // run all Annotators on this text | |
96 pipeline.annotate(document); | |
97 | |
98 List<CoreMap> sentences = document.get(SentencesAnnotation.class); | |
99 | |
100 List<Word> words = new ArrayList<Word>(); | |
101 for(CoreMap sen: sentences) { | |
102 SemanticGraph annotations = sen | |
103 .get(BasicDependenciesAnnotation.class); | |
104 | |
105 Set<SemanticGraphEdge> edges = annotations.getEdgeSet(); | |
106 for (SemanticGraphEdge edge : edges) { | |
107 String shortname = edge.getRelation().getShortName(); | |
108 if (shortname.equals("abbrev")) { | |
109 Word word = new Word(); | |
110 IndexedWord idxword = edge.getTarget(); | |
111 | |
112 word.setWord(idxword.originalText()); | |
113 word.setIndex(idxword.beginPosition()); | |
114 words.add(word); | |
115 } | |
116 } | |
117 } | |
118 | |
119 return words; | |
120 } | |
121 } |