Mercurial > hg > anteater
diff src/de/mpiwg/anteater/ml/impl/StanfordNLPTextParser.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/anteater/ml/impl/StanfordNLPTextParser.java Fri Sep 14 10:30:43 2012 +0200 @@ -0,0 +1,121 @@ +package de.mpiwg.anteater.ml.impl; + +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; +import java.util.Set; + +import de.mpiwg.anteater.ml.ITextParser; +import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; +import edu.stanford.nlp.ling.IndexedWord; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.trees.semgraph.SemanticGraph; +import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation; +import edu.stanford.nlp.trees.semgraph.SemanticGraphEdge; +import edu.stanford.nlp.util.CoreMap; + +public class StanfordNLPTextParser implements ITextParser { + + private StanfordCoreNLP pipeline; + + public StanfordNLPTextParser() { + Properties props = new Properties(); + props.put("annotators", "tokenize, ssplit, parse"); + pipeline = new StanfordCoreNLP(props); + + } + + public List<String> getSentences(String text) { + // creates a StanfordCoreNLP object, with POS tagging, lemmatization, + // NER, parsing, and coreference resolution + Properties props = new Properties(); + props.put("annotators", "tokenize, ssplit"); + StanfordCoreNLP pipeline = new StanfordCoreNLP(props); + + // create an empty Annotation just with the given text + Annotation document = new Annotation(text); + + // run all Annotators on this text + pipeline.annotate(document); + + List<CoreMap> sentenceAnnot = document.get(SentencesAnnotation.class); + + List<String> sentences = new ArrayList<String>(); + for (CoreMap sentence : sentenceAnnot) { + sentences.add(sentence.get(TextAnnotation.class)); + } + return sentences; + } + + @Override + public List<Word> getSubjects(String sentence) { + // create an empty Annotation just with the given text + List<Word> words = new ArrayList<Word>(); + + if (sentence == null) + return words; + + Annotation document = new Annotation(sentence); + + // run all Annotators on this text + pipeline.annotate(document); + + List<CoreMap> sentences = document.get(SentencesAnnotation.class); + + for(CoreMap sen: sentences) { + SemanticGraph annotations = sen + .get(BasicDependenciesAnnotation.class); + + Set<SemanticGraphEdge> edges = annotations.getEdgeSet(); + for (SemanticGraphEdge edge : edges) { + String shortname = edge.getRelation().getShortName(); + if (shortname.contains("subj")) { + Word word = new Word(); + IndexedWord idxword = edge.getTarget(); + + word.setWord(idxword.originalText()); + word.setIndex(idxword.beginPosition()); + if (shortname.contains("pass")) + word.setPassive(true); + words.add(word); + } + } + } + + return words; + } + + @Override + public List<Word> getAbbreviations(String sentence) { + // create an empty Annotation just with the given text + Annotation document = new Annotation(sentence); + + // run all Annotators on this text + pipeline.annotate(document); + + List<CoreMap> sentences = document.get(SentencesAnnotation.class); + + List<Word> words = new ArrayList<Word>(); + for(CoreMap sen: sentences) { + SemanticGraph annotations = sen + .get(BasicDependenciesAnnotation.class); + + Set<SemanticGraphEdge> edges = annotations.getEdgeSet(); + for (SemanticGraphEdge edge : edges) { + String shortname = edge.getRelation().getShortName(); + if (shortname.equals("abbrev")) { + Word word = new Word(); + IndexedWord idxword = edge.getTarget(); + + word.setWord(idxword.originalText()); + word.setIndex(idxword.beginPosition()); + words.add(word); + } + } + } + + return words; + } +}