Mercurial > hg > anteater
view src/de/mpiwg/anteater/ml/impl/StanfordNLPTextParser.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.anteater.ml.impl; import java.util.ArrayList; import java.util.List; import java.util.Properties; import java.util.Set; import de.mpiwg.anteater.ml.ITextParser; import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; import edu.stanford.nlp.ling.IndexedWord; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; import edu.stanford.nlp.trees.semgraph.SemanticGraph; import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation; import edu.stanford.nlp.trees.semgraph.SemanticGraphEdge; import edu.stanford.nlp.util.CoreMap; public class StanfordNLPTextParser implements ITextParser { private StanfordCoreNLP pipeline; public StanfordNLPTextParser() { Properties props = new Properties(); props.put("annotators", "tokenize, ssplit, parse"); pipeline = new StanfordCoreNLP(props); } public List<String> getSentences(String text) { // creates a StanfordCoreNLP object, with POS tagging, lemmatization, // NER, parsing, and coreference resolution Properties props = new Properties(); props.put("annotators", "tokenize, ssplit"); StanfordCoreNLP pipeline = new StanfordCoreNLP(props); // create an empty Annotation just with the given text Annotation document = new Annotation(text); // run all Annotators on this text pipeline.annotate(document); List<CoreMap> sentenceAnnot = document.get(SentencesAnnotation.class); List<String> sentences = new ArrayList<String>(); for (CoreMap sentence : sentenceAnnot) { sentences.add(sentence.get(TextAnnotation.class)); } return sentences; } @Override public List<Word> getSubjects(String sentence) { // create an empty Annotation just with the given text List<Word> words = new ArrayList<Word>(); if (sentence == null) return words; Annotation document = new Annotation(sentence); // run all Annotators on this text pipeline.annotate(document); List<CoreMap> sentences = document.get(SentencesAnnotation.class); for(CoreMap sen: sentences) { SemanticGraph annotations = sen .get(BasicDependenciesAnnotation.class); Set<SemanticGraphEdge> edges = annotations.getEdgeSet(); for (SemanticGraphEdge edge : edges) { String shortname = edge.getRelation().getShortName(); if (shortname.contains("subj")) { Word word = new Word(); IndexedWord idxword = edge.getTarget(); word.setWord(idxword.originalText()); word.setIndex(idxword.beginPosition()); if (shortname.contains("pass")) word.setPassive(true); words.add(word); } } } return words; } @Override public List<Word> getAbbreviations(String sentence) { // create an empty Annotation just with the given text Annotation document = new Annotation(sentence); // run all Annotators on this text pipeline.annotate(document); List<CoreMap> sentences = document.get(SentencesAnnotation.class); List<Word> words = new ArrayList<Word>(); for(CoreMap sen: sentences) { SemanticGraph annotations = sen .get(BasicDependenciesAnnotation.class); Set<SemanticGraphEdge> edges = annotations.getEdgeSet(); for (SemanticGraphEdge edge : edges) { String shortname = edge.getRelation().getShortName(); if (shortname.equals("abbrev")) { Word word = new Word(); IndexedWord idxword = edge.getTarget(); word.setWord(idxword.originalText()); word.setIndex(idxword.beginPosition()); words.add(word); } } } return words; } }