diff src/de/mpiwg/anteater/ml/impl/StanfordNLPTextParser.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/anteater/ml/impl/StanfordNLPTextParser.java	Fri Sep 14 10:30:43 2012 +0200
@@ -0,0 +1,121 @@
+package de.mpiwg.anteater.ml.impl;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+import java.util.Set;
+
+import de.mpiwg.anteater.ml.ITextParser;
+import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
+import edu.stanford.nlp.ling.IndexedWord;
+import edu.stanford.nlp.pipeline.Annotation;
+import edu.stanford.nlp.pipeline.StanfordCoreNLP;
+import edu.stanford.nlp.trees.semgraph.SemanticGraph;
+import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation;
+import edu.stanford.nlp.trees.semgraph.SemanticGraphEdge;
+import edu.stanford.nlp.util.CoreMap;
+
+public class StanfordNLPTextParser implements ITextParser {
+	
+	private StanfordCoreNLP pipeline;
+
+	public StanfordNLPTextParser() {
+		Properties props = new Properties();
+		props.put("annotators", "tokenize, ssplit, parse");
+		pipeline = new StanfordCoreNLP(props);
+
+	}
+
+	public List<String> getSentences(String text) {
+		// creates a StanfordCoreNLP object, with POS tagging, lemmatization,
+		// NER, parsing, and coreference resolution
+		Properties props = new Properties();
+		props.put("annotators", "tokenize, ssplit");
+		StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
+
+		// create an empty Annotation just with the given text
+		Annotation document = new Annotation(text);
+
+		// run all Annotators on this text
+		pipeline.annotate(document);
+
+		List<CoreMap> sentenceAnnot = document.get(SentencesAnnotation.class);
+
+		List<String> sentences = new ArrayList<String>();
+		for (CoreMap sentence : sentenceAnnot) {
+			sentences.add(sentence.get(TextAnnotation.class));
+		}
+		return sentences;
+	}
+
+	@Override
+	public List<Word> getSubjects(String sentence) {
+		// create an empty Annotation just with the given text
+		List<Word> words = new ArrayList<Word>();
+		   
+		if (sentence == null)
+			return words;
+		
+		Annotation document = new Annotation(sentence);
+
+		// run all Annotators on this text
+		pipeline.annotate(document);
+		
+		List<CoreMap> sentences = document.get(SentencesAnnotation.class);
+	    
+		 for(CoreMap sen: sentences) {
+	    	SemanticGraph annotations = sen
+			.get(BasicDependenciesAnnotation.class);
+
+			Set<SemanticGraphEdge> edges = annotations.getEdgeSet();
+			for (SemanticGraphEdge edge : edges) {
+				String shortname = edge.getRelation().getShortName();
+				if (shortname.contains("subj")) {
+					Word word = new Word();
+					IndexedWord idxword = edge.getTarget();
+					
+					word.setWord(idxword.originalText());
+					word.setIndex(idxword.beginPosition());
+					if (shortname.contains("pass"))
+						word.setPassive(true);
+					words.add(word);
+				}
+			}
+	    }
+	    
+	    return words;
+	}
+	
+	@Override
+	public List<Word> getAbbreviations(String sentence) {
+		// create an empty Annotation just with the given text
+		Annotation document = new Annotation(sentence);
+
+		// run all Annotators on this text
+		pipeline.annotate(document);
+		
+		List<CoreMap> sentences = document.get(SentencesAnnotation.class);
+	    
+		List<Word> words = new ArrayList<Word>();
+	    for(CoreMap sen: sentences) {
+	    	SemanticGraph annotations = sen
+			.get(BasicDependenciesAnnotation.class);
+
+			Set<SemanticGraphEdge> edges = annotations.getEdgeSet();
+			for (SemanticGraphEdge edge : edges) {
+				String shortname = edge.getRelation().getShortName();
+				if (shortname.equals("abbrev")) {
+					Word word = new Word();
+					IndexedWord idxword = edge.getTarget();
+					
+					word.setWord(idxword.originalText());
+					word.setIndex(idxword.beginPosition());
+					words.add(word);
+				}
+			}
+	    }
+	    
+	    return words;
+	}
+}