Mercurial > hg > anteater

package de.mpiwg.anteater.ml.impl;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import de.mpiwg.anteater.ml.ITextParser;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.semgraph.SemanticGraph;
import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation;
import edu.stanford.nlp.trees.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.util.CoreMap;

/**
 * {@link ITextParser} implementation backed by Stanford CoreNLP.
 * <p>
 * Two pipelines are built once in the constructor and reused for every call:
 * one that only tokenizes and splits sentences, and one that additionally
 * runs the parser so that basic dependencies are available.
 */
public class StanfordNLPTextParser implements ITextParser {

	/** Pipeline with "tokenize, ssplit, parse"; used for dependency lookups. */
	private final StanfordCoreNLP pipeline;

	/** Pipeline with "tokenize, ssplit" only; used for sentence splitting. */
	private final StanfordCoreNLP sentencePipeline;

	/**
	 * Creates the parser and builds both pipelines up front, so that no
	 * pipeline has to be constructed while handling a request.
	 */
	public StanfordNLPTextParser() {
		Properties parseProps = new Properties();
		parseProps.put("annotators", "tokenize, ssplit, parse");
		pipeline = new StanfordCoreNLP(parseProps);

		// Sentence splitting does not need the parser; a separate pipeline
		// keeps getSentences from paying the parsing cost.
		Properties splitProps = new Properties();
		splitProps.put("annotators", "tokenize, ssplit");
		sentencePipeline = new StanfordCoreNLP(splitProps);
	}

	/**
	 * Splits a text into sentences.
	 *
	 * @param text
	 *            the text to split; may be <code>null</code>
	 * @return the text of every detected sentence, in document order; an
	 *         empty list if <code>text</code> is <code>null</code> or no
	 *         sentences were annotated
	 */
	public List<String> getSentences(String text) {
		List<String> sentences = new ArrayList<String>();
		if (text == null)
			return sentences;

		Annotation document = new Annotation(text);
		sentencePipeline.annotate(document);

		List<CoreMap> sentenceAnnot = document.get(SentencesAnnotation.class);
		if (sentenceAnnot == null)
			return sentences;

		for (CoreMap sentence : sentenceAnnot) {
			sentences.add(sentence.get(TextAnnotation.class));
		}
		return sentences;
	}

	/**
	 * Finds the words that are the target of a subject relation.
	 * <p>
	 * An edge counts as a subject relation if the short name of its relation
	 * contains "subj". If the short name also contains "pass", the resulting
	 * word is flagged as passive.
	 *
	 * @param sentence
	 *            the text to analyze; may be <code>null</code>
	 * @return one {@link Word} per matching edge; an empty list if
	 *         <code>sentence</code> is <code>null</code> or nothing matched
	 */
	@Override
	public List<Word> getSubjects(String sentence) {
		List<Word> words = new ArrayList<Word>();

		for (SemanticGraphEdge edge : getDependencyEdges(sentence)) {
			String shortname = edge.getRelation().getShortName();
			if (shortname.contains("subj")) {
				Word word = toWord(edge.getTarget());
				if (shortname.contains("pass"))
					word.setPassive(true);
				words.add(word);
			}
		}

		return words;
	}

	/**
	 * Finds the words that are the target of an "abbrev" relation.
	 *
	 * @param sentence
	 *            the text to analyze; may be <code>null</code>
	 * @return one {@link Word} per matching edge; an empty list if
	 *         <code>sentence</code> is <code>null</code> or nothing matched
	 */
	@Override
	public List<Word> getAbbreviations(String sentence) {
		List<Word> words = new ArrayList<Word>();

		for (SemanticGraphEdge edge : getDependencyEdges(sentence)) {
			String shortname = edge.getRelation().getShortName();
			if (shortname.equals("abbrev")) {
				words.add(toWord(edge.getTarget()));
			}
		}

		return words;
	}

	/**
	 * Runs the parsing pipeline on the given text and gathers the basic
	 * dependency edges of every sentence found in it.
	 *
	 * @param text
	 *            the text to parse; may be <code>null</code>
	 * @return all edges, sentence by sentence, in the iteration order of each
	 *         graph's edge set; an empty list if <code>text</code> is
	 *         <code>null</code> or no dependency graph is available
	 */
	private List<SemanticGraphEdge> getDependencyEdges(String text) {
		List<SemanticGraphEdge> allEdges = new ArrayList<SemanticGraphEdge>();
		if (text == null)
			return allEdges;

		Annotation document = new Annotation(text);
		pipeline.annotate(document);

		List<CoreMap> sentences = document.get(SentencesAnnotation.class);
		if (sentences == null)
			return allEdges;

		for (CoreMap sen : sentences) {
			SemanticGraph graph = sen.get(BasicDependenciesAnnotation.class);
			// Skip sentences for which no dependency graph was produced
			// instead of failing with a NullPointerException.
			if (graph == null)
				continue;

			Set<SemanticGraphEdge> edges = graph.getEdgeSet();
			allEdges.addAll(edges);
		}
		return allEdges;
	}

	/**
	 * Copies the original text and the begin position of a token into a new
	 * {@link Word}.
	 *
	 * @param token
	 *            the token to convert
	 * @return a new word carrying the token's text and begin position
	 */
	private static Word toWord(IndexedWord token) {
		Word word = new Word();
		word.setWord(token.originalText());
		word.setIndex(token.beginPosition());
		return word;
	}
}
author	jdamerow
date	Fri, 14 Sep 2012 10:30:43 +0200
parents
children