view src/de/mpiwg/anteater/ml/impl/LingPipeTextParser.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
line wrap: on
line source

package de.mpiwg.anteater.ml.impl;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.Chunking;
import com.aliasi.sentences.MedlineSentenceModel;
import com.aliasi.sentences.SentenceChunker;
import com.aliasi.sentences.SentenceModel;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;

import de.mpiwg.anteater.ml.ITextParser;

/**
 * {@link ITextParser} implementation that uses LingPipe for sentence
 * splitting.
 *
 * <p>
 * Sentences are detected with a LingPipe {@link SentenceChunker} configured
 * with the {@link IndoEuropeanTokenizerFactory} and a
 * {@link MedlineSentenceModel}. Subject and abbreviation extraction are not
 * implemented yet.
 */
public class LingPipeTextParser implements ITextParser {

	/**
	 * Splits the given text into sentences.
	 *
	 * @param text
	 *            the text to split; may be <code>null</code> or empty
	 * @return the sentences in the iteration order of the chunk set returned
	 *         by LingPipe; an empty list if <code>text</code> is
	 *         <code>null</code> or empty. Never <code>null</code>.
	 */
	@Override
	public List<String> getSentences(String text) {
		// Nothing to chunk: avoid the NullPointerException that
		// text.toCharArray() would raise and skip building the chunker.
		if (text == null || text.isEmpty()) {
			return new ArrayList<String>();
		}

		TokenizerFactory tokenizerFactory = IndoEuropeanTokenizerFactory.INSTANCE;
		SentenceModel sentenceModel = new MedlineSentenceModel();
		SentenceChunker chunker = new SentenceChunker(tokenizerFactory,
				sentenceModel);

		Chunking chunking = chunker.chunk(text.toCharArray(), 0, text.length());
		Set<Chunk> sentences = chunking.chunkSet();
		// Chunk offsets are relative to the chunking's own character
		// sequence, so slice that sequence rather than the input.
		String slice = chunking.charSequence().toString();

		List<String> sentenceList = new ArrayList<String>(sentences.size());
		for (Chunk sentence : sentences) {
			sentenceList.add(slice.substring(sentence.start(), sentence.end()));
		}

		return sentenceList;
	}

	/**
	 * Not implemented yet.
	 *
	 * @param sentence
	 *            the sentence to analyse (currently ignored)
	 * @return currently always <code>null</code>
	 */
	@Override
	public List<Word> getSubjects(String sentence) {
		// TODO implement subject extraction
		return null;
	}

	/**
	 * Not implemented yet.
	 *
	 * @param sentence
	 *            the sentence to analyse (currently ignored)
	 * @return currently always <code>null</code>
	 */
	@Override
	public List<Word> getAbbreviations(String sentence) {
		// TODO implement abbreviation extraction
		return null;
	}

}