Mercurial > hg > anteater
view src/de/mpiwg/anteater/ml/impl/LingPipeTextParser.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.anteater.ml.impl;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.Chunking;
import com.aliasi.sentences.MedlineSentenceModel;
import com.aliasi.sentences.SentenceChunker;
import com.aliasi.sentences.SentenceModel;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;

import de.mpiwg.anteater.ml.ITextParser;

/**
 * {@link ITextParser} implementation backed by LingPipe.
 *
 * <p>Only sentence splitting is implemented; subject and abbreviation
 * extraction are still stubs that return empty lists.
 */
public class LingPipeTextParser implements ITextParser {

    /**
     * Splits the given text into sentences.
     *
     * <p>The text is chunked with a LingPipe {@link SentenceChunker} built from
     * {@link IndoEuropeanTokenizerFactory#INSTANCE} and a
     * {@link MedlineSentenceModel}. Each resulting chunk is cut out of the
     * chunked character sequence by its start/end offsets.
     *
     * @param text the text to split; {@code null} is treated as "no text"
     * @return the sentence strings, in the iteration order of the chunk set
     *         returned by LingPipe (presumably document order -- confirm
     *         against the LingPipe version in use); never {@code null}
     */
    @Override
    public List<String> getSentences(String text) {
        // Guard against null so callers get an empty result instead of a
        // NullPointerException from text.toCharArray().
        if (text == null) {
            return new ArrayList<String>();
        }

        TokenizerFactory tokenizerFactory = IndoEuropeanTokenizerFactory.INSTANCE;
        SentenceModel sentenceModel = new MedlineSentenceModel();
        SentenceChunker chunker = new SentenceChunker(tokenizerFactory, sentenceModel);

        Chunking chunking = chunker.chunk(text.toCharArray(), 0, text.length());
        Set<Chunk> sentences = chunking.chunkSet();

        // Chunk offsets refer to the chunking's own character sequence, so
        // slice that sequence rather than the caller's string.
        String slice = chunking.charSequence().toString();

        List<String> sentenceList = new ArrayList<String>(sentences.size());
        for (Chunk sentence : sentences) {
            sentenceList.add(slice.substring(sentence.start(), sentence.end()));
        }
        return sentenceList;
    }

    /**
     * Not implemented yet.
     *
     * @param sentence the sentence to analyse (currently ignored)
     * @return an empty list rather than {@code null}, so callers can iterate
     *         the result safely
     */
    @Override
    public List<Word> getSubjects(String sentence) {
        // TODO Auto-generated method stub
        return new ArrayList<Word>();
    }

    /**
     * Not implemented yet.
     *
     * @param sentence the sentence to analyse (currently ignored)
     * @return an empty list rather than {@code null}, so callers can iterate
     *         the result safely
     */
    @Override
    public List<Word> getAbbreviations(String sentence) {
        // TODO Auto-generated method stub
        return new ArrayList<Word>();
    }
}