Mercurial > hg > anteater
diff src/de/mpiwg/anteater/ml/impl/LingPipeTextParser.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line diff
package de.mpiwg.anteater.ml.impl;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.Chunking;
import com.aliasi.sentences.MedlineSentenceModel;
import com.aliasi.sentences.SentenceChunker;
import com.aliasi.sentences.SentenceModel;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;

import de.mpiwg.anteater.ml.ITextParser;

/**
 * {@link ITextParser} implementation backed by LingPipe.
 *
 * <p>Sentence splitting is done with a LingPipe {@link SentenceChunker}
 * configured with the Indo-European tokenizer factory and the MEDLINE
 * sentence model. Subject and abbreviation extraction are not implemented
 * yet.
 */
public class LingPipeTextParser implements ITextParser {

    /**
     * Splits the given text into sentences.
     *
     * @param text the text to split; may be {@code null} or empty
     * @return the sentence substrings of {@code text}, in the iteration order
     *         of the chunk set produced by LingPipe; an empty list if
     *         {@code text} is {@code null} or empty. Never {@code null}.
     */
    @Override
    public List<String> getSentences(String text) {
        // Guard against null input: text.toCharArray() below would otherwise
        // throw a NullPointerException. Empty text has no sentences either.
        if (text == null || text.isEmpty()) {
            return new ArrayList<String>();
        }

        TokenizerFactory tokenizerFactory = IndoEuropeanTokenizerFactory.INSTANCE;
        SentenceModel sentenceModel = new MedlineSentenceModel();
        SentenceChunker chunker = new SentenceChunker(tokenizerFactory,
                sentenceModel);

        Chunking chunking = chunker.chunk(text.toCharArray(), 0, text.length());
        Set<Chunk> sentences = chunking.chunkSet();
        // Chunk offsets are relative to the chunking's own character sequence,
        // so slice that sequence rather than the original argument.
        String slice = chunking.charSequence().toString();

        List<String> sentenceList = new ArrayList<String>(sentences.size());
        for (Chunk sentence : sentences) {
            sentenceList.add(slice.substring(sentence.start(), sentence.end()));
        }

        return sentenceList;
    }

    /**
     * Not implemented yet.
     *
     * @param sentence the sentence to analyse (currently ignored)
     * @return always {@code null}
     */
    @Override
    public List<Word> getSubjects(String sentence) {
        // TODO: implement subject extraction; callers currently receive null.
        return null;
    }

    /**
     * Not implemented yet.
     *
     * @param sentence the sentence to analyse (currently ignored)
     * @return always {@code null}
     */
    @Override
    public List<Word> getAbbreviations(String sentence) {
        // TODO: implement abbreviation extraction; callers currently receive null.
        return null;
    }

}