Mercurial > hg > anteater
diff src/de/mpiwg/anteater/ml/impl/ICUTextParser.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line diff
// Recovered from hg diff: --- /dev/null Thu Jan 01 00:00:00 1970 +0000
//   +++ b/src/de/mpiwg/anteater/ml/impl/ICUTextParser.java Fri Sep 14 10:30:43 2012 +0200 (@@ -0,0 +1,42 @@)
package de.mpiwg.anteater.ml.impl;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import com.ibm.icu.text.BreakIterator;

import de.mpiwg.anteater.ml.ITextParser;

/**
 * {@link ITextParser} implementation that splits text into sentences using
 * ICU's sentence {@link BreakIterator}.
 *
 * <p>Only {@link #getSentences(String)} is implemented. {@link #getSubjects(String)}
 * and {@link #getAbbreviations(String)} are placeholders that return an empty list.
 */
public class ICUTextParser implements ITextParser {

    /** Locale whose sentence-break rules are used by {@link #getSentences(String)}. */
    private final Locale locale;

    /** Creates a parser that uses {@link Locale#US} sentence-break rules. */
    public ICUTextParser() {
        this(Locale.US);
    }

    /**
     * Creates a parser that uses the sentence-break rules of the given locale.
     *
     * @param locale locale passed to {@link BreakIterator#getSentenceInstance(Locale)}
     * @throws IllegalArgumentException if {@code locale} is {@code null}
     */
    public ICUTextParser(Locale locale) {
        if (locale == null) {
            throw new IllegalArgumentException("locale must not be null");
        }
        this.locale = locale;
    }

    /**
     * Splits {@code text} into sentences.
     *
     * <p>Each returned element is the untrimmed substring between two consecutive
     * sentence boundaries, so concatenating the elements in order reproduces
     * {@code text}.
     *
     * @param text the text to split; may be {@code null} or empty
     * @return a new mutable list of sentence substrings in document order; empty
     *         (never {@code null}) when {@code text} is {@code null} or empty
     */
    @Override
    public List<String> getSentences(String text) {
        List<String> sentences = new ArrayList<String>();

        // Guard before handing the text to ICU: there is nothing to segment.
        if (text == null || text.length() == 0) {
            return sentences;
        }

        // A fresh iterator per call; no iterator state is shared between calls.
        BreakIterator iterator = BreakIterator.getSentenceInstance(locale);
        iterator.setText(text);

        // Walk consecutive boundary pairs [start, end) until ICU reports DONE.
        int start = iterator.first();
        for (int end = iterator.next();
                end != BreakIterator.DONE;
                start = end, end = iterator.next()) {
            sentences.add(text.substring(start, end));
        }

        return sentences;
    }

    /**
     * Not implemented yet.
     *
     * @param sentence ignored
     * @return always a new empty list (never {@code null}), so callers can
     *         iterate the result safely
     */
    @Override
    public List<Word> getSubjects(String sentence) {
        // TODO implement subject extraction
        return new ArrayList<Word>();
    }

    /**
     * Not implemented yet.
     *
     * @param sentence ignored
     * @return always a new empty list (never {@code null}), so callers can
     *         iterate the result safely
     */
    @Override
    public List<Word> getAbbreviations(String sentence) {
        // TODO implement abbreviation extraction
        return new ArrayList<Word>();
    }

}