diff src/de/mpiwg/anteater/ml/impl/ICUTextParser.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/anteater/ml/impl/ICUTextParser.java	Fri Sep 14 10:30:43 2012 +0200
@@ -0,0 +1,77 @@
+package de.mpiwg.anteater.ml.impl;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+import com.ibm.icu.text.BreakIterator;
+
+import de.mpiwg.anteater.ml.ITextParser;
+
+/**
+ * {@link ITextParser} implementation that relies on ICU4J's
+ * {@link BreakIterator} for sentence segmentation.
+ * <p>
+ * Only {@link #getSentences(String)} does real work; the remaining methods
+ * are placeholders that return empty lists.
+ */
+public class ICUTextParser implements ITextParser {
+
+	/**
+	 * Splits a text into sentences using ICU's sentence break iterator for
+	 * {@link Locale#US}.
+	 * <p>
+	 * Each element is the raw substring between two consecutive boundaries
+	 * reported by the iterator; no trimming is performed.
+	 *
+	 * @param text the text to split; may be {@code null}
+	 * @return the sentences in order of appearance; an empty list when
+	 *         {@code text} is {@code null} or empty, never {@code null}
+	 */
+	@Override
+	public List<String> getSentences(String text) {
+		List<String> sentences = new ArrayList<String>();
+		if (text == null || text.isEmpty()) {
+			// Nothing to segment; also keeps null away from the iterator.
+			return sentences;
+		}
+
+		BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US);
+		iterator.setText(text);
+
+		// Walk consecutive boundary pairs [start, end) until the iterator is exhausted.
+		int start = iterator.first();
+		for (int end = iterator.next();
+				end != BreakIterator.DONE;
+				start = end, end = iterator.next()) {
+			sentences.add(text.substring(start, end));
+		}
+
+		return sentences;
+	}
+
+	/**
+	 * Placeholder: subject extraction is not implemented yet.
+	 *
+	 * @param sentence the sentence to analyse; currently ignored
+	 * @return a new empty list (never {@code null})
+	 */
+	@Override
+	public List<Word> getSubjects(String sentence) {
+		// TODO implement subject extraction
+		return new ArrayList<Word>();
+	}
+
+	/**
+	 * Placeholder: abbreviation detection is not implemented yet.
+	 *
+	 * @param sentence the sentence to analyse; currently ignored
+	 * @return a new empty list (never {@code null})
+	 */
+	@Override
+	public List<Word> getAbbreviations(String sentence) {
+		// TODO implement abbreviation detection
+		return new ArrayList<Word>();
+	}
+
+}