diff src/de/mpiwg/anteater/ml/impl/LingPipeTextParser.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/anteater/ml/impl/LingPipeTextParser.java	Fri Sep 14 10:30:43 2012 +0200
@@ -0,0 +1,52 @@
+package de.mpiwg.anteater.ml.impl;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import com.aliasi.chunk.Chunk;
+import com.aliasi.chunk.Chunking;
+import com.aliasi.sentences.MedlineSentenceModel;
+import com.aliasi.sentences.SentenceChunker;
+import com.aliasi.sentences.SentenceModel;
+import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
+import com.aliasi.tokenizer.TokenizerFactory;
+
+import de.mpiwg.anteater.ml.ITextParser;
+
+/**
+ * {@link ITextParser} implementation that uses LingPipe for sentence
+ * detection.
+ * <p>
+ * Only {@link #getSentences(String)} is implemented so far; subject and
+ * abbreviation extraction are still stubs.
+ */
+public class LingPipeTextParser implements ITextParser {
+
+	/**
+	 * Splits a text into sentences with a LingPipe {@link SentenceChunker}
+	 * built from {@link IndoEuropeanTokenizerFactory} and
+	 * {@link MedlineSentenceModel}.
+	 *
+	 * @param text
+	 *            the text to split; may be <code>null</code> or empty
+	 * @return the sentence substrings, in the iteration order of the chunk
+	 *         set returned by LingPipe; an empty list if <code>text</code>
+	 *         is <code>null</code> or empty. Never <code>null</code>.
+	 */
+	@Override
+	public List<String> getSentences(String text) {
+		List<String> sentenceList = new ArrayList<String>();
+		// Nothing to chunk; also avoids a NullPointerException below.
+		if (text == null || text.isEmpty()) {
+			return sentenceList;
+		}
+
+		TokenizerFactory tokenizer_fac = IndoEuropeanTokenizerFactory.INSTANCE;
+		SentenceModel sentence_model = new MedlineSentenceModel();
+		SentenceChunker chunker = new SentenceChunker(tokenizer_fac,
+				sentence_model);
+
+		Chunking chunking = chunker.chunk(text.toCharArray(), 0, text.length());
+		Set<Chunk> sentences = chunking.chunkSet();
+		// Chunk offsets are applied to the chunking's own character
+		// sequence rather than to the input string.
+		String slice = chunking.charSequence().toString();
+
+		for (Chunk sentence : sentences) {
+			sentenceList.add(slice.substring(sentence.start(), sentence.end()));
+		}
+
+		return sentenceList;
+	}
+
+	/**
+	 * Not implemented yet.
+	 *
+	 * @param sentence
+	 *            the sentence to analyze (currently ignored)
+	 * @return currently always an empty list, never <code>null</code>
+	 */
+	@Override
+	public List<Word> getSubjects(String sentence) {
+		// TODO implement subject extraction
+		return new ArrayList<Word>();
+	}
+
+	/**
+	 * Not implemented yet.
+	 *
+	 * @param sentence
+	 *            the sentence to analyze (currently ignored)
+	 * @return currently always an empty list, never <code>null</code>
+	 */
+	@Override
+	public List<Word> getAbbreviations(String sentence) {
+		// TODO implement abbreviation extraction
+		return new ArrayList<Word>();
+	}
+
+}