annotate src/de/mpiwg/anteater/ml/impl/LingPipeTextParser.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
036535fcd179 anteater
jdamerow
parents:
diff changeset
1 package de.mpiwg.anteater.ml.impl;
036535fcd179 anteater
jdamerow
parents:
diff changeset
2
036535fcd179 anteater
jdamerow
parents:
diff changeset
3 import java.util.ArrayList;
036535fcd179 anteater
jdamerow
parents:
diff changeset
4 import java.util.List;
036535fcd179 anteater
jdamerow
parents:
diff changeset
5 import java.util.Set;
036535fcd179 anteater
jdamerow
parents:
diff changeset
6
036535fcd179 anteater
jdamerow
parents:
diff changeset
7 import com.aliasi.chunk.Chunk;
036535fcd179 anteater
jdamerow
parents:
diff changeset
8 import com.aliasi.chunk.Chunking;
036535fcd179 anteater
jdamerow
parents:
diff changeset
9 import com.aliasi.sentences.MedlineSentenceModel;
036535fcd179 anteater
jdamerow
parents:
diff changeset
10 import com.aliasi.sentences.SentenceChunker;
036535fcd179 anteater
jdamerow
parents:
diff changeset
11 import com.aliasi.sentences.SentenceModel;
036535fcd179 anteater
jdamerow
parents:
diff changeset
12 import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
036535fcd179 anteater
jdamerow
parents:
diff changeset
13 import com.aliasi.tokenizer.TokenizerFactory;
036535fcd179 anteater
jdamerow
parents:
diff changeset
14
036535fcd179 anteater
jdamerow
parents:
diff changeset
15 import de.mpiwg.anteater.ml.ITextParser;
036535fcd179 anteater
jdamerow
parents:
diff changeset
16
036535fcd179 anteater
jdamerow
parents:
diff changeset
17 public class LingPipeTextParser implements ITextParser {
036535fcd179 anteater
jdamerow
parents:
diff changeset
18
036535fcd179 anteater
jdamerow
parents:
diff changeset
19 @Override
036535fcd179 anteater
jdamerow
parents:
diff changeset
20 public List<String> getSentences(String text) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
21 TokenizerFactory tokenizer_fac = IndoEuropeanTokenizerFactory.INSTANCE;
036535fcd179 anteater
jdamerow
parents:
diff changeset
22 SentenceModel sentence_model = new MedlineSentenceModel();
036535fcd179 anteater
jdamerow
parents:
diff changeset
23 SentenceChunker chunker = new SentenceChunker(tokenizer_fac,
036535fcd179 anteater
jdamerow
parents:
diff changeset
24 sentence_model);
036535fcd179 anteater
jdamerow
parents:
diff changeset
25
036535fcd179 anteater
jdamerow
parents:
diff changeset
26 Chunking chunking = chunker.chunk(text.toCharArray(), 0, text.length());
036535fcd179 anteater
jdamerow
parents:
diff changeset
27 Set<Chunk> sentences = chunking.chunkSet();
036535fcd179 anteater
jdamerow
parents:
diff changeset
28 String slice = chunking.charSequence().toString();
036535fcd179 anteater
jdamerow
parents:
diff changeset
29
036535fcd179 anteater
jdamerow
parents:
diff changeset
30 List<String> sentenceList = new ArrayList<String>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
31 for (Chunk sentence : sentences) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
32 int start = sentence.start();
036535fcd179 anteater
jdamerow
parents:
diff changeset
33 int end = sentence.end();
036535fcd179 anteater
jdamerow
parents:
diff changeset
34 sentenceList.add(slice.substring(start, end));
036535fcd179 anteater
jdamerow
parents:
diff changeset
35 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
36
036535fcd179 anteater
jdamerow
parents:
diff changeset
37 return sentenceList;
036535fcd179 anteater
jdamerow
parents:
diff changeset
38 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
39
036535fcd179 anteater
jdamerow
parents:
diff changeset
40 @Override
036535fcd179 anteater
jdamerow
parents:
diff changeset
41 public List<Word> getSubjects(String sentence) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
42 // TODO Auto-generated method stub
036535fcd179 anteater
jdamerow
parents:
diff changeset
43 return null;
036535fcd179 anteater
jdamerow
parents:
diff changeset
44 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
45
036535fcd179 anteater
jdamerow
parents:
diff changeset
46 @Override
036535fcd179 anteater
jdamerow
parents:
diff changeset
47 public List<Word> getAbbreviations(String sentence) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
48 // TODO Auto-generated method stub
036535fcd179 anteater
jdamerow
parents:
diff changeset
49 return null;
036535fcd179 anteater
jdamerow
parents:
diff changeset
50 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
51
036535fcd179 anteater
jdamerow
parents:
diff changeset
52 }