0
|
1 package de.mpiwg.anteater.ml.impl;
|
|
2
|
|
3 import java.util.ArrayList;
|
|
4 import java.util.List;
|
|
5 import java.util.Set;
|
|
6
|
|
7 import com.aliasi.chunk.Chunk;
|
|
8 import com.aliasi.chunk.Chunking;
|
|
9 import com.aliasi.sentences.MedlineSentenceModel;
|
|
10 import com.aliasi.sentences.SentenceChunker;
|
|
11 import com.aliasi.sentences.SentenceModel;
|
|
12 import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
|
|
13 import com.aliasi.tokenizer.TokenizerFactory;
|
|
14
|
|
15 import de.mpiwg.anteater.ml.ITextParser;
|
|
16
|
|
17 public class LingPipeTextParser implements ITextParser {
|
|
18
|
|
19 @Override
|
|
20 public List<String> getSentences(String text) {
|
|
21 TokenizerFactory tokenizer_fac = IndoEuropeanTokenizerFactory.INSTANCE;
|
|
22 SentenceModel sentence_model = new MedlineSentenceModel();
|
|
23 SentenceChunker chunker = new SentenceChunker(tokenizer_fac,
|
|
24 sentence_model);
|
|
25
|
|
26 Chunking chunking = chunker.chunk(text.toCharArray(), 0, text.length());
|
|
27 Set<Chunk> sentences = chunking.chunkSet();
|
|
28 String slice = chunking.charSequence().toString();
|
|
29
|
|
30 List<String> sentenceList = new ArrayList<String>();
|
|
31 for (Chunk sentence : sentences) {
|
|
32 int start = sentence.start();
|
|
33 int end = sentence.end();
|
|
34 sentenceList.add(slice.substring(start, end));
|
|
35 }
|
|
36
|
|
37 return sentenceList;
|
|
38 }
|
|
39
|
|
40 @Override
|
|
41 public List<Word> getSubjects(String sentence) {
|
|
42 // TODO Auto-generated method stub
|
|
43 return null;
|
|
44 }
|
|
45
|
|
46 @Override
|
|
47 public List<Word> getAbbreviations(String sentence) {
|
|
48 // TODO Auto-generated method stub
|
|
49 return null;
|
|
50 }
|
|
51
|
|
52 }
|