0
|
1 package de.mpiwg.anteater.ml.preprocessing;
|
|
2
|
|
3 import java.util.HashMap;
|
|
4 import java.util.List;
|
|
5 import java.util.Map;
|
|
6
|
|
7 import de.mpiwg.anteater.ml.ITextParser;
|
|
8
|
|
9 public class FeatureCalculator {
|
|
10
|
|
11 public final static String KEYWORD_ISSUED = "issued";
|
|
12 public final static String KEYWORD_APPLIED = "applied";
|
|
13 public final static String KEYWORD_PERMIT = "permit";
|
|
14 public final static String KEYWORD_COMMENT = "comment";
|
|
15 public final static String KEYWORD_APPLICANT = "applicant";
|
|
16
|
|
17 protected Map<Integer, String> sentences;
|
|
18 protected ITextParser parser;
|
|
19
|
|
20
|
|
21 public FeatureCalculator(List<String> sentenceList, ITextParser parser, String text) {
|
|
22 init(sentenceList, text);
|
|
23 this.parser = parser;
|
|
24 }
|
|
25
|
|
26 protected void init(List<String> sentenceList, String text) {
|
|
27 int counter = 0;
|
|
28 sentences = new HashMap<Integer, String>();
|
|
29
|
|
30 for (String sent : sentenceList) {
|
|
31 int index = counter;
|
|
32 if (counter < text.length())
|
|
33 index = text.substring(counter).indexOf(sent) + counter;
|
|
34 sentences.put(index, sent);
|
|
35 counter = index + sent.length() + 1;
|
|
36 }
|
|
37 }
|
|
38
|
|
39
|
|
40
|
|
41 protected int getSentenceContainsKeyword(String keyword, int indexOfCandidate) {
|
|
42 String sentence = getSentenceContainingCandidate(indexOfCandidate);
|
|
43
|
|
44 if (sentence != null) {
|
|
45 if (sentence.contains(keyword))
|
|
46 return 1;
|
|
47 else
|
|
48 return 0;
|
|
49 }
|
|
50
|
|
51 return 0;
|
|
52 }
|
|
53
|
|
54 protected String getSentenceContainingCandidate(int indexOfCandidate) {
|
|
55 String sentence = null;
|
|
56
|
|
57 for (int senPos : sentences.keySet()) {
|
|
58 int senEnd = senPos + sentences.get(senPos).length();
|
|
59 if (indexOfCandidate >= senPos && indexOfCandidate < senEnd) {
|
|
60 sentence = sentences.get(senPos);
|
|
61 break;
|
|
62 }
|
|
63 }
|
|
64
|
|
65 return sentence;
|
|
66 }
|
|
67
|
|
68 protected int getStartOfSentenceContainingCandidiate(int indexOfCandidate) {
|
|
69 for (int senPos : sentences.keySet()) {
|
|
70 int senEnd = senPos + sentences.get(senPos).length();
|
|
71 if (indexOfCandidate >= senPos && indexOfCandidate < senEnd) {
|
|
72 return senPos;
|
|
73 }
|
|
74 }
|
|
75 return -1;
|
|
76 }
|
|
77
|
|
78 protected int getOffsetToClosestWord(int candidateStart, int candidateLength, String word, String text) {
|
|
79 if (candidateStart < 0 || candidateLength < 0 || candidateStart + candidateLength > text.length())
|
|
80 return 0;
|
|
81
|
|
82 String firstPart = text.substring(0, candidateStart);
|
|
83 String secondPart = text.substring(candidateStart + candidateLength, text.length());
|
|
84 int beforeStudy = firstPart.lastIndexOf(word);
|
|
85 int afterStudy = secondPart.indexOf(word);
|
|
86
|
|
87 int offsetBefore = 0;
|
|
88 int offsetAfter = 0;
|
|
89
|
|
90 if (beforeStudy > -1)
|
|
91 offsetBefore = beforeStudy - candidateStart;
|
|
92
|
|
93 if (afterStudy > -1)
|
|
94 offsetAfter = afterStudy - candidateStart;
|
|
95
|
|
96 if ((-1*offsetBefore) < offsetAfter)
|
|
97 return offsetBefore;
|
|
98 else return offsetAfter;
|
|
99 }
|
|
100
|
|
101 }
|