annotate src/de/mpiwg/anteater/ml/preprocessing/FeatureCalculator.java @ 10:70510ec97f4a default tip

annotate texts with results and build events with linnaeus
author jdamerow
date Mon, 19 Nov 2012 16:36:54 -0700
parents 036535fcd179
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
036535fcd179 anteater
jdamerow
parents:
diff changeset
1 package de.mpiwg.anteater.ml.preprocessing;
036535fcd179 anteater
jdamerow
parents:
diff changeset
2
036535fcd179 anteater
jdamerow
parents:
diff changeset
3 import java.util.HashMap;
036535fcd179 anteater
jdamerow
parents:
diff changeset
4 import java.util.List;
036535fcd179 anteater
jdamerow
parents:
diff changeset
5 import java.util.Map;
036535fcd179 anteater
jdamerow
parents:
diff changeset
6
036535fcd179 anteater
jdamerow
parents:
diff changeset
7 import de.mpiwg.anteater.ml.ITextParser;
036535fcd179 anteater
jdamerow
parents:
diff changeset
8
036535fcd179 anteater
jdamerow
parents:
diff changeset
9 public class FeatureCalculator {
036535fcd179 anteater
jdamerow
parents:
diff changeset
10
036535fcd179 anteater
jdamerow
parents:
diff changeset
11 public final static String KEYWORD_ISSUED = "issued";
036535fcd179 anteater
jdamerow
parents:
diff changeset
12 public final static String KEYWORD_APPLIED = "applied";
036535fcd179 anteater
jdamerow
parents:
diff changeset
13 public final static String KEYWORD_PERMIT = "permit";
036535fcd179 anteater
jdamerow
parents:
diff changeset
14 public final static String KEYWORD_COMMENT = "comment";
036535fcd179 anteater
jdamerow
parents:
diff changeset
15 public final static String KEYWORD_APPLICANT = "applicant";
036535fcd179 anteater
jdamerow
parents:
diff changeset
16
036535fcd179 anteater
jdamerow
parents:
diff changeset
17 protected Map<Integer, String> sentences;
036535fcd179 anteater
jdamerow
parents:
diff changeset
18 protected ITextParser parser;
036535fcd179 anteater
jdamerow
parents:
diff changeset
19
036535fcd179 anteater
jdamerow
parents:
diff changeset
20
036535fcd179 anteater
jdamerow
parents:
diff changeset
21 public FeatureCalculator(List<String> sentenceList, ITextParser parser, String text) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
22 init(sentenceList, text);
036535fcd179 anteater
jdamerow
parents:
diff changeset
23 this.parser = parser;
036535fcd179 anteater
jdamerow
parents:
diff changeset
24 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
25
036535fcd179 anteater
jdamerow
parents:
diff changeset
26 protected void init(List<String> sentenceList, String text) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
27 int counter = 0;
036535fcd179 anteater
jdamerow
parents:
diff changeset
28 sentences = new HashMap<Integer, String>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
29
036535fcd179 anteater
jdamerow
parents:
diff changeset
30 for (String sent : sentenceList) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
31 int index = counter;
036535fcd179 anteater
jdamerow
parents:
diff changeset
32 if (counter < text.length())
036535fcd179 anteater
jdamerow
parents:
diff changeset
33 index = text.substring(counter).indexOf(sent) + counter;
036535fcd179 anteater
jdamerow
parents:
diff changeset
34 sentences.put(index, sent);
036535fcd179 anteater
jdamerow
parents:
diff changeset
35 counter = index + sent.length() + 1;
036535fcd179 anteater
jdamerow
parents:
diff changeset
36 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
37 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
38
036535fcd179 anteater
jdamerow
parents:
diff changeset
39
036535fcd179 anteater
jdamerow
parents:
diff changeset
40
036535fcd179 anteater
jdamerow
parents:
diff changeset
41 protected int getSentenceContainsKeyword(String keyword, int indexOfCandidate) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
42 String sentence = getSentenceContainingCandidate(indexOfCandidate);
036535fcd179 anteater
jdamerow
parents:
diff changeset
43
036535fcd179 anteater
jdamerow
parents:
diff changeset
44 if (sentence != null) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
45 if (sentence.contains(keyword))
036535fcd179 anteater
jdamerow
parents:
diff changeset
46 return 1;
036535fcd179 anteater
jdamerow
parents:
diff changeset
47 else
036535fcd179 anteater
jdamerow
parents:
diff changeset
48 return 0;
036535fcd179 anteater
jdamerow
parents:
diff changeset
49 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
50
036535fcd179 anteater
jdamerow
parents:
diff changeset
51 return 0;
036535fcd179 anteater
jdamerow
parents:
diff changeset
52 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
53
036535fcd179 anteater
jdamerow
parents:
diff changeset
54 protected String getSentenceContainingCandidate(int indexOfCandidate) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
55 String sentence = null;
036535fcd179 anteater
jdamerow
parents:
diff changeset
56
036535fcd179 anteater
jdamerow
parents:
diff changeset
57 for (int senPos : sentences.keySet()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
58 int senEnd = senPos + sentences.get(senPos).length();
036535fcd179 anteater
jdamerow
parents:
diff changeset
59 if (indexOfCandidate >= senPos && indexOfCandidate < senEnd) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
60 sentence = sentences.get(senPos);
036535fcd179 anteater
jdamerow
parents:
diff changeset
61 break;
036535fcd179 anteater
jdamerow
parents:
diff changeset
62 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
63 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
64
036535fcd179 anteater
jdamerow
parents:
diff changeset
65 return sentence;
036535fcd179 anteater
jdamerow
parents:
diff changeset
66 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
67
036535fcd179 anteater
jdamerow
parents:
diff changeset
68 protected int getStartOfSentenceContainingCandidiate(int indexOfCandidate) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
69 for (int senPos : sentences.keySet()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
70 int senEnd = senPos + sentences.get(senPos).length();
036535fcd179 anteater
jdamerow
parents:
diff changeset
71 if (indexOfCandidate >= senPos && indexOfCandidate < senEnd) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
72 return senPos;
036535fcd179 anteater
jdamerow
parents:
diff changeset
73 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
74 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
75 return -1;
036535fcd179 anteater
jdamerow
parents:
diff changeset
76 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
77
036535fcd179 anteater
jdamerow
parents:
diff changeset
78 protected int getOffsetToClosestWord(int candidateStart, int candidateLength, String word, String text) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
79 if (candidateStart < 0 || candidateLength < 0 || candidateStart + candidateLength > text.length())
036535fcd179 anteater
jdamerow
parents:
diff changeset
80 return 0;
036535fcd179 anteater
jdamerow
parents:
diff changeset
81
036535fcd179 anteater
jdamerow
parents:
diff changeset
82 String firstPart = text.substring(0, candidateStart);
036535fcd179 anteater
jdamerow
parents:
diff changeset
83 String secondPart = text.substring(candidateStart + candidateLength, text.length());
036535fcd179 anteater
jdamerow
parents:
diff changeset
84 int beforeStudy = firstPart.lastIndexOf(word);
036535fcd179 anteater
jdamerow
parents:
diff changeset
85 int afterStudy = secondPart.indexOf(word);
036535fcd179 anteater
jdamerow
parents:
diff changeset
86
036535fcd179 anteater
jdamerow
parents:
diff changeset
87 int offsetBefore = 0;
036535fcd179 anteater
jdamerow
parents:
diff changeset
88 int offsetAfter = 0;
036535fcd179 anteater
jdamerow
parents:
diff changeset
89
036535fcd179 anteater
jdamerow
parents:
diff changeset
90 if (beforeStudy > -1)
036535fcd179 anteater
jdamerow
parents:
diff changeset
91 offsetBefore = beforeStudy - candidateStart;
036535fcd179 anteater
jdamerow
parents:
diff changeset
92
036535fcd179 anteater
jdamerow
parents:
diff changeset
93 if (afterStudy > -1)
036535fcd179 anteater
jdamerow
parents:
diff changeset
94 offsetAfter = afterStudy - candidateStart;
036535fcd179 anteater
jdamerow
parents:
diff changeset
95
036535fcd179 anteater
jdamerow
parents:
diff changeset
96 if ((-1*offsetBefore) < offsetAfter)
036535fcd179 anteater
jdamerow
parents:
diff changeset
97 return offsetBefore;
036535fcd179 anteater
jdamerow
parents:
diff changeset
98 else return offsetAfter;
036535fcd179 anteater
jdamerow
parents:
diff changeset
99 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
100
036535fcd179 anteater
jdamerow
parents:
diff changeset
101 }