Mercurial > hg > anteater
diff src/de/mpiwg/anteater/ml/preprocessing/FeatureCalculator.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/anteater/ml/preprocessing/FeatureCalculator.java Fri Sep 14 10:30:43 2012 +0200 @@ -0,0 +1,101 @@ +package de.mpiwg.anteater.ml.preprocessing; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import de.mpiwg.anteater.ml.ITextParser; + +public class FeatureCalculator { + + public final static String KEYWORD_ISSUED = "issued"; + public final static String KEYWORD_APPLIED = "applied"; + public final static String KEYWORD_PERMIT = "permit"; + public final static String KEYWORD_COMMENT = "comment"; + public final static String KEYWORD_APPLICANT = "applicant"; + + protected Map<Integer, String> sentences; + protected ITextParser parser; + + + public FeatureCalculator(List<String> sentenceList, ITextParser parser, String text) { + init(sentenceList, text); + this.parser = parser; + } + + protected void init(List<String> sentenceList, String text) { + int counter = 0; + sentences = new HashMap<Integer, String>(); + + for (String sent : sentenceList) { + int index = counter; + if (counter < text.length()) + index = text.substring(counter).indexOf(sent) + counter; + sentences.put(index, sent); + counter = index + sent.length() + 1; + } + } + + + + protected int getSentenceContainsKeyword(String keyword, int indexOfCandidate) { + String sentence = getSentenceContainingCandidate(indexOfCandidate); + + if (sentence != null) { + if (sentence.contains(keyword)) + return 1; + else + return 0; + } + + return 0; + } + + protected String getSentenceContainingCandidate(int indexOfCandidate) { + String sentence = null; + + for (int senPos : sentences.keySet()) { + int senEnd = senPos + sentences.get(senPos).length(); + if (indexOfCandidate >= senPos && indexOfCandidate < senEnd) { + sentence = sentences.get(senPos); + break; + } + } + + return sentence; + } + + protected int getStartOfSentenceContainingCandidiate(int indexOfCandidate) { + for (int senPos : sentences.keySet()) { + int senEnd = senPos + sentences.get(senPos).length(); + if (indexOfCandidate >= senPos && indexOfCandidate < senEnd) { + return senPos; + } + } + return -1; + } + + protected int getOffsetToClosestWord(int candidateStart, int candidateLength, String word, String text) { + if (candidateStart < 0 || candidateLength < 0 || candidateStart + candidateLength > text.length()) + return 0; + + String firstPart = text.substring(0, candidateStart); + String secondPart = text.substring(candidateStart + candidateLength, text.length()); + int beforeStudy = firstPart.lastIndexOf(word); + int afterStudy = secondPart.indexOf(word); + + int offsetBefore = 0; + int offsetAfter = 0; + + if (beforeStudy > -1) + offsetBefore = beforeStudy - candidateStart; + + if (afterStudy > -1) + offsetAfter = afterStudy - candidateStart; + + if ((-1*offsetBefore) < offsetAfter) + return offsetBefore; + else return offsetAfter; + } + +}