Mercurial > hg > anteater
view src/de/mpiwg/anteater/ml/preprocessing/FeatureCalculator.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.anteater.ml.preprocessing;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import de.mpiwg.anteater.ml.ITextParser;

/**
 * Keeps the sentences of a text indexed by their start offset and offers
 * helpers that locate the sentence around a candidate position and measure
 * the distance from a candidate to the nearest occurrence of a word.
 */
public class FeatureCalculator {

    public final static String KEYWORD_ISSUED = "issued";
    public final static String KEYWORD_APPLIED = "applied";
    public final static String KEYWORD_PERMIT = "permit";
    public final static String KEYWORD_COMMENT = "comment";
    public final static String KEYWORD_APPLICANT = "applicant";

    /** Sentences keyed by the character offset at which they start in the text. */
    protected Map<Integer, String> sentences;

    /** Parser handed in by the caller; stored here but not used by this class itself. */
    protected ITextParser parser;

    /**
     * Builds the sentence index for {@code text} and stores the parser.
     *
     * @param sentenceList sentences in the order in which they occur in {@code text}
     * @param parser       parser to keep for later use
     * @param text         the text the sentences were taken from
     */
    public FeatureCalculator(List<String> sentenceList, ITextParser parser, String text) {
        init(sentenceList, text);
        this.parser = parser;
    }

    /**
     * Fills {@link #sentences} with the start offset of every sentence.
     * Each sentence is searched for from just behind the previous sentence
     * (end of the previous sentence plus one separator character).
     *
     * @param sentenceList sentences in the order in which they occur in {@code text}
     * @param text         the text the sentences were taken from
     */
    protected void init(List<String> sentenceList, String text) {
        int counter = 0;
        sentences = new HashMap<Integer, String>();
        for (String sent : sentenceList) {
            // Fallback position when the sentence cannot be located in the text.
            int index = counter;
            if (counter < text.length()) {
                int found = text.indexOf(sent, counter);
                // Only trust the search result when the sentence was really
                // found; a miss (-1) must not shift the index backwards.
                if (found > -1) {
                    index = found;
                }
            }
            sentences.put(index, sent);
            counter = index + sent.length() + 1;
        }
    }

    /**
     * Tells whether the sentence around the candidate contains a keyword.
     *
     * @param keyword          keyword to look for
     * @param indexOfCandidate character offset of the candidate in the text
     * @return 1 if a sentence contains the candidate position and that
     *         sentence contains {@code keyword}, otherwise 0
     */
    protected int getSentenceContainsKeyword(String keyword, int indexOfCandidate) {
        String sentence = getSentenceContainingCandidate(indexOfCandidate);
        return (sentence != null && sentence.contains(keyword)) ? 1 : 0;
    }

    /**
     * Returns the sentence whose character range covers the candidate position.
     *
     * @param indexOfCandidate character offset of the candidate in the text
     * @return the matching sentence, or {@code null} if no sentence covers the position
     */
    protected String getSentenceContainingCandidate(int indexOfCandidate) {
        Map.Entry<Integer, String> entry = findSentenceEntry(indexOfCandidate);
        return entry == null ? null : entry.getValue();
    }

    /**
     * Returns the start offset of the sentence whose character range covers
     * the candidate position.
     *
     * @param indexOfCandidate character offset of the candidate in the text
     * @return start offset of the matching sentence, or -1 if no sentence covers the position
     */
    protected int getStartOfSentenceContainingCandidiate(int indexOfCandidate) {
        Map.Entry<Integer, String> entry = findSentenceEntry(indexOfCandidate);
        return entry == null ? -1 : entry.getKey();
    }

    /**
     * Finds the index entry of the sentence covering the given position,
     * i.e. start &lt;= position &lt; start + sentence length.
     */
    private Map.Entry<Integer, String> findSentenceEntry(int indexOfCandidate) {
        for (Map.Entry<Integer, String> entry : sentences.entrySet()) {
            int senPos = entry.getKey();
            int senEnd = senPos + entry.getValue().length();
            if (indexOfCandidate >= senPos && indexOfCandidate < senEnd) {
                return entry;
            }
        }
        return null;
    }

    /**
     * Computes the offset from the start of a candidate to the closest
     * occurrence of {@code word} outside the candidate.
     *
     * Occurrences before the candidate yield a non-positive offset,
     * occurrences behind it a non-negative one; both are measured from
     * {@code candidateStart} to the start of the word. If both sides are
     * equally close, the occurrence behind the candidate is reported.
     *
     * @param candidateStart  character offset at which the candidate starts
     * @param candidateLength length of the candidate in characters
     * @param word            word to look for
     * @param text            text that contains the candidate
     * @return offset to the closest occurrence, or 0 if the arguments are
     *         invalid (null, negative or out of range) or the word does not
     *         occur outside the candidate
     */
    protected int getOffsetToClosestWord(int candidateStart, int candidateLength, String word, String text) {
        if (text == null || word == null) {
            return 0;
        }
        // Written as a subtraction so that start + length cannot overflow.
        if (candidateStart < 0 || candidateLength < 0
                || candidateLength > text.length() - candidateStart) {
            return 0;
        }

        int candidateEnd = candidateStart + candidateLength;

        // Last occurrence that ends before the candidate starts.
        int before = text.substring(0, candidateStart).lastIndexOf(word);
        // First occurrence that starts at or behind the end of the candidate.
        int after = text.indexOf(word, candidateEnd);

        boolean foundBefore = before > -1;
        boolean foundAfter = after > -1;
        if (!foundBefore && !foundAfter) {
            return 0;
        }

        int offsetBefore = before - candidateStart;
        int offsetAfter = after - candidateStart;

        // With a hit on one side only there is nothing to compare.
        if (!foundAfter) {
            return offsetBefore;
        }
        if (!foundBefore) {
            return offsetAfter;
        }
        return (-offsetBefore < offsetAfter) ? offsetBefore : offsetAfter;
    }
}