diff src/de/mpiwg/anteater/ml/preprocessing/FeatureCalculator.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/anteater/ml/preprocessing/FeatureCalculator.java	Fri Sep 14 10:30:43 2012 +0200
@@ -0,0 +1,101 @@
+package de.mpiwg.anteater.ml.preprocessing;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import de.mpiwg.anteater.ml.ITextParser;
+
+/**
+ * Calculates simple position- and keyword-based features for a candidate,
+ * i.e. a character span inside a text that has been split into sentences.
+ *
+ * <p>The sentences are indexed by their start offset in the text, so that the
+ * sentence surrounding a candidate can be looked up from the candidate's
+ * character index.</p>
+ */
+public class FeatureCalculator {
+
+	public final static String KEYWORD_ISSUED = "issued";
+	public final static String KEYWORD_APPLIED = "applied";
+	public final static String KEYWORD_PERMIT = "permit";
+	public final static String KEYWORD_COMMENT = "comment";
+	public final static String KEYWORD_APPLICANT = "applicant";
+
+	/** Sentences keyed by their start offset in the text; filled by {@link #init(List, String)}. */
+	protected Map<Integer, String> sentences;
+
+	/** Parser handed to the constructor; it is only stored here and never read in this class. */
+	protected ITextParser parser;
+
+	/**
+	 * Creates a calculator and builds the sentence index.
+	 *
+	 * @param sentenceList the sentences of {@code text}, in the order in which they occur
+	 * @param parser the text parser to store in {@link #parser}
+	 * @param text the full text the sentences were taken from
+	 */
+	public FeatureCalculator(List<String> sentenceList, ITextParser parser, String text) {
+		// Assign the parser first so that an overriding init() does not see a null parser.
+		this.parser = parser;
+		init(sentenceList, text);
+	}
+
+	/**
+	 * Builds the map from sentence start offset to sentence.
+	 *
+	 * <p>Each sentence is searched for in {@code text}, starting right behind the
+	 * previous sentence. A sentence that cannot be found (or that would start
+	 * beyond the end of the text) is stored at the position where it was
+	 * expected, i.e. one character behind the end of the previous sentence.
+	 * {@code null} sentences are skipped; a {@code null} list yields an empty
+	 * index and a {@code null} text is treated like an empty text.</p>
+	 *
+	 * @param sentenceList the sentences in text order
+	 * @param text the text containing the sentences
+	 */
+	protected void init(List<String> sentenceList, String text) {
+		sentences = new HashMap<Integer, String>();
+		if (sentenceList == null) {
+			return;
+		}
+
+		String source = (text == null) ? "" : text;
+		int counter = 0;
+
+		for (String sent : sentenceList) {
+			if (sent == null) {
+				continue;
+			}
+
+			// Default to the expected position; only overwrite it with a real hit.
+			// (Previously a failed search produced counter - 1.)
+			int index = counter;
+			if (counter < source.length()) {
+				int found = source.indexOf(sent, counter);
+				if (found >= 0) {
+					index = found;
+				}
+			}
+
+			sentences.put(index, sent);
+			// Continue one character behind this sentence (skips the separator).
+			counter = index + sent.length() + 1;
+		}
+	}
+
+	/**
+	 * Tells whether the sentence surrounding the candidate contains a keyword.
+	 *
+	 * @param keyword the keyword to look for (case-sensitive substring match)
+	 * @param indexOfCandidate character index of the candidate in the text
+	 * @return 1 if the sentence containing the candidate contains the keyword;
+	 *         0 if it does not, if no sentence contains the candidate, or if
+	 *         {@code keyword} is {@code null}
+	 */
+	protected int getSentenceContainsKeyword(String keyword, int indexOfCandidate) {
+		String sentence = getSentenceContainingCandidate(indexOfCandidate);
+		if (sentence == null || keyword == null) {
+			return 0;
+		}
+		return sentence.contains(keyword) ? 1 : 0;
+	}
+
+	/**
+	 * Returns the sentence whose character range contains the given index.
+	 *
+	 * @param indexOfCandidate character index of the candidate in the text
+	 * @return the surrounding sentence, or {@code null} if there is none
+	 */
+	protected String getSentenceContainingCandidate(int indexOfCandidate) {
+		Map.Entry<Integer, String> entry = findSentenceEntry(indexOfCandidate);
+		return (entry == null) ? null : entry.getValue();
+	}
+
+	/**
+	 * Returns the start offset of the sentence whose character range contains
+	 * the given index.
+	 *
+	 * @param indexOfCandidate character index of the candidate in the text
+	 * @return the start offset of the surrounding sentence, or -1 if there is none
+	 */
+	protected int getStartOfSentenceContainingCandidiate(int indexOfCandidate) {
+		Map.Entry<Integer, String> entry = findSentenceEntry(indexOfCandidate);
+		return (entry == null) ? -1 : entry.getKey();
+	}
+
+	/**
+	 * Finds the index entry of the sentence covering the given character index,
+	 * i.e. the entry with {@code start <= index < start + sentence.length()}.
+	 *
+	 * @param indexOfCandidate character index of the candidate in the text
+	 * @return the matching entry, or {@code null} if no sentence covers the index
+	 */
+	private Map.Entry<Integer, String> findSentenceEntry(int indexOfCandidate) {
+		for (Map.Entry<Integer, String> entry : sentences.entrySet()) {
+			String sentence = entry.getValue();
+			if (sentence == null) {
+				continue;
+			}
+			int senPos = entry.getKey();
+			int senEnd = senPos + sentence.length();
+			if (indexOfCandidate >= senPos && indexOfCandidate < senEnd) {
+				return entry;
+			}
+		}
+		return null;
+	}
+
+	/**
+	 * Returns the signed offset from the start of the candidate to the start of
+	 * the closest occurrence of {@code word} outside the candidate.
+	 *
+	 * <p>Occurrences that end at or before the candidate start yield a negative
+	 * offset; occurrences that start at or behind the candidate end yield a
+	 * positive offset (at least {@code candidateLength}). Both are measured from
+	 * {@code candidateStart}. If the word occurs on both sides, the offset with
+	 * the smaller absolute value is returned; on a tie the occurrence behind the
+	 * candidate wins.</p>
+	 *
+	 * <p>Note that 0 is also the result for a zero-length candidate that is
+	 * directly followed by the word.</p>
+	 *
+	 * @param candidateStart start index of the candidate in {@code text}
+	 * @param candidateLength length of the candidate in characters
+	 * @param word the word to search for
+	 * @param text the text containing the candidate
+	 * @return the signed offset to the closest occurrence, or 0 if the word does
+	 *         not occur outside the candidate, if {@code word} is {@code null}
+	 *         or empty, if {@code text} is {@code null}, or if the candidate
+	 *         span does not lie inside {@code text}
+	 */
+	protected int getOffsetToClosestWord(int candidateStart, int candidateLength, String word, String text) {
+		if (text == null || word == null || word.length() == 0) {
+			return 0;
+		}
+		// Written without candidateStart + candidateLength to avoid int overflow.
+		if (candidateStart < 0 || candidateLength < 0 || candidateStart > text.length()
+				|| candidateLength > text.length() - candidateStart) {
+			return 0;
+		}
+
+		int candidateEnd = candidateStart + candidateLength;
+
+		// Last occurrence lying completely in front of the candidate
+		// (a negative fromIndex makes lastIndexOf return -1).
+		int before = text.lastIndexOf(word, candidateStart - word.length());
+		// First occurrence starting at or behind the end of the candidate.
+		int after = text.indexOf(word, candidateEnd);
+
+		if (before < 0 && after < 0) {
+			return 0;
+		}
+
+		// Both offsets are relative to candidateStart and use indices into text.
+		int offsetBefore = before - candidateStart;
+		int offsetAfter = after - candidateStart;
+
+		if (after < 0) {
+			return offsetBefore;
+		}
+		if (before < 0) {
+			return offsetAfter;
+		}
+		return (-offsetBefore < offsetAfter) ? offsetBefore : offsetAfter;
+	}
+
+}