view src/de/mpiwg/anteater/ml/preprocessing/FeatureCalculator.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
line wrap: on
line source

package de.mpiwg.anteater.ml.preprocessing;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import de.mpiwg.anteater.ml.ITextParser;

/**
 * Computes simple position- and keyword-based features for a candidate
 * (a span identified by its character index) inside a text that has been
 * split into sentences.
 *
 * The sentences are indexed by their start position in the text so that the
 * sentence surrounding a given character index can be looked up.
 */
public class FeatureCalculator {
	
	// Keyword literals; they are not referenced inside this class
	// (presumably passed to getSentenceContainsKeyword by callers/subclasses - TODO confirm).
	public final static String KEYWORD_ISSUED = "issued";
	public final static String KEYWORD_APPLIED = "applied";
	public final static String KEYWORD_PERMIT = "permit";
	public final static String KEYWORD_COMMENT = "comment";
	public final static String KEYWORD_APPLICANT = "applicant";
	
	/** Maps the start index of a sentence within the text to the sentence itself. */
	protected Map<Integer, String> sentences;
	
	/** Parser handed in by the creator; stored here but not used by this class itself. */
	protected ITextParser parser;
	

	/**
	 * Creates a calculator and builds the sentence index.
	 *
	 * Note that {@link #init(List, String)} runs before {@code parser} is
	 * assigned, so an overriding {@code init} must not rely on the parser.
	 *
	 * @param sentenceList the sentences of {@code text}, in the order they occur
	 * @param parser       parser kept for use by subclasses
	 * @param text         the full text the sentences were taken from
	 */
	public FeatureCalculator(List<String> sentenceList, ITextParser parser, String text) {
		init(sentenceList, text);
		this.parser = parser;
	}
	
	/**
	 * Builds {@link #sentences} by locating every sentence in {@code text}.
	 *
	 * Each sentence is searched for starting right behind the previous one.
	 * If a sentence cannot be found (or the search position is already past
	 * the end of the text) the current search position is used as its start
	 * index, so a missing sentence never produces a negative or shifted index.
	 *
	 * @param sentenceList sentences in order of occurrence; {@code null} is
	 *                     treated as an empty list, {@code null} entries are skipped
	 * @param text         the text containing the sentences; {@code null} is
	 *                     treated as an empty text
	 */
	protected void init(List<String> sentenceList, String text) {
		sentences = new HashMap<Integer, String>();
		if (sentenceList == null) {
			return;
		}
		
		String source = (text == null) ? "" : text;
		int searchFrom = 0;
		
		for (String sent : sentenceList) {
			if (sent == null) {
				continue;
			}
			
			int found = -1;
			if (searchFrom < source.length()) {
				found = source.indexOf(sent, searchFrom);
			}
			
			// indexOf yields -1 on a miss; fall back to the search position
			// instead of letting the -1 leak into the stored index.
			int index = (found > -1) ? found : searchFrom;
			sentences.put(index, sent);
			
			// continue one character behind the sentence (skips the separator)
			searchFrom = index + sent.length() + 1;
		}
	}

	
	
	/**
	 * Tells whether the sentence around the candidate contains a keyword.
	 *
	 * @param keyword          the keyword to look for (case-sensitive substring match)
	 * @param indexOfCandidate character index of the candidate in the text
	 * @return 1 if a sentence contains the index and that sentence contains
	 *         the keyword, 0 otherwise (also for a {@code null} keyword)
	 */
	protected int getSentenceContainsKeyword(String keyword, int indexOfCandidate) {
		if (keyword == null) {
			return 0;
		}
		
		String sentence = getSentenceContainingCandidate(indexOfCandidate);
		if (sentence != null && sentence.contains(keyword)) {
			return 1;
		}
		return 0;
	}
	
	/**
	 * Returns the sentence whose character range contains the given index.
	 *
	 * @param indexOfCandidate character index of the candidate in the text
	 * @return the matching sentence, or {@code null} if no sentence covers the index
	 */
	protected String getSentenceContainingCandidate(int indexOfCandidate) {
		Map.Entry<Integer, String> entry = findSentenceEntry(indexOfCandidate);
		return (entry == null) ? null : entry.getValue();
	}
	
	/**
	 * Returns the start index of the sentence whose character range contains
	 * the given index.
	 *
	 * @param indexOfCandidate character index of the candidate in the text
	 * @return the start index of the matching sentence, or -1 if no sentence
	 *         covers the index
	 */
	protected int getStartOfSentenceContainingCandidiate(int indexOfCandidate) {
		Map.Entry<Integer, String> entry = findSentenceEntry(indexOfCandidate);
		return (entry == null) ? -1 : entry.getKey().intValue();
	}
	
	/**
	 * Finds the index entry of the sentence covering the given character index,
	 * i.e. the first entry with {@code start <= index < start + length}.
	 *
	 * @return the entry (start index and sentence), or {@code null} if there is none
	 */
	private Map.Entry<Integer, String> findSentenceEntry(int indexOfCandidate) {
		if (sentences == null) {
			return null;
		}
		
		for (Map.Entry<Integer, String> entry : sentences.entrySet()) {
			Integer start = entry.getKey();
			String sentence = entry.getValue();
			if (start == null || sentence == null) {
				continue;
			}
			
			int end = start.intValue() + sentence.length();
			if (indexOfCandidate >= start.intValue() && indexOfCandidate < end) {
				return entry;
			}
		}
		return null;
	}
	
	/**
	 * Computes the offset from the start of the candidate to the closest
	 * occurrence of {@code word} that lies completely outside the candidate.
	 *
	 * Occurrences in front of the candidate yield a negative offset
	 * (start of the word minus start of the candidate), occurrences behind
	 * it a positive one. If the word occurs on both sides, the occurrence
	 * with the smaller absolute offset wins; on a tie the one behind the
	 * candidate is returned.
	 *
	 * @param candidateStart  start index of the candidate in {@code text}
	 * @param candidateLength length of the candidate in characters
	 * @param word            the word to search for
	 * @param text            the text to search in
	 * @return the signed offset to the closest occurrence, or 0 if the word
	 *         does not occur outside the candidate, if the candidate range
	 *         is invalid, or if {@code word}/{@code text} is {@code null} or
	 *         {@code word} is empty
	 */
	protected int getOffsetToClosestWord(int candidateStart, int candidateLength, String word, String text) {
		if (text == null || word == null || word.length() == 0) {
			return 0;
		}
		// written as a subtraction so that start + length cannot overflow
		if (candidateStart < 0 || candidateLength < 0 || candidateStart > text.length() - candidateLength) {
			return 0;
		}
		
		int candidateEnd = candidateStart + candidateLength;
		
		// last occurrence that ends at or before the candidate start
		// (a negative fromIndex makes lastIndexOf return -1)
		int positionBefore = text.lastIndexOf(word, candidateStart - word.length());
		// first occurrence that starts at or after the candidate end
		int positionAfter = text.indexOf(word, candidateEnd);
		
		boolean foundBefore = positionBefore > -1;
		boolean foundAfter = positionAfter > -1;
		
		if (!foundBefore && !foundAfter) {
			return 0;
		}
		
		// both offsets are relative to the candidate start
		int offsetBefore = positionBefore - candidateStart;
		int offsetAfter = positionAfter - candidateStart;
		
		if (!foundAfter) {
			return offsetBefore;
		}
		if (!foundBefore) {
			return offsetAfter;
		}
		
		if (-offsetBefore < offsetAfter) {
			return offsetBefore;
		}
		return offsetAfter;
	}

}