view src/de/mpiwg/anteater/places/ml/preprocessing/LocationFeatureCalculator.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
line wrap: on
line source

package de.mpiwg.anteater.places.ml.preprocessing;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.mpiwg.anteater.ml.ITextParser;
import de.mpiwg.anteater.ml.preprocessing.FeatureCalculator;
import de.mpiwg.anteater.persons.APerson;
import de.mpiwg.anteater.places.Place;
import de.mpiwg.anteater.places.PlaceInformation;
import de.mpiwg.anteater.species.scientific.ScientificName;
import de.mpiwg.anteater.text.Paragraph;
import de.mpiwg.anteater.text.TextPart;

/**
 * Calculates numeric features for place candidates found in a text: features
 * of the candidate's own tokens (digits, capitalisation, keywords), of the
 * sentence around it (preceding words, brackets, commas, nearby keywords) and
 * of the paragraph around it (distance to species names, applicants and
 * keywords).
 * <p>
 * Boolean features are reported as <code>1</code>/<code>0</code>. Distance
 * features computed by this class report <code>-1</code> when nothing could
 * be measured.
 * <p>
 * The species and applicant lists have to be supplied through
 * {@link #setFoundSpecies(List)} and {@link #setFoundApplicant(List)} before
 * the corresponding distance features can find anything.
 */
public class LocationFeatureCalculator extends FeatureCalculator {

	// Keywords searched for in the context of a candidate. The padding blanks
	// around "in" and "at" are part of the search term.
	private static final String UNIVERSITY = "university";
	private static final String STUDY = "study";
	private static final String STUDIES = "studies";
	private static final String IN = " in ";
	private static final String AT = " at ";
	private static final String SURVEY = "survey";
	private static final String SPECIES = "species";

	/** Result of a distance feature when there is nothing to measure. */
	private static final int NOT_FOUND = -1;

	// Compiled once; String.matches() would recompile them for every token.
	private static final Pattern STARTS_WITH_DIGIT = Pattern
			.compile("[0-9]{1}.*");
	private static final Pattern STARTS_WITH_UPPERCASE = Pattern
			.compile("[A-Z]{1}.*");
	private static final Pattern TWO_UPPERCASE_LETTERS = Pattern
			.compile("[A-Z][A-Z]");

	/** Numeric code per place type; types not listed here map to 0. */
	private static final Map<String, Integer> TYPE_CODES = new HashMap<String, Integer>();
	static {
		TYPE_CODES.put(Place.TOWN, 1);
		TYPE_CODES.put(Place.COUNTY, 2);
		TYPE_CODES.put(Place.STATE, 3);
		TYPE_CODES.put(Place.COUNTRY, 4);
		TYPE_CODES.put(Place.SUBURB, 5);
		TYPE_CODES.put(Place.POI, 6);
		TYPE_CODES.put(Place.ZIP, 7);
		TYPE_CODES.put(Place.OCEAN, 8);
	}

	private List<ScientificName> foundNames;
	private List<APerson> foundApplicants;
	private final TextPart text;

	/**
	 * Creates a calculator for the given text part.
	 * 
	 * @param sentenceList
	 *            sentences, handed on to the superclass
	 * @param parser
	 *            text parser, handed on to the superclass
	 * @param text
	 *            text part the candidates were found in; its plain text is
	 *            handed on to the superclass
	 */
	public LocationFeatureCalculator(List<String> sentenceList,
			ITextParser parser, TextPart text) {
		super(sentenceList, parser, text.getText());
		this.text = text;
	}

	/**
	 * Returns the share of tokens that start with an ASCII digit.
	 * 
	 * @param placeParts
	 *            tokens of the place candidate
	 * @return a value between 0 and 1; 0 if there are no tokens
	 */
	public double getNumberWordRelation(String[] placeParts) {
		return shareOfMatchingParts(placeParts, STARTS_WITH_DIGIT);
	}

	/**
	 * Returns the share of tokens that start with an uppercase ASCII letter.
	 * 
	 * @param placeParts
	 *            tokens of the place candidate
	 * @return a value between 0 and 1; 0 if there are no tokens
	 */
	public double getUppercasedWordsToAllRelation(String[] placeParts) {
		return shareOfMatchingParts(placeParts, STARTS_WITH_UPPERCASE);
	}

	/**
	 * Returns the fraction of tokens that match the given pattern completely.
	 * An empty or missing token array yields 0 instead of NaN (0/0).
	 */
	private static double shareOfMatchingParts(String[] parts, Pattern pattern) {
		if (parts == null || parts.length == 0) {
			return 0.0;
		}

		int matching = 0;
		for (String part : parts) {
			if (part != null && pattern.matcher(part).matches()) {
				matching++;
			}
		}
		return (double) matching / parts.length;
	}

	/**
	 * Tells whether one of the tokens consists of exactly two uppercase ASCII
	 * letters.
	 * 
	 * @param placeParts
	 *            tokens of the place candidate
	 * @return 1 if such a token exists, 0 otherwise
	 */
	public int contains2UppercaseCharacterWord(String[] placeParts) {
		if (placeParts == null) {
			return 0;
		}
		for (String part : placeParts) {
			if (part != null && TWO_UPPERCASE_LETTERS.matcher(part).matches()) {
				return 1;
			}
		}
		return 0;
	}

	/**
	 * Tells whether one of the tokens is the word "university", ignoring case
	 * and surrounding whitespace.
	 * 
	 * @param parts
	 *            tokens of the place candidate
	 * @return 1 if such a token exists, 0 otherwise
	 */
	public int containsUniversity(String[] parts) {
		if (parts == null) {
			return 0;
		}
		for (String part : parts) {
			// equalsIgnoreCase does not depend on the default locale, unlike
			// toLowerCase() (e.g. "UNIVERSITY" under a Turkish locale).
			if (part != null && UNIVERSITY.equalsIgnoreCase(part.trim())) {
				return 1;
			}
		}
		return 0;
	}

	/**
	 * Tells whether the word "and" directly precedes the candidate in its
	 * sentence.
	 * 
	 * @param candidate
	 *            place candidate
	 * @return 1 if so, 0 otherwise (also if no sentence is found)
	 */
	public int isPreceededByAnd(PlaceInformation candidate) {
		return isPrecededByWord(candidate, "and");
	}

	/**
	 * Tells whether the word "the" directly precedes the candidate in its
	 * sentence.
	 * 
	 * @param candidate
	 *            place candidate
	 * @return 1 if so, 0 otherwise (also if no sentence is found)
	 */
	public int isPreceededByThe(PlaceInformation candidate) {
		return isPrecededByWord(candidate, "the");
	}

	/**
	 * Checks whether the sentence text in front of the candidate ends, apart
	 * from whitespace, with the given word. The comparison is case-sensitive
	 * and the word must not be the tail of a longer word ("Holland" does not
	 * count as "and").
	 */
	private int isPrecededByWord(PlaceInformation candidate, String word) {
		String sentence = getSentenceContainingCandidate(candidate.getStart());
		if (sentence == null) {
			return 0;
		}

		int sentenceStart = getStartOfSentenceContainingCandidiate(candidate
				.getStart());
		int posInSentence = candidate.getStart() - sentenceStart;
		// substring() would throw if the offsets do not line up
		if (posInSentence < 0 || posInSentence > sentence.length()) {
			return 0;
		}

		String before = sentence.substring(0, posInSentence).trim();
		if (!before.endsWith(word)) {
			return 0;
		}

		int charBeforeWord = before.length() - word.length() - 1;
		if (charBeforeWord >= 0
				&& Character.isLetterOrDigit(before.charAt(charBeforeWord))) {
			return 0;
		}
		return 1;
	}

	/**
	 * Tells whether the candidate's reference text appears directly enclosed
	 * in round brackets somewhere in its sentence.
	 * 
	 * @param candidate
	 *            place candidate
	 * @return 1 if "(reference)" occurs in the sentence, 0 otherwise
	 */
	public int isSurroundedByBrackets(PlaceInformation candidate) {
		String sentence = getSentenceContainingCandidate(candidate.getStart());
		String reference = candidate.getReferenceInText();
		if (sentence == null || reference == null) {
			return 0;
		}

		// Literal search: the reference may contain regex metacharacters.
		return sentence.contains("(" + reference + ")") ? 1 : 0;
	}

	/**
	 * Tells whether the candidate's reference text appears between a leading
	 * ", " and a trailing "," somewhere in its sentence.
	 * 
	 * @param candidate
	 *            place candidate
	 * @return 1 if ", reference," occurs in the sentence, 0 otherwise
	 */
	public int isSurroundedByCommata(PlaceInformation candidate) {
		String sentence = getSentenceContainingCandidate(candidate.getStart());
		String reference = candidate.getReferenceInText();
		if (sentence == null || reference == null) {
			return 0;
		}

		return sentence.contains(", " + reference + ",") ? 1 : 0;
	}

	/**
	 * Returns the number of characters between the end of the closest species
	 * name in front of the candidate and the start of the candidate. Only
	 * names that start after the start of the candidate's paragraph are
	 * considered.
	 * 
	 * @param candidate
	 *            place candidate
	 * @return the distance in characters, or -1 if there is no such name, no
	 *         species list has been set, or no paragraph contains the
	 *         candidate
	 */
	public int getCharsToLastSpeciesInParagraph(PlaceInformation candidate) {
		Paragraph paragraph = getParagraphOfCandidate(candidate);
		if (paragraph == null || foundNames == null) {
			return NOT_FOUND;
		}
		int paragraphStart = text.getPositionInTextFromParagraph(paragraph, 0);
		int candidateStart = candidate.getStart();

		// the qualifying name with the largest start is the closest one
		ScientificName closest = null;
		for (ScientificName name : foundNames) {
			int nameStart = name.getStart();
			// NOTE(review): a name starting exactly at the paragraph start is
			// skipped by the strict comparison - confirm this is intended.
			boolean eligible = nameStart < candidateStart
					&& nameStart > paragraphStart;
			if (eligible
					&& (closest == null || closest.getStart() < nameStart)) {
				closest = name;
			}
		}

		if (closest == null) {
			return NOT_FOUND;
		}
		return candidateStart - (closest.getStart() + closest.getLength());
	}

	/**
	 * Returns the number of characters between the end of the candidate and
	 * the start of the closest species name behind it. Only names that start
	 * before the end of the candidate's paragraph are considered.
	 * 
	 * @param candidate
	 *            place candidate
	 * @return the distance in characters, or -1 if there is no such name, no
	 *         species list has been set, or no paragraph contains the
	 *         candidate
	 */
	public int getCharsToNextSpeciesInParagraph(PlaceInformation candidate) {
		Paragraph paragraph = getParagraphOfCandidate(candidate);
		if (paragraph == null || foundNames == null) {
			return NOT_FOUND;
		}
		int paragraphStart = text.getPositionInTextFromParagraph(paragraph, 0);
		int paragraphEnd = paragraphStart
				+ paragraph.getParagraphText().length();
		int candidateEnd = candidate.getStart() + candidate.getLength();

		// the qualifying name with the smallest start is the closest one
		ScientificName closest = null;
		for (ScientificName name : foundNames) {
			int nameStart = name.getStart();
			boolean eligible = nameStart > candidateEnd
					&& nameStart < paragraphEnd;
			if (eligible
					&& (closest == null || closest.getStart() > nameStart)) {
				closest = name;
			}
		}

		if (closest == null) {
			return NOT_FOUND;
		}
		return closest.getStart() - candidateEnd;
	}

	/**
	 * Returns the number of characters between the end of the closest
	 * applicant mention in front of the candidate and the start of the
	 * candidate. Only mentions that start after the start of the candidate's
	 * paragraph are considered.
	 * 
	 * @param candidate
	 *            place candidate
	 * @return the distance in characters, or -1 if there is no such mention,
	 *         no applicant list has been set, or no paragraph contains the
	 *         candidate
	 */
	public int getCharsToLastApplicantInParagraph(PlaceInformation candidate) {
		Paragraph paragraph = getParagraphOfCandidate(candidate);
		if (paragraph == null || foundApplicants == null) {
			return NOT_FOUND;
		}
		int paragraphStart = text.getPositionInTextFromParagraph(paragraph, 0);
		int candidateStart = candidate.getStart();

		// the qualifying mention with the largest start is the closest one
		APerson closest = null;
		for (APerson person : foundApplicants) {
			int personStart = person.getStart();
			boolean eligible = personStart < candidateStart
					&& personStart > paragraphStart;
			if (eligible
					&& (closest == null || closest.getStart() < personStart)) {
				closest = person;
			}
		}

		if (closest == null) {
			return NOT_FOUND;
		}
		return candidateStart - (closest.getStart() + closest.getLength());
	}

	/**
	 * Returns the offset between the candidate and the closest "study" in the
	 * candidate's paragraph, as computed by the inherited
	 * <code>getOffsetToClosestWord</code>.
	 * 
	 * @param candidate
	 *            place candidate
	 * @return the offset, or -1 if no paragraph contains the candidate
	 */
	public int getCharToStudyInParagraph(PlaceInformation candidate) {
		return offsetToWordInParagraph(candidate, STUDY);
	}

	/**
	 * Returns the offset between the candidate and the closest "studies" in
	 * the candidate's paragraph, as computed by the inherited
	 * <code>getOffsetToClosestWord</code>.
	 * 
	 * @param candidate
	 *            place candidate
	 * @return the offset, or -1 if no paragraph contains the candidate
	 */
	public int getCharToStudiesInParagraph(PlaceInformation candidate) {
		return offsetToWordInParagraph(candidate, STUDIES);
	}

	/**
	 * Returns the offset between the candidate and the closest "survey" in
	 * the candidate's sentence, as computed by the inherited
	 * <code>getOffsetToClosestWord</code>.
	 * 
	 * @param candidate
	 *            place candidate
	 * @return the offset, or -1 if no sentence or paragraph contains the
	 *         candidate
	 */
	public int getCharToSurveyInSentence(PlaceInformation candidate) {
		return offsetToWordInSentence(candidate, SURVEY);
	}

	/**
	 * Returns the offset between the candidate and the closest "species" in
	 * the candidate's sentence, as computed by the inherited
	 * <code>getOffsetToClosestWord</code>.
	 * 
	 * @param candidate
	 *            place candidate
	 * @return the offset, or -1 if no sentence or paragraph contains the
	 *         candidate
	 */
	public int getCharToSpeciesInSentence(PlaceInformation candidate) {
		return offsetToWordInSentence(candidate, SPECIES);
	}

	/**
	 * Tells whether the candidate's reference text contains a comma.
	 * 
	 * @param candidate
	 *            place candidate
	 * @return 1 if so, 0 otherwise
	 */
	public int hasComma(PlaceInformation candidate) {
		String reference = candidate.getReferenceInText();
		return reference != null && reference.contains(",") ? 1 : 0;
	}

	/**
	 * Tells whether the candidate's reference text contains an opening or a
	 * closing round bracket.
	 * 
	 * @param candidate
	 *            place candidate
	 * @return 1 if so, 0 otherwise
	 */
	public int hasBracket(PlaceInformation candidate) {
		String reference = candidate.getReferenceInText();
		if (reference == null) {
			return 0;
		}
		return reference.contains(")") || reference.contains("(") ? 1 : 0;
	}

	/**
	 * Returns the offset between the candidate and the closest " in " (with
	 * surrounding blanks) in the candidate's sentence, as computed by the
	 * inherited <code>getOffsetToClosestWord</code>.
	 * 
	 * @param candidate
	 *            place candidate
	 * @return the offset, or -1 if no sentence or paragraph contains the
	 *         candidate
	 */
	public int getCharToInInSentence(PlaceInformation candidate) {
		return offsetToWordInSentence(candidate, IN);
	}

	/**
	 * Returns the offset between the candidate and the closest " at " (with
	 * surrounding blanks) in the candidate's sentence, as computed by the
	 * inherited <code>getOffsetToClosestWord</code>.
	 * 
	 * @param candidate
	 *            place candidate
	 * @return the offset, or -1 if no sentence or paragraph contains the
	 *         candidate
	 */
	public int getCharToAtInSentence(PlaceInformation candidate) {
		return offsetToWordInSentence(candidate, AT);
	}

	/**
	 * Looks for the word in the candidate's paragraph and returns the result
	 * of <code>getOffsetToClosestWord</code>, or -1 if the candidate lies in
	 * no paragraph.
	 */
	private int offsetToWordInParagraph(PlaceInformation candidate, String word) {
		Paragraph paragraph = getParagraphOfCandidate(candidate);
		if (paragraph == null) {
			return NOT_FOUND;
		}

		int posInParagraph = text.getPositionInParagraphFromText(candidate
				.getStart());
		return getOffsetToClosestWord(posInParagraph, candidate.getLength(),
				word, paragraph.getParagraphText());
	}

	/**
	 * Looks for the word in the candidate's sentence and returns the result
	 * of <code>getOffsetToClosestWord</code>, or -1 if the sentence or the
	 * paragraph of the candidate cannot be determined.
	 */
	private int offsetToWordInSentence(PlaceInformation candidate, String word) {
		String sentence = getSentenceContainingCandidate(candidate.getStart());
		Paragraph paragraph = getParagraphOfCandidate(candidate);
		if (sentence == null || paragraph == null) {
			return NOT_FOUND;
		}

		int posInSentence = getPosInSentence(candidate, paragraph, sentence);
		return getOffsetToClosestWord(posInSentence, candidate.getLength(),
				word, sentence);
	}

	/**
	 * Converts the candidate's start into a position relative to its
	 * sentence: position inside the paragraph minus the index of the sentence
	 * inside the paragraph text.
	 */
	private int getPosInSentence(PlaceInformation candidate,
			Paragraph paragraph, String sentence) {
		// NOTE(review): indexOf finds the first occurrence of the sentence
		// text and yields -1 if the paragraph does not contain it verbatim;
		// both cases give a wrong position - confirm whether they can occur.
		int sentenceIndex = paragraph.getParagraphText().indexOf(sentence);
		int posInParagraph = text.getPositionInParagraphFromText(candidate
				.getStart());
		return posInParagraph - sentenceIndex;
	}

	/**
	 * Finds the paragraph whose character range contains the start of the
	 * candidate. Paragraph ranges are accumulated from the paragraph text
	 * lengths, counting one extra character between consecutive paragraphs.
	 * 
	 * @return the paragraph, or <code>null</code> if the start lies in none
	 */
	private Paragraph getParagraphOfCandidate(PlaceInformation candidate) {
		int candidateStart = candidate.getStart();
		int paragraphStart = 0;

		for (Paragraph para : text.getParagraphs()) {
			int length = para.getParagraphText().length();
			if (candidateStart >= paragraphStart
					&& candidateStart < paragraphStart + length) {
				return para;
			}
			paragraphStart += length + 1;
		}
		return null;
	}

	/**
	 * Maps the type of a place to its numeric code.
	 * 
	 * @param candidate
	 *            place whose type is looked up
	 * @return 1 (town), 2 (county), 3 (state), 4 (country), 5 (suburb), 6
	 *         (POI), 7 (ZIP), 8 (ocean), or 0 for any other type
	 */
	public int getType(Place candidate) {
		Integer code = TYPE_CODES.get(candidate.getType());
		return code == null ? 0 : code.intValue();
	}

	/**
	 * @return the species names used by the species distance features; may be
	 *         <code>null</code> if never set
	 */
	public List<ScientificName> getFoundSpecies() {
		return foundNames;
	}

	/**
	 * @param foundSpecies
	 *            species names to use for the species distance features
	 */
	public void setFoundSpecies(List<ScientificName> foundSpecies) {
		this.foundNames = foundSpecies;
	}

	/**
	 * @return the applicant mentions used by the applicant distance feature;
	 *         may be <code>null</code> if never set
	 */
	public List<APerson> getFoundApplicant() {
		return foundApplicants;
	}

	/**
	 * @param foundApplicant
	 *            applicant mentions to use for the applicant distance feature
	 */
	public void setFoundApplicant(List<APerson> foundApplicant) {
		this.foundApplicants = foundApplicant;
	}
}