diff src/de/mpiwg/anteater/places/ml/preprocessing/LocationFeatureCalculator.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/anteater/places/ml/preprocessing/LocationFeatureCalculator.java	Fri Sep 14 10:30:43 2012 +0200
@@ -0,0 +1,307 @@
+package de.mpiwg.anteater.places.ml.preprocessing;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import de.mpiwg.anteater.ml.ITextParser;
+import de.mpiwg.anteater.ml.preprocessing.FeatureCalculator;
+import de.mpiwg.anteater.persons.APerson;
+import de.mpiwg.anteater.places.Place;
+import de.mpiwg.anteater.places.PlaceInformation;
+import de.mpiwg.anteater.species.scientific.ScientificName;
+import de.mpiwg.anteater.text.Paragraph;
+import de.mpiwg.anteater.text.TextPart;
+
+/**
+ * Computes numeric features for place candidates: properties of the candidate
+ * string itself (digits, capitalisation, punctuation) and of its context in
+ * the sentence and paragraph (preceding words, surrounding brackets or commas,
+ * character distances to keywords, species names and applicants).
+ *
+ * Boolean-like features are reported as 1 (true) or 0 (false). Distance
+ * features computed by this class report -1 when nothing suitable was found.
+ */
+public class LocationFeatureCalculator extends FeatureCalculator {
+
+	/** Result of the distance features of this class when no match exists. */
+	private static final int NOT_FOUND = -1;
+
+	private static final String UNIVERSITY = "university";
+	private static final String STUDY = "study";
+	private static final String STUDIES = "studies";
+	// The blanks are part of the search term; presumably so that "in"/"at"
+	// inside longer words is not matched.
+	private static final String IN = " in ";
+	private static final String AT = " at ";
+	private static final String SURVEY = "survey";
+	private static final String SPECIES = "species";
+
+	// Compiled once instead of on every String.matches() call.
+	private static final Pattern STARTS_WITH_DIGIT = Pattern.compile("[0-9]{1}.*");
+	private static final Pattern STARTS_WITH_UPPERCASE = Pattern.compile("[A-Z]{1}.*");
+	private static final Pattern TWO_UPPERCASE_LETTERS = Pattern.compile("[A-Z][A-Z]");
+
+	/** Numeric id per place type; types missing here are reported as 0. */
+	private static final Map<String, Integer> TYPE_IDS = new HashMap<String, Integer>();
+	static {
+		TYPE_IDS.put(Place.TOWN, 1);
+		TYPE_IDS.put(Place.COUNTY, 2);
+		TYPE_IDS.put(Place.STATE, 3);
+		TYPE_IDS.put(Place.COUNTRY, 4);
+		TYPE_IDS.put(Place.SUBURB, 5);
+		TYPE_IDS.put(Place.POI, 6);
+		TYPE_IDS.put(Place.ZIP, 7);
+		TYPE_IDS.put(Place.OCEAN, 8);
+	}
+
+	private List<ScientificName> foundNames;
+	private List<APerson> foundApplicants;
+	private final TextPart text;
+
+	/**
+	 * @param sentenceList sentences handed on to the superclass
+	 * @param parser parser handed on to the superclass
+	 * @param text text part the candidates belong to; its plain text is handed
+	 *            on to the superclass
+	 */
+	public LocationFeatureCalculator(List<String> sentenceList,
+			ITextParser parser, TextPart text) {
+		super(sentenceList, parser, text.getText());
+		this.text = text;
+	}
+
+	/**
+	 * Returns the fraction of parts that start with an ASCII digit.
+	 *
+	 * @param placeParts the words of a place name
+	 * @return value between 0 and 1; 0 for a null or empty array
+	 */
+	public double getNumberWordRelation(String[] placeParts) {
+		return getShareOfMatchingParts(placeParts, STARTS_WITH_DIGIT);
+	}
+
+	/**
+	 * Returns the fraction of parts that start with an ASCII uppercase letter.
+	 *
+	 * @param placeParts the words of a place name
+	 * @return value between 0 and 1; 0 for a null or empty array
+	 */
+	public double getUppercasedWordsToAllRelation(String[] placeParts) {
+		return getShareOfMatchingParts(placeParts, STARTS_WITH_UPPERCASE);
+	}
+
+	/** Share of parts fully matched by the pattern; 0 if there are no parts. */
+	private double getShareOfMatchingParts(String[] placeParts, Pattern pattern) {
+		// Without this guard 0.0 / 0 would yield NaN as feature value.
+		if (placeParts == null || placeParts.length == 0)
+			return 0.0;
+
+		int matching = 0;
+		for (String part : placeParts) {
+			if (pattern.matcher(part).matches())
+				matching++;
+		}
+		return 1.0 * matching / placeParts.length;
+	}
+
+	/**
+	 * @param placeParts the words of a place name
+	 * @return 1 if any part consists of exactly two ASCII uppercase letters,
+	 *         otherwise 0
+	 */
+	public int contains2UppercaseCharacterWord(String[] placeParts) {
+		for (String part : placeParts) {
+			if (TWO_UPPERCASE_LETTERS.matcher(part).matches())
+				return 1;
+		}
+		return 0;
+	}
+
+	/**
+	 * @param parts the words of a place name
+	 * @return 1 if any trimmed part equals "university" ignoring case,
+	 *         otherwise 0
+	 */
+	public int containsUniversity(String[] parts) {
+		for (String part : parts) {
+			// equalsIgnoreCase does not depend on the default locale, unlike
+			// toLowerCase() (e.g. dotless i in a Turkish locale).
+			if (UNIVERSITY.equalsIgnoreCase(part.trim()))
+				return 1;
+		}
+		return 0;
+	}
+
+	/**
+	 * @return 1 if the last word before the candidate in its sentence is
+	 *         "and" (case-sensitive), otherwise 0
+	 */
+	public int isPreceededByAnd(PlaceInformation candidate) {
+		return isPrecededByWord(candidate, "and");
+	}
+
+	/**
+	 * @return 1 if the last word before the candidate in its sentence is
+	 *         "the" (case-sensitive), otherwise 0
+	 */
+	public int isPreceededByThe(PlaceInformation candidate) {
+		return isPrecededByWord(candidate, "the");
+	}
+
+	/**
+	 * Checks whether the trimmed sentence text in front of the candidate ends
+	 * with the given word as a whole word.
+	 */
+	private int isPrecededByWord(PlaceInformation candidate, String word) {
+		String sentence = getSentenceContainingCandidate(candidate.getStart());
+		if (sentence == null)
+			return 0;
+
+		int sentenceStart = getStartOfSentenceContainingCandidiate(candidate
+				.getStart());
+		int posInSentence = candidate.getStart() - sentenceStart;
+		// Inconsistent offsets would make substring() throw.
+		if (posInSentence < 0 || posInSentence > sentence.length())
+			return 0;
+
+		String before = sentence.substring(0, posInSentence).trim();
+		if (!before.endsWith(word))
+			return 0;
+
+		// A plain endsWith() also accepts longer words such as "island" for
+		// "and"; require a non-word character (or the start) before the word.
+		int wordStart = before.length() - word.length();
+		if (wordStart > 0
+				&& Character.isLetterOrDigit(before.charAt(wordStart - 1)))
+			return 0;
+		return 1;
+	}
+
+	/**
+	 * @return 1 if the sentence of the candidate contains the candidate's
+	 *         reference text directly enclosed in round brackets, otherwise 0
+	 */
+	public int isSurroundedByBrackets(PlaceInformation candidate) {
+		String sentence = getSentenceContainingCandidate(candidate.getStart());
+		if (sentence == null)
+			return 0;
+
+		// Literal search: the reference text may contain characters that are
+		// special in regular expressions (".", "[", "+", ...).
+		if (sentence.contains("(" + candidate.getReferenceInText() + ")"))
+			return 1;
+		return 0;
+	}
+
+	/**
+	 * @return 1 if the sentence of the candidate contains the candidate's
+	 *         reference text preceded by ", " and followed by ",", otherwise 0
+	 */
+	public int isSurroundedByCommata(PlaceInformation candidate) {
+		String sentence = getSentenceContainingCandidate(candidate.getStart());
+		if (sentence == null)
+			return 0;
+
+		if (sentence.contains(", " + candidate.getReferenceInText() + ","))
+			return 1;
+		return 0;
+	}
+
+	/**
+	 * Distance from the end of the closest species name that starts before the
+	 * candidate, within the candidate's paragraph, to the start of the
+	 * candidate.
+	 *
+	 * @return number of characters, or -1 if there is no such species name, no
+	 *         paragraph contains the candidate, or no species were set
+	 */
+	public int getCharsToLastSpeciesInParagraph(PlaceInformation candidate) {
+		Paragraph paragraph = getParagraphOfCandidate(candidate);
+		if (paragraph == null || foundNames == null)
+			return NOT_FOUND;
+
+		int paragraphStart = text.getPositionInTextFromParagraph(paragraph, 0);
+
+		// find the species name with the greatest start before the candidate
+		ScientificName closestName = null;
+		for (ScientificName name : foundNames) {
+			int nameStart = name.getStart();
+			// ">=" so that a name at the very start of the paragraph counts
+			boolean inRange = nameStart < candidate.getStart()
+					&& nameStart >= paragraphStart;
+			if (inRange
+					&& (closestName == null || closestName.getStart() < nameStart))
+				closestName = name;
+		}
+
+		if (closestName == null)
+			return NOT_FOUND;
+
+		return candidate.getStart()
+				- (closestName.getStart() + closestName.getLength());
+	}
+
+	/**
+	 * Distance from the end of the candidate to the start of the closest
+	 * species name that starts after the candidate, within the candidate's
+	 * paragraph.
+	 *
+	 * @return number of characters, or -1 if there is no such species name, no
+	 *         paragraph contains the candidate, or no species were set
+	 */
+	public int getCharsToNextSpeciesInParagraph(PlaceInformation candidate) {
+		Paragraph paragraph = getParagraphOfCandidate(candidate);
+		if (paragraph == null || foundNames == null)
+			return NOT_FOUND;
+
+		int paragraphStart = text.getPositionInTextFromParagraph(paragraph, 0);
+		int paragraphEnd = paragraphStart
+				+ paragraph.getParagraphText().length();
+		int candidateEnd = candidate.getStart() + candidate.getLength();
+
+		// find the species name with the smallest start after the candidate
+		ScientificName closestName = null;
+		for (ScientificName name : foundNames) {
+			int nameStart = name.getStart();
+			boolean inRange = nameStart > candidateEnd
+					&& nameStart < paragraphEnd;
+			if (inRange
+					&& (closestName == null || closestName.getStart() > nameStart))
+				closestName = name;
+		}
+
+		if (closestName == null)
+			return NOT_FOUND;
+
+		return closestName.getStart() - candidateEnd;
+	}
+
+	/**
+	 * Distance from the end of the closest applicant that starts before the
+	 * candidate, within the candidate's paragraph, to the start of the
+	 * candidate.
+	 *
+	 * @return number of characters, or -1 if there is no such applicant, no
+	 *         paragraph contains the candidate, or no applicants were set
+	 */
+	public int getCharsToLastApplicantInParagraph(PlaceInformation candidate) {
+		Paragraph paragraph = getParagraphOfCandidate(candidate);
+		if (paragraph == null || foundApplicants == null)
+			return NOT_FOUND;
+
+		int paragraphStart = text.getPositionInTextFromParagraph(paragraph, 0);
+
+		// find the applicant with the greatest start before the candidate
+		APerson closestApplicant = null;
+		for (APerson person : foundApplicants) {
+			int personStart = person.getStart();
+			// ">=" so that an applicant at the very start of the paragraph counts
+			boolean inRange = personStart < candidate.getStart()
+					&& personStart >= paragraphStart;
+			if (inRange
+					&& (closestApplicant == null || closestApplicant
+							.getStart() < personStart))
+				closestApplicant = person;
+		}
+
+		if (closestApplicant == null)
+			return NOT_FOUND;
+
+		return candidate.getStart()
+				- (closestApplicant.getStart() + closestApplicant.getLength());
+	}
+
+	/**
+	 * @return offset between the candidate and the closest "study" in its
+	 *         paragraph as computed by getOffsetToClosestWord, or -1 if no
+	 *         paragraph contains the candidate
+	 */
+	public int getCharToStudyInParagraph(PlaceInformation candidate) {
+		return getOffsetToWordInParagraph(candidate, STUDY);
+	}
+
+	/**
+	 * @return offset between the candidate and the closest "studies" in its
+	 *         paragraph as computed by getOffsetToClosestWord, or -1 if no
+	 *         paragraph contains the candidate
+	 */
+	public int getCharToStudiesInParagraph(PlaceInformation candidate) {
+		return getOffsetToWordInParagraph(candidate, STUDIES);
+	}
+
+	/**
+	 * @return offset between the candidate and the closest "survey" in its
+	 *         sentence as computed by getOffsetToClosestWord, or -1 if the
+	 *         sentence or paragraph of the candidate cannot be determined
+	 */
+	public int getCharToSurveyInSentence(PlaceInformation candidate) {
+		return getOffsetToWordInSentence(candidate, SURVEY);
+	}
+
+	/**
+	 * @return offset between the candidate and the closest "species" in its
+	 *         sentence as computed by getOffsetToClosestWord, or -1 if the
+	 *         sentence or paragraph of the candidate cannot be determined
+	 */
+	public int getCharToSpeciesInSentence(PlaceInformation candidate) {
+		return getOffsetToWordInSentence(candidate, SPECIES);
+	}
+
+	/**
+	 * @return 1 if the candidate's reference text contains a comma, otherwise 0
+	 */
+	public int hasComma(PlaceInformation candidate) {
+		return candidate.getReferenceInText().contains(",") ? 1 : 0;
+	}
+
+	/**
+	 * @return 1 if the candidate's reference text contains "(" or ")",
+	 *         otherwise 0
+	 */
+	public int hasBracket(PlaceInformation candidate) {
+		String reference = candidate.getReferenceInText();
+		return (reference.contains(")") || reference.contains("(")) ? 1 : 0;
+	}
+
+	/**
+	 * @return offset between the candidate and the closest " in " in its
+	 *         sentence as computed by getOffsetToClosestWord, or -1 if the
+	 *         sentence or paragraph of the candidate cannot be determined
+	 */
+	public int getCharToInInSentence(PlaceInformation candidate) {
+		return getOffsetToWordInSentence(candidate, IN);
+	}
+
+	/**
+	 * @return offset between the candidate and the closest " at " in its
+	 *         sentence as computed by getOffsetToClosestWord, or -1 if the
+	 *         sentence or paragraph of the candidate cannot be determined
+	 */
+	public int getCharToAtInSentence(PlaceInformation candidate) {
+		return getOffsetToWordInSentence(candidate, AT);
+	}
+
+	/** Offset to the closest occurrence of word in the candidate's paragraph. */
+	private int getOffsetToWordInParagraph(PlaceInformation candidate,
+			String word) {
+		Paragraph paragraph = getParagraphOfCandidate(candidate);
+		// NOTE(review): -1 mirrors the other distance features of this class;
+		// confirm it matches the "not found" value of getOffsetToClosestWord.
+		if (paragraph == null)
+			return NOT_FOUND;
+
+		int posInPara = text.getPositionInParagraphFromText(candidate
+				.getStart());
+		return getOffsetToClosestWord(posInPara, candidate.getLength(), word,
+				paragraph.getParagraphText());
+	}
+
+	/** Offset to the closest occurrence of word in the candidate's sentence. */
+	private int getOffsetToWordInSentence(PlaceInformation candidate,
+			String word) {
+		String sentence = getSentenceContainingCandidate(candidate.getStart());
+		Paragraph paragraph = getParagraphOfCandidate(candidate);
+		// NOTE(review): -1 mirrors the other distance features of this class;
+		// confirm it matches the "not found" value of getOffsetToClosestWord.
+		if (sentence == null || paragraph == null)
+			return NOT_FOUND;
+
+		int posInSentence = getPosInSentence(candidate, paragraph, sentence);
+		return getOffsetToClosestWord(posInSentence, candidate.getLength(),
+				word, sentence);
+	}
+
+	/**
+	 * Position of the candidate relative to the first occurrence of the
+	 * sentence in the paragraph text.
+	 */
+	private int getPosInSentence(PlaceInformation candidate,
+			Paragraph paragraph, String sentence) {
+		// NOTE(review): indexOf yields -1 if the sentence does not occur
+		// verbatim in the paragraph text; the result is then off.
+		int indexOfSentence = paragraph.getParagraphText().indexOf(sentence);
+		int posInPara = text.getPositionInParagraphFromText(candidate
+				.getStart());
+		return posInPara - indexOfSentence;
+	}
+
+	/**
+	 * Finds the paragraph whose character range contains the start of the
+	 * candidate.
+	 *
+	 * @return the paragraph, or null if no paragraph range contains the start
+	 */
+	private Paragraph getParagraphOfCandidate(PlaceInformation candidate) {
+		int paragraphStart = 0;
+		for (Paragraph para : text.getParagraphs()) {
+			int paragraphLength = para.getParagraphText().length();
+			if (candidate.getStart() >= paragraphStart
+					&& candidate.getStart() < paragraphStart + paragraphLength)
+				return para;
+			// assumes exactly one separator character between paragraphs
+			// -- TODO confirm against TextPart
+			paragraphStart += paragraphLength + 1;
+		}
+		return null;
+	}
+
+	/**
+	 * @return the numeric id registered for the candidate's place type, or 0
+	 *         if the type is not registered
+	 */
+	public int getType(Place candidate) {
+		Integer typeId = TYPE_IDS.get(candidate.getType());
+		return typeId == null ? 0 : typeId.intValue();
+	}
+
+	/** @return the species names used by the species distance features */
+	public List<ScientificName> getFoundSpecies() {
+		return foundNames;
+	}
+
+	/** @param foundSpecies species names used by the species distance features */
+	public void setFoundSpecies(List<ScientificName> foundSpecies) {
+		this.foundNames = foundSpecies;
+	}
+
+	/** @return the applicants used by the applicant distance feature */
+	public List<APerson> getFoundApplicant() {
+		return foundApplicants;
+	}
+
+	/** @param foundApplicant applicants used by the applicant distance feature */
+	public void setFoundApplicant(List<APerson> foundApplicant) {
+		this.foundApplicants = foundApplicant;
+	}
+}