Mercurial > hg > anteater
diff src/de/mpiwg/anteater/places/ml/preprocessing/LocationFeatureCalculator.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/anteater/places/ml/preprocessing/LocationFeatureCalculator.java Fri Sep 14 10:30:43 2012 +0200 @@ -0,0 +1,307 @@ +package de.mpiwg.anteater.places.ml.preprocessing; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import de.mpiwg.anteater.ml.ITextParser; +import de.mpiwg.anteater.ml.preprocessing.FeatureCalculator; +import de.mpiwg.anteater.persons.APerson; +import de.mpiwg.anteater.places.Place; +import de.mpiwg.anteater.places.PlaceInformation; +import de.mpiwg.anteater.species.scientific.ScientificName; +import de.mpiwg.anteater.text.Paragraph; +import de.mpiwg.anteater.text.TextPart; + +public class LocationFeatureCalculator extends FeatureCalculator { + + private final String _university = "university"; + private final String _study = "study"; + private final String _studies = "studies"; + private final String _in = " in "; + private final String _at = " at "; + private final String _survey = "survey"; + private final String _species = "species"; + + private List<ScientificName> foundNames; + private List<APerson> foundApplicants; + private TextPart text; + private Map<String, Integer> typeMap; + + public LocationFeatureCalculator(List<String> sentenceList, + ITextParser parser, TextPart text) { + super(sentenceList, parser, text.getText()); + this.text = text; + + typeMap = new HashMap<String, Integer>(); + typeMap.put(Place.TOWN, 1); + typeMap.put(Place.COUNTY, 2); + typeMap.put(Place.STATE, 3); + typeMap.put(Place.COUNTRY, 4); + typeMap.put(Place.SUBURB, 5); + typeMap.put(Place.POI, 6); + typeMap.put(Place.ZIP, 7); + typeMap.put(Place.OCEAN, 8); + } + + public double getNumberWordRelation(String[] placeParts) { + int numbers = 0; + int words = placeParts.length; + + for (String part : placeParts) { + if (part.matches("[0-9]{1}.*")) + numbers++; + } + + return 1.0 * numbers / words; + } + + public double getUppercasedWordsToAllRelation(String[] placeParts) { + int uppercaseWords = 0; + int words = placeParts.length; + + for (String part : placeParts) { + if (part.matches("[A-Z]{1}.*")) + uppercaseWords++; + } + + return 1.0 * uppercaseWords / words; + } + + public int contains2UppercaseCharacterWord(String[] placeParts) { + for (String part : placeParts) { + if (part.matches("[A-Z][A-Z]")) + return 1; + } + return 0; + } + + public int containsUniversity(String[] parts) { + for (String part : parts) { + if (part.trim().toLowerCase().equals(_university)) + return 1; + } + return 0; + } + + public int isPreceededByAnd(PlaceInformation candidate) { + String sentence = getSentenceContainingCandidate(candidate.getStart()); + if (sentence == null) + return 0; + + int offset = getStartOfSentenceContainingCandidiate(candidate.getStart()); + + if (sentence.substring(0, candidate.getStart() - offset).trim() + .endsWith("and")) + return 1; + return 0; + } + + public int isPreceededByThe(PlaceInformation candidate) { + String sentence = getSentenceContainingCandidate(candidate.getStart()); + if (sentence == null) + return 0; + + int offset = getStartOfSentenceContainingCandidiate(candidate.getStart()); + + if (sentence.substring(0, candidate.getStart() - offset).trim() + .endsWith("the")) + return 1; + return 0; + } + + public int isSurroundedByBrackets(PlaceInformation candidate) { + String sentence = getSentenceContainingCandidate(candidate.getStart()); + if (sentence == null) + return 0; + + Pattern pattern = Pattern.compile("\\(" + + candidate.getReferenceInText().replace("(", "\\(").replace(")","\\)") + "\\)"); + Matcher matcher = pattern.matcher(sentence); + if (matcher.find()) + return 1; + return 0; + } + + public int isSurroundedByCommata(PlaceInformation candidate) { + String sentence = getSentenceContainingCandidate(candidate.getStart()); + if (sentence == null) + return 0; + + if (sentence.contains(", " + candidate.getReferenceInText() + ",")) + return 1; + return 0; + } + + public int getCharsToLastSpeciesInParagraph(PlaceInformation candidate) { + + Paragraph paragraph = getParagraphOfCandidate(candidate); + int textcounter = text.getPositionInTextFromParagraph(paragraph, 0); + + // find last species name + ScientificName closestName = null; + for (ScientificName name : foundNames) { + // if species is before candiddate + if (name.getStart() < candidate.getStart() + && name.getStart() > textcounter) { + // if it's closer than other last species replace lastName + if (closestName == null + || closestName.getStart() < name.getStart()) + closestName = name; + } + } + + if (closestName == null) + return -1; + + return candidate.getStart() - (closestName.getStart() + closestName.getLength()); + } + + public int getCharsToNextSpeciesInParagraph(PlaceInformation candidate) { + + Paragraph paragraph = getParagraphOfCandidate(candidate); + int textcounter = text.getPositionInTextFromParagraph(paragraph, 0); + + // find last species name + ScientificName closestName = null; + for (ScientificName name : foundNames) { + // if species is before candiddate + if (name.getStart() > candidate.getStart() + candidate.getLength() + && name.getStart() < textcounter + paragraph.getParagraphText().length()) { + // if it's closer than other last species replace lastName + if (closestName == null + || closestName.getStart() > name.getStart()) + closestName = name; + } + } + + if (closestName == null) + return -1; + + return closestName.getStart() - (candidate.getStart() + candidate.getLength()); + } + + public int getCharsToLastApplicantInParagraph(PlaceInformation candidate) { + + Paragraph paragraph = getParagraphOfCandidate(candidate); + int textcounter = text.getPositionInTextFromParagraph(paragraph, 0); + + // find last species name + APerson closestApplicant = null; + for (APerson person : foundApplicants) { + // if species is before candiddate + if (person.getStart() < candidate.getStart() + && person.getStart() > textcounter) { + // if it's closer than other last species replace lastName + if (closestApplicant == null + || closestApplicant.getStart() < person.getStart()) + closestApplicant = person; + } + } + + if (closestApplicant == null) + return -1; + + return candidate.getStart() - (closestApplicant.getStart() + closestApplicant.getLength()); + } + + public int getCharToStudyInParagraph(PlaceInformation candidate) { + Paragraph paragraph = getParagraphOfCandidate(candidate); + int posInPara = text.getPositionInParagraphFromText(candidate.getStart()); + + return getOffsetToClosestWord(posInPara, candidate.getLength(), _study, paragraph.getParagraphText()); + } + + public int getCharToStudiesInParagraph(PlaceInformation candidate) { + Paragraph paragraph = getParagraphOfCandidate(candidate); + int posInPara = text.getPositionInParagraphFromText(candidate.getStart()); + + return getOffsetToClosestWord(posInPara, candidate.getLength(), _studies, paragraph.getParagraphText()); + } + + public int getCharToSurveyInSentence(PlaceInformation candidate) { + String sentence = getSentenceContainingCandidate(candidate.getStart()); + + return getOffsetToClosestWord(getPosInSentence(candidate), candidate.getLength(), _survey, sentence); + } + + public int getCharToSpeciesInSentence(PlaceInformation candidate) { + String sentence = getSentenceContainingCandidate(candidate.getStart()); + + return getOffsetToClosestWord(getPosInSentence(candidate), candidate.getLength(), _species, sentence); + } + + public int hasComma(PlaceInformation candidate) { + if (candidate.getReferenceInText().contains(",")) + return 1; + return 0; + } + + public int hasBracket(PlaceInformation candidate) { + if (candidate.getReferenceInText().contains(")") || candidate.getReferenceInText().contains("(")) + return 1; + return 0; + } + + public int getCharToInInSentence(PlaceInformation candidate) { + String sentence = getSentenceContainingCandidate(candidate.getStart()); + + return getOffsetToClosestWord(getPosInSentence(candidate), candidate.getLength(), _in, sentence); + } + + public int getCharToAtInSentence(PlaceInformation candidate) { + String sentence = getSentenceContainingCandidate(candidate.getStart()); + + return getOffsetToClosestWord(getPosInSentence(candidate), candidate.getLength(), _at, sentence); + } + + private int getPosInSentence(PlaceInformation candidate) { + Paragraph paragraph = getParagraphOfCandidate(candidate); + String sentence = getSentenceContainingCandidate(candidate.getStart()); + int indexOfSentence = paragraph.getParagraphText().indexOf(sentence); + int posInPara = text.getPositionInParagraphFromText(candidate.getStart()); + return posInPara - indexOfSentence; + } + + private Paragraph getParagraphOfCandidate(PlaceInformation candidate) { + int textcounter = 0; + Paragraph paragraph = null; + for (Paragraph para : text.getParagraphs()) { + if (candidate.getStart() >= textcounter + && candidate.getStart() < textcounter + + para.getParagraphText().length()) { + paragraph = para; + break; + } + textcounter += para.getParagraphText().length() + 1; + } + return paragraph; + } + + public int getType(Place candidate) { + String type = candidate.getType(); + + if (typeMap.get(type) == null) + return 0; + + return typeMap.get(type); + } + + public List<ScientificName> getFoundSpecies() { + return foundNames; + } + + public void setFoundSpecies(List<ScientificName> foundSpecies) { + this.foundNames = foundSpecies; + } + + public List<APerson> getFoundApplicant() { + return foundApplicants; + } + + public void setFoundApplicant(List<APerson> foundApplicant) { + this.foundApplicants = foundApplicant; + } +}