Mercurial > hg > anteater
view src/de/mpiwg/anteater/places/ml/preprocessing/LocationFeatureCalculator.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.anteater.places.ml.preprocessing;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.mpiwg.anteater.ml.ITextParser;
import de.mpiwg.anteater.ml.preprocessing.FeatureCalculator;
import de.mpiwg.anteater.persons.APerson;
import de.mpiwg.anteater.places.Place;
import de.mpiwg.anteater.places.PlaceInformation;
import de.mpiwg.anteater.species.scientific.ScientificName;
import de.mpiwg.anteater.text.Paragraph;
import de.mpiwg.anteater.text.TextPart;

/**
 * Calculates numeric features for place candidates found in a text.
 *
 * <p>
 * Features are either derived from the candidate string itself (ratio of
 * numeric or capitalised parts, contained punctuation, ...) or from its
 * context (preceding word, surrounding brackets/commas, character distance to
 * certain keywords, scientific names or applicants).
 *
 * <p>
 * The species and applicant based features read the lists supplied through
 * {@link #setFoundSpecies(List)} and {@link #setFoundApplicant(List)}; while
 * those have not been set the corresponding features report <code>-1</code>
 * ("not found").
 */
public class LocationFeatureCalculator extends FeatureCalculator {

    // Keywords whose distance to a candidate is used as a feature.
    private static final String UNIVERSITY = "university";
    private static final String STUDY = "study";
    private static final String STUDIES = "studies";
    private static final String IN = " in ";
    private static final String AT = " at ";
    private static final String SURVEY = "survey";
    private static final String SPECIES = "species";

    /** Value returned by the distance features when nothing was found. */
    private static final int NOT_FOUND = -1;

    // Compiled once; String.matches would recompile them for every part.
    private static final Pattern STARTS_WITH_DIGIT = Pattern.compile("[0-9]{1}.*");
    private static final Pattern STARTS_WITH_UPPERCASE = Pattern.compile("[A-Z]{1}.*");
    private static final Pattern TWO_UPPERCASE_CHARS = Pattern.compile("[A-Z][A-Z]");

    private List<ScientificName> foundNames;
    private List<APerson> foundApplicants;
    private final TextPart text;
    private final Map<String, Integer> typeMap;

    /**
     * Creates a calculator for the given text.
     *
     * @param sentenceList
     *            passed through to {@link FeatureCalculator}
     * @param parser
     *            passed through to {@link FeatureCalculator}
     * @param text
     *            the text part the candidates were found in; must not be
     *            <code>null</code>
     */
    public LocationFeatureCalculator(List<String> sentenceList,
            ITextParser parser, TextPart text) {
        super(sentenceList, parser, text.getText());
        this.text = text;

        // Numeric code per place type; 0 is reserved for unknown types
        // (see getType).
        typeMap = new HashMap<String, Integer>();
        typeMap.put(Place.TOWN, 1);
        typeMap.put(Place.COUNTY, 2);
        typeMap.put(Place.STATE, 3);
        typeMap.put(Place.COUNTRY, 4);
        typeMap.put(Place.SUBURB, 5);
        typeMap.put(Place.POI, 6);
        typeMap.put(Place.ZIP, 7);
        typeMap.put(Place.OCEAN, 8);
    }

    /**
     * Returns the share of parts that start with an ASCII digit.
     *
     * @param placeParts
     *            the parts of the candidate
     * @return a value between 0 and 1; 0 for an empty array (instead of NaN)
     */
    public double getNumberWordRelation(String[] placeParts) {
        return getMatchingRatio(placeParts, STARTS_WITH_DIGIT);
    }

    /**
     * Returns the share of parts that start with an uppercase letter A-Z.
     *
     * @param placeParts
     *            the parts of the candidate
     * @return a value between 0 and 1; 0 for an empty array (instead of NaN)
     */
    public double getUppercasedWordsToAllRelation(String[] placeParts) {
        return getMatchingRatio(placeParts, STARTS_WITH_UPPERCASE);
    }

    /** Returns the fraction of parts fully matching the given pattern. */
    private static double getMatchingRatio(String[] parts, Pattern pattern) {
        if (parts.length == 0) {
            // avoid 0.0 / 0 == NaN leaking into the feature vector
            return 0.0;
        }
        int matching = 0;
        for (String part : parts) {
            if (pattern.matcher(part).matches()) {
                matching++;
            }
        }
        return 1.0 * matching / parts.length;
    }

    /**
     * Checks if one of the parts consists of exactly two uppercase letters
     * A-Z.
     *
     * @return 1 if such a part exists, 0 otherwise
     */
    public int contains2UppercaseCharacterWord(String[] placeParts) {
        for (String part : placeParts) {
            if (TWO_UPPERCASE_CHARS.matcher(part).matches()) {
                return 1;
            }
        }
        return 0;
    }

    /**
     * Checks if one of the parts is the word "university" (ignoring case and
     * surrounding whitespace).
     *
     * @return 1 if such a part exists, 0 otherwise
     */
    public int containsUniversity(String[] parts) {
        for (String part : parts) {
            // equalsIgnoreCase is locale independent; toLowerCase() with the
            // default locale fails e.g. for "UNIVERSITY" under a Turkish locale
            if (part.trim().equalsIgnoreCase(UNIVERSITY)) {
                return 1;
            }
        }
        return 0;
    }

    /**
     * Checks if the word "and" directly precedes the candidate in its
     * sentence.
     *
     * @return 1 if so, 0 otherwise or if no sentence is found for the
     *         candidate
     */
    public int isPreceededByAnd(PlaceInformation candidate) {
        return isPreceededByWord(candidate, "and");
    }

    /**
     * Checks if the word "the" directly precedes the candidate in its
     * sentence.
     *
     * @return 1 if so, 0 otherwise or if no sentence is found for the
     *         candidate
     */
    public int isPreceededByThe(PlaceInformation candidate) {
        return isPreceededByWord(candidate, "the");
    }

    /**
     * Returns 1 if the sentence text in front of the candidate ends with the
     * given word as a whole word (case-sensitive), 0 otherwise.
     */
    private int isPreceededByWord(PlaceInformation candidate, String word) {
        String sentence = getSentenceContainingCandidate(candidate.getStart());
        if (sentence == null) {
            return 0;
        }

        int offset = getStartOfSentenceContainingCandidiate(candidate.getStart());
        int end = candidate.getStart() - offset;
        if (end < 0 || end > sentence.length()) {
            // sentence and offset do not fit together; nothing to inspect
            return 0;
        }

        String before = sentence.substring(0, end).trim();
        if (!before.endsWith(word)) {
            return 0;
        }

        // Require a word boundary so that e.g. "island" or "England" is not
        // taken for "and" and "bathe" is not taken for "the".
        int boundary = before.length() - word.length() - 1;
        if (boundary >= 0 && Character.isLetterOrDigit(before.charAt(boundary))) {
            return 0;
        }
        return 1;
    }

    /**
     * Checks if the candidate appears directly enclosed in round brackets in
     * its sentence.
     *
     * @return 1 if so, 0 otherwise or if no sentence is found for the
     *         candidate
     */
    public int isSurroundedByBrackets(PlaceInformation candidate) {
        String sentence = getSentenceContainingCandidate(candidate.getStart());
        if (sentence == null) {
            return 0;
        }

        // Quote the whole reference: it is plain text and may contain any
        // regex metacharacter (".", "[", "*", ...), not only brackets.
        Pattern pattern = Pattern.compile("\\("
                + Pattern.quote(candidate.getReferenceInText()) + "\\)");
        Matcher matcher = pattern.matcher(sentence);
        return matcher.find() ? 1 : 0;
    }

    /**
     * Checks if the candidate appears between ", " and "," in its sentence.
     *
     * @return 1 if so, 0 otherwise or if no sentence is found for the
     *         candidate
     */
    public int isSurroundedByCommata(PlaceInformation candidate) {
        String sentence = getSentenceContainingCandidate(candidate.getStart());
        if (sentence == null) {
            return 0;
        }
        return sentence.contains(", " + candidate.getReferenceInText() + ",") ? 1 : 0;
    }

    /**
     * Returns the number of characters between the end of the closest
     * scientific name in front of the candidate (within the candidate's
     * paragraph) and the start of the candidate.
     *
     * @return the distance, or -1 if there is no such name, no species have
     *         been set or the candidate is not inside any paragraph
     */
    public int getCharsToLastSpeciesInParagraph(PlaceInformation candidate) {
        Paragraph paragraph = getParagraphOfCandidate(candidate);
        if (paragraph == null || foundNames == null) {
            return NOT_FOUND;
        }
        int paragraphStart = text.getPositionInTextFromParagraph(paragraph, 0);

        // find the name with the greatest start that is still before the
        // candidate
        ScientificName closestName = null;
        for (ScientificName name : foundNames) {
            // NOTE(review): the strict ">" skips a name starting exactly at
            // paragraphStart - confirm whether ">=" is intended.
            boolean inRange = name.getStart() < candidate.getStart()
                    && name.getStart() > paragraphStart;
            if (inRange && (closestName == null
                    || closestName.getStart() < name.getStart())) {
                closestName = name;
            }
        }

        if (closestName == null) {
            return NOT_FOUND;
        }
        return candidate.getStart()
                - (closestName.getStart() + closestName.getLength());
    }

    /**
     * Returns the number of characters between the end of the candidate and
     * the start of the closest scientific name following it within the
     * candidate's paragraph.
     *
     * @return the distance, or -1 if there is no such name, no species have
     *         been set or the candidate is not inside any paragraph
     */
    public int getCharsToNextSpeciesInParagraph(PlaceInformation candidate) {
        Paragraph paragraph = getParagraphOfCandidate(candidate);
        if (paragraph == null || foundNames == null) {
            return NOT_FOUND;
        }
        int paragraphStart = text.getPositionInTextFromParagraph(paragraph, 0);
        int paragraphEnd = paragraphStart + paragraph.getParagraphText().length();
        int candidateEnd = candidate.getStart() + candidate.getLength();

        // find the name with the smallest start that is still after the
        // candidate
        ScientificName closestName = null;
        for (ScientificName name : foundNames) {
            boolean inRange = name.getStart() > candidateEnd
                    && name.getStart() < paragraphEnd;
            if (inRange && (closestName == null
                    || closestName.getStart() > name.getStart())) {
                closestName = name;
            }
        }

        if (closestName == null) {
            return NOT_FOUND;
        }
        return closestName.getStart() - candidateEnd;
    }

    /**
     * Returns the number of characters between the end of the closest
     * applicant in front of the candidate (within the candidate's paragraph)
     * and the start of the candidate.
     *
     * @return the distance, or -1 if there is no such applicant, no
     *         applicants have been set or the candidate is not inside any
     *         paragraph
     */
    public int getCharsToLastApplicantInParagraph(PlaceInformation candidate) {
        Paragraph paragraph = getParagraphOfCandidate(candidate);
        if (paragraph == null || foundApplicants == null) {
            return NOT_FOUND;
        }
        int paragraphStart = text.getPositionInTextFromParagraph(paragraph, 0);

        // find the applicant with the greatest start that is still before the
        // candidate
        APerson closestApplicant = null;
        for (APerson person : foundApplicants) {
            // NOTE(review): the strict ">" skips an applicant starting exactly
            // at paragraphStart - confirm whether ">=" is intended.
            boolean inRange = person.getStart() < candidate.getStart()
                    && person.getStart() > paragraphStart;
            if (inRange && (closestApplicant == null
                    || closestApplicant.getStart() < person.getStart())) {
                closestApplicant = person;
            }
        }

        if (closestApplicant == null) {
            return NOT_FOUND;
        }
        return candidate.getStart()
                - (closestApplicant.getStart() + closestApplicant.getLength());
    }

    /**
     * Returns the offset between the candidate and the closest occurrence of
     * "study" in the candidate's paragraph, as computed by
     * <code>getOffsetToClosestWord</code>.
     *
     * @return the offset, or -1 if the candidate is not inside any paragraph
     */
    public int getCharToStudyInParagraph(PlaceInformation candidate) {
        return getOffsetToWordInParagraph(candidate, STUDY);
    }

    /**
     * Returns the offset between the candidate and the closest occurrence of
     * "studies" in the candidate's paragraph, as computed by
     * <code>getOffsetToClosestWord</code>.
     *
     * @return the offset, or -1 if the candidate is not inside any paragraph
     */
    public int getCharToStudiesInParagraph(PlaceInformation candidate) {
        return getOffsetToWordInParagraph(candidate, STUDIES);
    }

    /**
     * Returns the offset between the candidate and the closest occurrence of
     * "survey" in the candidate's sentence, as computed by
     * <code>getOffsetToClosestWord</code>.
     *
     * @return the offset, or -1 if no sentence or paragraph is found for the
     *         candidate
     */
    public int getCharToSurveyInSentence(PlaceInformation candidate) {
        return getOffsetToWordInSentence(candidate, SURVEY);
    }

    /**
     * Returns the offset between the candidate and the closest occurrence of
     * "species" in the candidate's sentence, as computed by
     * <code>getOffsetToClosestWord</code>.
     *
     * @return the offset, or -1 if no sentence or paragraph is found for the
     *         candidate
     */
    public int getCharToSpeciesInSentence(PlaceInformation candidate) {
        return getOffsetToWordInSentence(candidate, SPECIES);
    }

    /**
     * Checks if the candidate's reference in the text contains a comma.
     *
     * @return 1 if so, 0 otherwise
     */
    public int hasComma(PlaceInformation candidate) {
        return candidate.getReferenceInText().contains(",") ? 1 : 0;
    }

    /**
     * Checks if the candidate's reference in the text contains an opening or
     * closing round bracket.
     *
     * @return 1 if so, 0 otherwise
     */
    public int hasBracket(PlaceInformation candidate) {
        String reference = candidate.getReferenceInText();
        return (reference.contains(")") || reference.contains("(")) ? 1 : 0;
    }

    /**
     * Returns the offset between the candidate and the closest occurrence of
     * " in " in the candidate's sentence, as computed by
     * <code>getOffsetToClosestWord</code>.
     *
     * @return the offset, or -1 if no sentence or paragraph is found for the
     *         candidate
     */
    public int getCharToInInSentence(PlaceInformation candidate) {
        return getOffsetToWordInSentence(candidate, IN);
    }

    /**
     * Returns the offset between the candidate and the closest occurrence of
     * " at " in the candidate's sentence, as computed by
     * <code>getOffsetToClosestWord</code>.
     *
     * @return the offset, or -1 if no sentence or paragraph is found for the
     *         candidate
     */
    public int getCharToAtInSentence(PlaceInformation candidate) {
        return getOffsetToWordInSentence(candidate, AT);
    }

    /**
     * Looks up the given word in the candidate's paragraph, using the
     * candidate's position inside that paragraph.
     */
    private int getOffsetToWordInParagraph(PlaceInformation candidate, String word) {
        Paragraph paragraph = getParagraphOfCandidate(candidate);
        if (paragraph == null) {
            // NOTE(review): -1 follows the "not found" value of the
            // getCharsTo... methods; confirm it matches what
            // getOffsetToClosestWord reports for a missing word.
            return NOT_FOUND;
        }
        int posInPara = text.getPositionInParagraphFromText(candidate.getStart());
        return getOffsetToClosestWord(posInPara, candidate.getLength(), word,
                paragraph.getParagraphText());
    }

    /**
     * Looks up the given word in the candidate's sentence, using the
     * candidate's position inside that sentence.
     */
    private int getOffsetToWordInSentence(PlaceInformation candidate, String word) {
        String sentence = getSentenceContainingCandidate(candidate.getStart());
        Paragraph paragraph = getParagraphOfCandidate(candidate);
        if (sentence == null || paragraph == null) {
            // NOTE(review): -1 follows the "not found" value of the
            // getCharsTo... methods; confirm it matches what
            // getOffsetToClosestWord reports for a missing word.
            return NOT_FOUND;
        }
        return getOffsetToClosestWord(getPosInSentence(candidate, paragraph, sentence),
                candidate.getLength(), word, sentence);
    }

    /**
     * Returns the candidate's position relative to the first occurrence of
     * the sentence inside the paragraph text.
     */
    private int getPosInSentence(PlaceInformation candidate, Paragraph paragraph,
            String sentence) {
        // NOTE(review): indexOf yields -1 if the sentence is not found
        // verbatim in the paragraph and the first match if it occurs more
        // than once; both cases shift the resulting position.
        int indexOfSentence = paragraph.getParagraphText().indexOf(sentence);
        int posInPara = text.getPositionInParagraphFromText(candidate.getStart());
        return posInPara - indexOfSentence;
    }

    /**
     * Finds the paragraph whose character range contains the start of the
     * candidate.
     *
     * @return the paragraph, or <code>null</code> if no paragraph contains
     *         the candidate's start
     */
    private Paragraph getParagraphOfCandidate(PlaceInformation candidate) {
        int paragraphStart = 0;
        for (Paragraph para : text.getParagraphs()) {
            int paragraphLength = para.getParagraphText().length();
            if (candidate.getStart() >= paragraphStart
                    && candidate.getStart() < paragraphStart + paragraphLength) {
                return para;
            }
            // + 1: presumably one separator character between two
            // paragraphs - confirm against TextPart
            paragraphStart += paragraphLength + 1;
        }
        return null;
    }

    /**
     * Maps the type of the given place to its numeric code.
     *
     * @return the code registered for the type, or 0 if the type is unknown
     */
    public int getType(Place candidate) {
        Integer code = typeMap.get(candidate.getType());
        if (code == null) {
            return 0;
        }
        return code.intValue();
    }

    /**
     * @return the scientific names used by the species features; may be
     *         <code>null</code> if none have been set
     */
    public List<ScientificName> getFoundSpecies() {
        return foundNames;
    }

    /**
     * @param foundSpecies
     *            the scientific names to use for the species features
     */
    public void setFoundSpecies(List<ScientificName> foundSpecies) {
        this.foundNames = foundSpecies;
    }

    /**
     * @return the applicants used by the applicant features; may be
     *         <code>null</code> if none have been set
     */
    public List<APerson> getFoundApplicant() {
        return foundApplicants;
    }

    /**
     * @param foundApplicant
     *            the applicants to use for the applicant features
     */
    public void setFoundApplicant(List<APerson> foundApplicant) {
        this.foundApplicants = foundApplicant;
    }
}