Mercurial > hg > anteater
diff src/de/mpiwg/anteater/places/ml/preprocessing/LocationDataCreator.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line diff
// --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/anteater/places/ml/preprocessing/LocationDataCreator.java Fri Sep 14 10:30:43 2012 +0200 @@ -0,0 +1,192 @@
package de.mpiwg.anteater.places.ml.preprocessing;

import java.util.ArrayList;
import java.util.List;

import de.mpiwg.anteater.AnteaterConfiguration;
import de.mpiwg.anteater.ml.ITextParser;
import de.mpiwg.anteater.ml.preprocessing.DataCreator;
import de.mpiwg.anteater.persons.APerson;
import de.mpiwg.anteater.persons.PersonsExtraction;
import de.mpiwg.anteater.places.Place;
import de.mpiwg.anteater.places.PlaceInformation;
import de.mpiwg.anteater.places.PlacesExtraction;
import de.mpiwg.anteater.results.ApplicantResult;
import de.mpiwg.anteater.results.SpeciesScientificResult;
import de.mpiwg.anteater.species.scientific.ScientificName;
import de.mpiwg.anteater.species.scientific.ScientificNamesExtraction;
import de.mpiwg.anteater.text.Paragraph;
import de.mpiwg.anteater.text.TextInformation;
import de.mpiwg.anteater.text.TextPart;

/**
 * Data creator that appends one comma-separated feature row per candidate
 * {@link Place} to the ARFF contents buffer.
 *
 * <p>Each row starts with {@code UNKNOWN_CLASS_SYMBOL}, followed by the values
 * obtained from a {@link LocationFeatureCalculator}. Every value (including the
 * last one) is followed by a comma, and the row is terminated by a newline.
 */
public class LocationDataCreator extends DataCreator {

    private List<SpeciesScientificResult> predictedSpecies;
    private List<ApplicantResult> predictedApplicants;

    /**
     * Creates a data creator registered under the {@code "LOCATION_"} prefix.
     *
     * @param configuration       configuration handed to the superclass
     * @param predictedSpecies    species results consulted when building features
     * @param predictedApplicants applicant results consulted when building features
     */
    public LocationDataCreator(AnteaterConfiguration configuration, List<SpeciesScientificResult> predictedSpecies, List<ApplicantResult> predictedApplicants) {
        super(configuration, "LOCATION_");
        this.predictedSpecies = predictedSpecies;
        this.predictedApplicants = predictedApplicants;
    }

    /**
     * Appends a feature row to {@code arffContents} for every place of every
     * place extraction found in {@code info}.
     *
     * <p>Extractions whose type is neither 1 (summary) nor 2 (supplementary
     * information) are skipped, as are extractions whose text resolves to
     * {@code null}.
     *
     * @param info         text information providing extractions and texts
     * @param arffContents buffer the generated rows are appended to
     * @param textParser   parser used to split paragraphs into sentences; it is
     *                     also passed on to the feature calculator
     */
    @Override
    public void createFileContents(TextInformation info,
            StringBuffer arffContents, ITextParser textParser) {
        for (PlacesExtraction extraction : info.getPlacesExtractions()) {
            List<PlaceInformation> placeInfos = extraction.getPlaceInformation();

            TextPart text = findText(info, extraction);
            if (text == null) {
                continue;
            }

            List<ScientificName> speciesInText = acceptedSpecies(info, extraction);
            List<APerson> applicantsInText = acceptedApplicants(info, extraction);
            List<String> sentences = splitIntoSentences(text, textParser);

            // feature calculator
            LocationFeatureCalculator calculator = new LocationFeatureCalculator(sentences, textParser, text);
            calculator.setFoundApplicant(applicantsInText);
            calculator.setFoundSpecies(speciesInText);

            for (PlaceInformation pInfo : placeInfos) {
                List<Place> places = pInfo.getPlaces();
                String[] placeParts = pInfo.getReferenceInText().split(" ");

                for (Place place : places) {
                    arffContents.append(featureRow(calculator, pInfo, place, placeParts));
                }
            }
        }
    }

    /**
     * Looks up the text part an extraction refers to.
     *
     * @return the summary (type 1) or supplementary information (type 2) at the
     *         extraction's text index, or {@code null} for any other type
     */
    private TextPart findText(TextInformation info, PlacesExtraction extraction) {
        int type = extraction.getType();
        if (type == 1) {
            // summary
            return info.getSummaries().get(extraction.getTextIdx());
        }
        if (type == 2) {
            // supplementary information
            return info.getSupplInfos().get(extraction.getTextIdx());
        }
        return null;
    }

    /**
     * Collects the findings of all species results that belong to {@code info},
     * have a prediction of at least 1.0 and whose extraction has the same type
     * and text index as {@code extraction}.
     */
    private List<ScientificName> acceptedSpecies(TextInformation info, PlacesExtraction extraction) {
        List<ScientificName> names = new ArrayList<ScientificName>();
        for (SpeciesScientificResult candidate : predictedSpecies) {
            if (candidate.getTextInfo() != info || !(candidate.getPrediction() >= 1.0)) {
                continue;
            }
            ScientificNamesExtraction source = candidate.getResult();
            if (source.getType() == extraction.getType() && source.getTextIdx() == extraction.getTextIdx()) {
                names.add(candidate.getFinding());
            }
        }
        return names;
    }

    /**
     * Collects the findings of all applicant results that belong to {@code info},
     * have a prediction of at least 1.0 and whose extraction has the same type
     * and text index as {@code extraction}.
     */
    private List<APerson> acceptedApplicants(TextInformation info, PlacesExtraction extraction) {
        List<APerson> people = new ArrayList<APerson>();
        for (ApplicantResult candidate : predictedApplicants) {
            if (candidate.getTextInfo() != info || !(candidate.getPrediction() >= 1.0)) {
                continue;
            }
            PersonsExtraction source = candidate.getResult();
            if (source.getType() == extraction.getType() && source.getTextIdx() == extraction.getTextIdx()) {
                people.add(candidate.getFinding());
            }
        }
        return people;
    }

    /**
     * Splits every paragraph of {@code text} into sentences and logs how long
     * the splitting took.
     */
    private List<String> splitIntoSentences(TextPart text, ITextParser textParser) {
        List<String> sentences = new ArrayList<String>();
        long startedAt = System.currentTimeMillis();
        for (Paragraph paragraph : text.getParagraphs()) {
            sentences.addAll(textParser.getSentences(paragraph.getParagraphText()));
        }
        long finishedAt = System.currentTimeMillis();
        configuration.getLogger().logMessage(COMPONENT_NAME, "Splitting text into sentences: " + (finishedAt - startedAt) + "ms");
        return sentences;
    }

    /**
     * Builds the feature row for one place. The calculator is queried in a
     * fixed order that determines the column order of the row.
     *
     * @param calculator calculator prepared for the text the place occurs in
     * @param pInfo      place information the place belongs to
     * @param place      the place the row is written for
     * @param placeParts the place reference split at single spaces
     * @return the complete row, including the terminating newline
     */
    private StringBuilder featureRow(LocationFeatureCalculator calculator, PlaceInformation pInfo,
            Place place, String[] placeParts) {
        StringBuilder row = new StringBuilder();

        // unknown class
        row.append(UNKNOWN_CLASS_SYMBOL).append(",");

        // numbers/words
        row.append(calculator.getNumberWordRelation(placeParts)).append(",");
        // starts_with_uppercase/words
        row.append(calculator.getUppercasedWordsToAllRelation(placeParts)).append(",");
        // contains_2_uppercase_letter_word
        row.append(calculator.contains2UppercaseCharacterWord(placeParts)).append(",");
        // contains_university
        row.append(calculator.containsUniversity(placeParts)).append(",");

        // surrounded_by_comma
        row.append(calculator.isSurroundedByCommata(pInfo)).append(",");
        // surrounded_by_brackets
        row.append(calculator.isSurroundedByBrackets(pInfo)).append(",");
        // preceeded_by_and
        row.append(calculator.isPreceededByAnd(pInfo)).append(",");
        // preceeded_by_the
        row.append(calculator.isPreceededByThe(pInfo)).append(",");

        // char_to_last_species_in_p
        row.append(calculator.getCharsToLastSpeciesInParagraph(pInfo)).append(",");
        // char_to_next_species_in_p
        row.append(calculator.getCharsToNextSpeciesInParagraph(pInfo)).append(",");
        // char_to_study_in_p
        row.append(calculator.getCharToStudyInParagraph(pInfo)).append(",");
        // char_to_studies_in_p
        row.append(calculator.getCharToStudiesInParagraph(pInfo)).append(",");
        // char_to_in_in_s
        row.append(calculator.getCharToInInSentence(pInfo)).append(",");
        // char_to_at_in_s
        row.append(calculator.getCharToAtInSentence(pInfo)).append(",");
        // nr_char_to_last_applicant_in_text(end of applicant)
        row.append(calculator.getCharsToLastApplicantInParagraph(pInfo)).append(",");

        // has comma
        row.append(calculator.hasComma(pInfo)).append(",");
        // hasBracket
        row.append(calculator.hasBracket(pInfo)).append(",");

        // type(0=Other,1=Town,2=County,3=State,4=Country,5=Suburb,6=POI,7=Zip,8=Ocean)
        row.append(calculator.getType(place)).append(",");

        // chars_to_survey_in_s
        row.append(calculator.getCharToSurveyInSentence(pInfo)).append(",");
        // chars_to_species_in_s
        row.append(calculator.getCharToSpeciesInSentence(pInfo)).append(",");

        row.append("\n");
        return row;
    }

}