Mercurial > hg > anteater
view src/de/mpiwg/anteater/places/ml/preprocessing/LocationDataCreator.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.anteater.places.ml.preprocessing; import java.util.ArrayList; import java.util.List; import de.mpiwg.anteater.AnteaterConfiguration; import de.mpiwg.anteater.ml.ITextParser; import de.mpiwg.anteater.ml.preprocessing.DataCreator; import de.mpiwg.anteater.persons.APerson; import de.mpiwg.anteater.persons.PersonsExtraction; import de.mpiwg.anteater.places.Place; import de.mpiwg.anteater.places.PlaceInformation; import de.mpiwg.anteater.places.PlacesExtraction; import de.mpiwg.anteater.results.ApplicantResult; import de.mpiwg.anteater.results.SpeciesScientificResult; import de.mpiwg.anteater.species.scientific.ScientificName; import de.mpiwg.anteater.species.scientific.ScientificNamesExtraction; import de.mpiwg.anteater.text.Paragraph; import de.mpiwg.anteater.text.TextInformation; import de.mpiwg.anteater.text.TextPart; public class LocationDataCreator extends DataCreator { private List<SpeciesScientificResult> predictedSpecies; private List<ApplicantResult> predictedApplicants; public LocationDataCreator(AnteaterConfiguration configuration, List<SpeciesScientificResult> predictedSpecies, List<ApplicantResult> predictedApplicants) { super(configuration, "LOCATION_"); this.predictedApplicants = predictedApplicants; this.predictedSpecies = predictedSpecies; } @Override public void createFileContents(TextInformation info, StringBuffer arffContents, ITextParser textParser) { List<PlacesExtraction> placesExtractions = info.getPlacesExtractions(); for (PlacesExtraction extraction : placesExtractions) { List<PlaceInformation> placeInfos = extraction.getPlaceInformation(); // get sentences TextPart text = null; switch(extraction.getType()) { // summary case 1: text = info.getSummaries().get(extraction.getTextIdx()); break; // supplementary information case 2: text = info.getSupplInfos().get(extraction.getTextIdx()); } if (text == null) continue; List<ScientificName> predictedNames = new ArrayList<ScientificName>(); List<APerson> predictedPeople = new ArrayList<APerson>(); for (SpeciesScientificResult result : predictedSpecies) { if (result.getTextInfo() == info) { if (result.getPrediction() >= 1.0) { ScientificNamesExtraction textExtrac = result.getResult(); if (textExtrac.getType() == extraction.getType() && textExtrac.getTextIdx() == extraction.getTextIdx()) predictedNames.add(result.getFinding()); } } } for (ApplicantResult appResult : predictedApplicants) { if (appResult.getTextInfo() == info) { if (appResult.getPrediction() >= 1.0) { PersonsExtraction textExtrac = appResult.getResult(); if (textExtrac.getType() == extraction.getType() && textExtrac.getTextIdx() == extraction.getTextIdx()) predictedPeople.add(appResult.getFinding()); } } } List<String> sentences = new ArrayList<String>(); //ITextParser icuParser = new ICUTextParser(); long start = System.currentTimeMillis(); for (Paragraph p : text.getParagraphs()) { sentences.addAll(textParser.getSentences(p.getParagraphText())); } long end = System.currentTimeMillis(); configuration.getLogger().logMessage(COMPONENT_NAME, "Splitting text into sentences: " + (end - start) + "ms"); // feature calculator LocationFeatureCalculator calculator = new LocationFeatureCalculator(sentences, textParser, text); calculator.setFoundApplicant(predictedPeople); calculator.setFoundSpecies(predictedNames); for (PlaceInformation pInfo : placeInfos) { List<Place> places = pInfo.getPlaces(); String placeReference = pInfo.getReferenceInText(); String[] placeParts = placeReference.split(" "); for (Place place : places) { StringBuffer dataPoint = new StringBuffer(); // unknown class dataPoint.append(UNKNOWN_CLASS_SYMBOL); dataPoint.append(","); // numbers/words dataPoint.append(calculator.getNumberWordRelation(placeParts)); dataPoint.append(","); // starts_with_uppercase/words dataPoint.append(calculator.getUppercasedWordsToAllRelation(placeParts)); dataPoint.append(","); // contains_2_uppercase_letter_word dataPoint.append(calculator.contains2UppercaseCharacterWord(placeParts)); dataPoint.append(","); // contains_university dataPoint.append(calculator.containsUniversity(placeParts)); dataPoint.append(","); // surrounded_by_comma dataPoint.append(calculator.isSurroundedByCommata(pInfo)); dataPoint.append(","); // surrounded_by_brackets dataPoint.append(calculator.isSurroundedByBrackets(pInfo)); dataPoint.append(","); // preceeded_by_and dataPoint.append(calculator.isPreceededByAnd(pInfo)); dataPoint.append(","); // preceeded_by_the dataPoint.append(calculator.isPreceededByThe(pInfo)); dataPoint.append(","); // char_to_last_species_in_p dataPoint.append(calculator.getCharsToLastSpeciesInParagraph(pInfo)); dataPoint.append(","); // char_to_next_species_in_p dataPoint.append(calculator.getCharsToNextSpeciesInParagraph(pInfo)); dataPoint.append(","); // char_to_study_in_p dataPoint.append(calculator.getCharToStudyInParagraph(pInfo)); dataPoint.append(","); // char_to_studies_in_p dataPoint.append(calculator.getCharToStudiesInParagraph(pInfo)); dataPoint.append(","); // char_to_in_in_s dataPoint.append(calculator.getCharToInInSentence(pInfo)); dataPoint.append(","); // char_to_at_in_s dataPoint.append(calculator.getCharToAtInSentence(pInfo)); dataPoint.append(","); //nr_char_to_last_applicant_in_text(end of applicant) dataPoint.append(calculator.getCharsToLastApplicantInParagraph(pInfo)); dataPoint.append(","); // has comma dataPoint.append(calculator.hasComma(pInfo)); dataPoint.append(","); // hasBracket dataPoint.append(calculator.hasBracket(pInfo)); dataPoint.append(","); // type(0=Other,1=Town,2=County,3=State,4=Country,5=Suburb,6=POI,7=Zip,8=Ocean) dataPoint.append(calculator.getType(place)); dataPoint.append(","); // chars_to_survey_in_s dataPoint.append(calculator.getCharToSurveyInSentence(pInfo)); dataPoint.append(","); // chars_to_species_in_s dataPoint.append(calculator.getCharToSpeciesInSentence(pInfo)); dataPoint.append(","); dataPoint.append("\n"); arffContents.append(dataPoint); } } } } }