Mercurial > hg > anteater
view src/de/mpiwg/anteater/persons/ml/preprocessing/ApplicantDataCreator.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.anteater.persons.ml.preprocessing;

import java.util.ArrayList;
import java.util.List;

import de.mpiwg.anteater.AnteaterConfiguration;
import de.mpiwg.anteater.ml.ITextParser;
import de.mpiwg.anteater.ml.preprocessing.DataCreator;
import de.mpiwg.anteater.persons.APerson;
import de.mpiwg.anteater.persons.Location;
import de.mpiwg.anteater.persons.Organization;
import de.mpiwg.anteater.persons.Person;
import de.mpiwg.anteater.persons.PersonsExtraction;
import de.mpiwg.anteater.places.PlaceInformation;
import de.mpiwg.anteater.places.PlacesExtraction;
import de.mpiwg.anteater.species.scientific.ScientificName;
import de.mpiwg.anteater.species.scientific.ScientificNamesExtraction;
import de.mpiwg.anteater.text.Paragraph;
import de.mpiwg.anteater.text.TextInformation;
import de.mpiwg.anteater.text.TextPart;

/**
 * Writes one comma-separated feature row per extracted person candidate.
 *
 * <p>Every row starts with {@code UNKNOWN_CLASS_SYMBOL}, followed by the text
 * type, the length of the candidate's reference in the text, a series of
 * features obtained from an {@link ApplicantFeatureCalculator}, and ends with
 * a newline. The rows are appended to the buffer handed to
 * {@link #createFileContents(TextInformation, StringBuffer, ITextParser)}.
 */
public class ApplicantDataCreator extends DataCreator {

    /** Value of {@code PersonsExtraction.getType()} that selects a summary text. */
    private static final int TEXT_TYPE_SUMMARY = 1;

    /** Value of {@code PersonsExtraction.getType()} that selects a supplementary information text. */
    private static final int TEXT_TYPE_SUPPLEMENTARY_INFO = 2;

    /** Feature value written for candidates that are a {@link Person}. */
    private static final int CANDIDATE_KIND_PERSON = 1;

    /** Feature value written for candidates that are an {@link Organization}. */
    private static final int CANDIDATE_KIND_ORGANIZATION = 2;

    /** Feature value written for candidates that are a {@link Location}. */
    private static final int CANDIDATE_KIND_LOCATION = 3;

    /** Internal marker for candidates of any other kind; never written to the output. */
    private static final int CANDIDATE_KIND_UNSUPPORTED = -1;

    /** Separator between two feature values of one row. */
    private static final String FEATURE_SEPARATOR = ",";

    /**
     * Creates a data creator that passes {@code "APPLICANT_"} as second
     * argument to the {@link DataCreator} constructor.
     *
     * @param configuration the configuration handed on to the superclass
     */
    public ApplicantDataCreator(AnteaterConfiguration configuration) {
        super(configuration, "APPLICANT_");
    }

    /**
     * Appends one feature row to {@code arffContents} for every supported
     * candidate of every persons extraction found in {@code info}.
     *
     * <p>Extractions whose type is neither summary nor supplementary
     * information (or whose text resolves to {@code null}) are skipped, as are
     * candidates that are not a {@link Person}, {@link Organization} or
     * {@link Location}.
     *
     * @param info         source of the persons, places and scientific name
     *                     extractions and of the texts they refer to
     * @param arffContents buffer the rows are appended to
     * @param textParser   parser used to split paragraphs into sentences; also
     *                     handed to the feature calculator
     */
    @Override
    public void createFileContents(TextInformation info, StringBuffer arffContents, ITextParser textParser) {
        for (PersonsExtraction pResult : info.getPersonsExtractions()) {
            TextPart text = findTextForExtraction(info, pResult);
            if (text == null) {
                continue;
            }

            List<String> sentences = splitTextIntoSentences(text, textParser);
            ApplicantFeatureCalculator calculator = new ApplicantFeatureCalculator(sentences, textParser, text);
            calculator.setNames(findNamesForExtraction(info, pResult));
            calculator.setPlaces(findPlacesForExtraction(info, pResult));

            for (APerson candidate : pResult.getPersons()) {
                int candidateKind = classifyCandidate(candidate);
                // Decide up front whether a row will be written at all, so no
                // feature (in particular the timed subject check) is computed
                // for a candidate whose row would be discarded anyway.
                if (candidateKind == CANDIDATE_KIND_UNSUPPORTED) {
                    continue;
                }
                arffContents.append(buildFeatureRow(pResult, candidate, candidateKind, calculator));
            }
        }
    }

    /**
     * Looks up the text a persons extraction refers to.
     *
     * @return the summary or supplementary information at the extraction's
     *         text index, or {@code null} for any other extraction type
     */
    private TextPart findTextForExtraction(TextInformation info, PersonsExtraction pResult) {
        switch (pResult.getType()) {
        case TEXT_TYPE_SUMMARY:
            return info.getSummaries().get(pResult.getTextIdx());
        case TEXT_TYPE_SUPPLEMENTARY_INFO:
            return info.getSupplInfos().get(pResult.getTextIdx());
        default:
            return null;
        }
    }

    /**
     * Collects the sentences of all paragraphs of {@code text} and logs how
     * long the splitting took.
     */
    private List<String> splitTextIntoSentences(TextPart text, ITextParser textParser) {
        List<String> sentences = new ArrayList<String>();
        long start = System.currentTimeMillis();
        for (Paragraph paragraph : text.getParagraphs()) {
            sentences.addAll(textParser.getSentences(paragraph.getParagraphText()));
        }
        long end = System.currentTimeMillis();
        configuration.getLogger().logMessage(COMPONENT_NAME,
                "Splitting text into sentences: " + (end - start) + "ms");
        return sentences;
    }

    /**
     * Returns the place information of the places extraction with the same
     * text index as {@code pResult}; if several match, the last one wins.
     * Returns an empty list when none matches.
     */
    private List<PlaceInformation> findPlacesForExtraction(TextInformation info, PersonsExtraction pResult) {
        List<PlaceInformation> places = new ArrayList<PlaceInformation>();
        for (PlacesExtraction extraction : info.getPlacesExtractions()) {
            // NOTE(review): matching uses the text index only, not the text
            // type - confirm that indices are unique across text types.
            if (extraction.getTextIdx() == pResult.getTextIdx()) {
                places = extraction.getPlaceInformation();
            }
        }
        return places;
    }

    /**
     * Returns the names of the scientific names extraction with the same text
     * index as {@code pResult}; if several match, the last one wins. Returns
     * an empty list when none matches.
     */
    private List<ScientificName> findNamesForExtraction(TextInformation info, PersonsExtraction pResult) {
        List<ScientificName> names = new ArrayList<ScientificName>();
        for (ScientificNamesExtraction extraction : info.getScientificNamesExtractions()) {
            if (extraction.getTextIdx() == pResult.getTextIdx()) {
                names = extraction.getNames();
            }
        }
        return names;
    }

    /**
     * Maps a candidate to the numeric kind written into its row.
     *
     * @return 1 for a {@link Person}, 2 for an {@link Organization}, 3 for a
     *         {@link Location}, {@link #CANDIDATE_KIND_UNSUPPORTED} otherwise
     */
    private int classifyCandidate(APerson candidate) {
        if (candidate instanceof Person) {
            return CANDIDATE_KIND_PERSON;
        }
        if (candidate instanceof Organization) {
            return CANDIDATE_KIND_ORGANIZATION;
        }
        if (candidate instanceof Location) {
            return CANDIDATE_KIND_LOCATION;
        }
        return CANDIDATE_KIND_UNSUPPORTED;
    }

    /**
     * Builds the newline-terminated feature row of one candidate. The order of
     * the features is part of the output format and must not be changed.
     *
     * @param pResult       extraction the candidate belongs to (supplies the text type)
     * @param candidate     candidate the features are computed for
     * @param candidateKind value returned by {@link #classifyCandidate(APerson)}
     * @param calculator    calculator prepared for the candidate's text
     * @return the complete row including the trailing newline
     */
    private String buildFeatureRow(PersonsExtraction pResult, APerson candidate, int candidateKind,
            ApplicantFeatureCalculator calculator) {
        // The row is built and consumed by a single thread, so the
        // unsynchronized StringBuilder is sufficient here.
        StringBuilder row = new StringBuilder();

        // unknown class
        row.append(UNKNOWN_CLASS_SYMBOL).append(FEATURE_SEPARATOR);
        // text type
        row.append(pResult.getType()).append(FEATURE_SEPARATOR);
        // name length
        row.append(candidate.getReferenceInText().length()).append(FEATURE_SEPARATOR);
        // contains issued
        row.append(calculator.getSentenceContainsIssued(candidate)).append(FEATURE_SEPARATOR);
        // contains applied
        row.append(calculator.getSentenceContainsApplied(candidate)).append(FEATURE_SEPARATOR);
        // contains permit
        row.append(calculator.getSentenceContainsPermit(candidate)).append(FEATURE_SEPARATOR);
        // contains comment
        row.append(calculator.getSentenceContainsComment(candidate)).append(FEATURE_SEPARATOR);

        // is subject (timed and logged)
        long start = System.currentTimeMillis();
        row.append(calculator.getIsSubject(candidate)).append(FEATURE_SEPARATOR);
        long end = System.currentTimeMillis();
        configuration.getLogger().logMessage(COMPONENT_NAME,
                "Determining subject: " + (end - start) + "ms");

        // contains applicant
        row.append(calculator.getSentenceContainsApplicant(candidate)).append(FEATURE_SEPARATOR);
        // distance term to applicant
        row.append(calculator.getDistanceCandidateToApplicant(candidate)).append(FEATURE_SEPARATOR);
        // person, location, organization
        row.append(candidateKind).append(FEATURE_SEPARATOR);

        // similarity to species names
        row.append(calculator.getSimilarityPersonNameForPerson(candidate)).append(FEATURE_SEPARATOR);
        row.append(calculator.getSimilarityPersonNameForName(candidate)).append(FEATURE_SEPARATOR);
        row.append(calculator.doPersonAndNameStartAtSameIdx(candidate)).append(FEATURE_SEPARATOR);

        // similarity to place names
        row.append(calculator.getSimilarityPersonPlaceForPerson(candidate)).append(FEATURE_SEPARATOR);
        row.append(calculator.getSimilarityPersonPlaceForPlace(candidate)).append(FEATURE_SEPARATOR);
        row.append(calculator.doPersonAndPlaceStartAtSameIdx(candidate)).append(FEATURE_SEPARATOR);

        // is surrounded by brackets
        row.append(calculator.isSurroundedByBrackets(candidate)).append(FEATURE_SEPARATOR);
        // is surrounded by commas
        row.append(calculator.isSurroundedByCommata(candidate)).append(FEATURE_SEPARATOR);
        // followed by 's
        row.append(calculator.isFollowedBy_s(candidate)).append(FEATURE_SEPARATOR);
        // is abbreviation (last feature, terminates the row)
        row.append(calculator.getIsAbbreviation(candidate)).append("\n");

        return row.toString();
    }
}