Mercurial > hg > anteater
diff src/de/mpiwg/anteater/persons/ml/preprocessing/ApplicantDataCreator.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line diff
// Reconstructed from the changeset that added
// src/de/mpiwg/anteater/persons/ml/preprocessing/ApplicantDataCreator.java
// (Fri Sep 14 10:30:43 2012 +0200).
package de.mpiwg.anteater.persons.ml.preprocessing;

import java.util.ArrayList;
import java.util.List;

import de.mpiwg.anteater.AnteaterConfiguration;
import de.mpiwg.anteater.ml.ITextParser;
import de.mpiwg.anteater.ml.preprocessing.DataCreator;
import de.mpiwg.anteater.persons.APerson;
import de.mpiwg.anteater.persons.Location;
import de.mpiwg.anteater.persons.Organization;
import de.mpiwg.anteater.persons.Person;
import de.mpiwg.anteater.persons.PersonsExtraction;
import de.mpiwg.anteater.places.PlaceInformation;
import de.mpiwg.anteater.places.PlacesExtraction;
import de.mpiwg.anteater.species.scientific.ScientificName;
import de.mpiwg.anteater.species.scientific.ScientificNamesExtraction;
import de.mpiwg.anteater.text.Paragraph;
import de.mpiwg.anteater.text.TextInformation;
import de.mpiwg.anteater.text.TextPart;

/**
 * Writes one comma-separated feature row per extracted person candidate.
 *
 * <p>Each row consists of 21 values in this order: the unknown-class symbol,
 * the text type, the length of the candidate's reference in the text, the
 * "sentence contains issued / applied / permit / comment" flags, the
 * is-subject flag, the "sentence contains applicant" flag, the distance of
 * the candidate to "applicant", the entity type (1 = person,
 * 2 = organization, 3 = location), three similarity features against
 * scientific names, three similarity features against place names, the
 * surrounded-by-brackets, surrounded-by-commata and followed-by-'s flags and
 * finally the is-abbreviation flag. Rows are terminated by a newline.
 */
public class ApplicantDataCreator extends DataCreator {

    /** Value of {@code PersonsExtraction.getType()} that selects a summary. */
    private static final int TEXT_TYPE_SUMMARY = 1;

    /** Value of {@code PersonsExtraction.getType()} that selects supplementary information. */
    private static final int TEXT_TYPE_SUPPL_INFO = 2;

    /** Entity type codes written into the feature row. */
    private static final int ENTITY_PERSON = 1;
    private static final int ENTITY_ORGANIZATION = 2;
    private static final int ENTITY_LOCATION = 3;

    /** Marker for candidates that are neither person, organization nor location. */
    private static final int ENTITY_UNSUPPORTED = -1;

    /**
     * Creates a data creator that passes the prefix {@code "APPLICANT_"} to
     * the superclass.
     *
     * @param configuration configuration handed on to {@link DataCreator}
     */
    public ApplicantDataCreator(AnteaterConfiguration configuration) {
        super(configuration, "APPLICANT_");
    }

    /**
     * Appends one feature row to {@code arffContents} for every supported
     * person candidate found in {@code info}.
     *
     * <p>Persons extractions whose type is neither summary nor supplementary
     * information, or whose text part is {@code null}, are skipped. Candidates
     * that are not a {@link Person}, {@link Organization} or {@link Location}
     * produce no row.
     *
     * @param info         extraction results and the texts they refer to
     * @param arffContents buffer the feature rows are appended to
     * @param textParser   parser used to split paragraphs into sentences; it is
     *                     also handed to the feature calculator
     */
    @Override
    public void createFileContents(TextInformation info, StringBuffer arffContents, ITextParser textParser) {
        for (PersonsExtraction pResult : info.getPersonsExtractions()) {
            TextPart text = resolveTextPart(info, pResult);
            if (text == null) {
                continue;
            }

            List<String> sentences = splitIntoSentences(text, textParser);
            ApplicantFeatureCalculator calculator = new ApplicantFeatureCalculator(sentences, textParser, text);

            // NOTE(review): places and names are matched on the text index only,
            // not on the text type -- confirm that an index cannot refer to both a
            // summary and a supplementary information text.
            // If several extractions share the index, the last one wins.
            List<PlaceInformation> places = new ArrayList<PlaceInformation>();
            for (PlacesExtraction extraction : info.getPlacesExtractions()) {
                if (extraction.getTextIdx() == pResult.getTextIdx()) {
                    places = extraction.getPlaceInformation();
                }
            }

            List<ScientificName> names = new ArrayList<ScientificName>();
            for (ScientificNamesExtraction extraction : info.getScientificNamesExtractions()) {
                if (extraction.getTextIdx() == pResult.getTextIdx()) {
                    names = extraction.getNames();
                }
            }
            calculator.setNames(names);
            calculator.setPlaces(places);

            for (APerson candidate : pResult.getPersons()) {
                // Classify first: unsupported candidates never produce a row, so
                // there is no point in computing (and timing) their features.
                int entityType = entityTypeCode(candidate);
                if (entityType == ENTITY_UNSUPPORTED) {
                    continue;
                }
                appendDataPoint(arffContents, pResult, candidate, calculator, entityType);
            }
        }
    }

    /**
     * Looks up the text part a persons extraction refers to.
     *
     * @return the summary or supplementary information at the extraction's
     *         text index, or {@code null} for any other text type
     */
    private static TextPart resolveTextPart(TextInformation info, PersonsExtraction pResult) {
        TextPart text = null;
        switch (pResult.getType()) {
        case TEXT_TYPE_SUMMARY:
            text = info.getSummaries().get(pResult.getTextIdx());
            break;
        case TEXT_TYPE_SUPPL_INFO:
            text = info.getSupplInfos().get(pResult.getTextIdx());
            break;
        default:
            break;
        }
        return text;
    }

    /**
     * Splits every paragraph of {@code text} into sentences and logs how long
     * the splitting took.
     */
    private List<String> splitIntoSentences(TextPart text, ITextParser textParser) {
        List<String> sentences = new ArrayList<String>();

        long start = System.currentTimeMillis();
        for (Paragraph paragraph : text.getParagraphs()) {
            sentences.addAll(textParser.getSentences(paragraph.getParagraphText()));
        }
        long end = System.currentTimeMillis();
        configuration.getLogger().logMessage(COMPONENT_NAME, "Splitting text into sentences: " + (end - start) + "ms");

        return sentences;
    }

    /**
     * Maps a candidate to its entity type code.
     *
     * @return 1 for a person, 2 for an organization, 3 for a location, or
     *         {@link #ENTITY_UNSUPPORTED} for anything else
     */
    private static int entityTypeCode(APerson candidate) {
        if (candidate instanceof Person) {
            return ENTITY_PERSON;
        }
        if (candidate instanceof Organization) {
            return ENTITY_ORGANIZATION;
        }
        if (candidate instanceof Location) {
            return ENTITY_LOCATION;
        }
        return ENTITY_UNSUPPORTED;
    }

    /**
     * Builds the feature row for one candidate and appends it to
     * {@code arffContents}. The order of the values must not be changed.
     */
    private void appendDataPoint(StringBuffer arffContents, PersonsExtraction pResult, APerson candidate,
            ApplicantFeatureCalculator calculator, int entityType) {
        // Purely local buffer, so the unsynchronized StringBuilder is sufficient.
        StringBuilder dataPoint = new StringBuilder();

        // unknown class
        dataPoint.append(UNKNOWN_CLASS_SYMBOL).append(",");
        // text type
        dataPoint.append(pResult.getType()).append(",");
        // name_length
        dataPoint.append(candidate.getReferenceInText().length()).append(",");
        // contains issued
        dataPoint.append(calculator.getSentenceContainsIssued(candidate)).append(",");
        // contains applied
        dataPoint.append(calculator.getSentenceContainsApplied(candidate)).append(",");
        // contains permit
        dataPoint.append(calculator.getSentenceContainsPermit(candidate)).append(",");
        // contains comment
        dataPoint.append(calculator.getSentenceContainsComment(candidate)).append(",");

        // is subject (timed and logged separately)
        long start = System.currentTimeMillis();
        dataPoint.append(calculator.getIsSubject(candidate)).append(",");
        long end = System.currentTimeMillis();
        configuration.getLogger().logMessage(COMPONENT_NAME, "Determining subject: " + (end - start) + "ms");

        // contains applicant
        dataPoint.append(calculator.getSentenceContainsApplicant(candidate)).append(",");
        // distance term to applicant
        dataPoint.append(calculator.getDistanceCandidateToApplicant(candidate)).append(",");
        // person, location, organization
        dataPoint.append(entityType).append(",");

        // similarity to species names
        dataPoint.append(calculator.getSimilarityPersonNameForPerson(candidate)).append(",");
        dataPoint.append(calculator.getSimilarityPersonNameForName(candidate)).append(",");
        dataPoint.append(calculator.doPersonAndNameStartAtSameIdx(candidate)).append(",");

        // similarity to place names
        dataPoint.append(calculator.getSimilarityPersonPlaceForPerson(candidate)).append(",");
        dataPoint.append(calculator.getSimilarityPersonPlaceForPlace(candidate)).append(",");
        dataPoint.append(calculator.doPersonAndPlaceStartAtSameIdx(candidate)).append(",");

        // is surrounded by brackets
        dataPoint.append(calculator.isSurroundedByBrackets(candidate)).append(",");
        // is surrounded by commata
        dataPoint.append(calculator.isSurroundedByCommata(candidate)).append(",");
        // followed by 's
        dataPoint.append(calculator.isFollowedBy_s(candidate)).append(",");
        // is abbreviation (last column, no trailing comma)
        dataPoint.append(calculator.getIsAbbreviation(candidate));

        dataPoint.append("\n");

        arffContents.append(dataPoint);
    }

}