diff src/de/mpiwg/anteater/persons/ml/preprocessing/ApplicantDataCreator.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/anteater/persons/ml/preprocessing/ApplicantDataCreator.java	Fri Sep 14 10:30:43 2012 +0200
@@ -0,0 +1,158 @@
+package de.mpiwg.anteater.persons.ml.preprocessing;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.mpiwg.anteater.AnteaterConfiguration;
+import de.mpiwg.anteater.ml.ITextParser;
+import de.mpiwg.anteater.ml.preprocessing.DataCreator;
+import de.mpiwg.anteater.persons.APerson;
+import de.mpiwg.anteater.persons.Location;
+import de.mpiwg.anteater.persons.Organization;
+import de.mpiwg.anteater.persons.Person;
+import de.mpiwg.anteater.persons.PersonsExtraction;
+import de.mpiwg.anteater.places.PlaceInformation;
+import de.mpiwg.anteater.places.PlacesExtraction;
+import de.mpiwg.anteater.species.scientific.ScientificName;
+import de.mpiwg.anteater.species.scientific.ScientificNamesExtraction;
+import de.mpiwg.anteater.text.Paragraph;
+import de.mpiwg.anteater.text.TextInformation;
+import de.mpiwg.anteater.text.TextPart;
+
+/**
+ * Builds applicant data rows for an ARFF file, one comma-separated row per
+ * person, organization or location candidate.
+ */
+public class ApplicantDataCreator extends DataCreator {
+
+	/** Returned by {@link #entityCode(APerson)} for candidates that get no row. */
+	private static final int UNSUPPORTED = -1;
+
+	public ApplicantDataCreator(AnteaterConfiguration configuration) {
+		super(configuration, "APPLICANT_");
+	}
+
+	/**
+	 * Appends one data row per supported candidate to {@code arffContents}.
+	 * Extractions of a type other than 1 (summary) or 2 (supplementary
+	 * information) and candidates of an unsupported kind produce no row.
+	 *
+	 * @param info extraction results and texts to build rows from
+	 * @param arffContents buffer the rows are appended to
+	 * @param textParser parser used to split paragraphs into sentences
+	 */
+	@Override
+	public void createFileContents(TextInformation info, StringBuffer arffContents, ITextParser textParser) {
+		for (PersonsExtraction extraction : info.getPersonsExtractions()) {
+			TextPart text = resolveText(info, extraction);
+			if (text == null)
+				continue;
+			ApplicantFeatureCalculator calculator = createCalculator(info, extraction, text, textParser);
+			for (APerson candidate : extraction.getPersons()) {
+				// Check the kind first so that no feature (notably the timed subject
+				// check) is computed for a candidate that would be discarded anyway.
+				int entityCode = entityCode(candidate);
+				if (entityCode != UNSUPPORTED)
+					appendDataPoint(arffContents, calculator, extraction, candidate, entityCode);
+			}
+		}
+	}
+
+	/** Returns the text an extraction refers to, or {@code null} for an unknown text type. */
+	private TextPart resolveText(TextInformation info, PersonsExtraction extraction) {
+		switch (extraction.getType()) {
+			case 1: // summary
+				return info.getSummaries().get(extraction.getTextIdx());
+			case 2: // supplementary information
+				return info.getSupplInfos().get(extraction.getTextIdx());
+			default:
+				return null;
+		}
+	}
+
+	/** Splits the text into sentences and sets up a calculator with the names and places of the same text index. */
+	private ApplicantFeatureCalculator createCalculator(TextInformation info, PersonsExtraction extraction,
+			TextPart text, ITextParser textParser) {
+		List<String> sentences = new ArrayList<String>();
+		long start = System.currentTimeMillis();
+		for (Paragraph paragraph : text.getParagraphs()) {
+			sentences.addAll(textParser.getSentences(paragraph.getParagraphText()));
+		}
+		long end = System.currentTimeMillis();
+		configuration.getLogger().logMessage(COMPONENT_NAME, "Splitting text into sentences: " + (end - start) + "ms");
+
+		ApplicantFeatureCalculator calculator = new ApplicantFeatureCalculator(sentences, textParser, text);
+		// NOTE(review): places and names are matched by text index only, not by
+		// text type, and the last matching extraction wins -- confirm intended.
+		List<PlaceInformation> places = new ArrayList<PlaceInformation>();
+		for (PlacesExtraction placesExtraction : info.getPlacesExtractions()) {
+			if (placesExtraction.getTextIdx() == extraction.getTextIdx())
+				places = placesExtraction.getPlaceInformation();
+		}
+		List<ScientificName> names = new ArrayList<ScientificName>();
+		for (ScientificNamesExtraction namesExtraction : info.getScientificNamesExtractions()) {
+			if (namesExtraction.getTextIdx() == extraction.getTextIdx())
+				names = namesExtraction.getNames();
+		}
+		calculator.setNames(names);
+		calculator.setPlaces(places);
+		return calculator;
+	}
+
+	/** Maps a candidate to its type feature: 1 person, 2 organization, 3 location. */
+	private static int entityCode(APerson candidate) {
+		if (candidate instanceof Person)
+			return 1;
+		if (candidate instanceof Organization)
+			return 2;
+		if (candidate instanceof Location)
+			return 3;
+		return UNSUPPORTED;
+	}
+
+	/** Computes all features of one candidate and appends them as a single row; the column order is fixed. */
+	private void appendDataPoint(StringBuffer arffContents, ApplicantFeatureCalculator calculator,
+			PersonsExtraction extraction, APerson candidate, int entityCode) {
+		StringBuilder row = new StringBuilder();
+		// unknown class, text type, name length
+		row.append(UNKNOWN_CLASS_SYMBOL).append(",");
+		row.append(extraction.getType()).append(",");
+		row.append(candidate.getReferenceInText().length()).append(",");
+
+		// sentence contains issued / applied / permit / comment
+		row.append(calculator.getSentenceContainsIssued(candidate)).append(",");
+		row.append(calculator.getSentenceContainsApplied(candidate)).append(",");
+		row.append(calculator.getSentenceContainsPermit(candidate)).append(",");
+		row.append(calculator.getSentenceContainsComment(candidate)).append(",");
+
+		// is subject (timed and logged per candidate)
+		long start = System.currentTimeMillis();
+		row.append(calculator.getIsSubject(candidate)).append(",");
+		long end = System.currentTimeMillis();
+		configuration.getLogger().logMessage(COMPONENT_NAME, "Determining subject: " + (end - start) + "ms");
+
+		// contains applicant, distance to the applicant term
+		row.append(calculator.getSentenceContainsApplicant(candidate)).append(",");
+		row.append(calculator.getDistanceCandidateToApplicant(candidate)).append(",");
+
+		// person, organization, location
+		row.append(entityCode).append(",");
+
+		// similarity to species names
+		row.append(calculator.getSimilarityPersonNameForPerson(candidate)).append(",");
+		row.append(calculator.getSimilarityPersonNameForName(candidate)).append(",");
+		row.append(calculator.doPersonAndNameStartAtSameIdx(candidate)).append(",");
+
+		// similarity to place names
+		row.append(calculator.getSimilarityPersonPlaceForPerson(candidate)).append(",");
+		row.append(calculator.getSimilarityPersonPlaceForPlace(candidate)).append(",");
+		row.append(calculator.doPersonAndPlaceStartAtSameIdx(candidate)).append(",");
+
+		// surrounded by brackets, surrounded by commas, followed by 's, is abbreviation
+		row.append(calculator.isSurroundedByBrackets(candidate)).append(",");
+		row.append(calculator.isSurroundedByCommata(candidate)).append(",");
+		row.append(calculator.isFollowedBy_s(candidate)).append(",");
+		row.append(calculator.getIsAbbreviation(candidate)).append("\n");
+		arffContents.append(row);
+	}
+}