diff src/de/mpiwg/anteater/places/ml/preprocessing/LocationDataCreator.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/anteater/places/ml/preprocessing/LocationDataCreator.java	Fri Sep 14 10:30:43 2012 +0200
@@ -0,0 +1,192 @@
+package de.mpiwg.anteater.places.ml.preprocessing;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.mpiwg.anteater.AnteaterConfiguration;
+import de.mpiwg.anteater.ml.ITextParser;
+import de.mpiwg.anteater.ml.preprocessing.DataCreator;
+import de.mpiwg.anteater.persons.APerson;
+import de.mpiwg.anteater.persons.PersonsExtraction;
+import de.mpiwg.anteater.places.Place;
+import de.mpiwg.anteater.places.PlaceInformation;
+import de.mpiwg.anteater.places.PlacesExtraction;
+import de.mpiwg.anteater.results.ApplicantResult;
+import de.mpiwg.anteater.results.SpeciesScientificResult;
+import de.mpiwg.anteater.species.scientific.ScientificName;
+import de.mpiwg.anteater.species.scientific.ScientificNamesExtraction;
+import de.mpiwg.anteater.text.Paragraph;
+import de.mpiwg.anteater.text.TextInformation;
+import de.mpiwg.anteater.text.TextPart;
+
+public class LocationDataCreator extends DataCreator {
+	
+	private List<SpeciesScientificResult> predictedSpecies;
+	private List<ApplicantResult> predictedApplicants;
+
+	public LocationDataCreator(AnteaterConfiguration configuration, List<SpeciesScientificResult> predictedSpecies, List<ApplicantResult> predictedApplicants) {
+		super(configuration, "LOCATION_");		
+		this.predictedApplicants = predictedApplicants;
+		this.predictedSpecies = predictedSpecies;
+	}
+
+	@Override
+	public void createFileContents(TextInformation info,
+			StringBuffer arffContents, ITextParser textParser) {
+		List<PlacesExtraction> placesExtractions = info.getPlacesExtractions();
+		
+		for (PlacesExtraction extraction : placesExtractions) {
+			List<PlaceInformation> placeInfos = extraction.getPlaceInformation();
+			
+			// get sentences
+			TextPart text = null;
+			switch(extraction.getType()) {
+				// summary
+				case 1: text = info.getSummaries().get(extraction.getTextIdx()); break;
+				// supplementary information
+				case 2: text = info.getSupplInfos().get(extraction.getTextIdx());
+			}
+			
+			if (text == null)
+				continue;
+			
+			List<ScientificName> predictedNames = new ArrayList<ScientificName>();
+			List<APerson> predictedPeople = new ArrayList<APerson>();
+			
+			for (SpeciesScientificResult result : predictedSpecies) {
+				if (result.getTextInfo() == info) {
+					if (result.getPrediction() >= 1.0) {
+						ScientificNamesExtraction textExtrac = result.getResult();
+						if (textExtrac.getType() == extraction.getType() && textExtrac.getTextIdx() == extraction.getTextIdx())
+							predictedNames.add(result.getFinding());
+					}
+				}
+			}
+			
+			for (ApplicantResult appResult : predictedApplicants) {
+				if (appResult.getTextInfo() == info) {
+					if (appResult.getPrediction() >= 1.0) {
+						PersonsExtraction textExtrac = appResult.getResult();
+						if (textExtrac.getType() == extraction.getType() && textExtrac.getTextIdx() == extraction.getTextIdx())
+							predictedPeople.add(appResult.getFinding());
+					}
+				}
+			}
+			
+			List<String> sentences = new ArrayList<String>();
+			//ITextParser icuParser = new ICUTextParser();
+			long start = System.currentTimeMillis();
+			for (Paragraph p : text.getParagraphs()) {
+				sentences.addAll(textParser.getSentences(p.getParagraphText()));
+			}
+			long end = System.currentTimeMillis();
+			configuration.getLogger().logMessage(COMPONENT_NAME, "Splitting text into sentences: " + (end - start) + "ms");
+			
+			
+			// feature calculator
+			LocationFeatureCalculator calculator = new LocationFeatureCalculator(sentences, textParser, text);
+			calculator.setFoundApplicant(predictedPeople);
+			calculator.setFoundSpecies(predictedNames);
+			
+			for (PlaceInformation pInfo : placeInfos) {
+				List<Place> places = pInfo.getPlaces();
+				
+				String placeReference = pInfo.getReferenceInText();
+				String[] placeParts = placeReference.split(" ");
+				
+				for (Place place : places) {
+					StringBuffer dataPoint = new StringBuffer();
+					
+					// unknown class
+					dataPoint.append(UNKNOWN_CLASS_SYMBOL);
+					dataPoint.append(",");
+					
+					// numbers/words
+					dataPoint.append(calculator.getNumberWordRelation(placeParts));
+					dataPoint.append(",");
+					
+					// starts_with_uppercase/words
+					dataPoint.append(calculator.getUppercasedWordsToAllRelation(placeParts));
+					dataPoint.append(",");
+					
+					// contains_2_uppercase_letter_word
+					dataPoint.append(calculator.contains2UppercaseCharacterWord(placeParts));
+					dataPoint.append(",");
+					
+					// contains_university
+					dataPoint.append(calculator.containsUniversity(placeParts));
+					dataPoint.append(",");
+					
+					// surrounded_by_comma
+					dataPoint.append(calculator.isSurroundedByCommata(pInfo));
+					dataPoint.append(",");
+					
+					// surrounded_by_brackets
+					dataPoint.append(calculator.isSurroundedByBrackets(pInfo));
+					dataPoint.append(",");
+					
+					// preceeded_by_and
+					dataPoint.append(calculator.isPreceededByAnd(pInfo));
+					dataPoint.append(",");
+					
+					// preceeded_by_the
+					dataPoint.append(calculator.isPreceededByThe(pInfo));
+					dataPoint.append(",");
+					
+					// char_to_last_species_in_p
+					dataPoint.append(calculator.getCharsToLastSpeciesInParagraph(pInfo));
+					dataPoint.append(",");
+					
+					// char_to_next_species_in_p
+					dataPoint.append(calculator.getCharsToNextSpeciesInParagraph(pInfo));
+					dataPoint.append(",");
+					
+					// char_to_study_in_p
+					dataPoint.append(calculator.getCharToStudyInParagraph(pInfo));
+					dataPoint.append(",");
+					
+					// char_to_studies_in_p
+					dataPoint.append(calculator.getCharToStudiesInParagraph(pInfo));
+					dataPoint.append(",");
+					
+					// char_to_in_in_s
+					dataPoint.append(calculator.getCharToInInSentence(pInfo));
+					dataPoint.append(",");
+					
+					// char_to_at_in_s
+					dataPoint.append(calculator.getCharToAtInSentence(pInfo));
+					dataPoint.append(",");
+					
+					//nr_char_to_last_applicant_in_text(end of applicant)
+					dataPoint.append(calculator.getCharsToLastApplicantInParagraph(pInfo));
+					dataPoint.append(",");
+					
+					// has comma
+					dataPoint.append(calculator.hasComma(pInfo));
+					dataPoint.append(",");
+					
+					// hasBracket
+					dataPoint.append(calculator.hasBracket(pInfo));
+					dataPoint.append(",");
+					
+					// type(0=Other,1=Town,2=County,3=State,4=Country,5=Suburb,6=POI,7=Zip,8=Ocean)
+					dataPoint.append(calculator.getType(place));
+					dataPoint.append(",");
+					
+					// chars_to_survey_in_s
+					dataPoint.append(calculator.getCharToSurveyInSentence(pInfo));
+					dataPoint.append(",");
+					
+					// chars_to_species_in_s
+					dataPoint.append(calculator.getCharToSpeciesInSentence(pInfo));
+					dataPoint.append(",");
+					
+					dataPoint.append("\n");
+					
+					arffContents.append(dataPoint);
+				}
+			}
+		}
+	}
+
+}