view src/de/mpiwg/anteater/places/ml/preprocessing/LocationDataCreator.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
line wrap: on
line source

package de.mpiwg.anteater.places.ml.preprocessing;

import java.util.ArrayList;
import java.util.List;

import de.mpiwg.anteater.AnteaterConfiguration;
import de.mpiwg.anteater.ml.ITextParser;
import de.mpiwg.anteater.ml.preprocessing.DataCreator;
import de.mpiwg.anteater.persons.APerson;
import de.mpiwg.anteater.persons.PersonsExtraction;
import de.mpiwg.anteater.places.Place;
import de.mpiwg.anteater.places.PlaceInformation;
import de.mpiwg.anteater.places.PlacesExtraction;
import de.mpiwg.anteater.results.ApplicantResult;
import de.mpiwg.anteater.results.SpeciesScientificResult;
import de.mpiwg.anteater.species.scientific.ScientificName;
import de.mpiwg.anteater.species.scientific.ScientificNamesExtraction;
import de.mpiwg.anteater.text.Paragraph;
import de.mpiwg.anteater.text.TextInformation;
import de.mpiwg.anteater.text.TextPart;

/**
 * {@link DataCreator} that writes one comma-separated feature row per
 * {@link Place} candidate into the ARFF buffer handed to
 * {@link #createFileContents(TextInformation, StringBuffer, ITextParser)}.
 * <p>
 * Species and applicant predictions supplied at construction time are passed
 * to the {@link LocationFeatureCalculator}; only predictions with a value of
 * at least 1.0 that refer to the same text part as the place extraction are
 * used.
 */
public class LocationDataCreator extends DataCreator {

	/** Species predictions, filtered per text part when rows are created. */
	private List<SpeciesScientificResult> predictedSpecies;
	/** Applicant predictions, filtered per text part when rows are created. */
	private List<ApplicantResult> predictedApplicants;

	/**
	 * @param configuration       passed through to {@link DataCreator}
	 * @param predictedSpecies    species results to take into account
	 * @param predictedApplicants applicant results to take into account
	 */
	public LocationDataCreator(AnteaterConfiguration configuration, List<SpeciesScientificResult> predictedSpecies, List<ApplicantResult> predictedApplicants) {
		super(configuration, "LOCATION_");
		this.predictedSpecies = predictedSpecies;
		this.predictedApplicants = predictedApplicants;
	}

	/**
	 * Appends a data row to {@code arffContents} for every place of every
	 * place reference of every places extraction in {@code info}.
	 * Extractions whose text part cannot be resolved are skipped.
	 *
	 * @param info         text whose places extractions are turned into rows
	 * @param arffContents buffer the rows are appended to
	 * @param textParser   used to split paragraphs into sentences; also handed
	 *                     to the feature calculator
	 */
	@Override
	public void createFileContents(TextInformation info,
			StringBuffer arffContents, ITextParser textParser) {
		List<PlacesExtraction> extractions = info.getPlacesExtractions();

		for (PlacesExtraction extraction : extractions) {
			List<PlaceInformation> references = extraction.getPlaceInformation();

			TextPart source = resolveSourceText(info, extraction);
			if (source == null) {
				continue;
			}

			List<ScientificName> speciesInSource = acceptedSpeciesFor(info, extraction);
			List<APerson> applicantsInSource = acceptedApplicantsFor(info, extraction);
			List<String> sentences = splitSourceIntoSentences(source, textParser);

			LocationFeatureCalculator calculator = new LocationFeatureCalculator(sentences, textParser, source);
			calculator.setFoundApplicant(applicantsInSource);
			calculator.setFoundSpecies(speciesInSource);

			for (PlaceInformation reference : references) {
				List<Place> candidates = reference.getPlaces();
				String[] referenceWords = reference.getReferenceInText().split(" ");

				for (Place candidate : candidates) {
					arffContents.append(buildLocationRow(calculator, reference, referenceWords, candidate));
				}
			}
		}
	}

	/**
	 * Looks up the text part a places extraction refers to: type 1 selects a
	 * summary, type 2 a supplementary information, both by the extraction's
	 * text index.
	 *
	 * @return the text part, or {@code null} for any other type
	 */
	private TextPart resolveSourceText(TextInformation info, PlacesExtraction extraction) {
		switch (extraction.getType()) {
			case 1:
				return info.getSummaries().get(extraction.getTextIdx());
			case 2:
				return info.getSupplInfos().get(extraction.getTextIdx());
			default:
				return null;
		}
	}

	/**
	 * Collects the findings of all species predictions that belong to
	 * {@code info}, have a prediction of at least 1.0 and share type and text
	 * index with {@code extraction}.
	 */
	private List<ScientificName> acceptedSpeciesFor(TextInformation info, PlacesExtraction extraction) {
		List<ScientificName> accepted = new ArrayList<ScientificName>();
		for (SpeciesScientificResult candidate : predictedSpecies) {
			if (candidate.getTextInfo() != info) {
				continue;
			}
			if (!(candidate.getPrediction() >= 1.0)) {
				continue;
			}
			ScientificNamesExtraction origin = candidate.getResult();
			boolean sameTextPart = origin.getType() == extraction.getType()
					&& origin.getTextIdx() == extraction.getTextIdx();
			if (sameTextPart) {
				accepted.add(candidate.getFinding());
			}
		}
		return accepted;
	}

	/**
	 * Collects the findings of all applicant predictions that belong to
	 * {@code info}, have a prediction of at least 1.0 and share type and text
	 * index with {@code extraction}.
	 */
	private List<APerson> acceptedApplicantsFor(TextInformation info, PlacesExtraction extraction) {
		List<APerson> accepted = new ArrayList<APerson>();
		for (ApplicantResult candidate : predictedApplicants) {
			if (candidate.getTextInfo() != info) {
				continue;
			}
			if (!(candidate.getPrediction() >= 1.0)) {
				continue;
			}
			PersonsExtraction origin = candidate.getResult();
			boolean sameTextPart = origin.getType() == extraction.getType()
					&& origin.getTextIdx() == extraction.getTextIdx();
			if (sameTextPart) {
				accepted.add(candidate.getFinding());
			}
		}
		return accepted;
	}

	/**
	 * Runs the text parser over every paragraph of {@code source}, gathers the
	 * resulting sentences in paragraph order and logs how long that took.
	 */
	private List<String> splitSourceIntoSentences(TextPart source, ITextParser textParser) {
		List<String> sentences = new ArrayList<String>();
		long startedAt = System.currentTimeMillis();
		for (Paragraph paragraph : source.getParagraphs()) {
			sentences.addAll(textParser.getSentences(paragraph.getParagraphText()));
		}
		long elapsed = System.currentTimeMillis() - startedAt;
		configuration.getLogger().logMessage(COMPONENT_NAME, "Splitting text into sentences: " + elapsed + "ms");
		return sentences;
	}

	/**
	 * Builds a single data row: the unknown-class symbol followed by all
	 * feature values, each followed by a comma, terminated by a newline.
	 * The separator after the last feature is emitted as well, so the row
	 * ends in {@code ",\n"}.
	 *
	 * @param calculator     feature calculator prepared for the current text part
	 * @param reference      the place reference the row is about
	 * @param referenceWords the reference text split at single spaces
	 * @param candidate      the concrete place the row is about
	 */
	private String buildLocationRow(LocationFeatureCalculator calculator, PlaceInformation reference,
			String[] referenceWords, Place candidate) {
		StringBuilder row = new StringBuilder();

		// class label is not known at this point
		row.append(UNKNOWN_CLASS_SYMBOL).append(",");

		// features derived from the words of the reference itself
		// numbers/words
		row.append(calculator.getNumberWordRelation(referenceWords)).append(",");
		// starts_with_uppercase/words
		row.append(calculator.getUppercasedWordsToAllRelation(referenceWords)).append(",");
		// contains_2_uppercase_letter_word
		row.append(calculator.contains2UppercaseCharacterWord(referenceWords)).append(",");
		// contains_university
		row.append(calculator.containsUniversity(referenceWords)).append(",");

		// features derived from the immediate surroundings of the reference
		// surrounded_by_comma
		row.append(calculator.isSurroundedByCommata(reference)).append(",");
		// surrounded_by_brackets
		row.append(calculator.isSurroundedByBrackets(reference)).append(",");
		// preceded_by_and
		row.append(calculator.isPreceededByAnd(reference)).append(",");
		// preceded_by_the
		row.append(calculator.isPreceededByThe(reference)).append(",");

		// character distances within the paragraph
		// char_to_last_species_in_p
		row.append(calculator.getCharsToLastSpeciesInParagraph(reference)).append(",");
		// char_to_next_species_in_p
		row.append(calculator.getCharsToNextSpeciesInParagraph(reference)).append(",");
		// char_to_study_in_p
		row.append(calculator.getCharToStudyInParagraph(reference)).append(",");
		// char_to_studies_in_p
		row.append(calculator.getCharToStudiesInParagraph(reference)).append(",");

		// character distances within the sentence
		// char_to_in_in_s
		row.append(calculator.getCharToInInSentence(reference)).append(",");
		// char_to_at_in_s
		row.append(calculator.getCharToAtInSentence(reference)).append(",");

		// nr_char_to_last_applicant_in_text (end of applicant)
		row.append(calculator.getCharsToLastApplicantInParagraph(reference)).append(",");

		// has_comma
		row.append(calculator.hasComma(reference)).append(",");
		// has_bracket
		row.append(calculator.hasBracket(reference)).append(",");

		// type (0=Other,1=Town,2=County,3=State,4=Country,5=Suburb,6=POI,7=Zip,8=Ocean)
		row.append(calculator.getType(candidate)).append(",");

		// chars_to_survey_in_s
		row.append(calculator.getCharToSurveyInSentence(reference)).append(",");
		// chars_to_species_in_s
		row.append(calculator.getCharToSpeciesInSentence(reference)).append(",");

		row.append("\n");
		return row.toString();
	}

}