view src/de/mpiwg/anteater/persons/ml/preprocessing/ApplicantDataCreator.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
line wrap: on
line source

package de.mpiwg.anteater.persons.ml.preprocessing;

import java.util.ArrayList;
import java.util.List;

import de.mpiwg.anteater.AnteaterConfiguration;
import de.mpiwg.anteater.ml.ITextParser;
import de.mpiwg.anteater.ml.preprocessing.DataCreator;
import de.mpiwg.anteater.persons.APerson;
import de.mpiwg.anteater.persons.Location;
import de.mpiwg.anteater.persons.Organization;
import de.mpiwg.anteater.persons.Person;
import de.mpiwg.anteater.persons.PersonsExtraction;
import de.mpiwg.anteater.places.PlaceInformation;
import de.mpiwg.anteater.places.PlacesExtraction;
import de.mpiwg.anteater.species.scientific.ScientificName;
import de.mpiwg.anteater.species.scientific.ScientificNamesExtraction;
import de.mpiwg.anteater.text.Paragraph;
import de.mpiwg.anteater.text.TextInformation;
import de.mpiwg.anteater.text.TextPart;

/**
 * Writes one comma separated data row per person candidate found in the
 * summaries and supplementary information texts of a {@link TextInformation}.
 * <p>
 * Every row starts with {@code UNKNOWN_CLASS_SYMBOL} (the class of the
 * candidate is not known yet), followed by the feature values in this order:
 * text type, length of the reference in the text, sentence contains
 * "issued" / "applied" / "permit" / "comment", is subject, sentence contains
 * "applicant", distance to applicant, candidate kind (1 = person,
 * 2 = organization, 3 = location), three similarity features against
 * scientific names, three similarity features against place names, surrounded
 * by brackets, surrounded by commata, followed by 's, is abbreviation.
 */
public class ApplicantDataCreator extends DataCreator {

	/** Separator between two attribute values of a data row. */
	private static final String SEPARATOR = ",";

	/** Kind code for candidates that are neither person, organization nor location. */
	private static final int UNSUPPORTED_CANDIDATE = -1;

	/**
	 * @param configuration
	 *            configuration handed on to {@link DataCreator}; it also
	 *            provides the logger used for the timing messages.
	 */
	public ApplicantDataCreator(AnteaterConfiguration configuration) {
		// NOTE(review): "APPLICANT_" is presumably a prefix for the generated
		// files - confirm in DataCreator.
		super(configuration, "APPLICANT_");
	}

	/**
	 * Appends one data row for every supported person candidate of
	 * {@code info} to {@code arffContents}.
	 * <p>
	 * Extractions whose type is neither 1 (summary) nor 2 (supplementary
	 * information) are skipped, as are candidates that are neither
	 * {@link Person}, {@link Organization} nor {@link Location}.
	 *
	 * @param info
	 *            texts together with the persons, places and scientific names
	 *            extracted from them
	 * @param arffContents
	 *            buffer the rows are appended to, one line per candidate
	 * @param textParser
	 *            parser used to split paragraphs into sentences; it is also
	 *            passed on to the feature calculator
	 */
	@Override
	public void createFileContents(TextInformation info, StringBuffer arffContents, ITextParser textParser) {
		for (PersonsExtraction pResult : info.getPersonsExtractions()) {
			TextPart text = resolveText(info, pResult);
			if (text == null)
				continue;

			List<String> sentences = splitIntoSentences(text, textParser);

			// Held as a primitive so that the comparisons in the lookups
			// below are always done by value.
			int textIdx = pResult.getTextIdx();

			ApplicantFeatureCalculator calculator = new ApplicantFeatureCalculator(sentences, textParser, text);
			calculator.setNames(findNames(info, textIdx));
			calculator.setPlaces(findPlaces(info, textIdx));

			for (APerson candidate : pResult.getPersons()) {
				// Decide first whether the candidate gets a row at all, so
				// that no features are calculated (and no timing is logged)
				// for candidates that would be dropped anyway.
				int candidateKind = getCandidateKind(candidate);
				if (candidateKind == UNSUPPORTED_CANDIDATE)
					continue;

				arffContents.append(createDataRow(pResult, candidate, candidateKind, calculator));
			}
		}
	}

	/**
	 * Looks up the text a persons extraction refers to.
	 *
	 * @return the summary (type 1) or supplementary information (type 2) at
	 *         the text index of the extraction, or {@code null} for any
	 *         other type
	 */
	private TextPart resolveText(TextInformation info, PersonsExtraction pResult) {
		switch (pResult.getType()) {
			case 1: // summary
				return info.getSummaries().get(pResult.getTextIdx());
			case 2: // supplementary information
				return info.getSupplInfos().get(pResult.getTextIdx());
			default:
				return null;
		}
	}

	/**
	 * Splits all paragraphs of a text into sentences and logs how long that took.
	 */
	private List<String> splitIntoSentences(TextPart text, ITextParser textParser) {
		List<String> sentences = new ArrayList<String>();

		long start = System.currentTimeMillis();
		for (Paragraph paragraph : text.getParagraphs()) {
			sentences.addAll(textParser.getSentences(paragraph.getParagraphText()));
		}
		long end = System.currentTimeMillis();
		configuration.getLogger().logMessage(COMPONENT_NAME, "Splitting text into sentences: " + (end - start) + "ms");

		return sentences;
	}

	/**
	 * Returns the places extracted for the given text index, or an empty list
	 * if there is no such extraction. If several extractions match, the last
	 * one wins.
	 */
	private List<PlaceInformation> findPlaces(TextInformation info, int textIdx) {
		// NOTE(review): extractions are matched by text index only, the text
		// type (summary vs. supplementary information) is not compared -
		// confirm that indices cannot collide between the two.
		List<PlaceInformation> places = new ArrayList<PlaceInformation>();
		for (PlacesExtraction extraction : info.getPlacesExtractions()) {
			if (extraction.getTextIdx() == textIdx)
				places = extraction.getPlaceInformation();
		}
		return places;
	}

	/**
	 * Returns the scientific names extracted for the given text index, or an
	 * empty list if there is no such extraction. If several extractions
	 * match, the last one wins.
	 */
	private List<ScientificName> findNames(TextInformation info, int textIdx) {
		// NOTE(review): matched by text index only, see findPlaces.
		List<ScientificName> names = new ArrayList<ScientificName>();
		for (ScientificNamesExtraction extraction : info.getScientificNamesExtractions()) {
			if (extraction.getTextIdx() == textIdx)
				names = extraction.getNames();
		}
		return names;
	}

	/**
	 * Maps a candidate to the kind code written to the data row.
	 *
	 * @return 1 for a {@link Person}, 2 for an {@link Organization}, 3 for a
	 *         {@link Location}, {@link #UNSUPPORTED_CANDIDATE} otherwise
	 */
	private int getCandidateKind(APerson candidate) {
		if (candidate instanceof Person)
			return 1;
		if (candidate instanceof Organization)
			return 2;
		if (candidate instanceof Location)
			return 3;
		return UNSUPPORTED_CANDIDATE;
	}

	/**
	 * Builds the complete data row (including the trailing line break) for
	 * one candidate.
	 */
	private StringBuilder createDataRow(PersonsExtraction pResult, APerson candidate, int candidateKind,
			ApplicantFeatureCalculator calculator) {
		StringBuilder row = new StringBuilder();

		// unknown class
		row.append(UNKNOWN_CLASS_SYMBOL).append(SEPARATOR);

		// text type
		row.append(pResult.getType()).append(SEPARATOR);
		// name length
		row.append(candidate.getReferenceInText().length()).append(SEPARATOR);
		// contains issued
		row.append(calculator.getSentenceContainsIssued(candidate)).append(SEPARATOR);
		// contains applied
		row.append(calculator.getSentenceContainsApplied(candidate)).append(SEPARATOR);
		// contains permit
		row.append(calculator.getSentenceContainsPermit(candidate)).append(SEPARATOR);
		// contains comment
		row.append(calculator.getSentenceContainsComment(candidate)).append(SEPARATOR);

		// is subject (timed and logged separately)
		long start = System.currentTimeMillis();
		row.append(calculator.getIsSubject(candidate));
		long end = System.currentTimeMillis();
		configuration.getLogger().logMessage(COMPONENT_NAME, "Determining subject: " + (end - start) + "ms");
		row.append(SEPARATOR);

		// contains applicant
		row.append(calculator.getSentenceContainsApplicant(candidate)).append(SEPARATOR);
		// distance term to applicant
		row.append(calculator.getDistanceCandidateToApplicant(candidate)).append(SEPARATOR);
		// person, organization, location
		row.append(candidateKind).append(SEPARATOR);

		// similarity to species names
		row.append(calculator.getSimilarityPersonNameForPerson(candidate)).append(SEPARATOR);
		row.append(calculator.getSimilarityPersonNameForName(candidate)).append(SEPARATOR);
		row.append(calculator.doPersonAndNameStartAtSameIdx(candidate)).append(SEPARATOR);

		// similarity to place names
		row.append(calculator.getSimilarityPersonPlaceForPerson(candidate)).append(SEPARATOR);
		row.append(calculator.getSimilarityPersonPlaceForPlace(candidate)).append(SEPARATOR);
		row.append(calculator.doPersonAndPlaceStartAtSameIdx(candidate)).append(SEPARATOR);

		// is surrounded by brackets
		row.append(calculator.isSurroundedByBrackets(candidate)).append(SEPARATOR);
		// is surrounded by commata
		row.append(calculator.isSurroundedByCommata(candidate)).append(SEPARATOR);
		// followed by 's
		row.append(calculator.isFollowedBy_s(candidate)).append(SEPARATOR);

		// is abbreviation (last value, no separator)
		row.append(calculator.getIsAbbreviation(candidate));

		row.append("\n");
		return row;
	}

}