Mercurial > hg > anteater
view src/de/mpiwg/anteater/persons/regex/ApplicantRegexFinder.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.anteater.persons.regex; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import de.mpiwg.anteater.AnteaterConfiguration; import de.mpiwg.anteater.persons.APerson; import de.mpiwg.anteater.persons.Person; import de.mpiwg.anteater.persons.PersonsExtraction; import de.mpiwg.anteater.results.ApplicantResult; import de.mpiwg.anteater.text.Paragraph; import de.mpiwg.anteater.text.TextInformation; import de.mpiwg.anteater.text.TextPart; import de.mpiwg.anteater.text.TextType; public class ApplicantRegexFinder { public final static String COMPONENT_NAME = ApplicantRegexFinder.class.getSimpleName(); private String applicantRegex = "Applicant:\\p{Blank}{0,1}([A-Z]{2}-.+?,){0,1}\\p{Blank}{0,1}(.+?,( {0,1}Inc(.){0,1}| {0,1}LLC){0,1})"; private AnteaterConfiguration configuration; public ApplicantRegexFinder(AnteaterConfiguration configuration) { this.configuration = configuration; } public List<ApplicantResult> findApplicants(List<TextInformation> infos) { configuration.getLogger().logMessage(COMPONENT_NAME, "Search for applicants with regular expressions."); List<ApplicantResult> results = new ArrayList<ApplicantResult>(); for (TextInformation info : infos) { List<PersonsExtraction> extractions = info.getPersonsExtractions(); // find applicants in summaries List<TextPart> summaries = info.getSummaries(); for (TextPart summary : summaries) { PersonsExtraction pExtraction =findExtraction(extractions, TextType.TYPE_SUMMARY, summaries.indexOf(summary)); results.addAll(findApplicantsInText(info, pExtraction, summary)); } // find applicants in suppleInf List<TextPart> suppleInf = info.getSupplInfos(); for (TextPart sInf : suppleInf) { PersonsExtraction pExtraction = findExtraction(extractions, TextType.TYPE_SUPLINF, suppleInf.indexOf(sInf)); results.addAll(findApplicantsInText(info, pExtraction, sInf)); } } configuration.getLogger().logMessage(COMPONENT_NAME, "Found " + results.size() + " applicant(s)."); return results; } private PersonsExtraction findExtraction(List<PersonsExtraction> extractions, int textType, int textIdx) { PersonsExtraction pExtraction = null; for (PersonsExtraction extr : extractions) { if (extr.getType() == textType && extr.getTextIdx() == textIdx) { pExtraction = extr; break; } } if (pExtraction == null) { pExtraction = new PersonsExtraction(); pExtraction.setType(textType); pExtraction.setTextIdx(textIdx); pExtraction.setPerson(new ArrayList<APerson>()); extractions.add(pExtraction); } return pExtraction; } private List<ApplicantResult> findApplicantsInText(TextInformation info, PersonsExtraction pExtraction, TextPart textPart) { List<Paragraph> paragraphs = textPart.getParagraphs(); // find extraction object in list List<ApplicantResult> results = new ArrayList<ApplicantResult>(); for (Paragraph para : paragraphs) { Map<Integer, String> applicants = findApplicants(para.getParagraphText()); for (Integer pos : applicants.keySet()) { // find out if person was already found APerson person = null; int posInText = textPart.getPositionInTextFromParagraph(para, pos); for (APerson p : pExtraction.getPersons()) if (p.getStart() == posInText) { person = p; break; } if (person == null) { person = new Person(); person.setStart(posInText); pExtraction.getPersons().add(person); } person.setLength(applicants.get(pos).length()); person.setReferenceInText(applicants.get(pos)); ApplicantResult result = new ApplicantResult(); result.setPrediction(2.0); result.setFinding(person); result.setResult(pExtraction); result.setTextInfo(info); results.add(result); } } return results; } public Map<Integer, String> findApplicants(String text) { String paraText = text; Pattern pattern = Pattern.compile(applicantRegex); Matcher match = pattern.matcher(paraText); Map<Integer, String> applicants = new HashMap<Integer, String>(); while (match.find()) { String applicant = match.group(2); int pos = match.start(2); if (applicant.endsWith(",")) applicant = applicant.substring(0, applicant.length() - 1); applicants.put(pos, applicant); } return applicants; } }