Mercurial > hg > anteater
comparison src/de/mpiwg/anteater/persons/regex/ApplicantRegexFinder.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:036535fcd179 |
---|---|
1 package de.mpiwg.anteater.persons.regex; | |
2 | |
3 import java.util.ArrayList; | |
4 import java.util.HashMap; | |
5 import java.util.List; | |
6 import java.util.Map; | |
7 import java.util.regex.Matcher; | |
8 import java.util.regex.Pattern; | |
9 | |
10 import de.mpiwg.anteater.AnteaterConfiguration; | |
11 import de.mpiwg.anteater.persons.APerson; | |
12 import de.mpiwg.anteater.persons.Person; | |
13 import de.mpiwg.anteater.persons.PersonsExtraction; | |
14 import de.mpiwg.anteater.results.ApplicantResult; | |
15 import de.mpiwg.anteater.text.Paragraph; | |
16 import de.mpiwg.anteater.text.TextInformation; | |
17 import de.mpiwg.anteater.text.TextPart; | |
18 import de.mpiwg.anteater.text.TextType; | |
19 | |
20 public class ApplicantRegexFinder { | |
21 | |
22 public final static String COMPONENT_NAME = ApplicantRegexFinder.class.getSimpleName(); | |
23 | |
24 private String applicantRegex = "Applicant:\\p{Blank}{0,1}([A-Z]{2}-.+?,){0,1}\\p{Blank}{0,1}(.+?,( {0,1}Inc(.){0,1}| {0,1}LLC){0,1})"; | |
25 | |
26 private AnteaterConfiguration configuration; | |
27 | |
28 public ApplicantRegexFinder(AnteaterConfiguration configuration) { | |
29 this.configuration = configuration; | |
30 } | |
31 | |
32 public List<ApplicantResult> findApplicants(List<TextInformation> infos) { | |
33 configuration.getLogger().logMessage(COMPONENT_NAME, "Search for applicants with regular expressions."); | |
34 | |
35 List<ApplicantResult> results = new ArrayList<ApplicantResult>(); | |
36 | |
37 for (TextInformation info : infos) { | |
38 List<PersonsExtraction> extractions = info.getPersonsExtractions(); | |
39 | |
40 | |
41 // find applicants in summaries | |
42 List<TextPart> summaries = info.getSummaries(); | |
43 for (TextPart summary : summaries) { | |
44 | |
45 PersonsExtraction pExtraction =findExtraction(extractions, TextType.TYPE_SUMMARY, summaries.indexOf(summary)); | |
46 results.addAll(findApplicantsInText(info, pExtraction, summary)); | |
47 } | |
48 | |
49 // find applicants in suppleInf | |
50 List<TextPart> suppleInf = info.getSupplInfos(); | |
51 for (TextPart sInf : suppleInf) { | |
52 PersonsExtraction pExtraction = findExtraction(extractions, TextType.TYPE_SUPLINF, suppleInf.indexOf(sInf)); | |
53 results.addAll(findApplicantsInText(info, pExtraction, sInf)); | |
54 } | |
55 } | |
56 | |
57 configuration.getLogger().logMessage(COMPONENT_NAME, "Found " + results.size() + " applicant(s)."); | |
58 | |
59 return results; | |
60 } | |
61 | |
62 private PersonsExtraction findExtraction(List<PersonsExtraction> extractions, int textType, int textIdx) { | |
63 PersonsExtraction pExtraction = null; | |
64 | |
65 for (PersonsExtraction extr : extractions) { | |
66 if (extr.getType() == textType && extr.getTextIdx() == textIdx) { | |
67 pExtraction = extr; | |
68 break; | |
69 } | |
70 } | |
71 | |
72 if (pExtraction == null) { | |
73 pExtraction = new PersonsExtraction(); | |
74 pExtraction.setType(textType); | |
75 pExtraction.setTextIdx(textIdx); | |
76 pExtraction.setPerson(new ArrayList<APerson>()); | |
77 extractions.add(pExtraction); | |
78 } | |
79 | |
80 return pExtraction; | |
81 } | |
82 | |
83 private List<ApplicantResult> findApplicantsInText(TextInformation info, PersonsExtraction pExtraction, | |
84 TextPart textPart) { | |
85 List<Paragraph> paragraphs = textPart.getParagraphs(); | |
86 // find extraction object in list | |
87 | |
88 | |
89 | |
90 List<ApplicantResult> results = new ArrayList<ApplicantResult>(); | |
91 for (Paragraph para : paragraphs) { | |
92 Map<Integer, String> applicants = findApplicants(para.getParagraphText()); | |
93 for (Integer pos : applicants.keySet()) { | |
94 // find out if person was already found | |
95 APerson person = null; | |
96 int posInText = textPart.getPositionInTextFromParagraph(para, pos); | |
97 for (APerson p : pExtraction.getPersons()) | |
98 if (p.getStart() == posInText) { | |
99 person = p; | |
100 break; | |
101 } | |
102 | |
103 if (person == null) { | |
104 person = new Person(); | |
105 person.setStart(posInText); | |
106 pExtraction.getPersons().add(person); | |
107 } | |
108 person.setLength(applicants.get(pos).length()); | |
109 person.setReferenceInText(applicants.get(pos)); | |
110 | |
111 ApplicantResult result = new ApplicantResult(); | |
112 result.setPrediction(2.0); | |
113 result.setFinding(person); | |
114 result.setResult(pExtraction); | |
115 result.setTextInfo(info); | |
116 results.add(result); | |
117 } | |
118 } | |
119 | |
120 return results; | |
121 } | |
122 | |
123 public Map<Integer, String> findApplicants(String text) { | |
124 String paraText = text; | |
125 Pattern pattern = Pattern.compile(applicantRegex); | |
126 Matcher match = pattern.matcher(paraText); | |
127 | |
128 Map<Integer, String> applicants = new HashMap<Integer, String>(); | |
129 while (match.find()) { | |
130 String applicant = match.group(2); | |
131 int pos = match.start(2); | |
132 if (applicant.endsWith(",")) | |
133 applicant = applicant.substring(0, applicant.length() - 1); | |
134 applicants.put(pos, applicant); | |
135 } | |
136 | |
137 return applicants; | |
138 } | |
139 } |