changeset 1:7a4341c9f2e5

checking permit numbers for similarity if no direct match
author jdamerow
date Fri, 05 Oct 2012 18:52:14 -0700
parents 036535fcd179
children 1c2b4f5e2c05
files .classpath eventsXML/01-14522.xml eventsXML/2010-23822.xml mlAnalysis/APPLICANT_01-14522.arff mlAnalysis/LOCATION_01-14522.arff results/01-14522.xml results/2010-23822.xml src/de/mpiwg/anteater/events/processors/PermitOrApplicantEventProcessor.java texts/01-14522.xml
diffstat 9 files changed, 417 insertions(+), 344 deletions(-) [+]
line wrap: on
line diff
--- a/.classpath	Fri Sep 14 10:30:43 2012 +0200
+++ b/.classpath	Fri Oct 05 18:52:14 2012 -0700
@@ -24,5 +24,6 @@
 	<classpathentry kind="lib" path="lib/lingpipe-4.1.0.jar"/>
 	<classpathentry kind="lib" path="lib/icu4j-4_8_1_1.jar"/>
 	<classpathentry kind="lib" path="lib/commons-lang3-3.1.jar"/>
+	<classpathentry kind="lib" path="lib/simmetrics_jar_v1_6_2_d07_02_07.jar"/>
 	<classpathentry kind="output" path="bin"/>
 </classpath>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/eventsXML/01-14522.xml	Fri Oct 05 18:52:14 2012 -0700
@@ -0,0 +1,56 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<events>
+	<event text="01-14522.xml" date_filed="6-7-01">
+		<applicants>
+			<applicant>
+				<name>Ocean Alliance/Whale Conservation Institute</name>
+				<applicant_institutions />
+				<applicant_locations>
+					<applicant_locations>
+						<name>Lincoln, Massachusetts 01773</name>
+						<place_information type="Town" woeId="2439449"
+							latitude="42.4141" longitude="-71.3257">Lincoln, MA, US</place_information>
+					</applicant_locations>
+				</applicant_locations>
+			</applicant>
+			<applicant>
+				<name>Dr. Roger S. Payne</name>
+				<applicant_institutions />
+				<applicant_locations>
+					<applicant_locations>
+						<name>Seattle, WA 98115-0070</name>
+						<place_information type="Zip" woeId="12798958"
+							latitude="47.6858" longitude="-122.283">98115, Seattle, WA, US
+						</place_information>
+					</applicant_locations>
+				</applicant_locations>
+			</applicant>
+		</applicants>
+		<research_locations />
+		<researched_species />
+	</event>
+	<event text="01-14522.xml" date_filed="6-7-01">
+		<applicants>
+			<applicant>
+				<name>Dr. Peter L. Tyack</name>
+				<applicant_institutions />
+				<applicant_locations>
+					<applicant_locations>
+						<name>Woods Hole, MA 02543</name>
+						<place_information type="Zip" woeId="12758881"
+							latitude="41.5294" longitude="-70.6659">02543, Woods Hole, MA, US
+						</place_information>
+					</applicant_locations>
+				</applicant_locations>
+			</applicant>
+		</applicants>
+		<research_locations>
+			<research_location>
+				<name>Pa</name>
+				<place_information type="State" woeId="2347597"
+					latitude="40.9947" longitude="-77.6045">Pennsylvania, US</place_information>
+			</research_location>
+		</research_locations>
+		<researched_species />
+	</event>
+</events>
--- a/eventsXML/2010-23822.xml	Fri Sep 14 10:30:43 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,145 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<events>
-	<event text="2010-23822.xml" date_filed="9-22-10">
-		<applicants>
-			<applicant>
-				<name>University of Connecticut</name>
-				<applicant_institutions />
-				<applicant_locations />
-			</applicant>
-		</applicants>
-		<research_locations />
-		<researched_species>
-			<species identified_name="Propithecus tattersalli">Propithecus tattersalli</species>
-		</researched_species>
-	</event>
-	<event text="2010-23822.xml" date_filed="9-22-10">
-		<applicants>
-			<applicant>
-				<name>Christina Marisa Tellez</name>
-				<applicant_institutions>
-					<applicant_institution>
-						<name>UCLA), Los Angeles, CA</name>
-						<place_information type="POI" woeId="23511626"
-							latitude="34.0644" longitude="-118.445">University of California Los
-							Angeles, Los Angeles, CA, US</place_information>
-					</applicant_institution>
-				</applicant_institutions>
-				<applicant_locations />
-			</applicant>
-		</applicants>
-		<research_locations />
-		<researched_species>
-			<species identified_name="Crocodylus acutus">Crocodylus acutus</species>
-			<species identified_name="Crocodylus moreletti">Crocodylus moreletti</species>
-			<species identified_name="Damaliscus pygargus pygargus">Damaliscus pygargus pygargus
-			</species>
-		</researched_species>
-	</event>
-	<event text="2010-23822.xml" date_filed="9-22-10">
-		<applicants>
-			<applicant>
-				<name>Steven Louis</name>
-				<applicant_institutions />
-				<applicant_locations>
-					<applicant_locations>
-						<name>Richland Center</name>
-						<place_information type="Town" woeId="2480850"
-							latitude="43.3424" longitude="-90.3865">Richland Center, WI, US
-						</place_information>
-					</applicant_locations>
-				</applicant_locations>
-			</applicant>
-			<applicant>
-				<name>Selmer Erickson</name>
-				<applicant_institutions />
-				<applicant_locations>
-					<applicant_locations>
-						<name>Anchorage</name>
-						<place_information type="Town" woeId="2354490"
-							latitude="61.2176" longitude="-149.858">Anchorage, AK, US
-						</place_information>
-					</applicant_locations>
-				</applicant_locations>
-			</applicant>
-			<applicant>
-				<name>U.S. Fish and Wildlife Service</name>
-				<applicant_institutions />
-				<applicant_locations>
-					<applicant_locations>
-						<name>Anchorage</name>
-						<place_information type="Town" woeId="2354490"
-							latitude="61.2176" longitude="-149.858">Anchorage, AK, US
-						</place_information>
-					</applicant_locations>
-				</applicant_locations>
-			</applicant>
-		</applicants>
-		<research_locations>
-			<research_location>
-				<name>MN</name>
-				<place_information type="State" woeId="2347582"
-					latitude="46.4423" longitude="-93.3659">Minnesota, US</place_information>
-			</research_location>
-			<research_location>
-				<name>Alaska</name>
-				<place_information type="State" woeId="2347560"
-					latitude="63.0365" longitude="-149.106">Alaska, US</place_information>
-			</research_location>
-			<research_location>
-				<name>Alaska</name>
-				<place_information type="State" woeId="2347560"
-					latitude="63.0365" longitude="-149.106">Alaska, US</place_information>
-			</research_location>
-		</research_locations>
-		<researched_species>
-			<species identified_name="Ursus maritimus">Ursus maritimus</species>
-		</researched_species>
-	</event>
-	<event text="2010-23822.xml" date_filed="9-22-10">
-		<applicants>
-			<applicant>
-				<name>Indianapolis Zoological Society</name>
-				<applicant_institutions />
-				<applicant_locations>
-					<applicant_locations>
-						<name>Indianapolis</name>
-						<place_information type="Town" woeId="2427032"
-							latitude="39.7669" longitude="-86.15">Indianapolis, IN, US
-						</place_information>
-					</applicant_locations>
-				</applicant_locations>
-			</applicant>
-		</applicants>
-		<research_locations>
-			<research_location>
-				<name>Pacific</name>
-				<place_information type="Ocean" woeId="55959717"
-					latitude="0.89316" longitude="-154.721">Pacific Ocean</place_information>
-			</research_location>
-		</research_locations>
-		<researched_species>
-			<species identified_name="Odobenus rosmarus divergens">Odobenus rosmarus divergens
-			</species>
-		</researched_species>
-	</event>
-	<event text="2010-23822.xml" date_filed="9-22-10">
-		<applicants>
-			<applicant>
-				<name>Thomas A. Postel</name>
-				<applicant_institutions />
-				<applicant_locations>
-					<applicant_locations>
-						<name>Minneola, FL</name>
-						<place_information type="Town" woeId="2452128"
-							latitude="28.5756" longitude="-81.7481">Minneola, FL, US</place_information>
-					</applicant_locations>
-				</applicant_locations>
-			</applicant>
-		</applicants>
-		<research_locations />
-		<researched_species>
-			<species identified_name="Trichechus manatus">Trichechus manatus</species>
-		</researched_species>
-	</event>
-</events>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mlAnalysis/APPLICANT_01-14522.arff	Fri Oct 05 18:52:14 2012 -0700
@@ -0,0 +1,85 @@
+@relation endangeredSpecies_applicant_dataset
+
+@attribute isApplicant {0,1}
+@attribute text_type {1, 2}
+@attribute name_length integer
+@attribute issued {0, 1}
+@attribute applied {0, 1}
+@attribute permit {0, 1}
+@attribute comment {0, 1}
+@attribute is_subject {0, 1}
+@attribute applicant {0, 1}
+@attribute char_applicant_to_name integer
+@attribute pers_org_loc {1,2,3}
+@attribute GNRD-nlp_overlap_nlp real
+@attribute GNRD-nlp_overlap_GNRD real
+@attribute start_idx_eq_GNRD {0, 1}
+@attribute Placemaker-nlp_overlap_nlp real
+@attribute Placemaker-nlp_overlap_pl real
+@attribute start_idx_eq_placemaker {0, 1}
+@attribute surrounded_by_brackets {0, 1}
+@attribute surrounded_by_commata {0, 1}
+@attribute followed_by_s {0, 1}
+@attribute isAbbreviation {0,1}
+
+@data
+?,1,4,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,1,43,0,0,1,0,1,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,1,11,0,0,1,0,0,0,0,3,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,1,7,0,0,1,0,0,0,0,3,0.0,0.0,0,0.0,0.0,0,1,1,0,0
+?,1,13,0,0,1,0,0,0,0,3,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,1,18,0,0,1,0,0,0,0,1,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,1,4,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,1,4,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,1,33,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,1,0,0
+?,1,4,0,0,1,0,0,0,0,3,0.0,0.0,0,0.0,0.0,0,1,1,0,0
+?,1,7,0,0,1,0,0,0,0,3,0.0,0.0,0,0.0,0.0,0,1,1,0,0
+?,1,2,0,0,1,0,0,0,0,3,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,1,18,0,0,1,0,0,0,0,1,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,1,18,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,1,0,0
+?,1,36,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,1,0,0
+?,1,2,0,0,1,0,0,0,0,3,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,4,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,27,0,0,0,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,4,0,0,0,0,0,1,-904,3,0.0,0.0,0,0.0,1.0,1,1,0,0,0
+?,2,16,0,0,0,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,4,0,0,0,0,0,0,0,1,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,36,0,0,0,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,2,0,0,0,0,0,0,0,3,0.0,0.0,0,1.0,1.0,1,1,0,0,0
+?,2,13,0,0,0,0,0,0,0,3,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,13,0,0,0,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,6,0,0,0,0,0,0,0,3,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,14,0,0,0,0,0,0,0,3,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,9,0,0,0,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,14,0,0,0,0,0,0,0,3,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,14,0,0,0,0,0,0,0,3,0.0,0.0,0,1.0,1.0,1,1,0,0,0
+?,2,33,0,0,0,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,29,0,0,0,1,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,1,0,0
+?,2,4,0,0,0,1,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,1,0,0
+?,2,4,0,0,0,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,24,0,0,0,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,32,0,0,0,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,29,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,1,0,0
+?,2,4,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,1,0,0
+?,2,16,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,4,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,1,0,0
+?,2,4,0,0,1,0,0,0,0,3,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,7,0,0,0,0,0,0,0,3,0.0,0.0,0,1.0,0.0,1,1,1,0,0
+?,2,2,0,0,0,0,0,0,0,3,0.0,0.0,0,1.0,0.0,0,1,0,0,0
+?,2,13,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,1.0,1,1,0,0,0
+?,2,4,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,1,0,0
+?,2,6,0,0,1,0,0,0,0,3,0.0,0.0,0,1.0,0.0,1,1,1,0,0
+?,2,2,0,0,1,0,0,0,0,3,0.0,0.0,0,1.0,0.0,0,1,0,0,0
+?,2,16,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,4,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,1,0,0
+?,2,16,0,0,1,0,0,0,0,3,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,10,0,0,1,0,0,0,0,3,0.0,0.0,0,1.0,0.0,1,1,1,0,0
+?,2,2,0,0,1,0,0,0,0,3,0.0,0.0,0,1.0,0.0,0,1,0,0,0
+?,2,16,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,4,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,1,0,0
+?,2,15,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,0,0,0
+?,2,10,0,0,1,0,0,0,0,3,0.0,0.0,0,1.0,0.0,1,1,1,0,0
+?,2,2,0,0,1,0,0,0,0,3,0.0,0.0,0,1.0,0.0,0,1,0,0,0
+?,2,4,0,0,1,0,0,0,0,2,0.0,0.0,0,0.0,0.0,0,1,1,0,0
+?,2,14,0,0,1,0,0,0,0,3,0.0,0.0,0,0.0,0.0,0,1,1,0,0
+?,2,2,0,0,1,0,0,0,0,3,0.0,0.0,0,1.0,0.0,1,1,0,0,0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mlAnalysis/LOCATION_01-14522.arff	Fri Oct 05 18:52:14 2012 -0700
@@ -0,0 +1,46 @@
+@relation location_training_data
+
+@attribute  location_type {0,1,2,3}
+@attribute numbers/words real
+@attribute starts_with_uppercase/words real
+@attribute contains_2_uppercase_letter_word {0,1}
+@attribute contains_university {0,1}
+@attribute surrounded_by_comma {0,1}
+@attribute surrounded_by_brackets {0,1}
+@attribute preceeded_by_and {0,1}
+@attribute preceeded_by_the {0,1}
+@attribute char_to_last_species_in_p integer
+@attribute char_to_next_species_in_p integer
+@attribute char_to_study_in_p integer
+@attribute char_to_studies_in_p integer
+@attribute char_to_in_in_s integer
+@attribute char_to_at_in_s integer
+@attribute nr_char_to_last_applicant_in_text integer
+@attribute has_comma {0,1}
+@attribute has_brackets {0,1}
+@attribute type {0,1,2,3,4,5,6,7,8}
+@attribute chars_to_survey_in_s integer
+@attribute chars_to_species_in_s integer
+
+@data
+?,0.3333333333333333,0.6666666666666666,1,0,0,0,0,0,-1,-1,0,0,0,0,197,1,0,7,0,0,
+?,0.3333333333333333,0.6666666666666666,0,0,0,0,0,0,-1,-1,0,0,0,0,19,1,0,1,0,0,
+?,0.25,0.75,1,0,0,0,0,0,-1,-1,0,0,0,0,60,1,0,7,0,0,
+?,0.0,1.0,0,0,0,0,0,0,-1,-1,0,0,0,0,6,0,0,5,0,0,
+?,1.0,0.0,0,0,0,0,0,0,-1,-1,0,0,0,0,236,0,0,7,0,0,
+?,0.25,0.75,1,0,0,0,0,0,-1,-1,0,0,0,0,-1,1,0,7,0,0,
+?,0.25,0.75,1,0,0,0,0,0,-1,-1,0,0,0,0,-1,1,0,7,0,0,
+?,0.0,1.0,0,0,0,0,0,0,-1,-1,0,0,0,0,-1,0,0,3,0,0,
+?,0.0,1.0,0,0,0,0,0,0,-1,-1,0,0,0,0,-1,0,0,3,0,0,
+?,0.0,1.0,0,0,0,0,0,0,-1,-1,-384,0,-56,-137,-1,0,0,3,-44,0,
+?,0.25,0.75,1,0,0,0,0,0,-1,-1,0,0,0,0,-1,1,0,7,0,0,
+?,0.3333333333333333,0.6666666666666666,1,0,0,0,0,0,-1,-1,0,0,0,0,-1,1,0,7,0,0,
+?,0.3333333333333333,0.6666666666666666,1,0,0,0,0,0,-1,-1,0,0,0,0,-1,1,0,7,0,0,
+?,0.3333333333333333,0.6666666666666666,1,0,0,0,0,0,-1,-1,0,0,0,0,-1,1,0,7,0,0,
+?,0.5,0.5,1,0,0,0,0,0,-1,-1,0,0,0,0,-1,0,0,7,0,0,
+?,0.0,1.0,0,0,0,0,0,0,-1,-1,0,0,0,0,-1,0,0,1,0,0,
+?,0.0,0.6666666666666666,0,0,0,0,1,0,-1,-1,0,0,0,0,-1,0,0,0,0,0,
+?,0.0,1.0,0,0,0,0,0,0,-1,-1,0,0,0,0,-1,0,0,4,0,0,
+?,0.0,1.0,0,0,0,0,0,0,-1,-1,0,0,0,0,-1,0,0,7,0,0,
+?,0.0,1.0,0,0,0,0,0,0,-1,-1,0,0,0,0,-1,0,0,2,0,0,
+?,0.0,1.0,0,0,0,0,0,0,-1,-1,0,0,0,0,-1,1,0,1,0,0,
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/results/01-14522.xml	Fri Oct 05 18:52:14 2012 -0700
@@ -0,0 +1,141 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<results>
+	<summaries>
+
+		<summary>
+			<p type="2">SUMMARY:</p>
+			<p type="1">Notice is hereby given of the following actions regarding
+				permits for takes of marine mammal species for the purposes of
+				scientific research:</p>
+			<p type="1">
+				NMFS has received a permit application from:
+				<applicant>Ocean Alliance/Whale Conservation Institute</applicant>
+				, 191 Weston Road,
+				<applicant_location woeId="2439449" type="Town"
+					name="Lincoln, MA, US">Lincoln, Massachusetts 01773</applicant_location>
+				(
+				<applicant>Dr. Roger S. Payne</applicant>
+				, Principal Investigator) (Application No. 751-1614-00); NMFS has
+				received applications for permit amendments from:; NMFS, National
+				Marine Mammal Laboratory, 7600 Sand Point Way, N.E., BIN C15700,
+				<applicant_location woeId="12798958" type="Zip"
+					name="98115, Seattle, WA, US">Seattle, WA 98115-0070</applicant_location>
+				(Permit No. 782-532-00)); and
+				<applicant>Dr. Peter L. Tyack</applicant>
+				, Biology Department, Woods Hole Oceanographic Institution,
+				<applicant_location woeId="12758881" type="Zip"
+					name="02543, Woods Hole, MA, US">Woods Hole, MA 02543</applicant_location>
+				(Permit No. 981-1578).
+			</p>
+		</summary>
+	</summaries>
+	<supplementary_information>
+
+		<supplInfo>
+			<p type="2">SUPPLEMENTARY INFORMATION:</p>
+			<p type="1">The subject application and permit amendments are requested
+				under the authority of the Marine Mammal Protection Act of 1972, as
+				amended (MMPA; 16 U.S.C. 1361 et seq .), the Regulations Governing
+				the Taking and Importing of Marine Mammals (50 CFR part 216), the
+				Endangered Species Act of 1973, as amended (ESA; 16 U.S.C. 1531 et
+				seq .), the regulations governing the taking, importing, and
+				exporting of endangered and threatened species (50 CFR 222-226), and
+				the Fur Seal Act of 1966, as amended (16 U.S.C. 1151 et seq. )</p>
+			<p type="2">New Application Received</p>
+			<p type="1">For Application No. 715-1614-00, the applicant requests
+				permission to conduct vessel and aerial surveys, collect tissue
+				samples (sloughed skin and skin and blubber biopsies) from living,
+				free-ranging animals and collect skin, blubber, blood, bone, baleen
+				and other organ tissue samples from dead stranded animals from all
+				age and sex classes of 21 cetacean species in U.S., foreign, and
+				international waters. Tissue samples would be used to quantify
+				toxicant loads and immunochemical responses to these loads to test
+				the hypothesis that there are demonstrable differences between
+				different populations ans species with regard to the levels of toxic
+				compounds present. Genetic analyses would also be performed on
+				samples to investigate the genetic diversity and variability of the
+				population groups sampled. This information would be used to
+				establish a baseline for comparisons with future samples and to
+				assist in making future management and conservation policies.</p>
+			<p type="2">Permit Amendment Requests Received</p>
+			<p type="1">
+				For Permit No. 782-1532-00, the Permit authorizes the Holder to take
+				Steller sea lions (
+				<species_scientific name="Eumetopias jubatus">Eumetopias jubatus
+				</species_scientific>
+				) for research that involves takes by aerial and ship based surveys
+				biennally, capture and take morphological measurements, collect
+				specimens (blood and biopsy), brand, tag, and disturb during scat
+				collection. The Holder now requests to amend the take authority to
+				conduct aerial surveys each year, include Southeast Alaska in
+				monthly surveys, increase the number of animals to be incidentally
+				harassed during scat collection, allow additional procedures for
+				animal handling such as: using gas anesthesia, branding pups ≥4 mos
+				and juveniles to 3 yrs, injecting Evan’s blue dye and deuterated
+				water, collecting muscle biopsy, using noninvasive bioelectric
+				impedance analysis, increasing blood sample volume, extracting a
+				tooth, and pulling vibrissae. This Permit amendment will improve
+				field techniques and incorporate collaborative efforts of scientists
+				funded under the Steller Sea Lion Research Initiative.
+			</p>
+			<p type="1">
+				For Permit No. 981-1578-00, the Permit authorizes the Holder to tag
+				cetaceans with an advanced digital sound recording tag (DTAG) that
+				can record the acoustic stimuli an animal hears, along with
+				measuring vocal, behavioral, and physiological responses to sound
+				played back at received levels of 120-160 dB re 1 micron Pa. The
+				research was authorized in the Mediterranean and Ligurian Seas and
+				off the coast of the Azores in the North Atlantic. The Holder
+				requests an amendment to increase the source level but not the
+				received level for a whale-finding sonar to 200 dB re 1 micron
+				<research_location woeId="2347597" type="State"
+					name="Pennsylvania, US">Pa</research_location>
+				at 1 m, add playbacks involving exposure to impulse signals from
+				airguns as used in seismic surveys, include one additional baleen
+				whale species and 12 species of Odontocete whale, and extend the
+				study area to include North Atlantic and Gulf of Mexico.
+			</p>
+			<p type="1">In compliance with the National Environmental Policy Act of
+				1969 (42 U.S.C. 4321 et seq .), an initial determination has been
+				made that the activities proposed are categorically excluded from
+				the requirement to prepare an environmental assessment or
+				environmental impact statement.</p>
+			<p type="1">Written comments or requests for a public hearing on the
+				application or amendment requests should be mailed to the Chief,
+				Permits and Documentation Division, F/PR1, Office of Protected
+				Resources, NMFS, 1315 East-West Highway, Room 13705, Silver Spring,
+				MD 20910. Those individuals requesting a hearing should set forth
+				the specific reasons why a hearing on these particular requests
+				would be appropriate.</p>
+			<p type="1">Comments may also be submitted by facsimile at (301)
+				713-0376, provided the facsimile is confirmed by hard copy submitted
+				by mail and postmarked no later than the closing date of the comment
+				period. Please note that comments will not be accepted by e-mail or
+				by other electronic media.</p>
+			<p type="1">Concurrent with the publication of this notice in the
+				Federal Register , NMFS is forwarding copies of thee application and
+				amendment requests to the Marine Mammal Commission and its Committee
+				of Scientific Advisors.</p>
+			<p type="1">Documents may be reviewed in the following locations:</p>
+			<p type="1">For all permits and permit amendments: Permits and
+				Documentation Division, Office of Protected Resources, NMFS, 1315
+				East-West Highway, Room 13705, Silver Spring, MD 20910; phone (301)
+				713-2289; fax (301) 713-0376;</p>
+			<p type="1">For permit 751-1614-00: Northwest Region, NMFS, 7600 Sand
+				Point Way NE, BIN C15700, Bldg. 1, Seattle, WA 98115-0700; phone
+				(206) 526-6150; fax (206) 526-6426;</p>
+			<p type="1">For permits 751-1614-00 and 782-1532-00: Alaska Region,
+				NMFS, P.O. Box 21668, Juneau, AK 99802-1668; phone (907) 586-7221;
+				fax (907) 586-7249;</p>
+			<p type="1">For permit 751-1614-00: Southwest Region, NMFS, 501 West
+				Ocean Blvd., Suite 4200, Long Beach, CA 90802-4213; phone (562)
+				980-4001; fax (562) 980-4018;</p>
+			<p type="1">For permits 751-1614-00 and 981-1578-00: Northeast Region,
+				NMFS, One Blackburn Drive, Gloucester, MA 01930-2298; phone (978)
+				281-9200; fax (978) 281-9371; and</p>
+			<p type="1">For permits 751-1614-00 and 981-1578-00: Southeast Region,
+				NMFS, 9721 Executive Center Drive North, St. Petersburg, FL
+				33702-2432; phone (727) 570-5301; fax (727) 570-5320.</p>
+		</supplInfo>
+	</supplementary_information>
+</results>
--- a/results/2010-23822.xml	Fri Sep 14 10:30:43 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,199 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<results>
-	<summaries>
-
-		<summary>
-			<p type="2">SUMMARY:</p>
-			<p type="1">&amp;We, the U.S. Fish and Wildlife Service, invite the
-				public to comment on the following applications to conduct certain
-				activities with endangered species, marine mammals, or both. With
-				some exceptions, the Endangered Species Act (ESA) and Marine Mammal
-				Protection Act (MMPA) prohibits activities with listed species
-				unless a Federal permit is issued that allows such activities. Both
-				laws require that we invite public comment before issuing these
-				permits.</p>
-		</summary>
-	</summaries>
-	<supplementary_information>
-
-		<supplInfo>
-			<p type="2">SUPPLEMENTARY INFORMATION:</p>
-			<p type="2">I. Public Comment Procedures</p>
-			<p type="2">A. How Do I Request Copies of Applications or Comment on
-				Submitted Applications?</p>
-			<p type="1">Send your request for copies of applications or comments
-				and materials concerning any of the applications to the contact
-				listed under ADDRESSES . Please include the Federal Register notice
-				publication date, the PRT-number, and the name of the applicant in
-				your request or submission. We will not consider requests or
-				comments sent to an e-mail or address not listed under ADDRESSES .
-				If you provide an email address in your request for copies of
-				applications, we will attempt to respond to your request
-				electronically.</p>
-			<p type="1">Please make your requests or comments as specific as
-				possible. Please confine your comments to issues for which we seek
-				comments in this notice, and explain the basis for your comments.
-				Include sufficient information with your comments to allow us to
-				authenticate any scientific or commercial data you include.</p>
-			<p type="1">The comments and recommendations that will be most useful
-				and likely to influence agency decisions are: (1) Those supported by
-				quantitative information or studies; and (2) Those that include
-				citations to, and analyses of, the applicable laws and regulations.
-				We will not consider or include in our administrative record
-				comments we receive after the close of the comment period (see
-				DATES) or comments delivered to an address other than those listed
-				above (see ADDRESSES ).</p>
-			<p type="2">B. May I Review Comments Submitted by Others?</p>
-			<p type="1">Comments, including names and street addresses of
-				respondents, will be available for public review at the address
-				listed under ADDRESSES . The public may review documents and other
-				information applicants have sent in support of the application
-				unless our allowing viewing would violate the Privacy Act or Freedom
-				of Information Act. Before including your address, phone number,
-				e-mail address, or other personal identifying information in your
-				comment, you should be aware that your entire comment—including your
-				personal identifying information—may be made publicly available at
-				any time. While you can ask us in your comment to withhold your
-				personal identifying information from public review, we cannot
-				guarantee that we will be able to do so.</p>
-			<p type="2">II. Background</p>
-			<p type="1">To help us carry out our conservation responsibilities for
-				affected species, the Endangered Species Act of 1973, section
-				10(a)(1)(A), as amended (16 U.S.C. 1531 et seq. ), and our
-				regulations in the Code of Federal Regulations (CFR) at 50 CFR 17,
-				the Marine Mammal Protection Act of 1972, as amended (16 U.S.C. 1361
-				et seq. ), and our regulations in the Code of Federal Regulations
-				(CFR) at 50 CFR 18 require that we invite public comment before
-				final action on these permit applications. Under the MMPA, you may
-				request a hearing on any MMPA application received. If you request a
-				hearing, give specific reasons why a hearing would be appropriate.
-				The holding of such a hearing is at the discretion of the Service
-				Director.</p>
-			<p type="2">III. Permit Applications</p>
-			<p type="2">A. Endangered Species</p>
-			<p type="2">
-				Applicant:
-				<applicant>University of Connecticut</applicant>
-				, Storrs, CT; PRT-14240A
-			</p>
-			<p type="1">
-				The applicant requests a permit to export biological samples from
-				captive born golden-crowned sifaka (
-				<species_scientific name="Propithecus tattersalli">Propithecus tattersalli
-				</species_scientific>
-				) for the purpose of scientific research. This notification covers
-				activities to be conducted by the applicant over a 5-year period.
-			</p>
-			<p type="2">
-				Applicant:
-				<applicant>Christina Marisa Tellez</applicant>
-				, University of California Los Angeles (
-				<applicant_institution woeId="23511626" type="POI"
-					name="University of California Los Angeles, Los Angeles, CA, US">UCLA), Los Angeles, CA</applicant_institution>
-				; PRT-10564A
-			</p>
-			<p type="1">
-				The applicant requests a permit to import biological samples from
-				American crocodile (
-				<species_scientific name="Crocodylus acutus">Crocodylus acutus
-				</species_scientific>
-				), and Morelet's crocodile (
-				<species_scientific name="Crocodylus moreletti">Crocodylus moreletti
-				</species_scientific>
-				) from Belize for the purpose of enhancement of the species through
-				scientific research. This notification covers activities conducted
-				by the applicant over a 5-year period.
-			</p>
-			<p type="2">Multiple Applicants</p>
-			<p type="1">
-				The following applicants each request a permit to import the
-				sport-hunted trophy of one male bontebok (
-				<species_scientific name="Damaliscus pygargus pygargus">Damaliscus pygargus
-					pygargus</species_scientific>
-				) culled from a captive herd maintained under the management program
-				of the Republic of South Africa, for the purpose of enhancement of
-				the survival of the species.
-			</p>
-			<p type="2">
-				Applicant:
-				<applicant>Steven Louis</applicant>
-				,
-				<applicant_location woeId="2480850" type="Town"
-					name="Richland Center, WI, US">Richland Center</applicant_location>
-				, WI; PRT-21605A
-			</p>
-			<p type="2">
-				Applicant:
-				<applicant>Selmer Erickson</applicant>
-				<error type="research_location">, Park</error>
-				Rapids,
-				<research_location woeId="2347582" type="State"
-					name="Minnesota, US">MN</research_location>
-				; PRT-21574A
-			</p>
-			<p type="2">B. Endangered Marine Mammals and Marine Mammals</p>
-			<p type="2">
-				Applicant:
-				<applicant>U.S. Fish and Wildlife Service</applicant>
-				, Marine Mammals Management,
-				<applicant_location woeId="2354490" type="Town"
-					name="Anchorage, AK, US">Anchorage</applicant_location>
-				, AK; PRT-046081
-			</p>
-			<p type="1">
-				The applicant requests amendment and renewal of the permit to take
-				and harassment polar bears (
-				<species_scientific name="Ursus maritimus">Ursus maritimus
-				</species_scientific>
-				) in the wild in
-				<research_location woeId="2347560" type="State"
-					name="Alaska, US">Alaska</research_location>
-				and in waters around
-				<research_location woeId="2347560" type="State"
-					name="Alaska, US">Alaska</research_location>
-				for the purpose of scientific research. This notification covers
-				activities to be conducted by the applicant over a 5-year period.
-			</p>
-			<p type="2">
-				Applicant:
-				<applicant>Indianapolis Zoological Society</applicant>
-				,
-				<applicant_location woeId="2427032" type="Town"
-					name="Indianapolis, IN, US">Indianapolis</applicant_location>
-				, IN; PRT-19420A
-			</p>
-			<p type="1">
-				The applicant requests a permit to take a
-				<research_location woeId="55959717" type="Ocean"
-					name="Pacific Ocean">Pacific</research_location>
-				walrus, (
-				<species_scientific name="Odobenus rosmarus divergens">Odobenus rosmarus
-					divergens</species_scientific>
-				), one male, found beached and abandoned as a newborn near Barrow,
-				AK on July 4, 2003 for the purpose of public display. This
-				notification covers activities to be conducted by the applicant over
-				a 5-year period.
-			</p>
-			<p type="2">
-				Applicant:
-				<applicant>Thomas A. Postel</applicant>
-				,
-				<applicant_location woeId="2452128" type="Town"
-					name="Minneola, FL, US">Minneola, FL</applicant_location>
-				; PRT-19806A
-			</p>
-			<p type="1">
-				The applicant requests a permit to photography Florida manatees (
-				<species_scientific name="Trichechus manatus">Trichechus manatus
-				</species_scientific>
-				) underwater for commercial and educational purposes. This
-				notification covers activities to be conducted by the applicant over
-				a one-year period.
-			</p>
-			<p type="1">Concurrent with publishing this notice in the Federal
-				Register , we are forwarding copies of the above applications to the
-				Marine Mammal Commission and the Committee of Scientific Advisors
-				for their review.</p>
-		</supplInfo>
-	</supplementary_information>
-</results>
--- a/src/de/mpiwg/anteater/events/processors/PermitOrApplicantEventProcessor.java	Fri Sep 14 10:30:43 2012 +0200
+++ b/src/de/mpiwg/anteater/events/processors/PermitOrApplicantEventProcessor.java	Fri Oct 05 18:52:14 2012 -0700
@@ -5,6 +5,8 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import uk.ac.shef.wit.simmetrics.similaritymetrics.JaroWinkler;
+
 import de.mpiwg.anteater.events.Applicant;
 import de.mpiwg.anteater.events.ResearchEvent;
 import de.mpiwg.anteater.ml.PlaceClasses;
@@ -128,6 +130,7 @@
 
 				IfStatement: if (matcher.find()) {
 					String numberInPara = matcher.group(1);
+					// check if found number exists in events
 					for (ResearchEvent ev : newEvents) {
 						if (numberInPara.equals(ev.getApplicationOrPermitNo()))
 						{
@@ -135,6 +138,25 @@
 							break IfStatement;
 						}
 					}
+					// if there is no event with permit number
+					// check if just something was clipped
+					for (ResearchEvent ev : newEvents) {
+						if (ev.getApplicationOrPermitNo().contains(numberInPara) || numberInPara.contains(ev.getApplicationOrPermitNo())) {
+							eventForPara = ev;
+							break IfStatement;
+						}
+					}
+					// if there is still no event found
+					// check for switched numbers
+					for (ResearchEvent ev : newEvents) {
+						JaroWinkler winkler = new JaroWinkler();
+						double sim = winkler.getSimilarity(numberInPara, ev.getApplicationOrPermitNo());
+						if (sim > 0.85)
+						{
+							eventForPara = ev;
+							break IfStatement;
+						}
+					}
 					eventForPara = null;
 					continue;
 				}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/texts/01-14522.xml	Fri Oct 05 18:52:14 2012 -0700
@@ -0,0 +1,66 @@
+<NOTICE>
+      <PREAMB>
+        <AGENCY>DEPARTMENT OF COMMERCE</AGENCY>
+        <SUBAGY>National Oceanic and Atmospheric Administration</SUBAGY>
+        <DEPDOC>[I.D. 060101A]</DEPDOC>
+        <SUBJECT>Marine Mammals; Permits</SUBJECT>
+        <AGY>
+          <HD SOURCE="HED">AGENCY:</HD>
+          <P>National Marine Fisheries Service (NMFS), National Oceanic and Atmospheric Administration (NOAA), Commerce.</P>
+        </AGY>
+        <ACT>
+          <HD SOURCE="HED">ACTION:</HD>
+          <P>Receipt of application No. 751-1614-00; and receipt of applications to amend permits (782-1532-00, 981-1578-00).</P>
+        </ACT>
+        <SUM>
+          <HD SOURCE="HED">SUMMARY:</HD>
+          <P>Notice is hereby given of the following actions regarding permits for takes of marine mammal species for the purposes of scientific research:</P>
+          <P>NMFS has received a permit application from: Ocean Alliance/Whale Conservation Institute, 191 Weston Road, Lincoln, Massachusetts 01773 (Dr. Roger S. Payne, Principal Investigator) (Application No. 751-1614-00); NMFS has received applications for permit amendments from:; NMFS, National Marine Mammal Laboratory, 7600 Sand Point Way, N.E., BIN C15700, Seattle, WA 98115-0070 (Permit No. 782-532-00)); and Dr. Peter L. Tyack, Biology Department, Woods Hole Oceanographic Institution, Woods Hole, MA 02543 (Permit No. 981-1578).</P>
+        </SUM>
+        <DATES>
+          <HD SOURCE="HED">DATES:</HD>
+          <P>Written or telefaxed comments on the new application or amendment requests must be received on or before July 9, 2001.</P>
+        </DATES>
+        <ADD>
+          <HD SOURCE="HED">ADDRESSES:</HD>
+
+          <P>The application and related documents are available for review upon written request or by appointment.  See<E T="02">SUPPLEMENTARY INFORMATION</E>.</P>
+          <P>Written comments or requests for a public hearing on the application or amendment requests should be mailed to the Chief, Permits and Documentation Division, F/PR1, Office of Protected Resources, NMFS, 1315 East-West Highway, Room 13705, Silver Spring, MD 20910.  Those individuals requesting a hearing should set forth the specific reasons why a hearing on this particular request would be appropriate.</P>
+        </ADD>
+        <FURINF>
+          <HD SOURCE="HED">FOR FURTHER INFORMATION CONTACT:</HD>
+          <P>Ruth Johnson or Tammy Adams, (301)713-2289.</P>
+        </FURINF>
+      </PREAMB>
+      <SUPLINF>
+        <HD SOURCE="HED">SUPPLEMENTARY INFORMATION:</HD>
+
+        <P>The subject application and permit amendments are requested under the authority of the Marine Mammal Protection Act of 1972, as amended (MMPA; 16 U.S.C. 1361<E T="03">et seq</E>.), the Regulations Governing the Taking and Importing of Marine Mammals (50 CFR part 216), the Endangered Species Act of 1973, as amended (ESA; 16 U.S.C. 1531<E T="03">et seq</E>.), the regulations governing the taking, importing, and exporting of endangered and threatened species (50 CFR 222-226), and the Fur Seal Act of 1966, as amended (16 U.S.C. 1151<E T="03">et seq.</E>)</P>
+        <HD SOURCE="HD1">New Application Received</HD>
+
+        <P>For Application No. 715-1614-00, the applicant requests permission to conduct vessel and aerial surveys, collect tissue samples (sloughed skin and skin and blubber biopsies) from living, free-ranging animals and collect skin, blubber, blood, bone, baleen and other organ tissue samples from dead stranded animals from all age and sex<PRTPAGE P="30886"/>classes of 21 cetacean species in U.S., foreign, and international waters. Tissue samples would be used to quantify toxicant loads and immunochemical responses to these loads to test the hypothesis that there are demonstrable differences between different populations ans species with regard to the levels of toxic compounds present. Genetic analyses would also be performed on samples to investigate the genetic diversity and variability of the population groups sampled.  This information would be used to establish a baseline for comparisons with future samples and to assist in making future management and conservation policies.</P>
+        <HD SOURCE="HD1">Permit Amendment Requests Received</HD>
+
+        <P>For Permit No. 782-1532-00, the Permit authorizes the Holder to take Steller sea lions (<E T="03">Eumetopias jubatus</E>) for research that involves takes by aerial and ship based surveys biennally, capture and take morphological measurements, collect specimens (blood and biopsy), brand, tag, and disturb during scat collection.  The Holder now requests to amend the take authority to conduct aerial surveys each year, include Southeast Alaska in monthly surveys, increase the number of animals to be incidentally harassed during scat collection, allow additional procedures for animal handling such as: using gas anesthesia, branding pups &#x2265;4 mos and juveniles to 3 yrs, injecting Evan&#x2019;s blue dye and deuterated water, collecting muscle biopsy, using noninvasive bioelectric impedance analysis, increasing blood sample volume, extracting a tooth, and pulling vibrissae.  This Permit amendment will improve field techniques and incorporate collaborative efforts of scientists funded under the Steller Sea Lion Research Initiative.</P>
+        <P>For Permit No. 981-1578-00, the Permit authorizes the Holder to tag cetaceans with an advanced digital sound recording tag (DTAG) that can record the acoustic stimuli an animal hears, along with measuring vocal, behavioral, and physiological responses to sound played back at received levels of 120-160 dB re 1 micron Pa. The research was authorized in the Mediterranean and Ligurian Seas and off the coast of the Azores in the North Atlantic. The Holder requests an amendment to increase the source level but not the received level for a whale-finding sonar to 200 dB re 1 micron Pa at 1 m, add playbacks involving exposure to impulse signals from airguns as used in seismic surveys, include one additional baleen whale species and 12 species of Odontocete whale, and extend the study area to include North Atlantic and Gulf of Mexico.</P>
+
+        <P>In compliance with the National Environmental Policy Act of 1969 (42 U.S.C. 4321<E T="03">et seq</E>.), an initial determination has been made that the activities proposed are categorically excluded from the requirement to prepare an environmental assessment or environmental impact statement.</P>
+        <P>Written comments or requests for a public hearing on the application or amendment requests should be mailed to the Chief, Permits and Documentation Division, F/PR1, Office of Protected Resources, NMFS, 1315 East-West Highway, Room 13705, Silver Spring, MD 20910.  Those individuals requesting a hearing should set forth the specific reasons why a hearing on these particular requests would be appropriate.</P>
+        <P>Comments may also be submitted by facsimile at (301) 713-0376, provided the facsimile is confirmed by hard copy submitted by mail and postmarked no later than the closing date of the comment period.  Please note that comments will not be accepted by e-mail or by other electronic media.</P>
+        <P>Concurrent with the publication of this notice in the<E T="04">Federal Register</E>, NMFS is forwarding copies of thee application and amendment requests to the Marine Mammal Commission and its Committee of Scientific Advisors.</P>
+        <P>Documents may be reviewed in the following locations:</P>
+        <P>For all permits and permit amendments: Permits and Documentation Division, Office of Protected Resources, NMFS, 1315 East-West Highway, Room 13705, Silver Spring, MD 20910; phone (301) 713-2289; fax (301) 713-0376;</P>
+        <P>For permit 751-1614-00:  Northwest Region, NMFS, 7600 Sand Point Way NE, BIN C15700, Bldg. 1, Seattle, WA 98115-0700; phone (206) 526-6150; fax (206) 526-6426;</P>
+        <P>For permits 751-1614-00 and 782-1532-00:  Alaska Region, NMFS, P.O. Box 21668, Juneau, AK 99802-1668; phone (907) 586-7221; fax (907) 586-7249;</P>
+        <P>For permit 751-1614-00: Southwest Region, NMFS, 501 West Ocean Blvd., Suite 4200, Long Beach, CA 90802-4213; phone (562) 980-4001; fax (562) 980-4018;</P>
+        <P>For permits 751-1614-00 and 981-1578-00:  Northeast Region, NMFS, One Blackburn Drive, Gloucester, MA 01930-2298; phone (978) 281-9200; fax (978) 281-9371; and</P>
+        <P>For permits 751-1614-00 and 981-1578-00:  Southeast Region, NMFS, 9721 Executive Center Drive North, St. Petersburg, FL 33702-2432; phone (727) 570-5301; fax (727) 570-5320.</P>
+        <SIG>
+          <DATED>Dated: June 4, 2001.</DATED>
+          <NAME>Ann D. Terbush,</NAME>
+          <TITLE>Chief, Permits and Documentation Division, Office of Protected Resources, National Marine Fisheries Service.</TITLE>
+        </SIG>
+      </SUPLINF>
+      <FRDOC>[FR Doc. 01-14522 Filed 6-7-01; 8:45 am]</FRDOC>
+      <BILCOD>BILLING CODE  3510-22-S</BILCOD>
+    </NOTICE>
\ No newline at end of file