annotate src/de/mpiwg/anteater/species/common/impl/LinnaeusNameFinder.java @ 3:ae96e4bc7fb2

save found species to analysis files
author jdamerow
date Mon, 22 Oct 2012 14:21:14 -0700
parents 1c2b4f5e2c05
children dcc35f89dce3
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
1 package de.mpiwg.anteater.species.common.impl;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
2
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
3 import java.io.InputStream;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
4 import java.util.HashMap;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
5 import java.util.List;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
6 import java.util.Map;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
7
3
ae96e4bc7fb2 save found species to analysis files
jdamerow
parents: 2
diff changeset
8 import org.apache.commons.lang3.StringEscapeUtils;
ae96e4bc7fb2 save found species to analysis files
jdamerow
parents: 2
diff changeset
9
2
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
10 import uk.ac.man.documentparser.dataholders.Document;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
11 import uk.ac.man.entitytagger.Mention;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
12 import uk.ac.man.entitytagger.doc.TaggedDocument;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
13 import uk.ac.man.entitytagger.matching.MatchOperations;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
14 import uk.ac.man.entitytagger.matching.Matcher;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
15 import uk.ac.man.entitytagger.matching.Matcher.Disambiguation;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
16 import uk.ac.man.entitytagger.matching.Postprocessor;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
17 import uk.ac.man.entitytagger.matching.matchers.MatchPostProcessor;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
18 import uk.ac.man.entitytagger.matching.matchers.VariantDictionaryMatcher;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
19 import de.mpiwg.anteater.logging.IAnteaterLogger;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
20 import de.mpiwg.anteater.species.common.ICommonNameFinder;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
21
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
22 public class LinnaeusNameFinder implements ICommonNameFinder {
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
23
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
24 public final static String COMPONENT_NAME = LinnaeusNameFinder.class
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
25 .getSimpleName();
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
26
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
27 private IAnteaterLogger logger;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
28
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
29 public LinnaeusNameFinder(IAnteaterLogger logger) {
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
30 this.logger = logger;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
31 }
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
32
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
33 @Override
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
34 public String findCommonNames(String text) {
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
35 Matcher matcher = VariantDictionaryMatcher.load(getClass()
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
36 .getResourceAsStream("resources-linnaeus/species-light.tsv"),
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
37 true);
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
38
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
39 Disambiguation disambiguation = Disambiguation.ON_WHOLE;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
40
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
41 matcher = new MatchPostProcessor(matcher, disambiguation, true, null,
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
42 getPostprocessor(new HashMap<String, String>(), ""));
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
43
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
44 matcher.match("test", new Document("none", null, null, null, null,
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
45 null, null, null, null, null, null, null, null, null, null));
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
46
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
47 Document doc = new Document("id", "title", "", text, text, Document.Text_raw_type.TEXT, "", null, Document.Type.OTHER, null, "", "", "", "", null);
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
48 TaggedDocument tagged = MatchOperations.matchDocument(matcher, doc);
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
49 List<Mention> species = tagged.getAllMatches();
3
ae96e4bc7fb2 save found species to analysis files
jdamerow
parents: 2
diff changeset
50 StringBuffer sb = new StringBuffer();
ae96e4bc7fb2 save found species to analysis files
jdamerow
parents: 2
diff changeset
51 sb.append("<linnaeus>");
ae96e4bc7fb2 save found species to analysis files
jdamerow
parents: 2
diff changeset
52
ae96e4bc7fb2 save found species to analysis files
jdamerow
parents: 2
diff changeset
53 for (Mention s : species) {
ae96e4bc7fb2 save found species to analysis files
jdamerow
parents: 2
diff changeset
54 sb.append("<species id=\"" + StringEscapeUtils.escapeXml(s.getMostProbableID()) + "\" ");
ae96e4bc7fb2 save found species to analysis files
jdamerow
parents: 2
diff changeset
55 sb.append("start=\"" + s.getStart() + "\" ");
ae96e4bc7fb2 save found species to analysis files
jdamerow
parents: 2
diff changeset
56 sb.append("end=\"" + s.getEnd() + "\" ");
ae96e4bc7fb2 save found species to analysis files
jdamerow
parents: 2
diff changeset
57 sb.append("text=\"" + StringEscapeUtils.escapeXml(s.getText()) + "\" ");
ae96e4bc7fb2 save found species to analysis files
jdamerow
parents: 2
diff changeset
58 sb.append("/>");
ae96e4bc7fb2 save found species to analysis files
jdamerow
parents: 2
diff changeset
59 }
ae96e4bc7fb2 save found species to analysis files
jdamerow
parents: 2
diff changeset
60
ae96e4bc7fb2 save found species to analysis files
jdamerow
parents: 2
diff changeset
61 sb.append("</linnaeus>");
ae96e4bc7fb2 save found species to analysis files
jdamerow
parents: 2
diff changeset
62 System.out.println(sb.toString());
ae96e4bc7fb2 save found species to analysis files
jdamerow
parents: 2
diff changeset
63 return sb.toString();
2
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
64 }
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
65
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
66 public Postprocessor getPostprocessor(Map<String, String> comments,
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
67 String tag) {
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
68 InputStream stop = getClass().getResourceAsStream(
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
69 "resources-linnaeus/stoplist.tsv");
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
70 InputStream acr = getClass().getResourceAsStream(
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
71 "resources-linnaeus/synonyms-acronyms.tsv");
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
72 InputStream spf = getClass().getResourceAsStream(
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
73 "resources-linnaeus/species-frequency.tsv");
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
74
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
75 Postprocessor res = new Postprocessor(new InputStream[] { stop },
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
76 new InputStream[] { acr }, new InputStream[] { spf }, comments,
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
77 null);
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
78
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
79 try {
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
80 if (stop != null)
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
81 stop.close();
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
82 if (acr != null)
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
83 acr.close();
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
84 if (spf != null)
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
85 spf.close();
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
86 } catch (Exception e) {
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
87 System.err.println(e);
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
88 e.printStackTrace();
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
89 System.exit(0);
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
90 }
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
91 return res;
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
92 }
1c2b4f5e2c05 linnaeus for finding species
jdamerow
parents:
diff changeset
93 }