2
|
1 package de.mpiwg.anteater.species.common.impl;
|
|
2
|
|
3 import java.io.InputStream;
|
|
4 import java.util.HashMap;
|
|
5 import java.util.List;
|
|
6 import java.util.Map;
|
|
7
|
3
|
8 import org.apache.commons.lang3.StringEscapeUtils;
|
|
9
|
2
|
10 import uk.ac.man.documentparser.dataholders.Document;
|
|
11 import uk.ac.man.entitytagger.Mention;
|
|
12 import uk.ac.man.entitytagger.doc.TaggedDocument;
|
|
13 import uk.ac.man.entitytagger.matching.MatchOperations;
|
|
14 import uk.ac.man.entitytagger.matching.Matcher;
|
|
15 import uk.ac.man.entitytagger.matching.Matcher.Disambiguation;
|
|
16 import uk.ac.man.entitytagger.matching.Postprocessor;
|
|
17 import uk.ac.man.entitytagger.matching.matchers.MatchPostProcessor;
|
|
18 import uk.ac.man.entitytagger.matching.matchers.VariantDictionaryMatcher;
|
|
19 import de.mpiwg.anteater.logging.IAnteaterLogger;
|
|
20 import de.mpiwg.anteater.species.common.ICommonNameFinder;
|
|
21
|
|
22 public class LinnaeusNameFinder implements ICommonNameFinder {
|
|
23
|
|
24 public final static String COMPONENT_NAME = LinnaeusNameFinder.class
|
|
25 .getSimpleName();
|
|
26
|
|
27 private IAnteaterLogger logger;
|
|
28
|
|
29 public LinnaeusNameFinder(IAnteaterLogger logger) {
|
|
30 this.logger = logger;
|
|
31 }
|
|
32
|
|
33 @Override
|
|
34 public String findCommonNames(String text) {
|
|
35 Matcher matcher = VariantDictionaryMatcher.load(getClass()
|
|
36 .getResourceAsStream("resources-linnaeus/species-light.tsv"),
|
|
37 true);
|
|
38
|
|
39 Disambiguation disambiguation = Disambiguation.ON_WHOLE;
|
|
40
|
|
41 matcher = new MatchPostProcessor(matcher, disambiguation, true, null,
|
|
42 getPostprocessor(new HashMap<String, String>(), ""));
|
|
43
|
|
44 matcher.match("test", new Document("none", null, null, null, null,
|
|
45 null, null, null, null, null, null, null, null, null, null));
|
|
46
|
|
47 Document doc = new Document("id", "title", "", text, text, Document.Text_raw_type.TEXT, "", null, Document.Type.OTHER, null, "", "", "", "", null);
|
|
48 TaggedDocument tagged = MatchOperations.matchDocument(matcher, doc);
|
|
49 List<Mention> species = tagged.getAllMatches();
|
3
|
50 StringBuffer sb = new StringBuffer();
|
|
51 sb.append("<linnaeus>");
|
|
52
|
|
53 for (Mention s : species) {
|
|
54 sb.append("<species id=\"" + StringEscapeUtils.escapeXml(s.getMostProbableID()) + "\" ");
|
|
55 sb.append("start=\"" + s.getStart() + "\" ");
|
|
56 sb.append("end=\"" + s.getEnd() + "\" ");
|
|
57 sb.append("text=\"" + StringEscapeUtils.escapeXml(s.getText()) + "\" ");
|
|
58 sb.append("/>");
|
|
59 }
|
|
60
|
|
61 sb.append("</linnaeus>");
|
|
62 System.out.println(sb.toString());
|
|
63 return sb.toString();
|
2
|
64 }
|
|
65
|
|
66 public Postprocessor getPostprocessor(Map<String, String> comments,
|
|
67 String tag) {
|
|
68 InputStream stop = getClass().getResourceAsStream(
|
|
69 "resources-linnaeus/stoplist.tsv");
|
|
70 InputStream acr = getClass().getResourceAsStream(
|
|
71 "resources-linnaeus/synonyms-acronyms.tsv");
|
|
72 InputStream spf = getClass().getResourceAsStream(
|
|
73 "resources-linnaeus/species-frequency.tsv");
|
|
74
|
|
75 Postprocessor res = new Postprocessor(new InputStream[] { stop },
|
|
76 new InputStream[] { acr }, new InputStream[] { spf }, comments,
|
|
77 null);
|
|
78
|
|
79 try {
|
|
80 if (stop != null)
|
|
81 stop.close();
|
|
82 if (acr != null)
|
|
83 acr.close();
|
|
84 if (spf != null)
|
|
85 spf.close();
|
|
86 } catch (Exception e) {
|
|
87 System.err.println(e);
|
|
88 e.printStackTrace();
|
|
89 System.exit(0);
|
|
90 }
|
|
91 return res;
|
|
92 }
|
|
93 }
|