Mercurial > hg > anteater
comparison src/de/mpiwg/anteater/species/common/impl/LinnaeusNameFinder.java @ 2:1c2b4f5e2c05
linnaeus for finding species
author | jdamerow |
---|---|
date | Mon, 22 Oct 2012 13:46:54 -0700 |
parents | |
children | ae96e4bc7fb2 |
comparison
equal
deleted
inserted
replaced
1:7a4341c9f2e5 | 2:1c2b4f5e2c05 |
---|---|
1 package de.mpiwg.anteater.species.common.impl; | |
2 | |
3 import java.io.InputStream; | |
4 import java.util.HashMap; | |
5 import java.util.HashSet; | |
6 import java.util.List; | |
7 import java.util.Map; | |
8 import java.util.logging.Logger; | |
9 | |
10 import martin.common.ArgParser; | |
11 import uk.ac.man.documentparser.dataholders.Document; | |
12 import uk.ac.man.entitytagger.Mention; | |
13 import uk.ac.man.entitytagger.doc.TaggedDocument; | |
14 import uk.ac.man.entitytagger.matching.MatchOperations; | |
15 import uk.ac.man.entitytagger.matching.Matcher; | |
16 import uk.ac.man.entitytagger.matching.Matcher.Disambiguation; | |
17 import uk.ac.man.entitytagger.matching.Postprocessor; | |
18 import uk.ac.man.entitytagger.matching.matchers.MatchPostProcessor; | |
19 import uk.ac.man.entitytagger.matching.matchers.VariantDictionaryMatcher; | |
20 import de.mpiwg.anteater.logging.IAnteaterLogger; | |
21 import de.mpiwg.anteater.species.common.ICommonNameFinder; | |
22 | |
23 public class LinnaeusNameFinder implements ICommonNameFinder { | |
24 | |
25 public final static String COMPONENT_NAME = LinnaeusNameFinder.class | |
26 .getSimpleName(); | |
27 | |
28 private IAnteaterLogger logger; | |
29 | |
30 public LinnaeusNameFinder(IAnteaterLogger logger) { | |
31 this.logger = logger; | |
32 } | |
33 | |
34 @Override | |
35 public String findCommonNames(String text) { | |
36 Matcher matcher = VariantDictionaryMatcher.load(getClass() | |
37 .getResourceAsStream("resources-linnaeus/species-light.tsv"), | |
38 true); | |
39 | |
40 Disambiguation disambiguation = Disambiguation.ON_WHOLE; | |
41 | |
42 matcher = new MatchPostProcessor(matcher, disambiguation, true, null, | |
43 getPostprocessor(new HashMap<String, String>(), "")); | |
44 | |
45 matcher.match("test", new Document("none", null, null, null, null, | |
46 null, null, null, null, null, null, null, null, null, null)); | |
47 | |
48 Document doc = new Document("id", "title", "", text, text, Document.Text_raw_type.TEXT, "", null, Document.Type.OTHER, null, "", "", "", "", null); | |
49 TaggedDocument tagged = MatchOperations.matchDocument(matcher, doc); | |
50 List<Mention> species = tagged.getAllMatches(); | |
51 for (Mention s : species) | |
52 System.out.println("found " + s.getMostProbableID() + ": " + s.getText() + " at " + s.getStart()); | |
53 return null; | |
54 } | |
55 | |
56 public Postprocessor getPostprocessor(Map<String, String> comments, | |
57 String tag) { | |
58 InputStream stop = getClass().getResourceAsStream( | |
59 "resources-linnaeus/stoplist.tsv"); | |
60 InputStream acr = getClass().getResourceAsStream( | |
61 "resources-linnaeus/synonyms-acronyms.tsv"); | |
62 InputStream spf = getClass().getResourceAsStream( | |
63 "resources-linnaeus/species-frequency.tsv"); | |
64 | |
65 Postprocessor res = new Postprocessor(new InputStream[] { stop }, | |
66 new InputStream[] { acr }, new InputStream[] { spf }, comments, | |
67 null); | |
68 | |
69 try { | |
70 if (stop != null) | |
71 stop.close(); | |
72 if (acr != null) | |
73 acr.close(); | |
74 if (spf != null) | |
75 spf.close(); | |
76 } catch (Exception e) { | |
77 System.err.println(e); | |
78 e.printStackTrace(); | |
79 System.exit(0); | |
80 } | |
81 return res; | |
82 } | |
83 } |