comparison src/de/mpiwg/anteater/species/common/impl/LinnaeusNameFinder.java @ 2:1c2b4f5e2c05

linnaeus for finding species
author jdamerow
date Mon, 22 Oct 2012 13:46:54 -0700
parents
children ae96e4bc7fb2
comparison
equal deleted inserted replaced
1:7a4341c9f2e5 2:1c2b4f5e2c05
1 package de.mpiwg.anteater.species.common.impl;
2
3 import java.io.InputStream;
4 import java.util.HashMap;
5 import java.util.HashSet;
6 import java.util.List;
7 import java.util.Map;
8 import java.util.logging.Logger;
9
10 import martin.common.ArgParser;
11 import uk.ac.man.documentparser.dataholders.Document;
12 import uk.ac.man.entitytagger.Mention;
13 import uk.ac.man.entitytagger.doc.TaggedDocument;
14 import uk.ac.man.entitytagger.matching.MatchOperations;
15 import uk.ac.man.entitytagger.matching.Matcher;
16 import uk.ac.man.entitytagger.matching.Matcher.Disambiguation;
17 import uk.ac.man.entitytagger.matching.Postprocessor;
18 import uk.ac.man.entitytagger.matching.matchers.MatchPostProcessor;
19 import uk.ac.man.entitytagger.matching.matchers.VariantDictionaryMatcher;
20 import de.mpiwg.anteater.logging.IAnteaterLogger;
21 import de.mpiwg.anteater.species.common.ICommonNameFinder;
22
23 public class LinnaeusNameFinder implements ICommonNameFinder {
24
25 public final static String COMPONENT_NAME = LinnaeusNameFinder.class
26 .getSimpleName();
27
28 private IAnteaterLogger logger;
29
30 public LinnaeusNameFinder(IAnteaterLogger logger) {
31 this.logger = logger;
32 }
33
34 @Override
35 public String findCommonNames(String text) {
36 Matcher matcher = VariantDictionaryMatcher.load(getClass()
37 .getResourceAsStream("resources-linnaeus/species-light.tsv"),
38 true);
39
40 Disambiguation disambiguation = Disambiguation.ON_WHOLE;
41
42 matcher = new MatchPostProcessor(matcher, disambiguation, true, null,
43 getPostprocessor(new HashMap<String, String>(), ""));
44
45 matcher.match("test", new Document("none", null, null, null, null,
46 null, null, null, null, null, null, null, null, null, null));
47
48 Document doc = new Document("id", "title", "", text, text, Document.Text_raw_type.TEXT, "", null, Document.Type.OTHER, null, "", "", "", "", null);
49 TaggedDocument tagged = MatchOperations.matchDocument(matcher, doc);
50 List<Mention> species = tagged.getAllMatches();
51 for (Mention s : species)
52 System.out.println("found " + s.getMostProbableID() + ": " + s.getText() + " at " + s.getStart());
53 return null;
54 }
55
56 public Postprocessor getPostprocessor(Map<String, String> comments,
57 String tag) {
58 InputStream stop = getClass().getResourceAsStream(
59 "resources-linnaeus/stoplist.tsv");
60 InputStream acr = getClass().getResourceAsStream(
61 "resources-linnaeus/synonyms-acronyms.tsv");
62 InputStream spf = getClass().getResourceAsStream(
63 "resources-linnaeus/species-frequency.tsv");
64
65 Postprocessor res = new Postprocessor(new InputStream[] { stop },
66 new InputStream[] { acr }, new InputStream[] { spf }, comments,
67 null);
68
69 try {
70 if (stop != null)
71 stop.close();
72 if (acr != null)
73 acr.close();
74 if (spf != null)
75 spf.close();
76 } catch (Exception e) {
77 System.err.println(e);
78 e.printStackTrace();
79 System.exit(0);
80 }
81 return res;
82 }
83 }