Mercurial > hg > anteater
view src/de/mpiwg/anteater/species/common/impl/LinnaeusNameFinder.java @ 4:dcc35f89dce3
include linneaus findings
author | jdamerow |
---|---|
date | Thu, 25 Oct 2012 15:25:08 -0700 |
parents | ae96e4bc7fb2 |
children |
line wrap: on
line source
package de.mpiwg.anteater.species.common.impl; import java.io.InputStream; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.lang3.StringEscapeUtils; import uk.ac.man.documentparser.dataholders.Document; import uk.ac.man.entitytagger.Mention; import uk.ac.man.entitytagger.doc.TaggedDocument; import uk.ac.man.entitytagger.matching.MatchOperations; import uk.ac.man.entitytagger.matching.Matcher; import uk.ac.man.entitytagger.matching.Matcher.Disambiguation; import uk.ac.man.entitytagger.matching.Postprocessor; import uk.ac.man.entitytagger.matching.matchers.MatchPostProcessor; import uk.ac.man.entitytagger.matching.matchers.VariantDictionaryMatcher; import de.mpiwg.anteater.logging.IAnteaterLogger; import de.mpiwg.anteater.species.common.ICommonNameFinder; public class LinnaeusNameFinder implements ICommonNameFinder { public final static String COMPONENT_NAME = LinnaeusNameFinder.class .getSimpleName(); private IAnteaterLogger logger; public LinnaeusNameFinder(IAnteaterLogger logger) { this.logger = logger; } @Override public String findCommonNames(String text) { Matcher matcher = VariantDictionaryMatcher.load(getClass() .getResourceAsStream("resources-linnaeus/species-light.tsv"), true); Disambiguation disambiguation = Disambiguation.ON_WHOLE; matcher = new MatchPostProcessor(matcher, disambiguation, true, null, getPostprocessor(new HashMap<String, String>(), "")); matcher.match("test", new Document("none", null, null, null, null, null, null, null, null, null, null, null, null, null, null)); StringBuffer sb = new StringBuffer(); sb.append("<linnaeus>"); Document doc = new Document("", "", "", "", text, Document.Text_raw_type.TEXT, "", null, Document.Type.OTHER, null, "", "", "", "", null); TaggedDocument tagged = MatchOperations.matchDocument(matcher, doc); List<Mention> species = tagged.getAllMatches(); for (Mention s : species) { int startAt = s.getStart() - s.getText().length() > -1 ? s .getStart() - s.getText().length() : 0; String stub = text.substring(startAt); int foundAt = stub.indexOf(s.getText()); sb.append("<species id=\"" + StringEscapeUtils.escapeXml(s.getMostProbableID()) + "\" "); sb.append("start=\"" + (startAt + foundAt) + "\" "); sb.append("end=\"" + (startAt + foundAt + s.getText().length()) + "\" "); sb.append("text=\"" + StringEscapeUtils.escapeXml(s.getText()) + "\" "); sb.append("/>"); } sb.append("</linnaeus>"); System.out.println(sb.toString()); return sb.toString(); } public Postprocessor getPostprocessor(Map<String, String> comments, String tag) { InputStream stop = getClass().getResourceAsStream( "resources-linnaeus/stoplist.tsv"); InputStream acr = getClass().getResourceAsStream( "resources-linnaeus/synonyms-acronyms.tsv"); InputStream spf = getClass().getResourceAsStream( "resources-linnaeus/species-frequency.tsv"); Postprocessor res = new Postprocessor(new InputStream[] { stop }, new InputStream[] { acr }, new InputStream[] { spf }, comments, null); try { if (stop != null) stop.close(); if (acr != null) acr.close(); if (spf != null) spf.close(); } catch (Exception e) { System.err.println(e); e.printStackTrace(); System.exit(0); } return res; } }