view src/de/mpiwg/anteater/species/common/impl/LinnaeusNameFinder.java @ 2:1c2b4f5e2c05

linnaeus for finding species
author jdamerow
date Mon, 22 Oct 2012 13:46:54 -0700
parents
children ae96e4bc7fb2
line wrap: on
line source

package de.mpiwg.anteater.species.common.impl;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;

import martin.common.ArgParser;
import uk.ac.man.documentparser.dataholders.Document;
import uk.ac.man.entitytagger.Mention;
import uk.ac.man.entitytagger.doc.TaggedDocument;
import uk.ac.man.entitytagger.matching.MatchOperations;
import uk.ac.man.entitytagger.matching.Matcher;
import uk.ac.man.entitytagger.matching.Matcher.Disambiguation;
import uk.ac.man.entitytagger.matching.Postprocessor;
import uk.ac.man.entitytagger.matching.matchers.MatchPostProcessor;
import uk.ac.man.entitytagger.matching.matchers.VariantDictionaryMatcher;
import de.mpiwg.anteater.logging.IAnteaterLogger;
import de.mpiwg.anteater.species.common.ICommonNameFinder;

/**
 * {@link ICommonNameFinder} implementation that tags species mentions in a
 * text using a dictionary matcher combined with a post-processing step.
 *
 * <p>
 * All dictionary and post-processing data is read from classpath resources
 * located under {@code resources-linnaeus/}, resolved relative to this
 * class's package.
 */
public class LinnaeusNameFinder implements ICommonNameFinder {

	/** Simple name of this class, exposed as the component's identifier. */
	public static final String COMPONENT_NAME = LinnaeusNameFinder.class
			.getSimpleName();

	/** Species dictionary handed to the variant dictionary matcher. */
	private static final String DICTIONARY_RESOURCE = "resources-linnaeus/species-light.tsv";

	/** Stop list handed to the post-processor. */
	private static final String STOPLIST_RESOURCE = "resources-linnaeus/stoplist.tsv";

	/** Acronym/synonym table handed to the post-processor. */
	private static final String ACRONYMS_RESOURCE = "resources-linnaeus/synonyms-acronyms.tsv";

	/** Species frequency table handed to the post-processor. */
	private static final String FREQUENCY_RESOURCE = "resources-linnaeus/species-frequency.tsv";

	/** Logger supplied by the caller; stored but not yet used by this class. */
	private final IAnteaterLogger logger;

	/**
	 * Creates a name finder.
	 *
	 * @param logger
	 *            logger to associate with this component
	 */
	public LinnaeusNameFinder(IAnteaterLogger logger) {
		this.logger = logger;
	}

	/**
	 * Runs the species matcher over {@code text} and prints one line per
	 * match (most probable id, matched text and start offset) to standard
	 * output.
	 *
	 * <p>
	 * The dictionary and post-processing resources are reloaded on every
	 * call.
	 *
	 * @param text
	 *            the text to search for species mentions
	 * @return currently always {@code null}; matches are only printed.
	 *         TODO(review): return the matches in the format expected by
	 *         {@link ICommonNameFinder} once that contract is settled.
	 * @throws IllegalStateException
	 *             if the species dictionary resource is not on the classpath
	 */
	@Override
	public String findCommonNames(String text) {
		// The dictionary is loaded before the post-processor, as before.
		Matcher matcher = new MatchPostProcessor(loadDictionaryMatcher(),
				Disambiguation.ON_WHOLE, true, null,
				getPostprocessor(new HashMap<String, String>(), ""));

		// Throw-away match against a dummy document, kept from the original
		// code. NOTE(review): presumably forces any lazy initialisation of
		// the matcher up front -- confirm before removing.
		matcher.match("test", new Document("none", null, null, null, null,
				null, null, null, null, null, null, null, null, null, null));

		Document doc = new Document("id", "title", "", text, text,
				Document.Text_raw_type.TEXT, "", null, Document.Type.OTHER,
				null, "", "", "", "", null);
		TaggedDocument tagged = MatchOperations.matchDocument(matcher, doc);
		List<Mention> species = tagged.getAllMatches();
		for (Mention s : species) {
			System.out.println("found " + s.getMostProbableID() + ": "
					+ s.getText() + " at " + s.getStart());
		}
		return null;
	}

	/**
	 * Builds the post-processor from the stop list, acronym and species
	 * frequency resources.
	 *
	 * <p>
	 * The resource streams are always closed before this method returns,
	 * including when constructing the post-processor fails. A failure while
	 * closing a stream is reported on standard error and otherwise ignored,
	 * because the post-processor has already been built at that point.
	 *
	 * @param comments
	 *            map passed through to the {@link Postprocessor} constructor
	 * @param tag
	 *            currently unused; kept for interface compatibility
	 * @return the newly created post-processor
	 */
	public Postprocessor getPostprocessor(Map<String, String> comments,
			String tag) {
		InputStream stop = getClass().getResourceAsStream(STOPLIST_RESOURCE);
		InputStream acr = getClass().getResourceAsStream(ACRONYMS_RESOURCE);
		InputStream spf = getClass().getResourceAsStream(FREQUENCY_RESOURCE);

		try {
			return new Postprocessor(new InputStream[] { stop },
					new InputStream[] { acr }, new InputStream[] { spf },
					comments, null);
		} finally {
			// Close each stream independently so that one failing close()
			// cannot leak the remaining streams.
			closeAndReport(stop);
			closeAndReport(acr);
			closeAndReport(spf);
		}
	}

	/**
	 * Loads the species dictionary into a matcher and releases the
	 * underlying resource stream.
	 */
	private Matcher loadDictionaryMatcher() {
		InputStream dictionary = getClass().getResourceAsStream(
				DICTIONARY_RESOURCE);
		if (dictionary == null) {
			throw new IllegalStateException("Missing classpath resource: "
					+ DICTIONARY_RESOURCE);
		}
		try {
			// NOTE(review): assumes load() has fully consumed the stream by
			// the time it returns, the same assumption getPostprocessor
			// makes for its inputs -- confirm against the library.
			return VariantDictionaryMatcher.load(dictionary, true);
		} finally {
			closeAndReport(dictionary);
		}
	}

	/**
	 * Closes {@code stream} if it is non-null, reporting (not propagating)
	 * any I/O failure on standard error.
	 */
	private static void closeAndReport(InputStream stream) {
		if (stream == null) {
			return;
		}
		try {
			stream.close();
		} catch (IOException e) {
			// A failed close of an already-read input must not abort the
			// caller, let alone terminate the JVM.
			System.err.println(COMPONENT_NAME
					+ ": failed to close resource stream: " + e);
		}
	}
}