view src/de/mpiwg/anteater/species/common/impl/LinnaeusNameFinder.java @ 3:ae96e4bc7fb2

save found species to analysis files
author jdamerow
date Mon, 22 Oct 2012 14:21:14 -0700
parents 1c2b4f5e2c05
children dcc35f89dce3
line wrap: on
line source

package de.mpiwg.anteater.species.common.impl;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringEscapeUtils;

import uk.ac.man.documentparser.dataholders.Document;
import uk.ac.man.entitytagger.Mention;
import uk.ac.man.entitytagger.doc.TaggedDocument;
import uk.ac.man.entitytagger.matching.MatchOperations;
import uk.ac.man.entitytagger.matching.Matcher;
import uk.ac.man.entitytagger.matching.Matcher.Disambiguation;
import uk.ac.man.entitytagger.matching.Postprocessor;
import uk.ac.man.entitytagger.matching.matchers.MatchPostProcessor;
import uk.ac.man.entitytagger.matching.matchers.VariantDictionaryMatcher;
import de.mpiwg.anteater.logging.IAnteaterLogger;
import de.mpiwg.anteater.species.common.ICommonNameFinder;

/**
 * {@link ICommonNameFinder} implementation that tags species mentions in a
 * text with the dictionary matcher classes from
 * {@code uk.ac.man.entitytagger}, using the dictionary and post-processing
 * resources stored under {@code resources-linnaeus/} next to this class.
 */
public class LinnaeusNameFinder implements ICommonNameFinder {

	public final static String COMPONENT_NAME = LinnaeusNameFinder.class
			.getSimpleName();

	// Stored at construction time; not used by any method in this class yet.
	private final IAnteaterLogger logger;

	/**
	 * @param logger
	 *            logger handed in by the caller; only stored
	 */
	public LinnaeusNameFinder(IAnteaterLogger logger) {
		this.logger = logger;
	}

	/**
	 * Matches {@code text} against the species dictionary and returns the
	 * found mentions as an XML string.
	 * <p>
	 * The result has a {@code linnaeus} root element containing one empty
	 * {@code species} element per mention, with the attributes {@code id},
	 * {@code start}, {@code end} and {@code text}. The same string is also
	 * printed to standard output.
	 * <p>
	 * NOTE(review): the dictionary and post-processor are loaded again on
	 * every call. Caching the matcher would likely be cheaper, but whether
	 * the matcher may be reused or shared is not visible from here.
	 *
	 * @param text
	 *            the text to search for species names
	 * @return the XML description of all mentions found in {@code text}
	 */
	@Override
	public String findCommonNames(String text) {
		InputStream dictionary = getClass().getResourceAsStream(
				"resources-linnaeus/species-light.tsv");
		try {
			Matcher matcher = VariantDictionaryMatcher.load(dictionary, true);
			matcher = new MatchPostProcessor(matcher,
					Disambiguation.ON_WHOLE, true, null,
					getPostprocessor(new HashMap<String, String>(), ""));

			// Result is discarded; presumably a warm-up/initialisation call
			// carried over from the library's own usage -- TODO confirm
			// whether it is still required.
			matcher.match("test", new Document("none", null, null, null,
					null, null, null, null, null, null, null, null, null,
					null, null));

			Document doc = new Document("id", "title", "", text, text,
					Document.Text_raw_type.TEXT, "", null,
					Document.Type.OTHER, null, "", "", "", "", null);
			TaggedDocument tagged = MatchOperations.matchDocument(matcher,
					doc);

			String xml = toXml(tagged.getAllMatches());
			// NOTE(review): debug output kept for backward compatibility;
			// consider routing it through the injected logger instead.
			System.out.println(xml);
			return xml;
		} finally {
			// The dictionary stream was previously never closed. It is
			// closed only after matching has finished, so this is safe even
			// if the matcher reads from the stream lazily.
			closeQuietly(dictionary);
		}
	}

	/**
	 * Builds the post-processor from the stop list, acronym and species
	 * frequency resources under {@code resources-linnaeus/}.
	 * <p>
	 * The resource streams are always closed before this method returns,
	 * also when constructing the post-processor fails. A failure while
	 * closing a stream is reported on standard error and otherwise ignored;
	 * it no longer terminates the JVM.
	 *
	 * @param comments
	 *            passed through to the {@link Postprocessor} constructor
	 * @param tag
	 *            currently unused
	 * @return the newly created post-processor
	 */
	public Postprocessor getPostprocessor(Map<String, String> comments,
			String tag) {
		InputStream stop = getClass().getResourceAsStream(
				"resources-linnaeus/stoplist.tsv");
		InputStream acr = getClass().getResourceAsStream(
				"resources-linnaeus/synonyms-acronyms.tsv");
		InputStream spf = getClass().getResourceAsStream(
				"resources-linnaeus/species-frequency.tsv");

		try {
			return new Postprocessor(new InputStream[] { stop },
					new InputStream[] { acr }, new InputStream[] { spf },
					comments, null);
		} finally {
			closeQuietly(stop);
			closeQuietly(acr);
			closeQuietly(spf);
		}
	}

	/**
	 * Serialises the mentions into the {@code linnaeus} XML string. The
	 * layout (including the blank before the closing of each element) is
	 * kept exactly as consumers have received it so far.
	 */
	private static String toXml(List<Mention> species) {
		StringBuilder sb = new StringBuilder();
		sb.append("<linnaeus>");

		for (Mention s : species) {
			sb.append("<species id=\"")
					.append(StringEscapeUtils.escapeXml(s.getMostProbableID()))
					.append("\" ");
			sb.append("start=\"").append(s.getStart()).append("\" ");
			sb.append("end=\"").append(s.getEnd()).append("\" ");
			sb.append("text=\"")
					.append(StringEscapeUtils.escapeXml(s.getText()))
					.append("\" ");
			sb.append("/>");
		}

		sb.append("</linnaeus>");
		return sb.toString();
	}

	/**
	 * Closes {@code in} if it is not {@code null}; an {@link IOException}
	 * raised by the close is reported on standard error and swallowed,
	 * because by then the stream content has already been consumed.
	 */
	private static void closeQuietly(InputStream in) {
		if (in == null) {
			return;
		}
		try {
			in.close();
		} catch (IOException e) {
			System.err.println(COMPONENT_NAME
					+ ": could not close resource stream: " + e);
		}
	}
}