view src/de/mpiwg/dwinter/fulltext/search/FulltextSearch.java @ 1:5c9c31510f0c

CLOSED - # 16: Zeige nur eine konfigurierbare Anzahl von Treffern an. https://it-dev.mpiwg-berlin.mpg.de/tracs/pythonOcropusTools/ticket/16
author dwinter
date Wed, 03 Nov 2010 12:26:29 +0100
parents
children 2b29b0b6db16
line wrap: on
line source

package de.mpiwg.dwinter.fulltext.search;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;

import javax.swing.text.Document;
import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import javax.xml.xpath.XPathVariableResolver;

import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher;
import de.mpiwg.dwinter.fulltext.searcher.LanguageSearchers;



/**
 * Entry point for full-text searches over a set of per-language searchers.
 *
 * <p>An instance created with {@link #FulltextSearch(File)} registers one
 * {@link LanguageSearcher} for each of the keys {@code de}, {@code en},
 * {@code fr}, {@code all}, {@code morph} and {@code la}, all of them created
 * with the same directory. The search methods simply delegate to the
 * {@link LanguageSearchers} container.
 */
public class FulltextSearch {

	/** Language codes that {@link #init_languages()} registers as supported. */
	private static final String[] DEFAULT_LANGUAGES = { "de", "en", "fr", "la" };

	/** Directory handed to every {@link LanguageSearcher} created by this instance. */
	protected File index_dir;

	/** Container of the per-language searchers; filled by {@link #init_language_searchers(File)}. */
	public LanguageSearchers languageSearchers = new LanguageSearchers();

	/**
	 * Codes of the supported languages. The list is shared by all instances
	 * (it is static), so it must only ever be extended idempotently.
	 */
	protected static ArrayList<String> supportedLanguages = new ArrayList<String>();

	/**
	 * Creates an instance without any registered searcher and without an
	 * index directory. Nothing is initialised here.
	 */
	public FulltextSearch(){}

	/**
	 * Creates an instance and registers the language searchers for the given directory.
	 *
	 * @param index_dir directory passed on to every {@link LanguageSearcher}
	 * @throws CorruptIndexException if thrown while the searchers are created
	 * @throws LockObtainFailedException if thrown while the searchers are created
	 * @throws IOException if thrown while the searchers are created
	 */
	public FulltextSearch(File index_dir) throws CorruptIndexException, LockObtainFailedException, IOException {
		this.index_dir=index_dir;

		// NOTE(review): both calls are overridable and run from the constructor;
		// subclasses overriding them must not rely on their own fields being set.
		init_language_searchers(index_dir);
		init_languages();
	}

	/**
	 * Registers the default language codes in {@link #supportedLanguages}.
	 *
	 * <p>The list is static, therefore a code is only added when it is not
	 * present yet; constructing several instances no longer produces
	 * duplicate entries.
	 *
	 * @throws CorruptIndexException never thrown here; kept so overriding subclasses may throw it
	 * @throws LockObtainFailedException never thrown here; kept so overriding subclasses may throw it
	 * @throws IOException never thrown here; kept so overriding subclasses may throw it
	 */
	protected void init_languages() throws CorruptIndexException, LockObtainFailedException, IOException{
		for (String language : DEFAULT_LANGUAGES) {
			if (!supportedLanguages.contains(language)) {
				supportedLanguages.add(language);
			}
		}
	}

	/**
	 * Creates one {@link LanguageSearcher} per language key and adds it to
	 * {@link #languageSearchers}. German and French get their dedicated
	 * analyzers; every other key uses the {@link StandardAnalyzer}.
	 *
	 * @param dir directory passed to every created searcher
	 * @throws CorruptIndexException if creating a searcher fails with it
	 * @throws LockObtainFailedException if creating a searcher fails with it
	 * @throws IOException if creating a searcher fails with it
	 */
	protected void init_language_searchers(File dir) throws CorruptIndexException,
			LockObtainFailedException, IOException {
		languageSearchers.add(new LanguageSearcher("de",new GermanAnalyzer(Version.LUCENE_30),dir));
		languageSearchers.add(new LanguageSearcher("en",new StandardAnalyzer(Version.LUCENE_30),dir));
		languageSearchers.add(new LanguageSearcher("fr",new FrenchAnalyzer(Version.LUCENE_30),dir));
		languageSearchers.add(new LanguageSearcher("all",new StandardAnalyzer(Version.LUCENE_30),dir));
		languageSearchers.add(new LanguageSearcher("morph",new StandardAnalyzer(Version.LUCENE_30),dir));
		languageSearchers.add(new LanguageSearcher("la",new StandardAnalyzer(Version.LUCENE_30),dir));
	}

	/**
	 * Runs a prepared query for one language.
	 *
	 * @param query the query to execute
	 * @param language key of the searcher to use
	 * @return whatever {@link LanguageSearchers#searchLanguage} returns for the query
	 * @throws IOException if the underlying search fails
	 */
	public Collector search(Query query,String language) throws IOException{
		return languageSearchers.searchLanguage(query, language);
	}

	/**
	 * Runs a prepared query via {@link LanguageSearchers#searchAllLanguages}.
	 *
	 * @param query the query to execute
	 * @return the map of collectors produced by the container
	 * @throws IOException if the underlying search fails
	 */
	public HashMap<String, Collector> search(Query query) throws IOException{
		return languageSearchers.searchAllLanguages(query);
	}

	/**
	 * Parses and runs a search string, optionally combined with a metadata
	 * string and a list of languages.
	 *
	 * <p>When both {@code mdString} and {@code languages} are {@code null}
	 * the plain one-argument search is used; in every other case all three
	 * arguments are forwarded unchanged.
	 *
	 * @param searchString the query text to parse
	 * @param mdString metadata query text, may be {@code null}
	 * @param languages language keys, may be {@code null}
	 * @return the map of collectors produced by the container
	 * @throws ParseException if a query string cannot be parsed
	 * @throws IOException if the underlying search fails
	 */
	public HashMap<String, Collector> searchMD(String searchString, String mdString, ArrayList<String> languages) throws ParseException, IOException {
		if (mdString == null && languages == null) {
			return languageSearchers.parseAndsearchAllLanguages(searchString);
		}

		return languageSearchers.parseAndsearchAllLanguages(searchString,mdString,languages);
	}

	/**
	 * Parses the search string and runs it via
	 * {@link LanguageSearchers#parseAndsearchAllLanguages(String)}.
	 *
	 * @param searchString the query text to parse
	 * @return the map of collectors produced by the container
	 * @throws IOException if the underlying search fails
	 * @throws ParseException if the query string cannot be parsed
	 */
	public HashMap<String, Collector> search(String searchString) throws IOException, ParseException {
		return languageSearchers.parseAndsearchAllLanguages(searchString);
	}

//	TODO: implement donatus
//	public HashMap<String, HashMap<String, Collector>> searchAndAnalyse(String searchString) throws IOException, ParseException {
//		
//		return languageSearchers.parseAndsearchAndAnalyseAllLanguages(searchString);
//	}

	/**
	 * Returns the variant forms recorded for a lemma in the morphology data
	 * of a document.
	 *
	 * <p>The {@code morph} searcher is queried with a {@link WildcardQuery}
	 * on the {@code path} field (so {@code *} and {@code ?} in {@code path}
	 * act as wildcards). The stored {@code donatusMorph} field of the hit is
	 * parsed as XML and the {@code form} attribute of every
	 * {@code lemma[@form=word]/variant} element is collected. At most ten
	 * hits are inspected; if several carry morphology data the last one wins.
	 *
	 * <p>The word is bound as an XPath variable instead of being pasted into
	 * the expression, so words containing quote characters are matched
	 * literally and cannot alter the query.
	 *
	 * @param path value (or wildcard pattern) of the {@code path} field to look up
	 * @param word lemma form to look for; {@code null} yields an empty result
	 * @return the variant forms, in document order; empty when no morphology
	 *         data is stored for the path or nothing matches, never {@code null}
	 * @throws ParseException declared for compatibility with existing callers
	 * @throws IOException if the index cannot be read or the XML cannot be read
	 * @throws ParserConfigurationException if no XML parser can be created
	 * @throws SAXException if the stored morphology data is not well-formed XML
	 * @throws XPathExpressionException if the XPath expression cannot be evaluated
	 */
	public ArrayList<String> searchForMorph(String path, final String word) throws ParseException, IOException, ParserConfigurationException, SAXException, XPathExpressionException {
		ArrayList<String> variants = new ArrayList<String>();

		LanguageSearcher morphSearcher = languageSearchers.getSearcherByLanguage("morph");
		Query pathQuery = new WildcardQuery(new Term("path", path));

		TopScoreDocCollector collector = TopScoreDocCollector.create(10, false);
		morphSearcher.searcher.search(pathQuery, collector);

		// Keep the last hit that actually carries morphology data.
		String morphXml = null;
		for (ScoreDoc hit : collector.topDocs().scoreDocs) {
			String candidate = morphSearcher.searcher.doc(hit.doc).get("donatusMorph");
			if (candidate != null) {
				morphXml = candidate;
			}
		}

		// No hit / no stored field: previously this ended in a NullPointerException
		// inside StringReader. A null word cannot be bound as an XPath variable.
		if (morphXml == null || word == null) {
			return variants;
		}

		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
		factory.setNamespaceAware(false); // the XPath below uses unprefixed element names
		DocumentBuilder builder = factory.newDocumentBuilder();
		org.w3c.dom.Document morphDocument = builder.parse(new InputSource(new StringReader(morphXml)));

		XPath xpath = XPathFactory.newInstance().newXPath();
		// The resolver has to be installed before compile() so that $word is bound.
		xpath.setXPathVariableResolver(new XPathVariableResolver() {
			@Override
			public Object resolveVariable(QName variableName) {
				return "word".equals(variableName.getLocalPart()) ? word : null;
			}
		});
		XPathExpression expression = xpath.compile("//lemma[@form=$word]/variant/@form");

		NodeList nodes = (NodeList) expression.evaluate(morphDocument, XPathConstants.NODESET);
		for (int i = 0; i < nodes.getLength(); i++) {
			variants.add(nodes.item(i).getNodeValue());
		}

		return variants;
	}

}