Mercurial > hg > fulltextSearch
view src/de/mpiwg/dwinter/fulltext/search/FulltextSearch.java @ 1:5c9c31510f0c
CLOSED - # 16: Zeige nur eine konfigurierbare Anzahl von Treffern an.
https://it-dev.mpiwg-berlin.mpg.de/tracs/pythonOcropusTools/ticket/16
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:26:29 +0100 |
parents | |
children | 2b29b0b6db16 |
line wrap: on
line source
package de.mpiwg.dwinter.fulltext.search;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;

import javax.swing.text.Document;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher;
import de.mpiwg.dwinter.fulltext.searcher.LanguageSearchers;

/**
 * Entry point for full-text queries against a set of per-language searchers.
 *
 * <p>One {@link LanguageSearcher} is registered per language key ("de", "en", "fr", "la") plus
 * the special keys "all" and "morph"; all of them are created on the same index directory.
 * The search methods delegate to the {@link LanguageSearchers} container.
 */
public class FulltextSearch {

    /** Index directory given to the constructor; also handed to every language searcher. */
    protected File index_dir;

    /** Container holding the registered per-language searchers. */
    public LanguageSearchers languageSearchers = new LanguageSearchers();

    /**
     * Language codes registered by {@link #init_languages()}. The list is static and therefore
     * shared by every instance of this class.
     */
    protected static ArrayList<String> supportedLanguages = new ArrayList<String>();

    /** Creates an instance without an index directory; no searchers are registered. */
    public FulltextSearch() {
    }

    /**
     * Creates an instance, registers the language searchers on {@code index_dir} and fills the
     * list of supported languages.
     *
     * @param index_dir directory passed to every {@link LanguageSearcher}
     * @throws CorruptIndexException declared by the initialisation methods
     * @throws LockObtainFailedException declared by the initialisation methods
     * @throws IOException declared by the initialisation methods
     */
    public FulltextSearch(File index_dir)
            throws CorruptIndexException, LockObtainFailedException, IOException {
        this.index_dir = index_dir;
        init_language_searchers(index_dir);
        init_languages();
    }

    /**
     * Registers the supported language codes ("de", "en", "fr", "la").
     *
     * <p>The target list is static, so a code is only added when it is not present yet;
     * otherwise every new instance would append another copy of each code.
     *
     * @throws CorruptIndexException never thrown here; kept for signature compatibility
     * @throws LockObtainFailedException never thrown here; kept for signature compatibility
     * @throws IOException never thrown here; kept for signature compatibility
     */
    protected void init_languages()
            throws CorruptIndexException, LockObtainFailedException, IOException {
        for (String language : new String[] {"de", "en", "fr", "la"}) {
            if (!supportedLanguages.contains(language)) {
                supportedLanguages.add(language);
            }
        }
    }

    /**
     * Registers one {@link LanguageSearcher} per language key on the given directory.
     * German and French get their dedicated analyzers; "en", "all", "morph" and "la" use the
     * {@link StandardAnalyzer}.
     *
     * @param dir directory passed to every searcher
     * @throws CorruptIndexException propagated from creating a searcher
     * @throws LockObtainFailedException propagated from creating a searcher
     * @throws IOException propagated from creating a searcher
     */
    protected void init_language_searchers(File dir)
            throws CorruptIndexException, LockObtainFailedException, IOException {
        languageSearchers.add(new LanguageSearcher("de", new GermanAnalyzer(Version.LUCENE_30), dir));
        languageSearchers.add(new LanguageSearcher("en", new StandardAnalyzer(Version.LUCENE_30), dir));
        languageSearchers.add(new LanguageSearcher("fr", new FrenchAnalyzer(Version.LUCENE_30), dir));
        languageSearchers.add(new LanguageSearcher("all", new StandardAnalyzer(Version.LUCENE_30), dir));
        languageSearchers.add(new LanguageSearcher("morph", new StandardAnalyzer(Version.LUCENE_30), dir));
        languageSearchers.add(new LanguageSearcher("la", new StandardAnalyzer(Version.LUCENE_30), dir));
    }

    /**
     * Runs a prepared query against the searcher of a single language.
     *
     * @param query the query to execute
     * @param language key of the searcher to use
     * @return the collector returned by {@link LanguageSearchers#searchLanguage}
     * @throws IOException propagated from the underlying search
     */
    public Collector search(Query query, String language) throws IOException {
        return languageSearchers.searchLanguage(query, language);
    }

    /**
     * Runs a prepared query against all registered searchers.
     *
     * @param query the query to execute
     * @return the map returned by {@link LanguageSearchers#searchAllLanguages}
     * @throws IOException propagated from the underlying search
     */
    public HashMap<String, Collector> search(Query query) throws IOException {
        return languageSearchers.searchAllLanguages(query);
    }

    /**
     * Parses and runs a search, optionally restricted by a metadata string and a language list.
     * When both {@code mdString} and {@code languages} are {@code null} this is the same as
     * {@link #search(String)}.
     *
     * @param searchString the query text to parse
     * @param mdString metadata restriction, may be {@code null}
     * @param languages languages to search, may be {@code null}
     * @return the map returned by {@code parseAndsearchAllLanguages}
     * @throws ParseException propagated from query parsing
     * @throws IOException propagated from the underlying search
     */
    public HashMap<String, Collector> searchMD(String searchString, String mdString,
            ArrayList<String> languages) throws ParseException, IOException {
        if (mdString == null && languages == null) {
            return languageSearchers.parseAndsearchAllLanguages(searchString);
        }
        return languageSearchers.parseAndsearchAllLanguages(searchString, mdString, languages);
    }

    /**
     * Parses the query text and runs it against all registered searchers.
     *
     * @param searchString the query text to parse
     * @return the map returned by {@code parseAndsearchAllLanguages}
     * @throws IOException propagated from the underlying search
     * @throws ParseException propagated from query parsing
     */
    public HashMap<String, Collector> search(String searchString)
            throws IOException, ParseException {
        return languageSearchers.parseAndsearchAllLanguages(searchString);
    }

    // TODO: implement donatus
    // public HashMap<String, HashMap<String, Collector>> searchAndAnalyse(String searchString)
    //         throws IOException, ParseException {
    //     return languageSearchers.parseAndsearchAndAnalyseAllLanguages(searchString);
    // }

    /**
     * Looks up the variant forms recorded for a lemma in the morphology data of a document.
     *
     * <p>The "morph" searcher is queried with a wildcard query on the {@code path} field (at most
     * ten hits are collected). The {@code donatusMorph} field of the hit is parsed as XML and
     * the {@code form} attributes of all {@code variant} children of {@code lemma} elements
     * whose {@code form} equals {@code word} are returned. If several documents match, the value
     * of the last hit is used.
     *
     * @param path value (wildcards allowed) matched against the {@code path} field
     * @param word lemma form to look up; {@code null} yields an empty result
     * @return the variant forms in document order; empty when {@code word} is {@code null}, when
     *         no document matches or when the hit carries no {@code donatusMorph} value
     * @throws ParseException declared for signature compatibility
     * @throws IOException propagated from the index search
     * @throws ParserConfigurationException if no XML parser can be created
     * @throws SAXException if the stored morphology data is not well-formed XML
     * @throws XPathExpressionException if the XPath expression cannot be compiled or evaluated
     */
    public ArrayList<String> searchForMorph(String path, String word)
            throws ParseException, IOException, ParserConfigurationException, SAXException,
            XPathExpressionException {
        ArrayList<String> ret = new ArrayList<String>();
        if (word == null) {
            // Concatenating null into the XPath would search for the literal form "null".
            return ret;
        }

        LanguageSearcher searcher = languageSearchers.getSearcherByLanguage("morph");
        Query query = new WildcardQuery(new Term("path", path));
        TopScoreDocCollector col = TopScoreDocCollector.create(10, false);
        searcher.searcher.search(query, col);

        // Each hit overwrites the previous value, so the last hit wins.
        String morph = null;
        for (ScoreDoc hit : col.topDocs().scoreDocs) {
            morph = searcher.searcher.doc(hit.doc).get("donatusMorph");
        }
        if (morph == null) {
            // No hit, or the hit has no morphology field: nothing to parse.
            return ret;
        }

        // NOTE(review): the parser is used with default settings (DTDs/external entities are not
        // disabled). Confirm that the indexed donatusMorph content is trusted.
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(false); // never forget this!
        DocumentBuilder builder = factory.newDocumentBuilder();
        org.w3c.dom.Document morphDoc = builder.parse(new InputSource(new StringReader(morph)));

        XPath xpath = XPathFactory.newInstance().newXPath();
        String xquery = "//lemma[@form=" + toXPathLiteral(word) + "]/variant/@form";
        XPathExpression expr = xpath.compile(xquery);

        NodeList nodes = (NodeList) expr.evaluate(morphDoc, XPathConstants.NODESET);
        for (int i = 0; i < nodes.getLength(); i++) {
            ret.add(nodes.item(i).getNodeValue());
        }
        return ret;
    }

    /**
     * Renders a string as an XPath 1.0 string literal. XPath 1.0 has no escape sequences, so a
     * value containing both quote characters is expressed as a {@code concat(...)} call.
     */
    private static String toXPathLiteral(String value) {
        if (value.indexOf('\'') < 0) {
            return "'" + value + "'";
        }
        if (value.indexOf('"') < 0) {
            return "\"" + value + "\"";
        }
        // Split on the apostrophes and re-insert each one as a double-quoted literal.
        StringBuilder literal = new StringBuilder("concat(");
        String[] parts = value.split("'", -1);
        for (int i = 0; i < parts.length; i++) {
            if (i > 0) {
                literal.append(",\"'\",");
            }
            literal.append('\'').append(parts[i]).append('\'');
        }
        return literal.append(')').toString();
    }
}