Mercurial > hg > fulltextSearch
comparison src/de/mpiwg/dwinter/fulltext/search/FulltextSearch.java @ 1:5c9c31510f0c
CLOSED - # 16: Zeige nur eine konfigurierbare Anzahl von Treffern an.
https://it-dev.mpiwg-berlin.mpg.de/tracs/pythonOcropusTools/ticket/16
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:26:29 +0100 |
parents | |
children | 2b29b0b6db16 |
comparison
equal
deleted
inserted
replaced
0:72a015318a6d | 1:5c9c31510f0c |
---|---|
1 package de.mpiwg.dwinter.fulltext.search; | |
2 | |
3 import java.io.File; | |
4 import java.io.IOException; | |
5 import java.io.InputStream; | |
6 import java.io.StringReader; | |
7 import java.util.ArrayList; | |
8 import java.util.Arrays; | |
9 import java.util.HashMap; | |
10 import java.util.Iterator; | |
11 | |
12 import javax.swing.text.Document; | |
13 import javax.xml.parsers.DocumentBuilder; | |
14 import javax.xml.parsers.DocumentBuilderFactory; | |
15 import javax.xml.parsers.ParserConfigurationException; | |
16 import javax.xml.xpath.XPath; | |
17 import javax.xml.xpath.XPathConstants; | |
18 import javax.xml.xpath.XPathExpression; | |
19 import javax.xml.xpath.XPathExpressionException; | |
20 import javax.xml.xpath.XPathFactory; | |
21 | |
22 import org.apache.lucene.analysis.de.GermanAnalyzer; | |
23 import org.apache.lucene.analysis.fr.FrenchAnalyzer; | |
24 import org.apache.lucene.analysis.standard.StandardAnalyzer; | |
25 import org.apache.lucene.index.CorruptIndexException; | |
26 import org.apache.lucene.index.Term; | |
27 import org.apache.lucene.queryParser.ParseException; | |
28 import org.apache.lucene.search.Collector; | |
29 import org.apache.lucene.search.Query; | |
30 import org.apache.lucene.search.ScoreDoc; | |
31 import org.apache.lucene.search.TopDocs; | |
32 import org.apache.lucene.search.TopDocsCollector; | |
33 import org.apache.lucene.search.TopScoreDocCollector; | |
34 import org.apache.lucene.search.WildcardQuery; | |
35 import org.apache.lucene.store.LockObtainFailedException; | |
36 import org.apache.lucene.util.Version; | |
37 import org.w3c.dom.NodeList; | |
38 import org.xml.sax.InputSource; | |
39 import org.xml.sax.SAXException; | |
40 | |
41 import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher; | |
42 import de.mpiwg.dwinter.fulltext.searcher.LanguageSearchers; | |
43 | |
44 | |
45 | |
46 public class FulltextSearch { | |
47 | |
48 protected File index_dir; | |
49 | |
50 public LanguageSearchers languageSearchers = new LanguageSearchers(); | |
51 protected static ArrayList<String> supportedLanguages = new ArrayList<String>(); | |
52 | |
53 public FulltextSearch(){} | |
54 | |
55 public FulltextSearch(File index_dir) throws CorruptIndexException, LockObtainFailedException, IOException { | |
56 | |
57 | |
58 this.index_dir=index_dir; | |
59 | |
60 init_language_searchers(index_dir); | |
61 init_languages(); | |
62 } | |
63 | |
64 protected void init_languages() throws CorruptIndexException, LockObtainFailedException, IOException{ | |
65 | |
66 | |
67 | |
68 supportedLanguages.add("de"); | |
69 supportedLanguages.add("en"); | |
70 supportedLanguages.add("fr"); | |
71 supportedLanguages.add("la"); | |
72 | |
73 } | |
74 | |
75 protected void init_language_searchers(File dir) throws CorruptIndexException, | |
76 LockObtainFailedException, IOException { | |
77 languageSearchers.add(new LanguageSearcher("de",new GermanAnalyzer(Version.LUCENE_30),dir)); | |
78 languageSearchers.add(new LanguageSearcher("en",new StandardAnalyzer(Version.LUCENE_30),dir)); | |
79 languageSearchers.add(new LanguageSearcher("fr",new FrenchAnalyzer(Version.LUCENE_30),dir)); | |
80 languageSearchers.add(new LanguageSearcher("all",new StandardAnalyzer(Version.LUCENE_30),dir)); | |
81 languageSearchers.add(new LanguageSearcher("morph",new StandardAnalyzer(Version.LUCENE_30),dir)); | |
82 languageSearchers.add(new LanguageSearcher("la",new StandardAnalyzer(Version.LUCENE_30),dir)); | |
83 } | |
84 | |
85 public Collector search(Query query,String language) throws IOException{ | |
86 return languageSearchers.searchLanguage(query, language); | |
87 } | |
88 | |
89 public HashMap<String, Collector> search(Query query) throws IOException{ | |
90 return languageSearchers.searchAllLanguages(query); | |
91 } | |
92 | |
93 public HashMap<String, Collector> searchMD(String searchString, String mdString, ArrayList<String> languages) throws ParseException, IOException { | |
94 if (mdString==null & languages==null) | |
95 return languageSearchers.parseAndsearchAllLanguages(searchString); | |
96 | |
97 return languageSearchers.parseAndsearchAllLanguages(searchString,mdString,languages); | |
98 | |
99 } | |
100 | |
101 public HashMap<String, Collector> search(String searchString) throws IOException, ParseException { | |
102 | |
103 return languageSearchers.parseAndsearchAllLanguages(searchString); | |
104 } | |
105 | |
106 // TODO: implement donatus | |
107 // public HashMap<String, HashMap<String, Collector>> searchAndAnalyse(String searchString) throws IOException, ParseException { | |
108 // | |
109 // return languageSearchers.parseAndsearchAndAnalyseAllLanguages(searchString); | |
110 // } | |
111 | |
112 public ArrayList<String> searchForMorph(String path, String word) throws ParseException, IOException, ParserConfigurationException, SAXException, XPathExpressionException { | |
113 | |
114 LanguageSearcher searcher = languageSearchers.getSearcherByLanguage("morph"); | |
115 Term term=new Term("path",path); | |
116 Query query=new WildcardQuery(term); | |
117 | |
118 TopScoreDocCollector col = TopScoreDocCollector.create(10, false); | |
119 | |
120 searcher.searcher.search(query, col); | |
121 | |
122 ArrayList<String> ret = new ArrayList<String>(); | |
123 | |
124 | |
125 String morph = null; | |
126 //System.out.println("path:"+path); | |
127 ScoreDoc[] docs = col.topDocs().scoreDocs; | |
128 | |
129 | |
130 for (ScoreDoc doc:docs){ | |
131 morph = searcher.searcher.doc(doc.doc).get("donatusMorph"); | |
132 //System.out.println("morph:"+morph); | |
133 } | |
134 | |
135 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); | |
136 factory.setNamespaceAware(false); // never forget this! | |
137 DocumentBuilder builder = factory.newDocumentBuilder(); | |
138 InputSource s =new InputSource(new StringReader(morph)); | |
139 //System.out.println("morph:"+morph); | |
140 //System.out.println("morphende"); | |
141 org.w3c.dom.Document doc = builder.parse(s); | |
142 | |
143 XPathFactory xpathfactory = XPathFactory.newInstance(); | |
144 XPath xpath = xpathfactory.newXPath(); | |
145 String xquery = "//lemma[@form='"+word+"']/variant/@form"; | |
146 XPathExpression expr = xpath.compile(xquery); | |
147 //System.out.println("xpath now:"+xquery); | |
148 Object result = expr.evaluate(doc, XPathConstants.NODESET); | |
149 NodeList nodes = (NodeList) result; | |
150 for (int i = 0; i < nodes.getLength(); i++) { | |
151 //System.out.println("nodeS:"+nodes.item(i).getNodeValue()); | |
152 ret.add(nodes.item(i).getNodeValue()); | |
153 } | |
154 | |
155 return ret; | |
156 } | |
157 | |
158 | |
159 | |
160 } |