comparison src/de/mpiwg/dwinter/fulltext/search/FulltextSearch.java @ 1:5c9c31510f0c

CLOSED - # 16: Zeige nur eine konfigurierbare Anzahl von Treffern an. https://it-dev.mpiwg-berlin.mpg.de/tracs/pythonOcropusTools/ticket/16
author dwinter
date Wed, 03 Nov 2010 12:26:29 +0100
parents
children 2b29b0b6db16
comparison
equal deleted inserted replaced
0:72a015318a6d 1:5c9c31510f0c
1 package de.mpiwg.dwinter.fulltext.search;
2
3 import java.io.File;
4 import java.io.IOException;
5 import java.io.InputStream;
6 import java.io.StringReader;
7 import java.util.ArrayList;
8 import java.util.Arrays;
9 import java.util.HashMap;
10 import java.util.Iterator;
11
12 import javax.swing.text.Document;
13 import javax.xml.parsers.DocumentBuilder;
14 import javax.xml.parsers.DocumentBuilderFactory;
15 import javax.xml.parsers.ParserConfigurationException;
16 import javax.xml.xpath.XPath;
17 import javax.xml.xpath.XPathConstants;
18 import javax.xml.xpath.XPathExpression;
19 import javax.xml.xpath.XPathExpressionException;
20 import javax.xml.xpath.XPathFactory;
21
22 import org.apache.lucene.analysis.de.GermanAnalyzer;
23 import org.apache.lucene.analysis.fr.FrenchAnalyzer;
24 import org.apache.lucene.analysis.standard.StandardAnalyzer;
25 import org.apache.lucene.index.CorruptIndexException;
26 import org.apache.lucene.index.Term;
27 import org.apache.lucene.queryParser.ParseException;
28 import org.apache.lucene.search.Collector;
29 import org.apache.lucene.search.Query;
30 import org.apache.lucene.search.ScoreDoc;
31 import org.apache.lucene.search.TopDocs;
32 import org.apache.lucene.search.TopDocsCollector;
33 import org.apache.lucene.search.TopScoreDocCollector;
34 import org.apache.lucene.search.WildcardQuery;
35 import org.apache.lucene.store.LockObtainFailedException;
36 import org.apache.lucene.util.Version;
37 import org.w3c.dom.NodeList;
38 import org.xml.sax.InputSource;
39 import org.xml.sax.SAXException;
40
41 import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher;
42 import de.mpiwg.dwinter.fulltext.searcher.LanguageSearchers;
43
44
45
46 public class FulltextSearch {
47
48 protected File index_dir;
49
50 public LanguageSearchers languageSearchers = new LanguageSearchers();
51 protected static ArrayList<String> supportedLanguages = new ArrayList<String>();
52
53 public FulltextSearch(){}
54
55 public FulltextSearch(File index_dir) throws CorruptIndexException, LockObtainFailedException, IOException {
56
57
58 this.index_dir=index_dir;
59
60 init_language_searchers(index_dir);
61 init_languages();
62 }
63
64 protected void init_languages() throws CorruptIndexException, LockObtainFailedException, IOException{
65
66
67
68 supportedLanguages.add("de");
69 supportedLanguages.add("en");
70 supportedLanguages.add("fr");
71 supportedLanguages.add("la");
72
73 }
74
75 protected void init_language_searchers(File dir) throws CorruptIndexException,
76 LockObtainFailedException, IOException {
77 languageSearchers.add(new LanguageSearcher("de",new GermanAnalyzer(Version.LUCENE_30),dir));
78 languageSearchers.add(new LanguageSearcher("en",new StandardAnalyzer(Version.LUCENE_30),dir));
79 languageSearchers.add(new LanguageSearcher("fr",new FrenchAnalyzer(Version.LUCENE_30),dir));
80 languageSearchers.add(new LanguageSearcher("all",new StandardAnalyzer(Version.LUCENE_30),dir));
81 languageSearchers.add(new LanguageSearcher("morph",new StandardAnalyzer(Version.LUCENE_30),dir));
82 languageSearchers.add(new LanguageSearcher("la",new StandardAnalyzer(Version.LUCENE_30),dir));
83 }
84
85 public Collector search(Query query,String language) throws IOException{
86 return languageSearchers.searchLanguage(query, language);
87 }
88
89 public HashMap<String, Collector> search(Query query) throws IOException{
90 return languageSearchers.searchAllLanguages(query);
91 }
92
93 public HashMap<String, Collector> searchMD(String searchString, String mdString, ArrayList<String> languages) throws ParseException, IOException {
94 if (mdString==null & languages==null)
95 return languageSearchers.parseAndsearchAllLanguages(searchString);
96
97 return languageSearchers.parseAndsearchAllLanguages(searchString,mdString,languages);
98
99 }
100
101 public HashMap<String, Collector> search(String searchString) throws IOException, ParseException {
102
103 return languageSearchers.parseAndsearchAllLanguages(searchString);
104 }
105
106 // TODO: implement donatus
107 // public HashMap<String, HashMap<String, Collector>> searchAndAnalyse(String searchString) throws IOException, ParseException {
108 //
109 // return languageSearchers.parseAndsearchAndAnalyseAllLanguages(searchString);
110 // }
111
112 public ArrayList<String> searchForMorph(String path, String word) throws ParseException, IOException, ParserConfigurationException, SAXException, XPathExpressionException {
113
114 LanguageSearcher searcher = languageSearchers.getSearcherByLanguage("morph");
115 Term term=new Term("path",path);
116 Query query=new WildcardQuery(term);
117
118 TopScoreDocCollector col = TopScoreDocCollector.create(10, false);
119
120 searcher.searcher.search(query, col);
121
122 ArrayList<String> ret = new ArrayList<String>();
123
124
125 String morph = null;
126 //System.out.println("path:"+path);
127 ScoreDoc[] docs = col.topDocs().scoreDocs;
128
129
130 for (ScoreDoc doc:docs){
131 morph = searcher.searcher.doc(doc.doc).get("donatusMorph");
132 //System.out.println("morph:"+morph);
133 }
134
135 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
136 factory.setNamespaceAware(false); // never forget this!
137 DocumentBuilder builder = factory.newDocumentBuilder();
138 InputSource s =new InputSource(new StringReader(morph));
139 //System.out.println("morph:"+morph);
140 //System.out.println("morphende");
141 org.w3c.dom.Document doc = builder.parse(s);
142
143 XPathFactory xpathfactory = XPathFactory.newInstance();
144 XPath xpath = xpathfactory.newXPath();
145 String xquery = "//lemma[@form='"+word+"']/variant/@form";
146 XPathExpression expr = xpath.compile(xquery);
147 //System.out.println("xpath now:"+xquery);
148 Object result = expr.evaluate(doc, XPathConstants.NODESET);
149 NodeList nodes = (NodeList) result;
150 for (int i = 0; i < nodes.getLength(); i++) {
151 //System.out.println("nodeS:"+nodes.item(i).getNodeValue());
152 ret.add(nodes.item(i).getNodeValue());
153 }
154
155 return ret;
156 }
157
158
159
160 }