Mercurial > hg > mpdl-group
comparison software/eXist/webapp/mpdl/lucene/search.xql @ 7:5589d865af7a
Erstellung XQL/XSL Applikation
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 08 Feb 2011 15:16:46 +0100 |
parents | |
children | d6f528ad5d96 |
comparison
equal
deleted
inserted
replaced
6:2396a569e446 | 7:5589d865af7a |
---|---|
1 xquery version "1.0"; | |
2 | |
3 module namespace mpdl-lucene = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/lucene/search"; | |
4 | |
5 import module namespace functx = "http://www.functx.com" at "../util/functx.xql"; | |
6 | |
7 declare namespace ft = "http://exist-db.org/xquery/lucene"; | |
8 | |
9 declare namespace echo="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/"; | |
10 | |
11 declare namespace dc="http://purl.org/dc/elements/1.1/"; | |
12 declare namespace dcterms="http://purl.org/dc/terms"; | |
13 | |
(: Full-text search across a collection.
   $mpdlCollection : node sequence to be filtered
   $queryStr       : Lucene query string
   When mpdltext:lucene-parse-query() yields '' for the query, returns those
   nodes of $mpdlCollection for which ft:query matches on archimedes/text or
   on echo:echo/echo:text. Otherwise returns a <result> element that carries
   the parser's value in <error> and zeroed size/pages/pn. :)
declare function mpdl-lucene:search($mpdlCollection, $queryStr) {
  let $parseError := mpdltext:lucene-parse-query($queryStr)
  return
    if ($parseError = '')
    then $mpdlCollection/.[ft:query(archimedes/text, $queryStr) or ft:query(echo:echo/echo:text, $queryStr)]
    else
      <result>
        <error>{$parseError}</error>
        <size>0</size>
        <pages>0</pages>
        <pn>0</pn>
        <hits/>
      </result>
};
29 | |
(: Paged full-text search for sentences (s elements) inside one document.
   $mpdlCollectionName : 'echo' selects the echo-namespaced pb/s elements;
                         'archimedes' and any other value use un-namespaced ones
   $language           : passed to the mpdltext helper functions
   $document           : node whose descendants are searched
   $queryType          : 'fulltext' | 'fulltextMorph' | 'fulltextMorphLemma'
   $queryStr           : Lucene query string
   $pn                 : 1-based number of the requested result page
   $pageSize           : hits per page
   Returns a <result> element. If mpdltext:lucene-parse-query() yields a
   non-empty value it is reported in <error> with zero hits; otherwise the
   element holds total size, page size, page count, page number, one <hit>
   per sentence on the requested page, and the query forms/regularizations.
   Only the requested page of hits is decorated with page-break information,
   because that per-hit work dominates the run time. :)
declare function mpdl-lucene:search($mpdlCollectionName, $language, $document, $queryType, $queryStr, $pn as xs:int, $pageSize as xs:int) as node() {
  (: 'archimedes' takes precedence, exactly as in the original if-chain :)
  let $isEcho := not($mpdlCollectionName = 'archimedes') and $mpdlCollectionName = 'echo'
  let $query :=
    if ($queryType = 'fulltext' or $queryType = 'fulltextMorph')
    then $queryStr
    else if ($queryType = 'fulltextMorphLemma')
    (: NOTE(review): the 'lemmalemma' prefix is presumably interpreted by the index analyzer - confirm :)
    then concat('lemmalemma', $queryStr)
    else ()
  let $pageBreaks :=
    if ($isEcho)
    then $document//echo:pb
    else $document//pb
  let $luceneParseResult := mpdltext:lucene-parse-query($queryStr)
  (: all matching sentences; left empty when the query did not parse :)
  let $t :=
    if ($luceneParseResult != '')
    then ()
    else if ($isEcho)
    then $document//echo:s[ft:query(., $query)]
    else $document//s[ft:query(., $query)]
  let $from := ($pn * $pageSize) - $pageSize + 1
  (: window of hits for the requested page: positions $from .. $pn * $pageSize :)
  let $pageOfHits := subsequence($t, $from, $pageSize)
  let $queryResult :=
    for $s at $pos in $pageOfHits
    (: number of page breaks preceding the sentence = page the sentence starts on :)
    let $pnOfS := count($pageBreaks[. << $s])
    let $pb := subsequence($pageBreaks, $pnOfS, 1)
    (: does the next page break sit inside this sentence? :)
    let $pbPlus1 := subsequence($pageBreaks, $pnOfS + 1, 1)
    let $sSurroundsPB :=
      ($pbPlus1/parent::node() = $s and $pbPlus1 intersect $s/descendant::node())
    (: 1-based position of the sentence among the sentences after its page break :)
    let $posOfS :=
      if ($isEcho)
      then count($pb/following::echo:s[. << $s]) + 1
      else count($pb/following::s[. << $s]) + 1
    let $position := $from - 1 + $pos
    return
      <hit>
        <pos>{$position}</pos>
        <pn>{$pnOfS}</pn>
        <pos-of-s>{$posOfS}</pos-of-s>
        <s>{string($s)}</s>
        <s-surrounds-pb>{$sSurroundsPB}</s-surrounds-pb>
      </hit>
  let $resultSize := count($t)
  (: ceiling division: the former "idiv + 1" reported an extra, empty page
     whenever the hit count was an exact multiple of the page size :)
  let $pages :=
    if ($resultSize = 0)
    then 0
    else ($resultSize + $pageSize - 1) idiv $pageSize
  let $queryForms :=
    if ($queryType = 'fulltextMorph' or $queryType = 'fulltextMorphLemma')
    then mpdltext:get-query-morph-forms($language, $queryStr)
    else ()
  let $queryRegularizations :=
    if ($queryType = 'fulltextMorph')
    then mpdltext:get-query-regularizations($language, $queryStr)
    else ()
  let $encodedQueryTerms :=
    if ($language = "zh")
    then mpdltext:get-big5-encoded-terms($query)
    else ()
  return
    if ($luceneParseResult != '')
    then
      <result>
        <error>{$luceneParseResult}</error>
        <size>0</size>
        <pages>0</pages>
        <pn>0</pn>
        <hits/>
      </result>
    else
      <result>
        <size>{$resultSize}</size>
        <page-size>{$pageSize}</page-size>
        <pages>{$pages}</pages>
        <pn>{$pn}</pn>
        <hits>
          {$queryResult}
        </hits>
        <query-forms>{$queryForms}</query-forms>
        <query-regularizations>{$queryRegularizations}</query-regularizations>
        {$encodedQueryTerms}
      </result>
};
133 | |
(: Metadata search on one or two attributes.
   $metadataStr            : XQuery path expression (as a string) selecting the metadata nodes
   $attribute1/$attrValue1 : first attribute name and the value to look for
   $boolOp                 : "or" | "and" | "andNot"; only used when $attrValue2 is not ""
   $attribute2/$attrValue2 : optional second attribute and value
   Builds "<metadataStr>/.[<predicate>]" and evaluates it with util:eval when
   mpdltext:lucene-parse-query() yields '' for both values; otherwise returns
   a <result> element whose <error> names the offending attribute(s).
   NOTE(review): the evaluated expression is assembled from caller-supplied
   strings, so callers must not pass untrusted text in $metadataStr. :)
declare function mpdl-lucene:attrSearch($metadataStr, $attribute1, $attrValue1, $boolOp, $attribute2, $attrValue2) {
  let $clause1 := mpdl-lucene:getAttrQueryStr($attribute1, $attrValue1)
  let $clause2 := mpdl-lucene:getAttrQueryStr($attribute2, $attrValue2)
  let $hasSecond := $attrValue2 != ""
  let $predicate :=
    if ($attrValue2 = "")
    then $clause1
    else if ($hasSecond and $boolOp = "or")
    then concat($clause1, " or ", $clause2)
    else if ($hasSecond and $boolOp = "and")
    then concat("(", $clause1, ") and (", $clause2, ")")
    else if ($hasSecond and $boolOp = "andNot")
    then concat("(", $clause1, ") and not(", $clause2, ")")
    else ()
  let $expression := concat($metadataStr, "/.[", $predicate, "]")
  let $err1 := mpdltext:lucene-parse-query($attrValue1)
  let $err2 := mpdltext:lucene-parse-query($attrValue2)
  let $ok1 := $err1 = ''
  let $ok2 := $err2 = ''
  let $bad1 := $err1 != ''
  let $bad2 := $err2 != ''
  (: '' when both values parsed; otherwise one message per failing value :)
  let $parseError :=
    if ($ok1 and $ok2)
    then ''
    else if ($bad1 and $ok2)
    then concat("attribute ", $attribute1, " with value: ", $attrValue1, ": ", $err1)
    else if ($ok1 and $bad2)
    then concat("attribute ", $attribute2, " with value: ", $attrValue2, ": ", $err2)
    else if ($bad1 and $bad2)
    then concat("attribute ", $attribute1, " with value: ", $attrValue1, ": ", $err1, ", attribute ", $attribute2, " with value: ", $attrValue2, ": ", $err2)
    else ()
  return
    if ($parseError = '')
    then util:eval($expression)
    else
      <result>
        <error>{$parseError}</error>
        <size>0</size>
        <pages>0</pages>
        <pn>0</pn>
        <hits/>
      </result>
};
174 | |
(: Sorts metadata nodes by one attribute.
   $metadata : nodes to sort; each must live in a stored document, because
               the document base is derived from its document-uri
   $orderBy  : logical attribute name as understood by getElemNameByAttr
   Returns the nodes of $metadata ordered by the lower-cased, ', '-joined
   value of the element that $orderBy maps to in each node's document. :)
declare function mpdl-lucene:order($metadata, $orderBy) {
  for $hit in $metadata
  let $root := $hit/fn:root()
  let $uri := document-uri($root)
  (: the document base is the path step two levels above the file name :)
  let $parentDir := functx:substring-before-last($uri, "/")
  let $grandParentDir := functx:substring-before-last($parentDir, "/")
  let $docBase := functx:substring-after-last($grandParentDir, "/")
  let $info := mpdl-lucene:getMetadata($docBase, $root)
  let $elemName := mpdl-lucene:getElemNameByAttr($docBase, $orderBy)
  (: dynamic lookup; this is the expensive step for large result sets :)
  let $keyNodes := mpdl-lucene:getElemDynamic($info, $elemName)
  let $sortKey := lower-case(string-join($keyNodes, ', '))
  order by $sortKey
  return $hit

  (: possible performance improvement (?), not implemented:
       let $result :=
         for $x in doc(/db/doc1.xml)
         order by $x
           if ($sortOrder eq "asc")
           then ( "ascending" )
           else ( "descending" )
  :)
};
199 | |
(: Returns the metadata container of a document:
   archimedes/info for $docBase 'archimedes', echo:echo/echo:metadata for
   'echo', and the empty sequence for any other document base. :)
declare function mpdl-lucene:getMetadata($docBase, $doc) {
  if ($docBase = 'archimedes')
  then $doc/archimedes/info
  else
    if ($docBase = 'echo')
    then $doc/echo:echo/echo:metadata
    else ()
};
209 | |
(: Returns the text container of a document:
   archimedes/text for $docBase 'archimedes', echo:echo/echo:text for
   'echo', and the empty sequence for any other document base. :)
declare function mpdl-lucene:getText($docBase, $doc) {
  if ($docBase = 'archimedes')
  then $doc/archimedes/text
  else
    if ($docBase = 'echo')
    then $doc/echo:echo/echo:text
    else ()
};
219 | |
220 | |
(: Resolves a relative path, given as a string, against $path.
   $path    : context node(s); referenced by name inside the evaluated
              expression, so this parameter must keep the name "path"
   $elemStr : relative path expression, e.g. an element name
   Returns the selected nodes, or the empty sequence when $elemStr is "" or
   empty (no expression is evaluated in that case).
   TODO: performance improvement: util:eval is executed on every call. :)
declare function mpdl-lucene:getElemDynamic($path, $elemStr) {
  if ($elemStr != "")
  then util:eval(concat("$path/", $elemStr))
  else ()
};
230 | |
(: Builds the predicate source text used to search one logical attribute in
   both document bases.
   $attribute : logical attribute name as understood by getElemNameByAttr
   $attrValue : value to search for
   For "date" the clause is an equality test (<elem> = '<value>'); for every
   other attribute it is ft:query(<elem>, '<value>'). The archimedes and echo
   clauses are joined with " or " when both bases know the attribute; when
   only one does, its clause is returned alone; when neither does, "" is
   returned.
   The value is embedded as a single-quoted XQuery string literal that is
   later evaluated, so apostrophes are doubled and ampersands are written as
   entity references. Without this, a value such as O'Brien or AT&T yields a
   syntactically broken (or injectable) expression. :)
declare function mpdl-lucene:getAttrQueryStr($attribute, $attrValue) {
  let $attrArch := mpdl-lucene:getElemNameByAttr("archimedes", $attribute)
  let $attrEcho := mpdl-lucene:getElemNameByAttr("echo", $attribute)
  (: escape & first so the entity text itself is not touched again :)
  let $escapedValue := replace(replace($attrValue, "&amp;", "&amp;amp;"), "'", "''")
  let $literal := concat("'", $escapedValue, "'")
  let $isDate := $attribute = "date"
  let $attrArchRelQueryStr :=
    if ($isDate)
    then concat($attrArch, " = ", $literal)
    else concat("ft:query(", $attrArch, ", ", $literal, ")")
  let $attrEchoRelQueryStr :=
    if ($isDate)
    then concat($attrEcho, " = ", $literal)
    else concat("ft:query(", $attrEcho, ", ", $literal, ")")
  return
    if ($attrArch != "" and $attrEcho != "")
    then concat($attrArchRelQueryStr, " or ", $attrEchoRelQueryStr)
    else if ($attrArch = "" and $attrEcho != "")
    then $attrEchoRelQueryStr
    else if ($attrArch != "" and $attrEcho = "")
    then $attrArchRelQueryStr
    else ""
};
252 | |
(: Maps a logical attribute name to the element name that stores it in the
   given document base.
   $docBase   : "archimedes" or "echo"
   $attribute : author | title | place | date | language | identifier |
                rights | license | accessRights | file | translator | version
   Returns the element name as a string, or "" when the document base has no
   element for the attribute or either argument is not recognised. :)
declare function mpdl-lucene:getElemNameByAttr($docBase, $attribute) {
  if ($docBase = "archimedes")
  then (
    if ($attribute = "author") then "author"
    else if ($attribute = "title") then "title"
    else if ($attribute = "place") then "place"
    else if ($attribute = "date") then "date"
    else if ($attribute = "language") then "lang"
    else if ($attribute = "identifier") then "locator"
    else if ($attribute = "rights") then ""
    else if ($attribute = "license") then ""
    else if ($attribute = "accessRights") then ""
    else if ($attribute = "file") then "cvs_file"
    else if ($attribute = "translator") then "translator"
    else if ($attribute = "version") then "cvs_version"
    else ""
  )
  else if ($docBase = "echo")
  then (
    if ($attribute = "author") then "dcterms:creator"
    else if ($attribute = "title") then "dcterms:title"
    else if ($attribute = "place") then ""
    else if ($attribute = "date") then "dcterms:date"
    else if ($attribute = "language") then "dcterms:language"
    else if ($attribute = "identifier") then "dcterms:identifier"
    else if ($attribute = "rights") then "dcterms:rights"
    else if ($attribute = "license") then "dcterms:license"
    else if ($attribute = "accessRights") then "dcterms:accessRights"
    else if ($attribute = "file") then ""
    else if ($attribute = "translator") then ""
    else if ($attribute = "version") then ""
    else ""
  )
  else ""
};
309 | |
(: Selects the metadata elements that hold a logical attribute.
   $metadata  : metadata container node(s) of a document
   $docBase   : "archimedes" or "echo"
   $attribute : author | title | place | date | language | identifier |
                rights | accessRights | license | file | translator | version
   For author, title, place, date, rights, accessRights and license every
   matching source element is wrapped in a new element named after the
   attribute; the remaining attributes return the source elements unchanged.
   Returns the empty sequence when the document base has no element for the
   attribute or either argument is not recognised.
   NOTE(review): accessRights values are wrapped in <rights>, not
   <accessRights>; kept as is because callers may rely on it - confirm. :)
declare function mpdl-lucene:getElementsByAttr($metadata, $docBase, $attribute) {
  if ($docBase = "archimedes")
  then (
    if ($attribute = "author")
    then (for $src in $metadata/author return <author>{$src}</author>)
    else if ($attribute = "title")
    then (for $src in $metadata/title return <title>{$src}</title>)
    else if ($attribute = "place")
    then (for $src in $metadata/place return <place>{$src}</place>)
    else if ($attribute = "date")
    then (for $src in $metadata/date return <date>{$src}</date>)
    else if ($attribute = "language")
    then $metadata/lang
    else if ($attribute = "identifier")
    then $metadata/locator
    else if ($attribute = "rights")
    then ()
    else if ($attribute = "accessRights")
    then ()
    else if ($attribute = "license")
    then ()
    else if ($attribute = "file")
    then $metadata/cvs_file
    else if ($attribute = "translator")
    then $metadata/translator
    else if ($attribute = "version")
    then $metadata/cvs_version
    else ()
  )
  else if ($docBase = "echo")
  then (
    if ($attribute = "author")
    then (for $src in $metadata/dcterms:creator return <author>{$src}</author>)
    else if ($attribute = "title")
    then (for $src in $metadata/dcterms:title return <title>{$src}</title>)
    else if ($attribute = "place")
    then ()
    else if ($attribute = "date")
    then (for $src in $metadata/dcterms:date return <date>{$src}</date>)
    else if ($attribute = "language")
    then $metadata/dcterms:language
    else if ($attribute = "identifier")
    then $metadata/dcterms:identifier
    else if ($attribute = "rights")
    then (for $src in $metadata/dcterms:rights return <rights>{$src}</rights>)
    else if ($attribute = "accessRights")
    then (for $src in $metadata/dcterms:accessRights return <rights>{$src}</rights>)
    else if ($attribute = "license")
    then (for $src in $metadata/dcterms:license return <license>{$src}</license>)
    else if ($attribute = "file")
    then ()
    else if ($attribute = "translator")
    then ()
    else if ($attribute = "version")
    then ()
    else ()
  )
  else ()
};
386 |