comparison software/eXist/webapp/mpdl/lucene/search.xql @ 7:5589d865af7a

Erstellung XQL/XSL Applikation
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 08 Feb 2011 15:16:46 +0100
parents
children d6f528ad5d96
comparison
equal deleted inserted replaced
6:2396a569e446 7:5589d865af7a
1 xquery version "1.0";
2
3 module namespace mpdl-lucene = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/lucene/search";
4
5 import module namespace functx = "http://www.functx.com" at "../util/functx.xql";
6
7 declare namespace ft = "http://exist-db.org/xquery/lucene";
8
9 declare namespace echo="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/";
10
11 declare namespace dc="http://purl.org/dc/elements/1.1/";
12 declare namespace dcterms="http://purl.org/dc/terms";
13
(:
 : Full-text search over a node sequence (two-argument overload).
 :
 : $mpdlCollection  nodes to be filtered
 : $queryStr        Lucene query string
 :
 : The query string is first handed to mpdltext:lucene-parse-query; a result
 : equal to the empty string is treated as "no parse error". In that case the
 : nodes of $mpdlCollection whose archimedes/text or echo:echo/echo:text
 : matches the query are returned. Otherwise a <result> element carrying the
 : parser message and zeroed counters is returned.
 :)
declare function mpdl-lucene:search($mpdlCollection, $queryStr) {
    let $parseError := mpdltext:lucene-parse-query($queryStr)
    return
        if ($parseError = '')
        then
            $mpdlCollection/.[ft:query(archimedes/text, $queryStr) or ft:query(echo:echo/echo:text, $queryStr)]
        else
            <result>
                <error>{$parseError}</error>
                <size>0</size>
                <pages>0</pages>
                <pn>0</pn>
                <hits/>
            </result>
};
29
(:
 : Paged full-text search for sentences (s elements) inside a document.
 :
 : $mpdlCollectionName  'archimedes' or 'echo'; selects whether un-namespaced
 :                      or echo: elements are searched (any other value is
 :                      handled like 'archimedes')
 : $language            passed to the mpdltext morphology helpers; the value
 :                      "zh" additionally triggers mpdltext:get-big5-encoded-terms
 : $document            node(s) whose descendants are searched
 : $queryType           'fulltext', 'fulltextMorph' or 'fulltextMorphLemma'
 : $queryStr            Lucene query string
 : $pn                  1-based number of the requested result page
 : $pageSize            number of hits per page
 :
 : Returns a <result> element. If mpdltext:lucene-parse-query reports a
 : non-empty message for $queryStr the result only contains that message in
 : <error> plus zeroed counters; otherwise it contains the total hit count,
 : the page size, the number of pages, the requested page number, one <hit>
 : per sentence of the requested page, and the query forms/regularizations.
 :)
declare function mpdl-lucene:search($mpdlCollectionName, $language, $document, $queryType, $queryStr, $pn as xs:int, $pageSize as xs:int) as node() {
    (: performance reasons: all hits (not only the first 10!) are passed through the :)
    (: for loop, so the overhead in each loop has to be minimized :)
    let $query :=
        if ($queryType = 'fulltext' or $queryType = 'fulltextMorph')
        then $queryStr
        else if ($queryType = 'fulltextMorphLemma')
        then concat('lemmalemma', $queryStr)
        else ()
    let $pageBreaks :=
        if ($mpdlCollectionName = 'archimedes')
        then $document//pb
        else if ($mpdlCollectionName = 'echo')
        then $document//echo:pb
        else $document//pb
    let $luceneParseResult := mpdltext:lucene-parse-query($queryStr)
    (: all matching sentences; empty when the query could not be parsed :)
    let $t :=
        if ($luceneParseResult != '')
        then ()
        else if ($mpdlCollectionName = 'archimedes')
        then $document//s[ft:query(., $query)]
        else if ($mpdlCollectionName = 'echo')
        then $document//echo:s[ft:query(., $query)]
        else $document//s[ft:query(., $query)]
    (: 1-based positions of the first and last hit of the requested page :)
    let $from := ($pn * $pageSize) - $pageSize + 1
    let $to := $pn * $pageSize
    (: performance improvements: result set of 500 needs 3 sec., result set of 10 needs 0,7 sec. :)
    let $tempQueryResult :=
        for $ss at $poss in $t
        where $poss >= $from and $poss <= $to
        return $ss
    let $queryResult :=
        for $s at $pos in $tempQueryResult
        (: number of page breaks before the sentence; faster: comparison only in pb elements of this document :)
        let $pnOfS := count($pageBreaks[. << $s])
        (: the last page break before the sentence (empty if there is none) :)
        let $pb := subsequence($pageBreaks, $pnOfS, 1)
        (: test if sentence surrounds page break; costs 0,1 sec performance :)
        let $pbPlus1 := subsequence($pageBreaks, $pnOfS + 1, 1)
        let $sSurroundsPB :=
            if ($pbPlus1/parent::node() = $s and $pbPlus1 intersect $s/descendant::node())
            then true()
            else false()
        (: position of the sentence after its page break; faster: comparison only in s elements of this document :)
        let $posOfS :=
            if ($mpdlCollectionName = 'archimedes')
            then count($pb/following::s[. << $s]) + 1
            else if ($mpdlCollectionName = 'echo')
            then count($pb/following::echo:s[. << $s]) + 1
            else count($pb/following::s[. << $s]) + 1
        (: position of the hit within the complete result set :)
        let $position := $from - 1 + $pos
        let $resultElem :=
            <hit>
                <pos>{$position}</pos>
                <pn>{$pnOfS}</pn>
                <pos-of-s>{$posOfS}</pos-of-s>
                <s>{string($s)}</s>
                <s-surrounds-pb>{$sSurroundsPB}</s-surrounds-pb>
            </hit>
        return $resultElem
    let $resultSize := count($t)
    (: Ceiling division. The former "$resultSize idiv $pageSize + 1" reported :)
    (: one page too many whenever the hit count was an exact multiple of the  :)
    (: page size (e.g. 10 hits with page size 10 gave 2 pages).               :)
    let $pages :=
        if ($resultSize = 0)
        then 0
        else ($resultSize + $pageSize - 1) idiv $pageSize
    let $queryForms :=
        if ($queryType = 'fulltextMorph' or $queryType = 'fulltextMorphLemma')
        then mpdltext:get-query-morph-forms($language, $queryStr)
        else ()
    let $queryRegularizations :=
        if ($queryType = 'fulltextMorph')
        then mpdltext:get-query-regularizations($language, $queryStr)
        else ()

    let $encodedQueryTerms :=
        if ($language = "zh")
        then
            mpdltext:get-big5-encoded-terms($query)
        else ()

    let $result :=
        if ($luceneParseResult != '')
        then
            <result>
                <error>{$luceneParseResult}</error>
                <size>0</size>
                <pages>0</pages>
                <pn>0</pn>
                <hits/>
            </result>
        else
            <result>
                <size>{$resultSize}</size>
                <page-size>{$pageSize}</page-size>
                <pages>{$pages}</pages>
                <pn>{$pn}</pn>
                <hits>
                    {$queryResult}
                </hits>
                <query-forms>{$queryForms}</query-forms>
                <query-regularizations>{$queryRegularizations}</query-regularizations>
                {$encodedQueryTerms}
            </result>

    return $result
};
133
(:
 : Metadata search on one or two attributes, optionally combined by a
 : boolean operator.
 :
 : $metadataStr  XQuery path expression (as a string) selecting the metadata
 :               nodes; the generated predicate is appended to it
 : $attribute1   name of the first attribute (see mpdl-lucene:getElemNameByAttr)
 : $attrValue1   query value for the first attribute
 : $boolOp       "or", "and" or "andNot"; only used when $attrValue2 is not ""
 : $attribute2   name of the second attribute
 : $attrValue2   query value for the second attribute; "" means "not used"
 :
 : Both values are checked with mpdltext:lucene-parse-query. If neither check
 : yields a message, the generated query string is executed with util:eval
 : and its result returned; otherwise a <result> element with the message(s)
 : in <error> and zeroed counters is returned.
 :)
declare function mpdl-lucene:attrSearch($metadataStr, $attribute1, $attrValue1, $boolOp, $attribute2, $attrValue2) {
    let $firstClause := mpdl-lucene:getAttrQueryStr($attribute1, $attrValue1)
    let $secondClause := mpdl-lucene:getAttrQueryStr($attribute2, $attrValue2)
    let $secondValueBlank := $attrValue2 = ""
    let $secondValueGiven := $attrValue2 != ""
    let $predicate :=
        if ($secondValueBlank)
        then $firstClause
        else if ($secondValueGiven and $boolOp = "or")
        then concat($firstClause, " or ", $secondClause)
        else if ($secondValueGiven and $boolOp = "and")
        then concat("(", $firstClause, ") and (", $secondClause, ")")
        else if ($secondValueGiven and $boolOp = "andNot")
        then concat("(", $firstClause, ") and not(", $secondClause, ")")
        else ()
    let $queryText := concat($metadataStr, "/.[", $predicate, "]")
    let $parseMsg1 := mpdltext:lucene-parse-query($attrValue1)
    let $parseMsg2 := mpdltext:lucene-parse-query($attrValue2)
    let $firstOk := $parseMsg1 = ''
    let $firstBad := $parseMsg1 != ''
    let $secondOk := $parseMsg2 = ''
    let $secondBad := $parseMsg2 != ''
    (: error text per attribute, built once and reused below :)
    let $firstReport := concat("attribute ", $attribute1, " with value: ", $attrValue1, ": ", $parseMsg1)
    let $secondReport := concat("attribute ", $attribute2, " with value: ", $attrValue2, ": ", $parseMsg2)
    let $parseError :=
        if ($firstOk and $secondOk)
        then ''
        else if ($firstBad and $secondOk)
        then $firstReport
        else if ($firstOk and $secondBad)
        then $secondReport
        else if ($firstBad and $secondBad)
        then concat($firstReport, ", ", $secondReport)
        else ()
    return
        if ($parseError = '')
        then util:eval($queryText)
        else
            <result>
                <error>{$parseError}</error>
                <size>0</size>
                <pages>0</pages>
                <pn>0</pn>
                <hits/>
            </result>
};
174
(:
 : Sorts metadata nodes by one of their attributes.
 :
 : $metadata  nodes to be sorted; each must belong to a stored document,
 :            because the document URI is used to derive the document base
 : $orderBy   attribute name as understood by mpdl-lucene:getElemNameByAttr
 :
 : For every node the document base is taken from the document URI (the path
 : segment left after removing the last two segments), the matching metadata
 : element is looked up, and the values of the order-by element are joined
 : with ', ' and lower-cased to form the sort key. The nodes are returned in
 : ascending order of that key.
 :
 : Idea noted for a possible performance improvement (not implemented):
 : choose "ascending"/"descending" in the order by clause from a $sortOrder
 : parameter instead of always sorting ascending.
 :)
declare function mpdl-lucene:order($metadata, $orderBy) {
    for $hit in $metadata
    let $root := $hit/fn:root()
    let $uri := document-uri($root)
    let $withoutFile := functx:substring-before-last($uri, "/")
    let $withoutLastDir := functx:substring-before-last($withoutFile, "/")
    let $docBase := functx:substring-after-last($withoutLastDir, "/")
    let $metadataElem := mpdl-lucene:getMetadata($docBase, $root)
    let $sortElemName := mpdl-lucene:getElemNameByAttr($docBase, $orderBy)
    (: this costs performance for many result elements :)
    let $sortValues := mpdl-lucene:getElemDynamic($metadataElem, $sortElemName)
    let $sortKey := lower-case(string-join($sortValues, ', '))
    order by $sortKey
    return $hit
};
199
(:
 : Returns the metadata element of a document.
 :
 : $docBase  'archimedes' or 'echo'
 : $doc      node whose child is the archimedes or echo:echo root element
 :
 : Yields $doc/archimedes/info for 'archimedes', $doc/echo:echo/echo:metadata
 : for 'echo', and the empty sequence for any other document base.
 :)
declare function mpdl-lucene:getMetadata($docBase, $doc) {
    if ($docBase = 'archimedes')
    then $doc/archimedes/info
    else if ($docBase = 'echo')
    then $doc/echo:echo/echo:metadata
    else ()
};
209
(:
 : Returns the text element of a document.
 :
 : $docBase  'archimedes' or 'echo'
 : $doc      node whose child is the archimedes or echo:echo root element
 :
 : Yields $doc/archimedes/text for 'archimedes', $doc/echo:echo/echo:text
 : for 'echo', and the empty sequence for any other document base.
 :)
declare function mpdl-lucene:getText($docBase, $doc) {
    if ($docBase = 'archimedes')
    then $doc/archimedes/text
    else if ($docBase = 'echo')
    then $doc/echo:echo/echo:text
    else ()
};
219
220
(:
 : Selects children of $path by a step given as a string.
 :
 : $path     context node(s); referenced by name inside the evaluated expression
 : $elemStr  relative path step(s), e.g. an element name with prefix
 :
 : Builds the expression "$path/<elemStr>" and runs it with util:eval. When
 : $elemStr does not compare unequal to "" the empty string is evaluated
 : instead.
 :
 : TODO: performance improvement: util:eval is executed on every call
 :)
declare function mpdl-lucene:getElemDynamic($path, $elemStr) {
    util:eval(
        if ($elemStr != "")
        then concat("$path/", $elemStr)
        else ""
    )
};
230
(:
 : Builds the XQuery predicate text that tests one attribute against a value,
 : for the archimedes and/or the echo metadata element of that attribute.
 :
 : $attribute  attribute name as understood by mpdl-lucene:getElemNameByAttr
 : $attrValue  value to search for
 :
 : For the attribute "date" an equality test (elem = 'value') is generated,
 : for every other attribute a full-text test (ft:query(elem, 'value')). If
 : both document bases define an element for the attribute the two tests are
 : joined with " or "; if only one does, only that test is returned; if none
 : does, "" is returned.
 :
 : The returned text is meant to be evaluated as XQuery (mpdl-lucene:attrSearch
 : passes it to util:eval), so the value is escaped for use inside a
 : single-quoted XQuery string literal: "&" becomes an entity reference and
 : "'" is doubled. Without this a value such as O'Brien produced a broken or
 : altered query.
 :)
declare function mpdl-lucene:getAttrQueryStr($attribute, $attrValue) {
    let $attrArch := mpdl-lucene:getElemNameByAttr("archimedes", $attribute)
    let $attrEcho := mpdl-lucene:getElemNameByAttr("echo", $attribute)
    (: "&" must be escaped first so the entity produced here is not touched again :)
    let $escapedValue := replace(replace(string($attrValue), "&amp;", "&amp;amp;"), "'", "''")
    let $attrArchRelQueryStr :=
        if ($attribute = "date")
        then concat($attrArch, " = '", $escapedValue, "'")
        else concat("ft:query(", $attrArch, ", '", $escapedValue, "')")
    let $attrEchoRelQueryStr :=
        if ($attribute = "date")
        then concat($attrEcho, " = '", $escapedValue, "'")
        else concat("ft:query(", $attrEcho, ", '", $escapedValue, "')")
    let $result :=
        if ($attrArch != "" and $attrEcho != "")
        then concat($attrArchRelQueryStr, " or ", $attrEchoRelQueryStr)
        else if ($attrArch = "" and $attrEcho != "")
        then $attrEchoRelQueryStr
        else if ($attrArch != "" and $attrEcho = "")
        then $attrArchRelQueryStr
        else ""
    return $result
};
252
(:
 : Maps a generic attribute name to the metadata element name used by a
 : document base.
 :
 : $docBase    "archimedes" or "echo"
 : $attribute  one of "author", "title", "place", "date", "language",
 :             "identifier", "rights", "license", "accessRights", "file",
 :             "translator", "version"
 :
 : Returns the element name as a string (with dcterms: prefix for echo), or
 : "" when the document base has no element for the attribute or when either
 : argument is not recognised.
 :)
declare function mpdl-lucene:getElemNameByAttr($docBase, $attribute) {
    if ($docBase = "archimedes")
    then (
        if ($attribute = "author") then "author"
        else if ($attribute = "title") then "title"
        else if ($attribute = "place") then "place"
        else if ($attribute = "date") then "date"
        else if ($attribute = "language") then "lang"
        else if ($attribute = "identifier") then "locator"
        (: archimedes has no element for these three :)
        else if ($attribute = ("rights", "license", "accessRights")) then ""
        else if ($attribute = "file") then "cvs_file"
        else if ($attribute = "translator") then "translator"
        else if ($attribute = "version") then "cvs_version"
        else ""
    )
    else if ($docBase = "echo")
    then (
        if ($attribute = "author") then "dcterms:creator"
        else if ($attribute = "title") then "dcterms:title"
        (: echo has no element for place :)
        else if ($attribute = "place") then ""
        else if ($attribute = "date") then "dcterms:date"
        else if ($attribute = "language") then "dcterms:language"
        else if ($attribute = "identifier") then "dcterms:identifier"
        else if ($attribute = "rights") then "dcterms:rights"
        else if ($attribute = "license") then "dcterms:license"
        else if ($attribute = "accessRights") then "dcterms:accessRights"
        (: "file", "translator", "version" and unknown names: no echo element :)
        else ""
    )
    else ""
};
309
(:
 : Returns the metadata elements that hold a generic attribute.
 :
 : $metadata   metadata element whose children are inspected
 : $docBase    "archimedes" or "echo"
 : $attribute  one of "author", "title", "place", "date", "language",
 :             "identifier", "rights", "accessRights", "license", "file",
 :             "translator", "version"
 :
 : For author, title, place, date, rights, accessRights and license each
 : source element is wrapped in a new element (<author>, <title>, <place>,
 : <date>, <rights>, <rights>, <license>); for the other attributes the
 : source elements are returned as they are. The empty sequence is returned
 : when the document base has no element for the attribute or when either
 : argument is not recognised.
 :
 : NOTE(review): accessRights values are wrapped in <rights>, not in
 : <accessRights> - confirm whether that is intended.
 :)
declare function mpdl-lucene:getElementsByAttr($metadata, $docBase, $attribute) {
    if ($docBase = "archimedes")
    then (
        if ($attribute = "author")
        then for $elem in $metadata/author return <author>{$elem}</author>
        else if ($attribute = "title")
        then for $elem in $metadata/title return <title>{$elem}</title>
        else if ($attribute = "place")
        then for $elem in $metadata/place return <place>{$elem}</place>
        else if ($attribute = "date")
        then for $elem in $metadata/date return <date>{$elem}</date>
        else if ($attribute = "language")
        then $metadata/lang
        else if ($attribute = "identifier")
        then $metadata/locator
        (: archimedes has no element for these three :)
        else if ($attribute = ("rights", "accessRights", "license"))
        then ()
        else if ($attribute = "file")
        then $metadata/cvs_file
        else if ($attribute = "translator")
        then $metadata/translator
        else if ($attribute = "version")
        then $metadata/cvs_version
        else ()
    )
    else if ($docBase = "echo")
    then (
        if ($attribute = "author")
        then for $elem in $metadata/dcterms:creator return <author>{$elem}</author>
        else if ($attribute = "title")
        then for $elem in $metadata/dcterms:title return <title>{$elem}</title>
        (: echo has no element for place :)
        else if ($attribute = "place")
        then ()
        else if ($attribute = "date")
        then for $elem in $metadata/dcterms:date return <date>{$elem}</date>
        else if ($attribute = "language")
        then $metadata/dcterms:language
        else if ($attribute = "identifier")
        then $metadata/dcterms:identifier
        else if ($attribute = "rights")
        then for $elem in $metadata/dcterms:rights return <rights>{$elem}</rights>
        else if ($attribute = "accessRights")
        then for $elem in $metadata/dcterms:accessRights return <rights>{$elem}</rights>
        else if ($attribute = "license")
        then for $elem in $metadata/dcterms:license return <license>{$elem}</license>
        (: "file", "translator", "version" and unknown names: no echo element :)
        else ()
    )
    else ()
};
386