Mercurial > hg > mpdl-group
diff software/eXist/webapp/mpdl/lucene/search.xql @ 7:5589d865af7a
Erstellung XQL/XSL Applikation
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 08 Feb 2011 15:16:46 +0100 |
parents | |
children | d6f528ad5d96 |
line wrap: on
line diff
xquery version "1.0";

(:~
 : Lucene-backed full-text and metadata search for the MPDL document
 : collections "archimedes" and "echo".
 :
 : NOTE(review): the prefixes mpdltext and util are used below but are not
 : declared in this module; presumably they are provided by modules that are
 : registered with the eXist instance. Confirm against the server configuration.
 :)
module namespace mpdl-lucene = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/lucene/search";

import module namespace functx = "http://www.functx.com" at "../util/functx.xql";

declare namespace ft = "http://exist-db.org/xquery/lucene";

declare namespace echo="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/";

declare namespace dc="http://purl.org/dc/elements/1.1/";
(: NOTE(review): the Dublin Core terms namespace is normally written with a
   trailing slash; this URI has none. Confirm that it matches the stored
   documents before changing it. :)
declare namespace dcterms="http://purl.org/dc/terms";

(:~
 : Builds the result element that every search function returns when the
 : query cannot be executed.
 :
 : @param $message text placed into the error element
 : @return a result element with the error and zeroed size, pages and pn
 :)
declare function mpdl-lucene:errorResult($message) as element(result) {
  <result>
    <error>{$message}</error>
    <size>0</size>
    <pages>0</pages>
    <pn>0</pn>
    <hits/>
  </result>
};

(:~
 : Escapes a value so that it can be embedded between apostrophes in an
 : XQuery expression that is later passed to util:eval.
 :
 : An ampersand would start an entity reference and an apostrophe would end
 : the literal, so both are rewritten (ampersand first, so that the entity
 : produced here is not escaped a second time).
 :
 : @param $value the raw value; the empty sequence is treated as ""
 : @return the escaped string, without surrounding apostrophes
 :)
declare function mpdl-lucene:escapeStringLiteral($value) as xs:string {
  replace(replace(string($value), "&amp;", "&amp;amp;"), "'", "''")
};

(:~
 : Full-text search over a whole collection.
 :
 : @param $mpdlCollection nodes to search in
 : @param $queryStr Lucene query string
 : @return the nodes of $mpdlCollection whose archimedes/text or
 :         echo:echo/echo:text matches the query, or an error result element
 :         when mpdltext:lucene-parse-query reports a problem
 :)
declare function mpdl-lucene:search($mpdlCollection, $queryStr) {
  let $luceneParseResult := mpdltext:lucene-parse-query($queryStr)
  return
    if ($luceneParseResult = '')
    then $mpdlCollection/.[ft:query(archimedes/text, $queryStr) or ft:query(echo:echo/echo:text, $queryStr)]
    else mpdl-lucene:errorResult($luceneParseResult)
};

(:~
 : Paged full-text search for sentences (s elements) inside one document.
 :
 : @param $mpdlCollectionName 'echo' selects the echo namespace for s and pb
 :        elements; any other value uses the un-namespaced elements
 : @param $language language passed to the morphology helpers; "zh"
 :        additionally adds Big5 encoded query terms to the result
 : @param $document the document to search in
 : @param $queryType 'fulltext', 'fulltextMorph' or 'fulltextMorphLemma'
 : @param $queryStr Lucene query string
 : @param $pn 1-based number of the requested result page
 : @param $pageSize number of hits per page
 : @return a result element with size, page-size, pages, pn, the hits of the
 :         requested page, query-forms and query-regularizations, or an error
 :         result element when the query string does not parse
 :)
declare function mpdl-lucene:search($mpdlCollectionName, $language, $document, $queryType, $queryStr, $pn as xs:int, $pageSize as xs:int) as node() {
  let $luceneParseResult := mpdltext:lucene-parse-query($queryStr)
  return
    if ($luceneParseResult != '')
    then
      (: nothing below is needed for an unparsable query :)
      mpdl-lucene:errorResult($luceneParseResult)
    else
      let $isEcho := $mpdlCollectionName = 'echo'
      let $query :=
        if ($queryType = 'fulltext' or $queryType = 'fulltextMorph')
        then $queryStr
        else if ($queryType = 'fulltextMorphLemma')
        then
          (: NOTE(review): presumably 'lemmalemma' marks a lemma query for the index analyzer; confirm :)
          concat('lemmalemma', $queryStr)
        else ()
      let $pageBreaks :=
        if ($isEcho)
        then $document//echo:pb
        else $document//pb
      let $allHits :=
        if ($isEcho)
        then $document//echo:s[ft:query(., $query)]
        else $document//s[ft:query(., $query)]
      let $from := ($pn - 1) * $pageSize + 1
      (: only the hits of the requested page go through the expensive loop below :)
      let $pageHits := subsequence($allHits, $from, $pageSize)
      let $queryResult :=
        for $s at $pos in $pageHits
        (: number of page breaks before the sentence, i.e. the page it starts on :)
        let $pnOfS := count($pageBreaks[. << $s])
        let $pb := subsequence($pageBreaks, $pnOfS, 1)
        (: the next page break: when it lies inside the sentence, the sentence runs across two pages :)
        let $pbPlus1 := subsequence($pageBreaks, $pnOfS + 1, 1)
        let $sSurroundsPB :=
          ($pbPlus1/parent::node() = $s) and ($pbPlus1 intersect $s/descendant::node())
        (: position of the sentence counted from its page break :)
        let $posOfS :=
          if ($isEcho)
          then count($pb/following::echo:s[. << $s]) + 1
          else count($pb/following::s[. << $s]) + 1
        let $position := $from - 1 + $pos
        return
          <hit>
            <pos>{$position}</pos>
            <pn>{$pnOfS}</pn>
            <pos-of-s>{$posOfS}</pos-of-s>
            <s>{string($s)}</s>
            <s-surrounds-pb>{$sSurroundsPB}</s-surrounds-pb>
          </hit>
      let $resultSize := count($allHits)
      (: ceiling division: 20 hits with page size 10 are 2 pages, not 3 :)
      let $pages :=
        if ($resultSize = 0)
        then 0
        else ($resultSize + $pageSize - 1) idiv $pageSize
      let $queryForms :=
        if ($queryType = 'fulltextMorph' or $queryType = 'fulltextMorphLemma')
        then mpdltext:get-query-morph-forms($language, $queryStr)
        else ()
      let $queryRegularizations :=
        if ($queryType = 'fulltextMorph')
        then mpdltext:get-query-regularizations($language, $queryStr)
        else ()
      let $encodedQueryTerms :=
        if ($language = "zh")
        then mpdltext:get-big5-encoded-terms($query)
        else ()
      return
        <result>
          <size>{$resultSize}</size>
          <page-size>{$pageSize}</page-size>
          <pages>{$pages}</pages>
          <pn>{$pn}</pn>
          <hits>
            {$queryResult}
          </hits>
          <query-forms>{$queryForms}</query-forms>
          <query-regularizations>{$queryRegularizations}</query-regularizations>
          {$encodedQueryTerms}
        </result>
};

(:~
 : Metadata search over one or two attributes combined with a boolean operator.
 :
 : The query is assembled as a string and executed with util:eval.
 : NOTE(review): $metadataStr is evaluated as XQuery without any checks, so it
 : must never come from user input; the attribute values are escaped by
 : mpdl-lucene:getAttrQueryStr.
 :
 : @param $metadataStr XQuery path expression (as a string) selecting the metadata nodes
 : @param $attribute1 logical name of the first attribute, e.g. "author"
 : @param $attrValue1 query value for the first attribute
 : @param $boolOp "or", "and" or "andNot"; only used when $attrValue2 is not ""
 : @param $attribute2 logical name of the second attribute
 : @param $attrValue2 query value for the second attribute, "" for a single-attribute search
 : @return the matching metadata nodes, or an error result element when a
 :         value does not parse or no query can be built
 :)
declare function mpdl-lucene:attrSearch($metadataStr, $attribute1, $attrValue1, $boolOp, $attribute2, $attrValue2) {
  let $attrFtQueryStr1 := mpdl-lucene:getAttrQueryStr($attribute1, $attrValue1)
  let $attrFtQueryStr2 := mpdl-lucene:getAttrQueryStr($attribute2, $attrValue2)
  (: the empty sequence means: no well-formed predicate can be built :)
  let $booleanQueryStr :=
    if ($attrFtQueryStr1 = "")
    then ()
    else if ($attrValue2 = "")
    then $attrFtQueryStr1
    else if (not($attrValue2 != "") or $attrFtQueryStr2 = "")
    then ()
    else if ($boolOp = "or")
    then concat($attrFtQueryStr1, " or ", $attrFtQueryStr2)
    else if ($boolOp = "and")
    then concat("(", $attrFtQueryStr1, ") and (", $attrFtQueryStr2, ")")
    else if ($boolOp = "andNot")
    then concat("(", $attrFtQueryStr1, ") and not(", $attrFtQueryStr2, ")")
    else ()
  let $luceneParseResult1 := mpdltext:lucene-parse-query($attrValue1)
  let $luceneParseResult2 := mpdltext:lucene-parse-query($attrValue2)
  let $parseMessage1 :=
    if ($luceneParseResult1 != '')
    then concat("attribute ", $attribute1, " with value: ", $attrValue1, ": ", $luceneParseResult1)
    else ()
  let $parseMessage2 :=
    if ($luceneParseResult2 != '')
    then concat("attribute ", $attribute2, " with value: ", $attrValue2, ": ", $luceneParseResult2)
    else ()
  (: "" when both values parse, otherwise the messages separated by ", " :)
  let $luceneParseResult := string-join(($parseMessage1, $parseMessage2), ", ")
  return
    if ($luceneParseResult != '')
    then mpdl-lucene:errorResult($luceneParseResult)
    else if (empty($booleanQueryStr))
    then mpdl-lucene:errorResult("cannot build attribute query: unknown attribute or boolean operator")
    else util:eval(concat($metadataStr, "/.[", $booleanQueryStr, "]"))
};

(:~
 : Sorts metadata nodes by the lower-cased value of one attribute.
 :
 : The collection name of each node is taken from the document URI: it is the
 : path step two levels above the document file.
 :
 : @param $metadata nodes to sort
 : @param $orderBy logical attribute name to sort by, e.g. "author"
 : @return the nodes of $metadata in ascending order of the attribute value;
 :         several values of one node are joined with ", " before comparing
 :)
declare function mpdl-lucene:order($metadata, $orderBy) {
  for $attrElem in $metadata
  let $doc := $attrElem/fn:root()
  let $documentUri := document-uri($doc)
  let $collectionPath := functx:substring-before-last(functx:substring-before-last($documentUri, "/"), "/")
  let $docBase := functx:substring-after-last($collectionPath, "/")
  let $metadataElem := mpdl-lucene:getMetadata($docBase, $doc)
  let $attrElemName := mpdl-lucene:getElemNameByAttr($docBase, $orderBy)
  (: dynamic lookup: expensive when there are many result elements :)
  let $orderByElem := mpdl-lucene:getElemDynamic($metadataElem, $attrElemName)
  let $sortKey := lower-case(string-join($orderByElem, ', '))
  order by $sortKey
  return $attrElem

  (: TODO: possible improvement: let the caller choose between ascending and
     descending order. :)
};

(:~
 : Returns the metadata element of a document.
 :
 : @param $docBase "archimedes" or "echo"
 : @param $doc the document node
 : @return archimedes/info or echo:echo/echo:metadata; empty for any other $docBase
 :)
declare function mpdl-lucene:getMetadata($docBase, $doc) {
  if ($docBase = 'archimedes')
  then $doc/archimedes/info
  else if ($docBase = 'echo')
  then $doc/echo:echo/echo:metadata
  else ()
};

(:~
 : Returns the text element of a document.
 :
 : @param $docBase "archimedes" or "echo"
 : @param $doc the document node
 : @return archimedes/text or echo:echo/echo:text; empty for any other $docBase
 :)
declare function mpdl-lucene:getText($docBase, $doc) {
  if ($docBase = 'archimedes')
  then $doc/archimedes/text
  else if ($docBase = 'echo')
  then $doc/echo:echo/echo:text
  else ()
};

(:~
 : Selects children of $path by an element name that is only known at run time.
 :
 : TODO: performance: util:eval is executed on every call.
 :
 : @param $path context nodes; referenced by name inside the evaluated expression
 : @param $elemStr element name (may carry a prefix), "" when the attribute is not mapped
 : @return the selected elements; empty when $elemStr is "" or the empty
 :         sequence (no expression is evaluated in that case)
 :)
declare function mpdl-lucene:getElemDynamic($path, $elemStr) {
  if ($elemStr != "")
  then util:eval(concat("$path/", $elemStr))
  else ()
};

(:~
 : Builds the predicate source text for one attribute, covering both
 : collections: "date" is compared with =, every other attribute uses ft:query.
 :
 : The value is escaped before it is placed into the string literal, because
 : the predicate is later executed with util:eval.
 :
 : @param $attribute logical attribute name, e.g. "author"
 : @param $attrValue value to search for
 : @return the predicate text; the archimedes and echo variants are joined
 :         with " or " when both exist; "" when the attribute is mapped in
 :         neither collection
 :)
declare function mpdl-lucene:getAttrQueryStr($attribute, $attrValue) {
  let $attrArch := mpdl-lucene:getElemNameByAttr("archimedes", $attribute)
  let $attrEcho := mpdl-lucene:getElemNameByAttr("echo", $attribute)
  let $valueLiteral := concat("'", mpdl-lucene:escapeStringLiteral($attrValue), "'")
  let $attrArchRelQueryStr :=
    if ($attribute = "date")
    then concat($attrArch, " = ", $valueLiteral)
    else concat("ft:query(", $attrArch, ", ", $valueLiteral, ")")
  let $attrEchoRelQueryStr :=
    if ($attribute = "date")
    then concat($attrEcho, " = ", $valueLiteral)
    else concat("ft:query(", $attrEcho, ", ", $valueLiteral, ")")
  return
    if ($attrArch != "" and $attrEcho != "")
    then concat($attrArchRelQueryStr, " or ", $attrEchoRelQueryStr)
    else if ($attrEcho != "")
    then $attrEchoRelQueryStr
    else if ($attrArch != "")
    then $attrArchRelQueryStr
    else ""
};

(:~
 : Maps a logical attribute name to the element name used in the metadata of
 : one collection.
 :
 : @param $docBase "archimedes" or "echo"
 : @param $attribute logical attribute name: author, title, place, date,
 :        language, identifier, rights, license, accessRights, file,
 :        translator or version
 : @return the element name, or "" when the collection has no such element
 :         or an argument is unknown
 :)
declare function mpdl-lucene:getElemNameByAttr($docBase, $attribute) {
  if ($docBase = "archimedes")
  then
    if ($attribute = "author") then "author"
    else if ($attribute = "title") then "title"
    else if ($attribute = "place") then "place"
    else if ($attribute = "date") then "date"
    else if ($attribute = "language") then "lang"
    else if ($attribute = "identifier") then "locator"
    else if ($attribute = "file") then "cvs_file"
    else if ($attribute = "translator") then "translator"
    else if ($attribute = "version") then "cvs_version"
    else ""
  else if ($docBase = "echo")
  then
    if ($attribute = "author") then "dcterms:creator"
    else if ($attribute = "title") then "dcterms:title"
    else if ($attribute = "date") then "dcterms:date"
    else if ($attribute = "language") then "dcterms:language"
    else if ($attribute = "identifier") then "dcterms:identifier"
    else if ($attribute = "rights") then "dcterms:rights"
    else if ($attribute = "license") then "dcterms:license"
    else if ($attribute = "accessRights") then "dcterms:accessRights"
    else ""
  else ""
};

(:~
 : Returns the metadata elements for a logical attribute.
 :
 : For author, title, place, date, rights, accessRights and license each
 : source element is wrapped in an element with a collection-independent
 : name; the remaining attributes return the source elements unchanged.
 :
 : NOTE(review): echo accessRights values are wrapped in a rights element,
 : the same wrapper as for rights. This may be intended or a copy and paste
 : slip; confirm with the consumers before changing it.
 :
 : @param $metadata metadata element of the document
 : @param $docBase "archimedes" or "echo"
 : @param $attribute logical attribute name, see mpdl-lucene:getElemNameByAttr
 : @return the elements, or the empty sequence when the collection has no
 :         such element or an argument is unknown
 :)
declare function mpdl-lucene:getElementsByAttr($metadata, $docBase, $attribute) {
  if ($docBase = "archimedes")
  then
    if ($attribute = "author")
    then
      for $elem in $metadata/author
      return <author>{$elem}</author>
    else if ($attribute = "title")
    then
      for $elem in $metadata/title
      return <title>{$elem}</title>
    else if ($attribute = "place")
    then
      for $elem in $metadata/place
      return <place>{$elem}</place>
    else if ($attribute = "date")
    then
      for $elem in $metadata/date
      return <date>{$elem}</date>
    else if ($attribute = "language") then $metadata/lang
    else if ($attribute = "identifier") then $metadata/locator
    else if ($attribute = "file") then $metadata/cvs_file
    else if ($attribute = "translator") then $metadata/translator
    else if ($attribute = "version") then $metadata/cvs_version
    else ()
  else if ($docBase = "echo")
  then
    if ($attribute = "author")
    then
      for $elem in $metadata/dcterms:creator
      return <author>{$elem}</author>
    else if ($attribute = "title")
    then
      for $elem in $metadata/dcterms:title
      return <title>{$elem}</title>
    else if ($attribute = "date")
    then
      for $elem in $metadata/dcterms:date
      return <date>{$elem}</date>
    else if ($attribute = "language") then $metadata/dcterms:language
    else if ($attribute = "identifier") then $metadata/dcterms:identifier
    else if ($attribute = "rights")
    then
      for $elem in $metadata/dcterms:rights
      return <rights>{$elem}</rights>
    else if ($attribute = "accessRights")
    then
      for $elem in $metadata/dcterms:accessRights
      return <rights>{$elem}</rights>
    else if ($attribute = "license")
    then
      for $elem in $metadata/dcterms:license
      return <license>{$elem}</license>
    else ()
  else ()
};