Mercurial > hg > mpdl-group
view software/eXist/webapp/mpdl/lucene/search.xql @ 17:7e883ce72fec
diverse Fehlerbehebungen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Sep 2011 16:41:15 +0200 |
parents | e99964f390e4 |
children |
line wrap: on
line source
xquery version "1.0";

(:
 : Lucene based full-text and metadata search helpers for the MPDL document
 : collections ("archimedes", "echo" and "tei").
 :
 : The functions prefixed with mpdltext:, util: and ft: are eXist extension
 : functions; the mpdltext: and util: prefixes are not declared in this module
 : and are expected to be bound by the eXist installation.
 :)
module namespace mpdl-lucene = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/lucene/search";

import module namespace functx = "http://www.functx.com" at "../util/functx.xql";

declare namespace ft = "http://exist-db.org/xquery/lucene";
declare namespace echo="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/";
declare namespace TEI="http://www.tei-c.org/ns/1.0";
declare namespace dc="http://purl.org/dc/elements/1.1/";
(: NOTE(review): the usual DCMI terms namespace ends with a slash
   ("http://purl.org/dc/terms/"); confirm that the stored echo documents
   really use the URI without the trailing slash before changing this. :)
declare namespace dcterms="http://purl.org/dc/terms";

(:
 : Builds the <result> element that every search function of this module
 : returns when a query string was rejected by the Lucene query parser.
 :
 : $message  the parser message, copied into <error>
 :)
declare function mpdl-lucene:errorResult($message) as element(result) {
  <result>
    <error>{$message}</error>
    <size>0</size>
    <pages>0</pages>
    <pn>0</pn>
    <hits/>
  </result>
};

(:
 : Escapes a value so that it can be placed between single quotes inside an
 : XQuery expression that is built as a string and passed to util:eval.
 : "&" has to become an entity reference and "'" has to be doubled, otherwise
 : the value terminates the literal or makes the expression unparsable.
 :
 : $value  the raw value (the empty sequence is treated as "")
 :)
declare function mpdl-lucene:escapeStringLiteral($value) as xs:string {
  replace(replace(string($value), "&amp;", "&amp;amp;"), "'", "''")
};

(:
 : Full-text search over a whole collection.
 :
 : $mpdlCollection  the nodes to filter; a node is kept when the text part of
 :                  its archimedes, echo:echo or TEI:TEI child matches
 : $queryStr        the Lucene query string
 :
 : Returns the matching nodes of $mpdlCollection, or an error <result> when
 : mpdltext:lucene-parse-query does not return '' for $queryStr.
 :)
declare function mpdl-lucene:search($mpdlCollection, $queryStr) {
  let $parseMessage := mpdltext:lucene-parse-query($queryStr)
  return
    if ($parseMessage = '') then
      $mpdlCollection/.[
        ft:query(archimedes/text, $queryStr)
        or ft:query(echo:echo/echo:text, $queryStr)
        or ft:query(TEI:TEI/TEI:text, $queryStr)
      ]
    else
      mpdl-lucene:errorResult($parseMessage)
};

(:
 : Paged full-text search for sentences (s) and headings (head) inside one
 : document.
 :
 : $mpdlCollectionName  'archimedes', 'echo' or 'tei'; any other value makes
 :                      the search namespace independent (*:s, *:head)
 : $language            language passed to the mpdltext: helper functions;
 :                      "zh" additionally adds the Big5 encoded query terms
 : $document            the document (or subtree) to search in
 : $queryType           'fulltext', 'fulltextMorph' or 'fulltextMorphLemma'
 : $queryStr            the Lucene query string
 : $pn                  1-based number of the result page to return
 : $pageSize            number of hits per result page
 :
 : Returns a <result> element with the total hit count (size), page-size,
 : pages, pn, one <hit> per hit of the requested page, the query forms and
 : regularizations; or an error <result> when the query cannot be parsed.
 :)
declare function mpdl-lucene:search($mpdlCollectionName, $language, $document, $queryType, $queryStr, $pn as xs:int, $pageSize as xs:int) as node() {
  (: performance reasons: all hits (not only the first 10!) are passed through
     the for loop: so the overhead in each loop has to be minimized :)
  let $query :=
    if ($queryType = 'fulltext' or $queryType = 'fulltextMorph') then
      $queryStr
    else if ($queryType = 'fulltextMorphLemma') then
      concat('lemmalemma', $queryStr)
    else
      ()
  let $pageBreaks := $document//*[name() = 'pb']
  let $luceneParseResult := mpdltext:lucene-parse-query($queryStr)
  let $t :=
    if ($luceneParseResult != '') then
      ()
    else if ($mpdlCollectionName = 'archimedes') then
      $document//s[ft:query(., $query)]|$document//head[ft:query(., $query)]
    else if ($mpdlCollectionName = 'echo') then
      $document//echo:s[ft:query(., $query)]|$document//echo:head[ft:query(., $query)]
    else if ($mpdlCollectionName = 'tei') then
      $document//TEI:s[ft:query(., $query)]|$document//TEI:head[ft:query(., $query)]
    else
      $document//*:s[ft:query(., $query)]|$document//*:head[ft:query(., $query)]
      (: else $document//*[name() = 's' or name() = 'head'][ft:query(., $query)]
         this would be much slower and would consume too much memory :)
  let $from := ($pn * $pageSize) - $pageSize + 1
  (: performance improvements: result set of 500 needs 3 sec., result set of
     10 needs 0,7 sec.; only the hits $from .. $pn * $pageSize are expanded :)
  let $tempQueryResult := subsequence($t, $from, $pageSize)
  let $docSentences :=
    if ($mpdlCollectionName = 'archimedes') then $document//s
    else if ($mpdlCollectionName = 'echo') then $document//echo:s
    else if ($mpdlCollectionName = 'tei') then $document//TEI:s
    else $document//*:s
  let $docHeads :=
    if ($mpdlCollectionName = 'archimedes') then $document//head
    else if ($mpdlCollectionName = 'echo') then $document//echo:head
    else if ($mpdlCollectionName = 'tei') then $document//TEI:head
    else $document//*:head
  let $queryResult :=
    for $hit at $pos in $tempQueryResult
    let $hitType := local-name($hit)
    (: faster: comparison only in pb elements of this document :)
    let $pnOfHit := count($pageBreaks[. << $hit])
    (: last page break before the hit; empty when the hit precedes all of them :)
    let $pb := subsequence($pageBreaks, $pnOfHit, 1)
    (: first page break not before the hit; empty on the last page :)
    let $pbPlus1 := subsequence($pageBreaks, $pnOfHit + 1, 1)
    (: test if sentence surrounds page break; costs 0,1 sec performance :)
    let $hitSurroundsPB :=
      ($pbPlus1/parent::node() = $hit)
      and exists($pbPlus1 intersect $hit/descendant::node())
    (: anything that is not a head is counted among the sentences :)
    let $candidates := if ($hitType = 'head') then $docHeads else $docSentences
    (: A node comparison with an empty operand yields the empty sequence, so a
       missing page break has to be treated as an open bound explicitly;
       otherwise no node qualifies on the pages before the first and after the
       last page break and every hit there gets position 1. :)
    let $pageHits :=
      $candidates[
        (empty($pb) or . >> $pb)
        and (empty($pbPlus1) or . << $pbPlus1)
      ]
    let $posOfHit := count($pageHits[. << $hit]) + 1
    let $position := $from - 1 + $pos
    return
      <hit>
        <hitType>{$hitType}</hitType>
        <pos>{$position}</pos>
        <pn>{$pnOfHit}</pn>
        <hitPos>{$posOfHit}</hitPos>
        <hitString>{string($hit)}</hitString>
        <hitSurroundsPB>{$hitSurroundsPB}</hitSurroundsPB>
      </hit>
  let $resultSize := count($t)
  (: ceiling division: "idiv + 1" reported one page too many whenever the
     number of hits was an exact multiple of the page size :)
  let $pages :=
    if ($resultSize = 0 or $pageSize le 0) then
      0
    else
      ($resultSize + $pageSize - 1) idiv $pageSize
  let $queryForms :=
    if ($queryType = 'fulltextMorph' or $queryType = 'fulltextMorphLemma') then
      mpdltext:get-query-morph-forms($language, $queryStr)
    else
      ()
  let $queryRegularizations :=
    if ($queryType = 'fulltextMorph') then
      mpdltext:get-query-regularizations($language, $queryStr)
    else
      ()
  let $encodedQueryTerms :=
    if ($language = "zh") then
      mpdltext:get-big5-encoded-terms($query)
    else
      ()
  return
    if ($luceneParseResult != '') then
      mpdl-lucene:errorResult($luceneParseResult)
    else
      <result>
        <size>{$resultSize}</size>
        <page-size>{$pageSize}</page-size>
        <pages>{$pages}</pages>
        <pn>{$pn}</pn>
        <hits>
          {$queryResult}
        </hits>
        <query-forms>{$queryForms}</query-forms>
        <query-regularizations>{$queryRegularizations}</query-regularizations>
        {$encodedQueryTerms}
      </result>
};

(:
 : Metadata search on one or two attributes.
 :
 : $metadataStr  XQuery path expression (as a string) selecting the metadata
 :               nodes that are filtered
 : $attribute1   logical attribute name (see getElemNameByAttr)
 : $attrValue1   value / Lucene query for $attribute1
 : $boolOp       "or", "and" or "andNot"; only used when $attrValue2 is not ""
 : $attribute2   second logical attribute name
 : $attrValue2   value / Lucene query for $attribute2; "" means "not used"
 :
 : Returns the result of evaluating the generated expression, or an error
 : <result> when one of the values is rejected by the Lucene query parser.
 :
 : NOTE(review): $metadataStr is inserted into the evaluated expression
 : unchanged, so it must never be taken from untrusted input. If $boolOp is
 : not one of the three known operators, or if $attribute1 is unknown, the
 : generated predicate is empty and util:eval receives "...[]".
 :)
declare function mpdl-lucene:attrSearch($metadataStr, $attribute1, $attrValue1, $boolOp, $attribute2, $attrValue2) {
  let $attrFtQueryStr1 := mpdl-lucene:getAttrQueryStr($attribute1, $attrValue1)
  let $attrFtQueryStr2 := mpdl-lucene:getAttrQueryStr($attribute2, $attrValue2)
  let $booleanQueryStr :=
    if ($attrValue2 = "") then
      $attrFtQueryStr1
    else if ($attrValue2 != "" and $boolOp = "or") then
      concat($attrFtQueryStr1, " or ", $attrFtQueryStr2)
    else if ($attrValue2 != "" and $boolOp = "and") then
      concat("(", $attrFtQueryStr1, ") and (", $attrFtQueryStr2, ")")
    else if ($attrValue2 != "" and $boolOp = "andNot") then
      concat("(", $attrFtQueryStr1, ") and not(", $attrFtQueryStr2, ")")
    else
      ()
  let $attrQuery := concat($metadataStr, "/.[", $booleanQueryStr, "]")
  let $luceneParseResult1 := mpdltext:lucene-parse-query($attrValue1)
  let $luceneParseResult2 := mpdltext:lucene-parse-query($attrValue2)
  (: '' means that both values were accepted; otherwise the messages of the
     rejected values are combined :)
  let $luceneParseResult :=
    if ($luceneParseResult1 = '' and $luceneParseResult2 = '') then
      ''
    else if ($luceneParseResult1 != '' and $luceneParseResult2 = '') then
      concat("attribute ", $attribute1, " with value: ", $attrValue1, ": ", $luceneParseResult1)
    else if ($luceneParseResult1 = '' and $luceneParseResult2 != '') then
      concat("attribute ", $attribute2, " with value: ", $attrValue2, ": ", $luceneParseResult2)
    else if ($luceneParseResult1 != '' and $luceneParseResult2 != '') then
      concat("attribute ", $attribute1, " with value: ", $attrValue1, ": ", $luceneParseResult1,
             ", attribute ", $attribute2, " with value: ", $attrValue2, ": ", $luceneParseResult2)
    else
      ()
  return
    if ($luceneParseResult = '') then
      util:eval($attrQuery)
    else
      mpdl-lucene:errorResult($luceneParseResult)
};

(:
 : Sorts metadata nodes by one of their logical attributes.
 :
 : $metadata  the nodes to sort
 : $orderBy   "document" (sort by document name) or a logical attribute name
 :            understood by getElemNameByAttr
 :
 : The collection type of each node is taken from the second to last
 : collection step of its document URI. The sort key is the lower-cased,
 : comma separated string value of the selected elements.
 :)
declare function mpdl-lucene:order($metadata, $orderBy) {
  for $attrElem in $metadata
  let $doc := $attrElem/fn:root()
  let $documentUri := document-uri($doc)
  let $docBase :=
    functx:substring-after-last(
      functx:substring-before-last(
        functx:substring-before-last($documentUri, "/"),
        "/"),
      "/")
  let $metadataElem := mpdl-lucene:getMetadata($docBase, $doc)
  let $attrElemName :=
    if ($orderBy = "document") then
      "document"
    else
      mpdl-lucene:getElemNameByAttr($docBase, $orderBy)
  (: this costs performance for many result elements :)
  let $orderByElem :=
    if ($attrElemName = "document") then
      util:document-name($doc)
    else
      mpdl-lucene:getElemDynamic($metadataElem, $attrElemName)
  let $orderByTemp := lower-case(string-join($orderByElem, ', '))
  order by $orderByTemp
  return $attrElem
  (: performance improvement (?):
     let $result := for $x in doc(/db/doc1.xml) order by $x
     if ($sortOrder eq "asc") then ( "ascending" ) else ( "descending" ) :)
};

(:
 : Returns the metadata element of a document: archimedes/info,
 : echo:echo/echo:metadata or TEI:TEI/TEI:teiHeader, depending on $docBase;
 : the empty sequence for any other $docBase.
 :)
declare function mpdl-lucene:getMetadata($docBase, $doc) {
  if ($docBase = 'archimedes') then
    $doc/archimedes/info
  else if ($docBase = 'echo') then
    $doc/echo:echo/echo:metadata
  else if ($docBase = 'tei') then
    $doc/TEI:TEI/TEI:teiHeader
  else
    ()
};

(:
 : Returns the text element of a document: archimedes/text,
 : echo:echo/echo:text or TEI:TEI/TEI:text, depending on $docBase; the first
 : child element of $doc for any other $docBase.
 :)
declare function mpdl-lucene:getText($docBase, $doc) {
  if ($docBase = 'archimedes') then
    $doc/archimedes/text
  else if ($docBase = 'echo') then
    $doc/echo:echo/echo:text
  else if ($docBase = 'tei') then
    $doc/TEI:TEI/TEI:text
  else
    $doc/*[1]
};

(:
 : Evaluates the relative path $elemStr (given as a string) against $path.
 :
 : $path     the context node(s); referenced by name from the evaluated
 :           expression, so the parameter must keep this name
 : $elemStr  relative path expression, e.g. as returned by getElemNameByAttr
 :
 : Returns the selected nodes; the empty sequence when $elemStr is empty
 : (no expression is evaluated in that case).
 :
 : TODO: performance improvement: each time util:eval is executed
 :)
declare function mpdl-lucene:getElemDynamic($path, $elemStr) {
  if ($elemStr != "") then
    util:eval(concat("$path/", $elemStr))
  else
    ()
};

(:
 : Builds the predicate (as a string) that tests one logical attribute in all
 : collection types that know this attribute; the tests are combined with
 : " or ".
 :
 : $attribute  logical attribute name (see getElemNameByAttr)
 : $attrValue  the value; compared with "=" for "date", passed to ft:query
 :             for every other attribute
 :
 : Returns "" when no collection type knows the attribute.
 :)
declare function mpdl-lucene:getAttrQueryStr($attribute, $attrValue) {
  (: the value ends up between single quotes in an expression that is
     evaluated with util:eval, so quotes and ampersands must be escaped :)
  let $literal := mpdl-lucene:escapeStringLiteral($attrValue)
  let $elemNames :=
    (
      mpdl-lucene:getElemNameByAttr("archimedes", $attribute),
      mpdl-lucene:getElemNameByAttr("echo", $attribute),
      mpdl-lucene:getElemNameByAttr("tei", $attribute)
    )[. != ""]
  let $relations :=
    for $elemName in $elemNames
    return
      if ($attribute = "date") then
        concat($elemName, " = '", $literal, "'")
      else
        concat("ft:query(", $elemName, ", '", $literal, "')")
  return string-join($relations, " or ")
};

(:
 : Maps a logical attribute name to the relative path of the corresponding
 : metadata element in the given collection type.
 :
 : $docBase    "archimedes", "echo" or "tei"
 : $attribute  "author", "title", "place", "date", "language", "identifier",
 :             "rights", "license", "accessRights", "file", "translator" or
 :             "version"
 :
 : Returns "" when the collection type has no element for the attribute or
 : when $docBase or $attribute is unknown.
 :
 : NOTE(review): for "tei" the paths of "identifier", "rights" and
 : "accessRights" are unprefixed and differ from the TEI paths used in
 : getElementsByAttr - confirm which of the two is intended.
 :)
declare function mpdl-lucene:getElemNameByAttr($docBase, $attribute) {
  (: one triple per attribute: (archimedes, echo, tei) :)
  let $paths :=
    if ($attribute = "author") then
      ("author", "dcterms:creator", "TEI:fileDesc/TEI:titleStmt/TEI:author")
    else if ($attribute = "title") then
      ("title", "dcterms:title", "TEI:fileDesc/TEI:titleStmt/TEI:title")
    else if ($attribute = "place") then
      ("place", "", "TEI:fileDesc/TEI:publicationStmt/TEI:pubPlace")
    else if ($attribute = "date") then
      ("date", "dcterms:date", "TEI:fileDesc/TEI:publicationStmt/TEI:date")
    else if ($attribute = "language") then
      ("lang", "dcterms:language", "TEI:profileDesc/TEI:langUsage/TEI:language/@ident")
    else if ($attribute = "identifier") then
      ("locator", "dcterms:identifier", "identifier")
    else if ($attribute = "rights") then
      ("", "dcterms:rights", "rights")
    else if ($attribute = "license") then
      ("", "dcterms:license", "")
    else if ($attribute = "accessRights") then
      ("", "dcterms:accessRights", "accessRights")
    else if ($attribute = "file") then
      ("cvs_file", "", "")
    else if ($attribute = "translator") then
      ("translator", "", "")
    else if ($attribute = "version") then
      ("cvs_version", "", "")
    else
      ("", "", "")
  return
    if ($docBase = "archimedes") then $paths[1]
    else if ($docBase = "echo") then $paths[2]
    else if ($docBase = "tei") then $paths[3]
    else ""
};

(:
 : Selects the metadata nodes of a logical attribute below $metadata.
 :
 : $metadata   the metadata element of a document (see getMetadata)
 : $docBase    "archimedes", "echo" or "tei"
 : $attribute  logical attribute name (see getElemNameByAttr)
 :
 : author, title, place, date, rights and license hits are each wrapped in a
 : new element named after the attribute (echo "accessRights" is wrapped in
 : <rights>); all other hits are returned as they are. Returns the empty
 : sequence for combinations that are not supported.
 :
 : NOTE(review): "tei" has no branch for "place" and "identifier" here
 : although getElemNameByAttr defines a path for them - confirm.
 :)
declare function mpdl-lucene:getElementsByAttr($metadata, $docBase, $attribute) {
  if ($docBase = "archimedes") then (
    if ($attribute = "author") then
      for $elem in $metadata/author return <author>{$elem}</author>
    else if ($attribute = "title") then
      for $elem in $metadata/title return <title>{$elem}</title>
    else if ($attribute = "place") then
      for $elem in $metadata/place return <place>{$elem}</place>
    else if ($attribute = "date") then
      for $elem in $metadata/date return <date>{$elem}</date>
    else if ($attribute = "language") then
      $metadata/lang
    else if ($attribute = "identifier") then
      $metadata/locator
    else if ($attribute = "file") then
      $metadata/cvs_file
    else if ($attribute = "translator") then
      $metadata/translator
    else if ($attribute = "version") then
      $metadata/cvs_version
    else
      (: rights, accessRights, license and unknown attributes :)
      ()
  )
  else if ($docBase = "echo") then (
    if ($attribute = "author") then
      for $elem in $metadata/dcterms:creator return <author>{$elem}</author>
    else if ($attribute = "title") then
      for $elem in $metadata/dcterms:title return <title>{$elem}</title>
    else if ($attribute = "date") then
      for $elem in $metadata/dcterms:date return <date>{$elem}</date>
    else if ($attribute = "language") then
      $metadata/dcterms:language
    else if ($attribute = "identifier") then
      $metadata/dcterms:identifier
    else if ($attribute = "rights") then
      for $elem in $metadata/dcterms:rights return <rights>{$elem}</rights>
    else if ($attribute = "accessRights") then
      for $elem in $metadata/dcterms:accessRights return <rights>{$elem}</rights>
    else if ($attribute = "license") then
      for $elem in $metadata/dcterms:license return <license>{$elem}</license>
    else
      (: place, file, translator, version and unknown attributes :)
      ()
  )
  else if ($docBase = "tei") then (
    if ($attribute = "author") then
      for $elem in $metadata/TEI:fileDesc/TEI:titleStmt/TEI:author return <author>{$elem}</author>
    else if ($attribute = "title") then
      for $elem in $metadata/TEI:fileDesc/TEI:titleStmt/TEI:title return <title>{$elem}</title>
    else if ($attribute = "date") then
      for $elem in $metadata/TEI:fileDesc/TEI:publicationStmt/TEI:date return <date>{$elem}</date>
    else if ($attribute = "language") then
      $metadata/TEI:profileDesc/TEI:langUsage/TEI:language/@ident
    else if ($attribute = "rights") then
      $metadata/TEI:fileDesc/TEI:publicationStmt/TEI:availability
    else if ($attribute = "accessRights") then
      $metadata/TEI:fileDesc/TEI:publicationStmt/TEI:availability/@status
    else
      ()
  )
  else
    ()
};