diff software/eXist/webapp/mpdl/lucene/search.xql @ 7:5589d865af7a

Erstellung XQL/XSL Applikation
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 08 Feb 2011 15:16:46 +0100
parents
children d6f528ad5d96
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/webapp/mpdl/lucene/search.xql	Tue Feb 08 15:16:46 2011 +0100
@@ -0,0 +1,385 @@
+xquery version "1.0";
+
+module namespace mpdl-lucene = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/lucene/search"; 
+
+import module namespace functx = "http://www.functx.com" at "../util/functx.xql";
+
+declare namespace ft = "http://exist-db.org/xquery/lucene";
+
+declare namespace echo="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/";
+
+declare namespace dc="http://purl.org/dc/elements/1.1/";
+declare namespace dcterms="http://purl.org/dc/terms";
+
+(:~
+ : Full-text search across a collection.
+ :
+ : The query string is first handed to mpdltext:lucene-parse-query. The Lucene
+ : query is only executed when that call yields the empty string; any other
+ : outcome is reported back inside an empty <result> element.
+ :
+ : @param $mpdlCollection nodes to be filtered
+ : @param $queryStr Lucene query string
+ : @return the items of $mpdlCollection for which archimedes/text or
+ :         echo:echo/echo:text matches the query, or a <result> element
+ :         whose <error> child carries the parser message
+ :)
+declare function mpdl-lucene:search($mpdlCollection, $queryStr) {
+  let $parseMessage := mpdltext:lucene-parse-query($queryStr)
+  return
+    if (not($parseMessage = ''))
+    then
+      <result>
+        <error>{$parseMessage}</error>
+        <size>0</size>
+        <pages>0</pages>
+        <pn>0</pn>
+        <hits/>
+      </result>
+    else
+      $mpdlCollection/.[
+        ft:query(archimedes/text, $queryStr)
+        or ft:query(echo:echo/echo:text, $queryStr)
+      ]
+};
+
+(:~
+ : Paged sentence search inside one document.
+ :
+ : Runs a Lucene query against the s elements of $document (echo:s for the
+ : 'echo' collection) and describes the hits that fall on page $pn. For each
+ : reported hit the result contains its overall position, the number of page
+ : breaks preceding it, its position among the sentences that follow the last
+ : of those page breaks, its text, and whether it surrounds the next page break.
+ :
+ : @param $mpdlCollectionName 'echo' selects the echo namespace; every other
+ :        value uses the unqualified pb/s elements
+ : @param $language passed on to the mpdltext helper functions
+ : @param $document node(s) to search in
+ : @param $queryType 'fulltext', 'fulltextMorph' or 'fulltextMorphLemma';
+ :        any other value leaves the Lucene query empty
+ : @param $queryStr Lucene query string
+ : @param $pn page number, counted from 1
+ : @param $pageSize hits per page; must not be 0 when there are hits, because
+ :        the page count is computed with idiv
+ : @return a <result> element; when mpdltext:lucene-parse-query reports a
+ :         message it only carries <error> and zeroed counters
+ :)
+declare function mpdl-lucene:search($mpdlCollectionName, $language, $document, $queryType, $queryStr, $pn as xs:int, $pageSize as xs:int) as node() {
+  (: performance reasons: all hits (not only the first 10!) are counted, so :)
+  (: the expensive per-hit work below is restricted to the requested page   :)
+  let $query := 
+    if ($queryType = 'fulltext' or $queryType = 'fulltextMorph')
+    then $queryStr
+    else if ($queryType = 'fulltextMorphLemma')
+    (: NOTE(review): 'lemmalemma' is presumably a marker understood by the index analyzer - confirm :)
+    then concat('lemmalemma', $queryStr)
+    else ()
+  (: only the echo collection uses namespaced elements :)
+  let $isEcho := $mpdlCollectionName = 'echo'
+  let $pageBreaks := 
+    if ($isEcho)
+    then $document//echo:pb
+    else $document//pb
+  let $luceneParseResult := mpdltext:lucene-parse-query($queryStr)
+  let $t := 
+    if ($luceneParseResult != '')
+    then ()
+    else if ($isEcho)
+    then $document//echo:s[ft:query(., $query)]
+    else $document//s[ft:query(., $query)]
+  let $from := ($pn * $pageSize) - $pageSize + 1
+  (: performance improvements: result set of 500 needs 3 sec., result set of 10 needs 0.7 sec. :)
+  let $pageHits := subsequence($t, $from, $pageSize)
+  let $queryResult :=
+    for $s at $pos in $pageHits
+      let $pnOfS := count($pageBreaks[. << $s])        (: faster: comparison only in pb elements of this document :)
+      let $pb := subsequence($pageBreaks, $pnOfS, 1)
+      (: test if sentence surrounds page break; costs 0.1 sec performance :)
+      let $pbPlus1 := subsequence($pageBreaks, $pnOfS + 1, 1)
+      let $sSurroundsPB := 
+        ($pbPlus1/parent::node() = $s and $pbPlus1 intersect $s/descendant::node())
+      let $posOfS :=                           (: faster: comparison only in s elements of this document :)
+        if ($isEcho)
+        then count($pb/following::echo:s[. << $s]) + 1
+        else count($pb/following::s[. << $s]) + 1
+      let $position := $from - 1 + $pos
+    return
+      <hit>
+        <pos>{$position}</pos>
+        <pn>{$pnOfS}</pn>
+        <pos-of-s>{$posOfS}</pos-of-s>
+        <s>{string($s)}</s>
+        <s-surrounds-pb>{$sSurroundsPB}</s-surrounds-pb>
+      </hit>
+  let $resultSize := count($t)
+  (: ceiling division: 20 hits with a page size of 10 are 2 pages, not 3 :)
+  let $pages := 
+    if ($resultSize = 0)
+    then 0
+    else ($resultSize + $pageSize - 1) idiv $pageSize
+  let $queryForms := 
+    if ($queryType = 'fulltextMorph' or $queryType = 'fulltextMorphLemma')
+    then mpdltext:get-query-morph-forms($language, $queryStr)
+    else ()
+  let $queryRegularizations := 
+    if ($queryType = 'fulltextMorph')
+    then mpdltext:get-query-regularizations($language, $queryStr)
+    else ()
+
+  let $encodedQueryTerms := 
+    if ($language = "zh")
+    then 
+      mpdltext:get-big5-encoded-terms($query)
+    else ()
+
+  let $result := 
+    if ($luceneParseResult != '')
+    then
+      <result>
+        <error>{$luceneParseResult}</error>
+        <size>0</size>
+        <pages>0</pages>
+        <pn>0</pn>
+        <hits/>
+      </result>    
+    else
+      <result>
+        <size>{$resultSize}</size>
+        <page-size>{$pageSize}</page-size>
+        <pages>{$pages}</pages>
+        <pn>{$pn}</pn>
+        <hits>
+          {$queryResult}
+        </hits>
+        <query-forms>{$queryForms}</query-forms>
+        <query-regularizations>{$queryRegularizations}</query-regularizations>
+        {$encodedQueryTerms}
+      </result>
+   
+   return $result
+};
+
+(:~
+ : Metadata search on one or two attributes.
+ :
+ : Builds the text of an XQuery of the form "<metadataStr>/.[<condition>]"
+ : and runs it with util:eval. Both attribute values are first passed to
+ : mpdltext:lucene-parse-query; if either call reports a message, no query is
+ : run and an empty <result> element carrying the message(s) is returned.
+ :
+ : NOTE(review): the evaluated query is assembled by string concatenation from
+ : $metadataStr and the attribute values; confirm that callers never pass
+ : unchecked user input for $metadataStr.
+ :
+ : @param $metadataStr path expression (as text) selecting the metadata nodes
+ : @param $attribute1 name of the first attribute
+ : @param $attrValue1 value searched for in the first attribute
+ : @param $boolOp "or", "and" or "andNot"; only used when $attrValue2 is not ""
+ : @param $attribute2 name of the second attribute
+ : @param $attrValue2 value searched for in the second attribute, "" for none
+ : @return the result of the evaluated query, or a <result> element with <error>
+ :)
+declare function mpdl-lucene:attrSearch($metadataStr, $attribute1, $attrValue1, $boolOp, $attribute2, $attrValue2) {
+  let $parseMsg1 := mpdltext:lucene-parse-query($attrValue1)
+  let $parseMsg2 := mpdltext:lucene-parse-query($attrValue2)
+  let $parseMsg :=
+    if ($parseMsg1 = '' and $parseMsg2 = '')
+    then ''
+    else if ($parseMsg1 != '' and $parseMsg2 = '')
+    then
+      concat("attribute ", $attribute1, " with value: ", $attrValue1, ": ", $parseMsg1)
+    else if ($parseMsg1 = '' and $parseMsg2 != '')
+    then
+      concat("attribute ", $attribute2, " with value: ", $attrValue2, ": ", $parseMsg2)
+    else if ($parseMsg1 != '' and $parseMsg2 != '')
+    then
+      concat(
+        "attribute ", $attribute1, " with value: ", $attrValue1, ": ", $parseMsg1,
+        ", attribute ", $attribute2, " with value: ", $attrValue2, ": ", $parseMsg2)
+    else ()
+  let $condition1 := mpdl-lucene:getAttrQueryStr($attribute1, $attrValue1)
+  let $condition2 := mpdl-lucene:getAttrQueryStr($attribute2, $attrValue2)
+  let $condition :=
+    if ($attrValue2 = "")
+    then $condition1
+    else if ($attrValue2 != "" and $boolOp = "or")
+    then concat($condition1, " or ", $condition2)
+    else if ($attrValue2 != "" and $boolOp = "and")
+    then concat("(", $condition1, ") and (", $condition2, ")")
+    else if ($attrValue2 != "" and $boolOp = "andNot")
+    then concat("(", $condition1, ") and not(", $condition2, ")")
+    else ()
+  let $queryText := concat($metadataStr, "/.[", $condition, "]")
+  return
+    if ($parseMsg = '')
+    then util:eval($queryText)
+    else
+      <result>
+        <error>{$parseMsg}</error>
+        <size>0</size>
+        <pages>0</pages>
+        <pn>0</pn>
+        <hits/>
+      </result>
+};
+
+(:~
+ : Sorts metadata nodes by one of their attributes.
+ :
+ : For every node the owning document is looked up. The collection name is
+ : taken from the document URI: its last two path segments are cut off and the
+ : last remaining segment is used. The sort key is the lower-cased,
+ : comma-joined content of the elements that the attribute maps to.
+ :
+ : @param $metadata nodes to sort
+ : @param $orderBy attribute name as understood by mpdl-lucene:getElemNameByAttr
+ : @return the nodes of $metadata in ascending key order
+ :)
+declare function mpdl-lucene:order($metadata, $orderBy) {
+  for $hit in $metadata
+  let $root := $hit/fn:root()
+  let $parentPath := functx:substring-before-last(functx:substring-before-last(document-uri($root), "/"), "/")
+  let $collection := functx:substring-after-last($parentPath, "/")
+  (: resolving the element dynamically costs performance for many result elements :)
+  let $sortElems :=
+    mpdl-lucene:getElemDynamic(
+      mpdl-lucene:getMetadata($collection, $root),
+      mpdl-lucene:getElemNameByAttr($collection, $orderBy))
+  let $sortKey := lower-case(string-join($sortElems, ', '))
+  order by $sortKey
+  return $hit
+
+  (: possible performance improvement (untested idea, pseudo code):
+     let $result := 
+       for $x in doc(/db/doc1.xml) 
+       order by $x
+       if ($sortOrder eq "asc")
+       then ( "ascending" ) 
+       else ( "descending" ) 
+  :)
+};
+
+(:~
+ : Selects the metadata section of a document.
+ :
+ : @param $docBase 'archimedes' or 'echo'
+ : @param $doc document node
+ : @return archimedes/info or echo:echo/echo:metadata of $doc;
+ :         the empty sequence for any other $docBase
+ :)
+declare function mpdl-lucene:getMetadata($docBase, $doc) {
+  if ($docBase = 'archimedes')
+  then $doc/archimedes/info
+  else
+    if ($docBase = 'echo')
+    then $doc/echo:echo/echo:metadata
+    else ()
+};
+
+(:~
+ : Selects the text section of a document.
+ :
+ : @param $docBase 'archimedes' or 'echo'
+ : @param $doc document node
+ : @return archimedes/text or echo:echo/echo:text of $doc;
+ :         the empty sequence for any other $docBase
+ :)
+declare function mpdl-lucene:getText($docBase, $doc) {
+  if ($docBase = 'archimedes')
+  then $doc/archimedes/text
+  else
+    if ($docBase = 'echo')
+    then $doc/echo:echo/echo:text
+    else ()
+};
+
+
+(: TODO: performance improvement: util:eval is executed on every call :)
+(:~
+ : Resolves a relative path, given as text, against a node.
+ :
+ : @param $path context node(s) the relative path is applied to
+ : @param $elemStr relative path as text, e.g. an element name
+ : @return the nodes selected by "$path/<elemStr>"; the empty sequence when
+ :         $elemStr is "" or empty, in which case nothing is evaluated
+ :)
+declare function mpdl-lucene:getElemDynamic($path, $elemStr) {
+  (: the evaluated text refers to the parameter by its name "$path", :)
+  (: so the parameter must not be renamed                            :)
+  if ($elemStr != "")
+  then util:eval(concat("$path/", $elemStr))
+  else ()
+};
+
+(:~
+ : Builds the text of an XQuery condition that tests one attribute.
+ :
+ : The attribute is mapped to its element name in the archimedes and the echo
+ : schema via mpdl-lucene:getElemNameByAttr. "date" is compared with "=",
+ : every other attribute with ft:query. When both schemas know the attribute
+ : the two conditions are joined with " or ".
+ :
+ : The value is embedded as a '...' string literal of the generated query, so
+ : "&" and "'" inside it are escaped; otherwise such a value would break the
+ : generated query or change its meaning.
+ :
+ : @param $attribute attribute name, e.g. "author" or "date"
+ : @param $attrValue value to search for
+ : @return condition text, or "" when neither schema knows the attribute
+ :)
+declare function mpdl-lucene:getAttrQueryStr($attribute, $attrValue) {
+  let $attrArch := mpdl-lucene:getElemNameByAttr("archimedes", $attribute)
+  let $attrEcho := mpdl-lucene:getElemNameByAttr("echo", $attribute)
+  (: "&" first, so that it is not read as the start of an entity reference; then double every "'" :)
+  let $escapedValue := replace(replace($attrValue, "&amp;", "&amp;amp;"), "'", "''")
+  let $valueLiteral := concat("'", $escapedValue, "'")
+  let $attrArchRelQueryStr :=
+    if ($attribute = "date")
+    then concat($attrArch, " = ", $valueLiteral)
+    else concat("ft:query(", $attrArch, ", ", $valueLiteral, ")")
+  let $attrEchoRelQueryStr :=
+    if ($attribute = "date")
+    then concat($attrEcho, " = ", $valueLiteral)
+    else concat("ft:query(", $attrEcho, ", ", $valueLiteral, ")")
+  return
+    if ($attrArch != "" and $attrEcho != "")
+    then concat($attrArchRelQueryStr, " or ",  $attrEchoRelQueryStr)
+    else if ($attrArch = "" and $attrEcho != "")
+    then $attrEchoRelQueryStr
+    else if ($attrArch != "" and $attrEcho = "")
+    then $attrArchRelQueryStr
+    else ""
+};
+
+(:~
+ : Maps a generic attribute name to the element name used by a schema.
+ :
+ : @param $docBase "archimedes" or "echo"
+ : @param $attribute one of author, title, place, date, language, identifier,
+ :        rights, license, accessRights, file, translator, version
+ : @return the element name as text (with dcterms: prefix for echo), or ""
+ :         when the schema has no element for the attribute or an argument is
+ :         not recognised
+ :)
+declare function mpdl-lucene:getElemNameByAttr($docBase, $attribute) {
+  if ($docBase = "archimedes")
+  then
+    if ($attribute = "author") then "author"
+    else if ($attribute = "title") then "title"
+    else if ($attribute = "place") then "place"
+    else if ($attribute = "date") then "date"
+    else if ($attribute = "language") then "lang"
+    else if ($attribute = "identifier") then "locator"
+    else if ($attribute = "rights") then ""
+    else if ($attribute = "license") then ""
+    else if ($attribute = "accessRights") then ""
+    else if ($attribute = "file") then "cvs_file"
+    else if ($attribute = "translator") then "translator"
+    else if ($attribute = "version") then "cvs_version"
+    else ""
+  else if ($docBase = "echo")
+  then
+    if ($attribute = "author") then "dcterms:creator"
+    else if ($attribute = "title") then "dcterms:title"
+    else if ($attribute = "place") then ""
+    else if ($attribute = "date") then "dcterms:date"
+    else if ($attribute = "language") then "dcterms:language"
+    else if ($attribute = "identifier") then "dcterms:identifier"
+    else if ($attribute = "rights") then "dcterms:rights"
+    else if ($attribute = "license") then "dcterms:license"
+    else if ($attribute = "accessRights") then "dcterms:accessRights"
+    else if ($attribute = "file") then ""
+    else if ($attribute = "translator") then ""
+    else if ($attribute = "version") then ""
+    else ""
+  else ""
+};
+
+(:~
+ : Fetches the metadata elements that belong to a generic attribute.
+ :
+ : For author, title, place, date, rights, accessRights and license every
+ : matching element is wrapped into a new element (accessRights is wrapped
+ : into <rights>); for language, identifier, file, translator and version the
+ : matching elements are returned as they are.
+ :
+ : @param $metadata metadata node whose children are inspected
+ : @param $docBase "archimedes" or "echo"
+ : @param $attribute generic attribute name
+ : @return the (wrapped) elements, or the empty sequence when the schema has
+ :         no element for the attribute or an argument is not recognised
+ :)
+declare function mpdl-lucene:getElementsByAttr($metadata, $docBase, $attribute) {
+  if ($docBase = "archimedes")
+  then
+    if ($attribute = "author")
+    then for $e in $metadata/author return <author>{$e}</author>
+    else if ($attribute = "title")
+    then for $e in $metadata/title return <title>{$e}</title>
+    else if ($attribute = "place")
+    then for $e in $metadata/place return <place>{$e}</place>
+    else if ($attribute = "date")
+    then for $e in $metadata/date return <date>{$e}</date>
+    else if ($attribute = "language")
+    then $metadata/lang
+    else if ($attribute = "identifier")
+    then $metadata/locator
+    else if ($attribute = "rights")
+    then ()
+    else if ($attribute = "accessRights")
+    then ()
+    else if ($attribute = "license")
+    then ()
+    else if ($attribute = "file")
+    then $metadata/cvs_file
+    else if ($attribute = "translator")
+    then $metadata/translator
+    else if ($attribute = "version")
+    then $metadata/cvs_version
+    else ()
+  else if ($docBase = "echo")
+  then
+    if ($attribute = "author")
+    then for $e in $metadata/dcterms:creator return <author>{$e}</author>
+    else if ($attribute = "title")
+    then for $e in $metadata/dcterms:title return <title>{$e}</title>
+    else if ($attribute = "place")
+    then ()
+    else if ($attribute = "date")
+    then for $e in $metadata/dcterms:date return <date>{$e}</date>
+    else if ($attribute = "language")
+    then $metadata/dcterms:language
+    else if ($attribute = "identifier")
+    then $metadata/dcterms:identifier
+    else if ($attribute = "rights")
+    then for $e in $metadata/dcterms:rights return <rights>{$e}</rights>
+    else if ($attribute = "accessRights")
+    then for $e in $metadata/dcterms:accessRights return <rights>{$e}</rights>
+    else if ($attribute = "license")
+    then for $e in $metadata/dcterms:license return <license>{$e}</license>
+    else if ($attribute = "file")
+    then ()
+    else if ($attribute = "translator")
+    then ()
+    else if ($attribute = "version")
+    then ()
+    else ()
+  else ()
+};
+