view software/eXist/webapp/mpdl/lucene/search.xql @ 7:5589d865af7a

Erstellung XQL/XSL Applikation
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 08 Feb 2011 15:16:46 +0100
parents
children d6f528ad5d96
line wrap: on
line source

xquery version "1.0";

module namespace mpdl-lucene = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/lucene/search"; 

import module namespace functx = "http://www.functx.com" at "../util/functx.xql";

declare namespace ft = "http://exist-db.org/xquery/lucene";

declare namespace echo="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/";

declare namespace dc="http://purl.org/dc/elements/1.1/";
declare namespace dcterms="http://purl.org/dc/terms";

(:~
 : Full-text search across a collection of documents.
 :
 : The query string is first checked with mpdltext:lucene-parse-query. If that
 : check yields the empty string, the items of $mpdlCollection whose
 : archimedes/text or echo:echo/echo:text matches the query are returned.
 : Otherwise a <result> element carrying the parser message in <error> (and
 : zeroed size/pages/pn) is returned instead.
 :
 : @param $mpdlCollection nodes to filter (typically document nodes)
 : @param $queryStr       Lucene query string
 : @return matching items of $mpdlCollection, or an error <result> element
 :)
declare function mpdl-lucene:search($mpdlCollection, $queryStr) {
  let $parseMessage := mpdltext:lucene-parse-query($queryStr)
  return
    if (not($parseMessage = ''))
    then
      <result>
        <error>{$parseMessage}</error>
        <size>0</size>
        <pages>0</pages>
        <pn>0</pn>
        <hits/>
      </result>
    else
      $mpdlCollection/.[ft:query(archimedes/text, $queryStr) or ft:query(echo:echo/echo:text, $queryStr)]
};

(:~
 : Paged full-text search for sentences (s elements) below one document node.
 :
 : @param $mpdlCollectionName 'echo' selects echo-namespaced s/pb elements;
 :                            'archimedes' and every other value select the
 :                            un-namespaced s/pb elements
 : @param $language           language code handed to the morphology helpers;
 :                            "zh" additionally adds Big5 encoded query terms
 : @param $document           node below which sentences and page breaks are looked up
 : @param $queryType          'fulltext', 'fulltextMorph' or 'fulltextMorphLemma';
 :                            any other value leaves the Lucene query empty
 : @param $queryStr           query string as entered by the user
 : @param $pn                 1-based number of the requested result page
 : @param $pageSize           number of hits per result page
 : @return a <result> element: either <error>/<size>0 ... when
 :         mpdltext:lucene-parse-query reports a problem, or size, page-size,
 :         pages, pn, the <hit> elements of the requested page, the query
 :         forms, the query regularizations and the encoded query terms
 :)
declare function mpdl-lucene:search($mpdlCollectionName, $language, $document, $queryType, $queryStr, $pn as xs:int, $pageSize as xs:int) as node() {
  let $luceneParseResult := mpdltext:lucene-parse-query($queryStr)
  return
    if ($luceneParseResult != '')
    then
      (: invalid query: report it without running the search or the morphology helpers :)
      <result>
        <error>{$luceneParseResult}</error>
        <size>0</size>
        <pages>0</pages>
        <pn>0</pn>
        <hits/>
      </result>
    else (
      (: the 'lemmalemma' prefix is presumably the marker for lemma queries in the index - TODO confirm :)
      let $query :=
        if ($queryType = 'fulltext' or $queryType = 'fulltextMorph')
        then $queryStr
        else if ($queryType = 'fulltextMorphLemma')
        then concat('lemmalemma', $queryStr)
        else ()
      (: only 'echo' uses namespaced elements; 'archimedes' wins if both names are passed :)
      let $isEcho := not($mpdlCollectionName = 'archimedes') and $mpdlCollectionName = 'echo'
      let $pageBreaks :=
        if ($isEcho)
        then $document//echo:pb
        else $document//pb
      let $t :=
        if ($isEcho)
        then $document//echo:s[ft:query(., $query)]
        else $document//s[ft:query(., $query)]
      let $from := ($pn * $pageSize) - $pageSize + 1
      (: performance: only the hits of the requested page are expanded into <hit> elements :)
      let $pageHits := subsequence($t, $from, $pageSize)
      let $queryResult :=
        for $s at $pos in $pageHits
          (: number of page breaks before the sentence = page number of the sentence :)
          let $pnOfS := count($pageBreaks[. << $s])
          let $pb := subsequence($pageBreaks, $pnOfS, 1)
          (: test if the sentence surrounds the next page break :)
          let $pbPlus1 := subsequence($pageBreaks, $pnOfS + 1, 1)
          let $sSurroundsPB :=
            ($pbPlus1/parent::node() = $s) and exists($pbPlus1 intersect $s/descendant::node())
          (: position of the sentence on its page, counted from the page break :)
          let $posOfS :=
            if ($isEcho)
            then count($pb/following::echo:s[. << $s]) + 1
            else count($pb/following::s[. << $s]) + 1
          let $position := $from - 1 + $pos
        return
          <hit>
            <pos>{$position}</pos>
            <pn>{$pnOfS}</pn>
            <pos-of-s>{$posOfS}</pos-of-s>
            <s>{string($s)}</s>
            <s-surrounds-pb>{$sSurroundsPB}</s-surrounds-pb>
          </hit>
      let $resultSize := count($t)
      (: ceiling division: 10 hits with a page size of 10 are one page, not two :)
      let $pages :=
        if ($resultSize = 0)
        then 0
        else ($resultSize + $pageSize - 1) idiv $pageSize
      let $queryForms :=
        if ($queryType = 'fulltextMorph' or $queryType = 'fulltextMorphLemma')
        then mpdltext:get-query-morph-forms($language, $queryStr)
        else ()
      let $queryRegularizations :=
        if ($queryType = 'fulltextMorph')
        then mpdltext:get-query-regularizations($language, $queryStr)
        else ()
      let $encodedQueryTerms :=
        if ($language = "zh")
        then mpdltext:get-big5-encoded-terms($query)
        else ()
      return
        <result>
          <size>{$resultSize}</size>
          <page-size>{$pageSize}</page-size>
          <pages>{$pages}</pages>
          <pn>{$pn}</pn>
          <hits>
            {$queryResult}
          </hits>
          <query-forms>{$queryForms}</query-forms>
          <query-regularizations>{$queryRegularizations}</query-regularizations>
          {$encodedQueryTerms}
        </result>
    )
};

(:~
 : Metadata search with one or two attribute criteria.
 :
 : Builds the query string "<metadataStr>/.[<criteria>]" and runs it with
 : util:eval. The criteria come from mpdl-lucene:getAttrQueryStr and are
 : combined according to $boolOp.
 :
 : NOTE(security): $metadataStr is spliced verbatim into the evaluated query,
 : so it must be a path expression supplied by trusted code, never raw user input.
 :
 : @param $metadataStr XQuery path expression (as string) selecting the nodes to filter
 : @param $attribute1  name of the first attribute (e.g. "author", "title", "date")
 : @param $attrValue1  value / Lucene query for the first attribute
 : @param $boolOp      "or", "and" or "andNot"; only used when $attrValue2 is given
 : @param $attribute2  name of the second attribute
 : @param $attrValue2  value for the second attribute; "" or empty means "no second criterion"
 : @return the result of the evaluated query, or a <result> element with an
 :         <error> child when a value is not a valid Lucene query or when no
 :         valid criteria could be built (unknown attribute or boolean operator)
 :)
declare function mpdl-lucene:attrSearch($metadataStr, $attribute1, $attrValue1, $boolOp, $attribute2, $attrValue2) {
  let $hasSecond := exists($attrValue2) and not($attrValue2 = "")
  let $attrFtQueryStr1 := mpdl-lucene:getAttrQueryStr($attribute1, $attrValue1)
  let $attrFtQueryStr2 :=
    if ($hasSecond)
    then mpdl-lucene:getAttrQueryStr($attribute2, $attrValue2)
    else ""
  (: "" marks "no valid criteria"; it is reported as an error below instead of :)
  (: being evaluated as the syntactically invalid predicate "[]"               :)
  let $booleanQueryStr :=
    if ($attrFtQueryStr1 = "")
    then ""
    else if (not($hasSecond))
    then $attrFtQueryStr1
    else if ($attrFtQueryStr2 = "")
    then ""
    else if ($boolOp = "or")
    then concat($attrFtQueryStr1, " or ", $attrFtQueryStr2)
    else if ($boolOp = "and")
    then concat("(", $attrFtQueryStr1, ") and (", $attrFtQueryStr2, ")")
    else if ($boolOp = "andNot")
    then concat("(", $attrFtQueryStr1, ") and not(", $attrFtQueryStr2, ")")
    else ""
  let $attrQuery := concat($metadataStr, "/.[", $booleanQueryStr, "]")
  let $luceneParseResult1 := mpdltext:lucene-parse-query($attrValue1)
  let $luceneParseResult2 := mpdltext:lucene-parse-query($attrValue2)
  let $luceneParseResult :=
    if ($luceneParseResult1 = '' and $luceneParseResult2 = '')
    then ''
    else if ($luceneParseResult1 != '' and $luceneParseResult2 = '')
    then concat("attribute ", $attribute1, " with value: ", $attrValue1, ": ", $luceneParseResult1)
    else if ($luceneParseResult1 = '' and $luceneParseResult2 != '')
    then concat("attribute ", $attribute2, " with value: ", $attrValue2, ": ", $luceneParseResult2)
    else if ($luceneParseResult1 != '' and $luceneParseResult2 != '')
    then concat("attribute ", $attribute1, " with value: ", $attrValue1, ": ", $luceneParseResult1, ", attribute ", $attribute2, " with value: ", $attrValue2, ": ", $luceneParseResult2)
    else ()
  let $errorMessage :=
    if (not($luceneParseResult = ''))
    then $luceneParseResult
    else if ($booleanQueryStr = "")
    then "attribute query could not be built: unknown attribute or boolean operator"
    else ()
  return
    if ($luceneParseResult = '' and not($booleanQueryStr = ""))
    then util:eval($attrQuery)
    else
      <result>
        <error>{$errorMessage}</error>
        <size>0</size>
        <pages>0</pages>
        <pn>0</pn>
        <hits/>
      </result>
};

(:~
 : Sorts metadata nodes by one of their document's metadata attributes.
 :
 : For every node the owning document is looked up; the name of the collection
 : two levels above the document (taken from its document URI) is used as the
 : document base to pick the metadata element and the element name that
 : corresponds to $orderBy. The sort key is the lower-cased, ", "-joined string
 : value of the elements found.
 :
 : @param $metadata nodes to sort
 : @param $orderBy  attribute name as understood by mpdl-lucene:getElemNameByAttr
 : @return the nodes of $metadata in ascending order of their sort key
 :
 : TODO (performance): the key is resolved through util:eval for every node;
 : a static "order by" on a fixed path (optionally with a sort direction)
 : might be faster.
 :)
declare function mpdl-lucene:order($metadata, $orderBy) {
  for $item in $metadata
  let $root := $item/fn:root()
  let $uri := document-uri($root)
  let $parentPath := functx:substring-before-last($uri, "/")
  let $grandParentPath := functx:substring-before-last($parentPath, "/")
  let $collectionName := functx:substring-after-last($grandParentPath, "/")
  let $sortKey :=
    lower-case(
      string-join(
        (: dynamic lookup: this costs performance for many result elements :)
        mpdl-lucene:getElemDynamic(
          mpdl-lucene:getMetadata($collectionName, $root),
          mpdl-lucene:getElemNameByAttr($collectionName, $orderBy)
        ),
        ', '
      )
    )
  order by $sortKey
  return $item
};

(:~
 : Returns the metadata element of a document.
 :
 : @param $docBase 'archimedes' or 'echo'
 : @param $doc     document node
 : @return archimedes/info for 'archimedes', echo:echo/echo:metadata for 'echo',
 :         the empty sequence for any other document base
 :)
declare function mpdl-lucene:getMetadata($docBase, $doc) {
  if ($docBase = 'archimedes') then
    $doc/archimedes/info
  else if ($docBase = 'echo') then
    $doc/echo:echo/echo:metadata
  else
    ()
};

(:~
 : Returns the text element of a document.
 :
 : @param $docBase 'archimedes' or 'echo'
 : @param $doc     document node
 : @return archimedes/text for 'archimedes', echo:echo/echo:text for 'echo',
 :         the empty sequence for any other document base
 :)
declare function mpdl-lucene:getText($docBase, $doc) {
  if ($docBase = 'archimedes') then
    $doc/archimedes/text
  else if ($docBase = 'echo') then
    $doc/echo:echo/echo:text
  else
    ()
};


(:~
 : Evaluates the relative path $elemStr against $path.
 :
 : The expression "$path/<elemStr>" is built as a string and run through
 : util:eval; when $elemStr is empty, the empty string is evaluated instead.
 :
 : TODO (performance): util:eval is executed on every call.
 :
 : @param $path    context nodes the step is applied to
 : @param $elemStr relative path / element name as string
 : @return whatever util:eval returns for the built expression
 :)
declare function mpdl-lucene:getElemDynamic($path, $elemStr) {
  util:eval(
    if ($elemStr != "")
    then concat("$path/", $elemStr)
    else ""
  )
};

(:~
 : Builds the predicate source text used to search one metadata attribute in
 : both the archimedes and the echo metadata.
 :
 : For "date" an equality comparison is generated, for every other attribute an
 : ft:query() call. The returned string is meant to be embedded in a query that
 : is later evaluated, so $attrValue is escaped for use inside a single-quoted
 : XQuery string literal (ampersand as an entity reference, apostrophe doubled).
 : Without the escaping a value containing an apostrophe or ampersand would
 : break, or alter, the evaluated query.
 :
 : @param $attribute attribute name as understood by mpdl-lucene:getElemNameByAttr
 : @param $attrValue value to search for
 : @return the predicate text; both variants joined with " or " when the
 :         attribute exists in both schemas, "" when it exists in neither
 :)
declare function mpdl-lucene:getAttrQueryStr($attribute, $attrValue) {
  let $attrArch := mpdl-lucene:getElemNameByAttr("archimedes", $attribute)
  let $attrEcho := mpdl-lucene:getElemNameByAttr("echo", $attribute)
  (: the ampersand is built from its code point so this source needs no entity reference :)
  let $amp := codepoints-to-string(38)
  let $escapedValue := replace(replace(string($attrValue), $amp, concat($amp, "amp;")), "'", "''")
  let $valueLiteral := concat("'", $escapedValue, "'")
  let $attrArchRelQueryStr :=
    if ($attribute = "date")
    then concat($attrArch, " = ", $valueLiteral)
    else concat("ft:query(", $attrArch, ", ", $valueLiteral, ")")
  let $attrEchoRelQueryStr :=
    if ($attribute = "date")
    then concat($attrEcho, " = ", $valueLiteral)
    else concat("ft:query(", $attrEcho, ", ", $valueLiteral, ")")
  return
    if ($attrArch != "" and $attrEcho != "")
    then concat($attrArchRelQueryStr, " or ",  $attrEchoRelQueryStr)
    else if ($attrArch = "" and $attrEcho != "")
    then $attrEchoRelQueryStr
    else if ($attrArch != "" and $attrEcho = "")
    then $attrArchRelQueryStr
    else ""
};

(:~
 : Maps a generic attribute name to the metadata element name used by a
 : document base.
 :
 : @param $docBase   "archimedes" or "echo"
 : @param $attribute generic attribute name ("author", "title", "place", "date",
 :                   "language", "identifier", "rights", "license",
 :                   "accessRights", "file", "translator", "version")
 : @return the element name (echo names carry the dcterms prefix), or "" when
 :         the document base has no element for the attribute or either
 :         argument is unknown
 :)
declare function mpdl-lucene:getElemNameByAttr($docBase, $attribute) {
  if ($docBase = "archimedes") then
    (
      if ($attribute = "author") then "author"
      else if ($attribute = "title") then "title"
      else if ($attribute = "place") then "place"
      else if ($attribute = "date") then "date"
      else if ($attribute = "language") then "lang"
      else if ($attribute = "identifier") then "locator"
      (: archimedes metadata has no rights information :)
      else if ($attribute = ("rights", "license", "accessRights")) then ""
      else if ($attribute = "file") then "cvs_file"
      else if ($attribute = "translator") then "translator"
      else if ($attribute = "version") then "cvs_version"
      else ""
    )
  else if ($docBase = "echo") then
    (
      if ($attribute = "author") then "dcterms:creator"
      else if ($attribute = "title") then "dcterms:title"
      (: echo metadata has no place element :)
      else if ($attribute = "place") then ""
      else if ($attribute = "date") then "dcterms:date"
      else if ($attribute = "language") then "dcterms:language"
      else if ($attribute = "identifier") then "dcterms:identifier"
      else if ($attribute = "rights") then "dcterms:rights"
      else if ($attribute = "license") then "dcterms:license"
      else if ($attribute = "accessRights") then "dcterms:accessRights"
      (: "file", "translator" and "version" have no echo counterpart :)
      else ""
    )
  else
    ""
};

(:~
 : Returns the metadata elements that belong to a generic attribute name.
 :
 : For author, title, place, date, rights, accessRights and license every
 : element found is wrapped in a new element named after the generic attribute;
 : language, identifier, file, translator and version elements are returned
 : as they are.
 :
 : NOTE(review): "accessRights" elements are wrapped in <rights>, not in
 : <accessRights> - confirm that this is intended.
 :
 : @param $metadata  metadata element (archimedes info or echo metadata)
 : @param $docBase   "archimedes" or "echo"
 : @param $attribute generic attribute name
 : @return the (possibly wrapped) elements, or the empty sequence when the
 :         document base has no element for the attribute
 :)
declare function mpdl-lucene:getElementsByAttr($metadata, $docBase, $attribute) {
  if ($docBase = "archimedes") then
    (
      if ($attribute = "author") then
        for $e in $metadata/author
        return <author>{$e}</author>
      else if ($attribute = "title") then
        for $e in $metadata/title
        return <title>{$e}</title>
      else if ($attribute = "place") then
        for $e in $metadata/place
        return <place>{$e}</place>
      else if ($attribute = "date") then
        for $e in $metadata/date
        return <date>{$e}</date>
      else if ($attribute = "language") then $metadata/lang
      else if ($attribute = "identifier") then $metadata/locator
      (: archimedes metadata has no rights information :)
      else if ($attribute = ("rights", "accessRights", "license")) then ()
      else if ($attribute = "file") then $metadata/cvs_file
      else if ($attribute = "translator") then $metadata/translator
      else if ($attribute = "version") then $metadata/cvs_version
      else ()
    )
  else if ($docBase = "echo") then
    (
      if ($attribute = "author") then
        for $e in $metadata/dcterms:creator
        return <author>{$e}</author>
      else if ($attribute = "title") then
        for $e in $metadata/dcterms:title
        return <title>{$e}</title>
      (: echo metadata has no place element :)
      else if ($attribute = "place") then ()
      else if ($attribute = "date") then
        for $e in $metadata/dcterms:date
        return <date>{$e}</date>
      else if ($attribute = "language") then $metadata/dcterms:language
      else if ($attribute = "identifier") then $metadata/dcterms:identifier
      else if ($attribute = "rights") then
        for $e in $metadata/dcterms:rights
        return <rights>{$e}</rights>
      else if ($attribute = "accessRights") then
        for $e in $metadata/dcterms:accessRights
        return <rights>{$e}</rights>
      else if ($attribute = "license") then
        for $e in $metadata/dcterms:license
        return <license>{$e}</license>
      (: "file", "translator" and "version" have no echo counterpart :)
      else ()
    )
  else
    ()
};