view software/eXist/webapp/mpdl/lucene/search.xql @ 15:e99964f390e4

diverse Fehlerbehebungen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Mon, 29 Aug 2011 17:40:19 +0200
parents d6f528ad5d96
children 7e883ce72fec
line wrap: on
line source

xquery version "1.0";

module namespace mpdl-lucene = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/lucene/search"; 

import module namespace functx = "http://www.functx.com" at "../util/functx.xql";

declare namespace ft = "http://exist-db.org/xquery/lucene";

declare namespace echo="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/";
declare namespace TEI="http://www.tei-c.org/ns/1.0";

declare namespace dc="http://purl.org/dc/elements/1.1/";
declare namespace dcterms="http://purl.org/dc/terms";

(: Collection-wide full-text search.
 :
 : $mpdlCollection  node sequence to filter; a node is kept when its archimedes/text,
 :                  echo:echo/echo:text or TEI:TEI/TEI:text child matches the query
 : $queryStr        Lucene query string
 :
 : The query is first handed to mpdltext:lucene-parse-query. This code treats an
 : empty-string answer as "query is fine"; any other answer is reported inside an
 : <error> element of an otherwise empty <result>.
 :)
declare function mpdl-lucene:search($mpdlCollection, $queryStr) {
  let $parseMessage := mpdltext:lucene-parse-query($queryStr)
  return
    if ($parseMessage = '')
    then
      $mpdlCollection/.[
        ft:query(archimedes/text, $queryStr)
        or ft:query(echo:echo/echo:text, $queryStr)
        or ft:query(TEI:TEI/TEI:text, $queryStr)
      ]
    else
      <result>
        <error>{$parseMessage}</error>
        <size>0</size>
        <pages>0</pages>
        <pn>0</pn>
        <hits/>
      </result>
};

(: Full-text search inside one document; returns a single page of hits.
 :
 : Parameters:
 :   $mpdlCollectionName  'archimedes', 'echo' or 'tei'; selects the namespace of the
 :                        s and head elements that are searched. Any other value is
 :                        handled like 'archimedes' (elements in no namespace).
 :   $language            passed to the mpdltext query-form helpers; "zh" additionally
 :                        adds the output of mpdltext:get-big5-encoded-terms
 :   $document            node(s) whose descendant s/head elements are searched
 :   $queryType           'fulltext', 'fulltextMorph' or 'fulltextMorphLemma';
 :                        any other value yields an empty query
 :   $queryStr            Lucene query string
 :   $pn                  page number, 1-based (page 1 starts at hit 1)
 :   $pageSize            number of hits per page
 :
 : Returns a <result> element. When mpdltext:lucene-parse-query answers with a
 : non-empty string, that string is returned in <error> and no search is run.
 :
 : Performance note: all hits (not only those of the requested page) pass through
 : the first for loop, so the expensive per-hit work is done only in the second
 : loop, which sees just the requested page.
 :)
declare function mpdl-lucene:search($mpdlCollectionName, $language, $document, $queryType, $queryStr, $pn as xs:int, $pageSize as xs:int) as node() {
  let $query := 
    if ($queryType = 'fulltext' or $queryType = 'fulltextMorph')
    then $queryStr
    else if ($queryType = 'fulltextMorphLemma')
    then concat('lemmalemma', $queryStr)
    else ()
  let $pageBreaks := $document//*[name() = 'pb']
  let $luceneParseResult := mpdltext:lucene-parse-query($queryStr)
  (: all matching s and head elements, in document order :)
  let $t := 
    if ($luceneParseResult != '')
    then ()
    else if ($mpdlCollectionName = 'archimedes')
    then $document//s[ft:query(., $query)]|$document//head[ft:query(., $query)]
    else if ($mpdlCollectionName = 'echo')
    then $document//echo:s[ft:query(., $query)]|$document//echo:head[ft:query(., $query)]
    else if ($mpdlCollectionName = 'tei')
    then $document//TEI:s[ft:query(., $query)]|$document//TEI:head[ft:query(., $query)]
    else $document//s[ft:query(., $query)]|$document//head[ft:query(., $query)]
    (: else $document//*[name() = 's' or name() = 'head'][ft:query(., $query)]  this would be much slower and would consume too much memory   :)
  (: 1-based positions of the first and last hit of the requested page :)
  let $from := ($pn * $pageSize) - $pageSize + 1
  let $to := $pn * $pageSize
  (: performance improvements: result set of 500 needs 3 sec., result set of 10 needs 0,7 sec.:)
  let $tempQueryResult := 
    for $ss at $poss in $t
    where $poss >= $from and $poss <= $to
    return $ss
  let $queryResult :=
    for $hit at $pos in $tempQueryResult
      let $hitType := local-name($hit)
      let $pnOfHit := count($pageBreaks[. << $hit])        (: faster: comparison only in pb elements of this document :)
      (: last page break before the hit; empty when the hit precedes every pb :)
      let $pb := subsequence($pageBreaks, $pnOfHit, 1)
      (: test if sentence surrounds page break; costs 0,1 sec performance :)
      let $pbPlus1 := subsequence($pageBreaks, $pnOfHit + 1, 1)
      (: NOTE(review): "=" compares the string values of parent and hit, not node identity - confirm this is intended :)
      let $hitSurroundsPB := 
        if ($pbPlus1/parent::node() = $hit and $pbPlus1 intersect $hit/descendant::node())
        then true()
        else false()
      (: position of the hit among same-named elements after the page break :)
      let $posOfHit :=           (: faster: comparison only in s or head elements of this document and only in a specific namespace :)
        if ($mpdlCollectionName = 'archimedes' and $hitType = 's')
        then count($pb/following::s[. << $hit]) + 1
        else if ($mpdlCollectionName = 'archimedes' and $hitType = 'head')
        then count($pb/following::head[. << $hit]) + 1
        else if ($mpdlCollectionName = 'echo' and $hitType = 's')
        then count($pb/following::echo:s[. << $hit]) + 1
        else if ($mpdlCollectionName = 'echo' and $hitType = 'head')
        then count($pb/following::echo:head[. << $hit]) + 1
        else if ($mpdlCollectionName = 'tei' and $hitType = 's')
        then count($pb/following::TEI:s[. << $hit]) + 1
        else if ($mpdlCollectionName = 'tei' and $hitType = 'head')
        then count($pb/following::TEI:head[. << $hit]) + 1
        else count($pb/following::s[. << $hit]) + 1
      (: position of the hit in the complete result set :)
      let $position := $from - 1 + $pos
      let $resultElem := 
        <hit>
          <hitType>{$hitType}</hitType>
          <pos>{$position}</pos>
          <pn>{$pnOfHit}</pn>
          <hitPos>{$posOfHit}</hitPos>
          <hitString>{string($hit)}</hitString>
          <hitSurroundsPB>{$hitSurroundsPB}</hitSurroundsPB>
        </hit>
    return $resultElem
  let $resultSize := count($t)
  (: ceiling division: "idiv ... + 1" reported one page too many whenever the
     number of hits was an exact multiple of the page size :)
  let $pages := 
    if ($resultSize = 0)
    then 0
    else ($resultSize + $pageSize - 1) idiv $pageSize
  let $queryForms := 
    if ($queryType = 'fulltextMorph' or $queryType = 'fulltextMorphLemma')
    then mpdltext:get-query-morph-forms($language, $queryStr)
    else ()
  let $queryRegularizations := 
    if ($queryType = 'fulltextMorph')
    then mpdltext:get-query-regularizations($language, $queryStr)
    else ()

  let $encodedQueryTerms := 
    if ($language = "zh")
    then 
      mpdltext:get-big5-encoded-terms($query)
    else ()

  let $result := 
    if ($luceneParseResult != '')
    then
      <result>
        <error>{$luceneParseResult}</error>
        <size>0</size>
        <pages>0</pages>
        <pn>0</pn>
        <hits/>
      </result>    
    else
      <result>
        <size>{$resultSize}</size>
        <page-size>{$pageSize}</page-size>
        <pages>{$pages}</pages>
        <pn>{$pn}</pn>
        <hits>
          {$queryResult}
        </hits>
        <query-forms>{$queryForms}</query-forms>
        <query-regularizations>{$queryRegularizations}</query-regularizations>
        {$encodedQueryTerms}
      </result>
   
   return $result
};

(: Metadata search on one or two attributes.
 :
 : $metadataStr              XQuery path expression (as a string) selecting the
 :                           metadata nodes to filter
 : $attribute1, $attrValue1  first attribute name and the value searched for
 : $boolOp                   "or", "and" or "andNot"; only used when $attrValue2
 :                           is not the empty string
 : $attribute2, $attrValue2  optional second attribute and value
 :
 : A predicate is assembled from the clauses delivered by
 : mpdl-lucene:getAttrQueryStr, appended to $metadataStr and run through util:eval.
 : Both values are checked with mpdltext:lucene-parse-query beforehand; when either
 : check answers with a non-empty string nothing is evaluated and a <result>
 : element carrying the message(s) in <error> is returned instead.
 :)
declare function mpdl-lucene:attrSearch($metadataStr, $attribute1, $attrValue1, $boolOp, $attribute2, $attrValue2) {
  let $firstClause := mpdl-lucene:getAttrQueryStr($attribute1, $attrValue1)
  let $secondClause := mpdl-lucene:getAttrQueryStr($attribute2, $attrValue2)
  let $predicate :=
    if ($attrValue2 = "")
    then $firstClause
    else if ($attrValue2 != "")
    then
      (: a second value is present: combine both clauses as requested :)
      if ($boolOp = "or")
      then concat($firstClause, " or ", $secondClause)
      else if ($boolOp = "and")
      then concat("(", $firstClause, ") and (", $secondClause, ")")
      else if ($boolOp = "andNot")
      then concat("(", $firstClause, ") and not(", $secondClause, ")")
      else ()
    else ()
  let $expression := concat($metadataStr, "/.[", $predicate, "]")
  let $message1 := mpdltext:lucene-parse-query($attrValue1)
  let $message2 := mpdltext:lucene-parse-query($attrValue2)
  let $message :=
    if ($message1 = '' and $message2 = '')
    then ''
    else if ($message1 != '' and $message2 = '')
    then concat("attribute ", $attribute1, " with value: ", $attrValue1, ": ", $message1)
    else if ($message1 = '' and $message2 != '')
    then concat("attribute ", $attribute2, " with value: ", $attrValue2, ": ", $message2)
    else if ($message1 != '' and $message2 != '')
    then concat("attribute ", $attribute1, " with value: ", $attrValue1, ": ", $message1, ", attribute ", $attribute2, " with value: ", $attrValue2, ": ", $message2)
    else ()
  return
    if ($message = '')
    then util:eval($expression)
    else
      <result>
        <error>{$message}</error>
        <size>0</size>
        <pages>0</pages>
        <pn>0</pn>
        <hits/>
      </result>
};

(: Sorts metadata nodes by one of their attributes.
 :
 : $metadata  nodes to sort; each must belong to a stored document, because the
 :            collection name is taken from the document URI (the path step two
 :            levels above the document itself)
 : $orderBy   attribute name as understood by mpdl-lucene:getElemNameByAttr
 :
 : The sort key is the lower-cased, ", "-joined string of the elements found for
 : that attribute in the metadata section of the node's document.
 :
 : Idea for a performance improvement (untested, kept from the original author):
 : sort directly with "for $x in doc(...) order by $x" and choose ascending or
 : descending from a $sortOrder parameter.
 :)
declare function mpdl-lucene:order($metadata, $orderBy) {
  for $item in $metadata
  let $itemDoc := root($item)
  let $uri := document-uri($itemDoc)
  let $collectionName :=
    functx:substring-after-last(
      functx:substring-before-last(
        functx:substring-before-last($uri, "/"),
        "/"),
      "/")
  let $metadataSection := mpdl-lucene:getMetadata($collectionName, $itemDoc)
  let $sortElemName := mpdl-lucene:getElemNameByAttr($collectionName, $orderBy)
  (: the dynamic lookup costs performance for many result elements :)
  let $sortElems := mpdl-lucene:getElemDynamic($metadataSection, $sortElemName)
  let $sortKey := lower-case(string-join($sortElems, ', '))
  order by $sortKey
  return $item
};

(: Returns the metadata section of a document.
 :
 : $docBase  'archimedes', 'echo' or 'tei'
 : $doc      node whose child is the root element of the document
 :
 : Yields archimedes/info, echo:echo/echo:metadata or TEI:TEI/TEI:teiHeader,
 : and the empty sequence for any other $docBase.
 :)
declare function mpdl-lucene:getMetadata($docBase, $doc) {
  if ($docBase = 'archimedes') then $doc/archimedes/info
  else if ($docBase = 'echo') then $doc/echo:echo/echo:metadata
  else if ($docBase = 'tei') then $doc/TEI:TEI/TEI:teiHeader
  else ()
};

(: Returns the text section of a document.
 :
 : $docBase  'archimedes', 'echo' or 'tei'
 : $doc      node whose child is the root element of the document
 :
 : Yields archimedes/text, echo:echo/echo:text or TEI:TEI/TEI:text,
 : and the empty sequence for any other $docBase.
 :)
declare function mpdl-lucene:getText($docBase, $doc) {
  if ($docBase = 'archimedes') then $doc/archimedes/text
  else if ($docBase = 'echo') then $doc/echo:echo/echo:text
  else if ($docBase = 'tei') then $doc/TEI:TEI/TEI:text
  else ()
};


(: Evaluates the relative path given in $elemStr against $path.
 :
 : $path     context node(s); referenced by name inside the evaluated expression,
 :           so this parameter must keep the name "path"
 : $elemStr  relative path expression as a string, e.g. an element name
 :
 : When $elemStr is not a non-empty string, the empty string is evaluated instead.
 :
 : TODO: performance improvement - util:eval is executed on every call.
 : NOTE(review): $elemStr is evaluated as XQuery code; it must never come from
 : untrusted input.
 :)
declare function mpdl-lucene:getElemDynamic($path, $elemStr) {
  util:eval(
    if ($elemStr != "")
    then concat("$path/", $elemStr)
    else ""
  )
};

(: Builds the predicate text that tests one attribute in all three document
 : flavours (archimedes, echo, tei).
 :
 : $attribute  attribute name as understood by mpdl-lucene:getElemNameByAttr
 : $attrValue  value searched for
 :
 : For every flavour that has an element name for the attribute one clause is
 : produced: an equality test for "date", an ft:query call for everything else.
 : The clauses are joined with " or ". The result is the empty string when no
 : flavour knows the attribute.
 :
 : The returned text is later compiled by util:eval, and the value is embedded in
 : a single-quoted XQuery string literal. Apostrophes and ampersands in the value
 : are therefore escaped here; without that a value such as O'Brien ended the
 : literal early and broke (or altered) the evaluated query.
 :)
declare function mpdl-lucene:getAttrQueryStr($attribute, $attrValue) {
  (: "&" first (it starts a character/entity reference), then "'" by doubling :)
  let $literalValue := replace(replace(string($attrValue), "&amp;", "&amp;amp;"), "'", "''")
  let $clauses :=
    for $docBase in ("archimedes", "echo", "tei")
    let $elemName := mpdl-lucene:getElemNameByAttr($docBase, $attribute)
    where $elemName != ""
    return
      if ($attribute = "date")
      then concat($elemName, " = '", $literalValue, "'")
      else concat("ft:query(", $elemName, ", '", $literalValue, "')")
  return string-join($clauses, " or ")
};

(: Maps a generic attribute name to the relative path of the metadata element
 : that holds it in a given document flavour.
 :
 : $docBase    "archimedes", "echo" or "tei"
 : $attribute  "author", "title", "place", "date", "language", "identifier",
 :             "rights", "license", "accessRights", "file", "translator" or
 :             "version"
 :
 : Returns the path as a string, or "" when the flavour has no element for the
 : attribute or when either argument is unknown.
 :)
declare function mpdl-lucene:getElemNameByAttr($docBase, $attribute) {
  (: one triple per attribute: (archimedes, echo, tei) :)
  let $names :=
    if ($attribute = "author")
    then ("author", "dcterms:creator", "TEI:fileDesc/TEI:titleStmt/TEI:author")
    else if ($attribute = "title")
    then ("title", "dcterms:title", "TEI:fileDesc/TEI:titleStmt/TEI:title")
    else if ($attribute = "place")
    then ("place", "", "TEI:fileDesc/TEI:publicationStmt/TEI:pubPlace")
    else if ($attribute = "date")
    then ("date", "dcterms:date", "TEI:fileDesc/TEI:publicationStmt/TEI:date")
    else if ($attribute = "language")
    then ("lang", "dcterms:language", "TEI:profileDesc/TEI:langUsage/TEI:language/@ident")
    else if ($attribute = "identifier")
    then ("locator", "dcterms:identifier", "identifier")
    else if ($attribute = "rights")
    then ("", "dcterms:rights", "rights")
    else if ($attribute = "license")
    then ("", "dcterms:license", "")
    else if ($attribute = "accessRights")
    then ("", "dcterms:accessRights", "accessRights")
    else if ($attribute = "file")
    then ("cvs_file", "", "")
    else if ($attribute = "translator")
    then ("translator", "", "")
    else if ($attribute = "version")
    then ("cvs_version", "", "")
    else ("", "", "")
  return
    if ($docBase = "archimedes") then $names[1]
    else if ($docBase = "echo") then $names[2]
    else if ($docBase = "tei") then $names[3]
    else ""
};

(: Returns the metadata element(s) holding a generic attribute.
 :
 : $metadata   metadata section of a document (archimedes info, echo metadata or
 :             TEI teiHeader, matching $docBase)
 : $docBase    "archimedes", "echo" or "tei"
 : $attribute  generic attribute name, e.g. "author", "title", "place", "date"
 :
 : For author, title, place, date, and the echo rights, accessRights and license
 : attributes, each source element is wrapped in a new element named after the
 : generic attribute. All other attributes return the source nodes unwrapped.
 : Combinations without a source element return the empty sequence.
 :
 : TEI "place" was missing here although mpdl-lucene:getElemNameByAttr maps it to
 : TEI:fileDesc/TEI:publicationStmt/TEI:pubPlace; it is now handled like the
 : archimedes place.
 :)
declare function mpdl-lucene:getElementsByAttr($metadata, $docBase, $attribute) {
  let $docBaseArch := "archimedes"
  let $docBaseEcho := "echo"
  let $docBaseTei := "tei"
  let $result :=
    if ($docBase = $docBaseArch and $attribute = "author")
    then 
      for $elem in $metadata/author
      return <author>{$elem}</author>
    else if ($docBase = $docBaseEcho and $attribute = "author")
    then 
      for $elem in $metadata/dcterms:creator
      return <author>{$elem}</author>
    else if ($docBase = $docBaseTei and $attribute = "author")
    then 
      for $elem in $metadata/TEI:fileDesc/TEI:titleStmt/TEI:author
      return <author>{$elem}</author>
    else if ($docBase = $docBaseArch and $attribute = "title")
    then 
      for $elem in $metadata/title
      return <title>{$elem}</title>
    else if ($docBase = $docBaseEcho and $attribute = "title")
    then 
      for $elem in $metadata/dcterms:title
      return <title>{$elem}</title>
    else if ($docBase = $docBaseTei and $attribute = "title")
    then 
      for $elem in $metadata/TEI:fileDesc/TEI:titleStmt/TEI:title
      return <title>{$elem}</title>
    else if ($docBase = $docBaseArch and $attribute = "place")
    then 
      for $elem in $metadata/place
      return <place>{$elem}</place>
    else if ($docBase = $docBaseEcho and $attribute = "place")
    then ()
    else if ($docBase = $docBaseTei and $attribute = "place")
    then 
      (: same path that getElemNameByAttr reports for TEI place :)
      for $elem in $metadata/TEI:fileDesc/TEI:publicationStmt/TEI:pubPlace
      return <place>{$elem}</place>
    else if ($docBase = $docBaseArch and $attribute = "date")
    then 
      for $elem in $metadata/date
      return <date>{$elem}</date>
    else if ($docBase = $docBaseEcho and $attribute = "date")
    then 
      for $elem in $metadata/dcterms:date
      return <date>{$elem}</date>
    else if ($docBase = $docBaseTei and $attribute = "date")
    then 
      for $elem in $metadata/TEI:fileDesc/TEI:publicationStmt/TEI:date
      return <date>{$elem}</date>
    else if ($docBase = $docBaseArch and $attribute = "language")
    then $metadata/lang
    else if ($docBase = $docBaseEcho and $attribute = "language")
    then $metadata/dcterms:language
    else if ($docBase = $docBaseTei and $attribute = "language")
    then $metadata/TEI:profileDesc/TEI:langUsage/TEI:language/@ident
    else if ($docBase = $docBaseArch and $attribute = "identifier")
    then $metadata/locator
    else if ($docBase = $docBaseEcho and $attribute = "identifier")
    then $metadata/dcterms:identifier
    else if ($docBase = $docBaseArch and $attribute = "rights")
    then ()
    else if ($docBase = $docBaseEcho and $attribute = "rights")
    then 
      for $elem in $metadata/dcterms:rights
      return <rights>{$elem}</rights>
    else if ($docBase = $docBaseTei and $attribute = "rights")
    then $metadata/TEI:fileDesc/TEI:publicationStmt/TEI:availability
    else if ($docBase = $docBaseArch and $attribute = "accessRights")
    then ()
    else if ($docBase = $docBaseEcho and $attribute = "accessRights")
    then 
      (: NOTE(review): wrapper is named "rights", not "accessRights" - kept as is, confirm against consumers :)
      for $elem in $metadata/dcterms:accessRights
      return <rights>{$elem}</rights>
    else if ($docBase = $docBaseTei and $attribute = "accessRights")
    then $metadata/TEI:fileDesc/TEI:publicationStmt/TEI:availability/@status
    else if ($docBase = $docBaseArch and $attribute = "license")
    then ()
    else if ($docBase = $docBaseEcho and $attribute = "license")
    then 
      for $elem in $metadata/dcterms:license
      return <license>{$elem}</license>
    else if ($docBase = $docBaseArch and $attribute = "file")
    then $metadata/cvs_file
    else if ($docBase = $docBaseEcho and $attribute = "file")
    then ()
    else if ($docBase = $docBaseArch and $attribute = "translator")
    then $metadata/translator
    else if ($docBase = $docBaseEcho and $attribute = "translator")
    then ()
    else if ($docBase = $docBaseArch and $attribute = "version")
    then $metadata/cvs_version
    else if ($docBase = $docBaseEcho and $attribute = "version")
    then ()
    else ()
    
  return $result
};