diff software/eXist/webapp/mpdl/pq.xql @ 7:5589d865af7a

Erstellung XQL/XSL Applikation
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 08 Feb 2011 15:16:46 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/webapp/mpdl/pq.xql	Tue Feb 08 15:16:46 2011 +0100
@@ -0,0 +1,439 @@
+xquery version "1.0";
+
+import module namespace mpdl-time = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/util/time" at "util/time.xql";
+import module namespace functx = "http://www.functx.com" at "util/functx.xql";
+import module namespace mpdl-lucene = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/lucene/search" at "lucene/search.xql";
+import module namespace mpdl-text = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/text" at "text/all.xql";
+
+declare namespace xlink="http://www.w3.org/1999/xlink";
+declare namespace request = "http://exist-db.org/xquery/request";
+declare namespace transform = "http://exist-db.org/xquery/transform";
+declare namespace util = "http://exist-db.org/xquery/util";
+
+declare namespace dcterms="http://purl.org/dc/terms";
+declare namespace xhtml="http://www.w3.org/1999/xhtml";
+declare namespace echo="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/";
+
+
+(: ---- Request parameters ---- :)
+(: "document" is a path such as /archimedes/la/yourDoc.xml :)
+let $mpdlDocUri := request:get-parameter("document", "")
+let $mode := request:get-parameter("mode", "image")
+let $queryType := request:get-parameter("query-type", "")
+let $query := request:get-parameter("query", "")
+let $reqExport := request:get-parameter("export", "")
+let $options := string(request:get-parameter("options", ""))
+(: "pf" overrides the computed page image file name when it is not empty :)
+let $reqPF := request:get-parameter("pf", "")
+(: page number and sentence number; both default to -1 when not supplied :)
+let $reqPN := number(request:get-parameter("pn", "-1"))
+let $reqSN := number(request:get-parameter("sn", "-1"))
+(: page of the query result; a missing value or "0" selects page 1 :)
+let $reqQueryResultPN := request:get-parameter("query-result-pn", "")
+let $queryResultPN :=
+  if ($reqQueryResultPN = ('', '0'))
+  then 1
+  else number($reqQueryResultPN)
+(: all "characterNormalization" values are joined with commas;
+   the single value "regPlusNorm" is expanded to "reg,norm" :)
+let $charNormJoined := string-join(request:get-parameter("characterNormalization", ""), ',')
+let $charNorm :=
+  if ($charNormJoined != "regPlusNorm")
+  then $charNormJoined
+  else "reg,norm"
+
+(: collection that holds the XSL stylesheets used further down :)
+let $presentationPath := "/db/mpdl/presentation"
+(: split the document path, e.g. /archimedes/la/yourDoc.xml, into
+   docbase ("archimedes"), language ("la") and document name ("yourDoc") :)
+let $uriAfterRoot := substring-after($mpdlDocUri, "/")
+let $docbase := substring-before($uriAfterRoot, "/")
+let $uriAfterDocbase := substring-after($uriAfterRoot, "/")
+let $language := substring-before($uriAfterDocbase, "/")
+let $documentName := substring-before(substring-after($uriAfterDocbase, "/"), ".")
+(: query types fulltext and ftIndex read from the "standard" collection,
+   every other query type (including none) from the "morph" collection :)
+let $documentsRoot :=
+  if ($queryType = ('fulltext', 'ftIndex'))
+  then '/db/mpdl/documents/standard'
+  else '/db/mpdl/documents/morph'
+let $fullDocumentUri := concat($documentsRoot, $mpdlDocUri)
+(: start of the time span that is reported in <performance> :)
+let $currentTimeBegin := util:system-time()
+let $documentAvailable := doc-available($fullDocumentUri)
+let $document := doc($fullDocumentUri)
+(: metadata container of the document. For an unknown docbase this is the
+   empty sequence rather than the string '': path steps such as
+   $metadata/dcterms:identifier are applied to it later, and a path step on
+   an atomic value is a type error, whereas on the empty sequence it simply
+   selects nothing :)
+let $metadata := 
+  if ($docbase = 'archimedes')
+  then $document/archimedes/info
+  else if ($docbase = 'echo')
+  then $document/echo:echo/echo:metadata
+  else ()
+
+(: page break elements: echo documents use the echo namespace, all others plain pb :)
+let $pageBreaks := 
+  if ($docbase = 'echo')
+  then $document//echo:pb
+  else $document//pb
+(: $countPagesTemp is the real number of page breaks (0 is possible);
+   $countPages is what gets reported and is never below 1 :)
+let $countPagesTemp := count($pageBreaks)
+let $countPages := max(($countPagesTemp, 1))
+
+(: XQuery inline execution does not work in a module, so it has to be done here.
+   SECURITY NOTE(review): $query is taken from the request and evaluated as code
+   by util:eval-inline - confirm that access to this script is restricted. :)
+let $xQueryPageSize := 100
+(: The type test is parenthesised because "and" binds tighter than "or";
+   without the parentheses an xpath request with an empty query was evaluated. :)
+let $xQueryResultEval := 
+  if (($queryType = 'xpath' or $queryType = 'xquery') and $query != "")
+  then util:eval-inline($document, $query)
+  else ()
+(: 1-based positions of the first and the last entry of the requested result page :)
+let $xQueryFrom := ($queryResultPN * $xQueryPageSize) - $xQueryPageSize + 1
+let $xQueryTo := $queryResultPN * $xQueryPageSize
+let $xQueryResultEntries := 
+  for $entry at $pos in $xQueryResultEval
+  where $pos >= $xQueryFrom and $pos <= $xQueryTo
+  return $entry
+let $xQuerySize := count($xQueryResultEval)
+(: number of result pages, rounded up: 0 entries give 0 pages, 1 to 100 give 1,
+   101 to 200 give 2 (the former "idiv ... + 1" reported 2 pages for exactly 100) :)
+let $xQueryPages := ($xQuerySize + $xQueryPageSize - 1) idiv $xQueryPageSize
+let $xQueryResult := 
+      <result>
+        <size>{$xQuerySize}</size>
+        <page-size>{$xQueryPageSize}</page-size>
+        <pages>{$xQueryPages}</pages>
+        <pn>{$queryResultPN}</pn>
+        <hits>{$xQueryResultEntries}</hits>
+      </result>  
+
+(: Select the query result by query type:
+     fulltext, fulltextMorph, fulltextMorphLemma  -> mpdl-lucene:search (page size 10)
+     ftIndex, ftIndexMorph                        -> mpdl-text:indexTerms (page size 100)
+     xpath, xquery                                -> the inline result built in this script
+     toc, figures                                 -> mpdl-text:get-toc (page size 100, no query needed)
+   An empty query yields an empty <result>; anything else yields nothing.
+   The xpath/xquery test is parenthesised because "and" binds tighter than "or";
+   otherwise an xpath request with an empty query would not reach the
+   empty-result branch. :)
+let $queryResult := 
+  if (($queryType = 'fulltext' or $queryType = 'fulltextMorph' or $queryType = 'fulltextMorphLemma') and $query != "")
+  then mpdl-lucene:search($docbase, $language, $document, $queryType, $query, $queryResultPN, 10)
+  else if (($queryType = 'ftIndex' or $queryType = 'ftIndexMorph') and $query != "")
+  then mpdl-text:indexTerms($docbase, $language, $document, $query, $queryResultPN, 100)
+  else if (($queryType = 'xpath' or $queryType = 'xquery') and $query != "")
+  then $xQueryResult
+  else if ($queryType = 'toc' or $queryType = 'figures')
+  then mpdl-text:get-toc($docbase, $queryType, $document, $queryResultPN, 100)
+  else if ($query = "")
+  then 
+      <result>
+        <size>0</size>
+        <pages>0</pages>
+        <pn>0</pn>
+        <hits/>
+      </result>
+  else ()
+
+(: NOTE(review): the results built in this script are themselves <result>
+   elements, so the child step "result" below only matches if the search
+   function returns a node that has <result> as a child - confirm. :)
+let $countHits := count($queryResult/result/hits/hit)
+let $firstHit := $queryResult/result/hits/hit[1]
+(: in a fulltext query with hits and without an explicit page number,
+   jump to the page (and sentence) of the first hit :)
+let $isFulltextQuery := $queryType = ('fulltext', 'fulltextMorph', 'fulltextMorphLemma')
+let $jumpToFirstHit := $isFulltextQuery and $countHits > 0 and $reqPN <= 0
+let $pn := 
+  if ($jumpToFirstHit)
+  then number($firstHit/pn)
+  else if ($reqPN = -1)
+  then 1
+  else $reqPN
+let $sn := 
+  if ($jumpToFirstHit and $reqSN < 0)
+  then number($firstHit/pos-of-s)
+  else $reqSN
+
+(: Error classification; 10 or more is an error.
+      0: regular page of a document with page breaks
+      1: no page break found - the document is treated as exactly one page
+     10: document not available
+     11: page number outside the available pages, or not a number
+     12: unknown view mode
+   The page test is written as not(... and ...) so that a NaN page number
+   (non-numeric "pn" parameter) is rejected as well: every comparison with
+   NaN is false. The mode test comes before the "no page break" case so that
+   an unknown mode is also reported for documents without page breaks. :)
+let $errorCode := 
+  if (not($documentAvailable))
+  then 10
+  else if ($countPagesTemp != 0 and not($pn > 0 and $pn <= $countPagesTemp))
+  then 11 
+  else if (not($mode = ("text", "textPollux", "gis", "image", "xml", "pureXml")))
+  then 12
+  else if ($countPagesTemp = 0) 
+  then 1
+  else 0
+
+(: $pb1 is the anchor that opens the requested page, $pb2 the one after it.
+   Error code 0: both are taken from the page breaks, starting at $pn.
+   Error code 1 (no page breaks): the first two items returned by
+   mpdl-lucene:getText are used instead. Any other code: no anchors. :)
+let $pageAnchorSource :=
+  if ($errorCode = 0)
+  then $pageBreaks
+  else if ($errorCode = 1)
+  then mpdl-lucene:getText($docbase, $document)
+  else ()
+let $pageAnchorStart :=
+  if ($errorCode = 0)
+  then $pn
+  else 1
+let $pb1 := subsequence($pageAnchorSource, $pageAnchorStart, 1)
+let $pb2 := subsequence($pageAnchorSource, $pageAnchorStart + 1, 1)
+(: running head and original page number as recorded on the opening anchor :)
+let $pageHeader := string($pb1/@rhead)
+let $pageNumberOrig := string($pb1/@o)
+
+(: external services used for page images :)
+let $echoURLZogilib := "http://echo.mpiwg-berlin.mpg.de/zogilib"
+let $nausikaaURLScaler := "http://nausikaa2.rz-berlin.mpg.de/digitallibrary/servlet/Scaler"
+let $nausikaaURLDlInfo := "http://nausikaa2.mpiwg-berlin.mpg.de/digitallibrary/dlInfo-xml.jsp"
+let $nausikaaURLTexter := "http://nausikaa2.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter"
+
+(: document identifier: <locator> for archimedes, dcterms:identifier for everything else :)
+let $documentIdentifier :=
+  if ($docbase = 'archimedes')
+  then $metadata/locator
+  else $metadata/dcterms:identifier
+(: the part of the identifier between "ECHO:" and the next "." :)
+let $echoDocIdentifier := 
+  if ($documentIdentifier != '')
+  then substring-before(substring-after($documentIdentifier, "ECHO:"), ".")
+  else ''
+
+(: image directory as recorded in the metadata (may be empty) :)
+let $echoImageDir := 
+  if ($docbase = 'archimedes')
+  then string($metadata/echodir)
+  else if ($docbase = 'echo')
+  then string($metadata/echo:echodir)
+  else ''
+(: directory used when the metadata does not record one :)
+let $defaultImagesDocDirectory :=
+  if ($docbase = 'archimedes')
+  then concat("/permanent/archimedes/", $documentName)
+  else if ($docbase = 'echo')
+  then concat("/permanent/library/", $echoDocIdentifier)
+  else ''
+let $imagesDocDirectory :=
+  if ($echoImageDir != '')
+  then $echoImageDir
+  else $defaultImagesDocDirectory
+(: the index.meta URL is only built for the modes image, text, textPollux and gis :)
+let $modeNeedsIndexMeta := $mode = ("image", "text", "textPollux", "gis")
+let $imagesDocDirectoryIndexMetaUrl := 
+  if ($modeNeedsIndexMeta)
+  then concat($nausikaaURLTexter, "?fn=", $imagesDocDirectory, "/index.meta")
+  else ()
+let $digilibAvailable := mpdldoc:check-uri($imagesDocDirectoryIndexMetaUrl, ())
+let $imagesDocDirectoryIndexMeta := 
+  if ($modeNeedsIndexMeta and $digilibAvailable)
+  then doc($imagesDocDirectoryIndexMetaUrl)
+  else ()
+
+(: directories named in the texttool section of index.meta :)
+let $textToolMeta := $imagesDocDirectoryIndexMeta/resource/meta/texttool
+let $pageImageDirectory := string($textToolMeta/image)
+let $figuresImageDirectoryTemp := string($textToolMeta/figures)
+(: without an explicit figures directory, derive it from the page image
+   directory: everything before "pageimg", followed by "figures" :)
+let $figuresImageDirectory := 
+  if ($figuresImageDirectoryTemp = '')
+  then concat(substring-before($pageImageDirectory, "pageimg"), "figures")
+  else $figuresImageDirectoryTemp
+
+(: echo documents name the image file on the page break (@file) :)
+let $pageImageFileNameWithoutExtension := 
+  if ($docbase = 'echo')
+  then concat("/", string($pb1/@file))
+  else ''
+(: the request parameter "pf" wins over the computed file name :)
+let $imageFileName :=
+  if ($reqPF = '')
+  then concat($imagesDocDirectory, "/", $pageImageDirectory, $pageImageFileNameWithoutExtension)
+  else $reqPF
+let $imageEcho := <image-echo>{$echoURLZogilib}?fn={$imageFileName}&amp;pn={$pn}</image-echo>
+let $imageScaler := <image-scaler>{$nausikaaURLScaler}?fn={$imageFileName}&amp;pn={$pn}</image-scaler>
+
+(: in image mode, ask the dlInfo service about the image; the image counts as
+   available when the answer names an img.fn value or when "pf" was supplied :)
+let $imageFileNameUrl := concat($nausikaaURLDlInfo, "?fn=", $imageFileName)
+let $testImageResult := 
+  if ($mode = 'image' and $digilibAvailable)
+  then doc($imageFileNameUrl)
+  else ()
+let $testImageResultParamImgFn := string($testImageResult//parameter[@name='img.fn']/@value)
+let $imageIsAvailable := 
+  if ($testImageResultParamImgFn = '' and $reqPF = '')
+  then 'false'
+  else 'true'
+
+(: position, counted over the whole document, of the first figure after $pb1 :)
+let $positionOfFirstFigureAfterPB1 := 
+  if ($docbase = 'archimedes')
+  then count($pb1/following::figure[1]/preceding::figure) + 1
+  else if ($docbase = 'echo')
+  then count($pb1/following::echo:figure[1]/preceding::echo:figure) + 1
+  else ()
+
+(: Raw content between the two page anchors. Nothing is extracted in image
+   mode or when an error (code above 9) was detected. The result is handled
+   as a string by the steps below (contains, replace, util:parse). :)
+let $pageFragmentTmp := 
+  if ($mode = "image" or $errorCode > 9)
+  then ()
+  else if ($mode = "text" or $mode = "textPollux" or $mode = "gis" or $mode = "xml" or $mode = "pureXml")
+  then util:get-fragment-between($pb1, $pb2, true())
+  else ()
+(: Text modes only: replace the soft hyphen (Unicode character U+00AD) directly
+   before a line break by a normal hyphen. The first string literal in the two
+   lines below starts with that invisible character. :)
+(: With options=withoutLBs: delete the hyphen directly before the line break.
+   NOTE(review): this branch looks for the normal hyphen only, so a soft hyphen
+   before a line break stays in the text - confirm that this is intended. :)
+let $pageFragment :=
+  if (($mode = "text" or $mode = "textPollux") and not(contains($options, "withoutLBs")) and contains($pageFragmentTmp, "­<lb"))
+  then replace($pageFragmentTmp, "­<lb", "-<lb")
+  else if (($mode = "text" or $mode = "textPollux") and contains($options, "withoutLBs") and contains($pageFragmentTmp, "-<lb"))
+  then replace($pageFragmentTmp, "-<lb", "<lb")
+  else $pageFragmentTmp
+(: Character normalization. Without a requested normalization the modes text,
+   textPollux and gis use 'reg,norm', while xml and pureXml keep the fragment
+   unchanged; a requested normalization is applied in all five modes. :)
+let $pageFragmentNormalized := 
+  if ($mode = "image" or $errorCode > 9)
+  then ()
+  else if (($mode = "text" or $mode = "textPollux" or $mode = "gis") and $charNorm = "")
+  then mpdltext:normalizeChars('reg,norm', $language, $pageFragment)
+  else if (($mode = "xml" or $mode = "pureXml") and $charNorm = "")
+  then $pageFragment
+  else if (($mode = "text" or $mode = "textPollux" or $mode = "gis" or $mode = "xml" or $mode = "pureXml") and $charNorm != "")
+  then mpdltext:normalizeChars($charNorm, $language, $pageFragment)
+  else ()
+(: Mode textPollux additionally passes the fragment through mpdltext:dictionarize. :)
+let $retPageFragment := 
+  if ($mode = "image" or $errorCode > 9)
+  then ()
+  else if ($mode = "text" or $mode = "gis" or $mode = "xml" or $mode = "pureXml")
+  then $pageFragmentNormalized
+  else if ($mode = "textPollux")
+  then mpdltext:dictionarize($pageFragmentNormalized, $language)
+  else ()
+let $returnPageFragmentTmp := util:parse($retPageFragment)  (: returns a valid XML document for that string :)  
+
+(: Read the external elements recorded for this document and page; a
+   non-empty answer is parsed and its result/element children are used.
+   NOTE(review): the meaning of the empty arguments of
+   mpdltext:externalObject is not visible here - see that module. :)
+let $externalElementsTmpTmp := mpdltext:externalObject("read", "element", "", $mpdlDocUri, string($pn), "", "", "")
+let $externalElementsTmp := 
+  if(not($externalElementsTmpTmp = ""))
+  then util:parse($externalElementsTmpTmp)
+  else ()
+let $externalElements := $externalElementsTmp/result/element
+let $containsExternalElements := exists($externalElements)
+(: node ids are added on request (options contains "withXmlNodeId") and
+   whenever external elements have to be inserted :)
+let $returnPageFragmentTmpp := 
+  if (contains($options, "withXmlNodeId") or $containsExternalElements)
+  then mpdl-text:insertNodeIdAttribute($returnPageFragmentTmp/*[1])
+  else $returnPageFragmentTmp
+
+(: the page content, with the external elements inserted if there are any :)
+let $returnPageFragment := 
+  if($containsExternalElements)
+  then mpdl-text:insert($returnPageFragmentTmpp/*[1], $externalElements) 
+  else $returnPageFragmentTmpp
+
+(: Resolve the anchors of the page to the objects they point to: for every
+   anchor of a given type, take the first element in the document whose
+   xlink:label equals the xlink:href of the anchor. :)
+let $pageFigures :=
+  for $anchor in $returnPageFragment//anchor[@type = 'figure']
+  let $label := string($anchor/@xlink:href)
+  return ($document//echo:figure[@xlink:label = $label])[1]
+let $pageHandwritten :=
+  for $anchor in $returnPageFragment//anchor[@type = 'handwritten']
+  let $label := string($anchor/@xlink:href)
+  return ($document//echo:handwritten[@xlink:label = $label])[1]
+let $pageTables :=
+  for $anchor in $returnPageFragment//anchor[@type = 'table']
+  let $label := string($anchor/@xlink:href)
+  return ($document//xhtml:table[@xlink:label = $label])[1]
+(: echo documents reference their notes through anchors as well; for all
+   other documents the note elements of the page fragment are used directly :)
+let $pageNotes :=
+  if ($docbase = "echo")
+  then
+    for $anchor in $returnPageFragment//anchor[@type = 'note']
+    let $label := string($anchor/@xlink:href)
+    return ($document//echo:note[@xlink:label = $label])[1]
+  else
+    $returnPageFragment//note
+
+(: Metadata handling: only the metadata of the selected document is scanned.
+   Each field is looked up by name through mpdl-lucene:getElementsByAttr. :)
+let $identifier := $documentIdentifier
+let $authors := mpdl-lucene:getElementsByAttr($metadata, $docbase, "author")
+let $titles := mpdl-lucene:getElementsByAttr($metadata, $docbase, "title")
+let $places := mpdl-lucene:getElementsByAttr($metadata, $docbase, "place")
+let $date := mpdl-lucene:getElementsByAttr($metadata, $docbase, "date")
+let $rights := mpdl-lucene:getElementsByAttr($metadata, $docbase, "rights")
+let $accessRights := mpdl-lucene:getElementsByAttr($metadata, $docbase, "accessRights")
+let $licenses := mpdl-lucene:getElementsByAttr($metadata, $docbase, "license")
+let $file := mpdl-lucene:getElementsByAttr($metadata, $docbase, "file")
+let $translator := mpdl-lucene:getElementsByAttr($metadata, $docbase, "translator")
+let $version := mpdl-lucene:getElementsByAttr($metadata, $docbase, "version")
+
+(: End of the measured time span. It started before the document was loaded
+   and ends here, so the transformation further down is not included. :)
+let $currentTimeEnd := util:system-time()
+let $neededTime := mpdl-time:duration-as-ms($currentTimeEnd - $currentTimeBegin)
+
+(: The XML that is handed to the stylesheet. Error codes below 10 produce the
+   full <result> (document description, page, query, performance); codes 10,
+   11 and 12 produce an <error> with a number and a description, and any
+   other code an "undefined error".
+   NOTE(review): <xml-url> passes $documentName as "document", whereas this
+   script expects the full path (e.g. /archimedes/la/yourDoc.xml) in that
+   parameter - confirm how the stylesheet uses this value. :)
+let $xmlResult := 
+  if ($errorCode < 10)
+  then 
+    <result>
+      <document-description>
+        <uri>{$mpdlDocUri}</uri>
+        <collection-name>{$docbase}</collection-name>
+        <document-name>{$documentName}</document-name>
+        <language>{$language}</language>
+        <authors>{$authors}</authors>
+        <titles>{$titles}</titles>
+        <places>{$places}</places>
+        <date>{$date}</date>
+        <identifier>{$identifier}</identifier>
+        <rights>{$rights}</rights>
+        <accessRights>{$accessRights}</accessRights>
+        <licenses>{$licenses}</licenses>
+        <file>{$file}</file>
+        <translator>{$translator}</translator>
+        <version>{$version}</version>
+        <count-pages>{$countPages}</count-pages>
+      </document-description>
+      <page>
+        <mode>{$mode}</mode>
+        <number>{$pn}</number>
+        <sentence-number>{$sn}</sentence-number>
+        <header>{$pageHeader}</header>
+        <number-orig>{$pageNumberOrig}</number-orig>
+        <digilib-available>{$digilibAvailable}</digilib-available>
+        <image-available>{$imageIsAvailable}</image-available>
+        <image-file-name>{$imageFileName}</image-file-name>
+        {$imageEcho}
+        {$imageScaler}
+        <xml-url>?document={$documentName}&amp;pn={$pn}&amp;mode=xml</xml-url>
+        <page-image-directory>{$imagesDocDirectory}/{$pageImageDirectory}</page-image-directory>
+        <figures-image-directory>{$imagesDocDirectory}/{$figuresImageDirectory}</figures-image-directory>
+        <firstFigurePosition>{$positionOfFirstFigureAfterPB1}</firstFigurePosition>
+        <figures>{$pageFigures}</figures>
+        <handwritten>{$pageHandwritten}</handwritten>
+        <tables>{$pageTables}</tables>
+        <notes>{$pageNotes}</notes>
+        <content>{$returnPageFragment}</content>
+        <character-normalization>{$charNorm}</character-normalization>
+        <options>{$options}</options>
+      </page>
+      <query>
+        <type>{$queryType}</type>
+        <expression>{$query}</expression>
+        {$queryResult}
+      </query>
+      <performance>{$neededTime}</performance>
+    </result>
+  else if ($errorCode = 10)
+  then <error><number>{$errorCode}</number><description>Fulltext document: {$mpdlDocUri} is not available yet</description></error>
+  else if ($errorCode = 11)
+  then <error><number>{$errorCode}</number><description>No result: Page {$pn} not found</description></error>
+  else if ($errorCode = 12)
+  then <error><number>{$errorCode}</number><description>View mode {$mode} not available</description></error>
+  else <error><number>{$errorCode}</number><description>undefined error: {$errorCode}</description></error>  
+
+(: modes that are delivered as an HTML page; pureXml (and anything else) is delivered as XML :)
+let $isHtmlMode := $mode = ("text", "textPollux", "gis", "image", "xml")
+(: errors are always serialized as XHTML :)
+let $serializeOptions :=
+  if ($errorCode > 9 or $isHtmlMode)
+  then "method=xhtml media-type=text/html omit-xml-declaration=no indent=yes encoding=utf-8"
+  else "method=xml media-type=text/xml omit-xml-declaration=no indent=yes encoding=utf-8"
+let $declare := util:declare-option("exist:serialize", $serializeOptions)
+(: stylesheet: a PDF export uses the page fragment stylesheet regardless of the mode :)
+let $xslFileName :=
+  if ($reqExport = "pdf")
+  then "/pageFragmentHtml.xsl"
+  else if ($isHtmlMode)
+  then "/pageHtml.xsl"
+  else "/pageXml.xsl"
+let $xslFilePath := concat($presentationPath, $xslFileName)
+
+(: Title line handed to the PDF export: "authors. titles. places date."
+   $date is joined like the other fields, because concat() accepts at most
+   one item per argument and would fail if more than one date is returned. :)
+let $titleStr := concat(string-join($authors, ', '), ". ", string-join($titles, ', '), ". ", string-join($places, ', '), " ", string-join($date, ', '), ".")
+let $isPdfExport := $errorCode < 10 and $reqExport = "pdf"
+(: successful requests are rendered as PDF or through the stylesheet;
+   errors are returned as the error XML wrapped in a div :)
+let $tmpResult :=
+  if ($isPdfExport)
+  then mpdl-text:html2pdf($language, $xmlResult, $xslFilePath, $titleStr, $pn, $mode) 
+  else if ($errorCode < 10)
+  then mpdl-text:transform($xmlResult, $xslFilePath)
+  else 
+    <div>{$xmlResult}</div>  (:  error xml result  :)
+(: a PDF is streamed to the client as a binary under the name <document>-page<pn>.pdf :)
+let $result :=
+  if ($isPdfExport)
+  then response:stream-binary($tmpResult, "application/pdf", concat($documentName, "-page", $pn, ".pdf"))
+  else $tmpResult
+  
+(: Suggest a file name for pure XML output through the Content-Disposition header.
+   $documentName and $query originate from request parameters, so carriage
+   returns and line feeds are replaced by a blank first: they must not be able
+   to start a new response header line. :)
+let $headerSafeDocumentName := replace($documentName, '[\r\n]+', ' ')
+let $headerSafeQuery := replace($query, '[\r\n]+', ' ')
+let $setHeaderXmlFilename := 
+  if ($mode = "pureXml" and $queryType = "xpath" and $query != "")
+  then response:set-header('Content-Disposition', concat('filename=', $headerSafeDocumentName, '-xpath-result--', $headerSafeQuery, '--'))
+  else if ($mode = "pureXml" and $queryType = "xquery" and $query != "")
+  then response:set-header('Content-Disposition', concat('filename=', 'xquery-result'))
+  else if ($mode = "pureXml") 
+  then response:set-header('Content-Disposition', concat('filename=', $headerSafeDocumentName, '-page', $pn)) 
+  else ()
+
+return $result
\ No newline at end of file