Mercurial > hg > mpdl-group
diff software/eXist/webapp/mpdl/pq.xql @ 7:5589d865af7a
Erstellung XQL/XSL Applikation
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 08 Feb 2011 15:16:46 +0100 |
parents | |
children |
line wrap: on
line diff
xquery version "1.0";

(:~
 : pq.xql - page query for a single MPDL document.
 : (Added as software/eXist/webapp/mpdl/pq.xql, Tue Feb 08 15:16:46 2011 +0100.)
 :
 : Request parameters read below:
 :   document                 document URI, e.g. /archimedes/la/yourDoc.xml
 :   query-type               fulltext | fulltextMorph | fulltextMorphLemma |
 :                            ftIndex | ftIndexMorph | xpath | xquery | toc | figures
 :   query                    query expression for the given query-type
 :   query-result-pn          result page number ('' or '0' means 1)
 :   mode                     text | textPollux | gis | image (default) | xml | pureXml
 :   pn, sn                   requested page number / sentence number (default -1)
 :   pf                       explicit image file name; overrides the computed one
 :   characterNormalization   normalization names; "regPlusNorm" means "reg,norm"
 :   export                   "pdf" streams the page as PDF
 :   options                  free-form option string (withoutLBs, withXmlNodeId)
 :
 : Error codes (10 or more is an error, rendered as an <error> element):
 :    0  ok
 :    1  ok, document has no page breaks (handled as one single page)
 :   10  document not available
 :   11  requested page does not exist
 :   12  unknown view mode
 :
 : NOTE(review): the prefixes mpdltext:, mpdldoc: and response: are used but not
 : declared in this file - presumably they are modules registered in the eXist
 : configuration; confirm.
 :)

import module namespace mpdl-time = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/util/time" at "util/time.xql";
import module namespace functx = "http://www.functx.com" at "util/functx.xql";
import module namespace mpdl-lucene = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/lucene/search" at "lucene/search.xql";
import module namespace mpdl-text = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/text" at "text/all.xql";

declare namespace xlink="http://www.w3.org/1999/xlink";
declare namespace request = "http://exist-db.org/xquery/request";
declare namespace transform = "http://exist-db.org/xquery/transform";
declare namespace util = "http://exist-db.org/xquery/util";

declare namespace dcterms="http://purl.org/dc/terms";
declare namespace xhtml="http://www.w3.org/1999/xhtml";
declare namespace echo="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/";


(: ---------- request parameters ---------- :)
let $mpdlDocUri := request:get-parameter("document", "")
let $queryType := request:get-parameter("query-type", "")
let $mode := request:get-parameter("mode", "image")
let $reqPN := number(request:get-parameter("pn", "-1"))
let $reqPF := request:get-parameter("pf", "")
let $reqSN := number(request:get-parameter("sn", "-1"))
let $query := request:get-parameter("query", "")
let $reqQueryResultPN := request:get-parameter("query-result-pn", "")
let $queryResultPN :=
    if ($reqQueryResultPN = '' or $reqQueryResultPN = '0')
    then 1
    else number($reqQueryResultPN)
let $regCharNorm := request:get-parameter("characterNormalization", "")
let $tmpCharNorm := string-join($regCharNorm, ',')
let $charNorm :=
    if ($tmpCharNorm = "regPlusNorm")
    then "reg,norm"
    else $tmpCharNorm
let $reqExport := request:get-parameter("export", "")
let $options := string(request:get-parameter("options", ""))
let $withoutLBs := contains($options, "withoutLBs")

(: ---------- classification of query type and view mode ---------- :)
let $isFulltextQuery := $queryType = ('fulltext', 'fulltextMorph', 'fulltextMorphLemma')
let $isIndexQuery := $queryType = ('ftIndex', 'ftIndexMorph')
(: parenthesised on purpose: "and" binds tighter than "or", so the former
   unparenthesised test evaluated an empty query for query-type=xpath :)
let $isXQueryRequest := ($queryType = 'xpath' or $queryType = 'xquery') and $query != ""
let $isHyphenMode := $mode = ('text', 'textPollux')
let $isTextLikeMode := $mode = ('text', 'textPollux', 'gis')
let $isXmlMode := $mode = ('xml', 'pureXml')
let $isImageOrTextMode := $mode = ('image', 'text', 'textPollux', 'gis')
let $isHtmlMode := $mode = ('text', 'textPollux', 'gis', 'image', 'xml')
let $isKnownMode := $mode = ('text', 'textPollux', 'gis', 'image', 'xml', 'pureXml')

let $presentationPath := "/db/mpdl/presentation"
(: e.g. mpdlCollectionName is derived from mpdlDocUri: /archimedes/la/yourDoc.xml :)
let $documentName := substring-before(substring-after(substring-after(substring-after($mpdlDocUri, "/"), "/"), "/"), ".")
let $language := substring-before(substring-after(substring-after($mpdlDocUri, "/"), "/"), "/")
let $docbase := substring-before(substring-after($mpdlDocUri, "/"), "/")
(: plain fulltext/index queries use the "standard" collection, everything else "morph" :)
let $fullDocumentUri :=
    if ($queryType = 'fulltext' or $queryType = 'ftIndex')
    then concat('/db/mpdl/documents/standard', $mpdlDocUri)
    else concat('/db/mpdl/documents/morph', $mpdlDocUri)
let $currentTimeBegin := util:system-time()
let $documentAvailable := doc-available($fullDocumentUri)
let $document := doc($fullDocumentUri)
(: empty sequence (not '') for unknown docbases: the path steps applied to
   $metadata further down are a type error on a string :)
let $metadata :=
    if ($docbase = 'archimedes')
    then $document/archimedes/info
    else if ($docbase = 'echo')
    then $document/echo:echo/echo:metadata
    else ()

let $pageBreaks :=
    if ($docbase = 'echo')
    then $document//echo:pb
    else $document//pb
let $countPagesTemp := count($pageBreaks)
let $countPages :=
    if ($countPagesTemp > 0)
    then $countPagesTemp
    else 1

(: ---------- xpath / xquery evaluation ---------- :)
(: xQuery inline execution does not work in module so it has to be done here :)
(: NOTE(review): $query is taken straight from the request and evaluated; this
   lets any caller run arbitrary XQuery against the database. Restrict access
   to this endpoint or sandbox the evaluation. :)
let $xQueryPageSize := 100
let $xQueryResultEval :=
    if ($isXQueryRequest)
    then util:eval-inline($document, $query)
    else ()
let $xQueryFrom := ($queryResultPN * $xQueryPageSize) - $xQueryPageSize + 1
let $xQueryResultEntries := subsequence($xQueryResultEval, $xQueryFrom, $xQueryPageSize)
let $xQuerySize := count($xQueryResultEval)
(: ceiling division: exactly 100 hits are 1 page, not 2 :)
let $xQueryPages := ($xQuerySize + $xQueryPageSize - 1) idiv $xQueryPageSize
let $xQueryResult :=
    <result>
        <size>{$xQuerySize}</size>
        <page-size>{$xQueryPageSize}</page-size>
        <pages>{$xQueryPages}</pages>
        <pn>{$queryResultPN}</pn>
        <hits>{$xQueryResultEntries}</hits>
    </result>

let $queryResult :=
    if ($isFulltextQuery and $query != "")
    then mpdl-lucene:search($docbase, $language, $document, $queryType, $query, $queryResultPN, 10)
    else if ($isIndexQuery and $query != "")
    then mpdl-text:indexTerms($docbase, $language, $document, $query, $queryResultPN, 100)
    else if ($isXQueryRequest)
    then $xQueryResult
    else if ($queryType = 'toc' or $queryType = 'figures')
    then mpdl-text:get-toc($docbase, $queryType, $document, $queryResultPN, 100)
    else if ($query = "")
    then
        <result>
            <size>0</size>
            <pages>0</pages>
            <pn>0</pn>
            <hits/>
        </result>
    else ()

let $countHits := count($queryResult/result/hits/hit)
let $firstHit := $queryResult/result/hits/hit[1]
(: jump to first pn and sn hit in fulltext mode :)
let $pn :=
    if ($isFulltextQuery and $countHits > 0 and $reqPN <= 0)
    then number($firstHit/pn)
    else if ($reqPN = -1)
    then 1
    else $reqPN
let $sn :=
    if ($isFulltextQuery and $countHits > 0 and $reqPN <= 0 and $reqSN < 0)
    then number($firstHit/pos-of-s)
    else $reqSN

(: 10 or more is an error :)
let $errorCode :=
    if (not($documentAvailable))
    then 10
    (: "$pn != $pn" is true only for NaN, i.e. a non-numeric pn parameter :)
    else if ($countPagesTemp != 0 and ($pn > $countPagesTemp or $pn <= 0 or $pn != $pn))
    then 11
    (: checked before the "no page break" case so that an unknown mode is
       reported for documents without page breaks as well :)
    else if (not($isKnownMode))
    then 12
    else if ($countPagesTemp = 0)
    then 1 (: if no page break is found then the document should have exactly one page :)
    else 0

(: ---------- page boundaries ---------- :)
let $pb1 :=
    if ($errorCode = 0)
    then subsequence($pageBreaks, $pn, 1)
    else if ($errorCode = 1)
    then subsequence(mpdl-lucene:getText($docbase, $document), 1, 1)
    else ()
let $pb2 :=
    if ($errorCode = 0)
    then subsequence($pageBreaks, $pn + 1, 1)
    else if ($errorCode = 1)
    then subsequence(mpdl-lucene:getText($docbase, $document), 2, 1)
    else ()
let $pageHeader := string($pb1/@rhead)
let $pageNumberOrig := string($pb1/@o)

(: ---------- image locations ---------- :)
let $documentIdentifier :=
    if ($docbase = 'archimedes')
    then $metadata/locator
    else $metadata/dcterms:identifier
let $echoDocIdentifier :=
    if ($documentIdentifier != '')
    then substring-before(substring-after($documentIdentifier, "ECHO:"), ".")
    else ''
let $echoURLZogilib := "http://echo.mpiwg-berlin.mpg.de/zogilib"
let $nausikaaURLScaler := "http://nausikaa2.rz-berlin.mpg.de/digitallibrary/servlet/Scaler"
let $nausikaaURLDlInfo := "http://nausikaa2.mpiwg-berlin.mpg.de/digitallibrary/dlInfo-xml.jsp"
let $nausikaaURLTexter := "http://nausikaa2.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter"
let $echoImageDir :=
    if ($docbase = 'archimedes')
    then string($metadata/echodir)
    else if ($docbase = 'echo')
    then string($metadata/echo:echodir)
    else ''
let $imagesDocDirectory :=
    if ($echoImageDir != '')
    then $echoImageDir
    else if ($docbase = 'archimedes')
    then concat("/permanent/archimedes/", $documentName)
    else if ($docbase = 'echo')
    then concat("/permanent/library/", $echoDocIdentifier)
    else ''
let $imagesDocDirectoryIndexMetaUrl :=
    if ($isImageOrTextMode)
    then concat($nausikaaURLTexter, "?fn=", $imagesDocDirectory, "/index.meta")
    else ()
let $digilibAvailable := mpdldoc:check-uri($imagesDocDirectoryIndexMetaUrl, ())
let $imagesDocDirectoryIndexMeta :=
    if ($isImageOrTextMode and $digilibAvailable)
    then doc($imagesDocDirectoryIndexMetaUrl)
    else ()
let $pageImageDirectory := string($imagesDocDirectoryIndexMeta/resource/meta/texttool/image)
let $figuresImageDirectoryTemp := string($imagesDocDirectoryIndexMeta/resource/meta/texttool/figures)
let $figuresImageDirectory :=
    if ($figuresImageDirectoryTemp != '')
    then $figuresImageDirectoryTemp
    else concat(substring-before($pageImageDirectory, "pageimg"), "figures")
let $pageImageFileNameWithoutExtension :=
    if ($docbase = 'echo')
    then concat("/", string($pb1/@file))
    else ''
let $imageFileName :=
    if ($reqPF = '')
    then concat($imagesDocDirectory, "/", $pageImageDirectory, $pageImageFileNameWithoutExtension)
    else $reqPF
(: "&" has to be written as &amp; inside an element constructor :)
let $imageEcho := <image-echo>{$echoURLZogilib}?fn={$imageFileName}&amp;pn={$pn}</image-echo>
let $imageScaler := <image-scaler>{$nausikaaURLScaler}?fn={$imageFileName}&amp;pn={$pn}</image-scaler>

let $imageFileNameUrl := concat($nausikaaURLDlInfo, "?fn=", $imageFileName)
let $testImageResult :=
    if ($mode = 'image' and $digilibAvailable)
    then doc($imageFileNameUrl)
    else ()
let $testImageResultParamImgFn := string($testImageResult//parameter[@name='img.fn']/@value)
let $imageIsAvailable :=
    if ($testImageResultParamImgFn = '' and $reqPF = '')
    then 'false'
    else 'true'

let $positionOfFirstFigureAfterPB1 :=
    if ($docbase = 'archimedes')
    then count($pb1/following::figure[1]/preceding::figure) + 1
    else if ($docbase = 'echo')
    then count($pb1/following::echo:figure[1]/preceding::echo:figure) + 1
    else ()

(: ---------- page fragment ---------- :)
let $pageFragmentTmp :=
    if ($mode = "image" or $errorCode > 9)
    then ()
    else if ($isTextLikeMode or $isXmlMode)
    then util:get-fragment-between($pb1, $pb2, true())
    else ()
(: replace the soft hyphen (Unicode character for 00AD) just before the line break by a normal hyphen :)
(: delete the hyphen just before the line break in case of options=withoutLBs :)
(: The soft hyphen is written as &#xAD; because the literal character is invisible
   and easily lost. With withoutLBs both a soft and a normal hyphen are removed.
   NOTE(review): confirm that removing the soft hyphen is intended here too. :)
let $pageFragment :=
    if ($isHyphenMode and not($withoutLBs) and contains($pageFragmentTmp, "&#xAD;<lb"))
    then replace($pageFragmentTmp, "&#xAD;<lb", "-<lb")
    else if ($isHyphenMode and $withoutLBs and matches($pageFragmentTmp, "[&#xAD;\-]<lb"))
    then replace($pageFragmentTmp, "[&#xAD;\-]<lb", "<lb")
    else $pageFragmentTmp
(: text-like modes default to 'reg,norm'; xml modes are only normalized on request :)
let $pageFragmentNormalized :=
    if ($mode = "image" or $errorCode > 9)
    then ()
    else if ($isTextLikeMode and $charNorm = "")
    then mpdltext:normalizeChars('reg,norm', $language, $pageFragment)
    else if ($isXmlMode and $charNorm = "")
    then $pageFragment
    else if (($isTextLikeMode or $isXmlMode) and $charNorm != "")
    then mpdltext:normalizeChars($charNorm, $language, $pageFragment)
    else ()
let $retPageFragment :=
    if ($mode = "image" or $errorCode > 9)
    then ()
    else if ($mode = "text" or $mode = "gis" or $isXmlMode)
    then $pageFragmentNormalized
    else if ($mode = "textPollux")
    then mpdltext:dictionarize($pageFragmentNormalized, $language)
    else ()
let $returnPageFragmentTmp := util:parse($retPageFragment) (: returns a valid xml document for that string :)

(: ---------- external elements stored for this page ---------- :)
let $externalElementsTmpTmp := mpdltext:externalObject("read", "element", "", $mpdlDocUri, string($pn), "", "", "")
let $externalElementsTmp :=
    if (not($externalElementsTmpTmp = ""))
    then util:parse($externalElementsTmpTmp)
    else ()
let $externalElements := $externalElementsTmp/result/element
let $containsExternalElements := exists($externalElements)
let $returnPageFragmentTmpp :=
    if (contains($options, "withXmlNodeId") or $containsExternalElements)
    then mpdl-text:insertNodeIdAttribute($returnPageFragmentTmp/*[1])
    else $returnPageFragmentTmp

let $returnPageFragment :=
    if ($containsExternalElements)
    then mpdl-text:insert($returnPageFragmentTmpp/*[1], $externalElements)
    else $returnPageFragmentTmpp

(: ---------- objects referenced by anchors on this page ---------- :)
(: for every anchor the first element of the document with a matching xlink:label is taken :)
let $pageFigureAnchors := $returnPageFragment//anchor[@type = 'figure']
let $pageFigures :=
    for $pageFigureAnchor in $pageFigureAnchors
    let $figureHref := string($pageFigureAnchor/@xlink:href)
    return subsequence($document//echo:figure[@xlink:label = $figureHref], 1, 1)
let $pageHandwrittenAnchors := $returnPageFragment//anchor[@type = 'handwritten']
let $pageHandwritten :=
    for $pageHandwrittenAnchor in $pageHandwrittenAnchors
    let $handwrittenHref := string($pageHandwrittenAnchor/@xlink:href)
    return subsequence($document//echo:handwritten[@xlink:label = $handwrittenHref], 1, 1)
let $pageTableAnchors := $returnPageFragment//anchor[@type = 'table']
let $pageTables :=
    for $pageTableAnchor in $pageTableAnchors
    let $tableHref := string($pageTableAnchor/@xlink:href)
    return subsequence($document//xhtml:table[@xlink:label = $tableHref], 1, 1)
let $pageNoteAnchors := $returnPageFragment//anchor[@type = 'note']
let $pageNotes :=
    if ($docbase = "echo")
    then
        for $pageNoteAnchor in $pageNoteAnchors
        let $noteHref := string($pageNoteAnchor/@xlink:href)
        return subsequence($document//echo:note[@xlink:label = $noteHref], 1, 1)
    else
        $returnPageFragment//note

(: Metadata handling: only metadata of the selected document is scanned :)
let $identifier := $documentIdentifier
let $authors := mpdl-lucene:getElementsByAttr($metadata, $docbase, "author")
let $titles := mpdl-lucene:getElementsByAttr($metadata, $docbase, "title")
let $places := mpdl-lucene:getElementsByAttr($metadata, $docbase, "place")
let $date := mpdl-lucene:getElementsByAttr($metadata, $docbase, "date")
let $rights := mpdl-lucene:getElementsByAttr($metadata, $docbase, "rights")
let $accessRights := mpdl-lucene:getElementsByAttr($metadata, $docbase, "accessRights")
let $licenses := mpdl-lucene:getElementsByAttr($metadata, $docbase, "license")
let $file := mpdl-lucene:getElementsByAttr($metadata, $docbase, "file")
let $translator := mpdl-lucene:getElementsByAttr($metadata, $docbase, "translator")
let $version := mpdl-lucene:getElementsByAttr($metadata, $docbase, "version")

let $currentTimeEnd := util:system-time()
let $neededTime := mpdl-time:duration-as-ms($currentTimeEnd - $currentTimeBegin)

(: ---------- result document ---------- :)
let $xmlResult :=
    if ($errorCode < 10)
    then
        <result>
            <document-description>
                <uri>{$mpdlDocUri}</uri>
                <collection-name>{$docbase}</collection-name>
                <document-name>{$documentName}</document-name>
                <language>{$language}</language>
                <authors>{$authors}</authors>
                <titles>{$titles}</titles>
                <places>{$places}</places>
                <date>{$date}</date>
                <identifier>{$identifier}</identifier>
                <rights>{$rights}</rights>
                <accessRights>{$accessRights}</accessRights>
                <licenses>{$licenses}</licenses>
                <file>{$file}</file>
                <translator>{$translator}</translator>
                <version>{$version}</version>
                <count-pages>{$countPages}</count-pages>
            </document-description>
            <page>
                <mode>{$mode}</mode>
                <number>{$pn}</number>
                <sentence-number>{$sn}</sentence-number>
                <header>{$pageHeader}</header>
                <number-orig>{$pageNumberOrig}</number-orig>
                <digilib-available>{$digilibAvailable}</digilib-available>
                <image-available>{$imageIsAvailable}</image-available>
                <image-file-name>{$imageFileName}</image-file-name>
                {$imageEcho}
                {$imageScaler}
                <xml-url>?document={$documentName}&amp;pn={$pn}&amp;mode=xml</xml-url>
                <page-image-directory>{$imagesDocDirectory}/{$pageImageDirectory}</page-image-directory>
                <figures-image-directory>{$imagesDocDirectory}/{$figuresImageDirectory}</figures-image-directory>
                <firstFigurePosition>{$positionOfFirstFigureAfterPB1}</firstFigurePosition>
                <figures>{$pageFigures}</figures>
                <handwritten>{$pageHandwritten}</handwritten>
                <tables>{$pageTables}</tables>
                <notes>{$pageNotes}</notes>
                <content>{$returnPageFragment}</content>
                <character-normalization>{$charNorm}</character-normalization>
                <options>{$options}</options>
            </page>
            <query>
                <type>{$queryType}</type>
                <expression>{$query}</expression>
                {$queryResult}
            </query>
            <performance>{$neededTime}</performance>
        </result>
    else if ($errorCode = 10)
    then <error><number>{$errorCode}</number><description>Fulltext document: {$mpdlDocUri} is not available yet</description></error>
    else if ($errorCode = 11)
    then <error><number>{$errorCode}</number><description>No result: Page {$pn} not found</description></error>
    else if ($errorCode = 12)
    then <error><number>{$errorCode}</number><description>View mode {$mode} not available</description></error>
    else <error><number>{$errorCode}</number><description>undefined error: {$errorCode}</description></error>

(: ---------- serialization and presentation ---------- :)
(: bound only for its side effect: errors and all modes except pureXml are served as XHTML :)
let $declare :=
    if ($errorCode > 9 or $isHtmlMode)
    then util:declare-option("exist:serialize", "method=xhtml media-type=text/html omit-xml-declaration=no indent=yes encoding=utf-8")
    else util:declare-option("exist:serialize", "method=xml media-type=text/xml omit-xml-declaration=no indent=yes encoding=utf-8")
let $xslFilePath :=
    if ($reqExport = "pdf")
    then concat($presentationPath, "/pageFragmentHtml.xsl")
    else if ($isHtmlMode)
    then concat($presentationPath, "/pageHtml.xsl")
    else concat($presentationPath, "/pageXml.xsl")

let $titleStr := concat(string-join($authors, ', '), ". ", string-join($titles, ', '), ". ", string-join($places, ', '), " ", $date, ".")
let $tmpResult :=
    if ($errorCode < 10 and $reqExport = "pdf")
    then mpdl-text:html2pdf($language, $xmlResult, $xslFilePath, $titleStr, $pn, $mode)
    else if ($errorCode < 10)
    then mpdl-text:transform($xmlResult, $xslFilePath)
    else
        <div>{$xmlResult}</div> (: error xml result :)
let $result :=
    if ($errorCode < 10 and $reqExport = "pdf")
    then response:stream-binary($tmpResult, "application/pdf", concat($documentName, "-page", $pn, ".pdf"))
    else $tmpResult

(: bound only for its side effect: suggested download file name for pureXml.
   Line breaks are stripped from the user supplied query so that it cannot
   inject additional response headers. :)
let $setHeaderXmlFilename :=
    if ($mode = "pureXml" and $queryType = "xpath" and $query != "")
    then response:set-header('Content-Disposition', concat('filename=', $documentName, '-xpath-result--', replace($query, '[\r\n]+', ' '), '--'))
    else if ($mode = "pureXml" and $queryType = "xquery" and $query != "")
    then response:set-header('Content-Disposition', concat('filename=', 'xquery-result'))
    else if ($mode = "pureXml")
    then response:set-header('Content-Disposition', concat('filename=', $documentName, '-page', $pn))
    else ()

return $result