Mercurial > hg > mpdl-group
diff software/eXist/webapp/mpdl/interface/page-fragment.xql @ 7:5589d865af7a
Erstellung XQL/XSL Applikation
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 08 Feb 2011 15:16:46 +0100 |
parents | |
children | d2a1c14fde31 |
line wrap: on
line diff
xquery version "1.0";

(:
  page-fragment.xql

  Delivers a single page of an MPDL document. The page is cut out of the
  full document between two consecutive page breaks, optionally normalized
  and dictionarized, wrapped in a <result> element together with document
  metadata and image information, and finally transformed by an XSL
  stylesheet (or exported as PDF).

  Request parameters read by this script:
    document                 document URI, e.g. /archimedes/la/yourDoc.xml
    mode                     text | textPollux | gis | image | xml | pureXml
                             (default: image)
    pn                       page number (default: first page)
    pf                       explicit image file name overriding the derived one
    sn                       sentence number, passed through to the result
    highlightQuery           query whose word forms are reported as highlights
    characterNormalization   normalization names; "regPlusNorm" means "reg,norm"
    export                   "pdf" streams the page as a PDF
    options                  free option string; "withoutLBs" and
                             "withXmlNodeId" are evaluated here

  Error codes (10 or more is an error):
     0  ok
     1  ok, but the document has no page breaks and is treated as one page
    10  document not available
    11  page number out of range
    12  unknown view mode

  NOTE(review): the prefixes mpdldoc, mpdltext and response are used below
  but are not declared in this file; presumably they are modules registered
  in the eXist configuration. Confirm before moving this script.
:)

import module namespace mpdl-time = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/util/time" at "../util/time.xql";
import module namespace functx = "http://www.functx.com" at "../util/functx.xql";
import module namespace mpdl-lucene = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/lucene/search" at "../lucene/search.xql";
import module namespace mpdl-text = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/text" at "../text/all.xql";

declare namespace xlink="http://www.w3.org/1999/xlink";
declare namespace request = "http://exist-db.org/xquery/request";
declare namespace transform = "http://exist-db.org/xquery/transform";
declare namespace util = "http://exist-db.org/xquery/util";

declare namespace dcterms="http://purl.org/dc/terms";
declare namespace xhtml="http://www.w3.org/1999/xhtml";
declare namespace echo="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/";

(: ---------- request parameters ---------- :)
let $mpdlDocUri := request:get-parameter("document", "")
let $mode := request:get-parameter("mode", "image")

let $reqPN := number(request:get-parameter("pn", "-1"))
let $reqPF := request:get-parameter("pf", "")
let $reqSN := number(request:get-parameter("sn", "-1"))
let $highlightQuery := request:get-parameter("highlightQuery", "")
let $regCharNorm := request:get-parameter("characterNormalization", "")
let $tmpCharNorm := string-join($regCharNorm, ',')
let $charNorm :=
  if ($tmpCharNorm = "regPlusNorm")
  then "reg,norm"
  else $tmpCharNorm
let $reqExport := request:get-parameter("export", "")
let $options := string(request:get-parameter("options", ""))

(: ---------- mode classification (each is the same test the mode chains spelled out) ---------- :)
let $isTextMode := $mode = ("text", "textPollux")
let $isReadingMode := $mode = ("text", "textPollux", "gis")
let $isXmlMode := $mode = ("xml", "pureXml")
let $usesDigilib := $mode = ("image", "text", "textPollux", "gis")
let $isHtmlMode := $mode = ("text", "textPollux", "gis", "image", "xml")
let $isKnownMode := $mode = ("text", "textPollux", "gis", "image", "xml", "pureXml")

let $presentationPath := "/db/mpdl/presentation"

(: ---------- document location ---------- :)
(: docbase, language and document name are the three path steps of mpdlDocUri,
   e.g. /archimedes/la/yourDoc.xml gives archimedes, la, yourDoc :)
let $documentName := substring-before(substring-after(substring-after(substring-after($mpdlDocUri, "/"), "/"), "/"), ".")
let $language := substring-before(substring-after(substring-after($mpdlDocUri, "/"), "/"), "/")
let $docbase := substring-before(substring-after($mpdlDocUri, "/"), "/")
let $fullDocumentUri := concat('/db/mpdl/documents/morph', $mpdlDocUri)
let $currentTimeBegin := util:system-time()
let $documentAvailable := doc-available($fullDocumentUri)
let $document := doc($fullDocumentUri)
let $metadata :=
  if ($docbase = 'archimedes')
  then $document/archimedes/info
  else if ($docbase = 'echo')
  then $document/echo:echo/echo:metadata
  else ''

let $pageBreaks :=
  if ($docbase = 'archimedes')
  then $document//pb
  else if ($docbase = 'echo')
  then $document//echo:pb
  else $document//pb
let $countPagesTemp := count($pageBreaks)
(: a document without any page break is reported as having one page :)
let $countPages :=
  if ($countPagesTemp > 0)
  then $countPagesTemp
  else 1

(: for performance reasons: deliver count of gis places and toc/figure entries :)
let $gisPlaces :=
  if ($docbase = 'echo')
  then $document//echo:place
  else ()
let $countGisPlaces := count($gisPlaces)
let $tocEntries :=
  if ($docbase = 'echo')
  then $document//echo:div[@type = 'section' or @type = 'chapter']
  else ()
let $figureEntries :=
  if ($docbase = 'echo')
  then $document//echo:figure
  else if ($docbase = 'archimedes')
  then $document//figure
  else ()
let $countTocEntries := count($tocEntries)
let $countFigureEntries := count($figureEntries)

(: jump to first pn and sn hit in fulltext mode :)
let $pn :=
  if ($reqPN = -1)
  then 1
  else $reqPN
let $sn := $reqSN

(: 10 or more is an error.
   The page range test is written positively so that a non-numeric pn (NaN,
   for which every comparison is false) is rejected with 11 instead of
   slipping through. The mode test precedes the "no page break" case so that
   an unknown mode is reported for such documents as well. :)
let $errorCode :=
  if (not($documentAvailable))
  then 10
  else if ($countPagesTemp != 0 and not($pn > 0 and $pn <= $countPagesTemp))
  then 11
  else if (not($isKnownMode))
  then 12
  else if ($countPagesTemp = 0)
  then 1 (: if no page break is found then the document should have exactly one page :)
  else 0

(: $pb1 and $pb2 delimit the requested page; without page breaks the first
   two items returned by mpdl-lucene:getText are used instead :)
let $pb1 :=
  if ($errorCode = 0)
  then subsequence($pageBreaks, $pn, 1)
  else if ($errorCode = 1)
  then subsequence(mpdl-lucene:getText($docbase, $document), 1, 1)
  else ()
let $pb2 :=
  if ($errorCode = 0)
  then subsequence($pageBreaks, $pn + 1, 1)
  else if ($errorCode = 1)
  then subsequence(mpdl-lucene:getText($docbase, $document), 2, 1)
  else ()
let $pageHeader := string($pb1/@rhead)
let $pageNumberOrig := string($pb1/@o)

(: ---------- image locations ---------- :)
let $documentIdentifier :=
  if ($docbase = 'archimedes')
  then $metadata/locator
  else if ($docbase = 'echo')
  then $metadata/dcterms:identifier
  else $metadata/dcterms:identifier
let $echoDocIdentifier :=
  if ($documentIdentifier != '')
  then substring-before(substring-after($documentIdentifier, "ECHO:"), ".")
  else ''
let $echoURLZogilib := "http://echo.mpiwg-berlin.mpg.de/zogilib"
let $nausikaaURLScaler := "http://nausikaa2.rz-berlin.mpg.de/digitallibrary/servlet/Scaler"
let $nausikaaURLDlInfo := "http://nausikaa2.mpiwg-berlin.mpg.de/digitallibrary/dlInfo-xml.jsp"
let $nausikaaURLTexter := "http://nausikaa2.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter"
let $echoImageDir :=
  if ($docbase = 'archimedes')
  then string($metadata/echodir)
  else if ($docbase = 'echo')
  then string($metadata/echo:echodir)
  else ''
(: an explicit echodir in the metadata wins over the conventional directory :)
let $imagesDocDirectory :=
  if ($echoImageDir != '')
  then $echoImageDir
  else if ($docbase = 'archimedes')
  then concat("/permanent/archimedes/", $documentName)
  else if ($docbase = 'echo')
  then concat("/permanent/library/", $echoDocIdentifier)
  else ''
let $imagesDocDirectoryIndexMetaUrl :=
  if ($usesDigilib)
  then concat($nausikaaURLTexter, "?fn=", $imagesDocDirectory, "/index.meta")
  else ()
(: NOTE(review): the second argument 2000 looks like a timeout - confirm against mpdldoc:check-uri :)
let $digilibAvailable := mpdldoc:check-uri($imagesDocDirectoryIndexMetaUrl, 2000)
let $imagesDocDirectoryIndexMeta :=
  if ($usesDigilib and $digilibAvailable)
  then doc($imagesDocDirectoryIndexMetaUrl)
  else ()
let $pageImageDirectory := string($imagesDocDirectoryIndexMeta/resource/meta/texttool/image)
let $figuresImageDirectoryTemp := string($imagesDocDirectoryIndexMeta/resource/meta/texttool/figures)
let $figuresImageDirectory :=
  if ($figuresImageDirectoryTemp != '')
  then $figuresImageDirectoryTemp
  else concat(substring-before($pageImageDirectory, "pageimg"), "figures")
let $pageImageFileNameWithoutExtension :=
  if ($docbase = 'echo')
  then concat("/", string($pb1/@file))
  else ''
let $imageFileName :=
  if ($reqPF = '')
  then concat($imagesDocDirectory, "/", $pageImageDirectory, $pageImageFileNameWithoutExtension)
  else $reqPF
(: a literal ampersand in element content has to be written as &amp; :)
let $imageEcho := <image-echo>{$echoURLZogilib}?fn={$imageFileName}&amp;pn={$pn}</image-echo>
let $imageScaler := <image-scaler>{$nausikaaURLScaler}?fn={$imageFileName}&amp;pn={$pn}</image-scaler>

let $imageFileNameUrl := concat($nausikaaURLDlInfo, "?fn=", $imageFileName)
let $testImageResult :=
  if ($mode = 'image' and $digilibAvailable)
  then doc($imageFileNameUrl)
  else ()
let $testImageResultParamImgFn := string($testImageResult//parameter[@name='img.fn']/@value)
let $imageIsAvailable :=
  if ($testImageResultParamImgFn = '' and $reqPF = '')
  then 'false'
  else 'true'

let $positionOfFirstFigureAfterPB1 :=
  if ($docbase = 'archimedes')
  then count($pb1/following::figure[1]/preceding::figure) + 1
  else if ($docbase = 'echo')
  then count($pb1/following::echo:figure[1]/preceding::echo:figure) + 1
  else ()

(: ---------- page fragment ---------- :)
let $pageFragmentTmp :=
  if ($mode = "image" or $errorCode > 9)
  then ()
  else if ($isReadingMode or $isXmlMode)
  then util:get-fragment-between($pb1, $pb2, true())
  else ()
(: replace the soft hyphen (Unicode character 00AD) just before the line break by a normal hyphen;
   the soft hyphen is written as a character reference so that it cannot get lost as an invisible character :)
(: delete the hyphen just before the line break in case of options=withoutLBs :)
let $pageFragment :=
  if ($isTextMode and not(contains($options, "withoutLBs")) and contains($pageFragmentTmp, "&#xAD;<lb"))
  then replace($pageFragmentTmp, "&#xAD;<lb", "-<lb")
  else if ($isTextMode and contains($options, "withoutLBs") and contains($pageFragmentTmp, "-<lb"))
  then replace($pageFragmentTmp, "-<lb", "<lb")
  else $pageFragmentTmp
(: reading modes are normalized with 'reg,norm' when no normalization is requested;
   xml modes are only normalized on request :)
let $pageFragmentNormalized :=
  if ($mode = "image" or $errorCode > 9)
  then ()
  else if ($isReadingMode and $charNorm = "")
  then mpdltext:normalizeChars('reg,norm', $language, $pageFragment)
  else if ($isXmlMode and $charNorm = "")
  then $pageFragment
  else if (($isReadingMode or $isXmlMode) and $charNorm != "")
  then mpdltext:normalizeChars($charNorm, $language, $pageFragment)
  else ()
let $retPageFragment :=
  if ($mode = "image" or $errorCode > 9)
  then ()
  else if ($mode = "text" or $mode = "gis" or $isXmlMode)
  then $pageFragmentNormalized
  else if ($mode = "textPollux")
  then mpdltext:dictionarize($pageFragmentNormalized, $language)
  else ()
let $returnPageFragmentTmp := util:parse($retPageFragment) (: returns a valid xml document for that string :)

(: ---------- external elements stored for this page ---------- :)
let $externalElementsTmpTmp := mpdltext:externalObject("read", "element", "", $mpdlDocUri, string($pn), "", "", "")
let $externalElementsTmp :=
  if (not($externalElementsTmpTmp = ""))
  then util:parse($externalElementsTmpTmp)
  else ()
let $externalElements := $externalElementsTmp/result/element
let $containsExternalElements := exists($externalElements)
let $returnPageFragmentTmpp :=
  if (contains($options, "withXmlNodeId") or $containsExternalElements)
  then mpdl-text:insertNodeIdAttribute($returnPageFragmentTmp/*[1])
  else $returnPageFragmentTmp

let $returnPageFragment :=
  if ($containsExternalElements)
  then mpdl-text:insert($returnPageFragmentTmpp/*[1], $externalElements)
  else $returnPageFragmentTmpp

(: ---------- objects referenced from the page by anchors ---------- :)
(: for every anchor the first element of the document carrying the matching xlink:label is delivered :)
let $pageFigureAnchors := $returnPageFragment//anchor[@type = 'figure']
let $pageFigures :=
  for $pageFigureAnchor in $pageFigureAnchors
  let $href := string($pageFigureAnchor/@xlink:href)
  let $pageFigureTmp := $document//echo:figure[@xlink:label = $href]
  let $pageFigure := subsequence($pageFigureTmp, 1, 1)
  return
    $pageFigure
let $pageHandwrittenAnchors := $returnPageFragment//anchor[@type = 'handwritten']
let $pageHandwritten :=
  for $pageHandwrittenAnchor in $pageHandwrittenAnchors
  let $handwrittenHref := string($pageHandwrittenAnchor/@xlink:href)
  let $pageHandwrittenTmp := $document//echo:handwritten[@xlink:label = $handwrittenHref]
  let $pageHandwritten := subsequence($pageHandwrittenTmp, 1, 1)
  return
    $pageHandwritten
let $pageTableAnchors := $returnPageFragment//anchor[@type = 'table']
let $pageTables :=
  for $pageTableAnchor in $pageTableAnchors
  let $tableHref := string($pageTableAnchor/@xlink:href)
  let $pageTableTmp := $document//xhtml:table[@xlink:label = $tableHref]
  let $pageTable := subsequence($pageTableTmp, 1, 1)
  return
    $pageTable
let $pageNoteAnchors := $returnPageFragment//anchor[@type = 'note']
let $pageNotes :=
  if ($docbase = "echo")
  then
    for $pageNoteAnchor in $pageNoteAnchors
    let $noteHref := string($pageNoteAnchor/@xlink:href)
    let $pageNoteTmp := $document//echo:note[@xlink:label = $noteHref]
    let $pageNote := subsequence($pageNoteTmp, 1, 1)
    return
      $pageNote
  else
    $returnPageFragment//note

(: Metadata handling: only metadata of the selected document is scanned :)
let $identifier := $documentIdentifier
let $authors := mpdl-lucene:getElementsByAttr($metadata, $docbase, "author")
let $titles := mpdl-lucene:getElementsByAttr($metadata, $docbase, "title")
let $places := mpdl-lucene:getElementsByAttr($metadata, $docbase, "place")
let $date := mpdl-lucene:getElementsByAttr($metadata, $docbase, "date")
let $rights := mpdl-lucene:getElementsByAttr($metadata, $docbase, "rights")
let $accessRights := mpdl-lucene:getElementsByAttr($metadata, $docbase, "accessRights")
let $licenses := mpdl-lucene:getElementsByAttr($metadata, $docbase, "license")
let $file := mpdl-lucene:getElementsByAttr($metadata, $docbase, "file")
let $translator := mpdl-lucene:getElementsByAttr($metadata, $docbase, "translator")
let $version := mpdl-lucene:getElementsByAttr($metadata, $docbase, "version")

(: ---------- highlighting: morphological forms and regularizations joined by '|' ---------- :)
let $highlightQueryWordsTemp :=
  if ($highlightQuery != '')
  then mpdltext:get-query-morph-forms($language, $highlightQuery)
  else ''
let $highlightQueryRegularizations :=
  if ($highlightQuery != '')
  then mpdltext:get-query-regularizations($language, $highlightQuery)
  else ''
let $highlightQueryWords :=
  if ($highlightQueryWordsTemp != '' and $highlightQueryRegularizations = '')
  then $highlightQueryWordsTemp
  else if ($highlightQueryWordsTemp = '' and $highlightQueryRegularizations != '')
  then $highlightQueryRegularizations
  else if ($highlightQueryWordsTemp != '' and $highlightQueryRegularizations != '')
  then concat($highlightQueryWordsTemp, '|', $highlightQueryRegularizations)
  else ()

let $currentTimeEnd := util:system-time()
let $neededTime := mpdl-time:duration-as-ms($currentTimeEnd - $currentTimeBegin)

(: ---------- result document handed to the stylesheet ---------- :)
let $xmlResult :=
  if ($errorCode < 10)
  then
    <result>
      <document-description>
        <uri>{$mpdlDocUri}</uri>
        <collection-name>{$docbase}</collection-name>
        <document-name>{$documentName}</document-name>
        <language>{$language}</language>
        <authors>{$authors}</authors>
        <titles>{$titles}</titles>
        <places>{$places}</places>
        <date>{$date}</date>
        <identifier>{$identifier}</identifier>
        <rights>{$rights}</rights>
        <accessRights>{$accessRights}</accessRights>
        <licenses>{$licenses}</licenses>
        <file>{$file}</file>
        <translator>{$translator}</translator>
        <version>{$version}</version>
        <count-pages>{$countPages}</count-pages>
        <count-places>{$countGisPlaces}</count-places>
        <count-toc-entries>{$countTocEntries}</count-toc-entries>
        <count-figure-entries>{$countFigureEntries}</count-figure-entries>
      </document-description>
      <page>
        <mode>{$mode}</mode>
        <number>{$pn}</number>
        <header>{$pageHeader}</header>
        <number-orig>{$pageNumberOrig}</number-orig>
        <sentence-number>{$sn}</sentence-number>
        <digilib-available>{$digilibAvailable}</digilib-available>
        <image-available>{$imageIsAvailable}</image-available>
        <image-file-name>{$imageFileName}</image-file-name>
        {$imageEcho}
        {$imageScaler}
        <xml-url>?document={$documentName}&amp;pn={$pn}&amp;mode=xml</xml-url>
        <page-image-directory>{$imagesDocDirectory}/{$pageImageDirectory}</page-image-directory>
        <figures-image-directory>{$imagesDocDirectory}/{$figuresImageDirectory}</figures-image-directory>
        <firstFigurePosition>{$positionOfFirstFigureAfterPB1}</firstFigurePosition>
        <figures>{$pageFigures}</figures>
        <handwritten>{$pageHandwritten}</handwritten>
        <tables>{$pageTables}</tables>
        <notes>{$pageNotes}</notes>
        <highlights>
          <query>{$highlightQuery}</query>
          <words>{$highlightQueryWords}</words>
        </highlights>
        <content>{$returnPageFragment}</content>
        <character-normalization>{$charNorm}</character-normalization>
        <options>{$options}</options>
      </page>
      <performance>{$neededTime}</performance>
    </result>
  else if ($errorCode = 10)
  then <error><number>{$errorCode}</number><description>Fulltext document: {$mpdlDocUri} is not available yet</description></error>
  else if ($errorCode = 11)
  then <error><number>{$errorCode}</number><description>No result: Page {$pn} not found</description></error>
  else if ($errorCode = 12)
  then <error><number>{$errorCode}</number><description>View mode {$mode} not available</description></error>
  else <error><number>{$errorCode}</number><description>undefined error: {$errorCode}</description></error>

(: ---------- serialization and transformation ---------- :)
(: errors and the HTML modes are serialized as XHTML, everything else (pureXml) as XML :)
let $declare :=
  if ($errorCode > 9 or $isHtmlMode)
  then util:declare-option("exist:serialize", "method=xhtml media-type=text/html omit-xml-declaration=no indent=yes encoding=utf-8")
  else util:declare-option("exist:serialize", "method=xml media-type=text/xml omit-xml-declaration=no indent=yes encoding=utf-8")
let $xslFilePath :=
  if ($isHtmlMode)
  then concat($presentationPath, "/pageFragmentHtml.xsl")
  else concat($presentationPath, "/pageXml.xsl")

let $titleStr := concat(string-join($authors, ', '), ". ", string-join($titles, ', '), ". ", string-join($places, ', '), " ", $date, ".")
let $tmpResult :=
  if ($errorCode < 10 and $reqExport = "pdf")
  then mpdl-text:html2pdf($language, $xmlResult, $xslFilePath, $titleStr, $pn, $mode)
  else if ($errorCode < 10 and not($reqExport = "pdf"))
  then mpdl-text:transform($xmlResult, $xslFilePath)
  else
    <div>{$xmlResult}</div> (: error xml result :)
let $result :=
  if ($errorCode < 10 and $reqExport = "pdf")
  then response:stream-binary($tmpResult, "application/pdf", concat($documentName, "-page", $pn, ".pdf"))
  else $tmpResult

(: bound only for its side effect on the HTTP response; the variable itself is not used :)
let $setHeader :=
  if ($mode = "pureXml")
  then response:set-header('Content-Disposition', concat('filename=', $documentName, '-page', $pn))
  else ()

return $result