Mercurial > hg > mpdl-group
diff software/eXist/webapp/mpdl/presentation/functions-text.xsl @ 7:5589d865af7a
Erstellung XQL/XSL Applikation
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 08 Feb 2011 15:16:46 +0100 |
parents | |
children | 7e883ce72fec |
line wrap: on
line diff
<?xml version="1.0" encoding="UTF-8"?>
<!-- Source: software/eXist/webapp/mpdl/presentation/functions-text.xsl (added in changeset 7:5589d865af7a) -->
<!--
  String helper functions in the "text" namespace:
    * small utilities (nchars, sortWithComma, removeDuplicates, contained, trim, cut...)
    * translation of a Lucene query string into a "|" separated list of regular-expression terms
    * highlighting of term/word hits inside a string (wrapped into <span class="hit highlight">)
-->
<xsl:stylesheet version="2.0"
  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
  xmlns:xlink="http://www.w3.org/1999/xlink"
  xmlns:saxon="http://saxon.sf.net/"
  xmlns:functx="http://www.functx.com"
  xmlns:text="http://www.mpiwg-berlin.mpg.de/ns/mpdl/text"
  xmlns:xs="http://www.w3.org/2001/XMLSchema"
  exclude-result-prefixes="xlink saxon functx text xs">

<xsl:import href="/db/mpdl/presentation/functions-functx.xsl" />

<!-- Delivers a concatenation of $count copies of $char. -->
<!-- For $count less than or equal to 1 exactly one copy is returned. -->
<xsl:function name="text:nchars">
  <xsl:param name="count" as="xs:integer"/>
  <xsl:param name="char" as="xs:string"/>
  <!-- iterative form: no recursion depth proportional to $count -->
  <xsl:sequence select="string-join(for $i in 1 to max(($count, 1)) return $char, '')"/>
</xsl:function>

<!-- Takes a list of strings separated by "|", sorts them (functx:sort) and -->
<!-- returns them joined with ", ". An empty input delivers an empty result. -->
<xsl:function name="text:sortWithComma">
  <xsl:param name="inputString" as="xs:string"/>
  <xsl:variable name="sortedInputStrings" select="functx:sort(tokenize($inputString, '\|'))"/>
  <xsl:value-of select="string-join(for $s in $sortedInputStrings return string($s), ', ')"/>
</xsl:function>

<!-- Returns all strings of inputString1 which are not contained in inputString2. -->
<!-- inputString1, inputString2 and the result are lists of strings separated by "|" -->
<xsl:function name="text:removeDuplicates">
  <xsl:param name="inputString1" as="xs:string"/>
  <xsl:param name="inputString2" as="xs:string"/>
  <xsl:variable name="inputStrings1" select="tokenize($inputString1, '\|')"/>
  <xsl:variable name="inputStrings2" select="tokenize($inputString2, '\|')"/>
  <xsl:value-of select="string-join($inputStrings1[not(text:contained(., $inputStrings2))], '|')"/>
</xsl:function>

<!-- True if $arg is equal (compare() = 0, default collation) to at least one of $searchStrings. -->
<!-- An empty $arg or an empty $searchStrings delivers false. -->
<xsl:function name="text:contained" as="xs:boolean">
  <xsl:param name="arg" as="xs:string?"/>
  <xsl:param name="searchStrings" as="xs:string*"/>
  <xsl:sequence select="some $searchString in $searchStrings satisfies compare($arg, $searchString) = 0"/>
</xsl:function>

<!-- Clips the input string at its beginning: if it is longer than $cutLength only the last -->
<!-- $cutLength characters are kept, the (probably incomplete) first word of them is removed -->
<!-- and " (...) " is put in front. Shorter strings are returned unchanged. -->
<xsl:function name="text:cutStringBefore">
  <xsl:param name="inputString" as="xs:string"/>
  <xsl:param name="cutLength" as="xs:integer"/>
  <xsl:variable name="length" select="string-length($inputString)"/>
  <xsl:choose>
    <xsl:when test="$length &gt; $cutLength">
      <xsl:text> (...) </xsl:text>
      <!-- "+ 1": substring positions are 1-based, so exactly $cutLength characters remain -->
      <xsl:value-of select="text:cutFirstWord(substring($inputString, $length - $cutLength + 1))"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:value-of select="$inputString"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:function>

<!-- Clips the input string at its end: if it is longer than $cutLength only the first -->
<!-- $cutLength characters are kept, the (probably incomplete) last word of them is removed -->
<!-- and " (...) " is appended. Shorter strings are returned unchanged. -->
<xsl:function name="text:cutStringAfter">
  <xsl:param name="inputString" as="xs:string"/>
  <xsl:param name="cutLength" as="xs:integer"/>
  <xsl:variable name="length" select="string-length($inputString)"/>
  <xsl:choose>
    <xsl:when test="$length &gt; $cutLength">
      <!-- start position 1 (not 0): substring(s, 0, n) would deliver only n - 1 characters -->
      <xsl:value-of select="text:cutLastWord(substring($inputString, 1, $cutLength))"/>
      <xsl:text> (...) </xsl:text>
    </xsl:when>
    <xsl:otherwise>
      <!-- the whole string: a string of exactly $cutLength characters must not lose its last character -->
      <xsl:value-of select="$inputString"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:function>

<!-- Replaces everything up to and including the first word delimiter by a single blank. -->
<!-- Because of the "m" flag this is done at the beginning of each line of the input. -->
<!-- A string without a word delimiter is returned unchanged. -->
<xsl:function name="text:cutFirstWord">
  <xsl:param name="inputString" as="xs:string"/>
  <xsl:value-of select="replace($inputString, '^.*?[\s:.,;!_]', ' ', 'im')"/>
</xsl:function>

<!-- Replaces the last word delimiter and everything behind it by a single blank. -->
<!-- A string without a word delimiter is returned unchanged. -->
<xsl:function name="text:cutLastWord">
  <xsl:param name="inputString" as="xs:string"/>
  <xsl:value-of select="replace($inputString, '(.*)[\s:.,;!_].*$', '$1 ', 'im')"/>
</xsl:function>

<!-- Removes leading and trailing whitespace. -->
<xsl:function name="text:trim">
  <xsl:param name="inputString" as="xs:string"/>
  <xsl:value-of select="replace($inputString, '^\s+|\s+$', '')"/>
</xsl:function>

<!-- Word delimiter: not tested yet -->
<!-- TODO: bol, eol, ", &, <, > -->
<!-- <xsl:variable name="wordDelimRegExpr" select="'[\s\(\)\[\]\.\\\{\}\$\^\+\?\!\* §%:,;=/]+'"/> -->

<!-- Tokenization of a Lucene query with phrases and words to a list of query terms separated with | -->
<!-- Steps: remove grouping parentheses, unescape Lucene escapes, protect all characters with a   -->
<!-- meta meaning in regular expressions by ###...### placeholders, then tokenize recursively.     -->
<!-- Terms which must not be used (operators, excluded terms) are delivered with the prefix        -->
<!-- DUMMYDUMMY (see text:translateLuceneQueryTermToTerm).                                         -->
<xsl:function name="text:translateLuceneToTerms">
  <xsl:param name="inputLuceneQuery" as="xs:string" />
  <!-- Delete all unescaped parentheses: (a AND b) OR c -> a AND b OR c -->
  <!-- An escaped character (backslash plus one character) is matched as a unit and kept ($1),   -->
  <!-- a bare parenthesis is matched by the second alternative and removed ($1 is empty then).    -->
  <!-- This also handles runs of parentheses like "((a" correctly.                                -->
  <xsl:variable name="luceneQueryWithoutParantheses" select="replace($inputLuceneQuery, '(\\.)|[()]', '$1')" />
  <!-- Unescape special chars in Lucene query -->
  <xsl:variable name="luceneQueryUnescaped" select="replace($luceneQueryWithoutParantheses, '\\([-\+&amp;|!(){}\[\]\^&quot;~*?:\\])', '$1')" />
  <!-- Escape special chars which have a meta meaning in regular expressions -->
  <xsl:variable name="luceneQueryRegExprEscaped1" select="replace($luceneQueryUnescaped, '[*]', '###Star###')" />
  <xsl:variable name="luceneQueryRegExprEscaped2" select="replace($luceneQueryRegExprEscaped1, '[\+]', '###Plus###')" />
  <xsl:variable name="luceneQueryRegExprEscaped3" select="replace($luceneQueryRegExprEscaped2, '[?]', '###QM###')" />
  <xsl:variable name="luceneQueryRegExprEscaped4" select="replace($luceneQueryRegExprEscaped3, '[\.]', '###Dot###')" />
  <xsl:variable name="luceneQueryRegExprEscaped5" select="replace($luceneQueryRegExprEscaped4, '[\^]', '###BeginLine###')" />
  <xsl:variable name="luceneQueryRegExprEscaped6" select="replace($luceneQueryRegExprEscaped5, '[$]', '###EndLine###')" />
  <xsl:variable name="luceneQueryRegExprEscaped7" select="replace($luceneQueryRegExprEscaped6, '[|]', '###Or###')" />
  <xsl:variable name="luceneQueryRegExprEscaped8" select="replace($luceneQueryRegExprEscaped7, '[(]', '###Paranthes1Open###')" />
  <xsl:variable name="luceneQueryRegExprEscaped9" select="replace($luceneQueryRegExprEscaped8, '[)]', '###Paranthes1Close###')" />
  <xsl:variable name="luceneQueryRegExprEscaped10" select="replace($luceneQueryRegExprEscaped9, '[{]', '###Paranthes2Open###')" />
  <xsl:variable name="luceneQueryRegExprEscaped11" select="replace($luceneQueryRegExprEscaped10, '[}]', '###Paranthes2Close###')" />
  <xsl:variable name="luceneQueryRegExprEscaped12" select="replace($luceneQueryRegExprEscaped11, '[\[]', '###Paranthes3Open###')" />
  <xsl:variable name="luceneQueryRegExprEscaped13" select="replace($luceneQueryRegExprEscaped12, '[\]]', '###Paranthes3Close###')" />
  <!-- fuzzy/proximity operator together with its numeric argument -->
  <xsl:variable name="luceneQueryRegExprEscaped14" select="replace($luceneQueryRegExprEscaped13, '~[0123456789,]*', '###Tilde###')" />
  <xsl:value-of select="text:translateLuceneQueryToTermsMain($luceneQueryRegExprEscaped14, 'DUMMYDUMMYSTART')" />
</xsl:function>

<!-- Tokenization of a Lucene query with phrases and words to a list of query terms separated with | -->
<!-- The first term (phrase in quotes or word) of the input string is translated; the rest of the  -->
<!-- input string is handled by a recursive call. An input which matches none of the cases         -->
<!-- delivers an empty string.                                                                      -->
<xsl:function name="text:translateLuceneQueryToTermsMain">
  <xsl:param name="inputString" as="xs:string" />
  <!-- last term in previous step: used for Lucene operator NOT -->
  <xsl:param name="lastFetchedTerm" as="xs:string" />
  <xsl:choose>
    <!-- single phrase (optionally with a leading "+"): phrase is appended -->
    <xsl:when test="matches($inputString, '^(###Plus###)?&quot;[^&quot;]*?&quot;$')">
      <!-- the "+" placeholder contains no quote, so the text between the two quotes is the phrase -->
      <xsl:variable name="withoutSurroundingQuotes" select="substring-before(substring-after($inputString, '&quot;'), '&quot;')"/>
      <xsl:variable name="termEscaped" select="text:escapeLucenePhraseForRegExpr($withoutSurroundingQuotes)" />
      <xsl:value-of select="text:translateLuceneQueryTermToTerm($termEscaped, $lastFetchedTerm)" />
    </xsl:when>
    <!-- "-" followed by a single phrase: phrase is not appended -->
    <xsl:when test="matches($inputString, '^-&quot;[^&quot;]*?&quot;$')">
      <xsl:value-of select="'DUMMYDUMMYMINUS'" />
    </xsl:when>
    <!-- TODO: single phrase followed by near operator (~[0123456789]+): determine distance between -->
    <!-- NOTE(review): text:translateLuceneToTerms replaces "~" by ###Tilde### before calling this  -->
    <!-- function, so this case presumably only applies to direct callers: to be confirmed          -->
    <xsl:when test="matches($inputString, '^&quot;[^&quot;]*?&quot;~[0123456789]+$')">
      <xsl:value-of select="$inputString" />
    </xsl:when>
    <!-- phrase (optionally with a leading "+") followed by something: phrase is appended -->
    <xsl:when test="matches($inputString, '^(###Plus###)?&quot;.*?&quot;\s')">
      <xsl:variable name="afterFirstTerm" select="replace($inputString, '^(###Plus###)?&quot;.*?&quot;\s', '')"/>
      <xsl:variable name="firstTerm" select="substring($inputString, 1, string-length($inputString) - string-length($afterFirstTerm))"/>
      <xsl:variable name="afterFirstTermTrimmed" select="text:trim($afterFirstTerm)"/>
      <xsl:variable name="firstTermWithoutSurroundingQuotes" select="substring-before(substring-after($firstTerm, '&quot;'), '&quot;')"/>
      <xsl:variable name="firstTermEscaped" select="text:escapeLucenePhraseForRegExpr($firstTermWithoutSurroundingQuotes)" />
      <xsl:variable name="term" select="text:translateLuceneQueryTermToTerm($firstTermEscaped, $lastFetchedTerm)" />
      <xsl:value-of select="concat($term, '|')" />
      <!-- Recursive call of this function with the substring after the first term -->
      <xsl:value-of select="text:translateLuceneQueryToTermsMain($afterFirstTermTrimmed, $term)"/>
    </xsl:when>
    <!-- "-" followed by phrase followed by something: phrase is not appended -->
    <xsl:when test="matches($inputString, '^-&quot;.*?&quot;\s')">
      <xsl:variable name="afterFirstTerm" select="replace($inputString, '^-&quot;.*?&quot;\s', '')"/>
      <xsl:variable name="afterFirstTermTrimmed" select="text:trim($afterFirstTerm)"/>
      <!-- Recursive call of this function with the substring after the first term -->
      <xsl:value-of select="text:translateLuceneQueryToTermsMain($afterFirstTermTrimmed, 'DUMMYDUMMYMINUS')"/>
    </xsl:when>
    <!-- single word: without quotes and spaces: word is appended -->
    <xsl:when test="matches($inputString, '^[^&quot;\s]*$')">
      <xsl:variable name="termEscaped" select="text:escapeLuceneTermForRegExpr($inputString)" />
      <xsl:value-of select="text:translateLuceneQueryTermToTerm($termEscaped, $lastFetchedTerm)" />
    </xsl:when>
    <!-- word followed by something: word is appended -->
    <xsl:when test="matches($inputString, '^[^&quot;\s]*?\s')">
      <xsl:variable name="afterFirstTerm" select="replace($inputString, '^[^&quot;\s]*?\s', '')"/>
      <xsl:variable name="firstTerm" select="substring($inputString, 1, string-length($inputString) - string-length($afterFirstTerm))"/>
      <xsl:variable name="firstTermTrimmed" select="text:trim($firstTerm)"/>
      <xsl:variable name="afterFirstTermTrimmed" select="text:trim($afterFirstTerm)"/>
      <!-- treat single Lucene term: special Lucene characters in query term ("*", "+", ".", "-") -->
      <xsl:variable name="termEscaped" select="text:escapeLuceneTermForRegExpr($firstTermTrimmed)" />
      <xsl:variable name="term" select="text:translateLuceneQueryTermToTerm($termEscaped, $lastFetchedTerm)" />
      <xsl:value-of select="concat($term, '|')" />
      <!-- Recursive call of this function with the substring after the first term -->
      <xsl:value-of select="text:translateLuceneQueryToTermsMain($afterFirstTermTrimmed, $term)"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:value-of select="''"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:function>

<!-- Translates the placeholders of the Lucene symbols in a single word (not a phrase). -->
<xsl:function name="text:escapeLuceneTermForRegExpr">
  <xsl:param name="inputString" as="xs:string" />
  <!-- remove special Lucene characters: "*", "+", "~" were escaped for regular expression -->
  <xsl:variable name="termWithoutLuceneSymbols1" select="replace($inputString, '###Star###', '')" />
  <xsl:variable name="termWithoutLuceneSymbols2" select="replace($termWithoutLuceneSymbols1, '###Plus###', '')" />
  <xsl:variable name="termWithoutLuceneSymbols3" select="replace($termWithoutLuceneSymbols2, '###Tilde###', '')" />
  <!-- Lucene mask symbol "?" is replaced with regular expression symbol "." -->
  <xsl:value-of select="replace($termWithoutLuceneSymbols3, '###QM###', '.')" />
</xsl:function>

<!-- Translates the placeholders of the Lucene symbols in a phrase. -->
<xsl:function name="text:escapeLucenePhraseForRegExpr">
  <xsl:param name="inputString" as="xs:string" />
  <!-- inside a phrase "*" is a literal character: it is delivered escaped (\*); "+" is removed -->
  <xsl:variable name="termWithoutLuceneSymbols1" select="replace($inputString, '###Star###', '\\*')" />
  <xsl:variable name="termWithoutLuceneSymbols2" select="replace($termWithoutLuceneSymbols1, '###Plus###', '')" />
  <!-- inside a phrase "?" is a literal character too: it is delivered escaped (\?) -->
  <xsl:value-of select="replace($termWithoutLuceneSymbols2, '###QM###', '\\?')" />
</xsl:function>

<!-- last special char replacements and logical operator handling -->
<!-- The remaining placeholders are replaced by the escaped (backslash) form of the character.    -->
<!-- The term is delivered unchanged if it is a real search term; it is delivered with the prefix -->
<!-- DUMMYDUMMY if it is an operator (AND, OR, NOT), starts with "-" or follows the operator NOT. -->
<xsl:function name="text:translateLuceneQueryTermToTerm">
  <xsl:param name="inputString" as="xs:string" />
  <!-- last term in previous step: used for Lucene operator NOT -->
  <xsl:param name="lastFetchedTerm" as="xs:string" />
  <!-- in a replacement string "\\" delivers a backslash and "\$" delivers a dollar sign -->
  <xsl:variable name="termEscaped1" select="replace($inputString, '###Dot###', '\\.')" />
  <xsl:variable name="termEscaped2" select="replace($termEscaped1, '###BeginLine###', '\\^')" />
  <xsl:variable name="termEscaped3" select="replace($termEscaped2, '###EndLine###', '\\\$')" />
  <xsl:variable name="termEscaped4" select="replace($termEscaped3, '###Or###', '\\|')" />
  <xsl:variable name="termEscaped5" select="replace($termEscaped4, '###Paranthes1Open###', '\\(')" />
  <xsl:variable name="termEscaped6" select="replace($termEscaped5, '###Paranthes1Close###', '\\)')" />
  <xsl:variable name="termEscaped7" select="replace($termEscaped6, '###Paranthes2Open###', '\\{')" />
  <xsl:variable name="termEscaped8" select="replace($termEscaped7, '###Paranthes2Close###', '\\}')" />
  <xsl:variable name="termEscaped9" select="replace($termEscaped8, '###Paranthes3Open###', '\\[')" />
  <xsl:variable name="term" select="replace($termEscaped9, '###Paranthes3Close###', '\\]')" />
  <xsl:choose>
    <xsl:when test="($term != 'AND') and ($term != 'OR') and ($term != 'NOT') and (substring($term, 1, 1) != '-') and ($lastFetchedTerm != 'DUMMYDUMMYNOT')">
      <xsl:value-of select="$term" />
    </xsl:when>
    <xsl:otherwise>
      <xsl:value-of select="concat('DUMMYDUMMY', $term)"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:function>

<!-- Highlight all term occurrences. Result is a sequence of text and highlighted nodes (with span).
Example: LE<span ...>MECHANICHE</span>...<span ...>Mechaniche</span>...
  inputStr: the string in which the hits are searched (line feeds are removed before)
  terms:    regular expression (alternatives separated by "|"): matched everywhere, or empty
  words:    regular expression (alternatives separated by "|"): matched only as whole words
            (between word delimiters, begin or end of the string), or empty
  clipped:  'true': long text parts around the hits are clipped (see text:highlightTerms)
If terms and words are both empty the input string is returned unchanged.
Matching is case insensitive. -->
<xsl:function name="text:highlight">
  <xsl:param name="inputStr" as="xs:string" />
  <xsl:param name="terms" as="xs:string" />
  <xsl:param name="words" as="xs:string" />
  <xsl:param name="clipped" as="xs:string" />
  <xsl:variable name="hitBeginStr" select="'XXhitStartXX'"/>
  <xsl:variable name="hitEndStr" select="'XXhitEndXX'"/>
  <xsl:variable name="inputStringTemp" select="replace($inputStr, '\n', '')"/>
  <!-- replace all term or word occurences with surrounding begin and end marks (string operation) -->
  <xsl:variable name="inputStringWithMarksForTerms" select="
    if ($terms != '')
    then replace($inputStringTemp, concat('(', $terms, ')'), concat($hitBeginStr, '$1', $hitEndStr), 'im')
    else $inputStringTemp"/>
  <!-- a word may also end at the end of the string ("|$"), with or without terms -->
  <xsl:variable name="inputStringWithMarksForWords" select="
    if ($words != '')
    then replace($inputStringWithMarksForTerms, concat('([\s:.,;!_]+|^)', '(', $words, ')', '([\s:.,;!_]+|$)'), concat('$1', $hitBeginStr, '$2', $hitEndStr, '$3'), 'im')
    else $inputStringWithMarksForTerms"/>
  <xsl:choose>
    <xsl:when test="$terms != '' or $words != ''">
      <xsl:sequence select="text:highlightTerms($inputStringWithMarksForWords, $clipped, $hitBeginStr, $hitEndStr)" />
    </xsl:when>
    <xsl:otherwise>
      <xsl:sequence select="$inputStr"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:function>

<!-- Convert an input string with hits (marked with begin and end marks) to a sequence of text nodes with highlight span nodes -->
<!-- If clipped is 'true' the text before a hit is clipped by text:cutStringBefore and the text -->
<!-- after the last hit by text:cutStringAfter (70 characters each).                            -->
<xsl:function name="text:highlightTerms">
  <xsl:param name="inputString" as="xs:string" />
  <xsl:param name="clipped" as="xs:string" />
  <xsl:param name="hitBeginStr" />
  <xsl:param name="hitEndStr" />
  <xsl:choose>
    <xsl:when test="contains($inputString, $hitEndStr)">
      <!-- everything up to the first end mark: text before the hit, begin mark, hit -->
      <xsl:variable name="substringBefore" select="substring-before($inputString, $hitEndStr)"/>
      <xsl:variable name="substringAfter" select="substring-after($inputString, $hitEndStr)"/>
      <xsl:variable name="beforeHitBeginString" select="substring-before($substringBefore, $hitBeginStr)"/>
      <xsl:variable name="hitTerm" select="substring-after($substringBefore, $hitBeginStr)"/>
      <!-- Prints the original part of the substring up to the first occurrence of a hit -->
      <xsl:choose>
        <xsl:when test="$clipped='true'">
          <xsl:value-of select="text:cutStringBefore($beforeHitBeginString, 70)"/>
        </xsl:when>
        <xsl:otherwise>
          <xsl:value-of select="$beforeHitBeginString"/>
        </xsl:otherwise>
      </xsl:choose>
      <!-- Highlight the hit -->
      <span class="hit highlight">
        <xsl:value-of select="$hitTerm"/>
      </span>
      <!-- Recursive call of this function with the substring after the first occurrence of the hit: further occurrences of hits -->
      <xsl:sequence select="text:highlightTerms($substringAfter, $clipped, $hitBeginStr, $hitEndStr)"/>
    </xsl:when>
    <!-- if no occurrence of a hit could be found the whole string is printed -->
    <xsl:otherwise>
      <xsl:choose>
        <xsl:when test="$clipped='true'">
          <xsl:value-of select="text:cutStringAfter($inputString, 70)"/>
        </xsl:when>
        <xsl:otherwise>
          <xsl:value-of select="$inputString"/>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:otherwise>
  </xsl:choose>
</xsl:function>

</xsl:stylesheet>