Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/org/exist/xquery/modules/document/GetFragmentBetweenFunctionByFileSearch.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
line wrap: on
line source
/*
 *  eXist Open Source Native XML Database: Extension module
 *  Copyright (C) 2009 Max Planck Institute for the history of science.
 *  Josef Willenborg, jwillenborg@mpiwg-berlin.mpg.de
 *  http://www.mpiwg-berlin.mpg.de
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public License
 *  as published by the Free Software Foundation; either version 2
 *  of the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *  $Id: GetFragmentBetweenFunction.java $
 */
package org.exist.xquery.modules.document;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.exist.dom.QName;
import org.exist.xquery.BasicFunction;
import org.exist.xquery.Cardinality;
import org.exist.xquery.CompiledXQuery;
import org.exist.xquery.FunctionSignature;
import org.exist.xquery.XPathException;
import org.exist.xquery.XQuery;
import org.exist.xquery.XQueryContext;
import org.exist.xquery.value.Item;
import org.exist.xquery.value.Sequence;
import org.exist.xquery.value.SequenceType;
import org.exist.xquery.value.StringValue;
import org.exist.xquery.value.Type;
import org.exist.xquery.value.ValueSequence;

/**
 * Delivers the fragment between two milestones in an XML document.
 * It leads to more performance for most XML documents because it
 * determines the fragment directly by file search on the XML file on
 * the file system.
 * Precondition of this function is that all the XML documents can be
 * read from the file system cache (see FS_DOC_CACHE_PATH as the root path
 * for the XML documents).
 * Static XML documents can easily be copied to that file system path.
 * But look also at the extension of the eXist class RpcConnection which
 * implements the FS_DOC_CACHE for all important operations on XML documents.
 */
public class GetFragmentBetweenFunctionByFileSearch extends BasicFunction {

  /** Root of the file system document cache, relative to the eXist home directory. */
  private final static String FS_DOC_CACHE_PATH = "/webapp/WEB-INF/data/fs-doc-cache";

  /** Size (in chars) of each chunk read from the cached file; 4096 was measured as 25% slower. */
  private final static int READ_BUFFER_SIZE = 16384;

  /**
   * Splits a node path into its steps: either "/name[predicate]" or plain "/name".
   * The flags are kept from the original code; the expression contains neither
   * letters nor anchors, so they do not influence matching.
   */
  private final static Pattern PATH_STEP_PATTERN =
      Pattern.compile("/[^/]+\\[[^\\]]*\\]" + "|" + "/[^/\\[]+",
          Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

  public final static FunctionSignature signature =
    new FunctionSignature(
      new QName("getFragmentBetween", DocumentModule.NAMESPACE_URI, DocumentModule.PREFIX),
      "A function which delivers the xml fragment between milestones)",
      new SequenceType[] {
        new SequenceType(Type.STRING, Cardinality.ONE),
        new SequenceType(Type.STRING, Cardinality.ONE),
        new SequenceType(Type.INTEGER, Cardinality.ONE),
        new SequenceType(Type.INTEGER, Cardinality.ONE)
      },
      new SequenceType(Type.STRING, Cardinality.ZERO_OR_MORE));

  public GetFragmentBetweenFunctionByFileSearch(XQueryContext context) {
    super(context, signature);
  }

  /**
   * Get the fragment between two milestones by position.
   * The raw text found in the cached file is wrapped with the opening tags of the
   * ancestors of the first milestone and the closing tags of the ancestors of the
   * second milestone.
   *
   * @param args 1. docUriStr document URI (e.g. /db/shakespeare/hamlet.xml),
   *             2. msName milestone name (e.g.: pb),
   *             3. msPositionFrom first milestone (e.g.: 10),
   *             4. msPositionTo second milestone (e.g.: 11)
   * @param contextSequence not used by this function
   * @return a sequence with exactly one string: the fragment, or the empty string
   *         if the milestone positions are not valid for the document
   * @throws XPathException if a helper query fails or the cached file cannot be read
   */
  @Override
  public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException {
    String docUriStr = args[0].getStringValue();
    String milestoneNameStr = args[1].getStringValue();
    int milestonePositionFromInt = Integer.parseInt(args[2].getStringValue());
    int milestonePositionToInt = Integer.parseInt(args[3].getStringValue());
    ValueSequence resultFragment = new ValueSequence();
    int countMs = getCountMs(docUriStr, milestoneNameStr);
    // test milestone positions within document: return an empty string if not valid
    // (position countMs+1 is allowed as upper bound: it means "to the end of the document")
    boolean positionsInvalid =
        milestonePositionFromInt < 1
        || milestonePositionToInt <= milestonePositionFromInt
        || milestonePositionFromInt > countMs
        || milestonePositionToInt > countMs + 1;
    if (positionsInvalid) {
      resultFragment.add(new StringValue(""));
      return resultFragment;
    }
    String msFromPathName = getNodePath(docUriStr, milestoneNameStr, milestonePositionFromInt);
    String msToPathName = getNodePath(docUriStr, milestoneNameStr, milestonePositionToInt);
    String openElementsOfMsFrom = pathName2XmlTags(msFromPathName, "open");
    String closingElementsOfMsTo = pathName2XmlTags(msToPathName, "close");
    // fetch the fragment between the two milestones
    String fragment = getFragmentBetween(docUriStr, milestoneNameStr, milestonePositionFromInt, milestonePositionToInt);
    fragment = openElementsOfMsFrom + fragment + closingElementsOfMsTo;
    resultFragment.add(new StringValue(fragment));
    return resultFragment;
  }

  /**
   * Fetch the fragment between two milestones in an XML document.
   * The file is read in chunks of READ_BUFFER_SIZE chars; each chunk (together with
   * the not yet consumed rest of the previous chunks) is matched against the
   * milestone regular expression, so a milestone tag that is split by a chunk
   * border is found once the next chunk has been appended.
   *
   * @param docUriStr document URI (e.g. /db/shakespeare/hamlet.xml)
   * @param msName milestone name (e.g.: pb)
   * @param msPositionFrom first milestone (e.g.: 10)
   * @param msPositionTo second milestone (e.g.: 11)
   * @return fragment between the two milestones with msPositionFrom and msPositionTo;
   *         if the end of the file is reached first, everything collected so far plus
   *         the last milestone and the remaining text of the document
   * @throws XPathException if the cached file cannot be opened or read
   */
  private String getFragmentBetween(String docUriStr, String msName, int msPositionFrom, int msPositionTo) throws XPathException {
    // NOTE(review): docUriStr is appended to the cache root unchecked; a URI containing
    // ".." could address files outside the cache directory - confirm callers are trusted.
    String docLocalFileName = getExistHomeFilePath() + FS_DOC_CACHE_PATH + docUriStr;
    Pattern milestonePattern = buildMilestonePattern(msName);
    StringBuilder result = new StringBuilder();
    String msFragmentBuffer = "";   // text read so far that follows the last found milestone
    String ms = "";                 // text of the last found milestone tag
    int msCount = 0;
    char[] readBuffer = new char[READ_BUFFER_SIZE];
    BufferedReader in = null;
    try {
      // NOTE(review): FileReader decodes with the platform default charset; presumably the
      // cache files are written with the same charset - verify against the cache writer.
      in = new BufferedReader(new FileReader(docLocalFileName));
      while (msCount < msPositionTo) {
        int countReadChars = in.read(readBuffer, 0, READ_BUFFER_SIZE);
        if (countReadChars == -1) {
          // last page: delivers all characters to the end of the document, after the
          // fragments that were already collected for the requested range
          return result.append(ms).append(msFragmentBuffer).toString();
        }
        msFragmentBuffer = msFragmentBuffer + new String(readBuffer, 0, countReadChars);
        Matcher m = milestonePattern.matcher(msFragmentBuffer);
        int fragmentBeginPos = 0;
        while (m.find()) {
          int msBeginPos = m.start();
          int msEndPos = m.end();
          // fragment = previous milestone tag + the text up to (excluding) this milestone
          String msFragment = ms + msFragmentBuffer.substring(fragmentBeginPos, msBeginPos);
          // add result milestone fragments which are between msPositionFrom and msPositionTo
          if (msCount >= msPositionFrom && msCount < msPositionTo) {
            result.append(msFragment);
          }
          fragmentBeginPos = msEndPos;
          ms = msFragmentBuffer.substring(msBeginPos, msEndPos);
          msCount++;  // each found milestone increments the count of milestones
        }
        // keep the portion after the last found milestone for matching with the next chunk
        msFragmentBuffer = msFragmentBuffer.substring(fragmentBeginPos);
      }
    } catch (IOException e) {
      throw new XPathException(e);
    } finally {
      closeQuietly(in);
    }
    return result.toString();
  }

  /**
   * Build the regular expression which finds milestone tags in the raw XML text:
   * self closed milestones (&lt;pb n=7 /&gt;) and explicitly closed milestones
   * (&lt;pb n=5&gt;blabla&lt;/pb&gt;). Tags which are spread over several lines are
   * found too because the character class [^&gt;] also matches line breaks.
   *
   * @param msName milestone name (e.g.: pb)
   * @return compiled, case insensitive milestone pattern
   */
  private static Pattern buildMilestonePattern(String msName) {
    // NOTE(review): msName is used as regular expression text and no name boundary is
    // required after it, so "pb" also matches tags whose name merely starts with "pb".
    String regExprMsInternClosed = "<" + msName + "[^>]*?/>";
    String regExprMsExternClosed = "<" + msName + "[^>]*?>[^>]*?</" + msName + "\\s*>";
    String regExprMilestone = regExprMsInternClosed + "|" + regExprMsExternClosed;
    // MULTILINE only changes the meaning of ^ and $ which are not used here; flag kept as is
    return Pattern.compile(regExprMilestone, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
  }

  /**
   * Close the reader if it was opened; a failure while closing a reader that was
   * only read from is ignored so that it cannot hide the real result or exception.
   *
   * @param reader reader to close, may be null
   */
  private static void closeQuietly(BufferedReader reader) {
    if (reader == null) {
      return;
    }
    try {
      reader.close();
    } catch (IOException ignored) {
      // nothing was written through this reader, so there is nothing to recover
    }
  }

  /**
   * Determine the node path (as delivered by util:node-xpath) of the parent of the
   * milestone with the given position in document order.
   *
   * @param docPath document URI (e.g. /db/shakespeare/hamlet.xml)
   * @param msName milestone name (e.g.: pb)
   * @param position position of the milestone within the whole document (1 based)
   * @return node path of the parent element or the empty string if there is no such milestone
   * @throws XPathException if the query fails
   */
  private String getNodePath(String docPath, String msName, int position) throws XPathException {
    // The parentheses are essential: "//pb[10]" would select each pb which is the 10th pb
    // child of its own parent, whereas "(//pb)[10]" selects the 10th pb of the document.
    // NOTE(review): docPath and msName are inserted unescaped into the query text.
    String query =
        "let $ms := (doc('" + docPath + "')//" + msName + ")[" + position + "]/.. \n" +
        "let $result := " +
        "  if ($ms) " +
        "  then util:node-xpath($ms)" +
        "  else (\"\") \n" +
        "return $result";
    return executeXQuery(query);
  }

  /**
   * Count the milestones with the given name in the document.
   *
   * @param docPath document URI (e.g. /db/shakespeare/hamlet.xml)
   * @param msName milestone name (e.g.: pb)
   * @return number of milestones in the document
   * @throws XPathException if the query fails
   */
  private int getCountMs(String docPath, String msName) throws XPathException {
    String query =
        "let $result := count(doc('" + docPath + "')//" + msName + ")" + "\n" +
        "return $result";
    return Integer.parseInt(executeXQuery(query));
  }

  /**
   * A path name delivered by function xnode-path (with special strings such as
   * "@", "[", "]", " eq ") is converted to an XML String with xml tags,
   * opened or closed such as the mode says.
   *
   * @param pathName delivered by function xnode-path: Example: /archimedes[@xmlns:xlink eq "http://www.w3.org/1999/xlink"]/text/body/chap/p[@type eq "main"]/s/foreign[@lang eq "en"]
   * @param mode open or close; any other value delivers the empty string
   * @return xml tags opened (outermost element first) or closed (innermost element first)
   */
  private String pathName2XmlTags(String pathName, String mode) {
    List<String> elements = pathName2ElementsWithAttributes(pathName);
    StringBuilder tags = new StringBuilder();
    if (mode.equals("open")) {
      for (String step : elements) {
        String element = step
            .replace("[", " ")     // opening element: replace open bracket with space
            .replace(" eq ", "=")  // opening element: attribute comparison becomes assignment
            .replace("@", "")      // opening element: remove @ character
            .replace("]", "");     // opening element: remove closing bracket
        if (element.length() != 0) {
          tags.append("<").append(element).append(">\n");
        }
      }
    } else if (mode.equals("close")) {
      for (int i = elements.size() - 1; i >= 0; i--) {
        // closing element: remove brackets with attributes
        String element = elements.get(i).replaceAll("\\[[^\\]]*\\]", "");
        if (element.length() != 0) {
          tags.append("</").append(element).append(">\n");
        }
      }
    }
    return tags.toString();
  }

  /**
   * Split a node path into its steps, each step with its attribute predicate (if any)
   * and without the leading "/" character.
   *
   * @param pathName path name, example: /archimedes[@xmlns:xlink eq "http://www.w3.org/1999/xlink"]/text/body/chap/p[@type eq "main"]/s/foreign[@lang eq "en"]
   * @return the steps in path order; empty if the path name contains no step
   */
  private List<String> pathName2ElementsWithAttributes(String pathName) {
    List<String> steps = new ArrayList<String>();
    Matcher m = PATH_STEP_PATTERN.matcher(pathName);
    while (m.find()) {
      steps.add(m.group().substring(1));  // without first "/" character
    }
    return steps;
  }

  /**
   * @return absolute path of the eXist home directory as configured for the current broker
   * @throws XPathException declared for callers; not thrown by the visible code
   */
  private String getExistHomeFilePath() throws XPathException {
    return context.getBroker().getConfiguration().getExistHome().getAbsolutePath();
  }

  /**
   * Compile and evaluate a query (without context sequence) and deliver the
   * string value of the first result item.
   *
   * @param xQueryStr query text
   * @return string value of the first item, or the empty string if the query delivers no item
   * @throws XPathException if the query cannot be compiled or evaluated
   */
  private String executeXQuery(String xQueryStr) throws XPathException {
    XQuery xQuery = context.getBroker().getXQueryService();
    CompiledXQuery compiledXQuery = xQuery.compile(context, xQueryStr);
    Sequence sequence = compiledXQuery.eval(null);  // without context
    Item firstItem = sequence.itemAt(0);
    if (firstItem == null) {
      return "";
    }
    return firstItem.getStringValue();
  }

  /**
   * not yet used but useful in future:
   * builds namespace declarations (xmlns:prefix=...) for the in-scope prefixes of the
   * root element; only the first declaration is delivered (see executeXQuery).
   *
   * @param docPath document URI (e.g. /db/shakespeare/hamlet.xml)
   * @return first namespace declaration, or the empty string if there is none
   * @throws XPathException if the query fails
   */
  private String getNamespaceString(String docPath) throws XPathException {
    // &quot; is the XQuery entity reference for a double quote inside a string literal
    String query =
        "let $elem := doc('" + docPath + "')/*" + "\n" +
        "let $prefs := in-scope-prefixes($elem)" + "\n" +
        "for $pref in $prefs" + "\n" +
        " let $uri := namespace-uri-for-prefix($pref, $elem)" + "\n" +
        " let $result := " +
        " if ($pref = \"xml\") " +
        " then ()" + "\n" +
        " else concat(\"xmlns:\", $pref, \"=&quot;\", $uri, \"&quot;\") \n" +
        "return $result";
    return executeXQuery(query);
  }
}