Mercurial > hg > mpdl-group

/*
 *  eXist Open Source Native XML Database: Extension module
 *  Copyright (C) 2009 Max Planck Institute for the history of science.
 *  Josef Willenborg, jwillenborg@mpiwg-berlin.mpg.de
 *  http://www.mpiwg-berlin.mpg.de
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public License
 *  as published by the Free Software Foundation; either version 2
 *  of the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *  $Id: GetFragmentBetweenFunction.java $
 */
package org.exist.xquery.modules.document;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.exist.dom.QName;
import org.exist.xquery.BasicFunction;
import org.exist.xquery.Cardinality;
import org.exist.xquery.CompiledXQuery;
import org.exist.xquery.FunctionSignature;
import org.exist.xquery.XPathException;
import org.exist.xquery.XQuery;
import org.exist.xquery.XQueryContext;
import org.exist.xquery.value.Item;
import org.exist.xquery.value.Sequence;
import org.exist.xquery.value.SequenceType;
import org.exist.xquery.value.StringValue;
import org.exist.xquery.value.Type;
import org.exist.xquery.value.ValueSequence;

/**
 * XQuery extension function that delivers the fragment between two milestones
 * (e.g. page breaks) of an XML document as a string.
 *
 * <p>The text between the milestones is not taken from the database but cut
 * directly out of a copy of the document on the file system, which the
 * original author reports to be faster for most XML documents. The copy is
 * looked up below {@code <exist-home>} + {@link #FS_DOC_CACHE_PATH} + document
 * URI, so every document that is queried must be present there. Static XML
 * documents can simply be copied to that path; see also the extension of the
 * eXist class RpcConnection, which maintains the file system cache for the
 * important operations on XML documents.
 *
 * <p>The milestone count and the ancestor paths of the milestones are still
 * determined by XQuery against the database.
 *
 * <p>NOTE(review): the document URI and the milestone name are spliced
 * verbatim into XQuery source text and into a file system path. Callers must
 * only pass trusted values.
 */
public class GetFragmentBetweenFunctionByFileSearch extends BasicFunction {
  /** Root of the file system document cache, relative to the eXist home directory. */
  private final static String FS_DOC_CACHE_PATH = "/webapp/WEB-INF/data/fs-doc-cache";

  /**
   * Number of characters read from the file per chunk. According to the
   * original author a buffer size of 4096 is about 25% slower.
   */
  private final static int READ_BUFFER_SIZE = 16384;

  /**
   * Splits a node path into its steps: either a step with one predicate
   * ("/name[...]", the predicate may contain slashes) or a plain step ("/name").
   */
  private final static Pattern PATH_STEP_PATTERN =
      Pattern.compile("/[^/]+\\[[^\\]]*\\]" + "|" + "/[^/\\[]+");

  public final static FunctionSignature signature =
      new FunctionSignature(
          new QName("getFragmentBetween", DocumentModule.NAMESPACE_URI, DocumentModule.PREFIX),
          "A function which delivers the xml fragment between milestones)",
          new SequenceType[] { new SequenceType(Type.STRING, Cardinality.ONE),
                               new SequenceType(Type.STRING, Cardinality.ONE),
                               new SequenceType(Type.INTEGER, Cardinality.ONE),
                               new SequenceType(Type.INTEGER, Cardinality.ONE) },
          new SequenceType(Type.STRING, Cardinality.ZERO_OR_MORE));

  public GetFragmentBetweenFunctionByFileSearch(XQueryContext context) {
    super(context, signature);
  }

  /**
   * Gets the fragment between two milestones addressed by position.
   *
   * @param args 1. document URI (e.g. /db/shakespeare/hamlet.xml),
   *   2. milestone name (e.g.: pb), 3. position of the first milestone (e.g.: 10),
   *   4. position of the second milestone (e.g.: 11); the second position may be
   *   one greater than the number of milestones to address the end of the document
   * @param contextSequence not used
   * @return a sequence with exactly one string: the fragment, wrapped in the
   *   opening tags of the ancestors of the first milestone and the closing tags
   *   of the ancestors of the second one, or the empty string if the positions
   *   are not valid for the document
   * @throws XPathException if a helper query fails or the cached file cannot be read
   */
  @Override
  public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException {
    String docUriStr = args[0].getStringValue();
    String milestoneNameStr = args[1].getStringValue();
    int milestonePositionFromInt = Integer.parseInt(args[2].getStringValue());
    int milestonePositionToInt = Integer.parseInt(args[3].getStringValue());
    ValueSequence resultFragment = new ValueSequence();
    int countMs = getCountMs(docUriStr, milestoneNameStr);
    // test milestone positions within document: return an empty string if not valid
    boolean positionsValid =
        milestonePositionFromInt >= 1
        && milestonePositionToInt > milestonePositionFromInt
        && milestonePositionFromInt <= countMs
        && milestonePositionToInt <= countMs + 1;
    if (!positionsValid) {
      resultFragment.add(new StringValue(""));
      return resultFragment;
    }
    String msFromPathName = getNodePath(docUriStr, milestoneNameStr, milestonePositionFromInt);
    String msToPathName = getNodePath(docUriStr, milestoneNameStr, milestonePositionToInt);
    String openElementsOfMsFrom = pathName2XmlTags(msFromPathName, "open");
    String closingElementsOfMsTo = pathName2XmlTags(msToPathName, "close");
    // fetch the fragment between the two milestones
    String fragment = getFragmentBetween(docUriStr, milestoneNameStr, milestonePositionFromInt, milestonePositionToInt);
    resultFragment.add(new StringValue(openElementsOfMsFrom + fragment + closingElementsOfMsTo));
    return resultFragment;
  }

  /**
   * Fetches the fragment between two milestones by scanning the cached file.
   *
   * <p>The file is read in chunks of {@link #READ_BUFFER_SIZE} characters. Each
   * chunk is appended to the text left over after the last milestone found so
   * far and matched against the milestone pattern, so a milestone tag that is
   * split over two chunks is found once its second part has been read.
   *
   * @param docUriStr document URI (e.g. /db/shakespeare/hamlet.xml)
   * @param msName milestone name (e.g.: pb)
   * @param msPositionFrom first milestone (e.g.: 10)
   * @param msPositionTo second milestone (e.g.: 11)
   * @return the text from the milestone at msPositionFrom (inclusive) up to the
   *   milestone at msPositionTo (exclusive), or up to the end of the file if
   *   fewer than msPositionTo milestones are found
   * @throws XPathException if the cached file cannot be read
   */
  private String getFragmentBetween(String docUriStr, String msName, int msPositionFrom, int msPositionTo) throws XPathException {
    String docLocalFileName = getExistHomeFilePath() + FS_DOC_CACHE_PATH + docUriStr;
    /*
     * find milestones: <pb n=7 />
     * find milestones explicitly closed: <pb n=5>blabla</pb>
     * find milestones in multilines:
     *   <pb
     *    n=10
     *   />
     * Tags spanning several lines are matched because [^>] also matches line
     * terminators. The name is quoted and must be followed by whitespace, "/"
     * or ">", so that e.g. the milestone name "p" does not match "<pb .../>".
     */
    String quotedMsName = Pattern.quote(msName);
    String regExprMsStart = "<" + quotedMsName + "(?=[\\s/>])";
    String regExprMsInternClosed = regExprMsStart + "[^>]*?/>";
    String regExprMsExternClosed = regExprMsStart + "[^>]*?>[^>]*?</" + quotedMsName + "\\s*>";
    String regExprMilestone = regExprMsInternClosed + "|" + regExprMsExternClosed;
    // MULTILINE has no effect on this expression (no ^ or $); kept as in the original
    Pattern milestonePattern = Pattern.compile(regExprMilestone, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

    char[] readBuffer = new char[READ_BUFFER_SIZE];
    StringBuilder result = new StringBuilder();
    String unmatchedRest = "";   // text after the last milestone found so far
    String lastMilestone = "";   // tag text of the last milestone found so far
    int msCount = 0;
    BufferedReader in = null;
    try {
      // NOTE(review): FileReader decodes with the platform default charset;
      // confirm that this matches the encoding of the cached files.
      in = new BufferedReader(new FileReader(docLocalFileName));
      while (msCount < msPositionTo) {
        int countReadChars = in.read(readBuffer, 0, readBuffer.length);
        if (countReadChars == -1) {
          // end of file reached before milestone msPositionTo (last page):
          // deliver what was collected so far plus everything up to the end
          return result.append(lastMilestone).append(unmatchedRest).toString();
        }
        unmatchedRest = unmatchedRest + new String(readBuffer, 0, countReadChars);
        Matcher m = milestonePattern.matcher(unmatchedRest);
        int fragmentBeginPos = 0;
        while (m.find()) {
          // the piece completed by this match starts with the previous milestone
          // (number msCount) and ends right before the milestone just found
          if (msCount >= msPositionFrom && msCount < msPositionTo) {
            result.append(lastMilestone).append(unmatchedRest, fragmentBeginPos, m.start());
          }
          fragmentBeginPos = m.end();
          lastMilestone = m.group();
          msCount++; // each found milestone increments the count of milestones
        }
        // keep the portion after the last found milestone for matching with the next chunk
        unmatchedRest = unmatchedRest.substring(fragmentBeginPos);
      }
    } catch (IOException e) {
      throw new XPathException(e);
    } finally {
      if (in != null) {
        try {
          in.close();
        } catch (IOException ignored) {
          // the stream was only read; a failure to close it loses no data
        }
      }
    }
    return result.toString();
  }

  /**
   * Determines the node path of the parent of a milestone by XQuery.
   *
   * @param docPath document URI
   * @param msName milestone name
   * @param position position of the milestone in document order, starting with 1
   * @return the result of util:node-xpath for the parent of the milestone, or
   *   the empty string if there is no milestone at that position
   * @throws XPathException if the query fails
   */
  private String getNodePath(String docPath, String msName, int position) throws XPathException {
    // the parentheses matter: "//ms[n]" would select every milestone that is the
    // n-th milestone child of its own parent, "(//ms)[n]" the n-th of the document
    String query =
      "let $ms := (doc('" + docPath + "')//" + msName + ")[" + position + "]/.. \n" +
      "let $result := " +
      "  if ($ms) " +
      "  then util:node-xpath($ms)" +
      "  else (\"\") \n" +
      "return $result";
    return executeXQuery(query);
  }

  /**
   * Counts the milestones of a document by XQuery.
   *
   * @param docPath document URI
   * @param msName milestone name
   * @return number of elements with that name in the document
   * @throws XPathException if the query fails
   */
  private int getCountMs(String docPath, String msName) throws XPathException {
    String query = "let $result := count(doc('" + docPath + "')//" + msName + ")" + "\n" + "return $result";
    return Integer.parseInt(executeXQuery(query));
  }

  /**
   * A path name delivered by function xnode-path (with special strings such as
   * "@", "[", "]", " eq ") is converted to an XML String with xml tags,
   * opened or closed such as the mode says. Opening tags are delivered in path
   * order, closing tags in reverse path order; each tag is followed by a line feed.
   * @param pathName delivered by function xnode-path: Example: /archimedes[@xmlns:xlink eq "http://www.w3.org/1999/xlink"]/text/body/chap/p[@type eq "main"]/s/foreign[@lang eq "en"]
   * @param mode open or close
   * @return xml tags opened or closed; the empty string for any other mode
   */
  private String pathName2XmlTags(String pathName, String mode) {
    StringBuilder tags = new StringBuilder();
    ArrayList<String> elements = pathName2ElementsWithAttributes(pathName);
    if ("open".equals(mode)) {
      for (String step : elements) {
        String element = step
            .replace("[", " ")     // opening element: replace open bracket with space
            .replace(" eq ", "=")  // opening element: turn the comparison into an attribute assignment
            .replace("@", "")      // opening element: remove @ character
            .replace("]", "");     // opening element: remove closing bracket
        if (element.length() != 0) {
          tags.append("<").append(element).append(">\n");
        }
      }
    } else if ("close".equals(mode)) {
      for (int i = elements.size() - 1; i >= 0; i--) {
        // closing element: remove brackets with attributes
        String element = elements.get(i).replaceAll("\\[[^\\]]*\\]", "");
        if (element.length() != 0) {
          tags.append("</").append(element).append(">\n");
        }
      }
    }
    return tags.toString();
  }

  /**
   * Splits a node path into its steps.
   *
   * @param pathName path name, example: /archimedes[@xmlns:xlink eq "http://www.w3.org/1999/xlink"]/text/body/chap/p[@type eq "main"]/s/foreign[@lang eq "en"]
   * @return the steps in path order, each without its leading "/" but
   *   including its predicate in brackets if it has one
   */
  private ArrayList<String> pathName2ElementsWithAttributes(String pathName) {
    ArrayList<String> result = new ArrayList<String>();
    Matcher m = PATH_STEP_PATTERN.matcher(pathName);
    while (m.find()) {
      result.add(m.group().substring(1));  // without first "/" character
    }
    return result;
  }

  /**
   * @return absolute path of the eXist home directory as configured for the current broker
   * @throws XPathException declared for the callers; not thrown by the visible code
   */
  private String getExistHomeFilePath() throws XPathException {
    return context.getBroker().getConfiguration().getExistHome().getAbsolutePath();
  }

  /**
   * Compiles and evaluates an XQuery without context sequence.
   *
   * @param xQueryStr source text of the query
   * @return string value of the first item of the result, or the empty string
   *   if the result has no first item
   * @throws XPathException if the query cannot be compiled or evaluated
   */
  private String executeXQuery(String xQueryStr) throws XPathException {
    XQuery xQuery = context.getBroker().getXQueryService();
    CompiledXQuery compiledXQuery = xQuery.compile(context, xQueryStr);
    Sequence sequence = compiledXQuery.eval(null); // without context
    Item item = sequence.itemAt(0);
    if (item == null) {
      return "";
    }
    return item.getStringValue();
  }

  /**
   * Builds a namespace declaration string for the root element of a document.
   * Not yet used but useful in future.
   * @param docPath document URI
   * @return string value of the first item the query delivers; the query
   *   produces one xmlns:prefix="uri" declaration per in-scope prefix of the
   *   root element except "xml"
   * @throws XPathException if the query fails
   */
  private String getNamespaceString(String docPath) throws XPathException {
    String query =
      "let $elem := doc('" + docPath + "')/*" + "\n" +
      "let $prefs := in-scope-prefixes($elem)" + "\n" +
      "for $pref in $prefs" + "\n" +
      "  let $uri := namespace-uri-for-prefix($pref, $elem)" + "\n" +
      "  let $result := " +
      "    if ($pref = \"xml\") " +
      "    then ()" + "\n" +
      "    else  concat(\"xmlns:\", $pref, \"=&quot;\", $uri, \"&quot;\") \n" +
      "return $result";
    return executeXQuery(query);
  }
}
author	Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date	Wed, 24 Nov 2010 17:24:23 +0100
parents
children