Mercurial > hg > mpdl-group

package de.mpg.mpiwg.berlin.mpdl.cms.document;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Arrays;
import java.util.Date;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Logger;

import net.sf.saxon.s9api.Axis;
import net.sf.saxon.s9api.QName;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.s9api.XdmNodeKind;
import net.sf.saxon.s9api.XdmSequenceIterator;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import com.sun.org.apache.xerces.internal.parsers.SAXParser;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;
import de.mpg.mpiwg.berlin.mpdl.util.StringUtils;
import de.mpg.mpiwg.berlin.mpdl.util.Util;
import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator;
import de.mpg.mpiwg.berlin.mpdl.cms.general.Constants;
import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler;
import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsDocOperation;
import de.mpg.mpiwg.berlin.mpdl.cms.transform.GetFragmentsContentHandler;
import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer;
import de.mpg.mpiwg.berlin.mpdl.cms.transform.XslResourceTransformer;

/**
 * Handler for documents (singleton).
 */
public class DocumentHandler {
  // Logger used by all operations of this handler.
  private static Logger LOGGER = Logger.getLogger(DocumentHandler.class.getName());
  // Document identifiers for which isProjectDoc() answers false; create and createPdf skip these documents.
  private static List<String> EXCLUDED_PROJECT_DOCS =
    Arrays.asList("/echo/zh/Min_chan_luyi_1_7MCGW0WG.xml", // the Saxon transformer has heavy problems with some characters in CJK Unified Ideographs Extension B, e.g.: line 309 (second reg on page 16)
     "/echo/zh/Min_chan_luyi_2_U7Y9NQ9V.xml",
     "/echo/zh/Min_chan_luyi_3_2FP9M172.xml",
     "/echo/zh/Min_chan_luyi_4_FXA6FSFH.xml",
     "/echo/zh/Min_chan_luyi_5_VG6NY5XD.xml",
     "/echo/zh/Xifa_shenji.xml",
     "/echo/zh/Yulei_tushuo_1_HXX4MGZW.xml",
     "/echo/zh/Yulei_tushuo_2_FN1CTY5C.xml");
  // Time stamps of the running bulk operation; their difference is logged in ms by importDirectory and createAllPdfInDirectory.
  // NOTE(review): presumably set by beginOperation() / endOperation(), which are defined elsewhere in this class - confirm.
  private long beginOfOperation;
  private long endOfOperation;

  /**
   * Dispatches the given operation to the handler method that belongs to its name.
   * Recognized names are "create", "delete", "importDirectory", "createPdf" and
   * "createAllPdfInDirectory"; any other name is ignored without an error.
   *
   * @param docOperation the operation to perform; its name selects the handler
   * @throws ApplicationException if the selected handler fails
   */
  public void doOperation(CmsDocOperation docOperation) throws ApplicationException {
    final String name = docOperation.getName();
    if (name.equals("create")) {
      create(docOperation);
      return;
    }
    if (name.equals("delete")) {
      delete(docOperation);
      return;
    }
    if (name.equals("importDirectory")) {
      importDirectory(docOperation);
      return;
    }
    if (name.equals("createPdf")) {
      createPdf(docOperation);
      return;
    }
    if (name.equals("createAllPdfInDirectory")) {
      createAllPdfInDirectory(docOperation);
    }
  }

  /**
   * Imports every file with the extension "xml" found (recursively) below the directory
   * given as source URL of the operation. For each file a "create" operation is built
   * and executed; the document identifier is the file path relative to the start directory.
   * A failure of a single document is logged and does not stop the import.
   *
   * @param docOperation operation whose source URL is the start directory (file:/a/local/directory)
   *                     and whose collection names are passed on to every created document
   * @throws ApplicationException if the start directory does not exist or the directory
   *                              URL cannot be converted to a local file
   */
  private void importDirectory(CmsDocOperation docOperation) throws ApplicationException {
    try {
      LOGGER.info("Start of DocumentHandler. This operation could be time consuming because documents are indexed (normal indexing times are 1-10 minutes for a document)");
      beginOperation();
      String localDocumentsUrlStr = docOperation.getSrcUrl(); // start directory: file:/a/local/directory
      String collectionNames = docOperation.getCollectionNames();  // e.g. "echo"
      File localDocumentsDir = new File(new URI(localDocumentsUrlStr));
      if (! localDocumentsDir.exists())
        throw new ApplicationException("Document directory:" + localDocumentsUrlStr + " does not exists. Please use a directory that exists and perform the operation again.");
      String[] fileExtensions = {"xml"};
      Iterator<File> iterFiles = FileUtils.iterateFiles(localDocumentsDir, fileExtensions, true);
      // length of the start directory path: everything behind it is the relative document id
      int relativePos = localDocumentsDir.getPath().length();
      int i = 0;
      while(iterFiles.hasNext()) {
        i++;
        File xmlFile = iterFiles.next();
        String docId = xmlFile.getPath().substring(relativePos);  // relative path name starting from localDocumentsDir, e.g. /tei/de/Test_1789.xml
        String xmlFileUrlStr = xmlFile.toURI().toURL().toString();
        CmsDocOperation createDocOperation = new CmsDocOperation("create", xmlFileUrlStr, null, docId);
        createDocOperation.setCollectionNames(collectionNames);
        try {
          doOperation(createDocOperation);
          LOGGER.info("Document " + i + ": " + docId + " successfully imported (" + new Date().toString() + ")");
        } catch (Exception e) {
          // best effort: report the failing document together with its stack trace through the logger and go on with the next one
          LOGGER.log(java.util.logging.Level.WARNING, "Document " + i + ": " + docId + " has problems:", e);
        }
      }
      endOperation();
      LOGGER.info("The DocumentHandler needed: " + (endOfOperation - beginOfOperation) + " ms" );
    } catch (ApplicationException e) {
      // already an application level error (e.g. missing directory): do not wrap it a second time
      throw e;
    } catch (Exception e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Executes a "createPdf" operation for every file with the extension "xml" found
   * (recursively) below the directory given as source URL of the operation. The document
   * identifier is the file path relative to the start directory. A failure of a single
   * document is logged and does not stop the run.
   *
   * @param docOperation operation whose source URL is the start directory (file:/a/local/directory)
   *                     and whose collection names are passed on to every "createPdf" operation
   * @throws ApplicationException if the start directory does not exist or the directory
   *                              URL cannot be converted to a local file
   */
  private void createAllPdfInDirectory(CmsDocOperation docOperation) throws ApplicationException {
    try {
      LOGGER.info("Start of generating Pdf-Documents. This operation could be time consuming because Pdf generation needs some time.");
      beginOperation();
      String localDocumentsUrlStr = docOperation.getSrcUrl(); // start directory: file:/a/local/directory
      String collectionNames = docOperation.getCollectionNames();  // e.g. "echo"
      File localDocumentsDir = new File(new URI(localDocumentsUrlStr));
      if (! localDocumentsDir.exists())
        throw new ApplicationException("Document directory:" + localDocumentsUrlStr + " does not exists. Please use a directory that exists and perform the operation again.");
      String[] fileExtensions = {"xml"};
      Iterator<File> iterFiles = FileUtils.iterateFiles(localDocumentsDir, fileExtensions, true);
      // length of the start directory path: everything behind it is the relative document id
      int relativePos = localDocumentsDir.getPath().length();
      int i = 0;
      while(iterFiles.hasNext()) {
        i++;
        File xmlFile = iterFiles.next();
        String docId = xmlFile.getPath().substring(relativePos);  // relative path name starting from localDocumentsDir, e.g. /tei/de/Test_1789.xml
        CmsDocOperation createPdfOperation = new CmsDocOperation("createPdf", null, null, docId);
        createPdfOperation.setCollectionNames(collectionNames);
        try {
          doOperation(createPdfOperation);
          LOGGER.info("Pdf document " + i + ": " + docId + " successfully created (" + new Date().toString() + ")");
        } catch (Exception e) {
          // best effort: report the failing document together with its stack trace through the logger and go on with the next one
          LOGGER.log(java.util.logging.Level.WARNING, "Pdf document " + i + ": " + docId + " has problems:", e);
        }
      }
      endOperation();
      LOGGER.info("The Pdf generation needed: " + (endOfOperation - beginOfOperation) + " ms" );
    } catch (ApplicationException e) {
      // already an application level error (e.g. missing directory): do not wrap it a second time
      throw e;
    } catch (Exception e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Tells whether the document may be processed.
   *
   * @param docId document identifier, e.g. /echo/zh/Xifa_shenji.xml
   * @return false if the identifier is listed in EXCLUDED_PROJECT_DOCS, true otherwise
   */
  private boolean isProjectDoc(String docId) {
    return ! EXCLUDED_PROJECT_DOCS.contains(docId);
  }

  /**
   * Creates a document in the CMS. The steps are:
   * <ol>
   *   <li>parse the source (fails with a detail message if it is not parseable) and determine its type</li>
   *   <li>copy the source to the document's destination file</li>
   *   <li>write an upgraded copy (".upgrade") produced by replaceAnchor.xsl and a toc.xml produced by toc.xsl</li>
   *   <li>extract the metadata record of the document</li>
   *   <li>write each page as plain xml, as tokenized xml ("-morph.xml") and as html below "pages"</li>
   *   <li>hand the operation over to the index handler</li>
   * </ol>
   * Documents listed as excluded are skipped with a log message.
   *
   * @param docOperation the create operation (source URL, document identifier, optional main
   *                     language and element names); status, element names and metadata record
   *                     are written back to it
   * @throws ApplicationException if the source URL is missing, or if parsing, transforming,
   *                              indexing or file access fails
   */
  private void create(CmsDocOperation docOperation) throws ApplicationException {
    try {
      String operationName = docOperation.getName();
      String srcUrlStr = docOperation.getSrcUrl();
      String docId = docOperation.getDocIdentifier();
      if (! isProjectDoc(docId)) {
        LOGGER.info("Operation: " + operationName + " not performed on: " + docId + ". Cause: document is excluded as project doc");
        return;
      }
      String mainLanguage = docOperation.getMainLanguage();
      if (mainLanguage == null) {
        mainLanguage = getMainLanguage(docId);
      }
      if (docOperation.getElementNames() == null) {
        String[] defaultElementNames = {"s", "head", "caption", "variables", "description"};
        docOperation.setElementNames(defaultElementNames); // default
      }
      String docDirName = getDocDir(docId);
      String docDestFileName = getDocFullFileName(docId);
      // without a source URL nothing can be parsed or copied: fail with a clear message instead of a NullPointerException further down
      if (srcUrlStr == null || srcUrlStr.equals("empty"))
        throw new ApplicationException("The source URL of document: " + docId + " is empty. Please specify the URL of the document that should be created.");
      URL srcUrl = new URL(srcUrlStr);
      String protocol = srcUrl.getProtocol();
      File docDestFile = new File(docDestFileName);
      // parse validation on file
      XQueryEvaluator xQueryEvaluator = new XQueryEvaluator();
      XdmNode docNode = xQueryEvaluator.parse(srcUrl); // if it is not parseable an exception with a detail message is thrown
      String docType = getNodeType(docNode);  // archimedes, echo, TEI, html ...
      // the null test has to come before trim(), otherwise an unknown type ends in a NullPointerException
      if (docType == null) {
        docOperation.setErrorMessage("file type of: " + srcUrlStr + "is not supported");
        return;
      }
      docType = docType.trim();
      // perform operation on file system
      if ("file".equals(protocol)) {
        docOperation.setStatus("upload file: " + srcUrlStr + " to CMS");
      } else {
        docOperation.setStatus("download file from: " + srcUrlStr + " to CMS");
      }
      FileUtils.copyURLToFile(srcUrl, docDestFile, 100000, 100000);

      // replace anchor in echo documents and also add the number attribute to figures
      String docDestFileNameUpgrade = docDestFileName + ".upgrade";
      File docDestFileUpgrade = new File(docDestFileNameUpgrade);
      XslResourceTransformer replaceAnchorTransformer = new XslResourceTransformer("replaceAnchor.xsl");
      String docDestFileUrlStr = docDestFile.getPath();
      String result = replaceAnchorTransformer.transform(docDestFileUrlStr);
      FileUtils.writeStringToFile(docDestFileUpgrade, result, "utf-8");

      MetadataRecord mdRecord = new MetadataRecord();
      mdRecord.setDocId(docId);
      mdRecord.setCollectionNames(docOperation.getCollectionNames());
      mdRecord.setType("text/xml");

      // generate toc file (toc, figure, handwritten)
      XslResourceTransformer tocTransformer = new XslResourceTransformer("toc.xsl");
      File tocFile = new File(docDirName + "/toc.xml");
      String tocResult = tocTransformer.transform(docDestFileNameUpgrade);
      FileUtils.writeStringToFile(tocFile, tocResult, "utf-8");

      // Get metadata info of the xml document
      docOperation.setStatus("extract metadata of: " + srcUrlStr + " to CMS");
      XQueryEvaluator xQueryEvaluator2 = new XQueryEvaluator();
      mdRecord = getMetadataRecord(docDestFileUpgrade, docType, mdRecord, xQueryEvaluator2);
      String mdRecordLanguage = mdRecord.getLanguage();
      if (mdRecordLanguage == null && mainLanguage != null)
        mdRecord.setLanguage(mainLanguage);

      // save all pages as single xml files (untokenized and tokenized)
      docOperation.setStatus("extract page fragments of: " + srcUrlStr + " to CMS");
      File docDir = new File(docDirName + "/pages");
      FileUtils.deleteQuietly(docDir);  // first delete pages directory
      Hashtable<Integer, StringBuilder> pageFragments = getFragments(docDestFileNameUpgrade, "pb");
      int pageCount = pageFragments.size();
      if (pageCount == 0) {
        // no pb element is found: then the whole document is the first page
        String docXmlStr = FileUtils.readFileToString(docDestFileUpgrade, "utf-8");
        docXmlStr = docXmlStr.replaceAll("<\\?xml.*?\\?>", "");  // remove the xml declaration if it exists
        pageFragments = new Hashtable<Integer, StringBuilder>();
        pageFragments.put(Integer.valueOf(1), new StringBuilder(docXmlStr));
        pageCount = 1;
      }
      PageTransformer pageTransformer = new PageTransformer();
      for (int page=1; page<=pageCount; page++) {
        String fragment = pageFragments.get(Integer.valueOf(page)).toString();
        fragment = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + fragment;
        String docPageFileName = docDirName + "/pages/page-" + page + ".xml";
        File docPageFile = new File(docPageFileName);
        FileUtils.writeStringToFile(docPageFile, fragment, "utf-8");
        String language = mdRecord.getLanguage();
        String tokenizedXmlStr = tokenizeWithLemmas(fragment, language);  // xml fragment enriched with <w> elements
        tokenizedXmlStr = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + tokenizedXmlStr;
        tokenizedXmlStr = enrichWordsOrigRegNorm(tokenizedXmlStr);  // xml string: enrich <w> elements with normalization info (orig, reg, norm)
        String docPageTokenizedFileName = docDirName + "/pages/page-" + page + "-morph.xml";
        File docPageTokenizedFile = new File(docPageTokenizedFileName);
        FileUtils.writeStringToFile(docPageTokenizedFile, tokenizedXmlStr, "utf-8");
        String docPageHtmlFileName = docDirName + "/pages/page-" + page + ".html";
        File docPageHtmlFile = new File(docPageHtmlFileName);
        String htmlStr = pageTransformer.transform(tokenizedXmlStr, mdRecord, page, "html");
        FileUtils.writeStringToFile(docPageHtmlFile, htmlStr, "utf-8");
      }

      // perform operation on Lucene
      docOperation.setStatus(operationName + " document: " + docId + " in CMS");
      docOperation.setMdRecord(mdRecord);
      IndexHandler indexHandler = IndexHandler.getInstance();
      indexHandler.indexDocument(docOperation);

    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Removes a document from the CMS: its directory on the file system is deleted
   * (quietly) and afterwards the index handler is asked to delete the document.
   *
   * @param docOperation the delete operation carrying the document identifier
   * @throws ApplicationException if the identifier is empty, the document directory
   *                              does not exist or the index handler fails
   */
  private void delete(CmsDocOperation docOperation) throws ApplicationException {
    final String opName = docOperation.getName();
    final String docId = docOperation.getDocIdentifier();
    final boolean idMissing = docId == null || docId.trim().length() == 0;
    if (idMissing) {
      throw new ApplicationException("Your document identifier is empty. Please specify a document identifier for your document.");
    }
    final File documentDir = new File(getDocDir(docId));
    if (! documentDir.exists()) {
      throw new ApplicationException("Document:" + docId + " does not exists. Please use a name that exists and perform the operation \"Delete\" again.");
    }

    // file system part
    docOperation.setStatus(opName + " document: " + docId + " in CMS");
    FileUtils.deleteQuietly(documentDir);

    // Lucene part
    IndexHandler.getInstance().deleteDocument(docOperation);
  }

  /**
   * Generates the Pdf and Html version of a document from the metadata record
   * that the index handler delivers for it. Documents listed as excluded are
   * skipped with a log message.
   *
   * @param docOperation the createPdf operation carrying the document identifier
   * @throws ApplicationException if the identifier is empty or one of the handlers fails
   */
  private void createPdf(CmsDocOperation docOperation) throws ApplicationException {
    final String id = docOperation.getDocIdentifier();
    final String opName = docOperation.getName();
    final boolean idMissing = id == null || id.trim().length() == 0;
    if (idMissing) {
      throw new ApplicationException("Your document identifier is empty. Please specify a document identifier for your document.");
    }
    if (isProjectDoc(id)) {
      MetadataRecord record = IndexHandler.getInstance().getDocMetadata(id);
      docOperation.setStatus("create PDF and HTML versions of the document: " + id);
      // both flags set: generate Pdf + Html document
      PdfHandler.getInstance().createFile(true, true, record);
    } else {
      LOGGER.info("Operation: " + opName + " not performed on: " + id + ". Cause: document is excluded as project doc");
    }
  }

  /**
   * Fills the metadata record of a document by delegating to the extractor that
   * belongs to the schema name ("archimedes", "echo", "TEI" or "html"). For every
   * other schema name only the schema name "diverse" is set. In all these cases
   * the last modification date is set to the current time.
   *
   * @param xmlFile         the xml file of the document
   * @param schemaName      schema name of the document; if null the record is returned untouched
   * @param mdRecord        the record to fill
   * @param xQueryEvaluator evaluator handed over to the extractors
   * @return the filled record
   * @throws ApplicationException if the file cannot be converted to a URL or an extractor fails
   */
  private MetadataRecord getMetadataRecord(File xmlFile, String schemaName, MetadataRecord mdRecord, XQueryEvaluator xQueryEvaluator) throws ApplicationException {
    if (schemaName == null) {
      return mdRecord;
    }
    final URL docUrl;
    try {
      docUrl = xmlFile.toURI().toURL();
    } catch (MalformedURLException e) {
      throw new ApplicationException(e);
    }
    MetadataRecord filled = mdRecord;
    if ("archimedes".equals(schemaName)) {
      filled = getMetadataRecordArch(xQueryEvaluator, docUrl, filled);
    } else if ("echo".equals(schemaName)) {
      filled = getMetadataRecordEcho(xQueryEvaluator, docUrl, filled);
    } else if ("TEI".equals(schemaName)) {
      filled = getMetadataRecordTei(xQueryEvaluator, docUrl, filled);
    } else if ("html".equals(schemaName)) {
      filled = getMetadataRecordHtml(xQueryEvaluator, docUrl, filled);
    } else {
      // unknown schema: no extraction, the record is only marked as "diverse"
      filled.setSchemaName("diverse");
    }
    filled.setLastModified(new Date());
    return filled;
  }

  /**
   * Fills the metadata record from an archimedes document. Locator, author, title,
   * first lang, place and date are read from the "/archimedes//info" element; rights,
   * license and access rights are set to fixed values. The echo id is the "echodir"
   * element if present, otherwise "/permanent/archimedes/" followed by the file name
   * (without extension) of the document identifier. Finally the page count
   * ("count(//pb)") and the schema name "archimedes" are set.
   *
   * @param xQueryEvaluator evaluator used for all XPath/XQuery evaluations
   * @param srcUrl          URL of the document
   * @param mdRecord        the record to fill
   * @return the filled record
   * @throws ApplicationException if an evaluation or the echo metadata lookup fails
   */
  private MetadataRecord getMetadataRecordArch(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
    String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/archimedes//info");
    if (metadataXmlStr != null) {
      String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/locator");
      if (identifier != null)
        identifier = StringUtils.deresolveXmlEntities(identifier);
      String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/author");
      if (creator != null)
        creator = StringUtils.deresolveXmlEntities(creator);
      String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/title");
      if (title != null)
        title = StringUtils.deresolveXmlEntities(title);
      String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/lang[1]");
      if (language != null)
        language = StringUtils.deresolveXmlEntities(language);
      String place = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/place");
      if (place != null)
        place = StringUtils.deresolveXmlEntities(place);
      String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/date");
      Date date = null;
      if (yearStr != null && ! yearStr.equals("")) {
        yearStr = StringUtils.deresolveXmlEntities(yearStr);
        yearStr = new Util().toYearStr(yearStr);  // test if possible etc
        if (yearStr != null) {
          try {
            date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z");
          } catch (Exception e) {
            // best effort: a year that cannot be converted leaves the date unset
          }
        }
      }
      String rights = "open access";
      String license = "http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration";
      String accessRights = "free";

      mdRecord.setIdentifier(identifier);
      mdRecord.setLanguage(language);
      mdRecord.setCreator(creator);
      mdRecord.setTitle(title);
      mdRecord.setPublisher(place);
      mdRecord.setRights(rights);
      mdRecord.setDate(date);
      mdRecord.setLicense(license);
      mdRecord.setAccessRights(accessRights);

      // get echo metadata
      String echoDir = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/echodir");
      String docId = mdRecord.getDocId();
      String echoIdTmp = docId;
      if (docId != null && ! docId.isEmpty()) {
        // file name of the document identifier without directory and extension
        int start = docId.lastIndexOf("/");
        if (start != -1)
          start = start + 1;
        else
          start = 0;
        int end = docId.lastIndexOf(".");
        if (end == -1)
          end = docId.length();
        echoIdTmp = docId.substring(start, end);
      }
      String echoId = "/permanent/archimedes/" + echoIdTmp;
      if (echoIdTmp == null || echoIdTmp.isEmpty())
        echoId = null;
      if (echoDir != null && ! echoDir.isEmpty()) {
        echoId = echoDir;
      }
      // the lookup may answer null (request timeout): keep the record built so far instead of failing on it below
      MetadataRecord echoMdRecord = getEchoMetadata(xQueryEvaluator, echoId, mdRecord);
      if (echoMdRecord != null)
        mdRecord = echoMdRecord;
    }
    String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//pb)");
    int pageCount = Integer.valueOf(pageCountStr);
    mdRecord.setPageCount(pageCount);
    mdRecord.setSchemaName("archimedes");
    return mdRecord;
  }

  /**
   * Fills the metadata record from an echo document. Identifier, creator, title,
   * first language, date, rights, license and access rights are read from the
   * "/*:echo/*:metadata" element. The echo id is the "echodir" element if present,
   * otherwise "/permanent/library/" followed by the identifier without a leading
   * "ECHO:" part and without the text behind the last dot. Finally the page count
   * ("count(//*:pb)") and the schema name "echo" are set.
   *
   * @param xQueryEvaluator evaluator used for all XPath/XQuery evaluations
   * @param srcUrl          URL of the document
   * @param mdRecord        the record to fill
   * @return the filled record
   * @throws ApplicationException if an evaluation or the echo metadata lookup fails
   */
  private MetadataRecord getMetadataRecordEcho(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
    String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/*:echo/*:metadata");
    if (metadataXmlStr != null) {
      String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:identifier");
      if (identifier != null) {
        identifier = StringUtils.deresolveXmlEntities(identifier);
      }
      String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:creator");
      if (creator != null)
        creator = StringUtils.deresolveXmlEntities(creator);
      String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:title");
      if (title != null)
        title = StringUtils.deresolveXmlEntities(title);
      String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:language[1]");
      if (language != null)
        language = StringUtils.deresolveXmlEntities(language);
      String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:date");
      Date date = null;
      if (yearStr != null && ! yearStr.equals("")) {
        yearStr = StringUtils.deresolveXmlEntities(yearStr);
        yearStr = new Util().toYearStr(yearStr);  // test if possible etc
        if (yearStr != null) {
          try {
            date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z");
          } catch (Exception e) {
            // best effort: a year that cannot be converted leaves the date unset
          }
        }
      }
      String rights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:rights");
      if (rights != null)
        rights = StringUtils.deresolveXmlEntities(rights);
      String license = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:license");
      if (license != null)
        license = StringUtils.deresolveXmlEntities(license);
      String accessRights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:accessRights");
      if (accessRights != null)
        accessRights = StringUtils.deresolveXmlEntities(accessRights);

      mdRecord.setIdentifier(identifier);
      mdRecord.setLanguage(language);
      mdRecord.setCreator(creator);
      mdRecord.setTitle(title);
      mdRecord.setRights(rights);
      mdRecord.setDate(date);
      mdRecord.setLicense(license);
      mdRecord.setAccessRights(accessRights);

      // get echo metadata
      String echoDir = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:echodir");
      String echoIdTmp = identifier;
      if (identifier != null && ! identifier.isEmpty()) {
        // identifier without a leading "ECHO:" part and without the text behind the last dot
        int start = identifier.indexOf("ECHO:");
        if (start != -1)
          start = start + 5;
        else
          start = 0;
        int end = identifier.lastIndexOf(".");
        if (end == -1)
          end = identifier.length();
        echoIdTmp = identifier.substring(start, end);
      }
      String echoId = "/permanent/library/" + echoIdTmp;
      if (echoIdTmp == null || echoIdTmp.isEmpty())
        echoId = null;
      if (echoDir != null && ! echoDir.isEmpty()) {
        echoId = echoDir;
      }
      // the lookup may answer null (request timeout): keep the record built so far instead of failing on it below
      MetadataRecord echoMdRecord = getEchoMetadata(xQueryEvaluator, echoId, mdRecord);
      if (echoMdRecord != null)
        mdRecord = echoMdRecord;
    }
    String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//*:pb)");
    int pageCount = Integer.valueOf(pageCountStr);
    mdRecord.setPageCount(pageCount);
    mdRecord.setSchemaName("echo");
    return mdRecord;
  }

  /**
   * Fills the metadata record from a TEI document. Identifier (idno), author, title,
   * language (ident attribute of the first language element), publication place, date,
   * subject (keywords term), rights (availability, default "open access") and access
   * rights (availability status, default "free") are read from the
   * "/*:TEI/*:teiHeader" element; the license is a fixed value. The identifier is also
   * used as echo directory for the echo metadata lookup. Finally the page count
   * ("count(//*:pb)") and the schema name "TEI" are set.
   *
   * @param xQueryEvaluator evaluator used for all XPath/XQuery evaluations
   * @param srcUrl          URL of the document
   * @param mdRecord        the record to fill
   * @return the filled record
   * @throws ApplicationException if an evaluation or the echo metadata lookup fails
   */
  private MetadataRecord getMetadataRecordTei(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
    String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/*:TEI/*:teiHeader");
    if (metadataXmlStr != null) {
      String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:idno");
      if (identifier != null) {
        identifier = StringUtils.deresolveXmlEntities(identifier);
        identifier = deleteSpecialChars(identifier);
      }
      String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:titleStmt/*:author");
      if (creator != null)
        creator = StringUtils.deresolveXmlEntities(creator);
      String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:titleStmt/*:title");
      if (title != null)
        title = StringUtils.deresolveXmlEntities(title);
      String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:profileDesc/*:langUsage/*:language[1]/@ident)");
      if (language != null && language.isEmpty())
        language = null;
      if (language != null) {
        language = language.toLowerCase();
        if (language.length() == 5) {  // e.g. "de-DE or en-US"
          if (language.substring(2, 3).equals("-")) {
            // only the two letter language part in front of the hyphen is mapped
            String lang = language.substring(0, 2);
            language = Language.getInstance().getISO639Code(lang);
          }
        }
      }
      String place = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:pubPlace");
      if (place != null)
        place = StringUtils.deresolveXmlEntities(place);
      String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:date");
      Date date = null;
      if (yearStr != null && ! yearStr.equals("")) {
        yearStr = StringUtils.deresolveXmlEntities(yearStr);
        yearStr = new Util().toYearStr(yearStr);  // test if possible etc
        if (yearStr != null) {
          try {
            date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z");
          } catch (Exception e) {
            // best effort: a year that cannot be converted leaves the date unset
          }
        }
      }
      String subject = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:profileDesc/*:textClass/*:keywords/*:term)");
      if (subject != null)
        subject = StringUtils.deresolveXmlEntities(subject);
      String rights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:availability");
      if (rights == null)
        rights = "open access";
      rights = StringUtils.deresolveXmlEntities(rights);
      String license = "http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration";
      String accessRights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:fileDesc/*:publicationStmt/*:availability/@status)");
      if (accessRights == null)
        accessRights = "free";
      accessRights = StringUtils.deresolveXmlEntities(accessRights);

      mdRecord.setIdentifier(identifier);
      mdRecord.setLanguage(language);
      mdRecord.setCreator(creator);
      mdRecord.setTitle(title);
      mdRecord.setPublisher(place);
      mdRecord.setRights(rights);
      mdRecord.setDate(date);
      mdRecord.setSubject(subject);
      mdRecord.setLicense(license);
      mdRecord.setAccessRights(accessRights);

      // get echo metadata (identifier is echoDir)
      // the lookup may answer null (request timeout): keep the record built so far instead of failing on it below
      MetadataRecord echoMdRecord = getEchoMetadata(xQueryEvaluator, identifier, mdRecord);
      if (echoMdRecord != null)
        mdRecord = echoMdRecord;
    }
    String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//*:pb)");
    int pageCount = Integer.valueOf(pageCountStr);
    mdRecord.setPageCount(pageCount);
    mdRecord.setSchemaName("TEI");
    return mdRecord;
  }

  /**
   * Fills the metadata record from an html document. Identifier, creator, title,
   * language, publisher, date, subject, rights, license and access rights are read
   * from the content attributes of the "DC.*" meta elements of the "/html/head"
   * element. The identifier is also used as echo directory for the echo metadata
   * lookup. Finally the page count ("count(//pb)") and the schema name "html" are set.
   *
   * @param xQueryEvaluator evaluator used for all XPath/XQuery evaluations
   * @param srcUrl          URL of the document
   * @param mdRecord        the record to fill
   * @return the filled record
   * @throws ApplicationException if an evaluation or the echo metadata lookup fails
   */
  private MetadataRecord getMetadataRecordHtml(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
    String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/html/head");
    if (metadataXmlStr != null) {
      // NOTE(review): the other extractors of this class address the root element of the fragment
      // explicitly (e.g. "/info/...", "/*:teiHeader/..."); the expressions below start with "/meta"
      // and not with "/head/meta" - confirm that they really match.
      String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.identifier']/@content)");
      if (identifier != null && ! identifier.isEmpty())
        identifier = StringUtils.deresolveXmlEntities(identifier);
      String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.creator']/@content)");
      if (creator != null && ! creator.isEmpty())
        creator = StringUtils.deresolveXmlEntities(creator);
      String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.title']/@content)");
      if (title != null && ! title.isEmpty())
        title = StringUtils.deresolveXmlEntities(title);
      String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.language']/@content)");
      if (language != null && language.isEmpty())
        language = null;
      if (language != null && ! language.isEmpty())
        language = StringUtils.deresolveXmlEntities(language);
      String publisher = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.publisher']/@content)");
      if (publisher != null)
        publisher = StringUtils.deresolveXmlEntities(publisher);
      String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.date']/@content)");
      Date date = null;
      if (yearStr != null && ! yearStr.equals("")) {
        yearStr = StringUtils.deresolveXmlEntities(yearStr);
        yearStr = new Util().toYearStr(yearStr);  // test if possible etc
        if (yearStr != null) {
          try {
            date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z");
          } catch (Exception e) {
            // best effort: a year that cannot be converted leaves the date unset
          }
        }
      }
      String subject = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.subject']/@content)");
      if (subject != null)
        subject = StringUtils.deresolveXmlEntities(subject);
      String rights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.rights']/@content)");
      if (rights != null && ! rights.isEmpty())
        rights = StringUtils.deresolveXmlEntities(rights);
      String license = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.license']/@content)");
      if (license != null && ! license.isEmpty())
        license = StringUtils.deresolveXmlEntities(license);
      String accessRights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.accessRights']/@content)");
      if (accessRights != null && ! accessRights.isEmpty())
        accessRights = StringUtils.deresolveXmlEntities(accessRights);

      mdRecord.setIdentifier(identifier);
      mdRecord.setLanguage(language);
      mdRecord.setCreator(creator);
      mdRecord.setTitle(title);
      mdRecord.setPublisher(publisher);
      mdRecord.setRights(rights);
      mdRecord.setDate(date);
      mdRecord.setSubject(subject);
      mdRecord.setLicense(license);
      mdRecord.setAccessRights(accessRights);

      // get echo metadata (identifier is echoDir)
      // the lookup may answer null (request timeout): keep the record built so far instead of failing on it below
      MetadataRecord echoMdRecord = getEchoMetadata(xQueryEvaluator, identifier, mdRecord);
      if (echoMdRecord != null)
        mdRecord = echoMdRecord;
    }
    String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//pb)");
    int pageCount = Integer.valueOf(pageCountStr);
    mdRecord.setPageCount(pageCount);
    mdRecord.setSchemaName("html");
    return mdRecord;
  }

  /**
   * Adds the echo specific metadata (echo id, page image directory, figures directory,
   * mpiwg document id) to the record. The values are read from the "index.meta" file
   * that is requested from the digilib Texter servlet for the echo directory. If no echo
   * directory is given it is looked up by the document identifier of the record.
   * The record is returned unchanged if no echo directory can be determined or if the
   * request answers with one of the two error markers (timeout or URL error). This
   * method never returns null, because its callers use the result without a null test.
   *
   * @param xQueryEvaluator evaluator used for the evaluations on the index.meta content
   * @param echoDir         echo directory of the document; may be null or empty
   * @param mdRecord        the record to enrich
   * @return the record that was passed in
   * @throws ApplicationException if the lookup, the request or an evaluation fails
   */
  private MetadataRecord getEchoMetadata(XQueryEvaluator xQueryEvaluator, String echoDir, MetadataRecord mdRecord) throws ApplicationException {
    if (echoDir == null || echoDir.isEmpty()) {
      String docId = mdRecord.getDocId();
      echoDir = getEchoDir(xQueryEvaluator, docId);
      if (echoDir == null)
        return mdRecord;
    }
    String urLTexter = "http://digilib.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter?fn=" + echoDir + "/index.meta";
    String echoIndexMetaStr = performGetRequest(urLTexter);
    String echoPageImageDir = null;
    String echoFiguresDir = null;
    String mpiwgDocId = null;
    if (echoIndexMetaStr != null) {
      // error markers delivered instead of a response body: go on without echo metadata
      // (a timeout is handled like a URL error; answering null here would end in a NullPointerException in the callers)
      if (echoIndexMetaStr.equals("XXXXTimeoutXXXX") || echoIndexMetaStr.equals("XXXXUrlErrorXXXX"))
        return mdRecord;
      echoPageImageDir = xQueryEvaluator.evaluateAsStringValueJoined(echoIndexMetaStr, "/resource/meta/texttool/image");
      if (echoPageImageDir != null)
        echoPageImageDir = echoDir + "/" + echoPageImageDir;
      else
        echoPageImageDir = echoDir + "/" + "pageimg"; // default
      echoFiguresDir = xQueryEvaluator.evaluateAsStringValueJoined(echoIndexMetaStr, "/resource/meta/texttool/figures");
      if (echoFiguresDir != null)
        echoFiguresDir = echoDir + "/" + echoFiguresDir;
      else
        echoFiguresDir = echoDir + "/" + "figures"; // default
      mpiwgDocId = xQueryEvaluator.evaluateAsStringValueJoined(echoIndexMetaStr, "/resource/meta/dri[@type = 'mpiwg']");
    }
    mdRecord.setEchoId(echoDir);
    mdRecord.setEchoPageImageDir(echoPageImageDir);
    mdRecord.setEchoFiguresDir(echoFiguresDir);
    mdRecord.setMpiwgDocId(mpiwgDocId);
    return mdRecord;
  }

  /**
   * Looks up the echo directory of a document by querying the purl search service
   * with the docId and reading the "archive-path" of the answer.
   * @param xQueryEvaluator evaluator used to query the fetched XML
   * @param docId document id which is sent as "text-url-path"
   * @return the archive path without "/mpiwg/online"; null if the request delivered
   *         nothing, reported a timeout or an URL error, or the path is missing or empty
   * @throws ApplicationException
   */
  private String getEchoDir(XQueryEvaluator xQueryEvaluator, String docId) throws ApplicationException {
    String searchUrl = "http://md.mpiwg-berlin.mpg.de/purls/searchSolr?text-url-path=" + docId + "&format=short";
    String searchResultXml = performGetRequest(searchUrl);
    if (searchResultXml == null) {
      return null;
    }
    // marker strings delivered by performGetRequest instead of a response body
    boolean requestFailed = "XXXXTimeoutXXXX".equals(searchResultXml) || "XXXXUrlErrorXXXX".equals(searchResultXml);
    if (requestFailed) {
      return null;
    }
    String archivePath = xQueryEvaluator.evaluateAsStringValueJoined(searchResultXml, "//archive-path");
    if (archivePath == null) {
      return null;
    }
    String strippedPath = archivePath.replaceAll("/mpiwg/online", "");
    return strippedPath.isEmpty() ? null : strippedPath;
  }

  /**
   * Delivers the local name of an element child of the given node.
   * All children are visited and every element child overwrites the result, so
   * the local name of the last element child is delivered.
   * NOTE(review): the loop variable used to be called "firstChild"; confirm whether
   * the first or the last element child is really intended. Behavior is unchanged here.
   * @param node the node whose children are inspected
   * @return local name of the last element child; null if there is no element child
   */
  private String getNodeType(XdmNode node) {
    String nodeType = null;
    XdmSequenceIterator children = node.axisIterator(Axis.CHILD);
    if (children == null) {
      return null;
    }
    while (children.hasNext()) {
      XdmNode child = (XdmNode) children.next();
      // enum constants are compared directly instead of by their ordinal
      if (child != null && child.getNodeKind() == XdmNodeKind.ELEMENT) {
        QName childName = child.getNodeName();
        nodeType = childName.getLocalName();
      }
    }
    return nodeType;
  }

  /**
   * Builds the full file name of a document: its document directory, a "/" and
   * its file name.
   * @param docId document id
   * @return full file name of the document
   */
  public String getDocFullFileName(String docId) {
    return getDocDir(docId) + "/" + getDocFileName(docId);
  }

  /**
   * Builds the full name of a file which belongs to a document and lies in the
   * document directory.
   * <ul>
   * <li>type "toc": "toc.xml" in the document directory</li>
   * <li>any other non null type: the document file name with its extension replaced by the type</li>
   * <li>type null: the document file name with its extension replaced by "xml"</li>
   * </ul>
   * A document file name without any "." is used completely as base name
   * (before, such a name caused a StringIndexOutOfBoundsException).
   * @param docId document id
   * @param type file type which is used as extension, "toc", or null
   * @return full file name
   */
  public String getFullFileName(String docId, String type) {
    String docDir = getDocDir(docId);
    if ("toc".equals(type)) {
      // the toc file has a fixed name, the document file name is not needed
      return docDir + "/toc.xml";
    }
    String docFileName = getDocFileName(docId);
    int lastDot = docFileName.lastIndexOf('.');
    String baseName = docFileName;
    if (lastDot != -1) {
      baseName = docFileName.substring(0, lastDot);
    }
    String extension = (type != null) ? type : "xml";
    return docDir + "/" + baseName + "." + extension;
  }

  /**
   * Delivers the directory of a document: the configured documents directory
   * followed by the docId cut off at its last "." (the complete docId if it
   * contains no "."). A leading "/" is added to the docId part if it is missing.
   * @param docId document id
   * @return directory of the document
   */
  public String getDocDir(String docId) {
    String documentsRoot = Constants.getInstance().getDocumentsDir();
    int lastDot = docId.lastIndexOf(".");
    String relativeDir = (lastDot != -1) ? docId.substring(0, lastDot) : docId;
    if (! relativeDir.startsWith("/")) {
      relativeDir = "/" + relativeDir;
    }
    return documentsRoot + relativeDir;
  }

  /**
   * Delivers the file name part of a docId: everything behind the last "/".
   * @param docId document id
   * @return the file name; the docId itself if it contains no "/"
   */
  public String getDocFileName(String docId) {
    // lastIndexOf delivers -1 without a "/", so substring(0) keeps the whole docId
    int lastSlash = docId.lastIndexOf('/');
    return docId.substring(lastSlash + 1);
  }

  /**
   * Delivers the path segment which directly precedes the file name in a docId,
   * e.g. "la" for "/echo/la/Cataneo_1600.xml".
   * @param docId document id
   * @return the segment between the last two "/"; null if the docId contains less than two "/"
   */
  private String getMainLanguage(String docId) {
    int fileSlash = docId.lastIndexOf('/');
    if (fileSlash == -1) {
      return null;
    }
    // search the previous "/" only in the part before the file name
    int dirSlash = docId.lastIndexOf('/', fileSlash - 1);
    if (dirSlash == -1) {
      return null;
    }
    return docId.substring(dirSlash + 1, fileSlash);
  }

  /**
   * Removes all occurrences of the characters '@', ' ' and ';' from the input string.
   * All other characters are kept in their original order.
   * @param inputStr the string to clean
   * @return the input string without the special characters
   */
  private String deleteSpecialChars(String inputStr) {
    StringBuilder buf = new StringBuilder(inputStr.length());
    for (int i = 0; i < inputStr.length(); i++) {
      char c = inputStr.charAt(i);
      // append the char directly; no temporary String per character is needed
      if (c != '@' && c != ' ' && c != ';') {
        buf.append(c);
      }
    }
    return buf.toString();
  }

  /**
   * Reads the given file as utf-8 text, parses it with a SAX parser which feeds a
   * GetFragmentsContentHandler (built for the given milestone element name) and
   * delivers the result pages collected by that handler.
   * @param fileName name of the XML file
   * @param milestoneElementName element name which is passed to the content handler
   * @return the result pages of the content handler
   * @throws ApplicationException if reading or parsing fails
   */
  private Hashtable<Integer, StringBuilder> getFragments(String fileName, String milestoneElementName) throws ApplicationException {
    try {
      GetFragmentsContentHandler fragmentsHandler = new GetFragmentsContentHandler(milestoneElementName);
      XMLReader reader = new SAXParser();
      reader.setContentHandler(fragmentsHandler);
      String fileContent = FileUtils.readFileToString(new File(fileName), "utf-8");
      reader.parse(new InputSource(new StringReader(fileContent)));
      return fragmentsHandler.getResultPages();
    } catch (SAXException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Tokenizes the given XML string with an XmlTokenizer for the given language,
   * using the output option "withLemmas", and delivers the XML result of the tokenizer.
   * @param xmlStr the XML string to tokenize
   * @param language language which is set on the tokenizer
   * @return XML result of the tokenizer
   * @throws ApplicationException
   */
  private String tokenizeWithLemmas(String xmlStr, String language) throws ApplicationException {
    XmlTokenizer tokenizer = new XmlTokenizer(new StringReader(xmlStr));
    tokenizer.setLanguage(language);
    // non word breaking elements;
    // TODO examine bugs with emph, figure, hi :
    // e.g. "... der <hi rend="i">Capi-<lb n="16"/>talist.</hi> Es ..."
    // e.g. page 30 in /echo/la/Cataneo_1600.xml
    String[] nonWordBreakingElements = {"lb", "br", "cb"};
    tokenizer.setNWBElements(nonWordBreakingElements);
    // so all tokens are fetched with lemmas (costs performance)
    String[] outputOptions = {"withLemmas"};
    tokenizer.setOutputOptions(outputOptions);
    tokenizer.tokenize();
    return tokenizer.getXmlResult();
  }

  /**
   * Parses the given XML string with a SAX parser which feeds a WordContentHandler
   * and delivers the result of that handler.
   * @param xmlStr the XML string to process
   * @return result of the WordContentHandler
   * @throws ApplicationException if parsing fails
   */
  private String enrichWordsOrigRegNorm(String xmlStr) throws ApplicationException {
    try {
      WordContentHandler wordHandler = new WordContentHandler();
      XMLReader reader = new SAXParser();
      reader.setContentHandler(wordHandler);
      reader.parse(new InputSource(new StringReader(xmlStr)));
      return wordHandler.getResult();
    } catch (SAXException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Performs a HTTP GET request on the given url and delivers the response body
   * as utf-8 decoded string.
   * Before the request is performed the url is checked with a timeout of 2 seconds.
   * @param url the url to request
   * @return the response body; "XXXXTimeoutXXXX" if the url check fails;
   *         "XXXXUrlErrorXXXX" if the status code is 400 or higher;
   *         null if the response has no body
   * @throws ApplicationException if the request fails
   */
  private String performGetRequest(String url) throws ApplicationException {
    GetMethod method = null;
    try {
      boolean urlIsOk = checkUri(url, 2000); // if url doesn't answer after 2 seconds
      if (! urlIsOk)
        return "XXXXTimeoutXXXX";
      HttpClient httpClient = new HttpClient();
      method = new GetMethod(url);
      httpClient.executeMethod(method);
      int statusCode = method.getStatusCode();
      if (statusCode >= 400)
        return "XXXXUrlErrorXXXX";
      byte[] resultBytes = method.getResponseBody();
      if (resultBytes == null)
        return null;  // no body available: callers treat null as "nothing delivered"
      return new String(resultBytes, "utf-8");
    } catch (HttpException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    } finally {
      // release the connection on every path, also on error status and exceptions
      if (method != null)
        method.releaseConnection();
    }
  }

  /**
   * Checks if the given uri answers to a HTTP GET request within the given timeout.
   * The timeout is used as connection timeout and as socket timeout (SO_TIMEOUT).
   * @param uriStr the uri to check
   * @param timeoutMilliseconds timeout in milliseconds; zero means no timeout is used
   * @return true if the request could be executed; false if an IOException
   *         (e.g. a timeout) occurred
   * @throws ApplicationException if uriStr is not a valid uri
   */
  private boolean checkUri(String uriStr, int timeoutMilliseconds) throws ApplicationException {
    DefaultHttpClient httpClient = null;
    try {
      URI uri = new URI(uriStr);
      HttpGet httpGet = new HttpGet(uri);
      HttpParams httpParameters = new BasicHttpParams();
      // Set the timeout in milliseconds until a connection is established.
      // The default value is zero, that means the timeout is not used.
      HttpConnectionParams.setConnectionTimeout(httpParameters, timeoutMilliseconds);
      // Set the default socket timeout (SO_TIMEOUT)
      // in milliseconds which is the timeout for waiting for data.
      HttpConnectionParams.setSoTimeout(httpParameters, timeoutMilliseconds);
      httpClient = new DefaultHttpClient(httpParameters);
      httpClient.execute(httpGet);  // only success or failure matters, the response is not used
      return true;
    } catch (IOException e) {
      return false;  // if timeout exception is thrown
    } catch (URISyntaxException e) {
      throw new ApplicationException(e);
    } finally {
      // the client is only used for this check: free its connections
      if (httpClient != null)
        httpClient.getConnectionManager().shutdown();
    }
  }

  /**
   * Write string utf-8 encoded into destFile. If directory for that destFile does
   * not exist it creates this directory including parent directories. A destFileName
   * without a directory part is written relative to the current working directory.
   * @param str string to write; if null nothing is done
   * @param destFileName destination file name
   * @throws ApplicationException if the file could not be opened or written
   */
  private void saveFile(String str, String destFileName) throws ApplicationException {
    if (str == null)
      return;  // do nothing
    OutputStreamWriter out = null;
    try {
      File destFile = new File(destFileName);
      // getParentFile delivers null for a name without directory part
      File destDir = destFile.getParentFile();
      if (destDir != null && ! destDir.exists()) {
        destDir.mkdirs();  // create the directory including parent directories which do not exist
      }
      out = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(destFile)), "utf-8");
      out.write(str);
      out.flush();
    } catch (IOException e) {
      // covers also FileNotFoundException of the FileOutputStream
      throw new ApplicationException(e);
    } finally {
      try {
        if (out != null)
          out.close();
      } catch (IOException ignored) {
        // nothing: content is already flushed, closing is best effort
      }
    }
  }

  /**
   * Stores the current time in milliseconds as begin of the operation.
   */
  private void beginOperation() {
    beginOfOperation = System.currentTimeMillis();
  }

  /**
   * Stores the current time in milliseconds as end of the operation.
   */
  private void endOperation() {
    endOfOperation = System.currentTimeMillis();
  }

}
author	Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date	Tue, 21 May 2013 10:19:32 +0200
parents
children