Mercurial > hg > mpdl-group

diff software/mpdl-services-new/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/document/DocumentHandler.java @ 25:e9fe3186670c default tip
letzter Stand eingecheckt
author: Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date: Tue, 21 May 2013 10:19:32 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services-new/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/document/DocumentHandler.java	Tue May 21 10:19:32 2013 +0200
@@ -0,0 +1,927 @@
+package de.mpg.mpiwg.berlin.mpdl.cms.document;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.Hashtable;
+import java.util.Iterator;
+import java.util.List;
+import java.util.logging.Logger;
+
+import net.sf.saxon.s9api.Axis;
+import net.sf.saxon.s9api.QName;
+import net.sf.saxon.s9api.XdmNode;
+import net.sf.saxon.s9api.XdmNodeKind;
+import net.sf.saxon.s9api.XdmSequenceIterator;
+
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.HttpException;
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.commons.io.FileUtils;
+import org.apache.http.HttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.DefaultHttpClient;
+import org.apache.http.params.BasicHttpParams;
+import org.apache.http.params.HttpConnectionParams;
+import org.apache.http.params.HttpParams;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+import com.sun.org.apache.xerces.internal.parsers.SAXParser;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
+import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler;
+import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;
+import de.mpg.mpiwg.berlin.mpdl.util.StringUtils;
+import de.mpg.mpiwg.berlin.mpdl.util.Util;
+import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator;
+import de.mpg.mpiwg.berlin.mpdl.cms.general.Constants;
+import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler;
+import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsDocOperation;
+import de.mpg.mpiwg.berlin.mpdl.cms.transform.GetFragmentsContentHandler;
+import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer;
+import de.mpg.mpiwg.berlin.mpdl.cms.transform.XslResourceTransformer;
+
+/**
+ * Handler for documents (singleton). 
+ */
+public class DocumentHandler {
+  // Logger shared by all instances, named after this class.
+  private static Logger LOGGER = Logger.getLogger(DocumentHandler.class.getName()); 
+  // Document ids for which isProjectDoc() answers false, so that create() and createPdf() skip them.
+  private static List<String> EXCLUDED_PROJECT_DOCS = 
+    Arrays.asList("/echo/zh/Min_chan_luyi_1_7MCGW0WG.xml", // the Saxon transformer has heavy problems with some characters in CJK Unified Ideographs Extension B, e.g.: line 309 (second reg on page 16)
+     "/echo/zh/Min_chan_luyi_2_U7Y9NQ9V.xml", 
+     "/echo/zh/Min_chan_luyi_3_2FP9M172.xml", 
+     "/echo/zh/Min_chan_luyi_4_FXA6FSFH.xml",
+     "/echo/zh/Min_chan_luyi_5_VG6NY5XD.xml",
+     "/echo/zh/Xifa_shenji.xml",
+     "/echo/zh/Yulei_tushuo_1_HXX4MGZW.xml",
+     "/echo/zh/Yulei_tushuo_2_FN1CTY5C.xml");
+  // Their difference (end - begin) is logged as the duration in ms of a directory operation.
+  // NOTE(review): presumably assigned by beginOperation() / endOperation(), which are not visible here - confirm.
+  private long beginOfOperation;
+  private long endOfOperation;
+  
+  
+  /**
+   * Dispatches a document operation to the handler method that matches its name.
+   * Handled names are "create", "delete", "importDirectory", "createPdf" and
+   * "createAllPdfInDirectory"; an operation with any other name is ignored.
+   *
+   * @param docOperation the operation to execute
+   * @throws ApplicationException if the selected handler fails
+   */
+  public void doOperation(CmsDocOperation docOperation) throws ApplicationException {
+    String name = docOperation.getName();
+    if (name.equals("create")) {
+      create(docOperation);
+      return;
+    }
+    if (name.equals("delete")) {
+      delete(docOperation);
+      return;
+    }
+    if (name.equals("importDirectory")) {
+      importDirectory(docOperation);
+      return;
+    }
+    if (name.equals("createPdf")) {
+      createPdf(docOperation);
+      return;
+    }
+    if (name.equals("createAllPdfInDirectory")) {
+      createAllPdfInDirectory(docOperation);
+    }
+  }
+  
+  /**
+   * Imports every "xml" file found (recursively) below the local directory given
+   * by the source URL of the operation. For each file a "create" operation is
+   * built and executed; the document id is the file path relative to the start
+   * directory (e.g. /tei/de/Test_1789.xml).
+   * A failure of a single document is logged (with its stack trace) and does
+   * not stop the import of the remaining documents.
+   *
+   * @param docOperation operation carrying the start directory URL (file:/a/local/directory)
+   *                     and the collection names (e.g. "echo")
+   * @throws ApplicationException if the start directory does not exist or the import
+   *                              as a whole cannot be performed
+   */
+  private void importDirectory(CmsDocOperation docOperation) throws ApplicationException {
+    try {
+      LOGGER.info("Start of DocumentHandler. This operation could be time consuming because documents are indexed (normal indexing times are 1-10 minutes for a document)");
+      beginOperation();
+      String localDocumentsUrlStr = docOperation.getSrcUrl(); // start directory: file:/a/local/directory
+      String collectionNames = docOperation.getCollectionNames();  // e.g. "echo"
+      File localDocumentsDir = new File(new URI(localDocumentsUrlStr));
+      if (! localDocumentsDir.exists()) 
+        throw new ApplicationException("Document directory:" + localDocumentsUrlStr + " does not exists. Please use a directory that exists and perform the operation again.");
+      String[] fileExtensions = {"xml"};
+      // length of the start directory path: everything behind it is the relative document id
+      int relativePos = localDocumentsDir.getPath().length();
+      Iterator<File> iterFiles = FileUtils.iterateFiles(localDocumentsDir, fileExtensions, true);
+      int i = 0;
+      while (iterFiles.hasNext()) {
+        i++;
+        File xmlFile = iterFiles.next();
+        String docId = xmlFile.getPath().substring(relativePos);  // relative path name starting from localDocumentsDir, e.g. /tei/de/Test_1789.xml
+        String xmlFileUrlStr = xmlFile.toURI().toURL().toString();
+        CmsDocOperation createDocOperation = new CmsDocOperation("create", xmlFileUrlStr, null, docId);
+        createDocOperation.setCollectionNames(collectionNames);
+        try {
+          doOperation(createDocOperation);
+          Date now = new Date();
+          LOGGER.info("Document " + i + ": " + docId + " successfully imported (" + now.toString() + ")");
+        } catch (Exception e) {
+          // best effort: report the failing document through the logger (not stderr) and go on with the next one
+          LOGGER.log(java.util.logging.Level.WARNING, "Document " + i + ": " + docId + " has problems:", e);
+        }
+      }
+      endOperation();
+      LOGGER.info("The DocumentHandler needed: " + (endOfOperation - beginOfOperation) + " ms" );
+    } catch (ApplicationException e) {
+      // already an application level error (e.g. missing directory): do not wrap it a second time
+      throw e;
+    } catch (Exception e) {
+      throw new ApplicationException(e);
+    }
+  }
+  
+  /**
+   * Creates the Pdf version for every "xml" file found (recursively) below the
+   * local directory given by the source URL of the operation. For each file a
+   * "createPdf" operation is built and executed; the document id is the file
+   * path relative to the start directory (e.g. /tei/de/Test_1789.xml).
+   * A failure of a single document is logged (with its stack trace) and does
+   * not stop the processing of the remaining documents.
+   *
+   * @param docOperation operation carrying the start directory URL (file:/a/local/directory)
+   *                     and the collection names (e.g. "echo")
+   * @throws ApplicationException if the start directory does not exist or the operation
+   *                              as a whole cannot be performed
+   */
+  private void createAllPdfInDirectory(CmsDocOperation docOperation) throws ApplicationException {
+    try {
+      LOGGER.info("Start of generating Pdf-Documents. This operation could be time consuming because Pdf generation needs some time.");
+      beginOperation();
+      String localDocumentsUrlStr = docOperation.getSrcUrl(); // start directory: file:/a/local/directory
+      String collectionNames = docOperation.getCollectionNames();  // e.g. "echo"
+      File localDocumentsDir = new File(new URI(localDocumentsUrlStr));
+      if (! localDocumentsDir.exists()) 
+        throw new ApplicationException("Document directory:" + localDocumentsUrlStr + " does not exists. Please use a directory that exists and perform the operation again.");
+      String[] fileExtensions = {"xml"};
+      // length of the start directory path: everything behind it is the relative document id
+      int relativePos = localDocumentsDir.getPath().length();
+      Iterator<File> iterFiles = FileUtils.iterateFiles(localDocumentsDir, fileExtensions, true);
+      int i = 0;
+      while (iterFiles.hasNext()) {
+        i++;
+        File xmlFile = iterFiles.next();
+        String docId = xmlFile.getPath().substring(relativePos);  // relative path name starting from localDocumentsDir, e.g. /tei/de/Test_1789.xml
+        CmsDocOperation createPdfOperation = new CmsDocOperation("createPdf", null, null, docId);
+        createPdfOperation.setCollectionNames(collectionNames);
+        try {
+          doOperation(createPdfOperation);
+          Date now = new Date();
+          LOGGER.info("Pdf document " + i + ": " + docId + " successfully created (" + now.toString() + ")");
+        } catch (Exception e) {
+          // best effort: report the failing document through the logger (not stderr) and go on with the next one
+          LOGGER.log(java.util.logging.Level.WARNING, "Pdf document " + i + ": " + docId + " has problems:", e);
+        }
+      }
+      endOperation();
+      LOGGER.info("The Pdf generation needed: " + (endOfOperation - beginOfOperation) + " ms" );
+    } catch (ApplicationException e) {
+      // already an application level error (e.g. missing directory): do not wrap it a second time
+      throw e;
+    } catch (Exception e) {
+      throw new ApplicationException(e);
+    }
+  }
+  
+  /**
+   * Tells whether a document takes part in processing.
+   *
+   * @param docId document identifier, e.g. /echo/zh/Xifa_shenji.xml
+   * @return false if the identifier is listed in EXCLUDED_PROJECT_DOCS, true otherwise
+   */
+  private boolean isProjectDoc(String docId) {
+    return ! EXCLUDED_PROJECT_DOCS.contains(docId);
+  }
+  
+  /**
+   * Creates a document in the CMS from the source URL of the operation.
+   * Steps, in this order:
+   * <ol>
+   *   <li>skip documents excluded by isProjectDoc()</li>
+   *   <li>parse the source (validation) and determine its type</li>
+   *   <li>copy the source to the document file of the CMS</li>
+   *   <li>write the ".upgrade" file (replaceAnchor.xsl) and the toc.xml file (toc.xsl)</li>
+   *   <li>extract the metadata record</li>
+   *   <li>write each page as page-N.xml, page-N-morph.xml (tokenized) and page-N.html
+   *       into the "pages" directory; a document without "pb" elements is one page</li>
+   *   <li>index the document via IndexHandler</li>
+   * </ol>
+   * If the document type cannot be determined, an error message is set on the
+   * operation and the method returns without creating anything.
+   *
+   * @param docOperation operation with source URL, document identifier and optional
+   *                     main language / element names / collection names
+   * @throws ApplicationException if the source URL is missing, or if parsing, file
+   *                              handling, transformation or indexing fails
+   */
+  private void create(CmsDocOperation docOperation) throws ApplicationException {
+    try {
+      String operationName = docOperation.getName();  
+      String srcUrlStr = docOperation.getSrcUrl(); 
+      String docId = docOperation.getDocIdentifier();
+      if (! isProjectDoc(docId)) {
+        LOGGER.info("Operation: " + operationName + " not performed on: " + docId + ". Cause: document is excluded as project doc");
+        return;
+      }
+      String mainLanguage = docOperation.getMainLanguage();
+      if (mainLanguage == null) {
+        mainLanguage = getMainLanguage(docId);
+      }
+      String[] elementNames = docOperation.getElementNames();
+      if (elementNames == null) {
+        String[] defaultElementNames = {"s", "head", "caption", "variables", "description"};
+        docOperation.setElementNames(defaultElementNames); // default
+      }
+      String docDirName = getDocDir(docId);
+      String docDestFileName = getDocFullFileName(docId); 
+      URL srcUrl = null;
+      String protocol = null;
+      if (srcUrlStr != null && ! srcUrlStr.equals("empty")) {
+        srcUrl = new URL(srcUrlStr);
+        protocol = srcUrl.getProtocol();
+      }
+      // without a source there is nothing to parse or copy: fail with a clear message instead of a NullPointerException
+      if (srcUrl == null)
+        throw new ApplicationException("Source URL of document: " + docId + " is empty. Please specify a source URL and perform the operation again.");
+      File docDestFile = new File(docDestFileName);
+      // parse validation on file
+      XQueryEvaluator xQueryEvaluator = new XQueryEvaluator();
+      XdmNode docNode = xQueryEvaluator.parse(srcUrl); // if it is not parseable an exception with a detail message is thrown 
+      String docType = getNodeType(docNode);  // archimedes, echo, TEI, html ... 
+      // the null test has to come before trim(), otherwise it can never be reached
+      if (docType == null) {
+        docOperation.setErrorMessage("file type of: " + srcUrlStr + " is not supported");
+        return;
+      }
+      docType = docType.trim();
+      // perform operation on file system
+      if ("file".equals(protocol)) {
+        docOperation.setStatus("upload file: " + srcUrlStr + " to CMS");
+      } else {
+        docOperation.setStatus("download file from: " + srcUrlStr + " to CMS");
+      }
+      FileUtils.copyURLToFile(srcUrl, docDestFile, 100000, 100000);
+
+      // replace anchor in echo documents and also add the number attribute to figures
+      String docDestFileNameUpgrade = docDestFileName + ".upgrade";
+      File docDestFileUpgrade = new File(docDestFileNameUpgrade);
+      XslResourceTransformer replaceAnchorTransformer = new XslResourceTransformer("replaceAnchor.xsl");
+      String docDestFileUrlStr = docDestFile.getPath();
+      String result = replaceAnchorTransformer.transform(docDestFileUrlStr);
+      FileUtils.writeStringToFile(docDestFileUpgrade, result, "utf-8");
+
+      MetadataRecord mdRecord = new MetadataRecord();
+      mdRecord.setDocId(docId);
+      mdRecord.setCollectionNames(docOperation.getCollectionNames());
+      mdRecord.setType("text/xml");
+
+      // generate toc file (toc, figure, handwritten)
+      XslResourceTransformer tocTransformer = new XslResourceTransformer("toc.xsl");
+      File tocFile = new File(docDirName + "/toc.xml");
+      String tocResult = tocTransformer.transform(docDestFileNameUpgrade);
+      FileUtils.writeStringToFile(tocFile, tocResult, "utf-8");
+
+      // Get metadata info of the xml document
+      docOperation.setStatus("extract metadata of: " + srcUrlStr + " to CMS");
+      XQueryEvaluator xQueryEvaluator2 = new XQueryEvaluator();
+      mdRecord = getMetadataRecord(docDestFileUpgrade, docType, mdRecord, xQueryEvaluator2);
+      // the language of the document metadata wins; the main language is only the fallback
+      String mdRecordLanguage = mdRecord.getLanguage();
+      if (mdRecordLanguage == null && mainLanguage != null)
+        mdRecord.setLanguage(mainLanguage);
+
+      // save all pages as single xml files (untokenized and tokenized)
+      docOperation.setStatus("extract page fragments of: " + srcUrlStr + " to CMS");
+      File docDir = new File(docDirName + "/pages");
+      FileUtils.deleteQuietly(docDir);  // first delete pages directory
+      Hashtable<Integer, StringBuilder> pageFragments = getFragments(docDestFileNameUpgrade, "pb");
+      int pageCount = pageFragments.size();
+      if (pageCount == 0) {
+        // no pb element is found: then the whole document is the first page
+        String docXmlStr = FileUtils.readFileToString(docDestFileUpgrade, "utf-8");
+        docXmlStr = docXmlStr.replaceAll("<\\?xml.*?\\?>", "");  // remove the xml declaration if it exists
+        pageFragments = new Hashtable<Integer, StringBuilder>();
+        pageFragments.put(Integer.valueOf(1), new StringBuilder(docXmlStr));
+        pageCount = 1;
+      }
+      PageTransformer pageTransformer = new PageTransformer();
+      for (int page=1; page<=pageCount; page++) {
+        String fragment = pageFragments.get(Integer.valueOf(page)).toString();
+        fragment = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + fragment;
+        String docPageFileName = docDirName + "/pages/page-" + page + ".xml";
+        File docPageFile = new File(docPageFileName);
+        FileUtils.writeStringToFile(docPageFile, fragment, "utf-8");
+        String language = mdRecord.getLanguage();
+        String tokenizedXmlStr = tokenizeWithLemmas(fragment, language);  // xml fragment enriched with <w> elements
+        tokenizedXmlStr = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + tokenizedXmlStr;
+        tokenizedXmlStr = enrichWordsOrigRegNorm(tokenizedXmlStr);  // xml string: enrich <w> elements with normalization info (orig, reg, norm)
+        String docPageTokenizedFileName = docDirName + "/pages/page-" + page + "-morph.xml";
+        File docPageTokenizedFile = new File(docPageTokenizedFileName);
+        FileUtils.writeStringToFile(docPageTokenizedFile, tokenizedXmlStr, "utf-8");
+        String docPageHtmlFileName = docDirName + "/pages/page-" + page + ".html";
+        File docPageHtmlFile = new File(docPageHtmlFileName);
+        String htmlStr = pageTransformer.transform(tokenizedXmlStr, mdRecord, page, "html");
+        FileUtils.writeStringToFile(docPageHtmlFile, htmlStr, "utf-8");
+      }
+
+      // perform operation on Lucene
+      docOperation.setStatus(operationName + " document: " + docId + " in CMS");
+      docOperation.setMdRecord(mdRecord);
+      IndexHandler indexHandler = IndexHandler.getInstance();
+      indexHandler.indexDocument(docOperation);
+
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+  }
+  
+  /**
+   * Deletes a document: first its directory in the file system, then its
+   * entry in the Lucene index.
+   *
+   * @param docOperation operation carrying the identifier of the document to delete
+   * @throws ApplicationException if the identifier is empty, the document directory
+   *                              does not exist, or the index operation fails
+   */
+  private void delete(CmsDocOperation docOperation) throws ApplicationException {
+    String opName = docOperation.getName();
+    String docId = docOperation.getDocIdentifier();
+    boolean idMissing = docId == null || docId.trim().isEmpty();
+    if (idMissing) {
+      throw new ApplicationException("Your document identifier is empty. Please specify a document identifier for your document.");
+    }
+    File targetDir = new File(getDocDir(docId));
+    if (! targetDir.exists()) {
+      throw new ApplicationException("Document:" + docId + " does not exists. Please use a name that exists and perform the operation \"Delete\" again.");
+    }
+
+    // file system part
+    docOperation.setStatus(opName + " document: " + docId + " in CMS");
+    FileUtils.deleteQuietly(targetDir);
+
+    // Lucene part
+    IndexHandler.getInstance().deleteDocument(docOperation);
+  }
+  
+  /**
+   * Generates the Pdf and Html versions of a document, based on the
+   * metadata record fetched from the index for its identifier.
+   * Documents excluded by isProjectDoc() are skipped (an info message is logged).
+   *
+   * @param docOperation operation carrying the document identifier
+   * @throws ApplicationException if the identifier is empty or the generation fails
+   */
+  private void createPdf(CmsDocOperation docOperation) throws ApplicationException {
+    String id = docOperation.getDocIdentifier();
+    String opName = docOperation.getName();
+    boolean idMissing = id == null || id.trim().isEmpty();
+    if (idMissing) {
+      throw new ApplicationException("Your document identifier is empty. Please specify a document identifier for your document.");
+    }
+    boolean excluded = ! isProjectDoc(id);
+    if (excluded) {
+      LOGGER.info("Operation: " + opName + " not performed on: " + id + ". Cause: document is excluded as project doc");
+      return;
+    }
+    MetadataRecord docMetadata = IndexHandler.getInstance().getDocMetadata(id);
+    docOperation.setStatus("create PDF and HTML versions of the document: " + id);
+    PdfHandler.getInstance().createFile(true, true, docMetadata);  // both flags set: Pdf + Html document
+  }
+  
+  /**
+   * Fills the metadata record from the given xml file, using the extraction
+   * method that belongs to the schema ("archimedes", "echo", "TEI" or "html").
+   * For every other schema only the schema name "diverse" is set.
+   * Finally the last modification date is set to the current time.
+   *
+   * @param xmlFile         the xml file to read the metadata from
+   * @param schemaName      schema of the file; if null the record is returned untouched
+   * @param mdRecord        the record to fill
+   * @param xQueryEvaluator evaluator used for the metadata queries
+   * @return the filled metadata record
+   * @throws ApplicationException if the file URL is malformed or the extraction fails
+   */
+  private MetadataRecord getMetadataRecord(File xmlFile, String schemaName, MetadataRecord mdRecord, XQueryEvaluator xQueryEvaluator) throws ApplicationException {
+    if (schemaName == null) {
+      return mdRecord;
+    }
+    URL fileUrl;
+    try {
+      fileUrl = xmlFile.toURI().toURL();
+    } catch (MalformedURLException e) {
+      throw new ApplicationException(e);
+    }
+    if ("archimedes".equals(schemaName)) {
+      mdRecord = getMetadataRecordArch(xQueryEvaluator, fileUrl, mdRecord);
+    } else if ("echo".equals(schemaName)) {
+      mdRecord = getMetadataRecordEcho(xQueryEvaluator, fileUrl, mdRecord);
+    } else if ("TEI".equals(schemaName)) {
+      mdRecord = getMetadataRecordTei(xQueryEvaluator, fileUrl, mdRecord);
+    } else if ("html".equals(schemaName)) {
+      mdRecord = getMetadataRecordHtml(xQueryEvaluator, fileUrl, mdRecord);
+    } else {
+      mdRecord.setSchemaName("diverse"); // unknown schema: no metadata extraction, only the generic schema name
+    }
+    mdRecord.setLastModified(new Date());
+    return mdRecord;
+  }
+
+  /**
+   * Extracts the metadata of an archimedes document (element /archimedes//info)
+   * into the record: identifier (locator), creator (author), title, language
+   * (first lang), publisher (place) and date (January 1st of the year in date).
+   * Rights, license and access rights are set to fixed open access values.
+   * The echo metadata is looked up with the value of /info/echodir or, if that
+   * is empty, with "/permanent/archimedes/" followed by the file name of the
+   * document id without its extension.
+   * Independent of the info element the page count (number of pb elements) and
+   * the schema name "archimedes" are set.
+   *
+   * @param xQueryEvaluator evaluator used for the queries
+   * @param srcUrl          URL of the xml document
+   * @param mdRecord        the record to fill
+   * @return the filled metadata record
+   * @throws ApplicationException if a query fails or the echo metadata cannot be retrieved
+   */
+  private MetadataRecord getMetadataRecordArch(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
+    String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/archimedes//info");
+    if (metadataXmlStr != null) {
+      String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/locator");
+      if (identifier != null)
+        identifier = StringUtils.deresolveXmlEntities(identifier);
+      String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/author");
+      if (creator != null)
+        creator = StringUtils.deresolveXmlEntities(creator);
+      String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/title");
+      if (title != null)
+        title = StringUtils.deresolveXmlEntities(title);
+      String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/lang[1]");
+      if (language != null)
+        language = StringUtils.deresolveXmlEntities(language);
+      String place = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/place");
+      if (place != null)
+        place = StringUtils.deresolveXmlEntities(place);
+      String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/date");
+      Date date = null; 
+      if (yearStr != null && ! yearStr.equals("")) {
+        yearStr = StringUtils.deresolveXmlEntities(yearStr);
+        yearStr = new Util().toYearStr(yearStr);  // test if possible etc
+        if (yearStr != null) {
+          try {
+            date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z");
+          } catch (Exception ignored) {
+            // deliberately ignored: a year that cannot be converted leaves the date unset
+          }
+        }
+      }
+      String rights = "open access";
+      String license = "http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration";
+      String accessRights = "free";
+
+      mdRecord.setIdentifier(identifier);
+      mdRecord.setLanguage(language);
+      mdRecord.setCreator(creator);
+      mdRecord.setTitle(title);
+      mdRecord.setPublisher(place);
+      mdRecord.setRights(rights);
+      mdRecord.setDate(date);
+      mdRecord.setLicense(license);
+      mdRecord.setAccessRights(accessRights);
+
+      // get echo metadata
+      String echoDir = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/echodir");
+      String docId = mdRecord.getDocId();
+      String echoIdTmp = docId;
+      if (docId != null && ! docId.isEmpty()) {
+        // file name of the document id without directory and extension
+        int start = docId.lastIndexOf("/");
+        if (start != -1)
+          start = start + 1;
+        else 
+          start = 0;
+        int end = docId.lastIndexOf(".");
+        if (end == -1)
+          end = docId.length();
+        echoIdTmp = docId.substring(start, end);
+      }
+      String echoId = "/permanent/archimedes/" + echoIdTmp;
+      if (echoIdTmp == null || echoIdTmp.isEmpty())
+        echoId = null;
+      if (echoDir != null && ! echoDir.isEmpty()) {
+        echoId = echoDir;  // an explicit echodir wins over the derived id
+      }
+      // getEchoMetadata can return null: fail with a clear message instead of a NullPointerException below
+      MetadataRecord echoMdRecord = getEchoMetadata(xQueryEvaluator, echoId, mdRecord);
+      if (echoMdRecord == null)
+        throw new ApplicationException("Echo metadata of document: " + docId + " could not be retrieved. Please perform the operation again.");
+      mdRecord = echoMdRecord;
+    }
+    String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//pb)");
+    int pageCount = Integer.parseInt(pageCountStr);
+    mdRecord.setPageCount(pageCount);
+    mdRecord.setSchemaName("archimedes");
+    return mdRecord;
+  }
+  
+  /**
+   * Extracts the metadata of an echo document (element /echo/metadata, any
+   * namespace) into the record: identifier, creator, title, language (first
+   * one), date (January 1st of the year in date), rights, license and access
+   * rights.
+   * The echo metadata is looked up with the value of echodir or, if that is
+   * empty, with "/permanent/library/" followed by the part of the identifier
+   * behind "ECHO:" and before its last ".".
+   * Independent of the metadata element the page count (number of pb elements)
+   * and the schema name "echo" are set.
+   *
+   * @param xQueryEvaluator evaluator used for the queries
+   * @param srcUrl          URL of the xml document
+   * @param mdRecord        the record to fill
+   * @return the filled metadata record
+   * @throws ApplicationException if a query fails or the echo metadata cannot be retrieved
+   */
+  private MetadataRecord getMetadataRecordEcho(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
+    String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/*:echo/*:metadata");
+    if (metadataXmlStr != null) {
+      String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:identifier");
+      if (identifier != null) {
+        identifier = StringUtils.deresolveXmlEntities(identifier);
+      }
+      String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:creator");
+      if (creator != null)
+        creator = StringUtils.deresolveXmlEntities(creator);
+      String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:title");
+      if (title != null)
+        title = StringUtils.deresolveXmlEntities(title);
+      String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:language[1]");
+      if (language != null)
+        language = StringUtils.deresolveXmlEntities(language);
+      String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:date");
+      Date date = null; 
+      if (yearStr != null && ! yearStr.equals("")) {
+        yearStr = StringUtils.deresolveXmlEntities(yearStr);
+        yearStr = new Util().toYearStr(yearStr);  // test if possible etc
+        if (yearStr != null) {
+          try {
+            date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z");
+          } catch (Exception ignored) {
+            // deliberately ignored: a year that cannot be converted leaves the date unset
+          }
+        }
+      }
+      String rights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:rights");
+      if (rights != null)
+        rights = StringUtils.deresolveXmlEntities(rights);
+      String license = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:license");
+      if (license != null)
+        license = StringUtils.deresolveXmlEntities(license);
+      String accessRights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:accessRights");
+      if (accessRights != null)
+        accessRights = StringUtils.deresolveXmlEntities(accessRights);
+
+      mdRecord.setIdentifier(identifier);
+      mdRecord.setLanguage(language);
+      mdRecord.setCreator(creator);
+      mdRecord.setTitle(title);
+      mdRecord.setRights(rights);
+      mdRecord.setDate(date);
+      mdRecord.setLicense(license);
+      mdRecord.setAccessRights(accessRights);
+
+      // get echo metadata
+      String echoDir = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:echodir");
+      String echoIdTmp = identifier;
+      if (identifier != null && ! identifier.isEmpty()) {
+        // part of the identifier behind "ECHO:" (5 characters) and before the last "."
+        int start = identifier.indexOf("ECHO:");
+        if (start != -1)
+          start = start + 5;
+        else 
+          start = 0;
+        int end = identifier.lastIndexOf(".");
+        if (end == -1)
+          end = identifier.length();
+        echoIdTmp = identifier.substring(start, end);
+      }
+      String echoId = "/permanent/library/" + echoIdTmp;
+      if (echoIdTmp == null || echoIdTmp.isEmpty())
+        echoId = null;
+      if (echoDir != null && ! echoDir.isEmpty()) {
+        echoId = echoDir;  // an explicit echodir wins over the derived id
+      }
+      // getEchoMetadata can return null: fail with a clear message instead of a NullPointerException below
+      MetadataRecord echoMdRecord = getEchoMetadata(xQueryEvaluator, echoId, mdRecord);
+      if (echoMdRecord == null)
+        throw new ApplicationException("Echo metadata of document: " + mdRecord.getDocId() + " could not be retrieved. Please perform the operation again.");
+      mdRecord = echoMdRecord;
+    }
+    String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//*:pb)");
+    int pageCount = Integer.parseInt(pageCountStr);
+    mdRecord.setPageCount(pageCount);
+    mdRecord.setSchemaName("echo");
+    return mdRecord;
+  }
+
+  /**
+   * Extracts the metadata of a TEI document (element /TEI/teiHeader, any
+   * namespace) into the record: identifier (idno, special characters removed),
+   * creator (author), title, language (ident attribute of the first language,
+   * lower case; a five character code like "de-DE" is mapped through its first
+   * two characters), publisher (pubPlace), date (January 1st of the year in
+   * date), subject (keyword term), rights (availability, default "open
+   * access"), a fixed license and access rights (status of availability,
+   * default "free").
+   * The echo metadata is looked up with the identifier.
+   * Independent of the teiHeader element the page count (number of pb elements)
+   * and the schema name "TEI" are set.
+   *
+   * @param xQueryEvaluator evaluator used for the queries
+   * @param srcUrl          URL of the xml document
+   * @param mdRecord        the record to fill
+   * @return the filled metadata record
+   * @throws ApplicationException if a query fails or the echo metadata cannot be retrieved
+   */
+  private MetadataRecord getMetadataRecordTei(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
+    String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/*:TEI/*:teiHeader");
+    if (metadataXmlStr != null) {
+      String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:idno");
+      if (identifier != null) {
+        identifier = StringUtils.deresolveXmlEntities(identifier);
+        identifier = deleteSpecialChars(identifier);
+      }
+      String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:titleStmt/*:author");
+      if (creator != null)
+        creator = StringUtils.deresolveXmlEntities(creator);
+      String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:titleStmt/*:title");
+      if (title != null)
+        title = StringUtils.deresolveXmlEntities(title);
+      String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:profileDesc/*:langUsage/*:language[1]/@ident)");
+      if (language != null && language.isEmpty())
+        language = null;
+      if (language != null) {
+        language = language.toLowerCase();
+        if (language.length() == 5) {  // e.g. "de-DE or en-US"
+          if (language.substring(2, 3).equals("-")) {
+            String lang = language.substring(0, 2);
+            language = Language.getInstance().getISO639Code(lang);
+          }
+        }
+      }
+      String place = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:pubPlace");
+      if (place != null)
+        place = StringUtils.deresolveXmlEntities(place);
+      String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:date");
+      Date date = null; 
+      if (yearStr != null && ! yearStr.equals("")) {
+        yearStr = StringUtils.deresolveXmlEntities(yearStr);
+        yearStr = new Util().toYearStr(yearStr);  // test if possible etc
+        if (yearStr != null) {
+          try {
+            date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z");
+          } catch (Exception ignored) {
+            // deliberately ignored: a year that cannot be converted leaves the date unset
+          }
+        }
+      }
+      String subject = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:profileDesc/*:textClass/*:keywords/*:term)");
+      if (subject != null)
+        subject = StringUtils.deresolveXmlEntities(subject);
+      // NOTE(review): the defaults below only apply to null; if the evaluator yields an empty string for a missing value they are not used - confirm
+      String rights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:availability");
+      if (rights == null)
+        rights = "open access";
+      rights = StringUtils.deresolveXmlEntities(rights);
+      String license = "http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration";
+      String accessRights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:fileDesc/*:publicationStmt/*:availability/@status)");
+      if (accessRights == null) 
+        accessRights = "free";
+      accessRights = StringUtils.deresolveXmlEntities(accessRights);
+
+      mdRecord.setIdentifier(identifier);
+      mdRecord.setLanguage(language);
+      mdRecord.setCreator(creator);
+      mdRecord.setTitle(title);
+      mdRecord.setPublisher(place);
+      mdRecord.setRights(rights);
+      mdRecord.setDate(date);
+      mdRecord.setSubject(subject);
+      mdRecord.setLicense(license);
+      mdRecord.setAccessRights(accessRights);
+
+      // get echo metadata (identifier is echoDir)
+      // getEchoMetadata can return null: fail with a clear message instead of a NullPointerException below
+      MetadataRecord echoMdRecord = getEchoMetadata(xQueryEvaluator, identifier, mdRecord);
+      if (echoMdRecord == null)
+        throw new ApplicationException("Echo metadata of document: " + mdRecord.getDocId() + " could not be retrieved. Please perform the operation again.");
+      mdRecord = echoMdRecord;
+    }
+    String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//*:pb)");
+    int pageCount = Integer.parseInt(pageCountStr);
+    mdRecord.setPageCount(pageCount);
+    mdRecord.setSchemaName("TEI");
+    return mdRecord;
+  }
+
+  /**
+   * Extracts the metadata of an html document (element /html/head) into the
+   * record. The values are read from the content attribute of the meta
+   * elements named DC.identifier, DC.creator, DC.title, DC.language,
+   * DC.publisher, DC.date (January 1st of the year), DC.subject, DC.rights,
+   * DC.license and DC.accessRights. An empty language is stored as null.
+   * The echo metadata is looked up with the identifier.
+   * Independent of the head element the page count (number of pb elements) and
+   * the schema name "html" are set.
+   *
+   * @param xQueryEvaluator evaluator used for the queries
+   * @param srcUrl          URL of the document
+   * @param mdRecord        the record to fill
+   * @return the filled metadata record
+   * @throws ApplicationException if a query fails or the echo metadata cannot be retrieved
+   */
+  private MetadataRecord getMetadataRecordHtml(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
+    String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/html/head");
+    if (metadataXmlStr != null) {
+      String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.identifier']/@content)");
+      if (identifier != null && ! identifier.isEmpty())
+        identifier = StringUtils.deresolveXmlEntities(identifier);
+      String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.creator']/@content)");
+      if (creator != null && ! creator.isEmpty())
+        creator = StringUtils.deresolveXmlEntities(creator);
+      String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.title']/@content)");
+      if (title != null && ! title.isEmpty())
+        title = StringUtils.deresolveXmlEntities(title);
+      String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.language']/@content)");
+      if (language != null && language.isEmpty())
+        language = null;
+      if (language != null)  // here language is either null or not empty
+        language = StringUtils.deresolveXmlEntities(language);
+      String publisher = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.publisher']/@content)");
+      if (publisher != null)
+        publisher = StringUtils.deresolveXmlEntities(publisher);
+      String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.date']/@content)");
+      Date date = null; 
+      if (yearStr != null && ! yearStr.equals("")) {
+        yearStr = StringUtils.deresolveXmlEntities(yearStr);
+        yearStr = new Util().toYearStr(yearStr);  // test if possible etc
+        if (yearStr != null) {
+          try {
+            date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z");
+          } catch (Exception ignored) {
+            // deliberately ignored: a year that cannot be converted leaves the date unset
+          }
+        }
+      }
+      String subject = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.subject']/@content)");
+      if (subject != null)
+        subject = StringUtils.deresolveXmlEntities(subject);
+      String rights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.rights']/@content)");
+      if (rights != null && ! rights.isEmpty())
+        rights = StringUtils.deresolveXmlEntities(rights);
+      String license = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.license']/@content)");
+      if (license != null && ! license.isEmpty())
+        license = StringUtils.deresolveXmlEntities(license);
+      String accessRights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.accessRights']/@content)");
+      if (accessRights != null && ! accessRights.isEmpty())
+        accessRights = StringUtils.deresolveXmlEntities(accessRights);
+
+      mdRecord.setIdentifier(identifier);
+      mdRecord.setLanguage(language);
+      mdRecord.setCreator(creator);
+      mdRecord.setTitle(title);
+      mdRecord.setPublisher(publisher);
+      mdRecord.setRights(rights);
+      mdRecord.setDate(date);
+      mdRecord.setSubject(subject);
+      mdRecord.setLicense(license);
+      mdRecord.setAccessRights(accessRights);
+
+      // get echo metadata (identifier is echoDir)
+      // getEchoMetadata can return null: fail with a clear message instead of a NullPointerException below
+      MetadataRecord echoMdRecord = getEchoMetadata(xQueryEvaluator, identifier, mdRecord);
+      if (echoMdRecord == null)
+        throw new ApplicationException("Echo metadata of document: " + mdRecord.getDocId() + " could not be retrieved. Please perform the operation again.");
+      mdRecord = echoMdRecord;
+    }
+    String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//pb)");
+    int pageCount = Integer.parseInt(pageCountStr);
+    mdRecord.setPageCount(pageCount);
+    mdRecord.setSchemaName("html");
+    return mdRecord;
+  }
+
+  /**
+   * Adds the ECHO fields (echo id, page image directory, figures directory and
+   * MPIWG document id) to the given metadata record. The values are read from
+   * the index.meta file which is fetched for the echo directory.
+   * @param xQueryEvaluator evaluator used to query the fetched XML
+   * @param echoDir echo directory; if null or empty it is looked up by the docId of mdRecord
+   * @param mdRecord record to enrich
+   * @return the enriched record; the unchanged record if no echo directory could be
+   *         determined or the request reported a url error; null if the request
+   *         reported a timeout
+   * @throws ApplicationException
+   */
+  private MetadataRecord getEchoMetadata(XQueryEvaluator xQueryEvaluator, String echoDir, MetadataRecord mdRecord) throws ApplicationException {
+    String resolvedEchoDir = echoDir;
+    if (resolvedEchoDir == null || resolvedEchoDir.isEmpty()) {
+      resolvedEchoDir = getEchoDir(xQueryEvaluator, mdRecord.getDocId());
+      if (resolvedEchoDir == null)
+        return mdRecord;
+    }
+    String indexMetaUrl = "http://digilib.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter?fn=" + resolvedEchoDir + "/index.meta";
+    String indexMetaXml = performGetRequest(indexMetaUrl);
+    // marker strings are produced by performGetRequest
+    if ("XXXXTimeoutXXXX".equals(indexMetaXml))
+      return null;
+    if ("XXXXUrlErrorXXXX".equals(indexMetaXml))
+      return mdRecord;
+    String pageImageDir = null;
+    String figuresDir = null;
+    String mpiwgId = null;
+    if (indexMetaXml != null) {
+      String imageSubDir = xQueryEvaluator.evaluateAsStringValueJoined(indexMetaXml, "/resource/meta/texttool/image");
+      pageImageDir = resolvedEchoDir + "/" + (imageSubDir != null ? imageSubDir : "pageimg");  // "pageimg" is the default
+      String figuresSubDir = xQueryEvaluator.evaluateAsStringValueJoined(indexMetaXml, "/resource/meta/texttool/figures");
+      figuresDir = resolvedEchoDir + "/" + (figuresSubDir != null ? figuresSubDir : "figures");  // "figures" is the default
+      mpiwgId = xQueryEvaluator.evaluateAsStringValueJoined(indexMetaXml, "/resource/meta/dri[@type = 'mpiwg']");
+    }
+    mdRecord.setEchoId(resolvedEchoDir);
+    mdRecord.setEchoPageImageDir(pageImageDir);
+    mdRecord.setEchoFiguresDir(figuresDir);
+    mdRecord.setMpiwgDocId(mpiwgId);
+    return mdRecord;
+  }
+  
+  /**
+   * Looks up the echo directory of a document by querying the purl service
+   * with the docId and reading the archive path from the answer.
+   * @param xQueryEvaluator evaluator used to query the answer XML
+   * @param docId document id which is sent as text-url-path
+   * @return the archive path without "/mpiwg/online"; null if the request gave no
+   *         usable answer (no result, timeout, url error) or the remaining path is empty
+   * @throws ApplicationException
+   */
+  private String getEchoDir(XQueryEvaluator xQueryEvaluator, String docId) throws ApplicationException {
+    String purlQueryUrl = "http://md.mpiwg-berlin.mpg.de/purls/searchSolr?text-url-path=" + docId + "&format=short";
+    String purlAnswer = performGetRequest(purlQueryUrl);
+    if (purlAnswer == null || purlAnswer.equals("XXXXTimeoutXXXX") || purlAnswer.equals("XXXXUrlErrorXXXX"))
+      return null;
+    String archivePath = xQueryEvaluator.evaluateAsStringValueJoined(purlAnswer, "//archive-path");
+    if (archivePath == null)
+      return null;
+    String relativePath = archivePath.replaceAll("/mpiwg/online", "");
+    return relativePath.isEmpty() ? null : relativePath;
+  }
+  
+  /**
+   * Delivers the local name of an element child of the given node. All children
+   * are visited and every element child overwrites the previous result, so the
+   * name of the last element child is delivered.
+   * @param node node whose children are inspected
+   * @return local name of the last element child or null if there is no element child
+   */
+  private String getNodeType(XdmNode node) {
+    String nodeType = null;
+    XdmSequenceIterator children = node.axisIterator(Axis.CHILD);
+    if (children == null)
+      return null;
+    while (children.hasNext()) {
+      XdmNode child = (XdmNode) children.next();
+      // enum constants are compared directly, not by their ordinal
+      if (child != null && child.getNodeKind() == XdmNodeKind.ELEMENT) {
+        QName childName = child.getNodeName();
+        nodeType = childName.getLocalName();
+      }
+    }
+    return nodeType;
+  }
+  
+  /**
+   * Builds the full file name of a document: its document directory, a slash
+   * and its file name.
+   * @param docId document id
+   * @return full file name of the document
+   */
+  public String getDocFullFileName(String docId) {
+    return getDocDir(docId) + "/" + getDocFileName(docId);
+  }
+  
+  /**
+   * Builds the full name of a file which belongs to a document and lies in the
+   * document directory.
+   * <ul>
+   * <li>type "toc": docDir + "/toc.xml"</li>
+   * <li>type null: docDir + "/" + file name without extension + ".xml"</li>
+   * <li>other type: docDir + "/" + file name without extension + "." + type</li>
+   * </ul>
+   * A document file name without any dot is used unchanged as base name.
+   * @param docId document id
+   * @param type file type (used as extension), "toc" or null
+   * @return full file name
+   */
+  public String getFullFileName(String docId, String type) {
+    String docDir = getDocDir(docId);
+    if ("toc".equals(type))
+      return docDir + "/toc.xml";
+    String docFileName = getDocFileName(docId);
+    int lastDot = docFileName.lastIndexOf('.');
+    // without this test substring(0, -1) would throw an exception for file names without a dot
+    String baseName = (lastDot == -1) ? docFileName : docFileName.substring(0, lastDot);
+    String extension = (type != null) ? type : "xml";
+    return docDir + "/" + baseName + "." + extension;
+  }
+  
+  /**
+   * Builds the directory of a document: the documents directory of the
+   * constants followed by the docId cut off at its last dot (with a leading
+   * slash if the docId has none).
+   * @param docId document id
+   * @return directory of the document
+   */
+  public String getDocDir(String docId) {
+    String documentsDirectory = Constants.getInstance().getDocumentsDir();
+    int lastDot = docId.lastIndexOf(".");
+    String subDir = (lastDot != -1) ? docId.substring(0, lastDot) : docId;
+    if (! subDir.startsWith("/"))
+      subDir = "/" + subDir;
+    return documentsDirectory + subDir;
+  }
+
+  /**
+   * Delivers the part of the docId behind its last slash.
+   * @param docId document id
+   * @return text behind the last slash or the whole docId if it contains no slash
+   */
+  public String getDocFileName(String docId) {
+    int lastSlash = docId.lastIndexOf("/");
+    return (lastSlash == -1) ? docId : docId.substring(lastSlash + 1);
+  } 
+  
+  /**
+   * Delivers the path segment which stands directly before the last slash of
+   * the docId, e.g. "la" for "/echo/la/Cataneo_1600.xml".
+   * @param docId document id
+   * @return the segment between the last two slashes or null if the docId
+   *         contains less than two slashes
+   */
+  private String getMainLanguage(String docId) {
+    int fileNameSlash = docId.lastIndexOf("/");
+    if (fileNameSlash == -1)
+      return null;
+    String parentPath = docId.substring(0, fileNameSlash);
+    int parentSlash = parentPath.lastIndexOf("/");
+    if (parentSlash == -1)
+      return null;
+    // parentPath ends at the last slash of docId, so the rest of it is the segment
+    return parentPath.substring(parentSlash + 1);
+  } 
+  
+  /**
+   * Removes all '@', ' ' and ';' characters from the given string.
+   * @param inputStr string to clean
+   * @return the string without these characters
+   */
+  private String deleteSpecialChars(String inputStr) {
+    StringBuilder cleaned = new StringBuilder(inputStr.length());
+    for (char ch : inputStr.toCharArray()) {
+      boolean isSpecial = ch == '@' || ch == ' ' || ch == ';';
+      if (! isSpecial)
+        cleaned.append(ch);
+    }
+    return cleaned.toString();
+  }
+
+  /**
+   * Reads the given file as utf-8 text, parses it with a SAX parser and
+   * delivers the result pages which the GetFragmentsContentHandler has
+   * collected for the given milestone element name.
+   * @param fileName name of the XML file
+   * @param milestoneElementName element name which is passed to the content handler
+   * @return result pages of the content handler
+   * @throws ApplicationException if reading or parsing fails
+   */
+  private Hashtable<Integer, StringBuilder> getFragments(String fileName, String milestoneElementName) throws ApplicationException {
+    try {
+      GetFragmentsContentHandler fragmentsHandler = new GetFragmentsContentHandler(milestoneElementName);
+      XMLReader reader = new SAXParser();
+      reader.setContentHandler(fragmentsHandler);
+      String fileContent = FileUtils.readFileToString(new File(fileName), "utf-8");
+      reader.parse(new InputSource(new StringReader(fileContent)));
+      return fragmentsHandler.getResultPages();
+    } catch (SAXException e) {
+      throw new ApplicationException(e);
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  /**
+   * Tokenizes the given XML string with the XmlTokenizer in the given language
+   * and with the output option "withLemmas".
+   * @param xmlStr XML string to tokenize
+   * @param language language which is set on the tokenizer
+   * @return XML result of the tokenizer
+   * @throws ApplicationException
+   */
+  private String tokenizeWithLemmas(String xmlStr, String language) throws ApplicationException {
+    XmlTokenizer tokenizer = new XmlTokenizer(new StringReader(xmlStr));
+    tokenizer.setLanguage(language);
+    // non word breaking elements
+    // TODO examine bugs with emph, figure, hi :
+    // e.g. "... der <hi rend="i">Capi-<lb n="16"/>talist.</hi> Es ..."
+    // e.g. page 30 in /echo/la/Cataneo_1600.xml
+    tokenizer.setNWBElements(new String[] {"lb", "br", "cb"});
+    // all tokens are fetched with lemmas (costs performance)
+    tokenizer.setOutputOptions(new String[] {"withLemmas"});
+    tokenizer.tokenize();
+    return tokenizer.getXmlResult();
+  }
+  
+  /**
+   * Parses the given XML string with a SAX parser and delivers the result of
+   * the WordContentHandler.
+   * @param xmlStr XML string to parse
+   * @return result of the content handler
+   * @throws ApplicationException if parsing fails
+   */
+  private String enrichWordsOrigRegNorm(String xmlStr) throws ApplicationException {
+    try {
+      WordContentHandler wordHandler = new WordContentHandler();
+      XMLReader reader = new SAXParser();
+      reader.setContentHandler(wordHandler);
+      reader.parse(new InputSource(new StringReader(xmlStr)));
+      return wordHandler.getResult();
+    } catch (SAXException e) {
+      throw new ApplicationException(e);
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+  }
+  
+  /**
+   * Performs a HTTP GET request and delivers the response body as utf-8 string.
+   * Before the request is sent the url is tested with checkUri (2 seconds).
+   * The connection of the request is released in every case.
+   * @param url url to fetch
+   * @return the response body; "XXXXTimeoutXXXX" if checkUri fails;
+   *         "XXXXUrlErrorXXXX" if the status code is 400 or higher;
+   *         null if the response has no body
+   * @throws ApplicationException if the request fails
+   */
+  private String performGetRequest(String url) throws ApplicationException {
+    boolean urlIsOk = checkUri(url, 2000); // if url doesn't answer after 2 seconds
+    if (! urlIsOk)
+      return "XXXXTimeoutXXXX";
+    GetMethod method = new GetMethod(url);
+    try {
+      HttpClient httpClient = new HttpClient();
+      httpClient.executeMethod(method);
+      if (method.getStatusCode() >= 400)
+        return "XXXXUrlErrorXXXX";
+      byte[] resultBytes = method.getResponseBody();
+      if (resultBytes == null)
+        return null;  // no body: callers treat null as "no result"
+      return new String(resultBytes, "utf-8");
+    } catch (HttpException e) {
+      throw new ApplicationException(e);
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    } finally {
+      // release also on the early returns and on exceptions
+      method.releaseConnection();
+    }
+  } 
+
+  /**
+   * Tests if the given uri can be fetched: a GET request is executed with the
+   * given timeout used both as connection timeout and as socket timeout
+   * (SO_TIMEOUT, the timeout for waiting for data).
+   * @param uriStr uri to test
+   * @param timeoutMilliseconds timeout in milliseconds
+   * @return true if the request was executed; false if an IOException was
+   *         thrown (e.g. a timeout)
+   * @throws ApplicationException if uriStr is not a valid uri
+   */
+  private boolean checkUri(String uriStr, int timeoutMilliseconds) throws ApplicationException {
+    URI uri;
+    try {
+      uri = new URI(uriStr);
+    } catch (URISyntaxException e) {
+      throw new ApplicationException(e);
+    }
+    HttpParams httpParameters = new BasicHttpParams();
+    HttpConnectionParams.setConnectionTimeout(httpParameters, timeoutMilliseconds);
+    HttpConnectionParams.setSoTimeout(httpParameters, timeoutMilliseconds);
+    DefaultHttpClient httpClient = new DefaultHttpClient(httpParameters);
+    try {
+      httpClient.execute(new HttpGet(uri));
+      return true;
+    } catch (IOException e) {
+      return false;  // if timeout exception is thrown
+    } finally {
+      // the response is not read, so the connection is closed with the client
+      httpClient.getConnectionManager().shutdown();
+    }
+  }
+
+  /**
+   * Write string as utf-8 into destFile. If directory for that destFile does not 
+   * exist it creates this directory including parent directories. A destFileName 
+   * without directory part is written without creating any directory. 
+   * @param str string to write; if null nothing is done
+   * @param destFileName destination file name
+   * @throws ApplicationException if the file could not be opened or written
+   */
+  private void saveFile(String str, String destFileName) throws ApplicationException {
+    if (str == null)
+      return;  // do nothing
+    File destFile = new File(destFileName);
+    File destDir = destFile.getParentFile();  // null if destFileName has no directory part
+    if (destDir != null && ! destDir.exists()) {
+      destDir.mkdirs();  // create the directory including parent directories which do not exist
+    }
+    OutputStreamWriter out = null;
+    try {
+      out = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(destFile)), "utf-8");
+      out.write(str);
+      out.flush();
+    } catch (IOException e) {
+      // includes FileNotFoundException of the FileOutputStream
+      throw new ApplicationException(e);
+    } finally {
+      try { 
+        if (out != null)
+          out.close(); 
+      } catch (Exception ignored) { 
+        // nothing: always close the stream at the end of the method
+      }  
+    }
+  }
+
+  /**
+   * Stores the current time (milliseconds) as begin of the operation.
+   */
+  private void beginOperation() {
+    beginOfOperation = System.currentTimeMillis();
+  }
+
+  /**
+   * Stores the current time (milliseconds) as end of the operation.
+   */
+  private void endOperation() {
+    endOfOperation = System.currentTimeMillis();
+  }
+
+}
\ No newline at end of file
author	Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date	Tue, 21 May 2013 10:19:32 +0200
parents
children