Mercurial > hg > mpdl-group
diff software/mpdl-services-new/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/document/DocumentHandler.java @ 25:e9fe3186670c default tip
letzter Stand eingecheckt
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 21 May 2013 10:19:32 +0200 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services-new/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/document/DocumentHandler.java Tue May 21 10:19:32 2013 +0200 @@ -0,0 +1,927 @@ +package de.mpg.mpiwg.berlin.mpdl.cms.document; + +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.StringReader; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.Arrays; +import java.util.Date; +import java.util.Hashtable; +import java.util.Iterator; +import java.util.List; +import java.util.logging.Logger; + +import net.sf.saxon.s9api.Axis; +import net.sf.saxon.s9api.QName; +import net.sf.saxon.s9api.XdmNode; +import net.sf.saxon.s9api.XdmNodeKind; +import net.sf.saxon.s9api.XdmSequenceIterator; + +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.methods.GetMethod; +import org.apache.commons.io.FileUtils; +import org.apache.http.HttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.DefaultHttpClient; +import org.apache.http.params.BasicHttpParams; +import org.apache.http.params.HttpConnectionParams; +import org.apache.http.params.HttpParams; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler; +import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer; +import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator; +import de.mpg.mpiwg.berlin.mpdl.cms.general.Constants; +import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler; +import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsDocOperation; +import de.mpg.mpiwg.berlin.mpdl.cms.transform.GetFragmentsContentHandler; +import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer; +import de.mpg.mpiwg.berlin.mpdl.cms.transform.XslResourceTransformer; + +/** + * Handler for documents (singleton). + */ +public class DocumentHandler { + private static Logger LOGGER = Logger.getLogger(DocumentHandler.class.getName()); + private static List<String> EXCLUDED_PROJECT_DOCS = + Arrays.asList("/echo/zh/Min_chan_luyi_1_7MCGW0WG.xml", // the Saxon transfomer has heavy problems with some characters in CJK Unified Ideographs Extension B, e.g.: line 309 (second reg on page 16) + "/echo/zh/Min_chan_luyi_2_U7Y9NQ9V.xml", + "/echo/zh/Min_chan_luyi_3_2FP9M172.xml", + "/echo/zh/Min_chan_luyi_4_FXA6FSFH.xml", + "/echo/zh/Min_chan_luyi_5_VG6NY5XD.xml", + "/echo/zh/Xifa_shenji.xml", + "/echo/zh/Yulei_tushuo_1_HXX4MGZW.xml", + "/echo/zh/Yulei_tushuo_2_FN1CTY5C.xml"); + private long beginOfOperation; + private long endOfOperation; + + public void doOperation(CmsDocOperation docOperation) throws ApplicationException { + String operationName = docOperation.getName(); + if (operationName.equals("create")) { + create(docOperation); + } else if (operationName.equals("delete")) { + delete(docOperation); + } else if (operationName.equals("importDirectory")) { + importDirectory(docOperation); + } else if (operationName.equals("createPdf")) { + createPdf(docOperation); + } else if (operationName.equals("createAllPdfInDirectory")) { + createAllPdfInDirectory(docOperation); + } + } + + private void importDirectory(CmsDocOperation docOperation) throws ApplicationException { + try { + LOGGER.info("Start of DocumentHandler. This operation could be time consuming because documents are indexed (normal indexing times are 1-10 minutes for a document)"); + beginOperation(); + String localDocumentsUrlStr = docOperation.getSrcUrl(); // start directory: file:/a/local/directory + String collectionNames = docOperation.getCollectionNames(); // e.g. "echo" + File localDocumentsDir = new File(new URI(localDocumentsUrlStr)); + boolean docDirExists = localDocumentsDir.exists(); + if (! docDirExists) + throw new ApplicationException("Document directory:" + localDocumentsUrlStr + " does not exists. Please use a directory that exists and perform the operation again."); + String[] fileExtensions = {"xml"}; + Iterator<File> iterFiles = FileUtils.iterateFiles(localDocumentsDir, fileExtensions, true); + int i = 0; + while(iterFiles.hasNext()) { + i++; + File xmlFile = iterFiles.next(); + String xmlFileStr = xmlFile.getPath(); + int relativePos = (int) localDocumentsDir.getPath().length(); + String docId = xmlFileStr.substring(relativePos); // relative path name starting from localDocumentsDir, e.g. /tei/de/Test_1789.xml + String xmlFileUrlStr = xmlFile.toURI().toURL().toString(); + CmsDocOperation createDocOperation = new CmsDocOperation("create", xmlFileUrlStr, null, docId); + createDocOperation.setCollectionNames(collectionNames); + try { + doOperation(createDocOperation); + Date now = new Date(); + LOGGER.info("Document " + i + ": " + docId + " successfully imported (" + now.toString() + ")"); + } catch (Exception e) { + LOGGER.info("Document " + i + ": " + docId + " has problems:"); + e.printStackTrace(); + } + } + endOperation(); + LOGGER.info("The DocumentHandler needed: " + (endOfOperation - beginOfOperation) + " ms" ); + } catch (Exception e) { + throw new ApplicationException(e); + } + } + + private void createAllPdfInDirectory(CmsDocOperation docOperation) throws ApplicationException { + try { + LOGGER.info("Start of generating Pdf-Documents. This operation could be time consuming because Pdf generation needs some time."); + beginOperation(); + String localDocumentsUrlStr = docOperation.getSrcUrl(); // start directory: file:/a/local/directory + String collectionNames = docOperation.getCollectionNames(); // e.g. "echo" + File localDocumentsDir = new File(new URI(localDocumentsUrlStr)); + boolean docDirExists = localDocumentsDir.exists(); + if (! docDirExists) + throw new ApplicationException("Document directory:" + localDocumentsUrlStr + " does not exists. Please use a directory that exists and perform the operation again."); + String[] fileExtensions = {"xml"}; + Iterator<File> iterFiles = FileUtils.iterateFiles(localDocumentsDir, fileExtensions, true); + int i = 0; + while(iterFiles.hasNext()) { + i++; + File xmlFile = iterFiles.next(); + String xmlFileStr = xmlFile.getPath(); + int relativePos = (int) localDocumentsDir.getPath().length(); + String docId = xmlFileStr.substring(relativePos); // relative path name starting from localDocumentsDir, e.g. /tei/de/Test_1789.xml + CmsDocOperation createPdfOperation = new CmsDocOperation("createPdf", null, null, docId); + createPdfOperation.setCollectionNames(collectionNames); + try { + doOperation(createPdfOperation); + Date now = new Date(); + LOGGER.info("Pdf document " + i + ": " + docId + " successfully created (" + now.toString() + ")"); + } catch (Exception e) { + LOGGER.info("Pdf document " + i + ": " + docId + " has problems:"); + e.printStackTrace(); + } + } + endOperation(); + LOGGER.info("The Pdf generation needed: " + (endOfOperation - beginOfOperation) + " ms" ); + } catch (Exception e) { + throw new ApplicationException(e); + } + } + + private boolean isProjectDoc(String docId) { + boolean isProjectDoc = true; + if (EXCLUDED_PROJECT_DOCS.contains(docId)) + return false; + return isProjectDoc; + } + + private void create(CmsDocOperation docOperation) throws ApplicationException { + try { + String operationName = docOperation.getName(); + String srcUrlStr = docOperation.getSrcUrl(); + String docId = docOperation.getDocIdentifier(); + if (! isProjectDoc(docId)) { + LOGGER.info("Operation: " + operationName + " not performed on: " + docId + ". Cause: document is excluded as project doc"); + return; + } + String mainLanguage = docOperation.getMainLanguage(); + if (mainLanguage == null) { + mainLanguage = getMainLanguage(docId); + } + String[] elementNames = docOperation.getElementNames(); + if (elementNames == null) { + String[] defaultElementNames = {"s", "head", "caption", "variables", "description"}; + docOperation.setElementNames(defaultElementNames); // default + } + String docDirName = getDocDir(docId); + String docDestFileName = getDocFullFileName(docId); + URL srcUrl = null; + String protocol = null; + if (srcUrlStr != null && ! srcUrlStr.equals("empty")) { + srcUrl = new URL(srcUrlStr); + protocol = srcUrl.getProtocol(); + } + File docDestFile = new File(docDestFileName); + // parse validation on file + XQueryEvaluator xQueryEvaluator = new XQueryEvaluator(); + XdmNode docNode = xQueryEvaluator.parse(srcUrl); // if it is not parseable an exception with a detail message is thrown + String docType = getNodeType(docNode); // archimedes, echo, TEI, html ... + docType = docType.trim(); + if (docType == null) { + docOperation.setErrorMessage("file type of: " + srcUrlStr + "is not supported"); + return; + } + // perform operation on file system + if (protocol.equals("file")) { + docOperation.setStatus("upload file: " + srcUrlStr + " to CMS"); + } else { + docOperation.setStatus("download file from: " + srcUrlStr + " to CMS"); + } + FileUtils.copyURLToFile(srcUrl, docDestFile, 100000, 100000); + + // replace anchor in echo documents and also add the number attribute to figures + String docDestFileNameUpgrade = docDestFileName + ".upgrade"; + File docDestFileUpgrade = new File(docDestFileNameUpgrade); + XslResourceTransformer replaceAnchorTransformer = new XslResourceTransformer("replaceAnchor.xsl"); + String docDestFileUrlStr = docDestFile.getPath(); + String result = replaceAnchorTransformer.transform(docDestFileUrlStr); + FileUtils.writeStringToFile(docDestFileUpgrade, result, "utf-8"); + + MetadataRecord mdRecord = new MetadataRecord(); + mdRecord.setDocId(docId); + mdRecord.setCollectionNames(docOperation.getCollectionNames()); + mdRecord.setType("text/xml"); + + // generate toc file (toc, figure, handwritten) + XslResourceTransformer tocTransformer = new XslResourceTransformer("toc.xsl"); + File tocFile = new File(docDirName + "/toc.xml"); + String tocResult = tocTransformer.transform(docDestFileNameUpgrade); + FileUtils.writeStringToFile(tocFile, tocResult, "utf-8"); + + // Get metadata info of the xml document + docOperation.setStatus("extract metadata of: " + srcUrlStr + " to CMS"); + XQueryEvaluator xQueryEvaluator2 = new XQueryEvaluator(); + mdRecord = getMetadataRecord(docDestFileUpgrade, docType, mdRecord, xQueryEvaluator2); + String mdRecordLanguage = mdRecord.getLanguage(); + if (mdRecordLanguage == null && mainLanguage != null) + mdRecord.setLanguage(mainLanguage); + + // save all pages as single xml files (untokenized and tokenized) + docOperation.setStatus("extract page fragments of: " + srcUrlStr + " to CMS"); + File docDir = new File(docDirName + "/pages"); + FileUtils.deleteQuietly(docDir); // first delete pages directory + Hashtable<Integer, StringBuilder> pageFragments = getFragments(docDestFileNameUpgrade, "pb"); + int pageCount = pageFragments.size(); + if (pageCount == 0) { + // no pb element is found: then the whole document is the first page + String docXmlStr = FileUtils.readFileToString(docDestFileUpgrade, "utf-8"); + docXmlStr = docXmlStr.replaceAll("<\\?xml.*?\\?>", ""); // remove the xml declaration if it exists + pageFragments = new Hashtable<Integer, StringBuilder>(); + pageFragments.put(new Integer(1), new StringBuilder(docXmlStr)); + pageCount = 1; + } + PageTransformer pageTransformer = new PageTransformer(); + for (int page=1; page<=pageCount; page++) { + String fragment = pageFragments.get(new Integer(page)).toString(); + fragment = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + fragment; + String docPageFileName = docDirName + "/pages/page-" + page + ".xml"; + File docPageFile = new File(docPageFileName); + FileUtils.writeStringToFile(docPageFile, fragment, "utf-8"); + String language = mdRecord.getLanguage(); + String tokenizedXmlStr = tokenizeWithLemmas(fragment, language); // xml fragment enriched with <w> elements + tokenizedXmlStr = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + tokenizedXmlStr; + tokenizedXmlStr = enrichWordsOrigRegNorm(tokenizedXmlStr); // xml string: enrich <w> elements with normalization info (orig, reg, norm) + String docPageTokenizedFileName = docDirName + "/pages/page-" + page + "-morph.xml"; + File docPageTokenizedFile = new File(docPageTokenizedFileName); + FileUtils.writeStringToFile(docPageTokenizedFile, tokenizedXmlStr, "utf-8"); + String docPageHtmlFileName = docDirName + "/pages/page-" + page + ".html"; + File docPageHtmlFile = new File(docPageHtmlFileName); + String htmlStr = pageTransformer.transform(tokenizedXmlStr, mdRecord, page, "html"); + FileUtils.writeStringToFile(docPageHtmlFile, htmlStr, "utf-8"); + } + + // perform operation on Lucene + docOperation.setStatus(operationName + " document: " + docId + " in CMS"); + docOperation.setMdRecord(mdRecord); + IndexHandler indexHandler = IndexHandler.getInstance(); + indexHandler.indexDocument(docOperation); + + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void delete(CmsDocOperation docOperation) throws ApplicationException { + String operationName = docOperation.getName(); + String docIdentifier = docOperation.getDocIdentifier(); + if (docIdentifier == null || docIdentifier.trim().equals("")) + throw new ApplicationException("Your document identifier is empty. Please specify a document identifier for your document."); + String docDirStr = getDocDir(docIdentifier); + File docDir = new File(docDirStr); + boolean docExists = docDir.exists(); + if (! docExists) { + throw new ApplicationException("Document:" + docIdentifier + " does not exists. Please use a name that exists and perform the operation \"Delete\" again."); + } + // perform operation on file system + docOperation.setStatus(operationName + " document: " + docIdentifier + " in CMS"); + FileUtils.deleteQuietly(docDir); + + // perform operation on Lucene + IndexHandler indexHandler = IndexHandler.getInstance(); + indexHandler.deleteDocument(docOperation); + + } + + private void createPdf(CmsDocOperation docOperation) throws ApplicationException { + String docId = docOperation.getDocIdentifier(); + String operationName = docOperation.getName(); + if (docId == null || docId.trim().equals("")) + throw new ApplicationException("Your document identifier is empty. Please specify a document identifier for your document."); + if (! isProjectDoc(docId)) { + LOGGER.info("Operation: " + operationName + " not performed on: " + docId + ". Cause: document is excluded as project doc"); + return; + } + IndexHandler indexHandler = IndexHandler.getInstance(); + MetadataRecord mdRecord = indexHandler.getDocMetadata(docId); + docOperation.setStatus("create PDF and HTML versions of the document: " + docId); + PdfHandler pdfHandler = PdfHandler.getInstance(); + pdfHandler.createFile(true, true, mdRecord); // generate Pdf + Html document + } + + private MetadataRecord getMetadataRecord(File xmlFile, String schemaName, MetadataRecord mdRecord, XQueryEvaluator xQueryEvaluator) throws ApplicationException { + if (schemaName == null) + return mdRecord; + try { + URL srcUrl = xmlFile.toURI().toURL(); + if (schemaName.equals("archimedes")) + mdRecord = getMetadataRecordArch(xQueryEvaluator, srcUrl, mdRecord); + else if (schemaName.equals("echo")) + mdRecord = getMetadataRecordEcho(xQueryEvaluator, srcUrl, mdRecord); + else if (schemaName.equals("TEI")) + mdRecord = getMetadataRecordTei(xQueryEvaluator, srcUrl, mdRecord); + else if (schemaName.equals("html")) + mdRecord = getMetadataRecordHtml(xQueryEvaluator, srcUrl, mdRecord); + else + mdRecord.setSchemaName("diverse"); // all other cases: set docType to schemaName + } catch (MalformedURLException e) { + throw new ApplicationException(e); + } + mdRecord.setLastModified(new Date()); + return mdRecord; + } + + private MetadataRecord getMetadataRecordArch(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException { + String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/archimedes//info"); + if (metadataXmlStr != null) { + String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/locator"); + if (identifier != null) + identifier = StringUtils.deresolveXmlEntities(identifier); + String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/author"); + if (creator != null) + creator = StringUtils.deresolveXmlEntities(creator); + String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/title"); + if (title != null) + title = StringUtils.deresolveXmlEntities(title); + String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/lang[1]"); + if (language != null) + language = StringUtils.deresolveXmlEntities(language); + String place = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/place"); + if (place != null) + place = StringUtils.deresolveXmlEntities(place); + String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/date"); + Date date = null; + if (yearStr != null && ! yearStr.equals("")) { + yearStr = StringUtils.deresolveXmlEntities(yearStr); + yearStr = new Util().toYearStr(yearStr); // test if possible etc + if (yearStr != null) { + try { + date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z"); + } catch (Exception e) { + // nothing + } + } + } + String rights = "open access"; + String license = "http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration"; + String accessRights = "free"; + + mdRecord.setIdentifier(identifier); + mdRecord.setLanguage(language); + mdRecord.setCreator(creator); + mdRecord.setTitle(title); + mdRecord.setPublisher(place); + mdRecord.setRights(rights); + mdRecord.setDate(date); + mdRecord.setLicense(license); + mdRecord.setAccessRights(accessRights); + + // get echo metadata + String echoDir = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/echodir"); + String docId = mdRecord.getDocId(); + String echoIdTmp = docId; + if (docId != null && ! docId.isEmpty()) { + int start = docId.lastIndexOf("/"); + if (start != -1) + start = start + 1; + else + start = 0; + int end = docId.lastIndexOf("."); + if (end == -1) + end = docId.length(); + echoIdTmp = docId.substring(start, end); + } + String echoId = "/permanent/archimedes/" + echoIdTmp; + if (echoIdTmp == null || echoIdTmp.isEmpty()) + echoId = null; + if (echoDir != null && ! echoDir.isEmpty()) { + echoId = echoDir; + } + mdRecord = getEchoMetadata(xQueryEvaluator, echoId, mdRecord); + } + String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//pb)"); + int pageCount = Integer.valueOf(pageCountStr); + mdRecord.setPageCount(pageCount); + mdRecord.setSchemaName("archimedes"); + return mdRecord; + } + + private MetadataRecord getMetadataRecordEcho(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException { + String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/*:echo/*:metadata"); + if (metadataXmlStr != null) { + String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:identifier"); + if (identifier != null) { + identifier = StringUtils.deresolveXmlEntities(identifier); + } + String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:creator"); + if (creator != null) + creator = StringUtils.deresolveXmlEntities(creator); + String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:title"); + if (title != null) + title = StringUtils.deresolveXmlEntities(title); + String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:language[1]"); + if (language != null) + language = StringUtils.deresolveXmlEntities(language); + String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:date"); + Date date = null; + if (yearStr != null && ! yearStr.equals("")) { + yearStr = StringUtils.deresolveXmlEntities(yearStr); + yearStr = new Util().toYearStr(yearStr); // test if possible etc + if (yearStr != null) { + try { + date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z"); + } catch (Exception e) { + // nothing + } + } + } + String rights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:rights"); + if (rights != null) + rights = StringUtils.deresolveXmlEntities(rights); + String license = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:license"); + if (license != null) + license = StringUtils.deresolveXmlEntities(license); + String accessRights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:accessRights"); + if (accessRights != null) + accessRights = StringUtils.deresolveXmlEntities(accessRights); + + mdRecord.setIdentifier(identifier); + mdRecord.setLanguage(language); + mdRecord.setCreator(creator); + mdRecord.setTitle(title); + mdRecord.setRights(rights); + mdRecord.setDate(date); + mdRecord.setLicense(license); + mdRecord.setAccessRights(accessRights); + + // get echo metadata + String echoDir = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:echodir"); + String echoIdTmp = identifier; + if (identifier != null && ! identifier.isEmpty()) { + int start = identifier.indexOf("ECHO:"); + if (start != -1) + start = start + 5; + else + start = 0; + int end = identifier.lastIndexOf("."); + if (end == -1) + end = identifier.length(); + echoIdTmp = identifier.substring(start, end); + } + String echoId = "/permanent/library/" + echoIdTmp; + if (echoIdTmp == null || echoIdTmp.isEmpty()) + echoId = null; + if (echoDir != null && ! echoDir.isEmpty()) { + echoId = echoDir; + } + mdRecord = getEchoMetadata(xQueryEvaluator, echoId, mdRecord); + } + String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//*:pb)"); + int pageCount = Integer.valueOf(pageCountStr); + mdRecord.setPageCount(pageCount); + mdRecord.setSchemaName("echo"); + return mdRecord; + } + + private MetadataRecord getMetadataRecordTei(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException { + String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/*:TEI/*:teiHeader"); + if (metadataXmlStr != null) { + String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:idno"); + if (identifier != null) { + identifier = StringUtils.deresolveXmlEntities(identifier); + identifier = deleteSpecialChars(identifier); + } + String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:titleStmt/*:author"); + if (creator != null) + creator = StringUtils.deresolveXmlEntities(creator); + String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:titleStmt/*:title"); + if (title != null) + title = StringUtils.deresolveXmlEntities(title); + String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:profileDesc/*:langUsage/*:language[1]/@ident)"); + if (language != null && language.isEmpty()) + language = null; + if (language != null) { + language = language.toLowerCase(); + if (language.length() == 5) { // e.g. "de-DE or en-US" + if (language.substring(2, 3).equals("-")) { + String lang = language.substring(0, 2); + language = Language.getInstance().getISO639Code(lang); + } + } + } + String place = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:pubPlace"); + if (place != null) + place = StringUtils.deresolveXmlEntities(place); + String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:date"); + Date date = null; + if (yearStr != null && ! yearStr.equals("")) { + yearStr = StringUtils.deresolveXmlEntities(yearStr); + yearStr = new Util().toYearStr(yearStr); // test if possible etc + if (yearStr != null) { + try { + date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z"); + } catch (Exception e) { + // nothing + } + } + } + String subject = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:profileDesc/*:textClass/*:keywords/*:term)"); + if (subject != null) + subject = StringUtils.deresolveXmlEntities(subject); + String rights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:availability"); + if (rights == null) + rights = "open access"; + rights = StringUtils.deresolveXmlEntities(rights); + String license = "http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration"; + String accessRights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:fileDesc/*:publicationStmt/*:availability/@status)"); + if (accessRights == null) + accessRights = "free"; + accessRights = StringUtils.deresolveXmlEntities(accessRights); + + mdRecord.setIdentifier(identifier); + mdRecord.setLanguage(language); + mdRecord.setCreator(creator); + mdRecord.setTitle(title); + mdRecord.setPublisher(place); + mdRecord.setRights(rights); + mdRecord.setDate(date); + mdRecord.setSubject(subject); + mdRecord.setLicense(license); + mdRecord.setAccessRights(accessRights); + + // get echo metadata + mdRecord = getEchoMetadata(xQueryEvaluator, identifier, mdRecord); // identifier is echoDir + } + String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//*:pb)"); + int pageCount = Integer.valueOf(pageCountStr); + mdRecord.setPageCount(pageCount); + mdRecord.setSchemaName("TEI"); + return mdRecord; + } + + private MetadataRecord getMetadataRecordHtml(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException { + String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/html/head"); + if (metadataXmlStr != null) { + String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.identifier']/@content)"); + if (identifier != null && ! identifier.isEmpty()) + identifier = StringUtils.deresolveXmlEntities(identifier); + String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.creator']/@content)"); + if (creator != null && ! creator.isEmpty()) + creator = StringUtils.deresolveXmlEntities(creator); + String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.title']/@content)"); + if (title != null && ! title.isEmpty()) + title = StringUtils.deresolveXmlEntities(title); + String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.language']/@content)"); + if (language != null && language.isEmpty()) + language = null; + if (language != null && ! language.isEmpty()) + language = StringUtils.deresolveXmlEntities(language); + String publisher = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.publisher']/@content)"); + if (publisher != null) + publisher = StringUtils.deresolveXmlEntities(publisher); + String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.date']/@content)"); + Date date = null; + if (yearStr != null && ! yearStr.equals("")) { + yearStr = StringUtils.deresolveXmlEntities(yearStr); + yearStr = new Util().toYearStr(yearStr); // test if possible etc + if (yearStr != null) { + try { + date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z"); + } catch (Exception e) { + // nothing + } + } + } + String subject = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.subject']/@content)"); + if (subject != null) + subject = StringUtils.deresolveXmlEntities(subject); + String rights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.rights']/@content)"); + if (rights != null && ! rights.isEmpty()) + rights = StringUtils.deresolveXmlEntities(rights); + String license = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.license']/@content)"); + if (license != null && ! license.isEmpty()) + license = StringUtils.deresolveXmlEntities(license); + String accessRights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.accessRights']/@content)"); + if (accessRights != null && ! accessRights.isEmpty()) + accessRights = StringUtils.deresolveXmlEntities(accessRights); + + mdRecord.setIdentifier(identifier); + mdRecord.setLanguage(language); + mdRecord.setCreator(creator); + mdRecord.setTitle(title); + mdRecord.setPublisher(publisher); + mdRecord.setRights(rights); + mdRecord.setDate(date); + mdRecord.setSubject(subject); + mdRecord.setLicense(license); + mdRecord.setAccessRights(accessRights); + + // get echo metadata + mdRecord = getEchoMetadata(xQueryEvaluator, identifier, mdRecord); // identifier is echoDir + } + String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//pb)"); + int pageCount = Integer.valueOf(pageCountStr); + mdRecord.setPageCount(pageCount); + mdRecord.setSchemaName("html"); + return mdRecord; + } + + private MetadataRecord getEchoMetadata(XQueryEvaluator xQueryEvaluator, String echoDir, MetadataRecord mdRecord) throws ApplicationException { + if (echoDir == null || echoDir.isEmpty()) { + String docId = mdRecord.getDocId(); + echoDir = getEchoDir(xQueryEvaluator, docId); + if (echoDir == null) + return mdRecord; + } + String urLTexter = "http://digilib.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter?fn=" + echoDir + "/index.meta"; + String echoIndexMetaStr = performGetRequest(urLTexter); + String echoPageImageDir = null; + String echoFiguresDir = null; + String mpiwgDocId = null; + if (echoIndexMetaStr != null) { + if (echoIndexMetaStr.equals("XXXXTimeoutXXXX")) + return null; + else if (echoIndexMetaStr.equals("XXXXUrlErrorXXXX")) + return mdRecord; + echoPageImageDir = xQueryEvaluator.evaluateAsStringValueJoined(echoIndexMetaStr, "/resource/meta/texttool/image"); + if (echoPageImageDir != null) + echoPageImageDir = echoDir + "/" + echoPageImageDir; + else + echoPageImageDir = echoDir + "/" + "pageimg"; // default + echoFiguresDir = xQueryEvaluator.evaluateAsStringValueJoined(echoIndexMetaStr, "/resource/meta/texttool/figures"); + if (echoFiguresDir != null) + echoFiguresDir = echoDir + "/" + echoFiguresDir; + else + echoFiguresDir = echoDir + "/" + "figures"; // default + mpiwgDocId = xQueryEvaluator.evaluateAsStringValueJoined(echoIndexMetaStr, "/resource/meta/dri[@type = 'mpiwg']"); + } + mdRecord.setEchoId(echoDir); + mdRecord.setEchoPageImageDir(echoPageImageDir); + mdRecord.setEchoFiguresDir(echoFiguresDir); + mdRecord.setMpiwgDocId(mpiwgDocId); + return mdRecord; + } + + private String getEchoDir(XQueryEvaluator xQueryEvaluator, String docId) throws ApplicationException { + String echoDir = null; + String urLTextUrlPath = "http://md.mpiwg-berlin.mpg.de/purls/searchSolr?text-url-path=" + docId + "&format=short"; + String resultXmlStr = performGetRequest(urLTextUrlPath); + if (resultXmlStr != null) { + if (resultXmlStr.equals("XXXXTimeoutXXXX")) + return null; + else if (resultXmlStr.equals("XXXXUrlErrorXXXX")) + return null; + String archivePath = xQueryEvaluator.evaluateAsStringValueJoined(resultXmlStr, "//archive-path"); + if (archivePath != null) { + archivePath = archivePath.replaceAll("/mpiwg/online", ""); + if (archivePath.isEmpty()) + echoDir = null; + else + echoDir = archivePath; + } + } + return echoDir; + } + + private String getNodeType(XdmNode node) { + String nodeType = null; + XdmSequenceIterator iter = node.axisIterator(Axis.CHILD); + if (iter != null) { + while (iter.hasNext()) { + XdmNode firstChild = (XdmNode) iter.next(); + if (firstChild != null) { + XdmNodeKind nodeKind = firstChild.getNodeKind(); + if (nodeKind.ordinal() == XdmNodeKind.ELEMENT.ordinal()) { + QName nodeQName = firstChild.getNodeName(); + nodeType = nodeQName.getLocalName(); + } + } + } + } + return nodeType; + } + + public String getDocFullFileName(String docId) { + String docDir = getDocDir(docId); + String docFileName = getDocFileName(docId); + String docFullFileName = docDir + "/" + docFileName; + return docFullFileName; + } + + public String getFullFileName(String docId, String type) { + String docDir = getDocDir(docId); + String docFileName = getDocFileName(docId); + int lastDot = docFileName.lastIndexOf("."); + String docFileNameWithoutExtension = docFileName.substring(0, lastDot); + String fullFileName = docDir + "/" + docFileNameWithoutExtension + ".xml"; + if (type != null && ! type.equals("toc")) { + fullFileName = docDir + "/" + docFileNameWithoutExtension + "." + type; + } else if (type != null && type.equals("toc")) { + fullFileName = docDir + "/toc.xml"; + } + return fullFileName; + } + + public String getDocDir(String docId) { + String documentsDirectory = Constants.getInstance().getDocumentsDir(); + String subDir = docId; + if (docId.contains(".")) { + int index = docId.lastIndexOf("."); + subDir = docId.substring(0, index); + } + if (! subDir.startsWith("/")) + subDir = "/" + subDir; + String docDir = documentsDirectory + subDir; + return docDir; + } + + public String getDocFileName(String docId) { + String docFileName = docId; + int index = docId.lastIndexOf("/"); + if (index != -1) { + docFileName = docId.substring(index + 1); + } + return docFileName; + } + + private String getMainLanguage(String docId) { + String mainLang = null; + int to = docId.lastIndexOf("/"); + if (to != -1) { + String preStr = docId.substring(0, to); + int from = preStr.lastIndexOf("/"); + if (from != -1) + mainLang = preStr.substring(from + 1, to); + } + return mainLang; + } + + private String deleteSpecialChars(String inputStr) { + StringBuilder buf = new StringBuilder(); + for (int i = 0; i < inputStr.length(); i++) { + char c = inputStr.charAt(i); + String replace = new String(); + switch (c) { + case '@': replace = ""; break; + case ' ': replace = ""; break; + case ';': replace = ""; break; + default: replace += c; break; + } + buf.append(replace); + } + return buf.toString(); + } + + private Hashtable<Integer, StringBuilder> getFragments(String fileName, String milestoneElementName) throws ApplicationException { + try { + GetFragmentsContentHandler getFragmentsContentHandler = new GetFragmentsContentHandler(milestoneElementName); + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(getFragmentsContentHandler); + StringReader bla = new StringReader(FileUtils.readFileToString(new File(fileName), "utf-8")); + InputSource inputSource = new InputSource(bla); + xmlParser.parse(inputSource); + Hashtable<Integer, StringBuilder> resultFragments = getFragmentsContentHandler.getResultPages(); + return resultFragments; + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private String tokenizeWithLemmas(String xmlStr, String language) throws ApplicationException { + StringReader strReader = new StringReader(xmlStr); + XmlTokenizer xmlTokenizer = new XmlTokenizer(strReader); + xmlTokenizer.setLanguage(language); + String[] outputOptionsWithLemmas = {"withLemmas"}; // so all tokens are fetched with lemmas (costs performance) + // non word breaking elements; + // TODO examine bugs with emph, figure, hi : + // e.g. "... der <hi rend="i">Capi-<lb n="16"/>talist.</hi> Es ..." + // e.g. page 30 in /echo/la/Cataneo_1600.xml + String[] nwbElements = {"lb", "br", "cb"}; + xmlTokenizer.setNWBElements(nwbElements); + xmlTokenizer.setOutputOptions(outputOptionsWithLemmas); + xmlTokenizer.tokenize(); + String retStr = xmlTokenizer.getXmlResult(); + return retStr; + } + + private String enrichWordsOrigRegNorm(String xmlStr) throws ApplicationException { + try { + WordContentHandler wordContentHandler = new WordContentHandler(); + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(wordContentHandler); + StringReader strReader = new StringReader(xmlStr); + InputSource inputSource = new InputSource(strReader); + xmlParser.parse(inputSource); + String result = wordContentHandler.getResult(); + return result; + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private String performGetRequest(String url) throws ApplicationException { + String resultStr = null; + try { + boolean urlIsOk = checkUri(url, 2000); // if url doesn't answer after 2 seconds + if (! urlIsOk) + return "XXXXTimeoutXXXX"; + HttpClient httpClient = new HttpClient(); + GetMethod method = new GetMethod(url); + httpClient.executeMethod(method); + int statusCode = method.getStatusCode(); + if (statusCode >= 400) + return "XXXXUrlErrorXXXX"; + byte[] resultBytes = method.getResponseBody(); + resultStr = new String(resultBytes, "utf-8"); + method.releaseConnection(); + } catch (HttpException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return resultStr; + } + + private boolean checkUri(String uriStr, int timeoutMilliseconds) throws ApplicationException { + boolean isOk = true; + try { + URI uri = new URI(uriStr); + HttpGet httpGet = new HttpGet(uri); + HttpParams httpParameters = new BasicHttpParams(); + // Set the timeout in milliseconds until a connection is established. + // The default value is zero, that means the timeout is not used. + int timeoutConnection = 2000; + HttpConnectionParams.setConnectionTimeout(httpParameters, timeoutConnection); + // Set the default socket timeout (SO_TIMEOUT) + // in milliseconds which is the timeout for waiting for data. + int timeoutSocket = 2000; + HttpConnectionParams.setSoTimeout(httpParameters, timeoutSocket); + DefaultHttpClient httpClient = new DefaultHttpClient(httpParameters); + HttpResponse response = httpClient.execute(httpGet); + } catch (IOException e) { + isOk = false; // if timeout exception is thrown + } catch (URISyntaxException e) { + throw new ApplicationException(e); + } + return isOk; + } + + /** + * Write string into destFile. If directory for that destFile does not exist + * it creates this directory including parent directories. + * @param str string to write + * @param destFileName destination file name + * @throws ApplicationException + */ + private void saveFile(String str, String destFileName) throws ApplicationException { + OutputStreamWriter out = null; + try { + if (str == null) + return; // do nothing + File destFile = new File(destFileName); + File destDir = new File(destFile.getParent()); + if (! destDir.exists()) { + destDir.mkdirs(); // create the directory including parent directories which do not exist + } + out = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(destFile)), "utf-8"); + out.write(str); + out.flush(); + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } finally { + try { + if (out != null) + out.close(); + } catch (Exception e) { + // nothing: always close the stream at the end of the method + } + } + } + + private void beginOperation() { + beginOfOperation = new Date().getTime(); + } + + private void endOperation() { + endOfOperation = new Date().getTime(); + } + +} \ No newline at end of file