comparison software/mpdl-services-new/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/document/DocumentHandler.java @ 25:e9fe3186670c default tip

letzter Stand eingecheckt
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 21 May 2013 10:19:32 +0200
parents
children
comparison
equal deleted inserted replaced
23:e845310098ba 25:e9fe3186670c
1 package de.mpg.mpiwg.berlin.mpdl.cms.document;
2
3 import java.io.BufferedOutputStream;
4 import java.io.File;
5 import java.io.FileNotFoundException;
6 import java.io.FileOutputStream;
7 import java.io.IOException;
8 import java.io.OutputStreamWriter;
9 import java.io.StringReader;
10 import java.net.MalformedURLException;
11 import java.net.URI;
12 import java.net.URISyntaxException;
13 import java.net.URL;
14 import java.util.Arrays;
15 import java.util.Date;
16 import java.util.Hashtable;
17 import java.util.Iterator;
18 import java.util.List;
19 import java.util.logging.Logger;
20
21 import net.sf.saxon.s9api.Axis;
22 import net.sf.saxon.s9api.QName;
23 import net.sf.saxon.s9api.XdmNode;
24 import net.sf.saxon.s9api.XdmNodeKind;
25 import net.sf.saxon.s9api.XdmSequenceIterator;
26
27 import org.apache.commons.httpclient.HttpClient;
28 import org.apache.commons.httpclient.HttpException;
29 import org.apache.commons.httpclient.methods.GetMethod;
30 import org.apache.commons.io.FileUtils;
31 import org.apache.http.HttpResponse;
32 import org.apache.http.client.methods.HttpGet;
33 import org.apache.http.impl.client.DefaultHttpClient;
34 import org.apache.http.params.BasicHttpParams;
35 import org.apache.http.params.HttpConnectionParams;
36 import org.apache.http.params.HttpParams;
37 import org.xml.sax.InputSource;
38 import org.xml.sax.SAXException;
39 import org.xml.sax.XMLReader;
40
41 import com.sun.org.apache.xerces.internal.parsers.SAXParser;
42
43 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
44 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
45 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler;
46 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;
47 import de.mpg.mpiwg.berlin.mpdl.util.StringUtils;
48 import de.mpg.mpiwg.berlin.mpdl.util.Util;
49 import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator;
50 import de.mpg.mpiwg.berlin.mpdl.cms.general.Constants;
51 import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler;
52 import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsDocOperation;
53 import de.mpg.mpiwg.berlin.mpdl.cms.transform.GetFragmentsContentHandler;
54 import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer;
55 import de.mpg.mpiwg.berlin.mpdl.cms.transform.XslResourceTransformer;
56
57 /**
58 * Handler for documents (singleton).
59 */
60 public class DocumentHandler {
61 private static Logger LOGGER = Logger.getLogger(DocumentHandler.class.getName());
62 private static List<String> EXCLUDED_PROJECT_DOCS =
63 Arrays.asList("/echo/zh/Min_chan_luyi_1_7MCGW0WG.xml", // the Saxon transfomer has heavy problems with some characters in CJK Unified Ideographs Extension B, e.g.: line 309 (second reg on page 16)
64 "/echo/zh/Min_chan_luyi_2_U7Y9NQ9V.xml",
65 "/echo/zh/Min_chan_luyi_3_2FP9M172.xml",
66 "/echo/zh/Min_chan_luyi_4_FXA6FSFH.xml",
67 "/echo/zh/Min_chan_luyi_5_VG6NY5XD.xml",
68 "/echo/zh/Xifa_shenji.xml",
69 "/echo/zh/Yulei_tushuo_1_HXX4MGZW.xml",
70 "/echo/zh/Yulei_tushuo_2_FN1CTY5C.xml");
71 private long beginOfOperation;
72 private long endOfOperation;
73
74 public void doOperation(CmsDocOperation docOperation) throws ApplicationException {
75 String operationName = docOperation.getName();
76 if (operationName.equals("create")) {
77 create(docOperation);
78 } else if (operationName.equals("delete")) {
79 delete(docOperation);
80 } else if (operationName.equals("importDirectory")) {
81 importDirectory(docOperation);
82 } else if (operationName.equals("createPdf")) {
83 createPdf(docOperation);
84 } else if (operationName.equals("createAllPdfInDirectory")) {
85 createAllPdfInDirectory(docOperation);
86 }
87 }
88
89 private void importDirectory(CmsDocOperation docOperation) throws ApplicationException {
90 try {
91 LOGGER.info("Start of DocumentHandler. This operation could be time consuming because documents are indexed (normal indexing times are 1-10 minutes for a document)");
92 beginOperation();
93 String localDocumentsUrlStr = docOperation.getSrcUrl(); // start directory: file:/a/local/directory
94 String collectionNames = docOperation.getCollectionNames(); // e.g. "echo"
95 File localDocumentsDir = new File(new URI(localDocumentsUrlStr));
96 boolean docDirExists = localDocumentsDir.exists();
97 if (! docDirExists)
98 throw new ApplicationException("Document directory:" + localDocumentsUrlStr + " does not exists. Please use a directory that exists and perform the operation again.");
99 String[] fileExtensions = {"xml"};
100 Iterator<File> iterFiles = FileUtils.iterateFiles(localDocumentsDir, fileExtensions, true);
101 int i = 0;
102 while(iterFiles.hasNext()) {
103 i++;
104 File xmlFile = iterFiles.next();
105 String xmlFileStr = xmlFile.getPath();
106 int relativePos = (int) localDocumentsDir.getPath().length();
107 String docId = xmlFileStr.substring(relativePos); // relative path name starting from localDocumentsDir, e.g. /tei/de/Test_1789.xml
108 String xmlFileUrlStr = xmlFile.toURI().toURL().toString();
109 CmsDocOperation createDocOperation = new CmsDocOperation("create", xmlFileUrlStr, null, docId);
110 createDocOperation.setCollectionNames(collectionNames);
111 try {
112 doOperation(createDocOperation);
113 Date now = new Date();
114 LOGGER.info("Document " + i + ": " + docId + " successfully imported (" + now.toString() + ")");
115 } catch (Exception e) {
116 LOGGER.info("Document " + i + ": " + docId + " has problems:");
117 e.printStackTrace();
118 }
119 }
120 endOperation();
121 LOGGER.info("The DocumentHandler needed: " + (endOfOperation - beginOfOperation) + " ms" );
122 } catch (Exception e) {
123 throw new ApplicationException(e);
124 }
125 }
126
127 private void createAllPdfInDirectory(CmsDocOperation docOperation) throws ApplicationException {
128 try {
129 LOGGER.info("Start of generating Pdf-Documents. This operation could be time consuming because Pdf generation needs some time.");
130 beginOperation();
131 String localDocumentsUrlStr = docOperation.getSrcUrl(); // start directory: file:/a/local/directory
132 String collectionNames = docOperation.getCollectionNames(); // e.g. "echo"
133 File localDocumentsDir = new File(new URI(localDocumentsUrlStr));
134 boolean docDirExists = localDocumentsDir.exists();
135 if (! docDirExists)
136 throw new ApplicationException("Document directory:" + localDocumentsUrlStr + " does not exists. Please use a directory that exists and perform the operation again.");
137 String[] fileExtensions = {"xml"};
138 Iterator<File> iterFiles = FileUtils.iterateFiles(localDocumentsDir, fileExtensions, true);
139 int i = 0;
140 while(iterFiles.hasNext()) {
141 i++;
142 File xmlFile = iterFiles.next();
143 String xmlFileStr = xmlFile.getPath();
144 int relativePos = (int) localDocumentsDir.getPath().length();
145 String docId = xmlFileStr.substring(relativePos); // relative path name starting from localDocumentsDir, e.g. /tei/de/Test_1789.xml
146 CmsDocOperation createPdfOperation = new CmsDocOperation("createPdf", null, null, docId);
147 createPdfOperation.setCollectionNames(collectionNames);
148 try {
149 doOperation(createPdfOperation);
150 Date now = new Date();
151 LOGGER.info("Pdf document " + i + ": " + docId + " successfully created (" + now.toString() + ")");
152 } catch (Exception e) {
153 LOGGER.info("Pdf document " + i + ": " + docId + " has problems:");
154 e.printStackTrace();
155 }
156 }
157 endOperation();
158 LOGGER.info("The Pdf generation needed: " + (endOfOperation - beginOfOperation) + " ms" );
159 } catch (Exception e) {
160 throw new ApplicationException(e);
161 }
162 }
163
164 private boolean isProjectDoc(String docId) {
165 boolean isProjectDoc = true;
166 if (EXCLUDED_PROJECT_DOCS.contains(docId))
167 return false;
168 return isProjectDoc;
169 }
170
171 private void create(CmsDocOperation docOperation) throws ApplicationException {
172 try {
173 String operationName = docOperation.getName();
174 String srcUrlStr = docOperation.getSrcUrl();
175 String docId = docOperation.getDocIdentifier();
176 if (! isProjectDoc(docId)) {
177 LOGGER.info("Operation: " + operationName + " not performed on: " + docId + ". Cause: document is excluded as project doc");
178 return;
179 }
180 String mainLanguage = docOperation.getMainLanguage();
181 if (mainLanguage == null) {
182 mainLanguage = getMainLanguage(docId);
183 }
184 String[] elementNames = docOperation.getElementNames();
185 if (elementNames == null) {
186 String[] defaultElementNames = {"s", "head", "caption", "variables", "description"};
187 docOperation.setElementNames(defaultElementNames); // default
188 }
189 String docDirName = getDocDir(docId);
190 String docDestFileName = getDocFullFileName(docId);
191 URL srcUrl = null;
192 String protocol = null;
193 if (srcUrlStr != null && ! srcUrlStr.equals("empty")) {
194 srcUrl = new URL(srcUrlStr);
195 protocol = srcUrl.getProtocol();
196 }
197 File docDestFile = new File(docDestFileName);
198 // parse validation on file
199 XQueryEvaluator xQueryEvaluator = new XQueryEvaluator();
200 XdmNode docNode = xQueryEvaluator.parse(srcUrl); // if it is not parseable an exception with a detail message is thrown
201 String docType = getNodeType(docNode); // archimedes, echo, TEI, html ...
202 docType = docType.trim();
203 if (docType == null) {
204 docOperation.setErrorMessage("file type of: " + srcUrlStr + "is not supported");
205 return;
206 }
207 // perform operation on file system
208 if (protocol.equals("file")) {
209 docOperation.setStatus("upload file: " + srcUrlStr + " to CMS");
210 } else {
211 docOperation.setStatus("download file from: " + srcUrlStr + " to CMS");
212 }
213 FileUtils.copyURLToFile(srcUrl, docDestFile, 100000, 100000);
214
215 // replace anchor in echo documents and also add the number attribute to figures
216 String docDestFileNameUpgrade = docDestFileName + ".upgrade";
217 File docDestFileUpgrade = new File(docDestFileNameUpgrade);
218 XslResourceTransformer replaceAnchorTransformer = new XslResourceTransformer("replaceAnchor.xsl");
219 String docDestFileUrlStr = docDestFile.getPath();
220 String result = replaceAnchorTransformer.transform(docDestFileUrlStr);
221 FileUtils.writeStringToFile(docDestFileUpgrade, result, "utf-8");
222
223 MetadataRecord mdRecord = new MetadataRecord();
224 mdRecord.setDocId(docId);
225 mdRecord.setCollectionNames(docOperation.getCollectionNames());
226 mdRecord.setType("text/xml");
227
228 // generate toc file (toc, figure, handwritten)
229 XslResourceTransformer tocTransformer = new XslResourceTransformer("toc.xsl");
230 File tocFile = new File(docDirName + "/toc.xml");
231 String tocResult = tocTransformer.transform(docDestFileNameUpgrade);
232 FileUtils.writeStringToFile(tocFile, tocResult, "utf-8");
233
234 // Get metadata info of the xml document
235 docOperation.setStatus("extract metadata of: " + srcUrlStr + " to CMS");
236 XQueryEvaluator xQueryEvaluator2 = new XQueryEvaluator();
237 mdRecord = getMetadataRecord(docDestFileUpgrade, docType, mdRecord, xQueryEvaluator2);
238 String mdRecordLanguage = mdRecord.getLanguage();
239 if (mdRecordLanguage == null && mainLanguage != null)
240 mdRecord.setLanguage(mainLanguage);
241
242 // save all pages as single xml files (untokenized and tokenized)
243 docOperation.setStatus("extract page fragments of: " + srcUrlStr + " to CMS");
244 File docDir = new File(docDirName + "/pages");
245 FileUtils.deleteQuietly(docDir); // first delete pages directory
246 Hashtable<Integer, StringBuilder> pageFragments = getFragments(docDestFileNameUpgrade, "pb");
247 int pageCount = pageFragments.size();
248 if (pageCount == 0) {
249 // no pb element is found: then the whole document is the first page
250 String docXmlStr = FileUtils.readFileToString(docDestFileUpgrade, "utf-8");
251 docXmlStr = docXmlStr.replaceAll("<\\?xml.*?\\?>", ""); // remove the xml declaration if it exists
252 pageFragments = new Hashtable<Integer, StringBuilder>();
253 pageFragments.put(new Integer(1), new StringBuilder(docXmlStr));
254 pageCount = 1;
255 }
256 PageTransformer pageTransformer = new PageTransformer();
257 for (int page=1; page<=pageCount; page++) {
258 String fragment = pageFragments.get(new Integer(page)).toString();
259 fragment = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + fragment;
260 String docPageFileName = docDirName + "/pages/page-" + page + ".xml";
261 File docPageFile = new File(docPageFileName);
262 FileUtils.writeStringToFile(docPageFile, fragment, "utf-8");
263 String language = mdRecord.getLanguage();
264 String tokenizedXmlStr = tokenizeWithLemmas(fragment, language); // xml fragment enriched with <w> elements
265 tokenizedXmlStr = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + tokenizedXmlStr;
266 tokenizedXmlStr = enrichWordsOrigRegNorm(tokenizedXmlStr); // xml string: enrich <w> elements with normalization info (orig, reg, norm)
267 String docPageTokenizedFileName = docDirName + "/pages/page-" + page + "-morph.xml";
268 File docPageTokenizedFile = new File(docPageTokenizedFileName);
269 FileUtils.writeStringToFile(docPageTokenizedFile, tokenizedXmlStr, "utf-8");
270 String docPageHtmlFileName = docDirName + "/pages/page-" + page + ".html";
271 File docPageHtmlFile = new File(docPageHtmlFileName);
272 String htmlStr = pageTransformer.transform(tokenizedXmlStr, mdRecord, page, "html");
273 FileUtils.writeStringToFile(docPageHtmlFile, htmlStr, "utf-8");
274 }
275
276 // perform operation on Lucene
277 docOperation.setStatus(operationName + " document: " + docId + " in CMS");
278 docOperation.setMdRecord(mdRecord);
279 IndexHandler indexHandler = IndexHandler.getInstance();
280 indexHandler.indexDocument(docOperation);
281
282 } catch (IOException e) {
283 throw new ApplicationException(e);
284 }
285 }
286
287 private void delete(CmsDocOperation docOperation) throws ApplicationException {
288 String operationName = docOperation.getName();
289 String docIdentifier = docOperation.getDocIdentifier();
290 if (docIdentifier == null || docIdentifier.trim().equals(""))
291 throw new ApplicationException("Your document identifier is empty. Please specify a document identifier for your document.");
292 String docDirStr = getDocDir(docIdentifier);
293 File docDir = new File(docDirStr);
294 boolean docExists = docDir.exists();
295 if (! docExists) {
296 throw new ApplicationException("Document:" + docIdentifier + " does not exists. Please use a name that exists and perform the operation \"Delete\" again.");
297 }
298 // perform operation on file system
299 docOperation.setStatus(operationName + " document: " + docIdentifier + " in CMS");
300 FileUtils.deleteQuietly(docDir);
301
302 // perform operation on Lucene
303 IndexHandler indexHandler = IndexHandler.getInstance();
304 indexHandler.deleteDocument(docOperation);
305
306 }
307
308 private void createPdf(CmsDocOperation docOperation) throws ApplicationException {
309 String docId = docOperation.getDocIdentifier();
310 String operationName = docOperation.getName();
311 if (docId == null || docId.trim().equals(""))
312 throw new ApplicationException("Your document identifier is empty. Please specify a document identifier for your document.");
313 if (! isProjectDoc(docId)) {
314 LOGGER.info("Operation: " + operationName + " not performed on: " + docId + ". Cause: document is excluded as project doc");
315 return;
316 }
317 IndexHandler indexHandler = IndexHandler.getInstance();
318 MetadataRecord mdRecord = indexHandler.getDocMetadata(docId);
319 docOperation.setStatus("create PDF and HTML versions of the document: " + docId);
320 PdfHandler pdfHandler = PdfHandler.getInstance();
321 pdfHandler.createFile(true, true, mdRecord); // generate Pdf + Html document
322 }
323
324 private MetadataRecord getMetadataRecord(File xmlFile, String schemaName, MetadataRecord mdRecord, XQueryEvaluator xQueryEvaluator) throws ApplicationException {
325 if (schemaName == null)
326 return mdRecord;
327 try {
328 URL srcUrl = xmlFile.toURI().toURL();
329 if (schemaName.equals("archimedes"))
330 mdRecord = getMetadataRecordArch(xQueryEvaluator, srcUrl, mdRecord);
331 else if (schemaName.equals("echo"))
332 mdRecord = getMetadataRecordEcho(xQueryEvaluator, srcUrl, mdRecord);
333 else if (schemaName.equals("TEI"))
334 mdRecord = getMetadataRecordTei(xQueryEvaluator, srcUrl, mdRecord);
335 else if (schemaName.equals("html"))
336 mdRecord = getMetadataRecordHtml(xQueryEvaluator, srcUrl, mdRecord);
337 else
338 mdRecord.setSchemaName("diverse"); // all other cases: set docType to schemaName
339 } catch (MalformedURLException e) {
340 throw new ApplicationException(e);
341 }
342 mdRecord.setLastModified(new Date());
343 return mdRecord;
344 }
345
346 private MetadataRecord getMetadataRecordArch(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
347 String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/archimedes//info");
348 if (metadataXmlStr != null) {
349 String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/locator");
350 if (identifier != null)
351 identifier = StringUtils.deresolveXmlEntities(identifier);
352 String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/author");
353 if (creator != null)
354 creator = StringUtils.deresolveXmlEntities(creator);
355 String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/title");
356 if (title != null)
357 title = StringUtils.deresolveXmlEntities(title);
358 String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/lang[1]");
359 if (language != null)
360 language = StringUtils.deresolveXmlEntities(language);
361 String place = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/place");
362 if (place != null)
363 place = StringUtils.deresolveXmlEntities(place);
364 String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/date");
365 Date date = null;
366 if (yearStr != null && ! yearStr.equals("")) {
367 yearStr = StringUtils.deresolveXmlEntities(yearStr);
368 yearStr = new Util().toYearStr(yearStr); // test if possible etc
369 if (yearStr != null) {
370 try {
371 date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z");
372 } catch (Exception e) {
373 // nothing
374 }
375 }
376 }
377 String rights = "open access";
378 String license = "http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration";
379 String accessRights = "free";
380
381 mdRecord.setIdentifier(identifier);
382 mdRecord.setLanguage(language);
383 mdRecord.setCreator(creator);
384 mdRecord.setTitle(title);
385 mdRecord.setPublisher(place);
386 mdRecord.setRights(rights);
387 mdRecord.setDate(date);
388 mdRecord.setLicense(license);
389 mdRecord.setAccessRights(accessRights);
390
391 // get echo metadata
392 String echoDir = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/info/echodir");
393 String docId = mdRecord.getDocId();
394 String echoIdTmp = docId;
395 if (docId != null && ! docId.isEmpty()) {
396 int start = docId.lastIndexOf("/");
397 if (start != -1)
398 start = start + 1;
399 else
400 start = 0;
401 int end = docId.lastIndexOf(".");
402 if (end == -1)
403 end = docId.length();
404 echoIdTmp = docId.substring(start, end);
405 }
406 String echoId = "/permanent/archimedes/" + echoIdTmp;
407 if (echoIdTmp == null || echoIdTmp.isEmpty())
408 echoId = null;
409 if (echoDir != null && ! echoDir.isEmpty()) {
410 echoId = echoDir;
411 }
412 mdRecord = getEchoMetadata(xQueryEvaluator, echoId, mdRecord);
413 }
414 String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//pb)");
415 int pageCount = Integer.valueOf(pageCountStr);
416 mdRecord.setPageCount(pageCount);
417 mdRecord.setSchemaName("archimedes");
418 return mdRecord;
419 }
420
421 private MetadataRecord getMetadataRecordEcho(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
422 String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/*:echo/*:metadata");
423 if (metadataXmlStr != null) {
424 String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:identifier");
425 if (identifier != null) {
426 identifier = StringUtils.deresolveXmlEntities(identifier);
427 }
428 String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:creator");
429 if (creator != null)
430 creator = StringUtils.deresolveXmlEntities(creator);
431 String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:title");
432 if (title != null)
433 title = StringUtils.deresolveXmlEntities(title);
434 String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:language[1]");
435 if (language != null)
436 language = StringUtils.deresolveXmlEntities(language);
437 String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:date");
438 Date date = null;
439 if (yearStr != null && ! yearStr.equals("")) {
440 yearStr = StringUtils.deresolveXmlEntities(yearStr);
441 yearStr = new Util().toYearStr(yearStr); // test if possible etc
442 if (yearStr != null) {
443 try {
444 date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z");
445 } catch (Exception e) {
446 // nothing
447 }
448 }
449 }
450 String rights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:rights");
451 if (rights != null)
452 rights = StringUtils.deresolveXmlEntities(rights);
453 String license = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:license");
454 if (license != null)
455 license = StringUtils.deresolveXmlEntities(license);
456 String accessRights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:accessRights");
457 if (accessRights != null)
458 accessRights = StringUtils.deresolveXmlEntities(accessRights);
459
460 mdRecord.setIdentifier(identifier);
461 mdRecord.setLanguage(language);
462 mdRecord.setCreator(creator);
463 mdRecord.setTitle(title);
464 mdRecord.setRights(rights);
465 mdRecord.setDate(date);
466 mdRecord.setLicense(license);
467 mdRecord.setAccessRights(accessRights);
468
469 // get echo metadata
470 String echoDir = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:metadata/*:echodir");
471 String echoIdTmp = identifier;
472 if (identifier != null && ! identifier.isEmpty()) {
473 int start = identifier.indexOf("ECHO:");
474 if (start != -1)
475 start = start + 5;
476 else
477 start = 0;
478 int end = identifier.lastIndexOf(".");
479 if (end == -1)
480 end = identifier.length();
481 echoIdTmp = identifier.substring(start, end);
482 }
483 String echoId = "/permanent/library/" + echoIdTmp;
484 if (echoIdTmp == null || echoIdTmp.isEmpty())
485 echoId = null;
486 if (echoDir != null && ! echoDir.isEmpty()) {
487 echoId = echoDir;
488 }
489 mdRecord = getEchoMetadata(xQueryEvaluator, echoId, mdRecord);
490 }
491 String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//*:pb)");
492 int pageCount = Integer.valueOf(pageCountStr);
493 mdRecord.setPageCount(pageCount);
494 mdRecord.setSchemaName("echo");
495 return mdRecord;
496 }
497
498 private MetadataRecord getMetadataRecordTei(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
499 String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/*:TEI/*:teiHeader");
500 if (metadataXmlStr != null) {
501 String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:idno");
502 if (identifier != null) {
503 identifier = StringUtils.deresolveXmlEntities(identifier);
504 identifier = deleteSpecialChars(identifier);
505 }
506 String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:titleStmt/*:author");
507 if (creator != null)
508 creator = StringUtils.deresolveXmlEntities(creator);
509 String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:titleStmt/*:title");
510 if (title != null)
511 title = StringUtils.deresolveXmlEntities(title);
512 String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:profileDesc/*:langUsage/*:language[1]/@ident)");
513 if (language != null && language.isEmpty())
514 language = null;
515 if (language != null) {
516 language = language.toLowerCase();
517 if (language.length() == 5) { // e.g. "de-DE or en-US"
518 if (language.substring(2, 3).equals("-")) {
519 String lang = language.substring(0, 2);
520 language = Language.getInstance().getISO639Code(lang);
521 }
522 }
523 }
524 String place = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:pubPlace");
525 if (place != null)
526 place = StringUtils.deresolveXmlEntities(place);
527 String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:date");
528 Date date = null;
529 if (yearStr != null && ! yearStr.equals("")) {
530 yearStr = StringUtils.deresolveXmlEntities(yearStr);
531 yearStr = new Util().toYearStr(yearStr); // test if possible etc
532 if (yearStr != null) {
533 try {
534 date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z");
535 } catch (Exception e) {
536 // nothing
537 }
538 }
539 }
540 String subject = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:profileDesc/*:textClass/*:keywords/*:term)");
541 if (subject != null)
542 subject = StringUtils.deresolveXmlEntities(subject);
543 String rights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:availability");
544 if (rights == null)
545 rights = "open access";
546 rights = StringUtils.deresolveXmlEntities(rights);
547 String license = "http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration";
548 String accessRights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:fileDesc/*:publicationStmt/*:availability/@status)");
549 if (accessRights == null)
550 accessRights = "free";
551 accessRights = StringUtils.deresolveXmlEntities(accessRights);
552
553 mdRecord.setIdentifier(identifier);
554 mdRecord.setLanguage(language);
555 mdRecord.setCreator(creator);
556 mdRecord.setTitle(title);
557 mdRecord.setPublisher(place);
558 mdRecord.setRights(rights);
559 mdRecord.setDate(date);
560 mdRecord.setSubject(subject);
561 mdRecord.setLicense(license);
562 mdRecord.setAccessRights(accessRights);
563
564 // get echo metadata
565 mdRecord = getEchoMetadata(xQueryEvaluator, identifier, mdRecord); // identifier is echoDir
566 }
567 String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//*:pb)");
568 int pageCount = Integer.valueOf(pageCountStr);
569 mdRecord.setPageCount(pageCount);
570 mdRecord.setSchemaName("TEI");
571 return mdRecord;
572 }
573
574 private MetadataRecord getMetadataRecordHtml(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
575 String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/html/head");
576 if (metadataXmlStr != null) {
577 String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.identifier']/@content)");
578 if (identifier != null && ! identifier.isEmpty())
579 identifier = StringUtils.deresolveXmlEntities(identifier);
580 String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.creator']/@content)");
581 if (creator != null && ! creator.isEmpty())
582 creator = StringUtils.deresolveXmlEntities(creator);
583 String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.title']/@content)");
584 if (title != null && ! title.isEmpty())
585 title = StringUtils.deresolveXmlEntities(title);
586 String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.language']/@content)");
587 if (language != null && language.isEmpty())
588 language = null;
589 if (language != null && ! language.isEmpty())
590 language = StringUtils.deresolveXmlEntities(language);
591 String publisher = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.publisher']/@content)");
592 if (publisher != null)
593 publisher = StringUtils.deresolveXmlEntities(publisher);
594 String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.date']/@content)");
595 Date date = null;
596 if (yearStr != null && ! yearStr.equals("")) {
597 yearStr = StringUtils.deresolveXmlEntities(yearStr);
598 yearStr = new Util().toYearStr(yearStr); // test if possible etc
599 if (yearStr != null) {
600 try {
601 date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z");
602 } catch (Exception e) {
603 // nothing
604 }
605 }
606 }
607 String subject = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.subject']/@content)");
608 if (subject != null)
609 subject = StringUtils.deresolveXmlEntities(subject);
610 String rights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.rights']/@content)");
611 if (rights != null && ! rights.isEmpty())
612 rights = StringUtils.deresolveXmlEntities(rights);
613 String license = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.license']/@content)");
614 if (license != null && ! license.isEmpty())
615 license = StringUtils.deresolveXmlEntities(license);
616 String accessRights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.accessRights']/@content)");
617 if (accessRights != null && ! accessRights.isEmpty())
618 accessRights = StringUtils.deresolveXmlEntities(accessRights);
619
620 mdRecord.setIdentifier(identifier);
621 mdRecord.setLanguage(language);
622 mdRecord.setCreator(creator);
623 mdRecord.setTitle(title);
624 mdRecord.setPublisher(publisher);
625 mdRecord.setRights(rights);
626 mdRecord.setDate(date);
627 mdRecord.setSubject(subject);
628 mdRecord.setLicense(license);
629 mdRecord.setAccessRights(accessRights);
630
631 // get echo metadata
632 mdRecord = getEchoMetadata(xQueryEvaluator, identifier, mdRecord); // identifier is echoDir
633 }
634 String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//pb)");
635 int pageCount = Integer.valueOf(pageCountStr);
636 mdRecord.setPageCount(pageCount);
637 mdRecord.setSchemaName("html");
638 return mdRecord;
639 }
640
641 private MetadataRecord getEchoMetadata(XQueryEvaluator xQueryEvaluator, String echoDir, MetadataRecord mdRecord) throws ApplicationException {
642 if (echoDir == null || echoDir.isEmpty()) {
643 String docId = mdRecord.getDocId();
644 echoDir = getEchoDir(xQueryEvaluator, docId);
645 if (echoDir == null)
646 return mdRecord;
647 }
648 String urLTexter = "http://digilib.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter?fn=" + echoDir + "/index.meta";
649 String echoIndexMetaStr = performGetRequest(urLTexter);
650 String echoPageImageDir = null;
651 String echoFiguresDir = null;
652 String mpiwgDocId = null;
653 if (echoIndexMetaStr != null) {
654 if (echoIndexMetaStr.equals("XXXXTimeoutXXXX"))
655 return null;
656 else if (echoIndexMetaStr.equals("XXXXUrlErrorXXXX"))
657 return mdRecord;
658 echoPageImageDir = xQueryEvaluator.evaluateAsStringValueJoined(echoIndexMetaStr, "/resource/meta/texttool/image");
659 if (echoPageImageDir != null)
660 echoPageImageDir = echoDir + "/" + echoPageImageDir;
661 else
662 echoPageImageDir = echoDir + "/" + "pageimg"; // default
663 echoFiguresDir = xQueryEvaluator.evaluateAsStringValueJoined(echoIndexMetaStr, "/resource/meta/texttool/figures");
664 if (echoFiguresDir != null)
665 echoFiguresDir = echoDir + "/" + echoFiguresDir;
666 else
667 echoFiguresDir = echoDir + "/" + "figures"; // default
668 mpiwgDocId = xQueryEvaluator.evaluateAsStringValueJoined(echoIndexMetaStr, "/resource/meta/dri[@type = 'mpiwg']");
669 }
670 mdRecord.setEchoId(echoDir);
671 mdRecord.setEchoPageImageDir(echoPageImageDir);
672 mdRecord.setEchoFiguresDir(echoFiguresDir);
673 mdRecord.setMpiwgDocId(mpiwgDocId);
674 return mdRecord;
675 }
676
677 private String getEchoDir(XQueryEvaluator xQueryEvaluator, String docId) throws ApplicationException {
678 String echoDir = null;
679 String urLTextUrlPath = "http://md.mpiwg-berlin.mpg.de/purls/searchSolr?text-url-path=" + docId + "&format=short";
680 String resultXmlStr = performGetRequest(urLTextUrlPath);
681 if (resultXmlStr != null) {
682 if (resultXmlStr.equals("XXXXTimeoutXXXX"))
683 return null;
684 else if (resultXmlStr.equals("XXXXUrlErrorXXXX"))
685 return null;
686 String archivePath = xQueryEvaluator.evaluateAsStringValueJoined(resultXmlStr, "//archive-path");
687 if (archivePath != null) {
688 archivePath = archivePath.replaceAll("/mpiwg/online", "");
689 if (archivePath.isEmpty())
690 echoDir = null;
691 else
692 echoDir = archivePath;
693 }
694 }
695 return echoDir;
696 }
697
698 private String getNodeType(XdmNode node) {
699 String nodeType = null;
700 XdmSequenceIterator iter = node.axisIterator(Axis.CHILD);
701 if (iter != null) {
702 while (iter.hasNext()) {
703 XdmNode firstChild = (XdmNode) iter.next();
704 if (firstChild != null) {
705 XdmNodeKind nodeKind = firstChild.getNodeKind();
706 if (nodeKind.ordinal() == XdmNodeKind.ELEMENT.ordinal()) {
707 QName nodeQName = firstChild.getNodeName();
708 nodeType = nodeQName.getLocalName();
709 }
710 }
711 }
712 }
713 return nodeType;
714 }
715
716 public String getDocFullFileName(String docId) {
717 String docDir = getDocDir(docId);
718 String docFileName = getDocFileName(docId);
719 String docFullFileName = docDir + "/" + docFileName;
720 return docFullFileName;
721 }
722
723 public String getFullFileName(String docId, String type) {
724 String docDir = getDocDir(docId);
725 String docFileName = getDocFileName(docId);
726 int lastDot = docFileName.lastIndexOf(".");
727 String docFileNameWithoutExtension = docFileName.substring(0, lastDot);
728 String fullFileName = docDir + "/" + docFileNameWithoutExtension + ".xml";
729 if (type != null && ! type.equals("toc")) {
730 fullFileName = docDir + "/" + docFileNameWithoutExtension + "." + type;
731 } else if (type != null && type.equals("toc")) {
732 fullFileName = docDir + "/toc.xml";
733 }
734 return fullFileName;
735 }
736
737 public String getDocDir(String docId) {
738 String documentsDirectory = Constants.getInstance().getDocumentsDir();
739 String subDir = docId;
740 if (docId.contains(".")) {
741 int index = docId.lastIndexOf(".");
742 subDir = docId.substring(0, index);
743 }
744 if (! subDir.startsWith("/"))
745 subDir = "/" + subDir;
746 String docDir = documentsDirectory + subDir;
747 return docDir;
748 }
749
750 public String getDocFileName(String docId) {
751 String docFileName = docId;
752 int index = docId.lastIndexOf("/");
753 if (index != -1) {
754 docFileName = docId.substring(index + 1);
755 }
756 return docFileName;
757 }
758
759 private String getMainLanguage(String docId) {
760 String mainLang = null;
761 int to = docId.lastIndexOf("/");
762 if (to != -1) {
763 String preStr = docId.substring(0, to);
764 int from = preStr.lastIndexOf("/");
765 if (from != -1)
766 mainLang = preStr.substring(from + 1, to);
767 }
768 return mainLang;
769 }
770
771 private String deleteSpecialChars(String inputStr) {
772 StringBuilder buf = new StringBuilder();
773 for (int i = 0; i < inputStr.length(); i++) {
774 char c = inputStr.charAt(i);
775 String replace = new String();
776 switch (c) {
777 case '@': replace = ""; break;
778 case ' ': replace = ""; break;
779 case ';': replace = ""; break;
780 default: replace += c; break;
781 }
782 buf.append(replace);
783 }
784 return buf.toString();
785 }
786
787 private Hashtable<Integer, StringBuilder> getFragments(String fileName, String milestoneElementName) throws ApplicationException {
788 try {
789 GetFragmentsContentHandler getFragmentsContentHandler = new GetFragmentsContentHandler(milestoneElementName);
790 XMLReader xmlParser = new SAXParser();
791 xmlParser.setContentHandler(getFragmentsContentHandler);
792 StringReader bla = new StringReader(FileUtils.readFileToString(new File(fileName), "utf-8"));
793 InputSource inputSource = new InputSource(bla);
794 xmlParser.parse(inputSource);
795 Hashtable<Integer, StringBuilder> resultFragments = getFragmentsContentHandler.getResultPages();
796 return resultFragments;
797 } catch (SAXException e) {
798 throw new ApplicationException(e);
799 } catch (IOException e) {
800 throw new ApplicationException(e);
801 }
802 }
803
804 private String tokenizeWithLemmas(String xmlStr, String language) throws ApplicationException {
805 StringReader strReader = new StringReader(xmlStr);
806 XmlTokenizer xmlTokenizer = new XmlTokenizer(strReader);
807 xmlTokenizer.setLanguage(language);
808 String[] outputOptionsWithLemmas = {"withLemmas"}; // so all tokens are fetched with lemmas (costs performance)
809 // non word breaking elements;
810 // TODO examine bugs with emph, figure, hi :
811 // e.g. "... der <hi rend="i">Capi-<lb n="16"/>talist.</hi> Es ..."
812 // e.g. page 30 in /echo/la/Cataneo_1600.xml
813 String[] nwbElements = {"lb", "br", "cb"};
814 xmlTokenizer.setNWBElements(nwbElements);
815 xmlTokenizer.setOutputOptions(outputOptionsWithLemmas);
816 xmlTokenizer.tokenize();
817 String retStr = xmlTokenizer.getXmlResult();
818 return retStr;
819 }
820
821 private String enrichWordsOrigRegNorm(String xmlStr) throws ApplicationException {
822 try {
823 WordContentHandler wordContentHandler = new WordContentHandler();
824 XMLReader xmlParser = new SAXParser();
825 xmlParser.setContentHandler(wordContentHandler);
826 StringReader strReader = new StringReader(xmlStr);
827 InputSource inputSource = new InputSource(strReader);
828 xmlParser.parse(inputSource);
829 String result = wordContentHandler.getResult();
830 return result;
831 } catch (SAXException e) {
832 throw new ApplicationException(e);
833 } catch (IOException e) {
834 throw new ApplicationException(e);
835 }
836 }
837
838 private String performGetRequest(String url) throws ApplicationException {
839 String resultStr = null;
840 try {
841 boolean urlIsOk = checkUri(url, 2000); // if url doesn't answer after 2 seconds
842 if (! urlIsOk)
843 return "XXXXTimeoutXXXX";
844 HttpClient httpClient = new HttpClient();
845 GetMethod method = new GetMethod(url);
846 httpClient.executeMethod(method);
847 int statusCode = method.getStatusCode();
848 if (statusCode >= 400)
849 return "XXXXUrlErrorXXXX";
850 byte[] resultBytes = method.getResponseBody();
851 resultStr = new String(resultBytes, "utf-8");
852 method.releaseConnection();
853 } catch (HttpException e) {
854 throw new ApplicationException(e);
855 } catch (IOException e) {
856 throw new ApplicationException(e);
857 }
858 return resultStr;
859 }
860
861 private boolean checkUri(String uriStr, int timeoutMilliseconds) throws ApplicationException {
862 boolean isOk = true;
863 try {
864 URI uri = new URI(uriStr);
865 HttpGet httpGet = new HttpGet(uri);
866 HttpParams httpParameters = new BasicHttpParams();
867 // Set the timeout in milliseconds until a connection is established.
868 // The default value is zero, that means the timeout is not used.
869 int timeoutConnection = 2000;
870 HttpConnectionParams.setConnectionTimeout(httpParameters, timeoutConnection);
871 // Set the default socket timeout (SO_TIMEOUT)
872 // in milliseconds which is the timeout for waiting for data.
873 int timeoutSocket = 2000;
874 HttpConnectionParams.setSoTimeout(httpParameters, timeoutSocket);
875 DefaultHttpClient httpClient = new DefaultHttpClient(httpParameters);
876 HttpResponse response = httpClient.execute(httpGet);
877 } catch (IOException e) {
878 isOk = false; // if timeout exception is thrown
879 } catch (URISyntaxException e) {
880 throw new ApplicationException(e);
881 }
882 return isOk;
883 }
884
885 /**
886 * Write string into destFile. If directory for that destFile does not exist
887 * it creates this directory including parent directories.
888 * @param str string to write
889 * @param destFileName destination file name
890 * @throws ApplicationException
891 */
892 private void saveFile(String str, String destFileName) throws ApplicationException {
893 OutputStreamWriter out = null;
894 try {
895 if (str == null)
896 return; // do nothing
897 File destFile = new File(destFileName);
898 File destDir = new File(destFile.getParent());
899 if (! destDir.exists()) {
900 destDir.mkdirs(); // create the directory including parent directories which do not exist
901 }
902 out = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(destFile)), "utf-8");
903 out.write(str);
904 out.flush();
905 } catch (FileNotFoundException e) {
906 throw new ApplicationException(e);
907 } catch (IOException e) {
908 throw new ApplicationException(e);
909 } finally {
910 try {
911 if (out != null)
912 out.close();
913 } catch (Exception e) {
914 // nothing: always close the stream at the end of the method
915 }
916 }
917 }
918
919 private void beginOperation() {
920 beginOfOperation = new Date().getTime();
921 }
922
923 private void endOperation() {
924 endOfOperation = new Date().getTime();
925 }
926
927 }