diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/ArchimedesDocManager.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/ArchimedesDocManager.java	Wed Nov 24 17:24:23 2010 +0100
@@ -0,0 +1,147 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.doc;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Date;
+
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+import com.sun.org.apache.xerces.internal.parsers.SAXParser;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
+import de.mpg.mpiwg.berlin.mpdl.util.Util;
+import de.mpg.mpiwg.berlin.mpdl.util.FilenameFilterExtension;
+
+/**
+ * Command-line driver that feeds the Archimedes XML documents found below
+ * {@code ARCH_DOC_DIR} through SAX content handlers, handing each handler the
+ * matching target file below {@code ARCH_DOC_OUT_DIR}.
+ *
+ * What the handlers do with the target file is not visible from this class;
+ * presumably they write the transcoded document there (confirm against
+ * ArchimedesDocContentHandler / ArchimedesDocForeignLangContentHandler).
+ *
+ * The lazy initialisation in {@link #getInstance()} is not synchronized.
+ */
+public class ArchimedesDocManager {
+  private static ArchimedesDocManager instance;
+  private static final String MPDL_DOC_DIR = MpdlConstants.MPDL_DOC_DIR;
+  private static final String ARCH_DOC_DIR = MPDL_DOC_DIR + "/documents/archimedes";
+  private static final String ARCH_DOC_OUT_DIR = MPDL_DOC_DIR + "/documentsTranscodedToUnicode/archimedes";
+  // handler used for the most recently processed file of the respective kind
+  private ArchimedesDocContentHandler archimedesDocContentHandler;
+  private ArchimedesDocForeignLangContentHandler archimedesDocForeignLangContentHandler;
+  // wall-clock timestamps taken by beginOperation() / endOperation()
+  private Date beginOfOperation;
+  private Date endOfOperation;
+
+  /**
+   * Returns the shared manager, creating it on first use.
+   *
+   * @return the shared instance, never null
+   * @throws ApplicationException declared for callers; not thrown by the code in this method
+   */
+  public static ArchimedesDocManager getInstance() throws ApplicationException {
+    if (instance == null) {
+      instance = new ArchimedesDocManager();
+    }
+    return instance;
+  }
+
+  /**
+   * Runs the foreign-language pass over all configured language directories
+   * and prints the elapsed time to standard output.
+   *
+   * @param args ignored
+   * @throws ApplicationException if a directory cannot be listed or created, or a file cannot be read or parsed
+   */
+  public static void main(String[] args) throws ApplicationException {
+    ArchimedesDocManager manager = getInstance();
+    manager.beginOperation();
+    System.out.print("Start ...");
+
+    // Whole-document transcoding passes, currently disabled:
+    // Greek
+    // manager.transcodeDirectory("el", "betacode", "unicode", ARCH_DOC_DIR + "/el", ARCH_DOC_OUT_DIR + "/el");
+    // Arabic
+    // manager.transcodeDirectory("ar", "buckwalter", "unicode", ARCH_DOC_DIR + "/ar", ARCH_DOC_OUT_DIR + "/ar");
+
+    // Foreign lang=greek transcoding
+    manager.transcodeForeignLangFiles();
+
+    manager.end();
+    manager.endOperation();
+    Double elapsedTime = new Util().getSecondWithMillisecondsBetween(manager.beginOfOperation, manager.endOfOperation);
+    System.out.println("End.");
+    System.out.println("Needed time: " + elapsedTime + " seconds");
+  }
+
+  /**
+   * Processes every *.xml file of one input directory with an
+   * ArchimedesDocContentHandler, using a same-named file in the output directory as target.
+   *
+   * @param language      language code passed on to the content handler
+   * @param fromEncoding  source encoding name passed on to the content handler
+   * @param toEncoding    target encoding name passed on to the content handler
+   * @param inputDirName  directory that is scanned (non-recursively) for xml files
+   * @param outputDirName directory that receives the target files; created if missing
+   * @throws ApplicationException if the input directory cannot be listed, the output
+   *         directory cannot be created, or a file cannot be read or parsed
+   */
+  private void transcodeDirectory(String language, String fromEncoding, String toEncoding, String inputDirName, String outputDirName) throws ApplicationException {
+    File[] files = listXmlFiles(new File(inputDirName));
+    for (int i = 0; i < files.length; i++) {
+      File inputFile = files[i];
+      File outputFile = new File(outputDirName + "/" + inputFile.getName());
+      ensureParentDirExists(outputFile);
+      transcodeFile(language, fromEncoding, toEncoding, inputFile, outputFile);
+    }
+  }
+
+  /**
+   * Parses one input file with a fresh ArchimedesDocContentHandler bound to outputFile.
+   *
+   * @throws ApplicationException if the file cannot be read or parsed
+   */
+  private void transcodeFile(String language, String fromEncoding, String toEncoding, File inputFile, File outputFile) throws ApplicationException {
+    archimedesDocContentHandler = new ArchimedesDocContentHandler(language, fromEncoding, toEncoding, outputFile);
+    parseWithHandler(inputFile, archimedesDocContentHandler);
+  }
+
+  /**
+   * Processes the *.xml files of the "en", "fr", "it" and "la" sub directories of
+   * ARCH_DOC_DIR with an ArchimedesDocForeignLangContentHandler, mirroring the
+   * directory layout below ARCH_DOC_OUT_DIR.
+   *
+   * @throws ApplicationException if a language directory cannot be listed, an output
+   *         directory cannot be created, or a file cannot be read or parsed
+   */
+  private void transcodeForeignLangFiles() throws ApplicationException {
+    String[] languages = {"en", "fr", "it", "la"};
+    for (int i = 0; i < languages.length; i++) {
+      String language = languages[i];
+      String outputDirName = ARCH_DOC_OUT_DIR + "/" + language;
+      File[] files = listXmlFiles(new File(ARCH_DOC_DIR + "/" + language));
+      for (int j = 0; j < files.length; j++) {
+        File inputFile = files[j];
+        File outputFile = new File(outputDirName + "/" + inputFile.getName());
+        ensureParentDirExists(outputFile);
+        transcodeForeignLangFile(inputFile, outputFile);
+      }
+    }
+  }
+
+  /**
+   * Parses one input file with a fresh ArchimedesDocForeignLangContentHandler bound to outputFile.
+   *
+   * @throws ApplicationException if the file cannot be read or parsed
+   */
+  private void transcodeForeignLangFile(File inputFile, File outputFile) throws ApplicationException {
+    archimedesDocForeignLangContentHandler = new ArchimedesDocForeignLangContentHandler(outputFile);
+    parseWithHandler(inputFile, archimedesDocForeignLangContentHandler);
+  }
+
+  /**
+   * Lists the files of inputDir accepted by FilenameFilterExtension("xml").
+   *
+   * @return the matching files, possibly empty, never null
+   * @throws ApplicationException if the directory cannot be listed
+   */
+  private File[] listXmlFiles(File inputDir) throws ApplicationException {
+    FilenameFilter filter = new FilenameFilterExtension("xml");
+    File[] files = inputDir.listFiles(filter);
+    if (files == null) {
+      // listFiles() signals "not a directory" and I/O errors with null instead of an exception
+      throw new ApplicationException(new IOException("Cannot list input directory: " + inputDir.getAbsolutePath()));
+    }
+    return files;
+  }
+
+  /**
+   * Creates the directory that will contain outputFile, including missing parents.
+   *
+   * @throws ApplicationException if the directory does not exist and cannot be created
+   */
+  private void ensureParentDirExists(File outputFile) throws ApplicationException {
+    File outputDir = outputFile.getParentFile();
+    if (outputDir == null || outputDir.exists()) {
+      return;
+    }
+    // mkdirs() also returns false when the directory appeared in the meantime, hence the second check
+    if (!outputDir.mkdirs() && !outputDir.isDirectory()) {
+      throw new ApplicationException(new IOException("Cannot create output directory: " + outputDir.getAbsolutePath()));
+    }
+  }
+
+  /**
+   * Runs a SAX parse of inputFile, delivering all events to contentHandler.
+   * The input stream is closed on success as well as on failure.
+   *
+   * NOTE(review): SAXParser is imported from the JDK-internal package
+   * com.sun.org.apache.xerces.internal; it is kept here so that parser
+   * defaults seen by the content handlers stay exactly as before.
+   *
+   * @throws ApplicationException wrapping the SAXException or IOException raised while reading or parsing
+   */
+  private void parseWithHandler(File inputFile, org.xml.sax.ContentHandler contentHandler) throws ApplicationException {
+    InputStream inputStream = null;
+    try {
+      XMLReader xmlParser = new SAXParser();
+      xmlParser.setContentHandler(contentHandler);
+      inputStream = new BufferedInputStream(new FileInputStream(inputFile));
+      xmlParser.parse(new InputSource(inputStream));
+    } catch (SAXException e) {
+      throw new ApplicationException(e);
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    } finally {
+      if (inputStream != null) {
+        try {
+          inputStream.close();
+        } catch (IOException ignored) {
+          // the stream was only read from, so a failing close() cannot lose data
+        }
+      }
+    }
+  }
+
+  /**
+   * Hook called by main() after all passes; currently does nothing.
+   */
+  private void end() throws ApplicationException {
+  }
+
+  /** Records the start time of the run. */
+  private void beginOperation() {
+    beginOfOperation = new Date();
+  }
+
+  /** Records the end time of the run. */
+  private void endOperation() {
+    endOfOperation = new Date();
+  }
+
+}
\ No newline at end of file