view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/ArchimedesDocManager.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.doc;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import com.sun.org.apache.xerces.internal.parsers.SAXParser;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
import de.mpg.mpiwg.berlin.mpdl.util.Util;
import de.mpg.mpiwg.berlin.mpdl.util.FilenameFilterExtension;

/**
 * Command-line batch tool that runs the Archimedes XML documents through SAX
 * content handlers, one input file at a time.
 *
 * <p>Input files are read from language sub-directories of
 * {@code MPDL_DOC_DIR/documents/archimedes}; for each input file an output file of
 * the same name is set up in the matching sub-directory of
 * {@code MPDL_DOC_DIR/documentsTranscodedToUnicode/archimedes}. The output file is
 * handed to the content handler; the handler is presumably what writes it
 * (its implementation is not part of this class).
 *
 * <p>The lazily created singleton is not synchronized; the class is used from
 * {@link #main(String[])} only within this file.
 *
 * <p>NOTE(review): the parser is the JDK-internal Xerces {@code SAXParser}
 * (package {@code com.sun.org.apache.xerces.internal}); consider switching to
 * {@code javax.xml.parsers.SAXParserFactory} when the imports are revised.
 */
public class ArchimedesDocManager {
  private static ArchimedesDocManager instance;
  private static final String MPDL_DOC_DIR = MpdlConstants.MPDL_DOC_DIR;
  private static final String ARCH_DOC_DIR = MPDL_DOC_DIR + "/documents/archimedes";
  private static final String ARCH_DOC_OUT_DIR = MPDL_DOC_DIR + "/documentsTranscodedToUnicode/archimedes";
  // Handler of the file currently (or most recently) processed by transcodeFile.
  private ArchimedesDocContentHandler archimedesDocContentHandler;
  // Handler of the file currently (or most recently) processed by transcodeForeignLangFile.
  private ArchimedesDocForeignLangContentHandler archimedesDocForeignLangContentHandler;
  private Date beginOfOperation;
  private Date endOfOperation;

  /**
   * Returns the shared manager, creating it on first use.
   *
   * @return the singleton instance
   * @throws ApplicationException declared for callers; not thrown by the code visible here
   */
  public static ArchimedesDocManager getInstance() throws ApplicationException {
    if (instance == null) {
      instance = new ArchimedesDocManager();
    }
    return instance;
  }

  /**
   * Runs the foreign-language transcoding over the en/fr/it/la directories and
   * prints the elapsed time in seconds.
   *
   * @param args ignored
   * @throws ApplicationException if a directory cannot be listed or created, or a file cannot be parsed
   */
  public static void main(String[] args) throws ApplicationException {
    ArchimedesDocManager manager = getInstance();
    manager.beginOperation();
    System.out.print("Start ...");

    // Currently disabled whole-document transcoding steps (Greek and Arabic):
    // manager.transcodeDirectory("el", "betacode", "unicode", ARCH_DOC_DIR + "/el", ARCH_DOC_OUT_DIR + "/el");
    // manager.transcodeDirectory("ar", "buckwalter", "unicode", ARCH_DOC_DIR + "/ar", ARCH_DOC_OUT_DIR + "/ar");

    // Foreign lang=greek transcoding
    manager.transcodeForeignLangFiles();

    manager.end();
    manager.endOperation();
    Double elapsedTime = new Util().getSecondWithMillisecondsBetween(manager.beginOfOperation, manager.endOfOperation);
    System.out.println("End.");
    System.out.println("Needed time: " + elapsedTime + " seconds");
  }

  /**
   * Processes every {@code .xml} file directly inside {@code inputDirName} with
   * {@link #transcodeFile}, targeting a same-named file in {@code outputDirName}.
   *
   * @param language language code passed through to the content handler
   * @param fromEncoding source encoding name passed through to the content handler
   * @param toEncoding target encoding name passed through to the content handler
   * @param inputDirName directory containing the input XML files
   * @param outputDirName directory for the output files; created if missing
   * @throws ApplicationException if the input directory cannot be listed, the output
   *         directory cannot be created, or a file cannot be parsed
   */
  private void transcodeDirectory(String language, String fromEncoding, String toEncoding, String inputDirName, String outputDirName) throws ApplicationException {
    File[] inputFiles = listXmlFiles(new File(inputDirName));
    for (int i = 0; i < inputFiles.length; i++) {
      File inputFile = inputFiles[i];
      File outputFile = prepareOutputFile(outputDirName, inputFile);
      transcodeFile(language, fromEncoding, toEncoding, inputFile, outputFile);
    }
  }

  /**
   * Parses one input file with a fresh {@link ArchimedesDocContentHandler}.
   *
   * @param language language code passed to the content handler
   * @param fromEncoding source encoding name passed to the content handler
   * @param toEncoding target encoding name passed to the content handler
   * @param inputFile XML file to read
   * @param outputFile file handed to the content handler
   * @throws ApplicationException wrapping any SAX or I/O failure
   */
  private void transcodeFile(String language, String fromEncoding, String toEncoding, File inputFile, File outputFile) throws ApplicationException {
    archimedesDocContentHandler = new ArchimedesDocContentHandler(language, fromEncoding, toEncoding, outputFile);
    parseFile(inputFile, archimedesDocContentHandler);
  }

  /**
   * Processes every {@code .xml} file of the {@code en}, {@code fr}, {@code it}
   * and {@code la} sub-directories with {@link #transcodeForeignLangFile}.
   *
   * @throws ApplicationException if an input directory cannot be listed, an output
   *         directory cannot be created, or a file cannot be parsed
   */
  private void transcodeForeignLangFiles() throws ApplicationException {
    String[] languages = {"en", "fr", "it", "la"};
    for (int i = 0; i < languages.length; i++) {
      String language = languages[i];
      String outputDirName = ARCH_DOC_OUT_DIR + "/" + language;
      File[] inputFiles = listXmlFiles(new File(ARCH_DOC_DIR + "/" + language));
      for (int j = 0; j < inputFiles.length; j++) {
        File inputFile = inputFiles[j];
        File outputFile = prepareOutputFile(outputDirName, inputFile);
        transcodeForeignLangFile(inputFile, outputFile);
      }
    }
  }

  /**
   * Parses one input file with a fresh {@link ArchimedesDocForeignLangContentHandler}.
   *
   * @param inputFile XML file to read
   * @param outputFile file handed to the content handler
   * @throws ApplicationException wrapping any SAX or I/O failure
   */
  private void transcodeForeignLangFile(File inputFile, File outputFile) throws ApplicationException {
    archimedesDocForeignLangContentHandler = new ArchimedesDocForeignLangContentHandler(outputFile);
    parseFile(inputFile, archimedesDocForeignLangContentHandler);
  }

  /**
   * Lists the files with extension {@code xml} directly inside the given directory.
   *
   * @param inputDir directory to list
   * @return the matching files; never {@code null}, possibly empty
   * @throws ApplicationException if the directory does not exist or cannot be read
   */
  private File[] listXmlFiles(File inputDir) throws ApplicationException {
    FilenameFilter filter = new FilenameFilterExtension("xml");
    File[] files = inputDir.listFiles(filter);
    if (files == null) {
      // listFiles signals "not a directory" or an I/O error by returning null.
      throw new ApplicationException(new IOException("Cannot list input directory: " + inputDir.getAbsolutePath()));
    }
    return files;
  }

  /**
   * Builds the output file for an input file and makes sure its directory exists.
   *
   * @param outputDirName directory the output file belongs to
   * @param inputFile input file whose name is reused for the output file
   * @return the output file (not created by this method)
   * @throws ApplicationException if the output directory cannot be created
   */
  private File prepareOutputFile(String outputDirName, File inputFile) throws ApplicationException {
    File outputFile = new File(outputDirName + "/" + inputFile.getName());
    File outputDir = outputFile.getParentFile();
    if (outputDir != null && !outputDir.exists()) {
      // Create the directory including parent directories which do not exist;
      // re-check isDirectory() in case it was created concurrently.
      if (!outputDir.mkdirs() && !outputDir.isDirectory()) {
        throw new ApplicationException(new IOException("Cannot create output directory: " + outputDir.getAbsolutePath()));
      }
    }
    return outputFile;
  }

  /**
   * Feeds the given file through a SAX parser that reports to the given handler.
   * The input stream is closed even when parsing fails.
   *
   * @param inputFile XML file to read
   * @param contentHandler receiver of the SAX events
   * @throws ApplicationException wrapping any SAX or I/O failure
   */
  private void parseFile(File inputFile, org.xml.sax.ContentHandler contentHandler) throws ApplicationException {
    try {
      XMLReader xmlParser = new SAXParser();
      xmlParser.setContentHandler(contentHandler);
      InputStream inputStream = new BufferedInputStream(new FileInputStream(inputFile));
      try {
        xmlParser.parse(new InputSource(inputStream));
      } finally {
        // Closing the buffered stream also closes the underlying file stream.
        inputStream.close();
      }
    } catch (SAXException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Hook called by {@link #main(String[])} after transcoding; currently does nothing.
   *
   * @throws ApplicationException declared for future use; not thrown at present
   */
  private void end() throws ApplicationException {
  }

  /** Records the current time as the start of the operation. */
  private void beginOperation() {
    beginOfOperation = new Date();
  }

  /** Records the current time as the end of the operation. */
  private void endOperation() {
    endOfOperation = new Date();
  }

}