Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/ArchimedesDocManager.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
line wrap: on
line diff
package de.mpg.mpiwg.berlin.mpdl.lt.doc;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;

import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import com.sun.org.apache.xerces.internal.parsers.SAXParser;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
import de.mpg.mpiwg.berlin.mpdl.util.Util;
import de.mpg.mpiwg.berlin.mpdl.util.FilenameFilterExtension;

/**
 * Command line driver which runs the XML files below the "documents/archimedes"
 * directory through a SAX content handler and writes the results to the
 * parallel "documentsTranscodedToUnicode/archimedes" directory.
 *
 * The actual rewriting of the documents is done by the content handlers
 * ({@link ArchimedesDocContentHandler} and
 * {@link ArchimedesDocForeignLangContentHandler}); this class only walks the
 * directories, prepares the output directories and feeds each file to a parser.
 */
public class ArchimedesDocManager {
  private static ArchimedesDocManager instance;
  private static final String MPDL_DOC_DIR = MpdlConstants.MPDL_DOC_DIR;
  private static final String ARCH_DOC_DIR = MPDL_DOC_DIR + "/documents/archimedes";
  private static final String ARCH_DOC_OUT_DIR = MPDL_DOC_DIR + "/documentsTranscodedToUnicode/archimedes";
  private ArchimedesDocContentHandler archimedesDocContentHandler;
  private ArchimedesDocForeignLangContentHandler archimedesDocForeignLangContentHandler;
  private Date beginOfOperation;
  private Date endOfOperation;

  /**
   * Returns the shared manager, creating it on first use.
   * The lazy creation is not synchronized.
   *
   * @return the single instance of this class
   * @throws ApplicationException declared for callers; not thrown by the code visible here
   */
  public static ArchimedesDocManager getInstance() throws ApplicationException {
    if (instance == null) {
      instance = new ArchimedesDocManager();
    }
    return instance;
  }

  /**
   * Runs the foreign language pass over all configured language directories
   * and prints the elapsed time.
   *
   * @param args not used
   * @throws ApplicationException if a directory cannot be read or created, or a file cannot be parsed
   */
  public static void main(String[] args) throws ApplicationException {
    ArchimedesDocManager manager = getInstance();
    manager.beginOperation();
    System.out.print("Start ...");
    // Greek (whole directory pass, currently disabled)
    String inputDirGreek = ARCH_DOC_DIR + "/el";
    String outputDirGreek = ARCH_DOC_OUT_DIR + "/el";
    // manager.transcodeDirectory("el", "betacode", "unicode", inputDirGreek, outputDirGreek);
    // Arabic (whole directory pass, currently disabled)
    String inputDirArabic = ARCH_DOC_DIR + "/ar";
    String outputDirArabic = ARCH_DOC_OUT_DIR + "/ar";
    // manager.transcodeDirectory("ar", "buckwalter", "unicode", inputDirArabic, outputDirArabic);

    // Foreign lang=greek transcoding
    manager.transcodeForeignLangFiles();

    manager.end();
    manager.endOperation();
    Double elapsedTime = new Util().getSecondWithMillisecondsBetween(manager.beginOfOperation, manager.endOfOperation);
    System.out.println("End.");
    System.out.println("Needed time: " + elapsedTime + " seconds");
  }

  /**
   * Feeds every "xml" file directly inside the input directory to an
   * {@link ArchimedesDocContentHandler} and writes the result to a file of the
   * same name in the output directory.
   *
   * @param language language code handed to the content handler (e.g. "el")
   * @param fromEncoding source encoding name handed to the content handler (e.g. "betacode")
   * @param toEncoding target encoding name handed to the content handler (e.g. "unicode")
   * @param inputDirName directory containing the input files
   * @param outputDirName directory receiving the output files; created if missing
   * @throws ApplicationException if the input directory cannot be listed, the output
   *         directory cannot be created, or a file cannot be read or parsed
   */
  private void transcodeDirectory(String language, String fromEncoding, String toEncoding, String inputDirName, String outputDirName) throws ApplicationException {
    File[] files = listXmlFiles(new File(inputDirName));
    for (int i = 0; i < files.length; i++) {
      File inputFile = files[i];
      File outputFile = new File(outputDirName + "/" + inputFile.getName());
      ensureParentDirExists(outputFile);
      transcodeFile(language, fromEncoding, toEncoding, inputFile, outputFile);
    }
  }

  /**
   * Parses one input file with a fresh {@link ArchimedesDocContentHandler}
   * that is configured with the given language, encodings and output file.
   *
   * @param language language code handed to the content handler
   * @param fromEncoding source encoding name handed to the content handler
   * @param toEncoding target encoding name handed to the content handler
   * @param inputFile XML file to read
   * @param outputFile file handed to the content handler as its output
   * @throws ApplicationException if the file cannot be read or parsed
   */
  private void transcodeFile(String language, String fromEncoding, String toEncoding, File inputFile, File outputFile) throws ApplicationException {
    archimedesDocContentHandler = new ArchimedesDocContentHandler(language, fromEncoding, toEncoding, outputFile);
    parse(inputFile, archimedesDocContentHandler);
  }

  /**
   * Runs the foreign language pass over the "en", "fr", "it" and "la"
   * subdirectories of the archimedes document directory. Every "xml" file is
   * handed to {@link #transcodeForeignLangFile(File, File)} together with an
   * output file of the same name below the matching output subdirectory.
   *
   * @throws ApplicationException if an input directory cannot be listed, an output
   *         directory cannot be created, or a file cannot be read or parsed
   */
  private void transcodeForeignLangFiles() throws ApplicationException {
    String[] languages = {"en", "fr", "it", "la"};
    for (int i = 0; i < languages.length; i++) {
      String language = languages[i];
      String outputDirName = ARCH_DOC_OUT_DIR + "/" + language;
      File[] files = listXmlFiles(new File(ARCH_DOC_DIR + "/" + language));
      for (int j = 0; j < files.length; j++) {
        File inputFile = files[j];
        File outputFile = new File(outputDirName + "/" + inputFile.getName());
        ensureParentDirExists(outputFile);
        transcodeForeignLangFile(inputFile, outputFile);
      }
    }
  }

  /**
   * Parses one input file with a fresh
   * {@link ArchimedesDocForeignLangContentHandler} bound to the output file.
   *
   * @param inputFile XML file to read
   * @param outputFile file handed to the content handler as its output
   * @throws ApplicationException if the file cannot be read or parsed
   */
  private void transcodeForeignLangFile(File inputFile, File outputFile) throws ApplicationException {
    archimedesDocForeignLangContentHandler = new ArchimedesDocForeignLangContentHandler(outputFile);
    parse(inputFile, archimedesDocForeignLangContentHandler);
  }

  /**
   * Lists the files with extension "xml" directly inside the given directory.
   *
   * @param inputDir directory to list
   * @return the matching files; possibly empty, never null
   * @throws ApplicationException if the directory cannot be listed
   */
  private static File[] listXmlFiles(File inputDir) throws ApplicationException {
    FilenameFilter filter = new FilenameFilterExtension("xml");
    File[] files = inputDir.listFiles(filter);
    if (files == null) {
      // listFiles signals "not a directory" and I/O errors by returning null
      throw new ApplicationException(new IOException("Cannot list input directory: " + inputDir.getAbsolutePath()));
    }
    return files;
  }

  /**
   * Creates the directory which will contain the given file, including
   * missing parent directories, if it does not exist yet.
   *
   * @param outputFile file whose parent directory is required
   * @throws ApplicationException if the directory is missing and cannot be created
   */
  private static void ensureParentDirExists(File outputFile) throws ApplicationException {
    File outputDir = outputFile.getParentFile();
    if (outputDir == null || outputDir.exists()) {
      return;
    }
    // re-check after a failed mkdirs: the directory may have been created in the meantime
    if (! outputDir.mkdirs() && ! outputDir.isDirectory()) {
      throw new ApplicationException(new IOException("Cannot create output directory: " + outputDir.getAbsolutePath()));
    }
  }

  /**
   * Parses the input file with a new SAX parser which reports its events to
   * the given content handler. The input stream is closed in all cases.
   *
   * @param inputFile XML file to read
   * @param contentHandler receiver of the SAX events
   * @throws ApplicationException wrapping the SAXException or IOException raised while reading or parsing
   */
  private void parse(File inputFile, ContentHandler contentHandler) throws ApplicationException {
    try {
      // NOTE(review): SAXParser is a JDK internal class; consider obtaining the XMLReader via JAXP
      XMLReader xmlParser = new SAXParser();
      xmlParser.setContentHandler(contentHandler);
      InputStream inputStream = new BufferedInputStream(new FileInputStream(inputFile));
      try {
        xmlParser.parse(new InputSource(inputStream));
      } finally {
        inputStream.close();
      }
    } catch (SAXException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Hook called by {@link #main(String[])} after the last file has been
   * processed; currently does nothing.
   *
   * @throws ApplicationException declared for callers; not thrown by the current empty body
   */
  private void end() throws ApplicationException {
  }

  /**
   * Records the current time as the start of the run.
   */
  private void beginOperation() {
    beginOfOperation = new Date();
  }

  /**
   * Records the current time as the end of the run.
   */
  private void endOperation() {
    endOfOperation = new Date();
  }

}