0
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
1 package de.mpg.mpiwg.berlin.mpdl.lt.doc;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
2
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
3 import java.io.BufferedInputStream;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
4 import java.io.File;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
5 import java.io.FileInputStream;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
6 import java.io.FilenameFilter;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
7 import java.io.IOException;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
8 import java.io.InputStream;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
9 import java.util.Date;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
10
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
11 import org.xml.sax.InputSource;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
12 import org.xml.sax.SAXException;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
13 import org.xml.sax.XMLReader;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
14
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
15 import com.sun.org.apache.xerces.internal.parsers.SAXParser;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
16
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
17 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
18 import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
19 import de.mpg.mpiwg.berlin.mpdl.util.Util;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
20 import de.mpg.mpiwg.berlin.mpdl.util.FilenameFilterExtension;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
21
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
22 public class ArchimedesDocManager {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
23 private static ArchimedesDocManager instance;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
24 private static String MPDL_DOC_DIR = MpdlConstants.MPDL_DOC_DIR;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
25 private static String ARCH_DOC_DIR = MPDL_DOC_DIR + "/documents/archimedes";
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
26 private static String ARCH_DOC_OUT_DIR = MPDL_DOC_DIR + "/documentsTranscodedToUnicode/archimedes";
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
27 private ArchimedesDocContentHandler archimedesDocContentHandler;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
28 private ArchimedesDocForeignLangContentHandler archimedesDocForeignLangContentHandler;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
29 private Date beginOfOperation;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
30 private Date endOfOperation;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
31
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
32 public static ArchimedesDocManager getInstance() throws ApplicationException {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
33 if (instance == null) {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
34 instance = new ArchimedesDocManager();
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
35 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
36 return instance;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
37 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
38
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
39 /**
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
40 *
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
41 */
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
42 public static void main(String[] args) throws ApplicationException {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
43 getInstance();
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
44 instance.beginOperation();
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
45 System.out.print("Start ...");
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
46 // Greek
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
47 String inputDirGreek = ARCH_DOC_DIR + "/el";
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
48 String outputDirGreek = ARCH_DOC_OUT_DIR + "/el";
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
49 // instance.transcodeDirectory("el", "betacode", "unicode", inputDirGreek, outputDirGreek);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
50 // Arabic
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
51 String inputDirArabic = ARCH_DOC_DIR + "/ar";
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
52 String outputDirArabic = ARCH_DOC_OUT_DIR + "/ar";
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
53 // instance.transcodeDirectory("ar", "buckwalter", "unicode", inputDirArabic, outputDirArabic);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
54
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
55 // Foreign lang=greek transcoding
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
56 instance.transcodeForeignLangFiles();
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
57
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
58 instance.end();
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
59 instance.endOperation();
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
60 Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
61 System.out.println("End.");
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
62 System.out.println("Needed time: " + elapsedTime + " seconds");
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
63 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
64
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
65 private void transcodeDirectory(String language, String fromEncoding, String toEncoding, String inputDirName, String outputDirName) throws ApplicationException {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
66 File inputDir = new File(inputDirName);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
67 FilenameFilter filter = new FilenameFilterExtension("xml");
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
68 File[] files = inputDir.listFiles(filter);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
69 for (int i=0; i < files.length; i++) {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
70 File inputFile = files[i];
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
71 String outputFileName = inputFile.getName();
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
72 File outputFile = new File(outputDirName + "/" + outputFileName);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
73 File outputDir = new File(outputFile.getParent());
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
74 if (! outputDir.exists()) {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
75 outputDir.mkdirs(); // create the directory including parent directories which do not exist
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
76 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
77 transcodeFile(language, fromEncoding, toEncoding, inputFile, outputFile);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
78 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
79 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
80
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
81 private void transcodeFile(String language, String fromEncoding, String toEncoding, File inputFile, File outputFile) throws ApplicationException {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
82 archimedesDocContentHandler = new ArchimedesDocContentHandler(language, fromEncoding, toEncoding, outputFile);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
83 try {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
84 XMLReader xmlParser = new SAXParser();
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
85 xmlParser.setContentHandler(archimedesDocContentHandler);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
86 InputStream inputStream = new FileInputStream(inputFile);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
87 BufferedInputStream bufferedInputStream = new BufferedInputStream(inputStream);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
88 InputSource input = new InputSource(bufferedInputStream);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
89 xmlParser.parse(input);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
90 bufferedInputStream.close();
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
91 } catch (SAXException e) {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
92 throw new ApplicationException(e);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
93 } catch (IOException e) {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
94 throw new ApplicationException(e);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
95 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
96 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
97
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
98 private void transcodeForeignLangFiles() throws ApplicationException {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
99 String[] languages = {"en", "fr", "it", "la"};
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
100 for (int i=0; i<languages.length; i++) {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
101 String language = languages[i];
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
102 File inputDir = new File(ARCH_DOC_DIR + "/" + language);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
103 String outputDirName = ARCH_DOC_OUT_DIR + "/" + language;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
104 FilenameFilter filter = new FilenameFilterExtension("xml");
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
105 File[] files = inputDir.listFiles(filter);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
106 for (int j=0; j < files.length; j++) {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
107 File inputFile = files[j];
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
108 String outputFileName = inputFile.getName();
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
109 File outputFile = new File(outputDirName + "/" + outputFileName);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
110 File outputDir = new File(outputFile.getParent());
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
111 if (! outputDir.exists()) {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
112 outputDir.mkdirs(); // create the directory including parent directories which do not exist
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
113 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
114 transcodeForeignLangFile(inputFile, outputFile);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
115 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
116 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
117 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
118
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
119 private void transcodeForeignLangFile(File inputFile, File outputFile) throws ApplicationException {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
120 archimedesDocForeignLangContentHandler = new ArchimedesDocForeignLangContentHandler(outputFile);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
121 try {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
122 XMLReader xmlParser = new SAXParser();
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
123 xmlParser.setContentHandler(archimedesDocForeignLangContentHandler);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
124 InputStream inputStream = new FileInputStream(inputFile);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
125 BufferedInputStream bufferedInputStream = new BufferedInputStream(inputStream);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
126 InputSource input = new InputSource(bufferedInputStream);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
127 xmlParser.parse(input);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
128 bufferedInputStream.close();
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
129 } catch (SAXException e) {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
130 throw new ApplicationException(e);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
131 } catch (IOException e) {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
132 throw new ApplicationException(e);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
133 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
134 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
135
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
136 private void end() throws ApplicationException {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
137 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
138
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
139 private void beginOperation() {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
140 beginOfOperation = new Date();
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
141 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
142
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
143 private void endOperation() {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
144 endOfOperation = new Date();
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
145 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
146
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
147 } |