Mercurial > hg > mpdl-group
annotate software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Transcoder.java @ 12:fba5577e49d9
diverse Fehlerbehebungen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 19 Apr 2011 16:51:26 +0200 |
parents | 2396a569e446 |
children |
rev | line source |
---|---|
0 | 1 package de.mpg.mpiwg.berlin.mpdl.lt.general; |
2 | |
3 import java.io.IOException; | |
4 import java.io.StringReader; | |
5 | |
6 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
7 import edu.unc.epidoc.transcoder.TransCoder; | |
8 | |
9 public class Transcoder { | |
10 private static Transcoder instance; | |
11 private TransCoder betaCodeTranscoder; | |
12 | |
13 public static Transcoder getInstance() { | |
14 if (instance == null) { | |
15 instance = new Transcoder(); | |
16 } | |
17 return instance; | |
18 } | |
19 | |
20 public String transcodeFromBetaCode2UnicodeEpidoc(String inputStr) throws ApplicationException { | |
21 String encodedUnicodeStr = null; | |
22 try { | |
23 if (betaCodeTranscoder == null) { | |
24 betaCodeTranscoder = new TransCoder(); | |
25 betaCodeTranscoder.setParser("BetaCode"); | |
26 betaCodeTranscoder.setConverter("UnicodeC"); | |
27 } | |
28 encodedUnicodeStr = betaCodeTranscoder.getString(inputStr); | |
29 } catch (Exception e) { | |
30 throw new ApplicationException(e); | |
31 } | |
32 return encodedUnicodeStr; | |
33 } | |
34 | |
35 public String transcodeFromBetaCode2Unicode(String inputStr) throws ApplicationException { | |
36 StringReader strReader = new StringReader(inputStr); | |
37 Betacode2UnicodeLex betacode2UnicodeLex = new Betacode2UnicodeLex(strReader); | |
38 String retStr = ""; | |
39 String token = ""; | |
40 while (token != null) { | |
41 try { | |
42 token = betacode2UnicodeLex.yylex(); | |
43 if (token != null) | |
44 retStr += token; | |
45 } catch (IOException e ) { | |
46 throw new ApplicationException(e); | |
47 } | |
48 } | |
12
fba5577e49d9
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
49 // replace "small letter sigma" at the end of a word by the "small letter end sigma" |
fba5577e49d9
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
50 if (retStr != null && retStr.contains("σ")) { |
fba5577e49d9
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
51 retStr = retStr.replaceAll("(.*)σ(\\s)", "$1ς$2"); |
fba5577e49d9
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
52 retStr = retStr.replaceAll("(.*)σ($)", "$1ς$2"); |
fba5577e49d9
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
53 } |
0 | 54 return retStr; |
55 /* | |
56 // alternative to JFlex | |
57 String encodedUnicodeStr = null; | |
58 if (inputStr.matches("^a)")) | |
59 encodedUnicodeStr = inputStr.replaceFirst("^a)", "\u1F00"); | |
60 else if (inputStr.matches("^a(")) | |
61 encodedUnicodeStr = inputStr.replaceFirst("^a(", "\u1F01"); | |
62 else if (inputStr.matches("^a)\\")) | |
63 encodedUnicodeStr = inputStr.replaceFirst("^a)\\", "\u1F02"); | |
64 | |
65 // the longest regular expressions first | |
66 | |
67 return encodedUnicodeStr; | |
68 */ | |
69 } | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
70 |
0 | 71 |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
72 public String transcodeFromUnicode2BetaCode(String inputStr) throws ApplicationException { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
73 StringReader strReader = new StringReader(inputStr); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
74 Unicode2BetacodeLex betacode2UnicodeLex = new Unicode2BetacodeLex(strReader); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
75 String retStr = ""; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
76 String token = ""; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
77 while (token != null) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
78 try { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
79 token = betacode2UnicodeLex.yylex(); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
80 if (token != null) |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
81 retStr += token; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
82 } catch (IOException e ) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
83 throw new ApplicationException(e); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
84 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
85 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
86 return retStr; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
87 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
88 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
89 public String transcodeFromUnicode2Buckwalter(String inputStr) throws ApplicationException { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
90 StringReader strReader = new StringReader(inputStr); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
91 Unicode2BuckwalterLex betacode2UnicodeLex = new Unicode2BuckwalterLex(strReader); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
92 String retStr = ""; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
93 String token = ""; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
94 while (token != null) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
95 try { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
96 token = betacode2UnicodeLex.yylex(); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
97 if (token != null) |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
98 retStr += token; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
99 } catch (IOException e ) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
100 throw new ApplicationException(e); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
101 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
102 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
103 return retStr; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
104 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
105 |
0 | 106 public String transcodeFromBuckwalter2Unicode(String inputStr) throws ApplicationException { |
107 StringReader strReader = new StringReader(inputStr); | |
108 Buckwalter2UnicodeLex buckwalter2UnicodeLex = new Buckwalter2UnicodeLex(strReader); | |
109 String retStr = ""; | |
110 String token = ""; | |
111 while (token != null) { | |
112 try { | |
113 token = buckwalter2UnicodeLex.yylex(); | |
114 if (token != null) | |
115 retStr += token; | |
116 } catch (IOException e ) { | |
117 throw new ApplicationException(e); | |
118 } | |
119 } | |
120 return retStr; | |
121 } | |
122 | |
123 | |
124 | |
125 public String transcodeFromBuckwalter2UnicodeAraMorph(String inputStr) { | |
126 String encodedUnicodeStr = arabizeWord(inputStr); | |
127 return encodedUnicodeStr; | |
128 } | |
129 | |
130 /* | |
131 * copied from http://www.nongnu.org/aramorph/english/download.html | |
132 * Class: AraMorph | |
133 */ | |
134 private String arabizeWord(String translitered) { | |
135 String tmp_word = translitered; | |
136 // convert to transliteration | |
137 tmp_word = tmp_word.replaceAll("'", "\u0621"); //\u0621 : ARABIC LETTER HAMZA | |
138 tmp_word = tmp_word.replaceAll("\\|", "\u0622"); //\u0622 : ARABIC LETTER ALEF WITH MADDA ABOVE | |
139 tmp_word = tmp_word.replaceAll(">", "\u0623"); //\u0623 : ARABIC LETTER ALEF WITH HAMZA ABOVE | |
140 tmp_word = tmp_word.replaceAll("&", "\u0624"); //\u0624 : ARABIC LETTER WAW WITH HAMZA ABOVE | |
141 tmp_word = tmp_word.replaceAll("<", "\u0625"); //\u0625 : ARABIC LETTER ALEF WITH HAMZA BELOW | |
142 tmp_word = tmp_word.replaceAll("}", "\u0626"); //\u0626 : ARABIC LETTER YEH WITH HAMZA ABOVE | |
143 tmp_word = tmp_word.replaceAll("A", "\u0627"); //\u0627 : ARABIC LETTER ALEF | |
144 tmp_word = tmp_word.replaceAll("b", "\u0628"); //\u0628 : ARABIC LETTER BEH | |
145 tmp_word = tmp_word.replaceAll("p", "\u0629"); //\u0629 : ARABIC LETTER TEH MARBUTA | |
146 tmp_word = tmp_word.replaceAll("t", "\u062A"); //\u062A : ARABIC LETTER TEH | |
147 tmp_word = tmp_word.replaceAll("v", "\u062B"); //\u062B : ARABIC LETTER THEH | |
148 tmp_word = tmp_word.replaceAll("j", "\u062C"); //\u062C : ARABIC LETTER JEEM | |
149 tmp_word = tmp_word.replaceAll("H", "\u062D"); //\u062D : ARABIC LETTER HAH | |
150 tmp_word = tmp_word.replaceAll("x", "\u062E"); //\u062E : ARABIC LETTER KHAH | |
151 tmp_word = tmp_word.replaceAll("d", "\u062F"); //\u062F : ARABIC LETTER DAL | |
152 tmp_word = tmp_word.replaceAll("\\*", "\u0630"); //\u0630 : ARABIC LETTER THAL | |
153 tmp_word = tmp_word.replaceAll("r", "\u0631"); //\u0631 : ARABIC LETTER REH | |
154 tmp_word = tmp_word.replaceAll("z", "\u0632"); //\u0632 : ARABIC LETTER ZAIN | |
155 tmp_word = tmp_word.replaceAll("s", "\u0633" ); //\u0633 : ARABIC LETTER SEEN | |
156 tmp_word = tmp_word.replaceAll("\\$", "\u0634"); //\u0634 : ARABIC LETTER SHEEN | |
157 tmp_word = tmp_word.replaceAll("S", "\u0635"); //\u0635 : ARABIC LETTER SAD | |
158 tmp_word = tmp_word.replaceAll("D", "\u0636"); //\u0636 : ARABIC LETTER DAD | |
159 tmp_word = tmp_word.replaceAll("T", "\u0637"); //\u0637 : ARABIC LETTER TAH | |
160 tmp_word = tmp_word.replaceAll("Z", "\u0638"); //\u0638 : ARABIC LETTER ZAH | |
161 tmp_word = tmp_word.replaceAll("E", "\u0639"); //\u0639 : ARABIC LETTER AIN | |
162 tmp_word = tmp_word.replaceAll("g", "\u063A"); //\u063A : ARABIC LETTER GHAIN | |
163 tmp_word = tmp_word.replaceAll("_", "\u0640"); //\u0640 : ARABIC TATWEEL | |
164 tmp_word = tmp_word.replaceAll("f", "\u0641"); //\u0641 : ARABIC LETTER FEH | |
165 tmp_word = tmp_word.replaceAll("q", "\u0642"); //\u0642 : ARABIC LETTER QAF | |
166 tmp_word = tmp_word.replaceAll("k", "\u0643"); //\u0643 : ARABIC LETTER KAF | |
167 tmp_word = tmp_word.replaceAll("l", "\u0644"); //\u0644 : ARABIC LETTER LAM | |
168 tmp_word = tmp_word.replaceAll("m", "\u0645"); //\u0645 : ARABIC LETTER MEEM | |
169 tmp_word = tmp_word.replaceAll("n", "\u0646"); //\u0646 : ARABIC LETTER NOON | |
170 tmp_word = tmp_word.replaceAll("h", "\u0647"); //\u0647 : ARABIC LETTER HEH | |
171 tmp_word = tmp_word.replaceAll("w", "\u0648"); //\u0648 : ARABIC LETTER WAW | |
172 tmp_word = tmp_word.replaceAll("Y", "\u0649"); //\u0649 : ARABIC LETTER ALEF MAKSURA | |
173 tmp_word = tmp_word.replaceAll("y", "\u064A"); //\u064A : ARABIC LETTER YEH | |
174 tmp_word = tmp_word.replaceAll("F", "\u064B"); //\u064B : ARABIC FATHATAN | |
175 tmp_word = tmp_word.replaceAll("N", "\u064C"); //\u064C : ARABIC DAMMATAN | |
176 tmp_word = tmp_word.replaceAll("K", "\u064D"); //\u064D : ARABIC KASRATAN | |
177 tmp_word = tmp_word.replaceAll("a", "\u064E"); //\u064E : ARABIC FATHA | |
178 tmp_word = tmp_word.replaceAll("u", "\u064F"); //\u064F : ARABIC DAMMA | |
179 tmp_word = tmp_word.replaceAll("i", "\u0650"); //\u0650 : ARABIC KASRA | |
180 tmp_word = tmp_word.replaceAll("~", "\u0651"); //\u0651 : ARABIC SHADDA | |
181 tmp_word = tmp_word.replaceAll("o", "\u0652"); //\u0652 : ARABIC SUKUN | |
182 tmp_word = tmp_word.replaceAll("`", "\u0670"); //\u0670 : ARABIC LETTER SUPERSCRIPT ALEF | |
183 tmp_word = tmp_word.replaceAll("\\{", "\u0671"); //\u0671 : ARABIC LETTER ALEF WASLA | |
184 tmp_word = tmp_word.replaceAll("P", "\u067E"); //\u067E : ARABIC LETTER PEH | |
185 tmp_word = tmp_word.replaceAll("J", "\u0686"); //\u0686 : ARABIC LETTER TCHEH | |
186 tmp_word = tmp_word.replaceAll("V", "\u06A4"); //\u06A4 : ARABIC LETTER VEH | |
187 tmp_word = tmp_word.replaceAll("G", "\u06AF"); //\u06AF : ARABIC LETTER GAF | |
188 tmp_word = tmp_word.replaceAll("R", "\u0698"); //\u0698 : ARABIC LETTER JEH (no more in Buckwalter system) | |
189 //Not in Buckwalter system \u0679 : ARABIC LETTER TTEH | |
190 //Not in Buckwalter system \u0688 : ARABIC LETTER DDAL | |
191 //Not in Buckwalter system \u06A9 : ARABIC LETTER KEHEH | |
192 //Not in Buckwalter system \u0691 : ARABIC LETTER RREH | |
193 //Not in Buckwalter system \u06BA : ARABIC LETTER NOON GHUNNA | |
194 //Not in Buckwalter system \u06BE : ARABIC LETTER HEH DOACHASHMEE | |
195 //Not in Buckwalter system \u06C1 : ARABIC LETTER HEH GOAL | |
196 //Not in Buckwalter system \u06D2 : ARABIC LETTER YEH BARREE | |
197 tmp_word = tmp_word.replaceAll(",", "\u060C" ); //\u060C : ARABIC COMMA | |
198 tmp_word = tmp_word.replaceAll(";", "\u061B"); //\u061B : ARABIC SEMICOLON | |
199 tmp_word = tmp_word.replaceAll("\\?", "\u061F"); //\u061F : ARABIC QUESTION MARK | |
200 return tmp_word; | |
201 } | |
202 | |
203 } |