annotate software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Transcoder.java @ 12:fba5577e49d9

diverse Fehlerbehebungen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 19 Apr 2011 16:51:26 +0200
parents 2396a569e446
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
1 package de.mpg.mpiwg.berlin.mpdl.lt.general;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
2
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
3 import java.io.IOException;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
4 import java.io.StringReader;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
5
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
6 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
7 import edu.unc.epidoc.transcoder.TransCoder;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
8
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
9 public class Transcoder {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
10 private static Transcoder instance;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
11 private TransCoder betaCodeTranscoder;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
12
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
13 public static Transcoder getInstance() {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
14 if (instance == null) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
15 instance = new Transcoder();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
16 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
17 return instance;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
18 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
19
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
20 public String transcodeFromBetaCode2UnicodeEpidoc(String inputStr) throws ApplicationException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
21 String encodedUnicodeStr = null;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
22 try {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
23 if (betaCodeTranscoder == null) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
24 betaCodeTranscoder = new TransCoder();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
25 betaCodeTranscoder.setParser("BetaCode");
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
26 betaCodeTranscoder.setConverter("UnicodeC");
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
27 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
28 encodedUnicodeStr = betaCodeTranscoder.getString(inputStr);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
29 } catch (Exception e) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
30 throw new ApplicationException(e);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
31 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
32 return encodedUnicodeStr;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
33 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
34
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
35 public String transcodeFromBetaCode2Unicode(String inputStr) throws ApplicationException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
36 StringReader strReader = new StringReader(inputStr);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
37 Betacode2UnicodeLex betacode2UnicodeLex = new Betacode2UnicodeLex(strReader);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
38 String retStr = "";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
39 String token = "";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
40 while (token != null) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
41 try {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
42 token = betacode2UnicodeLex.yylex();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
43 if (token != null)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
44 retStr += token;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
45 } catch (IOException e ) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
46 throw new ApplicationException(e);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
47 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
48 }
12
fba5577e49d9 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
49 // replace "small letter sigma" at the end of a word by the "small letter end sigma"
fba5577e49d9 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
50 if (retStr != null && retStr.contains("σ")) {
fba5577e49d9 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
51 retStr = retStr.replaceAll("(.*)σ(\\s)", "$1ς$2");
fba5577e49d9 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
52 retStr = retStr.replaceAll("(.*)σ($)", "$1ς$2");
fba5577e49d9 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
53 }
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
54 return retStr;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
55 /*
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
56 // alternative to JFlex
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
57 String encodedUnicodeStr = null;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
58 if (inputStr.matches("^a)"))
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
59 encodedUnicodeStr = inputStr.replaceFirst("^a)", "\u1F00");
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
60 else if (inputStr.matches("^a("))
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
61 encodedUnicodeStr = inputStr.replaceFirst("^a(", "\u1F01");
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
62 else if (inputStr.matches("^a)\\"))
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
63 encodedUnicodeStr = inputStr.replaceFirst("^a)\\", "\u1F02");
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
64
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
65 // the longest regular expressions first
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
66
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
67 return encodedUnicodeStr;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
68 */
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
69 }
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
70
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
71
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
72 public String transcodeFromUnicode2BetaCode(String inputStr) throws ApplicationException {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
73 StringReader strReader = new StringReader(inputStr);
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
74 Unicode2BetacodeLex betacode2UnicodeLex = new Unicode2BetacodeLex(strReader);
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
75 String retStr = "";
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
76 String token = "";
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
77 while (token != null) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
78 try {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
79 token = betacode2UnicodeLex.yylex();
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
80 if (token != null)
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
81 retStr += token;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
82 } catch (IOException e ) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
83 throw new ApplicationException(e);
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
84 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
85 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
86 return retStr;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
87 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
88
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
89 public String transcodeFromUnicode2Buckwalter(String inputStr) throws ApplicationException {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
90 StringReader strReader = new StringReader(inputStr);
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
91 Unicode2BuckwalterLex betacode2UnicodeLex = new Unicode2BuckwalterLex(strReader);
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
92 String retStr = "";
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
93 String token = "";
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
94 while (token != null) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
95 try {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
96 token = betacode2UnicodeLex.yylex();
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
97 if (token != null)
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
98 retStr += token;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
99 } catch (IOException e ) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
100 throw new ApplicationException(e);
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
101 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
102 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
103 return retStr;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
104 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
105
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
106 public String transcodeFromBuckwalter2Unicode(String inputStr) throws ApplicationException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
107 StringReader strReader = new StringReader(inputStr);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
108 Buckwalter2UnicodeLex buckwalter2UnicodeLex = new Buckwalter2UnicodeLex(strReader);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
109 String retStr = "";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
110 String token = "";
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
111 while (token != null) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
112 try {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
113 token = buckwalter2UnicodeLex.yylex();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
114 if (token != null)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
115 retStr += token;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
116 } catch (IOException e ) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
117 throw new ApplicationException(e);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
118 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
119 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
120 return retStr;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
121 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
122
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
123
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
124
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
125 public String transcodeFromBuckwalter2UnicodeAraMorph(String inputStr) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
126 String encodedUnicodeStr = arabizeWord(inputStr);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
127 return encodedUnicodeStr;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
128 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
129
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
130 /*
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
131 * copied from http://www.nongnu.org/aramorph/english/download.html
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
132 * Class: AraMorph
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
133 */
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
134 private String arabizeWord(String translitered) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
135 String tmp_word = translitered;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
136 // convert to transliteration
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
137 tmp_word = tmp_word.replaceAll("'", "\u0621"); //\u0621 : ARABIC LETTER HAMZA
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
138 tmp_word = tmp_word.replaceAll("\\|", "\u0622"); //\u0622 : ARABIC LETTER ALEF WITH MADDA ABOVE
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
139 tmp_word = tmp_word.replaceAll(">", "\u0623"); //\u0623 : ARABIC LETTER ALEF WITH HAMZA ABOVE
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
140 tmp_word = tmp_word.replaceAll("&", "\u0624"); //\u0624 : ARABIC LETTER WAW WITH HAMZA ABOVE
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
141 tmp_word = tmp_word.replaceAll("<", "\u0625"); //\u0625 : ARABIC LETTER ALEF WITH HAMZA BELOW
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
142 tmp_word = tmp_word.replaceAll("}", "\u0626"); //\u0626 : ARABIC LETTER YEH WITH HAMZA ABOVE
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
143 tmp_word = tmp_word.replaceAll("A", "\u0627"); //\u0627 : ARABIC LETTER ALEF
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
144 tmp_word = tmp_word.replaceAll("b", "\u0628"); //\u0628 : ARABIC LETTER BEH
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
145 tmp_word = tmp_word.replaceAll("p", "\u0629"); //\u0629 : ARABIC LETTER TEH MARBUTA
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
146 tmp_word = tmp_word.replaceAll("t", "\u062A"); //\u062A : ARABIC LETTER TEH
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
147 tmp_word = tmp_word.replaceAll("v", "\u062B"); //\u062B : ARABIC LETTER THEH
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
148 tmp_word = tmp_word.replaceAll("j", "\u062C"); //\u062C : ARABIC LETTER JEEM
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
149 tmp_word = tmp_word.replaceAll("H", "\u062D"); //\u062D : ARABIC LETTER HAH
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
150 tmp_word = tmp_word.replaceAll("x", "\u062E"); //\u062E : ARABIC LETTER KHAH
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
151 tmp_word = tmp_word.replaceAll("d", "\u062F"); //\u062F : ARABIC LETTER DAL
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
152 tmp_word = tmp_word.replaceAll("\\*", "\u0630"); //\u0630 : ARABIC LETTER THAL
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
153 tmp_word = tmp_word.replaceAll("r", "\u0631"); //\u0631 : ARABIC LETTER REH
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
154 tmp_word = tmp_word.replaceAll("z", "\u0632"); //\u0632 : ARABIC LETTER ZAIN
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
155 tmp_word = tmp_word.replaceAll("s", "\u0633" ); //\u0633 : ARABIC LETTER SEEN
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
156 tmp_word = tmp_word.replaceAll("\\$", "\u0634"); //\u0634 : ARABIC LETTER SHEEN
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
157 tmp_word = tmp_word.replaceAll("S", "\u0635"); //\u0635 : ARABIC LETTER SAD
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
158 tmp_word = tmp_word.replaceAll("D", "\u0636"); //\u0636 : ARABIC LETTER DAD
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
159 tmp_word = tmp_word.replaceAll("T", "\u0637"); //\u0637 : ARABIC LETTER TAH
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
160 tmp_word = tmp_word.replaceAll("Z", "\u0638"); //\u0638 : ARABIC LETTER ZAH
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
161 tmp_word = tmp_word.replaceAll("E", "\u0639"); //\u0639 : ARABIC LETTER AIN
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
162 tmp_word = tmp_word.replaceAll("g", "\u063A"); //\u063A : ARABIC LETTER GHAIN
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
163 tmp_word = tmp_word.replaceAll("_", "\u0640"); //\u0640 : ARABIC TATWEEL
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
164 tmp_word = tmp_word.replaceAll("f", "\u0641"); //\u0641 : ARABIC LETTER FEH
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
165 tmp_word = tmp_word.replaceAll("q", "\u0642"); //\u0642 : ARABIC LETTER QAF
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
166 tmp_word = tmp_word.replaceAll("k", "\u0643"); //\u0643 : ARABIC LETTER KAF
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
167 tmp_word = tmp_word.replaceAll("l", "\u0644"); //\u0644 : ARABIC LETTER LAM
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
168 tmp_word = tmp_word.replaceAll("m", "\u0645"); //\u0645 : ARABIC LETTER MEEM
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
169 tmp_word = tmp_word.replaceAll("n", "\u0646"); //\u0646 : ARABIC LETTER NOON
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
170 tmp_word = tmp_word.replaceAll("h", "\u0647"); //\u0647 : ARABIC LETTER HEH
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
171 tmp_word = tmp_word.replaceAll("w", "\u0648"); //\u0648 : ARABIC LETTER WAW
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
172 tmp_word = tmp_word.replaceAll("Y", "\u0649"); //\u0649 : ARABIC LETTER ALEF MAKSURA
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
173 tmp_word = tmp_word.replaceAll("y", "\u064A"); //\u064A : ARABIC LETTER YEH
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
174 tmp_word = tmp_word.replaceAll("F", "\u064B"); //\u064B : ARABIC FATHATAN
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
175 tmp_word = tmp_word.replaceAll("N", "\u064C"); //\u064C : ARABIC DAMMATAN
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
176 tmp_word = tmp_word.replaceAll("K", "\u064D"); //\u064D : ARABIC KASRATAN
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
177 tmp_word = tmp_word.replaceAll("a", "\u064E"); //\u064E : ARABIC FATHA
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
178 tmp_word = tmp_word.replaceAll("u", "\u064F"); //\u064F : ARABIC DAMMA
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
179 tmp_word = tmp_word.replaceAll("i", "\u0650"); //\u0650 : ARABIC KASRA
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
180 tmp_word = tmp_word.replaceAll("~", "\u0651"); //\u0651 : ARABIC SHADDA
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
181 tmp_word = tmp_word.replaceAll("o", "\u0652"); //\u0652 : ARABIC SUKUN
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
182 tmp_word = tmp_word.replaceAll("`", "\u0670"); //\u0670 : ARABIC LETTER SUPERSCRIPT ALEF
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
183 tmp_word = tmp_word.replaceAll("\\{", "\u0671"); //\u0671 : ARABIC LETTER ALEF WASLA
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
184 tmp_word = tmp_word.replaceAll("P", "\u067E"); //\u067E : ARABIC LETTER PEH
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
185 tmp_word = tmp_word.replaceAll("J", "\u0686"); //\u0686 : ARABIC LETTER TCHEH
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
186 tmp_word = tmp_word.replaceAll("V", "\u06A4"); //\u06A4 : ARABIC LETTER VEH
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
187 tmp_word = tmp_word.replaceAll("G", "\u06AF"); //\u06AF : ARABIC LETTER GAF
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
188 tmp_word = tmp_word.replaceAll("R", "\u0698"); //\u0698 : ARABIC LETTER JEH (no more in Buckwalter system)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
189 //Not in Buckwalter system \u0679 : ARABIC LETTER TTEH
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
190 //Not in Buckwalter system \u0688 : ARABIC LETTER DDAL
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
191 //Not in Buckwalter system \u06A9 : ARABIC LETTER KEHEH
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
192 //Not in Buckwalter system \u0691 : ARABIC LETTER RREH
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
193 //Not in Buckwalter system \u06BA : ARABIC LETTER NOON GHUNNA
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
194 //Not in Buckwalter system \u06BE : ARABIC LETTER HEH DOACHASHMEE
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
195 //Not in Buckwalter system \u06C1 : ARABIC LETTER HEH GOAL
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
196 //Not in Buckwalter system \u06D2 : ARABIC LETTER YEH BARREE
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
197 tmp_word = tmp_word.replaceAll(",", "\u060C" ); //\u060C : ARABIC COMMA
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
198 tmp_word = tmp_word.replaceAll(";", "\u061B"); //\u061B : ARABIC SEMICOLON
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
199 tmp_word = tmp_word.replaceAll("\\?", "\u061F"); //\u061F : ARABIC QUESTION MARK
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
200 return tmp_word;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
201 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
202
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
203 }