diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Transcoder.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children 2396a569e446
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Transcoder.java	Wed Nov 24 17:24:23 2010 +0100
@@ -0,0 +1,163 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.general;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import edu.unc.epidoc.transcoder.TransCoder;
+
+public class Transcoder {
+  private static Transcoder instance;
+  private TransCoder betaCodeTranscoder;
+  
+  public static Transcoder getInstance() {
+    if (instance == null) {
+      instance = new Transcoder();
+    }
+    return instance;
+  }
+
+  public String transcodeFromBetaCode2UnicodeEpidoc(String inputStr) throws ApplicationException {
+    String encodedUnicodeStr = null;
+    try {
+      if (betaCodeTranscoder == null) {
+        betaCodeTranscoder = new TransCoder();
+        betaCodeTranscoder.setParser("BetaCode");
+        betaCodeTranscoder.setConverter("UnicodeC");
+      }
+      encodedUnicodeStr = betaCodeTranscoder.getString(inputStr);
+    } catch (Exception e) {
+      throw new ApplicationException(e);
+    }
+    return encodedUnicodeStr;
+  }
+  
+  public String transcodeFromBetaCode2Unicode(String inputStr) throws ApplicationException {
+    StringReader strReader = new StringReader(inputStr);
+    Betacode2UnicodeLex betacode2UnicodeLex = new Betacode2UnicodeLex(strReader);
+    String retStr = "";
+    String token = "";
+    while (token != null) {
+      try {
+        token = betacode2UnicodeLex.yylex();
+        if (token != null)
+          retStr += token;
+      } catch (IOException e ) {
+        throw new ApplicationException(e);
+      }
+    }
+    return retStr;
+    /* 
+    // alternative to JFlex 
+    String encodedUnicodeStr = null;
+    if (inputStr.matches("^a)"))
+      encodedUnicodeStr = inputStr.replaceFirst("^a)", "\u1F00");
+    else if (inputStr.matches("^a("))
+      encodedUnicodeStr = inputStr.replaceFirst("^a(", "\u1F01");
+    else if (inputStr.matches("^a)\\"))
+      encodedUnicodeStr = inputStr.replaceFirst("^a)\\", "\u1F02");
+      
+    // the longest regular expressions first
+    
+    return encodedUnicodeStr;
+    */
+  }
+  
+  public String transcodeFromBuckwalter2Unicode(String inputStr) throws ApplicationException {
+    StringReader strReader = new StringReader(inputStr);
+    Buckwalter2UnicodeLex buckwalter2UnicodeLex = new Buckwalter2UnicodeLex(strReader);
+    String retStr = "";
+    String token = "";
+    while (token != null) {
+      try {
+        token = buckwalter2UnicodeLex.yylex();
+        if (token != null)
+          retStr += token;
+      } catch (IOException e ) {
+        throw new ApplicationException(e);
+      }
+    }
+    return retStr;
+  }
+
+
+  
+  public String transcodeFromBuckwalter2UnicodeAraMorph(String inputStr) {
+    String encodedUnicodeStr = arabizeWord(inputStr);
+    return encodedUnicodeStr;
+  }
+
+  /*
+   * copied from http://www.nongnu.org/aramorph/english/download.html
+   * Class: AraMorph
+   */
+  private String arabizeWord(String translitered) {
+    String tmp_word = translitered;
+    // convert to transliteration
+    tmp_word = tmp_word.replaceAll("'", "\u0621"); //\u0621 : ARABIC LETTER HAMZA
+    tmp_word = tmp_word.replaceAll("\\|", "\u0622"); //\u0622 : ARABIC LETTER ALEF WITH MADDA ABOVE
+    tmp_word = tmp_word.replaceAll(">", "\u0623"); //\u0623 : ARABIC LETTER ALEF WITH HAMZA ABOVE
+    tmp_word = tmp_word.replaceAll("&", "\u0624"); //\u0624 : ARABIC LETTER WAW WITH HAMZA ABOVE
+    tmp_word = tmp_word.replaceAll("<", "\u0625"); //\u0625 : ARABIC LETTER ALEF WITH HAMZA BELOW
+    tmp_word = tmp_word.replaceAll("}", "\u0626"); //\u0626 : ARABIC LETTER YEH WITH HAMZA ABOVE
+    tmp_word = tmp_word.replaceAll("A", "\u0627"); //\u0627 : ARABIC LETTER ALEF
+    tmp_word = tmp_word.replaceAll("b", "\u0628"); //\u0628 : ARABIC LETTER BEH
+    tmp_word = tmp_word.replaceAll("p", "\u0629"); //\u0629 : ARABIC LETTER TEH MARBUTA
+    tmp_word = tmp_word.replaceAll("t", "\u062A"); //\u062A : ARABIC LETTER TEH
+    tmp_word = tmp_word.replaceAll("v", "\u062B"); //\u062B : ARABIC LETTER THEH
+    tmp_word = tmp_word.replaceAll("j", "\u062C"); //\u062C : ARABIC LETTER JEEM
+    tmp_word = tmp_word.replaceAll("H", "\u062D"); //\u062D : ARABIC LETTER HAH
+    tmp_word = tmp_word.replaceAll("x", "\u062E"); //\u062E : ARABIC LETTER KHAH
+    tmp_word = tmp_word.replaceAll("d", "\u062F"); //\u062F : ARABIC LETTER DAL
+    tmp_word = tmp_word.replaceAll("\\*", "\u0630"); //\u0630 : ARABIC LETTER THAL
+    tmp_word = tmp_word.replaceAll("r", "\u0631"); //\u0631 : ARABIC LETTER REH
+    tmp_word = tmp_word.replaceAll("z", "\u0632"); //\u0632 : ARABIC LETTER ZAIN
+    tmp_word = tmp_word.replaceAll("s", "\u0633" ); //\u0633 : ARABIC LETTER SEEN
+    tmp_word = tmp_word.replaceAll("\\$", "\u0634"); //\u0634 : ARABIC LETTER SHEEN
+    tmp_word = tmp_word.replaceAll("S", "\u0635"); //\u0635 : ARABIC LETTER SAD
+    tmp_word = tmp_word.replaceAll("D", "\u0636"); //\u0636 : ARABIC LETTER DAD
+    tmp_word = tmp_word.replaceAll("T", "\u0637"); //\u0637 : ARABIC LETTER TAH
+    tmp_word = tmp_word.replaceAll("Z", "\u0638"); //\u0638 : ARABIC LETTER ZAH
+    tmp_word = tmp_word.replaceAll("E", "\u0639"); //\u0639 : ARABIC LETTER AIN
+    tmp_word = tmp_word.replaceAll("g", "\u063A"); //\u063A : ARABIC LETTER GHAIN
+    tmp_word = tmp_word.replaceAll("_", "\u0640"); //\u0640 : ARABIC TATWEEL
+    tmp_word = tmp_word.replaceAll("f", "\u0641"); //\u0641 : ARABIC LETTER FEH
+    tmp_word = tmp_word.replaceAll("q", "\u0642"); //\u0642 : ARABIC LETTER QAF
+    tmp_word = tmp_word.replaceAll("k", "\u0643"); //\u0643 : ARABIC LETTER KAF
+    tmp_word = tmp_word.replaceAll("l", "\u0644"); //\u0644 : ARABIC LETTER LAM
+    tmp_word = tmp_word.replaceAll("m", "\u0645"); //\u0645 : ARABIC LETTER MEEM
+    tmp_word = tmp_word.replaceAll("n", "\u0646"); //\u0646 : ARABIC LETTER NOON
+    tmp_word = tmp_word.replaceAll("h", "\u0647"); //\u0647 : ARABIC LETTER HEH
+    tmp_word = tmp_word.replaceAll("w", "\u0648"); //\u0648 : ARABIC LETTER WAW
+    tmp_word = tmp_word.replaceAll("Y", "\u0649"); //\u0649 : ARABIC LETTER ALEF MAKSURA
+    tmp_word = tmp_word.replaceAll("y", "\u064A"); //\u064A : ARABIC LETTER YEH
+    tmp_word = tmp_word.replaceAll("F", "\u064B"); //\u064B : ARABIC FATHATAN
+    tmp_word = tmp_word.replaceAll("N", "\u064C"); //\u064C : ARABIC DAMMATAN
+    tmp_word = tmp_word.replaceAll("K", "\u064D"); //\u064D : ARABIC KASRATAN
+    tmp_word = tmp_word.replaceAll("a", "\u064E"); //\u064E : ARABIC FATHA
+    tmp_word = tmp_word.replaceAll("u", "\u064F"); //\u064F : ARABIC DAMMA
+    tmp_word = tmp_word.replaceAll("i", "\u0650"); //\u0650 : ARABIC KASRA
+    tmp_word = tmp_word.replaceAll("~", "\u0651"); //\u0651 : ARABIC SHADDA
+    tmp_word = tmp_word.replaceAll("o", "\u0652"); //\u0652 : ARABIC SUKUN
+    tmp_word = tmp_word.replaceAll("`", "\u0670"); //\u0670 : ARABIC LETTER SUPERSCRIPT ALEF
+    tmp_word = tmp_word.replaceAll("\\{", "\u0671"); //\u0671 : ARABIC LETTER ALEF WASLA
+    tmp_word = tmp_word.replaceAll("P", "\u067E"); //\u067E : ARABIC LETTER PEH
+    tmp_word = tmp_word.replaceAll("J", "\u0686"); //\u0686 : ARABIC LETTER TCHEH
+    tmp_word = tmp_word.replaceAll("V", "\u06A4"); //\u06A4 : ARABIC LETTER VEH
+    tmp_word = tmp_word.replaceAll("G", "\u06AF"); //\u06AF : ARABIC LETTER GAF
+    tmp_word = tmp_word.replaceAll("R", "\u0698"); //\u0698 : ARABIC LETTER JEH (no more in Buckwalter system)
+    //Not in Buckwalter system \u0679 : ARABIC LETTER TTEH
+    //Not in Buckwalter system \u0688 : ARABIC LETTER DDAL
+    //Not in Buckwalter system \u06A9 : ARABIC LETTER KEHEH
+    //Not in Buckwalter system \u0691 : ARABIC LETTER RREH
+    //Not in Buckwalter system \u06BA : ARABIC LETTER NOON GHUNNA
+    //Not in Buckwalter system \u06BE : ARABIC LETTER HEH DOACHASHMEE
+    //Not in Buckwalter system \u06C1 : ARABIC LETTER HEH GOAL
+    //Not in Buckwalter system \u06D2 : ARABIC LETTER YEH BARREE
+    tmp_word = tmp_word.replaceAll(",", "\u060C" ); //\u060C : ARABIC COMMA
+    tmp_word = tmp_word.replaceAll(";", "\u061B"); //\u061B : ARABIC SEMICOLON
+    tmp_word = tmp_word.replaceAll("\\?", "\u061F"); //\u061F : ARABIC QUESTION MARK
+    return tmp_word;
+  }
+  
+}
\ No newline at end of file