Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/DBLexWriter.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/DBLexWriter.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,630 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.lex.db; + +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; +import com.sleepycat.je.util.DbLoad; +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars; +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder; +import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.Lexica; +import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.Lexicon; +import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.LexiconEntry; + +public class DBLexWriter { + private static DBLexWriter instance; + private static String MPDL_DATA_DIR = MpdlConstants.MPDL_DATA_DIR; + private static String DATA_FILES_DIR_LEXICA = MPDL_DATA_DIR + "/dataFiles/pollux"; + private static String DB_DIR_LEXICA = MPDL_DATA_DIR + "/dataBerkeleyDB/pollux"; + private DbEnvLex dbEnvLexica; + private Date beginOfOperation; + private Date endOfOperation; + + public static DBLexWriter getInstance() throws ApplicationException { + if (instance == null) { + instance = new DBLexWriter(); + } + return instance; + } + + public static void main(String[] args) throws ApplicationException { + getInstance(); + instance.beginOperation(); + System.out.print("Start ..."); + // instance.initReadOnly(); + instance.initReadWrite(); + // instance.readSampleData(); + // instance.testTranscoder(); + // instance.printSizeOfAllLexicons(); + instance.writeLexiconsToFiles(); + // instance.loadPolluxDbDumpsToDb(); + // instance.copyAndRepairAndTranscodeDumps(); + instance.end(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + System.out.println("Needed time: " + elapsedTime + " seconds"); + } + + private void initReadWrite() throws ApplicationException { + dbEnvLexica = new DbEnvLex(); + dbEnvLexica.setDataDir(DB_DIR_LEXICA); + dbEnvLexica.initReadWrite(); + } + + private void initReadOnly() throws ApplicationException { + dbEnvLexica = new DbEnvLex(); + dbEnvLexica.setDataDir(DB_DIR_LEXICA); + dbEnvLexica.initReadOnly(); + ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons(); + for (int i=0; i<lexicons.size(); i++) { + Lexicon lexicon = lexicons.get(i); + String lexiconName = lexicon.getName(); + dbEnvLexica.openDatabase(lexiconName); + } + } + + private void loadPolluxDbDumpsToDb() throws ApplicationException { + ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons(); + for (int i=0; i<lexicons.size(); i++) { + Lexicon lexicon = lexicons.get(i); + String lexiconName = lexicon.getName(); + loadDbDumpToDb(lexiconName); + } + } + + private void loadDbDumpToDb(String lexiconName) throws ApplicationException { + String dumpFileName = DATA_FILES_DIR_LEXICA + "/" + lexiconName + ".dump"; + String dbName = lexiconName + "Dump.db"; + try { + BufferedReader bufferedReader = new BufferedReader(new FileReader(dumpFileName)); + DbLoad loader = new DbLoad(); + loader.setEnv(dbEnvLexica.getEnv()); + loader.setDbName(dbName); + loader.setInputReader(bufferedReader); + loader.setIgnoreUnknownConfig(true); + loader.load(); + bufferedReader.close(); + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + private void readSampleData() throws ApplicationException { + try { + List<String> dbNames = dbEnvLexica.getEnv().getDatabaseNames(); + String l1 = readEntry("autenrieth", "au)to/s"); + String l2 = readEntry("ls", "laudabilis"); + String l3 = readEntry("lsjUnicode", "ἄδρεπτος"); + String l4 = readEntry("salmoneUnicode", "ءرش"); + System.out.println("Autenrieth: autos: " + l1); + System.out.println("Lewis & Short: Laudabilis: " + l2); + System.out.println("LSJ: ἄδρεπτος: " + l3); + System.out.println("Salmone: طب: " + l4); + printSampleEntries("salmoneUnicode", 10); + printSampleEntries("lsjUnicode", 1000); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + private void end() throws ApplicationException { + ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons(); + for (int i=0; i<lexicons.size(); i++) { + Lexicon lexicon = lexicons.get(i); + String lexiconName = lexicon.getName(); + dbEnvLexica.closeDatabase(lexiconName); + dbEnvLexica.closeDatabase(lexiconName + "Dump"); + } + dbEnvLexica.close(); + } + + private String readEntry(String lexiconName, String formName) throws ApplicationException { + String retString = null; + try { + String keyStr = formName; + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); + Cursor cursor = lexDB.openCursor(null, null); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); + if (operationStatus == OperationStatus.SUCCESS) { + byte[] foundValueBytes = foundValue.getData(); + retString = new String(foundValueBytes, "utf-8"); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retString; + } + + private void printSizeOfAllLexiconsTemp() throws ApplicationException { + String lexiconName = "lsj"; + int[] sizes = getSizes(lexiconName); + System.out.println(lexiconName + ": " + sizes[0] + " records (" + sizes[1] + " of them are not xml valid)"); + } + + private void printSizeOfAllLexicons() throws ApplicationException { + ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons(); + for (int i=0; i<lexicons.size(); i++) { + Lexicon lexicon = lexicons.get(i); + String lexiconName = lexicon.getName(); + int[] sizes = getSizes(lexiconName); + System.out.println(lexiconName + ": " + sizes[0] + " records (" + sizes[1] + " of them are not xml valid)"); + } + } + + private int[] getSizes(String lexiconName) throws ApplicationException { + int size = 0; + int sizeXmlNotValidEntries = 0; + try { + dbEnvLexica.openDatabase(lexiconName); + Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); + Cursor cursor = lexDB.openCursor(null, null); + DatabaseEntry dbEntryKey = new DatabaseEntry(); + DatabaseEntry dbEntryValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + byte[] dbEntryKeyBytes = dbEntryKey.getData(); + String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8"); + byte[] dbEntryValueBytes = dbEntryValue.getData(); + String dbEntryValueStr = new String(dbEntryValueBytes, "utf-8"); + int begin = dbEntryValueStr.indexOf("<repaired-entry>"); + int end = dbEntryValueStr.indexOf("</repaired-entry>"); + dbEntryValueStr = dbEntryValueStr.substring(begin, end) + "</repaired-entry>"; + LexiconEntry dbLexEntry = new LexiconEntry(lexiconName, dbEntryKeyStr, dbEntryValueStr); + LexiconEntry xmlLexiconEntry = xmlParse(dbLexEntry); + if (! xmlLexiconEntry.isXmlValid()) { + sizeXmlNotValidEntries ++; + } + size++; + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + int[] sizes = new int[2]; + sizes[0] = size; + sizes[1] = sizeXmlNotValidEntries; + return sizes; + } + + private void copyAndRepairAndTranscodeDumps() throws ApplicationException { + try { + ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons(); + for (int i=0; i<lexicons.size(); i++) { + Lexicon lexicon = lexicons.get(i); + String lexiconName = lexicon.getName(); + HashMap<String, DatabaseEntry> lexDumpHashMap = getWholeLexiconHashMap(lexiconName + "Dump"); + dbEnvLexica.openDatabase(lexiconName); + Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); + Iterator<String> lexDumpIter = lexDumpHashMap.keySet().iterator(); + while (lexDumpIter.hasNext()) { + String lexDumpKeyStr = lexDumpIter.next(); + DatabaseEntry lexDumpValue = lexDumpHashMap.get(lexDumpKeyStr); + byte[] lexDumpValueBytes = lexDumpValue.getData(); + String lexDumpValueStr = new String(lexDumpValueBytes, "utf-8"); + String newLexValueStr = new String(lexDumpValueBytes, "utf-8"); + // repair lsj + if (lexiconName.equals("lsj")) { + newLexValueStr = newLexValueStr.replaceAll("<br>", "<br/>"); + newLexValueStr = newLexValueStr.replaceAll("<p>", "<p/>"); + String elementNameGreek = "G"; + newLexValueStr = deleteNestedTags(elementNameGreek, newLexValueStr); // delete tags <G> and </G> inside <G> + newLexValueStr = newLexValueStr.replaceAll("lang=greek", "lang=\"greek\""); + boolean senseContained = newLexValueStr.matches(".*<sense.*>.*"); + boolean endSenseContained = newLexValueStr.matches(".*</sense>.*"); + if (senseContained && ! endSenseContained) + newLexValueStr = newLexValueStr.replaceAll("<sense .*?>", ""); + else if (!senseContained && endSenseContained) + newLexValueStr = newLexValueStr.replaceAll("</sense>", ""); + boolean refContained = newLexValueStr.matches(".*<ref.*>.*"); + boolean endRefContained = newLexValueStr.matches(".*</ref>.*"); + if (refContained && ! endRefContained) + newLexValueStr = newLexValueStr.replaceAll("<ref .*?>", ""); + else if (!refContained && endRefContained) + newLexValueStr = newLexValueStr.replaceAll("</ref>", ""); + /* + boolean itypeContained = newLexValueStr.matches(".*<itype.*>.*"); + boolean endItypeContained = newLexValueStr.matches(".*</itype>.*"); + if (itypeContained && ! endItypeContained) + newLexValueStr = newLexValueStr.replaceAll("<itype .*?>", ""); + else if (!itypeContained && endItypeContained) + newLexValueStr = newLexValueStr.replaceAll("</itype>", ""); + */ + } + // repair cooper + if (lexiconName.equals("cooper")) { + newLexValueStr = newLexValueStr.replaceAll("<PB>", ""); // TODO hack + newLexValueStr = newLexValueStr.replaceAll("<p>", "<p/>"); // TODO hack + } + // repair baretti + if (lexiconName.equals("baretti")) { + newLexValueStr = newLexValueStr.replaceAll("<li>", "<li/>"); // TODO hack + } + // repair for all lexicons + newLexValueStr = newLexValueStr.replaceAll("type=style", "type=\"style\""); + newLexValueStr = newLexValueStr.replaceAll("type=dom", "type=\"dom\""); + newLexValueStr = newLexValueStr.replaceAll("<\\*>", ""); + newLexValueStr = newLexValueStr.replaceAll("<p />", "<p/>"); + LexiconEntry newLexEntryTemp = new LexiconEntry(lexiconName, lexDumpKeyStr, newLexValueStr); // lexDumpKeyStr is not transcoded yet but it will not be used in further in the code + LexiconEntry newLexEntry = xmlParseAndRepair(newLexEntryTemp); + String xmlValidString = "<xml-valid>true</xml-valid>"; + if (! newLexEntry.isXmlValid()) { + xmlValidString = "<xml-valid>false</xml-valid>"; + } + newLexValueStr = newLexEntry.getContent(); + // transcode the Betacode lexicon entries to Unicode (key and value) + if (lexicon.isBetacodeLexicon()) { + Transcoder transcoder = Transcoder.getInstance(); + lexDumpKeyStr = transcoder.transcodeFromBetaCode2Unicode(lexDumpKeyStr); + String elementName = "G"; + if (newLexEntry.isXmlValid()) { + newLexValueStr = transcodeByElementName("fromBetacode2Unicode", elementName, newLexValueStr); + } + } + // transcode the Buckwalter entries to Unicode (key and value) + if (lexicon.isBuckwalterLexicon()) { + Transcoder transcoder = Transcoder.getInstance(); + lexDumpKeyStr = transcoder.transcodeFromBuckwalter2Unicode(lexDumpKeyStr); + String elementName = "AR"; + if (newLexEntry.isXmlValid()) { + newLexValueStr = transcodeByElementName("fromBuckwalter2Unicode", elementName, newLexValueStr); + } + } + // put the entry into database + newLexValueStr = "<content>" + xmlValidString + "<original-entry>" + lexDumpValueStr + "</original-entry>" + "<repaired-entry>" + newLexValueStr + "</repaired-entry>" + "</content>"; + DatabaseEntry newLexDumpKey = new DatabaseEntry(lexDumpKeyStr.getBytes("utf-8")); + DatabaseEntry newLexValue = new DatabaseEntry(newLexValueStr.getBytes("utf-8")); + lexDB.put(null, newLexDumpKey, newLexValue); + } + } + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private void printSampleEntries(String lexiconName, int count) throws ApplicationException { + try { + int counter = 0; + dbEnvLexica.openDatabase(lexiconName); + Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); + Cursor cursor = lexDB.openCursor(null, null); + DatabaseEntry dbEntryKey = new DatabaseEntry(); + DatabaseEntry dbEntryValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS && counter < count) { + int size = dbEntryKey.getSize(); + if (size > 0) { + byte[] dbEntryKeyBytes = dbEntryKey.getData(); + String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8"); + System.out.println(lexiconName + ": key: " + dbEntryKeyStr + " value size: " + dbEntryValue.getSize()); + } + operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + counter++; + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + } + + private void testTranscoder() throws ApplicationException { + String testStr = "<G>hfhf fdfd<G>ei)mi/</G> (<tr>sum</tr>), Aeol. <G>e)/mmi</G> hfhfh </G><author>Sapph.</author>2.15, <author>Theoc.</author>20.32; Cret. <G>h)mi/</G> <title>GDI</title> 4959a; <per>2</per><number>sg.</number> <G>ei)=</G>, Ep. and Ion. <cit><G>ei)s</G> <author>Od.</author>17.388</cit>, al., Aeol. <G>e)/ssi</G>, Ep. and Dor. <cit><G>e)ssi/</G> <author>Il.</author>1.176</cit>, <author>Pi.</author>"; + String testStr2 = "aaaaa <G>1111a <G>2222a</G> <G>3333a</G> 1111a</G> aaaaa bbbbb <G>1111b <G>2222b</G> <G>3333b</G> 1111b</G> bbbbb "; + String testStr3 = "<G>e)pano/rqwsin e)/xein</G>, opp <G>a)ni/aton ei)=nai *hi</G>3. 1165 b18. --<G>e)panorqw/seis kai boh/qeiai *rb</G>5. 1383 a20."; + String testStr4 = "<G>suni^hmi</G> <author>Ar.</author><title>Av.</title>946 (s. v.l.), <author>Strato Com.</author>1.3: with variation of quantity, <G>plei=ston ou)=lon i(/ei <G>[i^]</G>, i)/oulon i(/ei [i_</G>] <title>Carm.Pop.</title> 1.]:—" + + ";<br><tr>release, let go</tr>, <cit><G>h(=ka ..po/das kai\\ xei=re fe/resqai</G> <author>Od.</author>12.442</cit>; <G>h(=ke fe/resqai</G> <tr>let</tr> him float" + + "off, <author>Il.</author>21.120; <tr>let fall</tr>, <G>ka\\d de\\ ka/rhtos h(=ke ko/mas</G> <tr>made</tr> his locks <tr>flow</tr> down from his head, <author>Od.<" + + "/author>6.231; [<cit><G>e)qei/ras] i(/ei lo/fon a)mfi/</G> .... ggg"; + String testStr5 = "plei=ston ou)=lon i(/ei "; + String testStr6 = "*a as< as as: *)a *s ss "; + Transcoder t = Transcoder.getInstance(); + String transcoded = t.transcodeFromBetaCode2Unicode(testStr4); + transcoded = t.transcodeFromBetaCode2Unicode(testStr5); + transcoded = t.transcodeFromBetaCode2Unicode(testStr6); + + String arabTestStr1 = "^nutaf"; + String arabTestStr2 = "min"; + String arabTestStr3 = "Aal-Hiyal (^qAla ^>arisTwTAlys) yataEaj~aba Aal-nAs minhA <im~A fy Aal->a$yA' Aal~aty taEriDu TabEAF fa-mim~A lA yuElamu Eil~atuhu wa-<im~A fy Aal->a$yA' Aal-muxAlifap li-l-TabE fa-mim~A yuEmalu bi-Aal-SinAEap li-manfaEap Aal-nAs li->an~a Aal-TabyEap tulzimu >abadAF jihap wAHidap wa->am~A manAfiE Aal-nAs fa-<in~ahA taxtalifu <ixtilAfAF kavyrAF."; + transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr1); + transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr2); + transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr3); + + // String deletedNestedTags = deleteNestedTags("G", testStr4); + // String regExpr = "(<G>.*?)<G>(.*?)</G>(.*?)<G>(.*?)</G>(.*?</G>)"; + String regExpr = "(<G>.*?)<G>(.*)(</G>){1,}(.*?</G>)"; + // String regExpr = "(<G>.*?)<G>(.*?)</G>(.*?)<G>(.*?)</G>(.*?</G>)"; + String replaceStr = testStr2.replaceAll(regExpr, "$1$2$4"); + // String replaceStr2 = testStr2.replaceAll("<G>(.*)<G>(.*)</G>(.*)<G>(.*)</G>(.*)</G>", "<G>$2$3$4$5</G>"); + regExpr = "<G>.*?(<G>.*?</G>){1,}.*?</G>"; + regExpr = "(<G>.*?)<G>(.*?)</G>(.*?){1,}(.*?</G>)"; + // String regExpr = "[a-zA-Z0-9]+?\\[.+?\\]/" + "|" + "[a-zA-Z0-9]+?/" + "|" + "[a-zA-Z0-9]+?\\[.+\\]$" + "|" + "[a-zA-Z0-9]+?$"; // pathName example: "/archimedes[@xmlns:xlink eq "http://www.w3.org/1999/xlink"]/text/body/chap/p[@type eq "main"]/s/foreign[@lang eq "en"]" + Pattern p = Pattern.compile(regExpr, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); // both flags enabled + Matcher m = p.matcher(testStr2); + while (m.find()) { + int msBeginPos = m.start(); + int msEndPos = m.end(); + String matchStr = testStr2.substring(msBeginPos, msEndPos); + String bla = ""; + } + + String retStr = transcodeByElementName("fromBetacode2Unicode", "G", testStr); + retStr = transcodeByElementName("fromBetacode2Unicode", "G", "bla"); + retStr = transcodeByElementName("fromBetacode2Unicode", "G", ""); + } + + private String transcodeByElementName(String transcodeDirection, String elementName, String inputStr) throws ApplicationException { + if (inputStr == null || elementName == null) + return null; + String elemBeginTag = "<" + elementName + ">"; + String elemEndTag = "</" + elementName + ">"; + Transcoder transcoder = Transcoder.getInstance(); + String outputStr = ""; + int begin = inputStr.indexOf(elemBeginTag); + int end = inputStr.indexOf(elemEndTag); + while (begin != -1 && end != -1 && begin < end) { + String before = inputStr.substring(0, begin); + String origStr = inputStr.substring(begin + elemBeginTag.length(), end); + origStr = StringUtilEscapeChars.deleteSpecialXmlEntities(origStr); + String transcodedStr = origStr; + if (transcodeDirection.equals("fromBetacode2Unicode")) + transcodedStr = transcoder.transcodeFromBetaCode2Unicode(origStr); + else if (transcodeDirection.equals("fromBuckwalter2Unicode")) + transcodedStr = transcoder.transcodeFromBuckwalter2Unicode(origStr); + outputStr = outputStr + before + new String(elemBeginTag); + outputStr = outputStr + transcodedStr; + outputStr = outputStr + new String(elemEndTag); + inputStr = inputStr.substring(end + elemEndTag.length()); + begin = inputStr.indexOf(elemBeginTag); + end = inputStr.indexOf(elemEndTag); + } + outputStr = outputStr + inputStr; + return outputStr; + } + + private String deleteNestedTags(String elementName, String inputStr) { + String inputStrTmp = new String(inputStr); + String elemBeginTag = "<" + elementName + ">"; + String elemEndTag = "</" + elementName + ">"; + String outputStr = ""; + int begin = inputStrTmp.indexOf(elemBeginTag); + int end = inputStrTmp.indexOf(elemEndTag); + while (begin != -1 && end != -1) { + end = getIndexClosedTag(begin, elementName, inputStrTmp); + String before = inputStrTmp.substring(0, begin); + String origStr = null; + if (end == -1) // if no end tag could be found + origStr = inputStrTmp.substring(begin + elemBeginTag.length(), inputStrTmp.length()); + else + origStr = inputStrTmp.substring(begin + elemBeginTag.length(), end); + origStr = origStr.replaceAll(elemBeginTag, ""); + origStr = origStr.replaceAll(elemEndTag, ""); + outputStr = outputStr + before + new String(elemBeginTag); + outputStr = outputStr + origStr; + outputStr = outputStr + new String(elemEndTag); + inputStrTmp = inputStrTmp.substring(end + elemEndTag.length()); + begin = inputStrTmp.indexOf(elemBeginTag); + } + outputStr = outputStr + inputStrTmp; + return outputStr; + } + + private int getIndexClosedTag(int begin, String elementName, String inputStr) { + int beginTmp = begin; + int retIndex = -1; + String elemBeginTag = "<" + elementName + ">"; + String elemEndTag = "</" + elementName + ">"; + int indexEndTag = inputStr.indexOf(elemEndTag); + while (indexEndTag != -1) { + String betweenTmpStr = inputStr.substring(beginTmp + elemBeginTag.length(), indexEndTag); + int indexBeginTag = betweenTmpStr.indexOf(elemBeginTag); + if (indexBeginTag != -1) { + beginTmp = indexEndTag; + } else { + return indexEndTag; + } + indexEndTag = inputStr.indexOf(elemEndTag, indexEndTag + elemEndTag.length()); + } + return retIndex; + } + + private HashMap<String, DatabaseEntry> getWholeLexiconHashMap(String lexiconName) throws ApplicationException { + HashMap<String, DatabaseEntry> lexHashMap = new HashMap<String, DatabaseEntry>(); + try { + dbEnvLexica.openDatabase(lexiconName); + Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); + Cursor cursor = lexDB.openCursor(null, null); + DatabaseEntry dbEntryKey = new DatabaseEntry(); + DatabaseEntry dbEntryValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + int size = dbEntryKey.getSize(); + if (size > 0) { + byte[] dbEntryKeyBytes = dbEntryKey.getData(); + String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8"); + DatabaseEntry newDbEntryValue = new DatabaseEntry(dbEntryValue.getData()); + lexHashMap.put(dbEntryKeyStr, newDbEntryValue); + } + operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return lexHashMap; + } + + private LexiconEntry xmlParseAndRepair(LexiconEntry lexEntry) throws ApplicationException { + String origLexEntryContent = lexEntry.getContent(); + String lexEntryContent = new String(origLexEntryContent); + lexEntry.setContent(lexEntryContent); + // parse and repair: try to repair it 3 times through parsing + LexiconEntry retLexiconEntry = xmParseAndRepairLocal(lexEntry); + retLexiconEntry = xmParseAndRepairLocal(retLexiconEntry); + retLexiconEntry = xmParseAndRepairLocal(retLexiconEntry); + // if it could not be repaired the original content (which is not XML valid) is delivered + if (! retLexiconEntry.isXmlValid()) + retLexiconEntry.setContent(origLexEntryContent); + return retLexiconEntry; + } + + private LexiconEntry xmParseAndRepairLocal(LexiconEntry lexEntry) throws ApplicationException { + if (! lexEntry.isXmlValid()) { + lexEntry = xmlParse(lexEntry); + } + if (! lexEntry.isXmlValid() && lexEntry.getValidationCode() != null && lexEntry.getValidationCode().equals("elementNotClosed")) { + String elementName = lexEntry.getValidationFailElementName(); + String lexiconEntryContent = lexEntry.getContent(); + lexiconEntryContent = lexiconEntryContent.replaceAll("<" + elementName + " .*?>", ""); + lexiconEntryContent = lexiconEntryContent.replaceAll("</" + elementName + ">", ""); + lexEntry.setContent(lexiconEntryContent); + lexEntry.setXmlMadeValid(true); + } + return lexEntry; + } + + private LexiconEntry xmlParse(LexiconEntry lexEntry) throws ApplicationException { + String lexEntryContent = "<content>" + lexEntry.getContent() + "</content>"; + LexEntryContentHandler lexEntryContentHandler = new LexEntryContentHandler(); + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(lexEntryContentHandler); + LexEntryErrorHandler lexEntryErrorHandler = new LexEntryErrorHandler(); + xmlParser.setErrorHandler(lexEntryErrorHandler); + try { + Reader reader = new StringReader(lexEntryContent); + InputSource input = new InputSource(reader); + xmlParser.parse(input); + lexEntry.setXmlValid(true); + } catch (SAXException e) { + // nothing but following + lexEntry.setXmlValid(false); + String exceptionMessage = e.getMessage(); + if (exceptionMessage.matches("The element type .* must be terminated by the matching end-tag .*")) { + int begin = exceptionMessage.indexOf("\""); + if (begin != -1) { + String subStr = exceptionMessage.substring(begin + 1); + int end = subStr.indexOf("\""); + if (end != -1) { + String elementName = exceptionMessage.substring(begin + 1, begin + 1 + end); + lexEntry.setValidationCode("elementNotClosed"); + lexEntry.setValidationFailElementName(elementName); + } + } + } + } catch (IOException e) { + throw new ApplicationException(e); + } + return lexEntry; + } + + private void writeLexiconsToFiles() throws ApplicationException { + BufferedReader in = null; + BufferedOutputStream out = null; + try { + ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons(); + for (int i=0; i<lexicons.size(); i++) { + Lexicon lexicon = lexicons.get(i); + String lexiconName = lexicon.getName(); + HashMap<String, DatabaseEntry> lexHashMap = getWholeLexiconHashMap(lexiconName); + Iterator<String> lexDumpIter = lexHashMap.keySet().iterator(); + File outputFile = new File(DATA_FILES_DIR_LEXICA + "/" + lexiconName + ".xml"); + out = new BufferedOutputStream(new FileOutputStream(outputFile)); + write("<lexicon>\n", out); + write("<name>" + lexiconName + "</name>\n", out); + write("<description>" + lexicon.getDescription() + "</description>\n", out); + write("<entries>\n", out); + while (lexDumpIter.hasNext()) { + write("<entry>\n", out); + String lexKeyStr = lexDumpIter.next(); + write("<form>" + lexKeyStr + "</form>\n", out); + DatabaseEntry lexValue = lexHashMap.get(lexKeyStr); + byte[] lexValueBytes = lexValue.getData(); + write(lexValueBytes, out); + write("</entry>\n", out); + } + write("</entries>\n", out); + write("</lexicon>\n", out); + } + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } finally { + // always close the stream + if (in != null) try { in.close(); } catch (Exception e) { } + if (out != null) try { out.close(); } catch (Exception e) { } + } + } + + private void write(byte[] inputBytes, BufferedOutputStream out) throws ApplicationException { + try { + out.write(inputBytes, 0, inputBytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void write(String outStr, BufferedOutputStream out) throws ApplicationException { + try { + byte[] bytes = outStr.getBytes("utf-8"); + out.write(bytes, 0, bytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } + +} \ No newline at end of file