Mercurial > hg > mpdl-group
view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/DBLexWriter.java @ 19:4a3641ae14d2
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 09 Nov 2011 15:32:05 +0100 |
parents | |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.dict.db; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import com.sleepycat.je.Cursor; import com.sleepycat.je.Database; import com.sleepycat.je.DatabaseEntry; import com.sleepycat.je.DatabaseException; import com.sleepycat.je.LockMode; import com.sleepycat.je.OperationStatus; import com.sleepycat.je.util.DbLoad; import com.sun.org.apache.xerces.internal.parsers.SAXParser; import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexica; import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon; import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry; import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; import de.mpg.mpiwg.berlin.mpdl.lt.text.transcode.Transcoder; public class DBLexWriter { private static DBLexWriter instance; private static String DATA_DIR = Constants.getInstance().getDataDir(); private static String DATA_FILES_DIR_LEXICA = DATA_DIR + "/dataFiles/pollux"; private static String DB_DIR_LEXICA = DATA_DIR + "/dataBerkeleyDB/pollux"; private DbEnvLex dbEnvLexica; private Date beginOfOperation; private Date endOfOperation; public static DBLexWriter getInstance() throws ApplicationException { if (instance == null) { instance = new DBLexWriter(); } return instance; } public static void main(String[] args) throws ApplicationException { getInstance(); instance.beginOperation(); System.out.print("Start ..."); // instance.initReadOnly(); instance.initReadWrite(); // instance.readSampleData(); // instance.testTranscoder(); // instance.printSizeOfAllLexicons(); instance.writeLexiconsToFiles(); // instance.loadPolluxDbDumpsToDb(); // instance.copyAndRepairAndTranscodeDumps(); instance.end(); instance.endOperation(); // Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); System.out.println("End."); // System.out.println("Needed time: " + elapsedTime + " seconds"); } private void initReadWrite() throws ApplicationException { dbEnvLexica = new DbEnvLex(); dbEnvLexica.setDataDir(DB_DIR_LEXICA); dbEnvLexica.initReadWrite(); } private void initReadOnly() throws ApplicationException { dbEnvLexica = new DbEnvLex(); dbEnvLexica.setDataDir(DB_DIR_LEXICA); dbEnvLexica.initReadOnly(); ArrayList<Lexicon> lexicons = Lexica.getInstance().getLocalLexicons(); for (int i=0; i<lexicons.size(); i++) { Lexicon lexicon = lexicons.get(i); String lexiconName = lexicon.getName(); dbEnvLexica.openDatabase(lexiconName); } } private void loadPolluxDbDumpsToDb() throws ApplicationException { ArrayList<Lexicon> lexicons = Lexica.getInstance().getLocalLexicons(); for (int i=0; i<lexicons.size(); i++) { Lexicon lexicon = lexicons.get(i); String lexiconName = lexicon.getName(); loadDbDumpToDb(lexiconName); } } private void loadDbDumpToDb(String lexiconName) throws ApplicationException { String dumpFileName = DATA_FILES_DIR_LEXICA + "/" + lexiconName + ".dump"; String dbName = lexiconName + "Dump.db"; try { BufferedReader bufferedReader = new BufferedReader(new FileReader(dumpFileName)); DbLoad loader = new DbLoad(); loader.setEnv(dbEnvLexica.getEnv()); loader.setDbName(dbName); loader.setInputReader(bufferedReader); loader.setIgnoreUnknownConfig(true); loader.load(); bufferedReader.close(); } catch (FileNotFoundException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } catch (DatabaseException e) { throw new ApplicationException(e); } } private void readSampleData() throws ApplicationException { try { List<String> dbNames = dbEnvLexica.getEnv().getDatabaseNames(); String l1 = readEntry("autenrieth", "au)to/s"); String l2 = readEntry("ls", "laudabilis"); String l3 = readEntry("lsjUnicode", "ἄδρεπτος"); String l4 = readEntry("salmoneUnicode", "ءرش"); System.out.println("Autenrieth: autos: " + l1); System.out.println("Lewis & Short: Laudabilis: " + l2); System.out.println("LSJ: ἄδρεπτος: " + l3); System.out.println("Salmone: طب: " + l4); printSampleEntries("salmoneUnicode", 10); printSampleEntries("lsjUnicode", 1000); } catch (DatabaseException e) { throw new ApplicationException(e); } } private void end() throws ApplicationException { ArrayList<Lexicon> lexicons = Lexica.getInstance().getLocalLexicons(); for (int i=0; i<lexicons.size(); i++) { Lexicon lexicon = lexicons.get(i); String lexiconName = lexicon.getName(); dbEnvLexica.closeDatabase(lexiconName); dbEnvLexica.closeDatabase(lexiconName + "Dump"); } dbEnvLexica.close(); } private String readEntry(String lexiconName, String formName) throws ApplicationException { String retString = null; try { String keyStr = formName; DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); Cursor cursor = lexDB.openCursor(null, null); DatabaseEntry foundValue = new DatabaseEntry(); OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); if (operationStatus == OperationStatus.SUCCESS) { byte[] foundValueBytes = foundValue.getData(); retString = new String(foundValueBytes, "utf-8"); } cursor.close(); } catch (DatabaseException e) { throw new ApplicationException(e); } catch (UnsupportedEncodingException e) { throw new ApplicationException(e); } return retString; } private void printSizeOfAllLexiconsTemp() throws ApplicationException { String lexiconName = "lsj"; int[] sizes = getSizes(lexiconName); System.out.println(lexiconName + ": " + sizes[0] + " records (" + sizes[1] + " of them are not xml valid)"); } private void printSizeOfAllLexicons() throws ApplicationException { ArrayList<Lexicon> lexicons = Lexica.getInstance().getLocalLexicons(); for (int i=0; i<lexicons.size(); i++) { Lexicon lexicon = lexicons.get(i); String lexiconName = lexicon.getName(); int[] sizes = getSizes(lexiconName); System.out.println(lexiconName + ": " + sizes[0] + " records (" + sizes[1] + " of them are not xml valid)"); } } private int[] getSizes(String lexiconName) throws ApplicationException { int size = 0; int sizeXmlNotValidEntries = 0; try { dbEnvLexica.openDatabase(lexiconName); Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); Cursor cursor = lexDB.openCursor(null, null); DatabaseEntry dbEntryKey = new DatabaseEntry(); DatabaseEntry dbEntryValue = new DatabaseEntry(); OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT); while (operationStatus == OperationStatus.SUCCESS) { operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT); byte[] dbEntryKeyBytes = dbEntryKey.getData(); String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8"); byte[] dbEntryValueBytes = dbEntryValue.getData(); String dbEntryValueStr = new String(dbEntryValueBytes, "utf-8"); int begin = dbEntryValueStr.indexOf("<repaired-entry>"); int end = dbEntryValueStr.indexOf("</repaired-entry>"); dbEntryValueStr = dbEntryValueStr.substring(begin, end) + "</repaired-entry>"; LexiconEntry dbLexEntry = new LexiconEntry(lexiconName, dbEntryKeyStr, dbEntryValueStr); LexiconEntry xmlLexiconEntry = xmlParse(dbLexEntry); if (! xmlLexiconEntry.isXmlValid()) { sizeXmlNotValidEntries ++; } size++; } cursor.close(); } catch (DatabaseException e) { throw new ApplicationException(e); } catch (UnsupportedEncodingException e) { throw new ApplicationException(e); } int[] sizes = new int[2]; sizes[0] = size; sizes[1] = sizeXmlNotValidEntries; return sizes; } private void copyAndRepairAndTranscodeDumps() throws ApplicationException { try { ArrayList<Lexicon> lexicons = Lexica.getInstance().getLocalLexicons(); for (int i=0; i<lexicons.size(); i++) { Lexicon lexicon = lexicons.get(i); String lexiconName = lexicon.getName(); HashMap<String, DatabaseEntry> lexDumpHashMap = getWholeLexiconHashMap(lexiconName + "Dump"); dbEnvLexica.openDatabase(lexiconName); Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); Iterator<String> lexDumpIter = lexDumpHashMap.keySet().iterator(); while (lexDumpIter.hasNext()) { String lexDumpKeyStr = lexDumpIter.next(); DatabaseEntry lexDumpValue = lexDumpHashMap.get(lexDumpKeyStr); byte[] lexDumpValueBytes = lexDumpValue.getData(); String lexDumpValueStr = new String(lexDumpValueBytes, "utf-8"); String newLexValueStr = new String(lexDumpValueBytes, "utf-8"); // repair lsj if (lexiconName.equals("lsj")) { newLexValueStr = newLexValueStr.replaceAll("<br>", "<br/>"); newLexValueStr = newLexValueStr.replaceAll("<p>", "<p/>"); String elementNameGreek = "G"; newLexValueStr = deleteNestedTags(elementNameGreek, newLexValueStr); // delete tags <G> and </G> inside <G> newLexValueStr = newLexValueStr.replaceAll("lang=greek", "lang=\"greek\""); boolean senseContained = newLexValueStr.matches(".*<sense.*>.*"); boolean endSenseContained = newLexValueStr.matches(".*</sense>.*"); if (senseContained && ! endSenseContained) newLexValueStr = newLexValueStr.replaceAll("<sense .*?>", ""); else if (!senseContained && endSenseContained) newLexValueStr = newLexValueStr.replaceAll("</sense>", ""); boolean refContained = newLexValueStr.matches(".*<ref.*>.*"); boolean endRefContained = newLexValueStr.matches(".*</ref>.*"); if (refContained && ! endRefContained) newLexValueStr = newLexValueStr.replaceAll("<ref .*?>", ""); else if (!refContained && endRefContained) newLexValueStr = newLexValueStr.replaceAll("</ref>", ""); /* boolean itypeContained = newLexValueStr.matches(".*<itype.*>.*"); boolean endItypeContained = newLexValueStr.matches(".*</itype>.*"); if (itypeContained && ! endItypeContained) newLexValueStr = newLexValueStr.replaceAll("<itype .*?>", ""); else if (!itypeContained && endItypeContained) newLexValueStr = newLexValueStr.replaceAll("</itype>", ""); */ } // repair cooper if (lexiconName.equals("cooper")) { newLexValueStr = newLexValueStr.replaceAll("<PB>", ""); // TODO hack newLexValueStr = newLexValueStr.replaceAll("<p>", "<p/>"); // TODO hack } // repair baretti if (lexiconName.equals("baretti")) { newLexValueStr = newLexValueStr.replaceAll("<li>", "<li/>"); // TODO hack } // repair for all lexicons newLexValueStr = newLexValueStr.replaceAll("type=style", "type=\"style\""); newLexValueStr = newLexValueStr.replaceAll("type=dom", "type=\"dom\""); newLexValueStr = newLexValueStr.replaceAll("<\\*>", ""); newLexValueStr = newLexValueStr.replaceAll("<p />", "<p/>"); LexiconEntry newLexEntryTemp = new LexiconEntry(lexiconName, lexDumpKeyStr, newLexValueStr); // lexDumpKeyStr is not transcoded yet but it will not be used in further in the code LexiconEntry newLexEntry = xmlParseAndRepair(newLexEntryTemp); String xmlValidString = "<xml-valid>true</xml-valid>"; if (! newLexEntry.isXmlValid()) { xmlValidString = "<xml-valid>false</xml-valid>"; } newLexValueStr = newLexEntry.getContent(); // transcode the Betacode lexicon entries to Unicode (key and value) if (lexicon.isBetacodeLexicon()) { Transcoder transcoder = Transcoder.getInstance(); lexDumpKeyStr = transcoder.transcodeFromBetaCode2Unicode(lexDumpKeyStr); String elementName = "G"; if (newLexEntry.isXmlValid()) { newLexValueStr = transcodeByElementName("fromBetacode2Unicode", elementName, newLexValueStr); } } // transcode the Buckwalter entries to Unicode (key and value) if (lexicon.isBuckwalterLexicon()) { Transcoder transcoder = Transcoder.getInstance(); lexDumpKeyStr = transcoder.transcodeFromBuckwalter2Unicode(lexDumpKeyStr); String elementName = "AR"; if (newLexEntry.isXmlValid()) { newLexValueStr = transcodeByElementName("fromBuckwalter2Unicode", elementName, newLexValueStr); } } // put the entry into database newLexValueStr = "<content>" + xmlValidString + "<original-entry>" + lexDumpValueStr + "</original-entry>" + "<repaired-entry>" + newLexValueStr + "</repaired-entry>" + "</content>"; DatabaseEntry newLexDumpKey = new DatabaseEntry(lexDumpKeyStr.getBytes("utf-8")); DatabaseEntry newLexValue = new DatabaseEntry(newLexValueStr.getBytes("utf-8")); lexDB.put(null, newLexDumpKey, newLexValue); } } } catch (DatabaseException e) { throw new ApplicationException(e); } catch (UnsupportedEncodingException e) { throw new ApplicationException(e); } } private void printSampleEntries(String lexiconName, int count) throws ApplicationException { try { int counter = 0; dbEnvLexica.openDatabase(lexiconName); Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); Cursor cursor = lexDB.openCursor(null, null); DatabaseEntry dbEntryKey = new DatabaseEntry(); DatabaseEntry dbEntryValue = new DatabaseEntry(); OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT); while (operationStatus == OperationStatus.SUCCESS && counter < count) { int size = dbEntryKey.getSize(); if (size > 0) { byte[] dbEntryKeyBytes = dbEntryKey.getData(); String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8"); System.out.println(lexiconName + ": key: " + dbEntryKeyStr + " value size: " + dbEntryValue.getSize()); } operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT); counter++; } cursor.close(); } catch (DatabaseException e) { throw new ApplicationException(e); } catch (UnsupportedEncodingException e) { throw new ApplicationException(e); } } private void testTranscoder() throws ApplicationException { String testStr = "<G>hfhf fdfd<G>ei)mi/</G> (<tr>sum</tr>), Aeol. <G>e)/mmi</G> hfhfh </G><author>Sapph.</author>2.15, <author>Theoc.</author>20.32; Cret. <G>h)mi/</G> <title>GDI</title> 4959a; <per>2</per><number>sg.</number> <G>ei)=</G>, Ep. and Ion. <cit><G>ei)s</G> <author>Od.</author>17.388</cit>, al., Aeol. <G>e)/ssi</G>, Ep. and Dor. <cit><G>e)ssi/</G> <author>Il.</author>1.176</cit>, <author>Pi.</author>"; String testStr2 = "aaaaa <G>1111a <G>2222a</G> <G>3333a</G> 1111a</G> aaaaa bbbbb <G>1111b <G>2222b</G> <G>3333b</G> 1111b</G> bbbbb "; String testStr3 = "<G>e)pano/rqwsin e)/xein</G>, opp <G>a)ni/aton ei)=nai *hi</G>3. 1165 b18. --<G>e)panorqw/seis kai boh/qeiai *rb</G>5. 1383 a20."; String testStr4 = "<G>suni^hmi</G> <author>Ar.</author><title>Av.</title>946 (s. v.l.), <author>Strato Com.</author>1.3: with variation of quantity, <G>plei=ston ou)=lon i(/ei <G>[i^]</G>, i)/oulon i(/ei [i_</G>] <title>Carm.Pop.</title> 1.]:—" + ";<br><tr>release, let go</tr>, <cit><G>h(=ka ..po/das kai\\ xei=re fe/resqai</G> <author>Od.</author>12.442</cit>; <G>h(=ke fe/resqai</G> <tr>let</tr> him float" + "off, <author>Il.</author>21.120; <tr>let fall</tr>, <G>ka\\d de\\ ka/rhtos h(=ke ko/mas</G> <tr>made</tr> his locks <tr>flow</tr> down from his head, <author>Od.<" + "/author>6.231; [<cit><G>e)qei/ras] i(/ei lo/fon a)mfi/</G> .... ggg"; String testStr5 = "plei=ston ou)=lon i(/ei "; String testStr6 = "*a as< as as: *)a *s ss "; Transcoder t = Transcoder.getInstance(); String transcoded = t.transcodeFromBetaCode2Unicode(testStr4); transcoded = t.transcodeFromBetaCode2Unicode(testStr5); transcoded = t.transcodeFromBetaCode2Unicode(testStr6); String arabTestStr1 = "^nutaf"; String arabTestStr2 = "min"; String arabTestStr3 = "Aal-Hiyal (^qAla ^>arisTwTAlys) yataEaj~aba Aal-nAs minhA <im~A fy Aal->a$yA' Aal~aty taEriDu TabEAF fa-mim~A lA yuElamu Eil~atuhu wa-<im~A fy Aal->a$yA' Aal-muxAlifap li-l-TabE fa-mim~A yuEmalu bi-Aal-SinAEap li-manfaEap Aal-nAs li->an~a Aal-TabyEap tulzimu >abadAF jihap wAHidap wa->am~A manAfiE Aal-nAs fa-<in~ahA taxtalifu <ixtilAfAF kavyrAF."; transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr1); transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr2); transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr3); // String deletedNestedTags = deleteNestedTags("G", testStr4); // String regExpr = "(<G>.*?)<G>(.*?)</G>(.*?)<G>(.*?)</G>(.*?</G>)"; String regExpr = "(<G>.*?)<G>(.*)(</G>){1,}(.*?</G>)"; // String regExpr = "(<G>.*?)<G>(.*?)</G>(.*?)<G>(.*?)</G>(.*?</G>)"; String replaceStr = testStr2.replaceAll(regExpr, "$1$2$4"); // String replaceStr2 = testStr2.replaceAll("<G>(.*)<G>(.*)</G>(.*)<G>(.*)</G>(.*)</G>", "<G>$2$3$4$5</G>"); regExpr = "<G>.*?(<G>.*?</G>){1,}.*?</G>"; regExpr = "(<G>.*?)<G>(.*?)</G>(.*?){1,}(.*?</G>)"; // String regExpr = "[a-zA-Z0-9]+?\\[.+?\\]/" + "|" + "[a-zA-Z0-9]+?/" + "|" + "[a-zA-Z0-9]+?\\[.+\\]$" + "|" + "[a-zA-Z0-9]+?$"; // pathName example: "/archimedes[@xmlns:xlink eq "http://www.w3.org/1999/xlink"]/text/body/chap/p[@type eq "main"]/s/foreign[@lang eq "en"]" Pattern p = Pattern.compile(regExpr, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); // both flags enabled Matcher m = p.matcher(testStr2); while (m.find()) { int msBeginPos = m.start(); int msEndPos = m.end(); String matchStr = testStr2.substring(msBeginPos, msEndPos); String bla = ""; } String retStr = transcodeByElementName("fromBetacode2Unicode", "G", testStr); retStr = transcodeByElementName("fromBetacode2Unicode", "G", "bla"); retStr = transcodeByElementName("fromBetacode2Unicode", "G", ""); } private String transcodeByElementName(String transcodeDirection, String elementName, String inputStr) throws ApplicationException { if (inputStr == null || elementName == null) return null; String elemBeginTag = "<" + elementName + ">"; String elemEndTag = "</" + elementName + ">"; Transcoder transcoder = Transcoder.getInstance(); String outputStr = ""; int begin = inputStr.indexOf(elemBeginTag); int end = inputStr.indexOf(elemEndTag); while (begin != -1 && end != -1 && begin < end) { String before = inputStr.substring(0, begin); String origStr = inputStr.substring(begin + elemBeginTag.length(), end); origStr = StringUtils.deleteSpecialXmlEntities(origStr); String transcodedStr = origStr; if (transcodeDirection.equals("fromBetacode2Unicode")) transcodedStr = transcoder.transcodeFromBetaCode2Unicode(origStr); else if (transcodeDirection.equals("fromBuckwalter2Unicode")) transcodedStr = transcoder.transcodeFromBuckwalter2Unicode(origStr); outputStr = outputStr + before + new String(elemBeginTag); outputStr = outputStr + transcodedStr; outputStr = outputStr + new String(elemEndTag); inputStr = inputStr.substring(end + elemEndTag.length()); begin = inputStr.indexOf(elemBeginTag); end = inputStr.indexOf(elemEndTag); } outputStr = outputStr + inputStr; return outputStr; } private String deleteNestedTags(String elementName, String inputStr) { String inputStrTmp = new String(inputStr); String elemBeginTag = "<" + elementName + ">"; String elemEndTag = "</" + elementName + ">"; String outputStr = ""; int begin = inputStrTmp.indexOf(elemBeginTag); int end = inputStrTmp.indexOf(elemEndTag); while (begin != -1 && end != -1) { end = getIndexClosedTag(begin, elementName, inputStrTmp); String before = inputStrTmp.substring(0, begin); String origStr = null; if (end == -1) // if no end tag could be found origStr = inputStrTmp.substring(begin + elemBeginTag.length(), inputStrTmp.length()); else origStr = inputStrTmp.substring(begin + elemBeginTag.length(), end); origStr = origStr.replaceAll(elemBeginTag, ""); origStr = origStr.replaceAll(elemEndTag, ""); outputStr = outputStr + before + new String(elemBeginTag); outputStr = outputStr + origStr; outputStr = outputStr + new String(elemEndTag); inputStrTmp = inputStrTmp.substring(end + elemEndTag.length()); begin = inputStrTmp.indexOf(elemBeginTag); } outputStr = outputStr + inputStrTmp; return outputStr; } private int getIndexClosedTag(int begin, String elementName, String inputStr) { int beginTmp = begin; int retIndex = -1; String elemBeginTag = "<" + elementName + ">"; String elemEndTag = "</" + elementName + ">"; int indexEndTag = inputStr.indexOf(elemEndTag); while (indexEndTag != -1) { String betweenTmpStr = inputStr.substring(beginTmp + elemBeginTag.length(), indexEndTag); int indexBeginTag = betweenTmpStr.indexOf(elemBeginTag); if (indexBeginTag != -1) { beginTmp = indexEndTag; } else { return indexEndTag; } indexEndTag = inputStr.indexOf(elemEndTag, indexEndTag + elemEndTag.length()); } return retIndex; } private HashMap<String, DatabaseEntry> getWholeLexiconHashMap(String lexiconName) throws ApplicationException { HashMap<String, DatabaseEntry> lexHashMap = new HashMap<String, DatabaseEntry>(); try { dbEnvLexica.openDatabase(lexiconName); Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); Cursor cursor = lexDB.openCursor(null, null); DatabaseEntry dbEntryKey = new DatabaseEntry(); DatabaseEntry dbEntryValue = new DatabaseEntry(); OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT); while (operationStatus == OperationStatus.SUCCESS) { int size = dbEntryKey.getSize(); if (size > 0) { byte[] dbEntryKeyBytes = dbEntryKey.getData(); String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8"); DatabaseEntry newDbEntryValue = new DatabaseEntry(dbEntryValue.getData()); lexHashMap.put(dbEntryKeyStr, newDbEntryValue); } operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT); } cursor.close(); } catch (DatabaseException e) { throw new ApplicationException(e); } catch (UnsupportedEncodingException e) { throw new ApplicationException(e); } return lexHashMap; } private LexiconEntry xmlParseAndRepair(LexiconEntry lexEntry) throws ApplicationException { String origLexEntryContent = lexEntry.getContent(); String lexEntryContent = new String(origLexEntryContent); lexEntry.setContent(lexEntryContent); // parse and repair: try to repair it 3 times through parsing LexiconEntry retLexiconEntry = xmParseAndRepairLocal(lexEntry); retLexiconEntry = xmParseAndRepairLocal(retLexiconEntry); retLexiconEntry = xmParseAndRepairLocal(retLexiconEntry); // if it could not be repaired the original content (which is not XML valid) is delivered if (! retLexiconEntry.isXmlValid()) retLexiconEntry.setContent(origLexEntryContent); return retLexiconEntry; } private LexiconEntry xmParseAndRepairLocal(LexiconEntry lexEntry) throws ApplicationException { if (! lexEntry.isXmlValid()) { lexEntry = xmlParse(lexEntry); } if (! lexEntry.isXmlValid() && lexEntry.getValidationCode() != null && lexEntry.getValidationCode().equals("elementNotClosed")) { String elementName = lexEntry.getValidationFailElementName(); String lexiconEntryContent = lexEntry.getContent(); lexiconEntryContent = lexiconEntryContent.replaceAll("<" + elementName + " .*?>", ""); lexiconEntryContent = lexiconEntryContent.replaceAll("</" + elementName + ">", ""); lexEntry.setContent(lexiconEntryContent); lexEntry.setXmlMadeValid(true); } return lexEntry; } private LexiconEntry xmlParse(LexiconEntry lexEntry) throws ApplicationException { String lexEntryContent = "<content>" + lexEntry.getContent() + "</content>"; LexEntryContentHandler lexEntryContentHandler = new LexEntryContentHandler(); XMLReader xmlParser = new SAXParser(); xmlParser.setContentHandler(lexEntryContentHandler); LexEntryErrorHandler lexEntryErrorHandler = new LexEntryErrorHandler(); xmlParser.setErrorHandler(lexEntryErrorHandler); try { Reader reader = new StringReader(lexEntryContent); InputSource input = new InputSource(reader); xmlParser.parse(input); lexEntry.setXmlValid(true); } catch (SAXException e) { // nothing but following lexEntry.setXmlValid(false); String exceptionMessage = e.getMessage(); if (exceptionMessage.matches("The element type .* must be terminated by the matching end-tag .*")) { int begin = exceptionMessage.indexOf("\""); if (begin != -1) { String subStr = exceptionMessage.substring(begin + 1); int end = subStr.indexOf("\""); if (end != -1) { String elementName = exceptionMessage.substring(begin + 1, begin + 1 + end); lexEntry.setValidationCode("elementNotClosed"); lexEntry.setValidationFailElementName(elementName); } } } } catch (IOException e) { throw new ApplicationException(e); } return lexEntry; } private void writeLexiconsToFiles() throws ApplicationException { BufferedReader in = null; BufferedOutputStream out = null; try { ArrayList<Lexicon> lexicons = Lexica.getInstance().getLocalLexicons(); for (int i=0; i<lexicons.size(); i++) { Lexicon lexicon = lexicons.get(i); String lexiconName = lexicon.getName(); HashMap<String, DatabaseEntry> lexHashMap = getWholeLexiconHashMap(lexiconName); Iterator<String> lexDumpIter = lexHashMap.keySet().iterator(); File outputFile = new File(DATA_FILES_DIR_LEXICA + "/" + lexiconName + ".xml"); out = new BufferedOutputStream(new FileOutputStream(outputFile)); write("<lexicon>\n", out); write("<name>" + lexiconName + "</name>\n", out); write("<description>" + lexicon.getDescription() + "</description>\n", out); write("<entries>\n", out); while (lexDumpIter.hasNext()) { write("<entry>\n", out); String lexKeyStr = lexDumpIter.next(); write("<form>" + lexKeyStr + "</form>\n", out); DatabaseEntry lexValue = lexHashMap.get(lexKeyStr); byte[] lexValueBytes = lexValue.getData(); write(lexValueBytes, out); write("</entry>\n", out); } write("</entries>\n", out); write("</lexicon>\n", out); } } catch (FileNotFoundException e) { throw new ApplicationException(e); } finally { // always close the stream if (in != null) try { in.close(); } catch (Exception e) { } if (out != null) try { out.close(); } catch (Exception e) { } } } private void write(byte[] inputBytes, BufferedOutputStream out) throws ApplicationException { try { out.write(inputBytes, 0, inputBytes.length); out.flush(); } catch (IOException e) { throw new ApplicationException(e); } } private void write(String outStr, BufferedOutputStream out) throws ApplicationException { try { byte[] bytes = outStr.getBytes("utf-8"); out.write(bytes, 0, bytes.length); out.flush(); } catch (IOException e) { throw new ApplicationException(e); } } private void beginOperation() { beginOfOperation = new Date(); } private void endOperation() { endOfOperation = new Date(); } }