cdli/cdliSplitter.py - view

File: [Repository] / cdli / cdliSplitter.py
Revision 1.7.2.1: download - view: text, annotated - select for diffs - revision graph
Sat Oct 6 13:44:46 2007 UTC (16 years, 8 months ago) by casties
Branches: zcat_only_1
Diff to: branchpoint 1.7: preferred, unified

CLOSED - # 35: uploading a new version of a CDLIFile indexes the last version
https://itgroup.mpiwg-berlin.mpg.de:8080/tracs/cdli/ticket/35

"""
Full-text splitters for CDLI (beta).

Defines ZCTextIndex pipeline elements that split CDLI transliteration
text into index terms.  Three variants are registered in the
'Word Splitter' group: a grapheme splitter, a word splitter and a
splitter that hands each text line to a PyLucene index instead of
returning terms.
"""

import Zope2
import transaction

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

import os
import re
from types import StringType
import logging
try:
    import PyLucene
except ImportError:
    # Lucene is optional; only luceneSplitter needs it.
    print("no Lucene support")


def getSupportedEncoding(encodings):
    """Return the first name in *encodings* that this interpreter can decode.

    Each candidate is probed by decoding the string 'A' with it.  Falls
    back to 'utf-8' when no candidate works (or *encodings* is empty).
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except Exception:
            # Unknown or unusable codec: try the next candidate.
            continue
        return encoding
    return 'utf-8'


# Lines starting with one of these characters are not indexed as text.
# ('&' lines are additionally used by cdliSplitter.process to pick up the
# text identifier.)
ignoreLines = ['$', '@', '#', '&', '>']
separators = ['']
# A comma preceded by any character other than s, S, t, T, h, H is removed
# (commas relevant for graphemes are not deleted by this pattern).
komma_exception = r"([^sStThH]),"
# Characters replaced by a blank before splitting.
# NOTE(review): the two bare `$` alternatives match end-of-input (zero
# width), not a literal dollar sign; presumably an escaped character was
# intended here -- confirm against the authoritative source file.
deleteGraphems = r"\{|\}|<|>|$|$|-|_|\#|,|\||\]|\[|\!|\?"  # for graphemes
deleteWords = r"<|>|$|$|_|\#|,|\||\]|\[|\!|\?"  # for words


class IndexLine(object):
    """Index a single text line with Lucene.

    Instantiating the class performs the whole operation: the index
    directory is created if missing, one document is written, and the
    writer is optimized and closed.
    """

    def __init__(self, storeDir, analyzer, name, line, content):
        """Write one document to the Lucene index at *storeDir*.

        storeDir -- filesystem path of the index directory
        analyzer -- PyLucene analyzer passed to the IndexWriter
        name     -- value stored in the "name" field
        line     -- value stored (as a string) in the "line" field
        content  -- text stored and tokenized in the "contents" field
        """
        logging.error("i am here %s %s %s %s %s",
                      storeDir, analyzer, name, line, content)
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        # NOTE(review): both `True` arguments are passed in the "create"
        # position, which looks like the index is recreated on every call
        # -- confirm against the PyLucene API in use.
        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        try:
            writer.setMaxFieldLength(1048576)
            self.indexDocs(writer, name, line, content)
            writer.optimize()
        finally:
            # Always release the writer, even when indexing fails.
            writer.close()

    def indexDocs(self, writer, name, line, content):
        """Add one document with "name", "line" and "contents" fields."""
        doc = PyLucene.Document()
        doc.add(PyLucene.Field("name", name,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        doc.add(PyLucene.Field("line", str(line),
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        doc.add(PyLucene.Field("contents", content,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))

        writer.addDocument(doc)


class cdliSplitter:
    """Base class for the splitters.

    The only difference between the word splitter and the grapheme
    splitter is the list of characters that are deleted (``delete``).
    """

    default_encoding = "utf-8"
    delete = deleteGraphems
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split the texts in *lst* into a flat list of terms.

        Every text is cut into lines.  A line starting with '&' sets the
        current text identifier (characters 1-7 of the line); lines
        starting with any other character from ``ignoreLines`` are
        skipped.  For the remaining lines the part before the first '.'
        is taken as line number, the characters matched by
        ``self.delete`` are blanked out and the rest is split on blanks.

        When ``indexName`` is "luceneSplitter" the cleaned line is
        written to the Lucene index under /tmp/index instead, and no
        terms are returned for it.

        Returns a list of non-empty, whitespace-stripped terms.
        """
        logging.debug("cdliSplitter")
        result = []
        pNum = None
        lineNum = None

        for t in lst:
            # str.replace returns a new string; the result must be kept
            # for the line-break normalisation to have any effect.
            t = t.replace("\r", "\n")
            for s in t.split("\n"):

                if isinstance(s, StringType):  # not unicode
                    s = unicode(s, self.default_encoding, 'replace')

                if not s:
                    continue

                if s[0] == "&":  # store pNum
                    pNum = s[1:8]
                    logging.debug("cdliSplitter processing: %s", pNum)
                    continue

                if s[0] in ignoreLines:
                    continue

                # NOTE(review): only the segment between the first and the
                # second '.' is indexed; text after a second dot is
                # dropped -- confirm this is intended.
                splitted = s.split(".")
                if len(splitted) == 1:  # no dot
                    txt = splitted[0]
                else:
                    txt = splitted[1]
                    lineNum = splitted[0]  # store line number

                # delete commas except those relevant in graphemes
                analyse = re.sub(komma_exception, r"\1", txt)
                analyse = re.sub(self.delete, ' ', analyse)  # deletions

                if self.indexName == "luceneSplitter":
                    if pNum:
                        analyser = PyLucene.StandardAnalyzer()
                        logging.error("calling lucene")
                        IndexLine("/tmp/index", analyser, pNum, lineNum, analyse)
                else:
                    for w in analyse.split(" "):
                        w = w.strip()
                        if w:
                            # Disabled line index (only when pNum is found;
                            # the splitter is always called twice in the
                            # pipeline):
                            # Zope2.app().cdliRoot.storeInLineIndex(self.indexName,w.lstrip().strip(),(pNum,lineNum))
                            # transaction.get().commit()
                            result.append(w)
        return result


class graphemeSplitter(cdliSplitter):
    """Splitter using the grapheme deletion list."""
    delete = deleteGraphems
    indexName = "graphemeSplitter"


class wordSplitter(cdliSplitter):
    """Splitter using the word deletion list."""
    delete = deleteWords
    indexName = "wordSplitter"


class luceneSplitter(cdliSplitter):
    """Splitter that writes lines to Lucene (word deletion list)."""
    delete = deleteWords
    indexName = "luceneSplitter"


for _label, _factory in (
        ('CDLI grapheme splitter', graphemeSplitter),
        ('CDLI word splitter', wordSplitter),
        ('CDLI lucene splitter', luceneSplitter)):
    try:
        element_factory.registerFactory('Word Splitter', _label, _factory)
    except ValueError:
        # in case the splitter is already registered, ValueError is raised
        pass

if __name__ == '__main__':
    # GBK-encoded sample bytes, written as escapes so the source file
    # stays pure ASCII.
    a = 'abc def\xce\xd2\xc3\xc7\xb5\xc4\xba\xdc \xba\xc3\xa1\xa3'
    u = unicode(a, 'gbk')
    # The original demo referenced an undefined `authorSplitter` and
    # passed a second argument that process() does not accept.
    s = wordSplitter()
    print(s.process([u]))