cdli/cdliSplitter.py - view

File: [Repository] / cdli / cdliSplitter.py
Revision 1.5: download - view: text, annotated - select for diffs - revision graph
Wed Mar 21 19:29:23 2007 UTC (17 years, 3 months ago) by dwinter
Branches: MAIN
CVS tags: HEAD

new indices

"""Splitters for the CDLI full-text indices.

Provides ZCTextIndex pipeline elements that break transliteration text
into graphemes or words, plus an experimental ("beta") variant that hands
whole lines to a Lucene index.
"""

import Zope2
import transaction

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

import os
import re
from types import StringType
import logging

import PyLucene


def getSupportedEncoding(encodings):
    """Return the first entry of *encodings* that can decode 'A'.

    Falls back to 'utf-8' when none of the candidates works.
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
            return encoding
        except (LookupError, TypeError, UnicodeError):
            # unknown codec name, non-string entry, or codec that cannot
            # decode the probe -- try the next candidate
            pass
    return 'utf-8'


# Lines whose first character is one of these are not tokenised.
ignoreLines = ['$', '@', '#', '&']
separators = ['']
# A comma following one of s/S/t/T/h/H is kept (relevant for graphemes);
# any other comma is dropped while the preceding character is preserved.
komma_exception = "([^sStThH]),"
# NOTE(review): the two bare "$" alternatives below only match the empty
# string at end of input; they look like extraction damage of "\(|\)".
# Left exactly as found -- confirm against the repository.
deleteGraphems = r"\{|\}|<|>|$|$|-|_|\#|,|\||\]|\[|\!|\?"  # for graphemes
deleteWords = r"<|>|$|$|_|\#|,|\||\]|\[|\!|\?"  # for words


class IndexLine(object):
    """Index a single line with Lucene."""

    def __init__(self, storeDir, analyzer, name, line, content):
        """Write one document to the Lucene index stored in *storeDir*.

        storeDir -- directory of the index; created when missing
        analyzer -- PyLucene analyzer used by the index writer
        name     -- value of the document's "name" field
        line     -- value of the document's "line" field
        content  -- text stored in the tokenized "contents" field
        """
        logging.debug("i am here %s %s %s %s %s",
                      storeDir, analyzer, name, line, content)
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        # NOTE(review): both calls pass create=True, which presumably starts
        # a fresh index on every call so only the last line survives --
        # confirm whether appending was intended.
        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        try:
            writer.setMaxFieldLength(1048576)
            self.indexDocs(writer, name, line, content)
            writer.optimize()
        finally:
            # close the writer even when indexing fails
            writer.close()

    def indexDocs(self, writer, name, line, content):
        """Add one document with name, line and contents fields to *writer*."""
        doc = PyLucene.Document()
        doc.add(PyLucene.Field("name", name,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        # unicode() rather than str(): the value may be a unicode string
        doc.add(PyLucene.Field("line", unicode(line),
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        doc.add(PyLucene.Field("contents", content,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))

        writer.addDocument(doc)


class cdliSplitter:
    """Base class for the splitters.

    The only difference between the word splitter and the grapheme
    splitter is the list of characters that get deleted.
    """

    default_encoding = "utf-8"
    delete = deleteGraphems
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split the texts in *lst* into tokens and return them as a list.

        Every text is processed line by line:

        * a line starting with "&" sets the current text id (characters
          1 to 7 of that line);
        * lines starting with any other character of ``ignoreLines`` and
          empty lines are skipped;
        * for the remaining lines, the part before the first "." is kept
          as the line number, the rest is cleaned with ``self.delete``
          and split on blanks.

        While a text id is known, each token is also stored in the line
        index named ``self.indexName`` of the Zope application and the
        transaction is committed.
        """
        result = []
        pNum = None
        lineNum = None

        for t in lst:
            if isinstance(t, StringType):  # byte string, not unicode
                t = unicode(t, self.default_encoding, 'replace')

            # replace() returns a new string; the result must be kept
            t = t.replace("\r", "\n")

            for s in t.split("\n"):
                if s == "":
                    continue

                if s[0] == "&":  # store pNum
                    pNum = s[1:8]
                    logging.debug("storing: %s", pNum)
                    continue

                if s[0] in ignoreLines:
                    continue

                # Split only at the first dot so that dots inside the text
                # itself do not cut off the rest of the line.
                parts = s.split(".", 1)
                if len(parts) == 1:  # no dot, hence no line number
                    txt = parts[0]
                else:
                    lineNum, txt = parts  # store line number

                # delete commas except those relevant in graphemes
                analyse = re.sub(komma_exception, r"\1", txt)
                analyse = re.sub(self.delete, ' ', analyse)  # deletions

                if self.indexName == "luceneSplitter":
                    if pNum:
                        analyser = PyLucene.StandardAnalyzer()
                        logging.debug("calling lucene")
                        IndexLine("/tmp/index", analyser,
                                  pNum, lineNum, analyse)
                    # NOTE(review): the original indentation is lost in this
                    # dump; the token loop below is taken to be the
                    # non-Lucene path -- confirm against the repository.
                    continue

                for w in analyse.split(" "):
                    w = w.strip()
                    if not w:
                        continue

                    # Only when a pNum was found (first call of the
                    # splitter; it is always called twice in the pipeline).
                    if pNum:
                        Zope2.app().cdliRoot.storeInLineIndex(
                            self.indexName, w, (pNum, lineNum))
                        transaction.get().commit()

                    result.append(w)
        return result


class graphemeSplitter(cdliSplitter):
    """Splitter using the grapheme deletion list."""
    delete = deleteGraphems
    indexName = "graphemeSplitter"


class wordSplitter(cdliSplitter):
    """Splitter using the word deletion list."""
    delete = deleteWords
    indexName = "wordSplitter"


class luceneSplitter(cdliSplitter):
    """Splitter that passes cleaned lines to the Lucene index."""
    delete = deleteWords
    indexName = "luceneSplitter"


try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter',
                                    graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI word splitter', wordSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI lucene splitter', luceneSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

if __name__ == '__main__':
    # GBK-encoded sample text, written as byte escapes so that the source
    # file stays pure ASCII.
    a = 'abc def\xce\xd2\xc3\xc7\xb5\xc4\xba\xdc \xba\xc3\xa1\xa3'
    u = unicode(a, 'gbk')
    s = wordSplitter()
    print(s.process([u]))