cdli/cdliSplitter.py - view

File: [Repository] / cdli / cdliSplitter.py
Revision 1.6: download - view: text, annotated - select for diffs - revision graph
Mon Apr 23 13:07:10 2007 UTC (17 years, 2 months ago) by dwinter
Branches: MAIN
CVS tags: HEAD

minorCVS: ----------------------------------------------------------------------

"""Full-text splitters for the CDLI ZCTextIndex pipeline (beta).

Defines grapheme, word and Lucene-backed splitter classes and registers them
with the ZCTextIndex ``element_factory`` in the 'Word Splitter' group.

This module targets Python 2: it relies on ``unicode`` and
``types.StringType``.
"""

import os

import Zope2
import transaction

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

import re
from types import StringType
import logging

try:
    import PyLucene
except ImportError:
    # PyLucene is only used by IndexLine / the "luceneSplitter" branch below.
    print("no Lucene support")


def getSupportedEncoding(encodings):
    """Return the first entry of *encodings* able to decode the string 'A'.

    Falls back to 'utf-8' when no candidate works.
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except (LookupError, UnicodeError, TypeError):
            # Unknown codec name, codec cannot decode 'A', or not a codec name.
            continue
        return encoding
    return 'utf-8'


# Lines whose first character is listed here are not split into tokens.
# ('&' lines are handled separately in process(): they carry the P-number.)
ignoreLines = ['$', '@', '#', '&']
separators = ['']

# A comma is dropped unless it follows one of s/S/t/T/h/H (commas that are
# relevant for graphemes).
# NOTE(review): both delete patterns below also contain ",", so commas kept
# by this exception are still replaced by a space afterwards -- confirm intent.
komma_exception = r"([^sStThH]),"

# NOTE(review): the unescaped "$|$" alternatives match end-of-string, not a
# literal dollar sign or parenthesis -- confirm against the intended pattern.
deleteGraphems = r"\{|\}|<|>|$|$|-|_|\#|,|\||\]|\[|\!|\?"  # for graphemes
deleteWords = r"<|>|$|$|_|\#|,|\||\]|\[|\!|\?"  # for words


class IndexLine(object):
    """Index one text line with Lucene."""

    def __init__(self, storeDir, analyzer, name, line, content):
        """Write a single document for (*name*, *line*, *content*).

        storeDir -- directory holding the Lucene index (created if missing)
        analyzer -- Lucene analyzer handed to the IndexWriter
        name     -- value stored in the "name" field
        line     -- value stored (as string) in the "line" field
        content  -- text stored and tokenized in the "contents" field
        """
        logging.debug("i am here %s %s %s %s %s",
                      storeDir, analyzer, name, line, content)
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        # NOTE(review): both ``create`` arguments are True; per the Lucene API
        # this presumably replaces any existing index on every call, so only
        # the last indexed line would survive -- confirm this is intended.
        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        writer.setMaxFieldLength(1048576)
        try:
            self.indexDocs(writer, name, line, content)
            writer.optimize()
        finally:
            # Always close the writer, even when indexing fails.
            writer.close()

    def indexDocs(self, writer, name, line, content):
        """Build one document from the arguments and add it to *writer*."""
        doc = PyLucene.Document()
        doc.add(PyLucene.Field("name", name,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        doc.add(PyLucene.Field("line", str(line),
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        doc.add(PyLucene.Field("contents", content,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))

        writer.addDocument(doc)


class cdliSplitter:
    """Base class for the splitters.

    The word and grapheme splitters differ only in their exclusion pattern
    (the ``delete`` attribute).
    """

    default_encoding = "utf-8"
    delete = deleteGraphems
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split every text in *lst* into tokens and return them as a list.

        Each text is processed line by line:

        * a line starting with '&' sets the current P-number (characters 1-7
          after the '&');
        * lines starting with another character from ``ignoreLines`` and
          empty lines are skipped;
        * for any other line, the part before the first '.' is taken as the
          line number and the remainder as text; without a '.', the whole
          line is text and the previous line number is kept.

        When a P-number is known, every token is also stored in the line
        index of ``cdliRoot`` and the transaction is committed. For
        ``indexName == "luceneSplitter"`` the cleaned line is handed to
        IndexLine instead and no tokens are returned for it.
        """
        result = []
        pNum = None
        lineNum = None

        for t in lst:
            # str.replace returns a new string, so the result must be kept.
            t = t.replace("\r", "\n")
            for s in t.split("\n"):
                if isinstance(s, StringType):  # byte string, not unicode
                    s = unicode(s, self.default_encoding, 'replace')

                if s == "":
                    continue

                if s[0] == "&":  # store pNum
                    pNum = s[1:8]
                    logging.debug("storing: %s" % pNum)
                    continue

                if s[0] in ignoreLines:
                    continue

                # Split only at the first dot so that dots inside the text
                # do not truncate the line.
                parts = s.split(".", 1)
                if len(parts) == 1:  # no dot: no line number on this line
                    txt = parts[0]
                else:
                    lineNum = parts[0]  # store line number
                    txt = parts[1]

                # Delete commas except those relevant in graphemes, then
                # blank out everything matched by the exclusion pattern.
                analyse = re.sub(komma_exception, r"\1", txt)
                analyse = re.sub(self.delete, ' ', analyse)

                # NOTE(review): the word loop below is assumed to belong to
                # the non-Lucene branch only -- confirm against the original
                # indentation.
                if self.indexName == "luceneSplitter":
                    if pNum:
                        analyser = PyLucene.StandardAnalyzer()
                        logging.debug("calling lucene")
                        IndexLine("/tmp/index", analyser, pNum, lineNum,
                                  analyse)
                    continue

                for w in analyse.split(" "):
                    w = w.strip()
                    if not w:
                        continue
                    # Only when a pNum was found (first call of the splitter;
                    # it is always called twice in the pipeline).
                    if pNum:
                        Zope2.app().cdliRoot.storeInLineIndex(
                            self.indexName, w, (pNum, lineNum))
                        transaction.get().commit()
                    result.append(w)
        return result


class graphemeSplitter(cdliSplitter):
    """Splitter using the grapheme exclusion pattern."""
    delete = deleteGraphems
    indexName = "graphemeSplitter"


class wordSplitter(cdliSplitter):
    """Splitter using the word exclusion pattern."""
    delete = deleteWords
    indexName = "wordSplitter"


class luceneSplitter(cdliSplitter):
    """Splitter that hands cleaned lines to Lucene via IndexLine."""
    delete = deleteWords
    indexName = "luceneSplitter"


for _label, _factory in (('CDLI grapheme splitter', graphemeSplitter),
                         ('CDLI word splitter', wordSplitter),
                         ('CDLI lucene splitter', luceneSplitter)):
    try:
        element_factory.registerFactory('Word Splitter', _label, _factory)
    except ValueError:
        # in case the splitter is already registered, ValueError is raised
        pass

if __name__ == '__main__':
    # GBK-encoded sample text, written with escapes so the source stays ASCII.
    a = 'abc def\xce\xd2\xc3\xc7\xb5\xc4\xba\xdc \xba\xc3\xa1\xa3'
    u = unicode(a, 'gbk')
    s = wordSplitter()
    print(s.process([u]))