cdli/cdliSplitter.py - view

File: [Repository] / cdli / cdliSplitter.py
Revision 1.2: download - view: text, annotated - select for diffs - revision graph
Fri Dec 22 11:56:08 2006 UTC (17 years, 6 months ago) by dwinter
Branches: MAIN
CVS tags: HEAD

first version of grapheme indexing

"""CDLI grapheme splitter for ZCTextIndex (beta full-text splitter).

Defines ``graphemeSplitter`` and registers it with the ZCTextIndex pipeline
factory as the 'CDLI grapheme splitter' element of the 'Word Splitter'
group.  This is Python 2 code: it relies on ``unicode``, ``StringType`` and
the ``get_transaction`` builtin.
"""

import re
from types import StringType

import Zope
from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory


def getSupportedEncoding(encodings):
    """Return the first entry of *encodings* that can decode a probe string.

    Each candidate is tried in order by decoding the byte string 'A' with
    it.  If no candidate works, 'utf-8' is returned.
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except (LookupError, ValueError, TypeError):
            # LookupError: unknown codec name.  ValueError (includes
            # UnicodeError): the codec cannot decode the probe.  TypeError:
            # the candidate is not a usable encoding name.
            continue
        return encoding
    return 'utf-8'


# Lines whose first character is one of these are not tokenised.
ignoreLines = ['$', '@', '#', '&']
separators = ['']

# A comma is removed unless it directly follows one of s, S, t, T, h, H
# (original note: commas relevant for graphemes will not be deleted).
komma_exception = r"([^sStThH]),"

# Every match of this pattern is replaced by a blank (grapheme variant).
# NOTE(review): the two bare `$` alternatives are end-of-string anchors, not
# literal characters, so they delete nothing; they may be remnants of escaped
# characters -- confirm the intended character set.
# NOTE(review): ',' is listed here as well, so commas that komma_exception
# keeps are still turned into blanks afterwards -- confirm this is intended.
delete = r"\{|\}|<|>|$|$|-|_|\#|,|\||\]|\[|\!|\?"
# Variant for words:
# delete="<|>|$|$|_|\#|,|\||\]|\[|!|?"


class graphemeSplitter:
    """Splitter that breaks transliteration lines into grapheme tokens."""

    default_encoding = "utf-8"

    def process(self, lst):
        """Split every text in *lst* into tokens and return them as a list.

        Each text is processed line by line:

        * a line starting with '&' sets the current text id to the seven
          characters following the '&' (presumably the CDLI P-number --
          TODO confirm) and yields no tokens;
        * empty lines and lines starting with a character from
          ``ignoreLines`` are skipped;
        * for any other line, everything up to the first '.' is taken as
          the line number and the remainder as the text; a line without a
          '.' is all text and keeps the line number seen last.

        The text is cleaned with ``komma_exception`` and ``delete`` and
        split on blanks.  Byte strings are decoded with
        ``default_encoding`` (undecodable bytes are replaced).

        Side effect: once a text id is known, every token is stored through
        ``Zope.app().cdliRoot.storeInLineIndex(token, (pNum, lineNum))`` and
        the transaction is committed.

        Returns:
            list of unicode tokens in order of appearance.
        """
        result = []
        pNum = None
        lineNum = None
        for t in lst:
            # str.replace returns a new string; the result has to be kept,
            # otherwise bare carriage returns never act as line breaks.
            t = t.replace("\r", "\n")
            for s in t.split("\n"):
                if isinstance(s, StringType):  # not unicode yet
                    s = unicode(s, self.default_encoding, 'replace')

                if s == "":
                    continue
                if s[0] == "&":
                    pNum = s[1:8]  # remember the text id
                    continue
                if s[0] in ignoreLines:
                    continue

                # Ignore everything before the first "."; split only once
                # so that dots inside the text (e.g. "[...]") do not cut
                # the text short.
                parts = s.split(".", 1)
                if len(parts) == 1:
                    # No dot: the whole line is text and lineNum keeps the
                    # value from the last numbered line.
                    txt = parts[0]
                else:
                    lineNum, txt = parts

                # Drop commas except those following s/S/t/T/h/H, then
                # blank out the remaining markup characters.
                analyse = re.sub(komma_exception, r"\1", txt)
                analyse = re.sub(delete, ' ', analyse)

                for w in analyse.split(" "):
                    w = w.strip()
                    if not w:
                        continue
                    if pNum:
                        # Only index once a text id was found (original
                        # note: the splitter is always called twice in the
                        # pipeline, the id is found on the first call).
                        # NOTE(review): Zope.app() is called for every token
                        # and its result is never closed -- verify that
                        # this does not exhaust the connection pool.
                        Zope.app().cdliRoot.storeInLineIndex(
                            w, (pNum, lineNum))
                        get_transaction().commit()
                    result.append(w)
        return result


try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter',
                                    graphemeSplitter)
except ValueError:
    # In case the splitter is already registered, ValueError is raised.
    pass


if __name__ == '__main__':
    # Written with escapes so the module needs no source-encoding
    # declaration.
    u = u'abc def\u6211\u4eec\u7684\u5f88\u597d\u3002'
    # The demo used to instantiate an undefined `authorSplitter` and to call
    # process() with a second argument that process() does not accept.
    s = graphemeSplitter()
    print(s.process([u]))