"""
Author splitter
"""
import Zope
from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory
import re
from types import StringType
def getSupportedEncoding(encodings):
    """Return the first usable encoding from *encodings*.

    Each candidate is probed by decoding the one-character string 'A' with
    it; the first candidate for which that succeeds is returned.  If no
    candidate succeeds, or *encodings* is empty, 'utf-8' is returned.
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except Exception:
            # Best-effort probe: any failure just means "try the next one".
            # Catching Exception instead of using a bare except keeps
            # KeyboardInterrupt and SystemExit from being swallowed.
            continue
        return encoding
    return 'utf-8'
"""beta of a fulltext splitter for cdli
"""
# A line whose first character is one of these is not split into graphemes.
ignoreLines = ['$', '@', '#', '&']

separators = ['']

# Matches a comma together with the character before it, unless that
# character is one of s, S, t, T, h, H.  Substituting r"\1" removes the comma
# and keeps the preceding character, i.e. commas that belong to a grapheme
# (after s/t/h) are left in place by this pattern.
# NOTE(review): `delete` below also matches ',', so when both substitutions
# are applied in sequence the commas kept here are still replaced by a
# space -- confirm that this is intended.
komma_exception = r"([^sStThH]),"

# Characters that are replaced when splitting into graphemes.  Raw string so
# the backslashes reach the regex engine without relying on Python leaving
# unrecognised escape sequences untouched.
delete = r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"

# Alternative for whole words (keeps braces and hyphens):
# delete = r"<|>|\(|\)|_|\#|,|\||\]|\[|!|\?"
class graphemeSplitter:
    """Pipeline element that splits transliteration text into graphemes.

    Input is handled line by line:

    * a line starting with '&' sets the current text id (``pNum``) to
      characters 1..7 of that line;
    * any other line starting with a character from ``ignoreLines`` is
      skipped;
    * on every remaining non-empty line, the part before the first '.' is
      taken as the line number and the rest is split into graphemes.
    """

    # Encoding used to decode byte strings handed to process().
    default_encoding = "utf-8"

    def process(self, lst):
        """Split every string in *lst* into grapheme tokens.

        Byte strings are decoded with ``default_encoding`` (undecodable
        bytes are replaced).  Once a '&' header line has been seen, every
        token is also passed to
        ``cdliRoot.storeInLineIndex(token, (pNum, lineNum))`` and the
        current transaction is committed.

        Returns the list of tokens in input order.
        """
        result = []
        pNum = None
        lineNum = None
        app = None  # application root, looked up on first use
        for t in lst:
            # Decode once per input string, before splitting into lines, so
            # line breaks are recognised on characters rather than on bytes.
            if isinstance(t, str):  # byte string, not unicode
                t = unicode(t, self.default_encoding, 'replace')
            # Strings are immutable: the result of replace() must be kept,
            # otherwise "\r" line endings are never normalised.
            t = t.replace("\r", "\n")
            for s in t.split("\n"):
                if not s:
                    continue
                if s[0] == "&":
                    # Header line: remember the text id.
                    pNum = s[1:8]
                    continue
                if s[0] in ignoreLines:
                    continue

                # Everything before the first "." is the line number.
                # Splitting only once keeps the rest of the line intact even
                # when it contains further "." characters.
                parts = s.split(".", 1)
                if len(parts) == 1:
                    # No "." at all: keep the previously seen line number.
                    txt = parts[0]
                else:
                    lineNum, txt = parts

                # Drop commas that are not part of a grapheme, then blank out
                # the remaining markup characters.
                analyse = re.sub(komma_exception, r"\1", txt)
                analyse = re.sub(delete, ' ', analyse)

                for w in analyse.split(" "):
                    w = w.strip()
                    if not w:
                        continue
                    # Only index once a pNum header has been seen.  Original
                    # author's note: the splitter is always called twice in
                    # the pipeline, and this applies to the first call.
                    if pNum:
                        if app is None:
                            # Look the application root up once per call
                            # rather than once per token.
                            app = Zope.app()
                        app.cdliRoot.storeInLineIndex(w, (pNum, lineNum))
                        get_transaction().commit()
                    result.append(w)
        return result
# Register the splitter with ZCTextIndex under the 'Word Splitter' group.
try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # registerFactory raises ValueError when the splitter is already
    # registered; that is harmless here.  Any other error is a real problem
    # and is allowed to propagate.
    pass
if __name__ == '__main__':
    # Ad-hoc smoke test.  The sample contains no '&' header line, so
    # process() never reaches the storeInLineIndex call.
    #
    # The sample text is written with unicode escapes so that it does not
    # depend on the encoding this file happens to be saved in.
    u = u'abc def\u6211\u4eec\u7684\u5f88 \u597d\u3002'
    s = graphemeSplitter()
    # process() takes a single list argument.
    print(s.process([u]))
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>