"""
Author splitter
"""
from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory
import re
from types import StringType
def getSupportedEncoding(encodings):
    """Return the first encoding in *encodings* that can decode ``'A'``.

    Each candidate is probed by decoding the one-character string ``'A'``
    with it; the first candidate for which the probe succeeds is returned.

    :param encodings: iterable of encoding names, tried in order.
    :returns: the first usable encoding name, or ``'utf-8'`` when no
        candidate works (including when *encodings* is empty).
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except Exception:
            # Best-effort probe: an unknown or unusable codec just means
            # "try the next one".  Catching Exception instead of using a
            # bare ``except`` lets KeyboardInterrupt / SystemExit propagate
            # (on Python 2.5 and later).
            continue
        return encoding
    return 'utf-8'
"""beta of a fulltext splitter for cdli
"""
ignoreLines=['$','@','#','&']
separators=['']
delete="{|}|<|>|\(|\)|-|_|\#|,|\~|\||\]|\["
class graphemeSplitter:
    """Splitter that turns text lines into a flat list of word tokens.

    Relies on the module-level ``ignoreLines`` (first characters that mark
    a line to be skipped) and ``delete`` (regular expression whose matches
    are replaced by a space before splitting).
    """

    # Encoding used to decode byte strings handed to process().
    default_encoding = "utf-8"

    def process(self, lst):
        """Split every text in *lst* into words.

        For each text: line endings are normalised to ``"\\n"``, the text
        is cut into lines, empty lines and lines whose first character is
        in ``ignoreLines`` are skipped, everything up to and including the
        first ``"."`` of a line is dropped, every match of ``delete`` is
        replaced by a space, and the remainder is split on spaces.

        :param lst: iterable of strings (byte strings are decoded with
            ``default_encoding``, undecodable bytes being replaced).
        :returns: list of the non-empty, whitespace-stripped words, in
            input order.
        """
        # Compile once instead of once per line.
        deletions = re.compile(delete)
        result = []
        for t in lst:
            # Strings are immutable: the result of replace() has to be
            # assigned, otherwise "\r" line endings are never normalised.
            t = t.replace("\r", "\n")
            for s in t.split("\n"):
                if isinstance(s, StringType):  # byte string, not unicode
                    s = unicode(s, self.default_encoding, 'replace')
                # skip empty lines and ignored lines
                if not s or s[0] in ignoreLines:
                    continue
                # Ignore everything before the first "."; a line without a
                # dot is used as a whole.  maxsplit=1 keeps any text after
                # later dots (e.g. "[...]") instead of discarding it.
                txt = s.split(".", 1)[-1]
                txt = deletions.sub(' ', txt)  # deletions
                for w in txt.split(" "):
                    w = w.strip()
                    if w:
                        result.append(w)
        return result
# Make the splitter available to ZCTextIndex lexicons: once in the
# 'Word Splitter' group and once in a group of its own.
element_factory.registerFactory('Word Splitter',
                                'CDLI grapheme splitter', graphemeSplitter)
try:
    element_factory.registerFactory('graphemeSplitter',
                                    'CDLI grapheme splitter',
                                    graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised;
    # any other error is a real problem and must not be hidden
    pass
if __name__ == '__main__':
    # Small manual smoke test: split one mixed ASCII/CJK sample string.
    # The sample is spelled with \u escapes so the source stays pure ASCII
    # and does not depend on the encoding the file is saved in.
    u = u'abc def\u6211\u4eec\u7684\u5f88 \u597d\u3002'
    # The class defined in this module is graphemeSplitter, and its
    # process() method takes exactly one argument (the list of texts).
    s = graphemeSplitter()
    print(s.process([u]))
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>