Annotation of cdli/cdliSplitter.py, revision 1.1
1.1 ! dwinter 1: """
! 2: CDLI grapheme splitter for ZCTextIndex
! 3: """
! 4:
! 5: from Products.ZCTextIndex.ISplitter import ISplitter
! 6: from Products.ZCTextIndex.PipelineFactory import element_factory
! 7:
! 8: import re
! 9: from types import StringType
! 10:
def getSupportedEncoding(encodings):
    """Return the first encoding in *encodings* that decodes a probe byte.

    Each candidate is tried in order by decoding the single ASCII byte
    ``A`` with it; the first candidate for which that succeeds is returned.

    Args:
        encodings: iterable of candidate encoding names, in order of
            preference.

    Returns:
        The first candidate that works, or ``'utf-8'`` when none does
        (including when *encodings* is empty).
    """
    # 'A'.encode('ascii') yields a byte string on every Python version.
    probe = 'A'.encode('ascii')
    for encoding in encodings:
        try:
            probe.decode(encoding)
        except (LookupError, TypeError, ValueError):
            # LookupError: unknown codec name.
            # ValueError: includes UnicodeError (probe not decodable).
            # TypeError: candidate is not usable as a text codec name.
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            continue
        return encoding
    return 'utf-8'
! 19:
! 20:
! 21:
# Beta of a full-text splitter for CDLI.

# A line whose first character is listed here is skipped by
# graphemeSplitter.process.
ignoreLines = ['$', '@', '#', '&']
# Not referenced by the code visible in this module.
separators = ['']
# Regex alternation of the characters that graphemeSplitter.process replaces
# with a blank before splitting a line into tokens.  Raw string: the
# backslashes are regex escapes, not Python string escapes.
delete = r"{|}|<|>|\(|\)|-|_|\#|,|\~|\||\]|\["
! 28:
class graphemeSplitter:
    """Splitter that breaks text lines into whitespace-separated tokens.

    Relies on the module-level ``ignoreLines`` (first characters that mark a
    line to skip) and ``delete`` (regex of characters replaced by a blank).
    """

    # Encoding used to decode byte-string input.
    default_encoding = "utf-8"

    def process(self, lst):
        """Split every text in *lst* into a flat list of tokens.

        For each text: byte strings are decoded with ``default_encoding``
        (undecodable bytes are replaced), the text is cut into lines, and
        each line is handled as follows:

        * empty lines and lines whose first character is in ``ignoreLines``
          are skipped;
        * everything up to and including the first "." is dropped (a line
          without "." is kept whole);
        * every match of ``delete`` is replaced by a blank;
        * the remainder is split on blanks and empty pieces are discarded.

        Args:
            lst: iterable of byte strings or unicode strings.

        Returns:
            A list of unicode tokens, in input order.
        """
        text_type = type(u'')  # unicode on Python 2, str on Python 3
        result = []

        for t in lst:
            if not isinstance(t, text_type):  # byte string, not unicode
                t = t.decode(self.default_encoding, 'replace')

            # replace() returns a new string; its result has to be used,
            # otherwise "\r" line endings are never turned into line breaks.
            for s in t.replace("\r", "\n").split("\n"):
                # ignore empty lines and lines flagged by their first char
                if not s or s[0] in ignoreLines:
                    continue

                # ignore everything before the first "."; maxsplit=1 keeps
                # the whole remainder even when it contains further periods
                txt = s.split(".", 1)[-1]

                analyse = re.sub(delete, ' ', txt)  # deletions

                for w in analyse.split(" "):
                    w = w.strip()
                    if w:
                        result.append(w)
        return result
! 68:
# Register the splitter with the ZCTextIndex element factory under both
# group names.  Each registration is guarded on its own so that one group
# being taken already does not prevent the other from being registered.
for _group in ('Word Splitter', 'graphemeSplitter'):
    try:
        element_factory.registerFactory(_group,
                                        'CDLI grapheme splitter',
                                        graphemeSplitter)
    except ValueError:
        # in case the splitter is already registered, ValueError is raised
        pass
del _group  # do not leave the loop variable in the module namespace
! 78:
if __name__ == '__main__':
    # Manual smoke test: run the splitter over a small ASCII sample and
    # print the resulting token list.  The sample contains lines that start
    # with characters from ignoreLines, a "N." prefix on the text lines and
    # characters matched by the delete pattern.
    sample = u"&P000001\n@obverse\n1. a-na {d}utu\n$ broken\n2. be-li2 [x]"
    s = graphemeSplitter()
    # process() takes the list of texts as its only argument.
    print(s.process([sample]))
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>