cdli/cdliSplitter.py - annotate

Return to cdliSplitter.py CVS log
Up to [Repository] / cdli
Annotation of cdli/cdliSplitter.py, revision 1.1

"""
CDLI grapheme splitter for ZCTextIndex
"""
        !             4: 
        !             5: from Products.ZCTextIndex.ISplitter import ISplitter
        !             6: from Products.ZCTextIndex.PipelineFactory import element_factory
        !             7: 
        !             8: import re
        !             9: from types import StringType
        !            10: 
        !            11: def getSupportedEncoding(encodings):
        !            12:     for encoding in encodings:
        !            13:         try:
        !            14:             unicode('A', encoding)
        !            15:             return encoding
        !            16:         except:
        !            17:             pass
        !            18:     return 'utf-8'
        !            19: 
        !            20: 
        !            21: 
        !            22: """beta of a fulltext splitter for cdli
        !            23: 
        !            24: """
        !            25: ignoreLines=['$','@','#','&']
        !            26: separators=['']
        !            27: delete="{|}|<|>|\(|\)|-|_|\#|,|\~|\||\]|\["
        !            28: 
        !            29: class graphemeSplitter:
        !            30: 
        !            31:     default_encoding = "utf-8"
        !            32: 
        !            33:     def process(self, lst):
        !            34:         result = []
        !            35:        
        !            36:         for t in lst:
        !            37: 
        !            38:          t.replace("\r","\n")
        !            39:          for s in t.split("\n"):
        !            40:        
        !            41:             if type(s) is StringType: # not unicode
        !            42:                 s = unicode(s, self.default_encoding, 'replace')
        !            43:             
        !            44:             #ignore lines
        !            45:             
        !            46:             if (s!="") and (not (s[0] in ignoreLines)):
        !            47:               
        !            48:                 #ignore everthing bevor "."
        !            49:                 splitted=s.split(".")
        !            50:                 
        !            51:                 if len(splitted)==1: #kein punkt
        !            52:                     txt=splitted[0]
        !            53:                 else:
        !            54:                     txt=splitted[1]
        !            55:                 
        !            56:                 analyse=txt
        !            57:     
        !            58:                 analyse=re.sub(delete,' ',analyse) # deletions
        !            59:                 
        !            60:                 splitted = analyse.split(" ")
        !            61:                
        !            62:                 for w in splitted:
        !            63:                     w=w.lstrip().rstrip()
        !            64:                     if not (w==''):
        !            65:                         print repr(w)
        !            66:                         result.append(w.lstrip().rstrip())
        !            67:         return result
        !            68: 
        !            69: element_factory.registerFactory('Word Splitter',
        !            70:           'CDLI grapheme splitter', graphemeSplitter)
        !            71:  
        !            72: try:
        !            73:     element_factory.registerFactory('graphemeSplitter',
        !            74:           'CDLI grapheme splitter', graphemeSplitter)
        !            75: except:
        !            76:     # in case the splitter is already registered, ValueError is raised
        !            77:     pass
        !            78: 
        !            79: if __name__ == '__main__':
        !            80:    a = 'abc def我们的很 好。'
        !            81:    u = unicode(a, 'gbk')
        !            82:    s = authorSplitter()
        !            83:    print s.process([u])
        !            84:    print s.process([u], 1)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>