cdli/cdliSplitter.py - annotate

Return to cdliSplitter.py CVS log
Up to [Repository] / cdli
Annotation of cdli/cdliSplitter.py, revision 1.2

1.1       dwinter     1: """
                      2: Author splitter
                      3: """
                      4: 
1.2     ! dwinter     5: import Zope
        !             6: 
1.1       dwinter     7: from Products.ZCTextIndex.ISplitter import ISplitter
                      8: from Products.ZCTextIndex.PipelineFactory import element_factory
                      9: 
                     10: import re
                     11: from types import StringType
                     12: 
                     13: def getSupportedEncoding(encodings):
                     14:     for encoding in encodings:
                     15:         try:
                     16:             unicode('A', encoding)
                     17:             return encoding
                     18:         except:
                     19:             pass
                     20:     return 'utf-8'
                     21: 
                     22: 
                     23: 
                     24: """beta of a fulltext splitter for cdli
                     25: 
                     26: """
# A line whose first character is one of these is not tokenised by
# graphemeSplitter.process ("&" lines are handled separately there: they
# set the current pNum).
ignoreLines=['$','@','#','&']
# NOTE(review): not referenced anywhere in the code visible here.
separators=['']
# Matches a comma together with the character before it, unless that
# character is one of s, S, t, T, h, H.  process() substitutes the match
# with group 1, i.e. it removes such commas outright.
komma_exception="([^sStThH])," # comma relevant for graphemes will not be deleted
# Every match of this alternation is replaced by a single space in
# process() before the text is split into tokens.
# NOTE(review): the alternation still contains ",", so the commas that
# komma_exception deliberately keeps (after s/S/t/T/h/H) are turned into
# spaces by this pattern afterwards -- confirm whether that is intended.
delete="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphemes
# Disabled alternative pattern, kept by the original author for splitting
# into words instead of graphemes:
#delete="<|>|\(|\)|_|\#|,|\||\]|\[|!|?" for words
1.1       dwinter    32: 
                     33: class graphemeSplitter:
                     34: 
                     35:     default_encoding = "utf-8"
1.2     ! dwinter    36:     
1.1       dwinter    37:     def process(self, lst):
                     38:         result = []
1.2     ! dwinter    39:         pNum=None
        !            40:         lineNum=None
1.1       dwinter    41:        
1.2     ! dwinter    42:   
        !            43:         #print "LLLL",lst
        !            44:         
        !            45:       
1.1       dwinter    46:         for t in lst:
1.2     ! dwinter    47:       
1.1       dwinter    48:          t.replace("\r","\n")
                     49:          for s in t.split("\n"):
1.2     ! dwinter    50:       
1.1       dwinter    51:             if type(s) is StringType: # not unicode
                     52:                 s = unicode(s, self.default_encoding, 'replace')
                     53:             
                     54:             #ignore lines
1.2     ! dwinter    55: 
        !            56:             if (s!="") and (s[0]=="&"): # store pNum
        !            57:                 pNum=s[1:8]
        !            58: 
        !            59:             elif (s!="") and (not (s[0] in ignoreLines)):
        !            60: 
1.1       dwinter    61:               
                     62:                 #ignore everthing bevor "."
                     63:                 splitted=s.split(".")
1.2     ! dwinter    64:                
1.1       dwinter    65:                 if len(splitted)==1: #kein punkt
                     66:                     txt=splitted[0]
                     67:                 else:
                     68:                     txt=splitted[1]
1.2     ! dwinter    69:                     lineNum=splitted[0] #store line number
1.1       dwinter    70:                 
                     71:                 analyse=txt
1.2     ! dwinter    72:                 
        !            73:                 analyse=re.sub(komma_exception,r"\1",analyse) # delete kommata except kommata relevant in graphems
        !            74: 
1.1       dwinter    75:                 analyse=re.sub(delete,' ',analyse) # deletions
1.2     ! dwinter    76: 
1.1       dwinter    77:                 splitted = analyse.split(" ")
                     78:                
                     79:                 for w in splitted:
                     80:                     w=w.lstrip().rstrip()
1.2     ! dwinter    81: 
1.1       dwinter    82:                     if not (w==''):
1.2     ! dwinter    83:                         if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline
        !            84:                             Zope.app().cdliRoot.storeInLineIndex(w.lstrip().strip(),(pNum,lineNum))
        !            85:                             get_transaction().commit()
        !            86: 
1.1       dwinter    87:                         result.append(w.lstrip().rstrip())
                     88:         return result
                     89: 
                     90:  
                     91: try:
1.2     ! dwinter    92:     element_factory.registerFactory('Word Splitter',
1.1       dwinter    93:           'CDLI grapheme splitter', graphemeSplitter)
                     94: except:
                     95:     # in case the splitter is already registered, ValueError is raised
                     96:     pass
                     97: 
                     98: if __name__ == '__main__':
                     99:    a = 'abc def我们的很 好。'
                    100:    u = unicode(a, 'gbk')
                    101:    s = authorSplitter()
                    102:    print s.process([u])
                    103:    print s.process([u], 1)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>