cdli/cdliSplitter.py - view

Return to cdliSplitter.py CVS log

Up to [Repository] / cdli

File: [Repository] / cdli / cdliSplitter.py
Revision 1.2: download - view: text, annotated - select for diffs - revision graph
Fri Dec 22 11:56:08 2006 UTC (17 years, 6 months ago) by dwinter
Branches: MAIN
CVS tags: HEAD

first version of grapheme indexing

    1: """
    2: CDLI grapheme splitter for ZCTextIndex
    3: """
    4: 
    5: import Zope
    6: 
    7: from Products.ZCTextIndex.ISplitter import ISplitter
    8: from Products.ZCTextIndex.PipelineFactory import element_factory
    9: 
   10: import re
   11: from types import StringType
   12: 
   13: def getSupportedEncoding(encodings):
   14:     for encoding in encodings:
   15:         try:
   16:             unicode('A', encoding)
   17:             return encoding
   18:         except:
   19:             pass
   20:     return 'utf-8'
   21: 
   22: 
   23: 
   24: """beta of a fulltext splitter for cdli
   25: 
   26: """
   27: ignoreLines=['$','@','#','&']
   28: separators=['']
   29: komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted
   30: delete="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems
   31: #delete="<|>|\(|\)|_|\#|,|\||\]|\[|!|?" for words
   32: 
   33: class graphemeSplitter:
   34: 
   35:     default_encoding = "utf-8"
   36:     
   37:     def process(self, lst):
   38:         result = []
   39:         pNum=None
   40:         lineNum=None
   41:        
   42:   
   43:         #print "LLLL",lst
   44:         
   45:       
   46:         for t in lst:
   47:       
   48:          t.replace("\r","\n")
   49:          for s in t.split("\n"):
   50:       
   51:             if type(s) is StringType: # not unicode
   52:                 s = unicode(s, self.default_encoding, 'replace')
   53:             
   54:             #ignore lines
   55: 
   56:             if (s!="") and (s[0]=="&"): # store pNum
   57:                 pNum=s[1:8]
   58: 
   59:             elif (s!="") and (not (s[0] in ignoreLines)):
   60: 
   61:               
   62:                 #ignore everthing bevor "."
   63:                 splitted=s.split(".")
   64:                
   65:                 if len(splitted)==1: #kein punkt
   66:                     txt=splitted[0]
   67:                 else:
   68:                     txt=splitted[1]
   69:                     lineNum=splitted[0] #store line number
   70:                 
   71:                 analyse=txt
   72:                 
   73:                 analyse=re.sub(komma_exception,r"\1",analyse) # delete kommata except kommata relevant in graphems
   74: 
   75:                 analyse=re.sub(delete,' ',analyse) # deletions
   76: 
   77:                 splitted = analyse.split(" ")
   78:                
   79:                 for w in splitted:
   80:                     w=w.lstrip().rstrip()
   81: 
   82:                     if not (w==''):
   83:                         if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline
   84:                             Zope.app().cdliRoot.storeInLineIndex(w.lstrip().strip(),(pNum,lineNum))
   85:                             get_transaction().commit()
   86: 
   87:                         result.append(w.lstrip().rstrip())
   88:         return result
   89: 
   90:  
   91: try:
   92:     element_factory.registerFactory('Word Splitter',
   93:           'CDLI grapheme splitter', graphemeSplitter)
   94: except:
   95:     # in case the splitter is already registered, ValueError is raised
   96:     pass
   97: 
   98: if __name__ == '__main__':
   99:    a = 'abc def我们的很 好。'
  100:    u = unicode(a, 'gbk')
  101:    s = authorSplitter()
  102:    print s.process([u])
  103:    print s.process([u], 1)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>