File:  [Repository] / cdli / cdliSplitter.py
Revision 1.4: download - view: text, annotated - select for diffs - revision graph
Thu Feb 8 12:00:23 2007 UTC (17 years, 3 months ago) by dwinter
Branches: MAIN
CVS tags: HEAD
ASSIGNED - # 10: How to deal with non-printable characters
https://itgroup.mpiwg-berlin.mpg.de:8080/tracs/cdli/ticket/10

replace all BOM

    1: """
    2: CDLI grapheme splitter for ZCTextIndex
    3: """
    4: 
    5: import Zope2
    6: import transaction
    7: 
    8: from Products.ZCTextIndex.ISplitter import ISplitter
    9: from Products.ZCTextIndex.PipelineFactory import element_factory
   10: 
   11: import re
   12: from types import StringType
   13: 
   14: def getSupportedEncoding(encodings):
   15:     for encoding in encodings:
   16:         try:
   17:             unicode('A', encoding)
   18:             return encoding
   19:         except:
   20:             pass
   21:     return 'utf-8'
   22: 
   23: 
   24: 
   25: """beta of a fulltext splitter for cdli
   26: 
   27: """
   28: ignoreLines=['$','@','#','&']
   29: separators=['']
   30: komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted
   31: delete="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems
   32: #delete="<|>|\(|\)|_|\#|,|\||\]|\[|!|?" for words
   33: 
   34: class graphemeSplitter:
   35: 
   36:     default_encoding = "utf-8"
   37:     
   38:     def process(self, lst):
   39:         result = []
   40:         pNum=None
   41:         lineNum=None
   42:        
   43:   
   44:         #print "LLLL",lst
   45:         
   46:       
   47:         for t in lst:
   48:       
   49:          t.replace("\r","\n")
   50:          for s in t.split("\n"):
   51:       
   52:             if type(s) is StringType: # not unicode
   53:                 s = unicode(s, self.default_encoding, 'replace')
   54:             
   55:             #ignore lines
   56: 
   57:             if (s!="") and (s[0]=="&"): # store pNum
   58:                 pNum=s[1:8]
   59: 
   60:             elif (s!="") and (not (s[0] in ignoreLines)):
   61: 
   62:               
   63:                 #ignore everthing bevor "."
   64:                 splitted=s.split(".")
   65:                
   66:                 if len(splitted)==1: #kein punkt
   67:                     txt=splitted[0]
   68:                 else:
   69:                     txt=splitted[1]
   70:                     lineNum=splitted[0] #store line number
   71:                 
   72:                 analyse=txt
   73:                 
   74:                 analyse=re.sub(komma_exception,r"\1",analyse) # delete kommata except kommata relevant in graphems
   75: 
   76:                 analyse=re.sub(delete,' ',analyse) # deletions
   77: 
   78:                 splitted = analyse.split(" ")
   79:                
   80:                 for w in splitted:
   81:                     w=w.lstrip().rstrip()
   82: 
   83:                     if not (w==''):
   84:                         if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline
   85:                             Zope.app().cdliRoot.storeInLineIndex(w.lstrip().strip(),(pNum,lineNum))
   86:                             transaction.get().commit()
   87: 
   88:                         result.append(w.lstrip().rstrip())
   89:         return result
   90: 
   91:  
   92: try:
   93:     element_factory.registerFactory('Word Splitter',
   94:           'CDLI grapheme splitter', graphemeSplitter)
   95: except:
   96:     # in case the splitter is already registered, ValueError is raised
   97:     pass
   98: 
   99: if __name__ == '__main__':
  100:    a = 'abc def我们的很 好。'
  101:    u = unicode(a, 'gbk')
  102:    s = authorSplitter()
  103:    print s.process([u])
  104:    print s.process([u], 1)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>