# File:  [Repository] / cdli / cdliSplitter.py
# Revision 1.2: download - view: text, annotated - select for diffs - revision graph
# Fri Dec 22 11:56:08 2006 UTC (17 years, 4 months ago) by dwinter
# Branches: MAIN
# CVS tags: HEAD
# first version of grapheme indexing

"""
CDLI grapheme splitter for ZCTextIndex
"""

import Zope

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

import re
from types import StringType

def getSupportedEncoding(encodings):
    """Return the first usable encoding name from *encodings*.

    Each candidate is probed by decoding the one-character string 'A' with
    it; the first candidate for which that succeeds is returned.

    encodings -- iterable of candidate encoding names, in order of preference.

    Returns the first candidate that passed the probe, or 'utf-8' when none
    did (including when *encodings* is empty).
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except Exception:
            # Best-effort probe: any failure just means this candidate is
            # unusable, so move on to the next one.  Unlike the former bare
            # ``except:`` this no longer swallows KeyboardInterrupt or
            # SystemExit on Python 2.5 and later.
            continue
        return encoding
    return 'utf-8'



"""beta of a fulltext splitter for cdli

"""
# First characters that mark a line as non-text: graphemeSplitter.process()
# skips lines starting with one of these ('&' lines are examined first, to
# pick up the pNum, and are never tokenised).
ignoreLines=['$','@','#','&']
# Not referenced anywhere in the visible part of this module.
separators=['']
# Matches a comma preceded by any character other than s, S, t, T, h or H.
# process() replaces each match with the preceding character alone, i.e. it
# removes the comma.
komma_exception="([^sStThH])," # commas relevant for graphemes are not matched, so this step keeps them
# Alternation of single characters that process() replaces with a space.
# NOTE(review): ',' is part of this alternation, so the commas spared by
# komma_exception are still turned into spaces by the following substitution
# -- confirm whether that is intended.
delete="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphemes
# Disabled variant of the pattern, labelled by the author as the one for words:
#delete="<|>|\(|\)|_|\#|,|\||\]|\[|!|?" for words

class graphemeSplitter:
    """Splitter that breaks transliteration text into grapheme tokens.

    The class is registered below with the ZCTextIndex ``element_factory``
    under the name 'CDLI grapheme splitter'.  Besides returning the tokens,
    process() also records every token in a line index kept on
    ``cdliRoot`` (see process() for details).
    """

    # Encoding used to decode byte strings handed to process().
    default_encoding = "utf-8"

    def process(self, lst):
        """Split the text chunks in *lst* into a flat list of graphemes.

        lst -- sequence of text chunks; each chunk is either unicode text or
               a byte string encoded in ``default_encoding`` (undecodable
               bytes are replaced rather than raising).

        Every chunk is cut into lines and each line is handled as follows:

        * empty lines are skipped;
        * a line starting with '&' sets the current pNum to characters 1-7
          of that line and is not tokenised;
        * lines starting with another character from ``ignoreLines`` are
          skipped;
        * for any other line, the part before the first '.' is remembered as
          the current line number and only the remainder is tokenised (a
          line without '.' is tokenised as a whole and leaves the line
          number unchanged);
        * ``komma_exception`` and ``delete`` are applied to the text, which
          is then split on single spaces.

        pNum and the line number carry over from line to line and from chunk
        to chunk within one call.

        Returns the list of non-empty, whitespace-stripped unicode tokens in
        order of appearance.

        Side effect: once a pNum has been seen, every token is passed to
        ``cdliRoot.storeInLineIndex(token, (pNum, lineNum))`` and the
        transaction is committed.
        """
        # ``unicode`` on Python 2; spelled this way so the check does not
        # depend on the name of the text type.
        text_type = type(u"")

        result = []
        pNum = None
        lineNum = None

        for t in lst:
            # Decode once per chunk, before any splitting, so the line and
            # token handling below only ever sees text.
            if not isinstance(t, text_type):
                t = t.decode(self.default_encoding, 'replace')

            # Strings are immutable: the result has to be rebound, otherwise
            # carriage returns are never turned into line breaks.
            t = t.replace("\r", "\n")

            for s in t.split("\n"):
                if s == "":
                    continue

                if s[0] == "&":  # store pNum
                    pNum = s[1:8]
                    continue

                if s[0] in ignoreLines:
                    continue

                # Ignore everything before the first "." only; splitting at
                # most once keeps text that itself contains periods intact.
                splitted = s.split(".", 1)
                if len(splitted) == 1:  # no period in this line
                    txt = splitted[0]
                else:
                    lineNum, txt = splitted  # store line number

                # Delete commas except the ones relevant in graphemes.
                analyse = re.sub(komma_exception, r"\1", txt)
                # Replace all remaining markup characters by a blank.
                analyse = re.sub(delete, ' ', analyse)

                for w in analyse.split(" "):
                    w = w.strip()
                    if w == '':
                        continue

                    # Original author's note: only when a pNum is found
                    # (first call of the splitter; it is always called twice
                    # in the pipeline).
                    if pNum:
                        # NOTE(review): this opens the application and
                        # commits once per token, which looks expensive;
                        # left unchanged because the transactional contract
                        # of storeInLineIndex is not visible here.
                        Zope.app().cdliRoot.storeInLineIndex(w, (pNum, lineNum))
                        get_transaction().commit()

                    result.append(w)
        return result

 
# Make the splitter selectable as a 'Word Splitter' pipeline element.
try:
    element_factory.registerFactory('Word Splitter',
          'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # In case the splitter is already registered, ValueError is raised;
    # that is harmless.  Any other failure is no longer silenced.
    pass

if __name__ == '__main__':
    # Manual smoke test.  The previous version instantiated the undefined
    # name ``authorSplitter`` and called process() with an extra argument it
    # does not accept; the sample is now plain-ASCII transliteration-style
    # text instead of a non-ASCII literal decoded as GBK.  It contains no
    # '&' line, so nothing is written to the line index.
    sample = '1. a-na be-li2\n2. {d}en-lil2'
    splitter = graphemeSplitter()
    print(splitter.process([sample]))

# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>