cdli/cdliSplitter.py - annotate

Return to cdliSplitter.py CVS log
Up to [Repository] / cdli
Annotation of cdli/cdliSplitter.py, revision 1.7.2.7

1.1       dwinter     1: """
1.7.2.2   casties     2: CDLI word and grapheme splitter
1.1       dwinter     3: """
                      4: 
                      5: from Products.ZCTextIndex.PipelineFactory import element_factory
                      6: 
                      7: import re
1.5       dwinter     8: import logging
1.1       dwinter     9: 
                     10: def getSupportedEncoding(encodings):
                     11:     for encoding in encodings:
                     12:         try:
                     13:             unicode('A', encoding)
                     14:             return encoding
                     15:         except:
                     16:             pass
                     17:     return 'utf-8'
                     18: 
                     19: 
                     20: 
                     21: """beta of a fulltext splitter for cdli
                     22: 
                     23: """
1.7       casties    24: ignoreLines=['$','@','#','&','>']
1.1       dwinter    25: separators=['']
1.7.2.2   casties    26: # kommas relevant for graphemes will not be deleted
                     27: komma_exception="([^sStThH])," 
                     28: # grapheme boundaries
1.7.2.6   casties    29: #graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
                     30: graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\""
1.7.2.7 ! casties    31: graphemeIgnore=""
1.7.2.2   casties    32: # for words 
1.7.2.6   casties    33: #wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
1.7.2.7 ! casties    34: wordBounds="_|,|\""
        !            35: wordIgnore="<|>|\#|\||\]|\[|\!|\?"
1.5       dwinter    36:            
                     37: class cdliSplitter:
1.7.2.2   casties    38:     """base class for splitter. 
                     39:     the difference between word and grapheme splitter 
                     40:     is the word boundary list."""
1.5       dwinter    41:     
1.1       dwinter    42:     default_encoding = "utf-8"
1.7.2.2   casties    43:     bounds=graphemeBounds
1.7.2.7 ! casties    44:     ignore=graphemeIgnore
1.5       dwinter    45:     indexName="cdliSplitter"
                     46:     
1.2       dwinter    47:     
1.1       dwinter    48:     def process(self, lst):
1.7.2.2   casties    49:         """gets a list of strings and returns a list of words"""
                     50:         
1.7.2.4   casties    51:         logging.debug("cdliSplitter: %s"%self.indexName) 
1.1       dwinter    52:         result = []
1.2       dwinter    53:         pNum=None
                     54:         lineNum=None
1.5       dwinter    55:     
1.1       dwinter    56:         for t in lst:
1.7.2.2   casties    57:             # normalise line breaks
                     58:             t.replace("\r","\n")
                     59:             # split lines
                     60:             for s in t.split("\n"):
                     61:                 if isinstance(s, str): 
                     62:                     # not unicode
                     63:                     s = unicode(s, self.default_encoding, 'replace')
                     64:          
                     65:                 if (s!=''):
                     66:                     if s[0]=='&': 
                     67:                         # store pNum
                     68:                         pNum=s[1:8]
1.7.2.3   casties    69:                         logging.debug("%s processing: %s"%(self.indexName,pNum))
1.5       dwinter    70:                         
1.7.2.2   casties    71:                     elif not (s[0] in ignoreLines):
                     72:                         # regular line
                     73:                         lineparts=s.split(".")
                     74:                         if len(lineparts)==1: 
                     75:                             # no line number
                     76:                             txt=s
                     77:                         else:
                     78:                             #store line number
                     79:                             txt=lineparts[1]
                     80:                             lineNum=lineparts[0] 
                     81:                             
                     82:                         # delete kommata except kommata relevant for graphemes
                     83:                         txt = re.sub(komma_exception,r"\1",txt)
1.7.2.7 ! casties    84:                         # replace letters to be ignored
        !            85:                         txt = re.sub(self.ignore,'',txt)
1.7.2.2   casties    86:                         # replace word boundaries by spaces
                     87:                         txt = re.sub(self.bounds,' ',txt)
                     88:                         # split words
                     89:                         words = txt.split(" ")
                     90:                         for w in words:
                     91:                             w=w.strip()
                     92:                             if not (w==''):
                     93:                                 result.append(w)
                     94: 
1.7.2.6   casties    95:         logging.debug("split '%s' into %s"%(lst,repr(result)))
1.5       dwinter    96:         return result
1.2       dwinter    97: 
                     98: 
class graphemeSplitter(cdliSplitter):
    """Splitter configured with the grapheme boundary and ignore patterns."""

    indexName = "graphemeSplitter"
    ignore = graphemeIgnore
    bounds = graphemeBounds
                    103:     
                    104: class wordSplitter(cdliSplitter):
1.7.2.2   casties   105:     bounds=wordBounds
1.7.2.7 ! casties   106:     ignore=wordIgnore
1.5       dwinter   107:     indexName="wordSplitter"
                    108:       
                    109: try:
                    110:     element_factory.registerFactory('Word Splitter',
                    111:           'CDLI grapheme splitter', graphemeSplitter)
                    112: except:
                    113:     # in case the splitter is already registered, ValueError is raised
                    114:     pass
1.2       dwinter   115: 
1.5       dwinter   116: try:
                    117:     element_factory.registerFactory('Word Splitter',
                    118:           'CDLI word splitter', wordSplitter)
                    119: except:
                    120:     # in case the splitter is already registered, ValueError is raised
                    121:     pass
1.1       dwinter   122:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>