File:  [Repository] / cdli / cdliSplitter.py
Revision 1.9: download - view: text, annotated - select for diffs - revision graph
Thu Sep 25 12:37:55 2008 UTC (15 years, 7 months ago) by dwinter
Branches: MAIN
CVS tags: HEAD
erste version für neues basket management

"""
CDLI word and grapheme splitter
"""

from Products.ZCTextIndex.PipelineFactory import element_factory

import re
import logging

def getSupportedEncoding(encodings):
    """Return the first entry of *encodings* that can decode ASCII text.

    Each candidate is probed by decoding the single byte ``A`` with it.
    A candidate is skipped when the probe fails for any reason (unknown
    codec name, codec unable to decode the probe, value that is not a
    codec name at all).

    :param encodings: iterable of candidate encoding names, in priority
        order.
    :returns: the first candidate that passes the probe, or ``'utf-8'``
        when none does (including when *encodings* is empty).
    """
    # A byte string on every Python version; decoding it is equivalent to
    # the former ``unicode('A', encoding)`` call without depending on the
    # Python-2-only ``unicode`` builtin.
    probe = 'A'.encode('ascii')
    for encoding in encodings:
        try:
            probe.decode(encoding)
        except Exception:
            # Best-effort probe: any codec failure just means "try the next
            # one".  Unlike a bare ``except:`` this lets KeyboardInterrupt
            # and SystemExit propagate.
            continue
        return encoding
    return 'utf-8'



"""beta of a fulltext splitter for cdli

"""
# Lines whose first character is one of these are not tokenised.
ignoreLines = ['$', '@', '#', '&', '>']
separators = ['']

# Commas are deleted from the text unless they directly follow one of the
# letters s, t or h (either case); those commas are relevant for graphemes.
komma_exception = r"([^sStThH]),"
komma_exceptionex = re.compile(komma_exception)

# All patterns below are raw strings so that the regex escapes are not
# (invalid) Python string escapes; the resulting values are unchanged.

# Grapheme splitter: every match of graphemeBounds is replaced by a space,
# every match of graphemeIgnore is removed.
# (An earlier revision also treated "(", ")" and "|" as boundaries.)
graphemeBounds = r'\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|"'
# NOTE(review): the alternative \?\* only matches the literal two-character
# sequence "?*"; possibly \?|\* was intended -- confirm before changing,
# since that would alter the tokens that get indexed.
graphemeIgnore = r"<|>|\#|\||\]|\[|\!|\?\*|;"

# Word splitter: same scheme with a much smaller boundary set.
# (An earlier revision also treated "<", ">", "(", ")", "#", "|", "]", "[",
# "!" and "?" as boundaries.)
wordBounds = r'_|,|"'
wordIgnore = r"<|>|\#|\||\]|\[|\!|\?\*|;"
           
class cdliSplitter:

    """Base class for the CDLI splitters.

    Subclasses configure the splitter through class attributes: the
    boundary pattern (matches become token separators), the ignore
    pattern (matches are removed) and the name used in log messages.
    """

    # encoding used to decode byte-string input lines
    default_encoding = "utf-8"
    # boundary pattern: source string and compiled form
    bounds=graphemeBounds
    boundsex=re.compile(graphemeBounds)
    # ignore pattern: source string and compiled form
    ignore=graphemeIgnore
    ignorex=re.compile(graphemeIgnore)
    # name written to the debug log
    indexName="cdliSplitter"

    def process(self, lst):
        """Split a list of strings into a flat list of tokens.

        Every string is broken into lines.  Empty lines and lines whose
        first character is listed in ``ignoreLines`` produce no tokens.
        For all other lines a leading line number (everything up to the
        first ". ") is dropped, commas not following s/t/h are deleted,
        matches of ``boundsex`` are replaced by a space, matches of
        ``ignorex`` are removed, and the remainder is split on spaces.

        :param lst: list of strings (byte strings are decoded with
            ``default_encoding``, undecodable bytes being replaced).
        :returns: list of non-empty, whitespace-stripped tokens in input
            order.
        """
        logging.debug("cdliSplitter: %s", self.indexName)
        result = []

        for t in lst:
            # Normalise line breaks.  str.replace returns a new string, so
            # the result has to be rebound (the original call discarded it,
            # leaving CR-only input as one single line).  "\r\n" becomes an
            # extra empty line, which is skipped below.
            t = t.replace("\r", "\n")
            for s in t.split("\n"):
                if isinstance(s, str):
                    # not unicode
                    s = unicode(s, self.default_encoding, 'replace')

                if s == '':
                    continue

                if s[0] == '&':
                    # Header line: characters 1..7 are logged as the pNum
                    # (presumably the text's P-number -- TODO confirm).
                    logging.debug("%s processing: %s", self.indexName, s[1:8])
                    continue

                if s[0] in ignoreLines:
                    continue

                # Regular line: drop the line number if there is one.  With
                # no ". " in the line the single part is the whole line.
                txt = s.split(". ", 1)[-1]

                # delete commas except those relevant for graphemes
                txt = komma_exceptionex.sub(r"\1", txt)
                # replace word boundaries by spaces
                txt = self.boundsex.sub(' ', txt)
                # remove characters to be ignored
                txt = self.ignorex.sub('', txt)

                # Split on single spaces only, so that other whitespace
                # inside a token is kept exactly as before.
                for w in txt.split(" "):
                    w = w.strip()
                    if w:
                        result.append(w)

        return result


class graphemeSplitter(cdliSplitter):
    """Splitter configured with the grapheme boundary and ignore patterns."""

    indexName = "graphemeSplitter"

    # pattern sources first, then their compiled counterparts
    bounds, ignore = graphemeBounds, graphemeIgnore
    boundsex = re.compile(bounds)
    ignorex = re.compile(ignore)
    
class wordSplitter(cdliSplitter):
    """Splitter configured with the word boundary and ignore patterns."""

    indexName = "wordSplitter"

    # pattern sources first, then their compiled counterparts
    bounds, ignore = wordBounds, wordIgnore
    boundsex = re.compile(bounds)
    ignorex = re.compile(ignore)
      
# Register both splitters with the ZCTextIndex element factory under the
# 'Word Splitter' group.  Only the "already registered" case (ValueError) is
# tolerated; any other failure is a real error and must not be hidden.
try:
    element_factory.registerFactory('Word Splitter',
          'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

try:
    element_factory.registerFactory('Word Splitter',
          'CDLI word splitter', wordSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>