File:  [Repository] / cdli / cdliSplitter.py
Revision 1.5: download - view: text, annotated - select for diffs - revision graph
Wed Mar 21 19:29:23 2007 UTC (17 years, 2 months ago) by dwinter
Branches: MAIN
CVS tags: HEAD
new indices

"""
CDLI splitters: ZCTextIndex pipeline elements that split CDLI text lines into words or graphemes
"""

import Zope2
import transaction

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

import re
from types import StringType
import logging

import PyLucene

def getSupportedEncoding(encodings):
    """Return the first encoding in *encodings* that can decode text.

    Each candidate is probed by decoding the one-character string 'A'
    with it.  The first candidate for which the probe succeeds is
    returned; if none succeeds (or *encodings* is empty) the fallback
    'utf-8' is returned.
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except Exception:
            # Any failure of the probe means "not usable"; unlike a bare
            # ``except:`` this still lets KeyboardInterrupt and SystemExit
            # propagate.
            continue
        return encoding
    return 'utf-8'



"""beta of a fulltext splitter for cdli

"""
# Lines whose first character is one of these are not tokenized by
# cdliSplitter.process; a leading '&' is additionally used there to pick up pNum.
ignoreLines=['$','@','#','&']
# NOTE(review): not referenced anywhere in the visible part of this file.
separators=['']
komma_exception="([^sStThH])," # a comma preceded by anything other than s, S, t, T, h or H; process() removes such commas, so commas that are relevant for graphemes are kept
deleteGraphems="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # characters replaced by a space when this pattern is the splitter's `delete` (grapheme splitting)
deleteWords="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"# characters replaced by a space when this pattern is the splitter's `delete` (word splitting)

class IndexLine(object):
    """Index a single text line with Lucene.

    Instantiating the class does all the work: it opens an index writer on
    *storeDir*, adds one document built from *name*, *line* and *content*,
    optimizes the index and closes the writer.
    """

    def __init__(self, storeDir, analyzer,name,line,content):
        """Write one document to the Lucene index in *storeDir*.

        storeDir -- directory holding the index; created if missing
        analyzer -- PyLucene analyzer handed to the IndexWriter
        name     -- value stored in the "name" field
        line     -- value stored (as a string) in the "line" field
        content  -- text stored and tokenized in the "contents" field
        """
        # Local import: the file-level imports do not provide ``os``.
        import os

        logging.error("i am here %s %s %s %s %s"%((storeDir, analyzer,name,line,content)))
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        # NOTE(review): both create flags are True, which presumably makes
        # every instantiation start from a fresh index -- confirm that this
        # is intended, since the class is invoked once per text line.
        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        try:
            writer.setMaxFieldLength(1048576)
            self.indexDocs(writer,name,line,content)
            writer.optimize()
        finally:
            # Always close the writer, even when indexing fails.
            writer.close()

    def indexDocs(self, writer,name,line,content):
        """Build the document for one line and add it to *writer*.

        The document has three stored fields: "name" and "line" (both
        untokenized) and "contents" (tokenized).
        """
        doc = PyLucene.Document()
        # The original referenced the undefined names ``pn`` and ``i`` here
        # and stored ``line`` as the contents; use the actual parameters.
        doc.add(PyLucene.Field("name", name,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        doc.add(PyLucene.Field("line", str(line),
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        doc.add(PyLucene.Field("contents", content,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))

        writer.addDocument(doc)
           
class cdliSplitter:
    """Base class for the CDLI splitters.

    The word and grapheme splitters differ only in the exclusion pattern
    (``delete``) they apply and in the ``indexName`` they store under.
    """

    # Encoding used to decode byte strings handed to process().
    default_encoding = "utf-8"
    # Regular expression whose matches are replaced by a space.
    delete=deleteGraphems
    # Name passed to storeInLineIndex; "luceneSplitter" selects the Lucene path.
    indexName="cdliSplitter"

    def process(self, lst):
        """Split the texts in *lst* into tokens and return them as a list.

        Every text is cut into lines.  A line starting with '&' sets the
        current pNum (characters 1 to 7 of the line); lines starting with
        any other character from ``ignoreLines`` are skipped.  For all
        remaining lines the part before the first '.' (if there is one) is
        remembered as the line number, and the rest is cleaned with
        ``komma_exception`` and ``self.delete`` and split on spaces.

        Side effects: while a pNum is known, every token is stored through
        ``cdliRoot.storeInLineIndex`` and the transaction is committed; for
        the "luceneSplitter" index the cleaned line is handed to IndexLine
        instead and contributes no tokens to the result.
        """
        result = []
        pNum = None
        lineNum = None

        for t in lst:
            if isinstance(t, str):  # byte string, not unicode
                t = unicode(t, self.default_encoding, 'replace')

            # str.replace returns a new string; the result has to be kept,
            # otherwise carriage returns are never turned into line breaks.
            t = t.replace("\r", "\n")

            for s in t.split("\n"):
                if s == "":
                    continue

                if s[0] == "&":  # store pNum
                    pNum = s[1:8]
                    logging.debug("storing: %s"%pNum)
                    continue

                if s[0] in ignoreLines:
                    continue

                # Split on the first dot only, so that dots inside the text
                # itself (e.g. "1. [...] x") do not truncate the line.
                parts = s.split(".", 1)
                if len(parts) == 1:  # no dot, hence no line number
                    txt = parts[0]
                else:
                    lineNum, txt = parts  # store line number

                # delete commas except those relevant in graphemes
                analyse = re.sub(komma_exception, r"\1", txt)
                analyse = re.sub(self.delete, ' ', analyse)  # deletions

                if self.indexName == "luceneSplitter":
                    if pNum:
                        analyser = PyLucene.StandardAnalyzer()
                        logging.error("calling lucene")
                        IndexLine("/tmp/index", analyser, pNum, lineNum, analyse)
                    continue

                for w in analyse.split(" "):
                    w = w.strip()
                    if w == '':
                        continue

                    # Only store when a pNum was found (per the original
                    # note, the splitter is called twice in the pipeline).
                    if pNum:
                        # NOTE(review): Zope2.app() and a commit per token
                        # look expensive -- confirm before batching them.
                        Zope2.app().cdliRoot.storeInLineIndex(self.indexName, w, (pNum, lineNum))
                        transaction.get().commit()

                    result.append(w)
        return result


class graphemeSplitter(cdliSplitter):
    """Splitter that applies the deleteGraphems pattern and stores under "graphemeSplitter"."""
    delete=deleteGraphems
    indexName="graphemeSplitter"
    
class wordSplitter(cdliSplitter):
    """Splitter that applies the deleteWords pattern and stores under "wordSplitter"."""
    delete=deleteWords
    indexName="wordSplitter"

class luceneSplitter(cdliSplitter):
    """Splitter that applies the deleteWords pattern; its indexName makes
    cdliSplitter.process hand cleaned lines to IndexLine."""
    delete=deleteWords
    indexName="luceneSplitter"
    
      
# Register the three splitters in the ZCTextIndex 'Word Splitter' group.
# registerFactory raises ValueError when the name is already registered
# (for example when this module is loaded a second time); only that case
# is ignored, so any other failure is no longer silently swallowed.
try:
    element_factory.registerFactory('Word Splitter',
          'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # the splitter is already registered
    pass

try:
    element_factory.registerFactory('Word Splitter',
          'CDLI word splitter', wordSplitter)
except ValueError:
    # the splitter is already registered
    pass

try:
    element_factory.registerFactory('Word Splitter',
          'CDLI lucene splitter', luceneSplitter)
except ValueError:
    # the splitter is already registered
    pass
if __name__ == '__main__':
    # Small manual smoke test.  The sample contains no '&' line, so no pNum
    # is set and process() only returns the tokens without storing anything.
    # The text is written with unicode escapes so it does not depend on the
    # encoding of this source file.
    u = u'abc def\u6211\u4eec\u7684\u5f88 \u597d\u3002'
    # The original instantiated ``authorSplitter``, which this module does
    # not define, and called process() with an extra argument it does not
    # accept.
    s = wordSplitter()
    print(s.process([u]))

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>