File:  [Repository] / cdli / cdliSplitter.py
Revision 1.5: download - view: text, annotated - select for diffs - revision graph
Wed Mar 21 19:29:23 2007 UTC (17 years, 3 months ago) by dwinter
Branches: MAIN
CVS tags: HEAD
new indices

    1: """
    2: Author splitter
    3: """
    4: 
    5: import Zope2
    6: import transaction
    7: 
    8: from Products.ZCTextIndex.ISplitter import ISplitter
    9: from Products.ZCTextIndex.PipelineFactory import element_factory
   10: 
   11: import re
   12: from types import StringType
   13: import logging
   14: 
   15: import PyLucene
   16: 
   17: def getSupportedEncoding(encodings):
   18:     for encoding in encodings:
   19:         try:
   20:             unicode('A', encoding)
   21:             return encoding
   22:         except:
   23:             pass
   24:     return 'utf-8'
   25: 
   26: 
   27: 
   28: """beta of a fulltext splitter for cdli
   29: 
   30: """
   31: ignoreLines=['$','@','#','&']
   32: separators=['']
   33: komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted
   34: deleteGraphems="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems
   35: deleteWords="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"# for words
   36: 
   37: class IndexLine(object):
   38:     """index a line with lucene"""
   39: 
   40:     def __init__(self, storeDir, analyzer,name,line,content):
   41:         logging.error("i am here %s %s %s %s %s"%((storeDir, analyzer,name,line,content)))
   42:         if not os.path.exists(storeDir):
   43:             os.mkdir(storeDir)
   44:         store = PyLucene.FSDirectory.getDirectory(storeDir, True)
   45:         writer = PyLucene.IndexWriter(store, analyzer, True)
   46:         writer.setMaxFieldLength(1048576)
   47:         self.indexDocs(writer,name,line,content)  
   48:         writer.optimize()
   49:         writer.close()
   50:       
   51:     def indexDocs(self, writer,name,line,content):
   52:        
   53:         doc = PyLucene.Document()
   54:         doc.add(PyLucene.Field("name", pn,
   55:                                PyLucene.Field.Store.YES,
   56:                                PyLucene.Field.Index.UN_TOKENIZED))
   57:       
   58:         doc.add(PyLucene.Field("line", str(i),
   59:                                PyLucene.Field.Store.YES,
   60:                                PyLucene.Field.Index.UN_TOKENIZED))
   61:       
   62:                 
   63:         doc.add(PyLucene.Field("contents", line,
   64:                                PyLucene.Field.Store.YES,
   65:                                PyLucene.Field.Index.TOKENIZED))
   66:         
   67:         writer.addDocument(doc)
   68:            
   69: class cdliSplitter:
   70:     """basis class for splitter, 
   71:     der Unterschied zwischen Word und Graphemesplitter 
   72:     ist lediglich die unterschiedliche Auschliengsliste"""
   73:     
   74:     default_encoding = "utf-8"
   75:     delete=deleteGraphems
   76:     indexName="cdliSplitter"
   77:     
   78:     
   79:     def process(self, lst):
   80:         result = []
   81:         pNum=None
   82:         lineNum=None
   83:     
   84:         for t in lst:
   85:       
   86:          t.replace("\r","\n")
   87:          for s in t.split("\n"):
   88:       
   89:             if type(s) is StringType: # not unicode
   90:                 s = unicode(s, self.default_encoding, 'replace')
   91:      
   92:             if (s!="") and (s[0]=="&"): # store pNum
   93:                 pNum=s[1:8]
   94:                 logging.debug("storing: %s"%pNum)    
   95:             elif (s!="") and (not (s[0] in ignoreLines)):
   96:                 splitted=s.split(".")
   97:                
   98:                 if len(splitted)==1: #kein punkt
   99:                     txt=splitted[0]
  100:                 else:
  101:                     txt=splitted[1]
  102:                     lineNum=splitted[0] #store line number
  103:                 
  104:                 analyse=txt      
  105:                 analyse=re.sub(komma_exception,r"\1",analyse) # delete kommata except kommata relevant in graphems
  106:                 analyse=re.sub(self.delete,' ',analyse) # deletions
  107:                 
  108:                 if self.indexName=="luceneSplitter":
  109:                     if pNum:
  110:                         analyser=PyLucene.StandardAnalyzer()
  111:                         logging.error("calling lucene")
  112:                         
  113:                         IndexLine("/tmp/index",analyser,pNum,lineNum,analyse)
  114:                 else:
  115:                     splitted = analyse.split(" ")
  116:                    
  117:                    
  118:                     for w in splitted:
  119:                         w=w.lstrip().rstrip()
  120:     
  121:                         if not (w==''):
  122:                             if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline
  123:     
  124:                                 Zope2.app().cdliRoot.storeInLineIndex(self.indexName,w.lstrip().strip(),(pNum,lineNum))
  125:                                 transaction.get().commit()
  126:     
  127:                             result.append(w.lstrip().rstrip())
  128:         return result
  129: 
  130: 
  131: class graphemeSplitter(cdliSplitter):
  132:     delete=deleteGraphems
  133:     indexName="graphemeSplitter"
  134:     
  135: class wordSplitter(cdliSplitter):
  136:     delete=deleteWords
  137:     indexName="wordSplitter"
  138: 
  139: class luceneSplitter(cdliSplitter):
  140:     delete=deleteWords
  141:     indexName="luceneSplitter"
  142:     
  143:       
  144: try:
  145:     element_factory.registerFactory('Word Splitter',
  146:           'CDLI grapheme splitter', graphemeSplitter)
  147: except:
  148:     # in case the splitter is already registered, ValueError is raised
  149:     pass
  150: 
  151: try:
  152:     element_factory.registerFactory('Word Splitter',
  153:           'CDLI word splitter', wordSplitter)
  154: except:
  155:     # in case the splitter is already registered, ValueError is raised
  156:     pass
  157: 
  158: try:
  159:     element_factory.registerFactory('Word Splitter',
  160:           'CDLI lucene splitter', luceneSplitter)
  161: except:
  162:     # in case the splitter is already registered, ValueError is raised
  163:     pass
  164: if __name__ == '__main__':
  165:    a = 'abc def我们的很 好。'
  166:    u = unicode(a, 'gbk')
  167:    s = authorSplitter()
  168:    print s.process([u])
  169:    print s.process([u], 1)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>