File:  [Repository] / cdli / cdliSplitter.py
Revision 1.7.2.1: download - view: text, annotated - select for diffs - revision graph
Sat Oct 6 13:44:46 2007 UTC (16 years, 8 months ago) by casties
Branches: zcat_only_1
Diff to: branchpoint 1.7: preferred, unified
CLOSED - # 35: uploading a new version of a CDLIFile indexes the last version
https://itgroup.mpiwg-berlin.mpg.de:8080/tracs/cdli/ticket/35

"""
CDLI text splitters (grapheme, word and Lucene variants) for ZCTextIndex
"""
    4: 
    5: import Zope2
    6: import transaction
    7: 
    8: from Products.ZCTextIndex.ISplitter import ISplitter
    9: from Products.ZCTextIndex.PipelineFactory import element_factory
   10: 
   11: import re
   12: from types import StringType
   13: import logging
   14: try:
   15: 	import PyLucene
   16: except:
   17: 	print "no Lucene support"
   18: 
   19: def getSupportedEncoding(encodings):
   20:     for encoding in encodings:
   21:         try:
   22:             unicode('A', encoding)
   23:             return encoding
   24:         except:
   25:             pass
   26:     return 'utf-8'
   27: 
   28: 
   29: 
   30: """beta of a fulltext splitter for cdli
   31: 
   32: """
   33: ignoreLines=['$','@','#','&','>']
   34: separators=['']
   35: komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted
   36: deleteGraphems="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems
   37: deleteWords="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"# for words
   38: 
   39: class IndexLine(object):
   40:     """index a line with lucene"""
   41: 
   42:     def __init__(self, storeDir, analyzer,name,line,content):
   43:         logging.error("i am here %s %s %s %s %s"%((storeDir, analyzer,name,line,content)))
   44:         if not os.path.exists(storeDir):
   45:             os.mkdir(storeDir)
   46:         store = PyLucene.FSDirectory.getDirectory(storeDir, True)
   47:         writer = PyLucene.IndexWriter(store, analyzer, True)
   48:         writer.setMaxFieldLength(1048576)
   49:         self.indexDocs(writer,name,line,content)  
   50:         writer.optimize()
   51:         writer.close()
   52:       
   53:     def indexDocs(self, writer,name,line,content):
   54:        
   55:         doc = PyLucene.Document()
   56:         doc.add(PyLucene.Field("name", pn,
   57:                                PyLucene.Field.Store.YES,
   58:                                PyLucene.Field.Index.UN_TOKENIZED))
   59:       
   60:         doc.add(PyLucene.Field("line", str(i),
   61:                                PyLucene.Field.Store.YES,
   62:                                PyLucene.Field.Index.UN_TOKENIZED))
   63:       
   64:                 
   65:         doc.add(PyLucene.Field("contents", line,
   66:                                PyLucene.Field.Store.YES,
   67:                                PyLucene.Field.Index.TOKENIZED))
   68:         
   69:         writer.addDocument(doc)
   70:            
   71: class cdliSplitter:
   72:     """basis class for splitter, 
   73:     der Unterschied zwischen Word und Graphemesplitter 
   74:     ist lediglich die unterschiedliche Auschliengsliste"""
   75:     
   76:     default_encoding = "utf-8"
   77:     delete=deleteGraphems
   78:     indexName="cdliSplitter"
   79:     
   80:     
   81:     def process(self, lst):
   82:         logging.debug("cdliSplitter") 
   83:         result = []
   84:         pNum=None
   85:         lineNum=None
   86:     
   87:         for t in lst:
   88:       
   89:          t.replace("\r","\n")
   90:          for s in t.split("\n"):
   91:       
   92:             if type(s) is StringType: # not unicode
   93:                 s = unicode(s, self.default_encoding, 'replace')
   94:      
   95:             if (s!="") and (s[0]=="&"): # store pNum
   96:                 pNum=s[1:8]
   97:                 logging.debug("cdliSplitter processing: %s"%pNum)
   98:                     
   99:             elif (s!="") and (not (s[0] in ignoreLines)):
  100:                 splitted=s.split(".")
  101:                
  102:                 if len(splitted)==1: #kein punkt
  103:                     txt=splitted[0]
  104:                 else:
  105:                     txt=splitted[1]
  106:                     lineNum=splitted[0] #store line number
  107:                 
  108:                 analyse=txt      
  109:                 analyse=re.sub(komma_exception,r"\1",analyse) # delete kommata except kommata relevant in graphems
  110:                 analyse=re.sub(self.delete,' ',analyse) # deletions
  111:                 
  112:                 if self.indexName=="luceneSplitter":
  113:                     if pNum:
  114:                         analyser=PyLucene.StandardAnalyzer()
  115:                         logging.error("calling lucene")
  116:                         
  117:                         IndexLine("/tmp/index",analyser,pNum,lineNum,analyse)
  118:                 else:
  119:                     splitted = analyse.split(" ")
  120:                     for w in splitted:
  121:                         w=w.lstrip().rstrip()
  122:     
  123:                         if not (w==''):
  124:                             #if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline
  125:                             #    Zope2.app().cdliRoot.storeInLineIndex(self.indexName,w.lstrip().strip(),(pNum,lineNum))
  126:                             #    transaction.get().commit()
  127:     
  128:                             result.append(w)
  129:         return result
  130: 
  131: 
  132: class graphemeSplitter(cdliSplitter):
  133:     delete=deleteGraphems
  134:     indexName="graphemeSplitter"
  135:     
  136: class wordSplitter(cdliSplitter):
  137:     delete=deleteWords
  138:     indexName="wordSplitter"
  139: 
  140: class luceneSplitter(cdliSplitter):
  141:     delete=deleteWords
  142:     indexName="luceneSplitter"
  143:     
  144:       
  145: try:
  146:     element_factory.registerFactory('Word Splitter',
  147:           'CDLI grapheme splitter', graphemeSplitter)
  148: except:
  149:     # in case the splitter is already registered, ValueError is raised
  150:     pass
  151: 
  152: try:
  153:     element_factory.registerFactory('Word Splitter',
  154:           'CDLI word splitter', wordSplitter)
  155: except:
  156:     # in case the splitter is already registered, ValueError is raised
  157:     pass
  158: 
  159: try:
  160:     element_factory.registerFactory('Word Splitter',
  161:           'CDLI lucene splitter', luceneSplitter)
  162: except:
  163:     # in case the splitter is already registered, ValueError is raised
  164:     pass
  165: if __name__ == '__main__':
  166:    a = 'abc def我们的很 好。'
  167:    u = unicode(a, 'gbk')
  168:    s = authorSplitter()
  169:    print s.process([u])
  170:    print s.process([u], 1)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>