File:  [Repository] / cdli / cdliSplitter.py
Revision 1.6: download - view: text, annotated - select for diffs - revision graph
Mon Apr 23 13:07:10 2007 UTC (17 years, 2 months ago) by dwinter
Branches: MAIN
CVS tags: HEAD
minorCVS: ----------------------------------------------------------------------

"""
CDLI text splitters (grapheme, word and Lucene variants) for ZCTextIndex
"""
    4: 
    5: import Zope2
    6: import transaction
    7: 
    8: from Products.ZCTextIndex.ISplitter import ISplitter
    9: from Products.ZCTextIndex.PipelineFactory import element_factory
   10: 
   11: import re
   12: from types import StringType
   13: import logging
   14: try:
   15: 	import PyLucene
   16: except:
   17: 	print "no Lucene support"
   18: 
   19: def getSupportedEncoding(encodings):
   20:     for encoding in encodings:
   21:         try:
   22:             unicode('A', encoding)
   23:             return encoding
   24:         except:
   25:             pass
   26:     return 'utf-8'
   27: 
   28: 
   29: 
   30: """beta of a fulltext splitter for cdli
   31: 
   32: """
   33: ignoreLines=['$','@','#','&']
   34: separators=['']
   35: komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted
   36: deleteGraphems="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems
   37: deleteWords="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"# for words
   38: 
   39: class IndexLine(object):
   40:     """index a line with lucene"""
   41: 
   42:     def __init__(self, storeDir, analyzer,name,line,content):
   43:         logging.error("i am here %s %s %s %s %s"%((storeDir, analyzer,name,line,content)))
   44:         if not os.path.exists(storeDir):
   45:             os.mkdir(storeDir)
   46:         store = PyLucene.FSDirectory.getDirectory(storeDir, True)
   47:         writer = PyLucene.IndexWriter(store, analyzer, True)
   48:         writer.setMaxFieldLength(1048576)
   49:         self.indexDocs(writer,name,line,content)  
   50:         writer.optimize()
   51:         writer.close()
   52:       
   53:     def indexDocs(self, writer,name,line,content):
   54:        
   55:         doc = PyLucene.Document()
   56:         doc.add(PyLucene.Field("name", pn,
   57:                                PyLucene.Field.Store.YES,
   58:                                PyLucene.Field.Index.UN_TOKENIZED))
   59:       
   60:         doc.add(PyLucene.Field("line", str(i),
   61:                                PyLucene.Field.Store.YES,
   62:                                PyLucene.Field.Index.UN_TOKENIZED))
   63:       
   64:                 
   65:         doc.add(PyLucene.Field("contents", line,
   66:                                PyLucene.Field.Store.YES,
   67:                                PyLucene.Field.Index.TOKENIZED))
   68:         
   69:         writer.addDocument(doc)
   70:            
   71: class cdliSplitter:
   72:     """basis class for splitter, 
   73:     der Unterschied zwischen Word und Graphemesplitter 
   74:     ist lediglich die unterschiedliche Auschliengsliste"""
   75:     
   76:     default_encoding = "utf-8"
   77:     delete=deleteGraphems
   78:     indexName="cdliSplitter"
   79:     
   80:     
   81:     def process(self, lst):
   82:         result = []
   83:         pNum=None
   84:         lineNum=None
   85:     
   86:         for t in lst:
   87:       
   88:          t.replace("\r","\n")
   89:          for s in t.split("\n"):
   90:       
   91:             if type(s) is StringType: # not unicode
   92:                 s = unicode(s, self.default_encoding, 'replace')
   93:      
   94:             if (s!="") and (s[0]=="&"): # store pNum
   95:                 pNum=s[1:8]
   96:                 logging.debug("storing: %s"%pNum)    
   97:             elif (s!="") and (not (s[0] in ignoreLines)):
   98:                 splitted=s.split(".")
   99:                
  100:                 if len(splitted)==1: #kein punkt
  101:                     txt=splitted[0]
  102:                 else:
  103:                     txt=splitted[1]
  104:                     lineNum=splitted[0] #store line number
  105:                 
  106:                 analyse=txt      
  107:                 analyse=re.sub(komma_exception,r"\1",analyse) # delete kommata except kommata relevant in graphems
  108:                 analyse=re.sub(self.delete,' ',analyse) # deletions
  109:                 
  110:                 if self.indexName=="luceneSplitter":
  111:                     if pNum:
  112:                         analyser=PyLucene.StandardAnalyzer()
  113:                         logging.error("calling lucene")
  114:                         
  115:                         IndexLine("/tmp/index",analyser,pNum,lineNum,analyse)
  116:                 else:
  117:                     splitted = analyse.split(" ")
  118:                    
  119:                    
  120:                     for w in splitted:
  121:                         w=w.lstrip().rstrip()
  122:     
  123:                         if not (w==''):
  124:                             if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline
  125:     
  126:                                 Zope2.app().cdliRoot.storeInLineIndex(self.indexName,w.lstrip().strip(),(pNum,lineNum))
  127:                                 transaction.get().commit()
  128:     
  129:                             result.append(w.lstrip().rstrip())
  130:         return result
  131: 
  132: 
  133: class graphemeSplitter(cdliSplitter):
  134:     delete=deleteGraphems
  135:     indexName="graphemeSplitter"
  136:     
  137: class wordSplitter(cdliSplitter):
  138:     delete=deleteWords
  139:     indexName="wordSplitter"
  140: 
  141: class luceneSplitter(cdliSplitter):
  142:     delete=deleteWords
  143:     indexName="luceneSplitter"
  144:     
  145:       
  146: try:
  147:     element_factory.registerFactory('Word Splitter',
  148:           'CDLI grapheme splitter', graphemeSplitter)
  149: except:
  150:     # in case the splitter is already registered, ValueError is raised
  151:     pass
  152: 
  153: try:
  154:     element_factory.registerFactory('Word Splitter',
  155:           'CDLI word splitter', wordSplitter)
  156: except:
  157:     # in case the splitter is already registered, ValueError is raised
  158:     pass
  159: 
  160: try:
  161:     element_factory.registerFactory('Word Splitter',
  162:           'CDLI lucene splitter', luceneSplitter)
  163: except:
  164:     # in case the splitter is already registered, ValueError is raised
  165:     pass
  166: if __name__ == '__main__':
  167:    a = 'abc def我们的很 好。'
  168:    u = unicode(a, 'gbk')
  169:    s = authorSplitter()
  170:    print s.process([u])
  171:    print s.process([u], 1)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>