File:  [Repository] / cdli / indexCDLI.py
Revision 1.1: download - view: text, annotated - select for diffs - revision graph
Wed Mar 21 19:29:23 2007 UTC (17 years, 3 months ago) by dwinter
Branches: MAIN
CVS tags: zcat_only_1, Root_zcat_only_1, HEAD
new indices

    1: #!/usr/bin/env python
    2: 
    3: import sys, os, PyLucene, threading, time
    4: import xmlrpclib
    5: 
    6: from datetime import datetime
    7: 
    8: """
    9: This script is loosely based on the Lucene (java implementation) demo class
   10: org.apache.lucene.demo.IndexFiles.  Instead of walking a directory, it asks
   11: an XML-RPC server for all P-numbers and fetches the text of each one.
   12: Every line of every text is indexed as its own document with the fields
   13: 'name' (the P-number), 'line' (the line number) and 'contents'.  The
   14: resulting Lucene index will be placed in the current directory and called
   15: 'index'.
   16: """
   16: 
   17: class Ticker(object):
   18: 
   19:     def __init__(self):
   20:         self.tick = True
   21: 
   22:     def run(self):
   23:         while self.tick:
   24:             sys.stdout.write('.')
   25:             sys.stdout.flush()
   26:             time.sleep(1.0)
   27: 
   28: class IndexFiles(object):
   29:     """Usage: python IndexFiles"""
   30: 
   31:     def __init__(self, storeDir, analyzer):
   32: 
   33:         if not os.path.exists(storeDir):
   34:             os.mkdir(storeDir)
   35:         store = PyLucene.FSDirectory.getDirectory(storeDir, True)
   36:         writer = PyLucene.IndexWriter(store, analyzer, True)
   37:         writer.setMaxFieldLength(1048576)
   38:         self.indexDocs(writer)
   39:         ticker = Ticker()
   40:         print 'optimizing index',
   41:         threading.Thread(target=ticker.run).start()
   42:         writer.optimize()
   43:         writer.close()
   44:         ticker.tick = False
   45:         print 'done'
   46: 
   47:     def indexDocs(self, writer):
   48:         s=xmlrpclib.Server("http://127.0.0.1:8080/cdliRoot/cdli_main")
   49:         pns=s.getAllPNumbers()
   50:         for pn in pns:
   51: 
   52:             txt=s.getFile(pn)
   53:             i=0
   54:             for line in txt.split("\n"):
   55:                 doc = PyLucene.Document()
   56:                 doc.add(PyLucene.Field("name", pn,
   57:                                        PyLucene.Field.Store.YES,
   58:                                        PyLucene.Field.Index.UN_TOKENIZED))
   59:               
   60:                 doc.add(PyLucene.Field("line", str(i),
   61:                                        PyLucene.Field.Store.YES,
   62:                                        PyLucene.Field.Index.UN_TOKENIZED))
   63:               
   64:                         
   65:                 doc.add(PyLucene.Field("contents", line,
   66:                                        PyLucene.Field.Store.YES,
   67:                                        PyLucene.Field.Index.TOKENIZED))
   68:                 
   69:                 writer.addDocument(doc)
   70:                 i+=1
   71:                 print "    %s"%i
   72:             print "indexed %s"%pn
   73: 
   74: if __name__ == '__main__':
   75:     
   76:     print 'PyLucene', PyLucene.VERSION, 'Lucene', PyLucene.LUCENE_VERSION
   77:     start = datetime.now()
   78:     try:
   79:         IndexFiles("index", PyLucene.StandardAnalyzer())
   80:         end = datetime.now()
   81:         print end - start
   82:     except Exception, e:
   83:         print "Failed: ", e

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>