Annotation of cdli/indexCDLI.py, revision 1.1
1.1 ! dwinter 1: #!/usr/bin/env python
! 2:
! 3: import sys, os, PyLucene, threading, time
! 4: import xmlrpclib
! 5:
! 6: from datetime import datetime
! 7:
"""
This script is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.IndexFiles. Instead of walking a directory, it asks an
XML-RPC server for all CDLI P-numbers, fetches the text belonging to each
P-number and indexes every line of that text as a separate document with the
fields 'name' (the P-number), 'line' (the line index within the text) and
'contents' (the text of the line). The resulting Lucene index will be placed
in the current directory and called 'index'.
"""
! 16:
class Ticker(object):
    """Progress indicator that prints one dot per second while enabled.

    ``run`` loops until the ``tick`` attribute becomes false, so it is
    suitable as a thread target that another thread stops by clearing
    ``tick``.
    """

    def __init__(self):
        # Polled by run() at the top of every iteration.
        self.tick = True

    def run(self):
        """Write '.' to stdout, flush, sleep one second; repeat while ticking."""
        while True:
            if not self.tick:
                return
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)
! 27:
class IndexFiles(object):
    """Build a Lucene index from texts served by an XML-RPC server.

    Usage: python IndexFiles

    Constructing an instance does all the work: it opens an index writer on
    the store directory, indexes every text line by line, optimizes the index
    and closes the writer.
    """

    # XML-RPC endpoint queried for the P-numbers and for each text.
    SERVER_URL = "http://127.0.0.1:8080/cdliRoot/cdli_main"

    def __init__(self, storeDir, analyzer):
        """Create, fill, optimize and close the index in *storeDir*.

        storeDir -- directory that receives the index; created when missing.
        analyzer -- PyLucene analyzer passed on to the IndexWriter.

        Any exception raised while indexing or optimizing propagates to the
        caller, but the writer is closed and the progress ticker is stopped
        first.
        """
        if not os.path.exists(storeDir):
            # makedirs also copes with a nested store path.
            os.makedirs(storeDir)

        # NOTE(review): the two True arguments look like Lucene's "create"
        # flags (start from an empty index) -- confirm against the PyLucene
        # version in use.
        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        ticker = Ticker()
        try:
            writer.setMaxFieldLength(1048576)
            self.indexDocs(writer)
            # No newline: the ticker's dots and the final 'done' continue
            # on the same output line.
            sys.stdout.write('optimizing index')
            threading.Thread(target=ticker.run).start()
            writer.optimize()
        finally:
            # Always release the writer, and always stop the ticker: its
            # thread is not a daemon, so leaving it running would keep the
            # process alive printing dots after a failure.
            try:
                writer.close()
            finally:
                ticker.tick = False
        print('done')

    def indexDocs(self, writer):
        """Add one document per line of every text to *writer*.

        Each document stores three fields: 'name' (the P-number, untokenized),
        'line' (the zero-based line index as a string, untokenized) and
        'contents' (the line itself, tokenized).
        """
        server = xmlrpclib.Server(self.SERVER_URL)
        store_yes = PyLucene.Field.Store.YES
        as_keyword = PyLucene.Field.Index.UN_TOKENIZED
        as_text = PyLucene.Field.Index.TOKENIZED

        for pn in server.getAllPNumbers():
            lines = server.getFile(pn).split("\n")
            for lineNo, line in enumerate(lines):
                doc = PyLucene.Document()
                doc.add(PyLucene.Field("name", pn, store_yes, as_keyword))
                doc.add(PyLucene.Field("line", str(lineNo),
                                       store_yes, as_keyword))
                doc.add(PyLucene.Field("contents", line,
                                       store_yes, as_text))
                writer.addDocument(doc)

            # NOTE(review): the indentation of the original was lost; the
            # line-count report is assumed to be printed once per text, after
            # the per-line loop -- confirm against the repository copy.
            print(" %s" % (len(lines),))
            print("indexed %s" % (pn,))
! 73:
if __name__ == '__main__':

    # Report the library versions first so a failing run still shows them.
    print('PyLucene %s Lucene %s' % (PyLucene.VERSION,
                                     PyLucene.LUCENE_VERSION))
    start = datetime.now()
    try:
        IndexFiles("index", PyLucene.StandardAnalyzer())
    except Exception:
        # sys.exc_info() is used instead of binding the exception in the
        # except clause so this line parses on every Python version.
        print("Failed:  %s" % (sys.exc_info()[1],))
        # Signal the failure to the calling shell instead of exiting with 0.
        sys.exit(1)
    else:
        # Elapsed wall-clock time of the whole indexing run.
        print(datetime.now() - start)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>