1: #!/usr/bin/env python
2:
3: import sys, os, PyLucene, threading, time
4: import xmlrpclib
5:
6: from datetime import datetime
7:
8: """
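This class is loosely based on the Lucene (Java implementation) demo class
org.apache.lucene.demo.IndexFiles. Unlike that demo, which takes a directory
as an argument and indexes all of the files in that directory and downward
recursively (on file path, file name and file contents), this script fetches
every text from the XML-RPC server at http://127.0.0.1:8080/cdliRoot/cdli_main
and indexes each line of each text as a separate document with the fields
'name' (the text's identifier), 'line' (the line's position within the text)
and 'contents' (the text of the line). The resulting Lucene index will be
placed in the current directory and called 'index'.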
15: """
16:
class Ticker(object):
    """Console progress indicator that prints one dot per second.

    ``run`` loops for as long as the ``tick`` attribute is true, so it can
    be used as a thread target and stopped from another thread by setting
    ``tick`` to False.
    """

    def __init__(self):
        # Flag polled by run() before every dot; clear it to end the loop.
        self.tick = True

    def run(self):
        """Write '.' to stdout, flush, and sleep one second until stopped."""
        while True:
            if not self.tick:
                return
            sys.stdout.write('.')
            # Flush so the dot shows up immediately instead of sitting in
            # the stdout buffer.
            sys.stdout.flush()
            time.sleep(1.0)
27:
class IndexFiles(object):
    """Build a Lucene index from the texts served by an XML-RPC server.

    Usage: python IndexFiles

    Constructing an instance does all of the work: the index directory is
    created if it is missing, every line of every text returned by the
    server is added as its own document, and the index is then optimized
    and closed.
    """

    # XML-RPC endpoint queried by indexDocs() unless the constructor is
    # given a different ``serverUrl``.
    serverUrl = "http://127.0.0.1:8080/cdliRoot/cdli_main"

    def __init__(self, storeDir, analyzer, serverUrl=None):
        """Create the index in ``storeDir`` and fill it.

        storeDir  -- directory that receives the index; created when absent.
        analyzer  -- PyLucene analyzer handed to the IndexWriter.
        serverUrl -- optional XML-RPC endpoint; defaults to the class-level
                     ``serverUrl``.

        Any exception raised while indexing or optimizing propagates to the
        caller, but the writer is still closed and the progress ticker is
        still stopped first.
        """
        if serverUrl is not None:
            self.serverUrl = serverUrl

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        # NOTE(review): the two True arguments look like Lucene's "create"
        # flags, i.e. an existing index in storeDir would be replaced --
        # confirm against the PyLucene/Lucene API documentation.
        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)

        ticker = None
        try:
            writer.setMaxFieldLength(1048576)
            self.indexDocs(writer)
            ticker = Ticker()
            sys.stdout.write('optimizing index')
            threading.Thread(target=ticker.run).start()
            writer.optimize()
        finally:
            # Always release the writer, even when indexing or optimizing
            # failed, so the index is not left open.
            try:
                writer.close()
            finally:
                # The ticker runs in a non-daemon thread: if its flag were
                # left set after an error it would print dots forever and
                # keep the process from exiting.
                if ticker is not None:
                    ticker.tick = False
        print('done')

    def indexDocs(self, writer):
        """Add one document per text line to ``writer``.

        Every identifier returned by the server's ``getAllPNumbers()`` is
        fetched with ``getFile()`` and split on newlines. Each line becomes
        a document with three stored fields:

        name     -- the text's identifier (not tokenized)
        line     -- the zero-based position of the line, as a string
                    (not tokenized)
        contents -- the line itself (tokenized)

        Progress is printed to stdout: the running count of lines for the
        current text, then "indexed <identifier>" once the text is complete.
        """
        Field = PyLucene.Field
        stored = Field.Store.YES
        server = xmlrpclib.Server(self.serverUrl)
        for pn in server.getAllPNumbers():
            txt = server.getFile(pn)
            for lineNo, line in enumerate(txt.split("\n")):
                doc = PyLucene.Document()
                doc.add(Field("name", pn, stored, Field.Index.UN_TOKENIZED))
                doc.add(Field("line", str(lineNo), stored,
                              Field.Index.UN_TOKENIZED))
                doc.add(Field("contents", line, stored,
                              Field.Index.TOKENIZED))
                writer.addDocument(doc)
                # The progress counter is one-based (lines indexed so far),
                # unlike the zero-based value stored in the "line" field.
                print(" %s" % (lineNo + 1))
            print("indexed %s" % pn)
73:
if __name__ == '__main__':
    # Script entry point: build the index in ./index with the standard
    # analyzer and report how long it took.
    import traceback

    print('PyLucene %s Lucene %s' % (PyLucene.VERSION,
                                     PyLucene.LUCENE_VERSION))
    start = datetime.now()
    try:
        IndexFiles("index", PyLucene.StandardAnalyzer())
    except Exception:
        err = sys.exc_info()[1]
        # Keep the summary line on stdout, but also show where the failure
        # happened and exit non-zero so callers can detect that indexing
        # did not complete.
        print("Failed:  %s" % (err,))
        traceback.print_exc()
        sys.exit(1)
    # Only reached on success: elapsed wall-clock time of the whole run.
    print(datetime.now() - start)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>