# CDLI splitter pipeline elements for Products.ZCTextIndex.
# (Reconstructed from a corrupted CVS merge of revisions 1.4, 2007/02/08
# and 1.5, 2007/03/21; the original header also referenced
# Products.ZCTextIndex.PipelineFactory -- see the import block below.)
import codecs
import logging
import os
import re
from types import StringType

import PyLucene
from Products.ZCTextIndex.PipelineFactory import element_factory
|
def getSupportedEncoding(encodings):
    """Return the first encoding from *encodings* known to this Python.

    Falls back to ``"utf-8"`` when none of the candidates is available
    (or when *encodings* is empty).

    NOTE(review): the body of this function was destroyed by the merge;
    only the loop header survived.  Reconstructed with ``codecs.lookup``
    -- confirm against the repository history.
    """
    for encoding in encodings:
        try:
            codecs.lookup(encoding)
        except LookupError:
            # candidate codec is unknown, try the next one
            continue
        return encoding
    return "utf-8"
# Lines starting with one of these characters are ignored by the splitters.
ignoreLines = ['$', '@', '#', '&']

separators = ['']

# Commas preceded by s/S/t/T/h/H are relevant for graphemes and are kept;
# every other comma is removed via this capture-group substitution.
komma_exception = "([^sStThH]),"

# Characters to strip when splitting graphemes.
deleteGraphems = r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"

# Characters to strip when splitting words (keeps '-', '{', '}').
deleteWords = r"<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
|
class IndexLine(object):
    """Index a single text line with (Py)Lucene.

    Instantiating this class opens (creating it if necessary) the index
    directory *storeDir*, writes one document with the fields ``name``,
    ``line`` and ``contents``, optimizes and closes the index again.
    """

    def __init__(self, storeDir, analyzer, name, line, content):
        logging.error("i am here %s %s %s %s %s"%((storeDir, analyzer,name,line,content)))

        # Create the index directory on first use.
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        writer.setMaxFieldLength(1048576)
        self.indexDocs(writer, name, line, content)
        writer.optimize()
        writer.close()

    def indexDocs(self, writer, name, line, content):
        """Add one document with name/line/contents fields to *writer*.

        Fixed: the merged source referenced the undefined names ``pn``
        and ``i``; the parameters ``name`` and ``line`` are used instead,
        and the ``contents`` field now stores ``content`` (the analysed
        text) rather than the line value -- the ``content`` parameter was
        otherwise unused.
        """
        doc = PyLucene.Document()

        # Document id (pNum) -- stored verbatim, not tokenized.
        doc.add(PyLucene.Field("name", name,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        # Line number -- stored verbatim, not tokenized.
        doc.add(PyLucene.Field("line", str(line),
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        # Analysed line text -- tokenized for full-text search.
        doc.add(PyLucene.Field("contents", content,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))

        writer.addDocument(doc)
|
class cdliSplitter:
    """Base class for the CDLI splitters.

    The word and the grapheme splitter differ only in their exclusion
    list (``delete``) and their index name.  (Docstring translated from
    German.)
    """

    # Encoding assumed for non-unicode input strings.
    default_encoding = "utf-8"
    # Regex of characters removed before splitting; overridden in subclasses.
    delete = deleteGraphems
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split the texts in *lst* into index terms and return them.

        Side effects: stores every term in the CDLI line index via
        ``Zope2.app().cdliRoot`` (committing a transaction per term), or,
        for the lucene splitter, indexes whole lines with PyLucene.

        NOTE(review): this body was reconstructed from a corrupted merge;
        in particular the inner ``for s in t.split("\\n")`` loop header was
        lost in the diff -- confirm against the repository history.
        """
        result = []
        pNum = None
        lineNum = None

        for t in lst:
            # NOTE(review): str.replace does not work in place, so this
            # statement has no effect; kept as in the original source.
            t.replace("\r", "\n")

            for s in t.split("\n"):
                if type(s) is StringType:  # not unicode
                    s = unicode(s, self.default_encoding, 'replace')

                if (s != "") and (s[0] == "&"):  # store pNum
                    pNum = s[1:8]
                    logging.debug("storing: %s" % pNum)
                elif (s != "") and (not (s[0] in ignoreLines)):
                    # ignore everything before "."
                    splitted = s.split(".")
                    if len(splitted) == 1:  # no dot: the whole line is text
                        txt = splitted[0]
                    else:
                        lineNum = splitted[0]  # store line number
                        txt = splitted[1]

                    analyse = txt
                    # delete kommata except kommata relevant in graphems
                    analyse = re.sub(komma_exception, r"\1", analyse)
                    analyse = re.sub(self.delete, ' ', analyse)  # deletions

                    if self.indexName == "luceneSplitter":
                        if pNum:
                            analyser = PyLucene.StandardAnalyzer()
                            logging.error("calling lucene")
                            IndexLine("/tmp/index", analyser, pNum, lineNum, analyse)
                    else:
                        for w in analyse.split(" "):
                            w = w.strip()
                            if w != '':
                                # only when pNum was found (first call of the
                                # splitter; it is always called twice in the
                                # pipeline)
                                if pNum:
                                    Zope2.app().cdliRoot.storeInLineIndex(
                                        self.indexName, w, (pNum, lineNum))
                                    transaction.get().commit()
                                result.append(w)
        return result
class graphemeSplitter(cdliSplitter):
    """Splitter that uses the grapheme exclusion list."""

    delete = deleteGraphems
    indexName = "graphemeSplitter"
|
class wordSplitter(cdliSplitter):
    """Splitter that uses the word exclusion list."""

    delete = deleteWords
    indexName = "wordSplitter"
|
class luceneSplitter(cdliSplitter):
    """Splitter that indexes whole lines with PyLucene instead of the
    CDLI line index (selected via ``indexName`` in ``process``)."""

    delete = deleteWords
    indexName = "luceneSplitter"
|
# Register the CDLI splitters with the ZCTextIndex pipeline factory.
# The bare ``except`` clauses of the original were narrowed to
# ``ValueError``, which the accompanying comments name as the exception
# raised for an already-registered splitter.
try:
    element_factory.registerFactory('Word Splitter',
        'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

try:
    element_factory.registerFactory('Word Splitter',
        'CDLI word splitter', wordSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

try:
    element_factory.registerFactory('Word Splitter',
        'CDLI lucene splitter', luceneSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass
if __name__ == '__main__':
    # Manual smoke test of the encoding handling (Python 2 only:
    # relies on the builtin ``unicode``).
    a = 'abc def我们的很 好。'
    u = unicode(a, 'gbk')