"""
Author splitter
"""
import codecs
import logging
import os
import re
from types import StringType

import Zope2
import transaction
from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory
# PyLucene is optional: only the luceneSplitter needs it.  Catch only
# ImportError so real errors inside PyLucene are not silently hidden.
try:
    import PyLucene
except ImportError:
    logging.warning("no Lucene support")
def getSupportedEncoding(encodings):
    """Return the first codec name in *encodings* this Python supports.

    Falls back to ``'utf-8'`` when none of the candidates is a known
    codec (or the sequence is empty).

    :param encodings: iterable of candidate encoding names
    :return: the first supported encoding name, or ``'utf-8'``
    """
    for encoding in encodings:
        try:
            # codecs.lookup raises LookupError for unknown codecs; the
            # original probed with unicode('A', enc) under a bare except,
            # which also swallowed unrelated errors.
            codecs.lookup(encoding)
            return encoding
        except LookupError:
            pass
    return 'utf-8'
"""beta of a fulltext splitter for cdli
"""
ignoreLines=['$','@','#','&']
separators=['']
komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted
deleteGraphems="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems
deleteWords="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"# for words
class IndexLine(object):
    """Index one line of a CDLI text into a Lucene index on disk.

    Creates (overwriting) the index at *storeDir*, adds a single
    document with the fields ``name`` (text identifier), ``line``
    (line number) and ``contents`` (analysed line text), then
    optimizes and closes the writer.
    """

    def __init__(self, storeDir, analyzer, name, line, content):
        logging.debug("IndexLine %s %s %s %s %s" % (storeDir, analyzer, name, line, content))
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        # second argument True: create/overwrite the index directory
        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        writer.setMaxFieldLength(1048576)
        self.indexDocs(writer, name, line, content)
        writer.optimize()
        writer.close()

    def indexDocs(self, writer, name, line, content):
        """Add one document to *writer*.

        Bug fix: the original referenced undefined names ``pn`` and
        ``i`` (NameError, copied from the PyLucene sample) and indexed
        the line *number* as "contents" while ignoring *content*.
        """
        doc = PyLucene.Document()
        # text identifier (P-number): stored, searchable as a whole
        doc.add(PyLucene.Field("name", name,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
        # line number: stored, not tokenized
        doc.add(PyLucene.Field("line", str(line),
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
        # the analysed line text is what gets full-text indexed
        doc.add(PyLucene.Field("contents", content,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))
        writer.addDocument(doc)
class cdliSplitter:
    """Base class for the CDLI splitters.

    The word and the grapheme splitter differ only in the regex of
    characters they delete (``delete``); the lucene variant hands each
    analysed line to :class:`IndexLine` instead of the line index.
    """

    default_encoding = "utf-8"
    delete = deleteGraphems
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split ATF-like transliteration text into index tokens.

        For every ``&Pnnnnnn`` header line the text number is
        remembered; every following content line (not starting with a
        character in ``ignoreLines``) is stripped of its line-number
        prefix, cleaned via ``komma_exception`` and ``self.delete``,
        and its words are stored in the line index (or Lucene) and
        collected into the returned token list.

        :param lst: list of (byte or unicode) strings
        :return: list of unicode tokens
        """
        result = []
        pNum = None
        lineNum = None
        for t in lst:
            # bug fix: str.replace returns a new string; the original
            # discarded the result, so "\r" was never normalized.
            t = t.replace("\r", "\n")
            for s in t.split("\n"):
                if type(s) is StringType:  # byte string, not unicode
                    s = unicode(s, self.default_encoding, 'replace')
                if (s != "") and (s[0] == "&"):  # header: store text number
                    pNum = s[1:8]
                    logging.debug("storing: %s" % pNum)
                elif (s != "") and (not (s[0] in ignoreLines)):
                    splitted = s.split(".")
                    if len(splitted) == 1:  # no line-number prefix
                        txt = splitted[0]
                    else:
                        txt = splitted[1]
                        lineNum = splitted[0]  # store line number
                    # delete kommata except those relevant in graphemes
                    analyse = re.sub(komma_exception, r"\1", txt)
                    # delete splitter-specific characters
                    analyse = re.sub(self.delete, ' ', analyse)
                    if self.indexName == "luceneSplitter":
                        if pNum:
                            analyser = PyLucene.StandardAnalyzer()
                            logging.debug("calling lucene")
                            IndexLine("/tmp/index", analyser, pNum, lineNum, analyse)
                    else:
                        for w in analyse.split(" "):
                            w = w.strip()
                            if not (w == ''):
                                # only when pNum is found (first call of the
                                # splitter; it is always called twice in the
                                # pipeline)
                                if pNum:
                                    Zope2.app().cdliRoot.storeInLineIndex(
                                        self.indexName, w, (pNum, lineNum))
                                    transaction.get().commit()
                                result.append(w)
        return result
class graphemeSplitter(cdliSplitter):
    """Splitter at grapheme granularity: deletes the grapheme
    delimiter characters (deleteGraphems) before tokenizing."""
    delete=deleteGraphems
    indexName="graphemeSplitter"
class wordSplitter(cdliSplitter):
    """Splitter at word granularity: uses the smaller deleteWords
    pattern, so grapheme joiners like '-' are kept inside tokens."""
    delete=deleteWords
    indexName="wordSplitter"
class luceneSplitter(cdliSplitter):
    """Word-granularity splitter that routes each analysed line to a
    Lucene index via IndexLine (see cdliSplitter.process)."""
    delete=deleteWords
    indexName="luceneSplitter"
# Register the splitters with the ZCTextIndex pipeline.  registerFactory
# raises ValueError when a name is already registered (e.g. on product
# refresh), which is the only error we want to ignore.
for _title, _splitter in (
        ('CDLI grapheme splitter', graphemeSplitter),
        ('CDLI word splitter', wordSplitter),
        ('CDLI lucene splitter', luceneSplitter)):
    try:
        element_factory.registerFactory('Word Splitter', _title, _splitter)
    except ValueError:
        # already registered
        pass
if __name__ == '__main__':
a = 'abc def我们的很 好。'
u = unicode(a, 'gbk')
s = authorSplitter()
print s.process([u])
print s.process([u], 1)
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>  (CVSweb footer left over from a web checkout; not code)