"""CDLI word and grapheme splitter.

Splitters for the Zope ZCTextIndex pipeline that tokenize CDLI (ATF)
transliteration lines into words or graphemes.

Reconstructed from a broken merge of revisions 1.7.2.1 (2007/10/06)
and 1.8 (2008/01/21).
"""
|
|
import codecs
import logging
import os
import re
from types import StringType

import Zope2
import transaction

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

try:
    import PyLucene
except:
    print "no Lucene support"
|
|
|
def getSupportedEncoding(encodings):
    """Return the first encoding name in `encodings` that this Python
    installation actually supports; fall back to 'utf-8'.

    NOTE(review): the body of this function was lost in a broken merge;
    this is a conservative reconstruction -- confirm against revision
    history.
    """
    for encoding in encodings:
        try:
            # probe the codec registry; raises LookupError if unknown
            codecs.lookup(encoding)
            return encoding
        except LookupError:
            pass
    return 'utf-8'
# lines starting with one of these ATF control characters are not indexed
ignoreLines = ['$', '@', '#', '&', '>']
separators = ['']

# kommas relevant for graphemes will not be deleted
komma_exception = "([^sStThH]),"
komma_exceptionex = re.compile(komma_exception)

# grapheme deletion / boundary / ignore character classes
deleteGraphems = "\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"  # for graphems
graphemeBounds = "\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\""
graphemeIgnore = "<|>|\#|\||\]|\[|\!|\?\*|;"

# word deletion / boundary / ignore character classes
deleteWords = "<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"  # for words
wordBounds = "_|,|\""
wordIgnore = "<|>|\#|\||\]|\[|\!|\?\*|;"


class IndexLine(object):
    """Index a single line with Lucene (PyLucene).

    Creating an instance opens (or creates) the index at `storeDir`,
    writes one document and closes the writer again.
    """

    def __init__(self, storeDir, analyzer, name, line, content):
        # name/line/content: document id (pNum), line number and the
        # already-normalised text to index
        logging.error("i am here %s %s %s %s %s"%((storeDir, analyzer,name,line,content)))
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        writer.setMaxFieldLength(1048576)
        self.indexDocs(writer, name, line, content)
        writer.optimize()
        writer.close()

    def indexDocs(self, writer, name, line, content):
        """Add one document with fields name/line/contents to `writer`."""
        doc = PyLucene.Document()
        # BUG FIX: the original referenced undefined names `pn` and `i`;
        # use the method parameters instead.
        doc.add(PyLucene.Field("name", name,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
        doc.add(PyLucene.Field("line", str(line),
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
        # NOTE(review): the original stored `line` in the "contents"
        # field; `content` is almost certainly what was meant -- confirm.
        doc.add(PyLucene.Field("contents", content,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))
        writer.addDocument(doc)
|
|
|
class cdliSplitter:
    """Base class for the CDLI splitters.

    The difference between the word and the grapheme splitter is only
    the boundary/ignore character lists (class attributes below,
    overridden by the subclasses).
    """

    default_encoding = "utf-8"
    # deletion pattern (kept for the lucene code path of the older revision)
    delete = deleteGraphems
    # boundary characters are replaced by spaces
    bounds = graphemeBounds
    boundsex = re.compile(graphemeBounds)
    # ignored characters are removed entirely
    ignore = graphemeIgnore
    ignorex = re.compile(graphemeIgnore)
    indexName = "cdliSplitter"

    def process(self, lst):
        """Gets a list of strings and returns a list of words (graphemes)."""
        logging.debug("cdliSplitter: %s"%self.indexName)
        result = []
        pNum = None
        lineNum = None
        for t in lst:
            # normalise line breaks
            # BUG FIX: str.replace returns a new string; the original
            # discarded the result, so "\r" was never normalised.
            t = t.replace("\r", "\n")
            # split lines
            for s in t.split("\n"):
                if type(s) is StringType:
                    # not unicode yet -- decode with replacement
                    s = unicode(s, self.default_encoding, 'replace')
                if s != '':
                    if s[0] == '&':
                        # "&P123456 ..." header line: store pNum
                        pNum = s[1:8]
                        logging.debug("%s processing: %s"%(self.indexName, pNum))
                    elif not (s[0] in ignoreLines):
                        # regular line: split off the line number
                        lineparts = s.split(". ", 1)
                        if len(lineparts) == 1:
                            # no line number
                            txt = s
                        else:
                            txt = lineparts[1]
                            # store line number (used by the lucene path
                            # in the older revision)
                            lineNum = lineparts[0]
                        # delete kommata except kommata relevant for graphemes
                        txt = komma_exceptionex.sub(r"\1", txt)
                        # replace word boundaries by spaces
                        txt = self.boundsex.sub(' ', txt)
                        # remove letters to be ignored
                        txt = self.ignorex.sub('', txt)
                        # split words
                        for w in txt.split(" "):
                            w = w.strip()
                            if w != '':
                                result.append(w)
        return result
|
|
|
|
class graphemeSplitter(cdliSplitter):
    """Splitter that tokenizes on grapheme boundaries."""
    delete = deleteGraphems
    bounds = graphemeBounds
    boundsex = re.compile(graphemeBounds)
    ignore = graphemeIgnore
    ignorex = re.compile(graphemeIgnore)
    indexName = "graphemeSplitter"
|
|
class wordSplitter(cdliSplitter):
    """Splitter that tokenizes on word boundaries."""
    delete = deleteWords
    bounds = wordBounds
    boundsex = re.compile(wordBounds)
    ignore = wordIgnore
    ignorex = re.compile(wordIgnore)
    indexName = "wordSplitter"
|
|
class luceneSplitter(cdliSplitter):
    """Word splitter variant for the Lucene index.

    NOTE(review): the older revision's process() additionally fed each
    line to IndexLine when indexName == "luceneSplitter"; that branch
    was dropped in the newer revision reconstructed here -- confirm
    which behaviour is wanted.
    """
    delete = deleteWords
    indexName = "luceneSplitter"
|
# register the splitters with the ZCTextIndex pipeline factory
try:
    element_factory.registerFactory('Word Splitter',
        'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

try:
    element_factory.registerFactory('Word Splitter',
        'CDLI lucene splitter', luceneSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass
|
if __name__ == '__main__': |
|
a = 'abc def我们的很 好。' |
|
u = unicode(a, 'gbk') |
|
s = authorSplitter() |
|
print s.process([u]) |
|
print s.process([u], 1) |
|