version 1.4, 2007/02/08 12:00:23
|
version 1.9, 2008/09/25 12:37:55
|
Line 1
|
Line 1
|
""" |
""" |
Author splitter |
CDLI word and grapheme splitter |
""" |
""" |
|
|
import Zope2 |
|
import transaction |
|
|
|
from Products.ZCTextIndex.ISplitter import ISplitter |
|
from Products.ZCTextIndex.PipelineFactory import element_factory |
from Products.ZCTextIndex.PipelineFactory import element_factory |
|
|
import re |
import re |
from types import StringType |
import logging |
|
|
def getSupportedEncoding(encodings): |
def getSupportedEncoding(encodings): |
for encoding in encodings: |
for encoding in encodings: |
Line 25 def getSupportedEncoding(encodings):
|
Line 21 def getSupportedEncoding(encodings):
|
"""beta of a fulltext splitter for cdli |
"""beta of a fulltext splitter for cdli |
|
|
""" |
""" |
ignoreLines=['$','@','#','&'] |
ignoreLines=['$','@','#','&','>'] |
separators=[''] |
separators=[''] |
komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted |
# commas relevant for graphemes will not be deleted |
delete="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems |
komma_exception="([^sStThH])," |
#delete="<|>|\(|\)|_|\#|,|\||\]|\[|!|?" for words |
komma_exceptionex=re.compile(komma_exception) |
|
# grapheme boundaries |
|
#graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" |
|
graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\"" |
|
graphemeIgnore="<|>|\#|\||\]|\[|\!|\?\*|;" |
|
# for words |
|
#wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" |
|
wordBounds="_|,|\"" |
|
wordIgnore="<|>|\#|\||\]|\[|\!|\?\*|;" |
|
|
|
class cdliSplitter: |
|
|
|
"""base class for splitter. |
|
the difference between word and grapheme splitter |
|
is the word boundary list.""" |
|
|
class graphemeSplitter: |
|
|
|
default_encoding = "utf-8" |
default_encoding = "utf-8" |
|
bounds=graphemeBounds |
|
boundsex=re.compile(graphemeBounds) |
|
ignore=graphemeIgnore |
|
ignorex=re.compile(graphemeIgnore) |
|
indexName="cdliSplitter" |
|
|
|
|
def process(self, lst): |
def process(self, lst): |
|
"""gets a list of strings and returns a list of words""" |
|
|
|
logging.debug("cdliSplitter: %s"%self.indexName) |
result = [] |
result = [] |
pNum=None |
pNum=None |
lineNum=None |
lineNum=None |
|
|
|
|
#print "LLLL",lst |
|
|
|
|
|
for t in lst: |
for t in lst: |
|
# normalise line breaks |
t.replace("\r","\n") |
t.replace("\r","\n") |
|
# split lines |
for s in t.split("\n"): |
for s in t.split("\n"): |
|
if isinstance(s, str): |
if type(s) is StringType: # not unicode |
# not unicode |
s = unicode(s, self.default_encoding, 'replace') |
s = unicode(s, self.default_encoding, 'replace') |
|
|
#ignore lines |
if (s!=''): |
|
if s[0]=='&': |
if (s!="") and (s[0]=="&"): # store pNum |
# store pNum |
pNum=s[1:8] |
pNum=s[1:8] |
|
logging.debug("%s processing: %s"%(self.indexName,pNum)) |
|
|
elif (s!="") and (not (s[0] in ignoreLines)): |
elif not (s[0] in ignoreLines): |
|
# regular line |
|
lineparts=s.split(". ",1) |
# ignore everything before "." |
if len(lineparts)==1: |
splitted=s.split(".") |
# no line number |
|
txt=s |
if len(splitted)==1: #kein punkt |
|
txt=splitted[0] |
|
else: |
else: |
txt=splitted[1] |
#store line number |
lineNum=splitted[0] #store line number |
txt=lineparts[1] |
|
lineNum=lineparts[0] |
analyse=txt |
|
|
# delete commas, except commas relevant for graphemes |
analyse=re.sub(komma_exception,r"\1",analyse) # delete kommata except kommata relevant in graphems |
txt = komma_exceptionex.sub(r"\1",txt) |
|
# replace word boundaries by spaces |
analyse=re.sub(delete,' ',analyse) # deletions |
txt = self.boundsex.sub(' ',txt) |
|
# replace letters to be ignored |
splitted = analyse.split(" ") |
txt = self.ignorex.sub('',txt) |
|
# split words |
for w in splitted: |
words = txt.split(" ") |
w=w.lstrip().rstrip() |
for w in words: |
|
w=w.strip() |
if not (w==''): |
if not (w==''): |
if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline |
result.append(w) |
Zope.app().cdliRoot.storeInLineIndex(w.lstrip().strip(),(pNum,lineNum)) |
|
transaction.get().commit() |
|
|
|
result.append(w.lstrip().rstrip()) |
#logging.debug("split '%s' into %s"%(lst,repr(result))) |
return result |
return result |
|
|
|
|
|
class graphemeSplitter(cdliSplitter): |
|
bounds=graphemeBounds |
|
boundsex=re.compile(graphemeBounds) |
|
ignore=graphemeIgnore |
|
ignorex=re.compile(graphemeIgnore) |
|
indexName="graphemeSplitter" |
|
|
|
class wordSplitter(cdliSplitter): |
|
bounds=wordBounds |
|
boundsex=re.compile(wordBounds) |
|
ignore=wordIgnore |
|
ignorex=re.compile(wordIgnore) |
|
indexName="wordSplitter" |
|
|
try: |
try: |
element_factory.registerFactory('Word Splitter', |
element_factory.registerFactory('Word Splitter', |
'CDLI grapheme splitter', graphemeSplitter) |
'CDLI grapheme splitter', graphemeSplitter) |
Line 96 except:
|
Line 122 except:
|
# in case the splitter is already registered, ValueError is raised |
# in case the splitter is already registered, ValueError is raised |
pass |
pass |
|
|
if __name__ == '__main__': |
try: |
a = 'abc def我们的很 好。' |
element_factory.registerFactory('Word Splitter', |
u = unicode(a, 'gbk') |
'CDLI word splitter', wordSplitter) |
s = authorSplitter() |
except: |
print s.process([u]) |
# in case the splitter is already registered, ValueError is raised |
print s.process([u], 1) |
pass |
|
|