"""
CDLI word and grapheme splitter
"""
from Products.ZCTextIndex.PipelineFactory import element_factory
import re
import logging
def getSupportedEncoding(encodings):
    """Return the first usable encoding from *encodings*.

    An encoding counts as usable when the one-byte string ``A`` can be
    decoded with it.  Entries that name an unknown codec, or whose codec
    cannot decode that byte, are skipped.

    :param encodings: iterable of encoding names, tried in order
    :return: the first usable encoding name, or ``'utf-8'`` if none is usable
    """
    for encoding in encodings:
        try:
            # A bytes literal is used so the probe behaves the same where
            # the ``unicode`` builtin does not exist.
            b'A'.decode(encoding)
        except (LookupError, UnicodeError, TypeError, ValueError):
            # Unknown codec, undecodable probe, or a non-string entry: try
            # the next candidate.  Anything else (e.g. KeyboardInterrupt)
            # is deliberately allowed to propagate.
            continue
        return encoding
    return 'utf-8'
"""beta of a fulltext splitter for cdli
"""
# Lines whose first character is one of these are not tokenized.
ignoreLines = ['$', '@', '#', '&', '>']
# NOTE(review): not referenced anywhere in the visible code -- confirm
# whether other modules import it before removing.
separators = ['']
# Matches a comma together with the character in front of it, unless that
# character is one of s, S, t, T, h, H; used to delete commas that are not
# relevant for graphemes.
# NOTE(review): both boundary patterns below also contain ",", so commas kept
# by this pattern are later replaced by spaces anyway -- confirm intent.
komma_exception = r"([^sStThH]),"
# Grapheme boundaries.  Raw strings keep the backslashes literal without
# relying on Python passing unknown escape sequences through unchanged.
graphemeBounds = r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
# Word boundaries: same as graphemeBounds but without "{", "}" and "-".
wordBounds = r"<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
class cdliSplitter:
    """Base class for the splitters.

    The word splitter and the grapheme splitter differ only in ``bounds``,
    the regular expression whose matches are turned into spaces before a
    line is cut into tokens, and in ``indexName``, which appears in the
    debug log.
    """

    # encoding used to decode byte strings handed to process()
    default_encoding = "utf-8"
    # regular expression describing the token boundaries
    bounds = graphemeBounds
    # name written to the debug log
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split a list of strings into a flat list of tokens.

        Every string is cut into lines.  Empty lines and lines whose first
        character is listed in ``ignoreLines`` yield no tokens; for a line
        starting with "&" characters 1-7 are written to the debug log as
        the text identifier.  For every other line, everything up to and
        including the first "." is treated as the line number and dropped,
        commas matched by ``komma_exception`` are removed, matches of
        ``self.bounds`` are replaced by spaces and the rest is split at
        spaces.

        :param lst: list of byte strings or unicode strings
        :return: list of non-empty, whitespace-stripped tokens in input order
        """
        logging.debug("cdliSplitter: %s", self.indexName)
        result = []
        for t in lst:
            if isinstance(t, bytes):
                # not unicode: decode once, before any splitting
                t = t.decode(self.default_encoding, 'replace')
            # normalise line breaks; replace() returns a new string, so the
            # result must be assigned back to take effect
            t = t.replace("\r", "\n")
            for s in t.split("\n"):
                if s == '':
                    continue
                if s[0] == '&':
                    # header line: only the identifier is logged
                    logging.debug("%s processing: %s", self.indexName, s[1:8])
                    continue
                if s[0] in ignoreLines:
                    continue
                # regular line: cut off the line number at the FIRST period
                # only, so periods inside the text do not truncate it
                _number, dot, rest = s.partition(".")
                if dot:
                    txt = rest
                else:
                    # no line number
                    txt = s
                # delete commas except those relevant for graphemes
                txt = re.sub(komma_exception, r"\1", txt)
                # replace token boundaries by spaces
                txt = re.sub(self.bounds, ' ', txt)
                # split into tokens, dropping empty ones
                for w in txt.split(" "):
                    w = w.strip()
                    if w:
                        result.append(w)
        return result
class graphemeSplitter(cdliSplitter):
    """Splitter that cuts text at the grapheme boundaries."""

    indexName = "graphemeSplitter"
    bounds = graphemeBounds
class wordSplitter(cdliSplitter):
    """Splitter that cuts text at the word boundaries."""

    indexName = "wordSplitter"
    bounds = wordBounds
# Register both splitters in the 'Word Splitter' group of the pipeline
# element factory.  Only the "already registered" error is ignored; any
# other failure is allowed to surface instead of being silently swallowed.
try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass
try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI word splitter', wordSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>