version 1.1, 2006/11/14 17:02:59
|
version 1.4, 2007/02/08 12:00:23
|
Line 2
|
Line 2
|
Author splitter |
Author splitter |
""" |
""" |
|
|
|
import Zope2 |
|
import transaction |
|
|
from Products.ZCTextIndex.ISplitter import ISplitter |
from Products.ZCTextIndex.ISplitter import ISplitter |
from Products.ZCTextIndex.PipelineFactory import element_factory |
from Products.ZCTextIndex.PipelineFactory import element_factory |
|
|
Line 24 def getSupportedEncoding(encodings):
|
Line 27 def getSupportedEncoding(encodings):
|
""" |
""" |
ignoreLines=['$','@','#','&'] |
ignoreLines=['$','@','#','&'] |
separators=[''] |
separators=[''] |
delete="{|}|<|>|\(|\)|-|_|\#|,|\~|\||\]|\[" |
komma_exception="([^sStThH])," # commas relevant for graphemes will not be deleted |
|
delete="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphemes |
|
#delete="<|>|\(|\)|_|\#|,|\||\]|\[|!|?" for words |
|
|
class graphemeSplitter: |
class graphemeSplitter: |
|
|
Line 32 class graphemeSplitter:
|
Line 37 class graphemeSplitter:
|
|
|
def process(self, lst): |
def process(self, lst): |
result = [] |
result = [] |
|
pNum=None |
|
lineNum=None |
|
|
|
|
|
#print "LLLL",lst |
|
|
|
|
for t in lst: |
for t in lst: |
|
|
Line 43 class graphemeSplitter:
|
Line 54 class graphemeSplitter:
|
|
|
#ignore lines |
#ignore lines |
|
|
if (s!="") and (not (s[0] in ignoreLines)): |
if (s!="") and (s[0]=="&"): # store pNum |
|
pNum=s[1:8] |
|
|
|
elif (s!="") and (not (s[0] in ignoreLines)): |
|
|
|
|
#ignore everything before "." |
#ignore everything before "." |
splitted=s.split(".") |
splitted=s.split(".") |
Line 52 class graphemeSplitter:
|
Line 67 class graphemeSplitter:
|
txt=splitted[0] |
txt=splitted[0] |
else: |
else: |
txt=splitted[1] |
txt=splitted[1] |
|
lineNum=splitted[0] #store line number |
|
|
analyse=txt |
analyse=txt |
|
|
|
analyse=re.sub(komma_exception,r"\1",analyse) # delete commas except commas relevant in graphemes |
|
|
analyse=re.sub(delete,' ',analyse) # deletions |
analyse=re.sub(delete,' ',analyse) # deletions |
|
|
splitted = analyse.split(" ") |
splitted = analyse.split(" ") |
|
|
for w in splitted: |
for w in splitted: |
w=w.lstrip().rstrip() |
w=w.lstrip().rstrip() |
|
|
if not (w==''): |
if not (w==''): |
print repr(w) |
if pNum: #only when pNum is found (first call of the splitter, which is always called twice in the pipeline) |
|
Zope.app().cdliRoot.storeInLineIndex(w.lstrip().strip(),(pNum,lineNum)) |
|
transaction.get().commit() |
|
|
result.append(w.lstrip().rstrip()) |
result.append(w.lstrip().rstrip()) |
return result |
return result |
|
|
element_factory.registerFactory('Word Splitter', |
|
'CDLI grapheme splitter', graphemeSplitter) |
|
|
|
try: |
try: |
element_factory.registerFactory('graphemeSplitter', |
element_factory.registerFactory('Word Splitter', |
'CDLI grapheme splitter', graphemeSplitter) |
'CDLI grapheme splitter', graphemeSplitter) |
except: |
except: |
# in case the splitter is already registered, ValueError is raised |
# in case the splitter is already registered, ValueError is raised |