1: """
2: CDLI word and grapheme splitter
3: """
4:
5: from Products.ZCTextIndex.PipelineFactory import element_factory
6:
7: import re
8: import logging
9:
def getSupportedEncoding(encodings):
    """Return the first usable encoding from ``encodings``.

    An encoding counts as usable when the byte string 'A' can be decoded
    with it. The candidates are tried in order; if none of them works
    (or ``encodings`` is empty) 'utf-8' is returned.
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except Exception:
            # Unknown or unusable encoding: try the next candidate.
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt and SystemExit are not swallowed.
            continue
        return encoding
    return 'utf-8'
18:
19:
20:
21: """beta of a fulltext splitter for cdli
22:
23: """
# lines whose first character is one of these are not tokenised
ignoreLines = ['$', '@', '#', '&', '>']
# NOTE(review): not referenced anywhere in the visible part of this file
separators = ['']
# Matches a comma together with the character in front of it, unless that
# character is one of s, S, t, T, h, H. Commas after those letters are
# relevant for graphemes and must not be deleted.
komma_exception = r"([^sStThH]),"
# Grapheme boundaries (raw strings: the backslashes belong to the regular
# expression and are not Python string escapes).
graphemeBounds = r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
# Word boundaries: the grapheme boundaries without "{", "}" and "-".
wordBounds = r"<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
32:
33:
class cdliSplitter:
    """Base class for the CDLI splitters.

    The word splitter and the grapheme splitter differ only in ``bounds``,
    the regular expression whose matches are treated as token boundaries.
    """

    # encoding used to decode byte strings before they are tokenised
    default_encoding = "utf-8"
    # regular expression matching the characters that separate tokens
    bounds = graphemeBounds
    # name used to tag this splitter's debug log messages
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split a list of strings into a flat list of tokens.

        Each string in ``lst`` is cut into lines ("\\r" is treated like
        "\\n"). Byte-string lines are decoded with ``default_encoding``,
        replacing undecodable bytes. Empty lines and lines whose first
        character is listed in ``ignoreLines`` yield no tokens; for lines
        starting with "&", characters 1-7 are logged as the pNum.

        For every other line: if it contains a ".", only the text between
        the first and the second "." is tokenised (the part before the
        first "." is taken to be the line number); otherwise the whole
        line is used. Commas matched by ``komma_exception`` are deleted,
        every match of ``self.bounds`` is replaced by a space, and the
        text is split on spaces.

        Returns the whitespace-stripped, non-empty tokens in input order.
        """
        logging.debug("cdliSplitter: %s", self.indexName)

        # Compile both patterns once per call rather than handing the
        # pattern strings to re.sub for every single line.
        dropCommas = re.compile(komma_exception).sub
        blankBounds = re.compile(self.bounds).sub
        result = []

        for t in lst:
            # Normalise line breaks. str.replace returns a new string, so
            # the result has to be assigned back to take effect.
            t = t.replace("\r", "\n")
            for s in t.split("\n"):
                if isinstance(s, str):
                    # byte string, not unicode yet
                    s = unicode(s, self.default_encoding, 'replace')

                if s == '':
                    continue

                if s[0] == '&':
                    # header line carrying the pNum; produces no tokens
                    logging.debug("%s processing: %s", self.indexName, s[1:8])
                    continue

                if s[0] in ignoreLines:
                    continue

                # Regular line: separate the line number from the text.
                # NOTE(review): anything after a second "." in the line is
                # dropped here -- confirm that this is intended.
                lineparts = s.split(".")
                if len(lineparts) == 1:
                    # no line number
                    txt = s
                else:
                    txt = lineparts[1]

                # delete commas except those relevant for graphemes
                txt = dropCommas(r"\1", txt)
                # replace token boundaries by spaces
                txt = blankBounds(' ', txt)
                for w in txt.split(" "):
                    w = w.strip()
                    if w:
                        result.append(w)

        # Arguments are passed separately so the (potentially large) message
        # is only formatted when debug logging is actually enabled.
        logging.debug("split '%s' into %r", lst, result)
        return result
91:
92:
class graphemeSplitter(cdliSplitter):
    """Splitter that uses ``graphemeBounds`` as its token boundaries."""

    # tag for this splitter's debug log messages
    indexName = "graphemeSplitter"
    # boundary pattern for graphemes
    bounds = graphemeBounds
96:
class wordSplitter(cdliSplitter):
    """Splitter that uses ``wordBounds`` as its token boundaries."""

    # tag for this splitter's debug log messages
    indexName = "wordSplitter"
    # boundary pattern for whole words
    bounds = wordBounds
100:
# Register both splitters in the 'Word Splitter' group. Only the
# "already registered" error is ignored, so that any other failure during
# registration is no longer hidden.
try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI word splitter', wordSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass
114:
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>