1: """
2: CDLI word and grapheme splitter
3: """
4:
5: from Products.ZCTextIndex.PipelineFactory import element_factory
6:
7: import re
8: import logging
9:
def getSupportedEncoding(encodings):
    """Return the first encoding in *encodings* that can decode ASCII text.

    Each candidate is probed by decoding the single byte ``A`` with it.
    Candidates that are unknown to the codec registry, that cannot decode
    the probe, or that are not usable as an encoding name are skipped.

    :param encodings: iterable of encoding names, tried in order.
    :returns: the first candidate that passes the probe, or ``'utf-8'``
        when no candidate does (including when *encodings* is empty).
    """
    for encoding in encodings:
        try:
            # Same probe as unicode('A', encoding), but written so that it
            # does not depend on the Python-2-only ``unicode`` builtin.
            b'A'.decode(encoding)
        except (LookupError, TypeError, ValueError):
            # LookupError: unknown codec; ValueError covers UnicodeError
            # (probe not decodable); TypeError: not a valid encoding name.
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            continue
        return encoding
    return 'utf-8'
18:
19:
20:
# Beta of a full-text splitter for CDLI.

# Lines whose first character is one of these are skipped by the splitter.
ignoreLines = ['$', '@', '#', '&', '>']
# NOTE(review): not referenced by any code visible in this file.
separators = ['']
# Matches a comma together with the character in front of it, unless that
# character is one of s, S, t, T, h, H.  Substituting the match with group 1
# deletes the comma; commas relevant for graphemes (those following
# s/S/t/T/h/H) are therefore not deleted.
komma_exception = r"([^sStThH]),"
# Grapheme boundaries (regular expression alternation).  Raw strings keep the
# backslashes literal without relying on Python passing unknown escapes
# through unchanged.
graphemeBounds = r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
# Word boundaries: the grapheme boundaries without '{', '}' and '-'.
wordBounds = r"<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
32:
33:
class cdliSplitter:
    """Base class for the CDLI splitters.

    The only difference between the word splitter and the grapheme
    splitter is the boundary list in ``bounds``.
    """

    # Encoding used to decode byte strings passed to process().
    default_encoding = "utf-8"
    # Regular expression; every match in a line is replaced by a space
    # before the line is split into tokens.
    bounds = graphemeBounds
    # Name used in the debug log messages.
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split a list of text blocks into a flat list of tokens.

        Every element of *lst* is split into lines.  Empty lines and lines
        whose first character is in ``ignoreLines`` are skipped; a line
        starting with ``&`` is additionally reported in the debug log.  For
        every other line, everything up to and including the first ``.`` is
        dropped (it is taken to be the line number), commas matched by
        ``komma_exception`` are removed, matches of ``self.bounds`` are
        replaced by spaces, and the remaining text is split at spaces.

        :param lst: list of strings; byte strings are decoded with
            ``default_encoding`` (undecodable bytes are replaced).
        :returns: list of non-empty, whitespace-stripped tokens in input
            order.
        """
        logging.debug("cdliSplitter")
        # Compile once per call instead of once per processed line.
        comma_re = re.compile(komma_exception)
        bounds_re = re.compile(self.bounds)
        result = []

        for t in lst:
            if isinstance(t, bytes):
                # Not unicode yet: decode once, before any text handling.
                t = t.decode(self.default_encoding, 'replace')
            # Normalise line breaks.  str.replace returns a new string, so
            # the result has to be assigned back to take effect.
            t = t.replace("\r", "\n")

            for s in t.split("\n"):
                if not s:
                    continue

                if s[0] == '&':
                    # Characters 1-7 of an '&' line are the pNum; it is only
                    # used for the log message.  indexName is a class
                    # attribute and must be read through self.
                    logging.debug("%s processing: %s", self.indexName, s[1:8])
                    continue

                if s[0] in ignoreLines:
                    continue

                # Regular line: split off the line number at the FIRST dot
                # only, so that dots inside the text do not truncate it.
                head, dot, tail = s.partition(".")
                txt = tail if dot else s

                # Delete commas except those relevant for graphemes.
                txt = comma_re.sub(r"\1", txt)
                # Replace boundaries by spaces.
                txt = bounds_re.sub(' ', txt)
                # Split into tokens and drop the empty ones.
                for w in txt.split(" "):
                    w = w.strip()
                    if w:
                        result.append(w)

        return result
90:
91:
class graphemeSplitter(cdliSplitter):
    """cdliSplitter variant that splits at the grapheme boundaries."""

    indexName = "graphemeSplitter"
    bounds = graphemeBounds
95:
class wordSplitter(cdliSplitter):
    """cdliSplitter variant that splits at the word boundaries."""

    indexName = "wordSplitter"
    bounds = wordBounds
99:
# Register both splitters with the ZCTextIndex element factory under the
# 'Word Splitter' group.
try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # ValueError is raised when the splitter is already registered; that is
    # fine.  Any other error is a real problem and is no longer hidden.
    pass

try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI word splitter', wordSplitter)
except ValueError:
    # ValueError is raised when the splitter is already registered; that is
    # fine.  Any other error is a real problem and is no longer hidden.
    pass
113:
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>