1: """
2: CDLI word and grapheme splitter
3: """
4:
5: from Products.ZCTextIndex.PipelineFactory import element_factory
6:
7: import re
8: import logging
9:
def getSupportedEncoding(encodings):
    """Return the first usable encoding name from *encodings*.

    Each candidate is probed by decoding the one-character string 'A'
    with it (Python 2 ``unicode`` built-in); the first candidate for
    which the probe does not raise is returned.

    :param encodings: iterable of encoding names, in order of preference.
    :returns: the first candidate that passes the probe, or ``'utf-8'``
        when none does (this includes an empty *encodings*).
    """
    for encoding in encodings:
        try:
            # Probe only; the decoded value itself is discarded.
            unicode('A', encoding)
        except Exception:
            # Candidate is unusable, try the next one.  ``Exception``
            # instead of a bare ``except`` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            continue
        return encoding
    return 'utf-8'
18:
19:
20:
21: """beta of a fulltext splitter for cdli
22:
23: """
24: ignoreLines=['$','@','#','&','>']
25: separators=['']
26: # kommas relevant for graphemes will not be deleted
27: komma_exception="([^sStThH]),"
28: komma_exceptionex=re.compile(komma_exception)
29: # grapheme boundaries
30: #graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
31: graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\""
32: graphemeIgnore="<|>|\#|\||\]|\[|\!|\?"
33: # for words
34: #wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
35: wordBounds="_|,|\""
36: wordIgnore="<|>|\#|\||\]|\[|\!|\?"
37:
class cdliSplitter:
    """Base class for the CDLI splitters.

    ``process`` turns transliteration text into a flat list of index
    terms.  The word and the grapheme splitter differ only in the
    patterns they supply:

    * ``bounds`` / ``boundsex``: characters replaced by a space, i.e. the
      term boundaries;
    * ``ignore`` / ``ignorex``: characters deleted from the text;
    * ``indexName``: label used in the debug log messages.
    """

    default_encoding = "utf-8"
    bounds = graphemeBounds
    boundsex = re.compile(graphemeBounds)
    ignore = graphemeIgnore
    ignorex = re.compile(graphemeIgnore)
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split a list of text chunks into a flat list of index terms.

        Every chunk is cut into lines and each non-empty line is handled
        according to its first character:

        * ``&`` -- header line: characters 1-7 are logged as the text id,
          the line itself is not indexed;
        * any other character listed in ``ignoreLines`` -- skipped;
        * anything else -- a text line.  If it contains a ``.``, the part
          before the first ``.`` is taken to be the line number and only
          the remainder is indexed.

        :param lst: list of strings; byte strings are decoded with
            ``default_encoding`` (undecodable bytes are replaced).
        :returns: list of the non-empty terms, in input order.
        """
        logging.debug("cdliSplitter: %s", self.indexName)
        result = []

        for t in lst:
            # Normalise line breaks.  str.replace returns a new string, so
            # the result has to be re-bound; otherwise text using bare
            # "\r" line ends is never split into lines.
            t = t.replace("\r", "\n")

            for s in t.split("\n"):
                if isinstance(s, str):
                    # Byte string (Python 2 ``str``), not unicode yet.
                    s = unicode(s, self.default_encoding, 'replace')

                if s == '':
                    continue

                if s[0] == '&':
                    # Header line: only report which text is processed.
                    pNum = s[1:8]
                    logging.debug("%s processing: %s", self.indexName, pNum)
                    continue

                if s[0] in ignoreLines:
                    continue

                # Regular line.  Cut at the FIRST "." only, so that text
                # containing further dots (e.g. "[...]") is not truncated.
                lineNum, dot, rest = s.partition(".")
                txt = rest if dot else s

                # Delete commas, except those relevant for graphemes.
                txt = komma_exceptionex.sub(r"\1", txt)
                # Replace term boundaries by spaces.
                txt = self.boundsex.sub(' ', txt)
                # Drop the characters to be ignored.
                txt = self.ignorex.sub('', txt)

                # Collect the non-empty terms.
                for w in txt.split(" "):
                    w = w.strip()
                    if w:
                        result.append(w)

        return result
100:
101:
class graphemeSplitter(cdliSplitter):
    """Splitter configured with the grapheme boundary and ignore patterns."""

    indexName = "graphemeSplitter"

    # Pattern sources ...
    bounds = graphemeBounds
    ignore = graphemeIgnore
    # ... and their compiled forms, built from the sources just above.
    boundsex = re.compile(bounds)
    ignorex = re.compile(ignore)
108:
class wordSplitter(cdliSplitter):
    """Splitter configured with the word boundary and ignore patterns."""

    indexName = "wordSplitter"

    # Pattern sources ...
    bounds = wordBounds
    ignore = wordIgnore
    # ... and their compiled forms, built from the sources just above.
    boundsex = re.compile(bounds)
    ignorex = re.compile(ignore)
115:
# Register both splitters in the ZCTextIndex 'Word Splitter' group.
# registerFactory raises ValueError when a splitter is already registered;
# only that case is ignored, so any other failure is no longer hidden.
try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter',
                                    graphemeSplitter)
except ValueError:
    # Already registered -- nothing to do.
    pass

try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI word splitter',
                                    wordSplitter)
except ValueError:
    # Already registered -- nothing to do.
    pass
129:
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>