Annotation of cdli/cdliSplitter.py, revision 1.7.2.5
1.1 dwinter 1: """
1.7.2.2 casties 2: CDLI word and grapheme splitter
1.1 dwinter 3: """
4:
5: from Products.ZCTextIndex.PipelineFactory import element_factory
6:
7: import re
1.5 dwinter 8: import logging
1.1 dwinter 9:
def getSupportedEncoding(encodings):
    """Return the first usable encoding name from *encodings*.

    Every candidate is probed by decoding the single ASCII byte ``A``
    with it.  Candidates for which that decode fails (unknown codec name,
    wrong argument type, codec that cannot decode one byte, ...) are
    skipped.

    encodings -- iterable of encoding names, tried in order

    Returns the first candidate that passed the probe, or ``'utf-8'`` when
    *encodings* is empty or no candidate passed.
    """
    # Resolve the text type once, outside the probing loop.  Looking it up
    # inside the guarded block would turn a missing ``unicode`` builtin into
    # "every encoding is unsupported" without any visible error.
    try:
        text_type = unicode
    except NameError:
        # interpreter without the ``unicode`` builtin: ``str`` is the text type
        text_type = str

    # a byte string on every interpreter version
    probe = 'A'.encode('ascii')

    for encoding in encodings:
        try:
            text_type(probe, encoding)
        except Exception:
            # Codecs may raise arbitrary exception types, so the probe stays
            # best-effort, but interpreter-exit signals are no longer eaten.
            continue
        return encoding
    return 'utf-8'
18:
19:
20:
21: """beta of a fulltext splitter for cdli
22:
23: """
# A line whose first character is one of these is not tokenised by
# cdliSplitter.process().
ignoreLines=['$','@','#','&','>']
separators=['']
# Commas that are relevant for graphemes (those directly following one of
# s, S, t, T, h, H) will not be deleted; every other comma is dropped while
# the character in front of it is kept (group 1).
komma_exception=r"([^sStThH]),"
# Grapheme boundaries.  Raw string literals keep the regex escapes intact
# without relying on Python passing unknown string escapes through.
graphemeBounds=r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
# Word boundaries: the same set without "{", "}" and "-".
wordBounds=r"<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
32:
1.5 dwinter 33:
class cdliSplitter:
    """Base class for the CDLI splitters.

    The word splitter and the grapheme splitter differ only in the
    boundary pattern (``bounds``) that is used to cut a line into tokens.
    """

    # encoding used to decode byte strings before they are tokenised
    default_encoding = "utf-8"
    # regular expression; every match is replaced by a space in process()
    bounds=graphemeBounds
    # name that shows up in the debug log messages
    indexName="cdliSplitter"

    def process(self, lst):
        """Split a list of text blocks into one flat list of tokens.

        lst -- list of strings (byte strings are decoded with
               ``default_encoding``, undecodable bytes being replaced)

        Every string is cut into lines.  Empty lines, the "&" header line
        and lines starting with a character from ``ignoreLines`` produce no
        tokens.  For the remaining lines a leading "<line number>." part is
        dropped, commas are removed unless they match the exception in
        ``komma_exception``, every match of ``self.bounds`` is turned into a
        space and the line is split at spaces.

        Returns the list of non-empty tokens in input order.
        """
        logging.debug("cdliSplitter: %s", self.indexName)
        result = []

        for t in lst:
            # Normalise line breaks.  str.replace() returns a new string, so
            # the result has to be assigned back; otherwise texts with bare
            # "\r" line endings are handled as one single line.  "\r\n"
            # becomes "\n\n"; the extra empty line is skipped below.
            t = t.replace("\r", "\n")

            for s in t.split("\n"):
                if isinstance(s, str):
                    # byte string, not unicode yet
                    s = unicode(s, self.default_encoding, 'replace')

                if s == '':
                    continue

                if s[0] == '&':
                    # header line: characters 1..7 are logged as the pNum
                    logging.debug("%s processing: %s", self.indexName, s[1:8])
                    continue

                if s[0] in ignoreLines:
                    continue

                # regular line
                lineparts = s.split(".")
                if len(lineparts) == 1:
                    # no line number
                    txt = s
                else:
                    # Drop the line number in front of the first ".".
                    # NOTE(review): only the text between the first and the
                    # second "." is kept, anything after a second "." is
                    # discarded -- confirm that this is intended.
                    txt = lineparts[1]

                # delete commas except the ones relevant for graphemes
                txt = re.sub(komma_exception, r"\1", txt)
                # replace token boundaries by spaces
                txt = re.sub(self.bounds, ' ', txt)
                # split at spaces and keep the non-empty tokens
                for w in txt.split(" "):
                    w = w.strip()
                    if w != '':
                        result.append(w)

        return result
1.2 dwinter 91:
92:
class graphemeSplitter(cdliSplitter):
    """cdliSplitter variant that cuts text at the ``graphemeBounds`` pattern."""

    indexName = "graphemeSplitter"
    bounds = graphemeBounds
96:
class wordSplitter(cdliSplitter):
    """cdliSplitter variant that cuts text at the ``wordBounds`` pattern."""

    indexName = "wordSplitter"
    bounds = wordBounds
100:
# Make both splitters available in the 'Word Splitter' group of the
# ZCTextIndex pipeline element factory.
try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised;
    # any other error is a real problem and must not be hidden
    pass

try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI word splitter', wordSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised;
    # any other error is a real problem and must not be hidden
    pass
1.1 dwinter 114:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>