Annotation of cdli/cdliSplitter.py, revision 1.9
1.1 dwinter 1: """
1.8 casties 2: CDLI word and grapheme splitter
1.1 dwinter 3: """
4:
5: from Products.ZCTextIndex.PipelineFactory import element_factory
6:
7: import re
1.5 dwinter 8: import logging
1.1 dwinter 9:
def getSupportedEncoding(encodings):
    """Return the first usable encoding name from *encodings*.

    Each candidate is probed by decoding the single ASCII byte ``A`` with
    it.  The first candidate for which that succeeds is returned.  If no
    candidate works (or *encodings* is empty), ``'utf-8'`` is returned.
    """
    # 'A'.encode('ascii') yields a byte string on both Python 2 and
    # Python 3, so the probe does not depend on the Python-2-only
    # ``unicode`` builtin.
    probe = 'A'.encode('ascii')
    for encoding in encodings:
        try:
            probe.decode(encoding)
        except Exception:
            # Unknown or unusable codec: codecs raise a variety of error
            # types (LookupError, UnicodeError, TypeError, ...), so catch
            # Exception here -- but no longer KeyboardInterrupt/SystemExit
            # as the former bare ``except:`` did.
            continue
        return encoding
    return 'utf-8'
18:
19:
20:
# Beta of a full-text splitter for CDLI.

# Lines starting with one of these characters are skipped by the splitter.
ignoreLines = ['$', '@', '#', '&', '>']
separators = ['']

# Commas relevant for graphemes are not deleted: a comma is removed only
# when the character before it is NOT one of s, S, t, T, h, H.  The
# preceding character is captured so it can be put back by the substitution.
komma_exception = "([^sStThH]),"
komma_exceptionex = re.compile(komma_exception)

# All patterns below are raw strings so that the regex escapes are not
# (mis)interpreted as string escapes; their values are unchanged.

# Grapheme boundaries (each match is replaced by a space).
# old: graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
graphemeBounds = r'\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|"'
# Characters removed from graphemes (each match is deleted).
# NOTE(review): `\?\*` matches only the two-character sequence "?*"; a lone
# "?" or "*" is not removed -- confirm whether `\?|\*` was intended.
graphemeIgnore = r"<|>|\#|\||\]|\[|\!|\?\*|;"

# Word boundaries (each match is replaced by a space).
# old: wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
wordBounds = '_|,|"'
# Characters removed from words (same pattern as graphemeIgnore).
wordIgnore = r"<|>|\#|\||\]|\[|\!|\?\*|;"
1.5 dwinter 37:
class cdliSplitter:
    """Base class for the CDLI splitters.

    The word splitter and the grapheme splitter differ only in the
    boundary/ignore patterns and the index name they set as class
    attributes; the processing logic lives here.
    """

    # Encoding used to decode byte strings passed to process().
    default_encoding = "utf-8"
    # Regex (source and compiled) whose matches are replaced by a space.
    bounds = graphemeBounds
    boundsex = re.compile(graphemeBounds)
    # Regex (source and compiled) whose matches are deleted.
    ignore = graphemeIgnore
    ignorex = re.compile(graphemeIgnore)
    # Name used in debug log messages.
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split a list of text strings into a flat list of words.

        Every string in *lst* is broken into lines.  Empty lines and lines
        whose first character is listed in ``ignoreLines`` are skipped
        (for a line starting with ``&`` characters 1-7 are written to the
        debug log first).  For every other line an optional leading line
        number (everything up to the first ``". "``) is dropped, commas
        are removed according to ``komma_exceptionex``, matches of
        ``boundsex`` become spaces, matches of ``ignorex`` are deleted,
        and the remaining text is split at spaces.

        Returns the list of non-empty, whitespace-stripped words.
        """
        logging.debug("cdliSplitter: %s"%self.indexName)
        result = []

        for t in lst:
            # Normalise line breaks.  str.replace returns a new string, so
            # the result has to be assigned (it used to be thrown away,
            # which left text with bare "\r" line ends as one single line).
            t = t.replace("\r", "\n")

            for s in t.split("\n"):
                if isinstance(s, str):
                    # byte string (Python 2): decode to unicode
                    s = unicode(s, self.default_encoding, 'replace')

                if s == '':
                    continue

                if s[0] == '&':
                    # header line: only log the identifier that follows '&'
                    logging.debug("%s processing: %s"%(self.indexName,s[1:8]))
                    continue

                if s[0] in ignoreLines:
                    continue

                # Regular line: strip the line number, if there is one.
                lineparts = s.split(". ", 1)
                if len(lineparts) == 1:
                    txt = s
                else:
                    txt = lineparts[1]

                # delete commas except those relevant for graphemes
                # NOTE(review): the bounds patterns in this module also
                # contain ',', so commas kept here are turned into spaces
                # by the next step -- confirm this is intended.
                txt = komma_exceptionex.sub(r"\1", txt)
                # replace word boundaries by spaces
                txt = self.boundsex.sub(' ', txt)
                # remove characters to be ignored
                txt = self.ignorex.sub('', txt)

                # split at spaces and keep the non-empty, stripped words
                for w in txt.split(" "):
                    w = w.strip()
                    if w != '':
                        result.append(w)

        return result
1.2 dwinter 102:
103:
class graphemeSplitter(cdliSplitter):
    """Splitter configured with the grapheme boundary and ignore patterns."""

    indexName = "graphemeSplitter"

    # pattern sources
    bounds = graphemeBounds
    ignore = graphemeIgnore
    # compiled forms of the patterns above
    boundsex = re.compile(bounds)
    ignorex = re.compile(ignore)
110:
class wordSplitter(cdliSplitter):
    """Splitter configured with the word boundary and ignore patterns."""

    indexName = "wordSplitter"

    # pattern sources
    bounds = wordBounds
    ignore = wordIgnore
    # compiled forms of the patterns above
    boundsex = re.compile(bounds)
    ignorex = re.compile(ignore)
117:
# Register both splitters in the 'Word Splitter' group of the ZCTextIndex
# pipeline factory.  Only the "already registered" error (ValueError) is
# ignored, so that any other failure is no longer silently hidden.
try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI word splitter', wordSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass
1.1 dwinter 131:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>