Annotation of cdli/cdliSplitter.py, revision 1.7.2.7
1.1 dwinter 1: """
1.7.2.2 casties 2: CDLI word and grapheme splitter
1.1 dwinter 3: """
4:
5: from Products.ZCTextIndex.PipelineFactory import element_factory
6:
7: import re
1.5 dwinter 8: import logging
1.1 dwinter 9:
def getSupportedEncoding(encodings):
    """Return the first usable encoding from *encodings*.

    An encoding counts as usable when the byte string 'A' can be decoded
    with it.  If no candidate works (or *encodings* is empty), 'utf-8' is
    returned as the fallback.
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except Exception:
            # Unknown or unsuitable codec: try the next candidate.  Only
            # ordinary errors are swallowed, so KeyboardInterrupt and
            # SystemExit still propagate.
            continue
        return encoding
    return 'utf-8'
18:
19:
20:
# Beta of a fulltext splitter for CDLI.
#
# All patterns below are regular expressions and are written as raw strings
# so that the backslashes reach the `re` module unchanged.

# Lines whose first character is one of these are not split into words.
ignoreLines = ['$', '@', '#', '&', '>']
separators = ['']

# Commas relevant for graphemes (those following s, S, t, T, h or H) are not
# deleted; a comma after any other character is removed, keeping that
# character (group 1).
komma_exception = r"([^sStThH]),"

# Grapheme boundaries: each match is replaced by a space before splitting.
# Previous value:
#   "\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
graphemeBounds = r'\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|"'
# Nothing is stripped from the text for grapheme splitting.
graphemeIgnore = ""

# Word boundaries: each match is replaced by a space before splitting.
# Previous value:
#   "<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
wordBounds = r'_|,|"'
# Characters removed from the text (without leaving a gap) for word splitting.
wordIgnore = r"<|>|\#|\||\]|\[|\!|\?"
1.5 dwinter 36:
class cdliSplitter:
    """Base class for the CDLI splitters.

    The word and the grapheme splitter differ only in the patterns they
    use: `bounds` (matches are replaced by a space) and `ignore` (matches
    are removed).  Subclasses override these class attributes.
    """

    default_encoding = "utf-8"
    bounds = graphemeBounds
    ignore = graphemeIgnore
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split a list of strings into a flat list of words.

        Each string may contain several lines.  Empty lines and lines whose
        first character is listed in `ignoreLines` are skipped.  For the
        remaining lines everything up to the first "." is treated as the
        line number and dropped; the rest is cleaned with the `ignore` and
        `bounds` patterns and split on spaces.

        Byte strings are decoded with `default_encoding` (undecodable bytes
        are replaced), so the returned words are unicode objects.
        """
        logging.debug("cdliSplitter: %s", self.indexName)
        result = []

        for t in lst:
            # Normalise line breaks.  Strings are immutable, so the result
            # of replace() has to be assigned back; otherwise "\r" line
            # endings are never split.
            t = t.replace("\r", "\n")
            for s in t.split("\n"):
                if isinstance(s, str):
                    # not unicode yet
                    s = unicode(s, self.default_encoding, 'replace')

                if s == '':
                    continue

                if s[0] == '&':
                    # Header line: characters 1-7 are logged as the pNum
                    # (presumably the text's P-number -- confirm).
                    logging.debug("%s processing: %s", self.indexName, s[1:8])
                    continue

                if s[0] in ignoreLines:
                    continue

                # Regular line: strip the leading line number, if any.
                # Split only on the FIRST dot so that dots inside the text
                # (e.g. "[...]") do not truncate the rest of the line.
                lineparts = s.split(".", 1)
                if len(lineparts) == 1:
                    # no line number
                    txt = s
                else:
                    txt = lineparts[1]

                # delete commas except those relevant for graphemes
                txt = re.sub(komma_exception, r"\1", txt)
                # remove characters that are to be ignored
                txt = re.sub(self.ignore, '', txt)
                # replace word boundaries by spaces
                txt = re.sub(self.bounds, ' ', txt)
                # split into words, dropping empty fragments
                for w in txt.split(" "):
                    w = w.strip()
                    if w != '':
                        result.append(w)

        logging.debug("split '%s' into %r", lst, result)
        return result
1.2 dwinter 97:
98:
class graphemeSplitter(cdliSplitter):
    """Splitter that uses the grapheme boundary and ignore patterns."""

    indexName = "graphemeSplitter"
    ignore = graphemeIgnore
    bounds = graphemeBounds
103:
class wordSplitter(cdliSplitter):
    """Splitter that uses the word boundary and ignore patterns."""

    indexName = "wordSplitter"
    ignore = wordIgnore
    bounds = wordBounds
108:
# Register both splitters in the 'Word Splitter' group of the ZCTextIndex
# pipeline.  registerFactory raises ValueError when the splitter is already
# registered; only that case is ignored, so any other failure is no longer
# silently hidden.
try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # splitter is already registered
    pass

try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI word splitter', wordSplitter)
except ValueError:
    # splitter is already registered
    pass
1.1 dwinter 122:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>