Annotation of cdli/cdliSplitter.py, revision 1.7.2.10
1.1 dwinter 1: """
1.7.2.2 casties 2: CDLI word and grapheme splitter
1.1 dwinter 3: """
4:
5: from Products.ZCTextIndex.PipelineFactory import element_factory
6:
7: import re
1.5 dwinter 8: import logging
1.1 dwinter 9:
def getSupportedEncoding(encodings):
    """Return the first usable encoding from *encodings*.

    Each candidate is probed by decoding the one-character string 'A'
    with it.  The first candidate for which that succeeds is returned.
    If *encodings* is empty or no candidate works, 'utf-8' is returned.
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except Exception:
            # Any failure (unknown codec, undecodable input, wrong argument
            # type) only means this candidate is unusable.  Unlike a bare
            # "except:", this does not swallow KeyboardInterrupt or
            # SystemExit on Python 2.5 and later.
            continue
        return encoding
    return 'utf-8'
18:
19:
20:
21: """beta of a fulltext splitter for cdli
22:
23: """
# a line whose first character is one of these is not split into tokens
ignoreLines=['$','@','#','&','>']
separators=['']
# Commas relevant for graphemes (those directly after s, S, t, T, h or H)
# will not be deleted; any other comma that follows a character is removed.
komma_exception=r"([^sStThH]),"
komma_exceptionex=re.compile(komma_exception)
# grapheme boundaries (each match is replaced by a space)
graphemeBounds=r'\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|"'
# characters dropped from graphemes; '?' and '*' must be separate
# alternatives, otherwise only the two-character sequence '?*' matches
graphemeIgnore=r"<|>|\#|\||\]|\[|\!|\?|\*"
# for words
wordBounds=r'_|,|"'
# characters dropped from words; '?' and '*' are separate alternatives
wordIgnore=r"<|>|\#|\||\]|\[|\!|\?|\*"
1.5 dwinter 37:
class cdliSplitter:
    """Base class for the CDLI splitters.

    The word splitter and the grapheme splitter differ only in the
    boundary and ignore patterns they set as class attributes.
    """

    default_encoding = "utf-8"
    # pattern whose matches are replaced by a space (token boundaries)
    bounds=graphemeBounds
    boundsex=re.compile(graphemeBounds)
    # pattern whose matches are deleted from the text
    ignore=graphemeIgnore
    ignorex=re.compile(graphemeIgnore)
    indexName="cdliSplitter"

    def process(self, lst):
        """Split a list of strings into a flat list of tokens.

        Every string in *lst* is cut into lines.  Empty lines and lines
        starting with one of the characters in ``ignoreLines`` produce no
        tokens.  For the remaining lines the leading line number (the part
        before the first '.') is dropped, commas are removed unless they
        follow s, t or h, boundary characters become spaces, ignored
        characters are deleted, and the rest is split on spaces.

        Returns the list of non-empty, whitespace-stripped tokens in
        input order.
        """
        logging.debug("cdliSplitter: %s", self.indexName)
        result = []

        for t in lst:
            # normalise line breaks; str.replace returns a new string, so
            # the result has to be assigned back to take effect
            t = t.replace("\r","\n")
            # split lines
            for s in t.split("\n"):
                if isinstance(s, str):
                    # not unicode
                    s = unicode(s, self.default_encoding, 'replace')

                if s == '':
                    continue

                if s[0]=='&':
                    # header line: characters 1-7 hold the pNum, which is
                    # only used for the debug log
                    logging.debug("%s processing: %s", self.indexName, s[1:8])
                    continue

                if s[0] in ignoreLines:
                    continue

                # regular line
                lineparts = s.split(".")
                if len(lineparts)==1:
                    # no line number
                    txt = s
                else:
                    # drop the line number in front of the first '.'
                    # NOTE(review): only the text between the first and
                    # the second '.' is kept; anything after a second '.'
                    # is discarded -- confirm this is intended.
                    txt = lineparts[1]

                # delete kommata except kommata relevant for graphemes
                txt = komma_exceptionex.sub(r"\1",txt)
                # replace word boundaries by spaces
                txt = self.boundsex.sub(' ',txt)
                # replace letters to be ignored
                txt = self.ignorex.sub('',txt)
                # split words, dropping empty fragments
                for w in txt.split(" "):
                    w = w.strip()
                    if w:
                        result.append(w)

        return result
1.2 dwinter 100:
101:
class graphemeSplitter(cdliSplitter):
    """Splitter configured with the grapheme boundary and ignore patterns."""

    indexName = "graphemeSplitter"

    # pattern sources
    bounds = graphemeBounds
    ignore = graphemeIgnore

    # compiled forms of the patterns above
    boundsex = re.compile(bounds)
    ignorex = re.compile(ignore)
108:
class wordSplitter(cdliSplitter):
    """Splitter configured with the word boundary and ignore patterns."""

    indexName = "wordSplitter"

    # pattern sources
    bounds = wordBounds
    ignore = wordIgnore

    # compiled forms of the patterns above
    boundsex = re.compile(bounds)
    ignorex = re.compile(ignore)
115:
# Register both splitters in the ZCTextIndex 'Word Splitter' group.
# Only ValueError is ignored, so other failures are no longer hidden.
try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI word splitter', wordSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass
1.1 dwinter 129:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>