Annotation of cdli/cdliSplitter.py, revision 1.7.2.6
1.1 dwinter 1: """
1.7.2.2 casties 2: CDLI word and grapheme splitter
1.1 dwinter 3: """
4:
5: from Products.ZCTextIndex.PipelineFactory import element_factory
6:
7: import re
1.5 dwinter 8: import logging
1.1 dwinter 9:
def getSupportedEncoding(encodings):
    """Return the first entry of *encodings* that can decode the byte 'A'.

    Each candidate is probed in order by decoding a single ASCII byte with
    it.  The first candidate for which this succeeds is returned.

    :param encodings: iterable of codec names to try, in order of preference.
    :returns: the first usable codec name, or ``'utf-8'`` when none of the
        candidates works (including when *encodings* is empty).
    """
    # A byte string on both Python 2 and Python 3 (no b'' literal needed).
    probe = 'A'.encode('ascii')
    for encoding in encodings:
        try:
            probe.decode(encoding)
        except (LookupError, ValueError, TypeError):
            # LookupError: unknown codec; ValueError (incl. UnicodeError):
            # codec cannot decode the probe; TypeError: not a codec name.
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            continue
        return encoding
    return 'utf-8'
18:
19:
20:
21: """beta of a fulltext splitter for cdli
22:
23: """
# Lines whose first character is one of these are not indexed.
ignoreLines = ['$', '@', '#', '&', '>']
separators = ['']
# Matches a comma preceded by any character other than s, S, t, T, h or H;
# process() deletes such commas (commas relevant for graphemes are spared).
# NOTE(review): both boundary patterns below also contain ",", so the commas
# spared here are turned into spaces afterwards -- confirm this is intended.
komma_exception = r"([^sStThH]),"
# Grapheme boundaries.  An earlier revision of this pattern also split on
# "(", ")" and "|" and did not split on '"'.
graphemeBounds = r'\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|"'
# Word boundaries.  An earlier revision of this pattern also split on
# "(", ")" and "|" and did not split on '"'.
wordBounds = r'<|>|_|\#|,|\]|\[|\!|\?|"'


class cdliSplitter:
    """Base class for the CDLI splitters.

    The only difference between the word splitter and the grapheme splitter
    is the boundary pattern stored in ``bounds``.
    """

    # Encoding used to decode byte-string input.
    default_encoding = "utf-8"
    # Regular expression; every match is replaced by a space before splitting.
    bounds = graphemeBounds
    # Name used in debug log messages.
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split a list of text blocks into a flat list of words.

        Every element of *lst* is split into lines.  Empty lines and lines
        starting with one of the characters in ``ignoreLines`` are skipped.
        For each remaining line, the text up to and including the first "."
        is dropped (it is taken to be the line number), the commas matched by
        ``komma_exception`` are deleted, every match of ``self.bounds`` is
        replaced by a space, and the result is split on spaces.

        :param lst: list of strings (byte strings are decoded with
            ``default_encoding``, undecodable bytes being replaced).
        :returns: list of non-empty, whitespace-stripped words.
        """
        logging.debug("cdliSplitter: %s", self.indexName)
        text_type = type(u"")
        result = []

        for t in lst:
            if not isinstance(t, text_type):
                # Byte string: decode once, before any text processing.
                t = t.decode(self.default_encoding, 'replace')

            # Normalise line breaks.  The result has to be assigned because
            # strings are immutable; "\r\n" yields an empty line that is
            # skipped below.
            t = t.replace("\r", "\n")

            for s in t.split("\n"):
                if not s:
                    continue

                if s[0] == '&':
                    # Header line; the seven characters after "&" hold the
                    # P-number.
                    logging.debug("%s processing: %s", self.indexName, s[1:8])
                    continue

                if s[0] in ignoreLines:
                    continue

                # Regular line: drop the line number, if there is one.  Only
                # the first "." separates it from the text, so dots inside
                # the text (e.g. "[...]") do not truncate the line.
                lineparts = s.split(".", 1)
                if len(lineparts) == 1:
                    txt = s
                else:
                    txt = lineparts[1]

                # Delete commas except those relevant for graphemes.
                txt = re.sub(komma_exception, r"\1", txt)
                # Replace boundaries by spaces.
                txt = re.sub(self.bounds, ' ', txt)
                # Split on single spaces only (other whitespace stays inside
                # a word until stripped from its ends).
                stripped = [w.strip() for w in txt.split(" ")]
                result.extend([w for w in stripped if w])

        logging.debug("split '%s' into %r", lst, result)
        return result
1.2 dwinter 93:
94:
class graphemeSplitter(cdliSplitter):
    """Splitter that uses ``graphemeBounds`` as its boundary pattern."""

    indexName = "graphemeSplitter"
    bounds = graphemeBounds
98:
class wordSplitter(cdliSplitter):
    """Splitter that uses ``wordBounds`` as its boundary pattern."""

    indexName = "wordSplitter"
    bounds = wordBounds
102:
# Register both splitters in the 'Word Splitter' group of the ZCTextIndex
# pipeline factory.  Only the "already registered" case is ignored, so any
# other failure during registration is no longer hidden.
try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI word splitter', wordSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass
1.1 dwinter 116:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>