Annotation of cdli/cdliSplitter.py, revision 1.7.2.2
1.1 dwinter 1: """
1.7.2.2 ! casties 2: CDLI word and grapheme splitter
1.1 dwinter 3: """
4:
5: from Products.ZCTextIndex.PipelineFactory import element_factory
6:
7: import re
1.5 dwinter 8: import logging
1.1 dwinter 9:
def getSupportedEncoding(encodings):
    """Return the first usable encoding name from *encodings*.

    An encoding counts as usable when the single byte ``b'A'`` can be
    decoded with it.  If no candidate works (or *encodings* is empty),
    ``'utf-8'`` is returned.
    """
    for encoding in encodings:
        try:
            # bytes.decode() behaves like the old unicode('A', encoding)
            # probe on Python 2 and also exists on Python 3, where the
            # unicode() builtin is gone and its NameError used to be
            # swallowed silently.
            b'A'.decode(encoding)
        except Exception:
            # Unknown codec, undecodable probe, non-string name ...:
            # this candidate is unusable, try the next one.
            continue
        return encoding
    return 'utf-8'
18:
19:
20:
21: """beta of a fulltext splitter for cdli
22:
23: """
# Lines whose first character is one of these are not tokenised.
ignoreLines=['$','@','#','&','>']
separators=['']
# A comma is removed unless it directly follows s, S, t, T, h or H
# (commas relevant for graphemes are not deleted in this step).
komma_exception=r"([^sStThH]),"
# grapheme boundaries (raw string: the backslashes belong to the regex)
graphemeBounds=r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
# word boundaries: like graphemeBounds but without '{', '}' and '-'
wordBounds=r"<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"


class cdliSplitter:
    """Base class for the splitters.

    The only difference between the word and the grapheme splitter is
    the boundary pattern stored in ``bounds``.
    """

    default_encoding = "utf-8"
    # regular expression; every match is replaced by a space
    bounds = graphemeBounds
    # name used in debug log messages
    indexName = "cdliSplitter"

    def process(self, lst):
        """Take a list of strings and return a flat list of words.

        Each string may hold several lines.  Empty lines and lines that
        start with a character from ``ignoreLines`` yield no words.  For
        the remaining lines everything up to the first period is treated
        as the line number and skipped, commas are removed according to
        ``komma_exception``, every match of ``self.bounds`` becomes a
        space, and the result is split on spaces.

        Byte strings are decoded with ``default_encoding`` (undecodable
        bytes are replaced); the returned words are text strings.
        """
        logging.debug("cdliSplitter")
        result = []
        pNum = None

        for t in lst:
            if isinstance(t, bytes):
                # not unicode yet (on Python 2 ``bytes`` is ``str``)
                t = t.decode(self.default_encoding, 'replace')

            # normalise line breaks; str.replace returns a new string,
            # so the result has to be assigned
            t = t.replace("\r", "\n")

            for s in t.split("\n"):
                if s == '':
                    continue

                if s[0] == '&':
                    # header line: remember the id that follows the '&'
                    pNum = s[1:8]
                    logging.debug("%s processing: %s", self.indexName, pNum)
                    continue

                if s[0] in ignoreLines:
                    continue

                # regular line
                lineparts = s.split(".")
                if len(lineparts) == 1:
                    # no line number
                    txt = s
                else:
                    # lineparts[0] is the line number.
                    # NOTE(review): only the text between the first and
                    # the second period is kept; anything after a second
                    # period on the line is dropped.  Confirm whether
                    # that is intended before changing it.
                    txt = lineparts[1]

                # delete commas except those relevant for graphemes
                txt = re.sub(komma_exception, r"\1", txt)
                # replace boundaries by spaces
                txt = re.sub(self.bounds, ' ', txt)
                # split into words, dropping empty fragments
                for w in txt.split(" "):
                    w = w.strip()
                    if w:
                        result.append(w)

        return result
1.2 dwinter 90:
91:
class graphemeSplitter(cdliSplitter):
    """Splitter that uses ``graphemeBounds`` as its boundary pattern."""

    indexName = "graphemeSplitter"
    bounds = graphemeBounds
95:
class wordSplitter(cdliSplitter):
    """Splitter that uses ``wordBounds`` as its boundary pattern."""

    indexName = "wordSplitter"
    bounds = wordBounds
99:
# Register both splitters in the 'Word Splitter' group of the pipeline
# element factory.  Only the "already registered" case is ignored; any
# other failure is a real error and is allowed to propagate.
try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter',
                                    graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI word splitter',
                                    wordSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass
1.1 dwinter 113:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>