Annotation of cdli/cdliSplitter.py, revision 1.8
"""
CDLI word and grapheme splitter
"""
4:
5: from Products.ZCTextIndex.PipelineFactory import element_factory
6:
7: import re
1.5 dwinter 8: import logging
1.1 dwinter 9:
def getSupportedEncoding(encodings):
    """Return the first entry of *encodings* that can decode ASCII text.

    Each candidate is probed by decoding the single byte ``A`` with it.
    Candidates that raise during the probe (unknown codec name, wrong
    argument type, decoding error, ...) are skipped.

    :param encodings: iterable of encoding names, in order of preference.
    :returns: the first usable encoding name, or ``'utf-8'`` if none of
        the candidates works (or the iterable is empty).
    """
    # Build the probe as a byte string in a way that works on Python 2 and
    # Python 3 alike; the former ``unicode('A', encoding)`` call exists only
    # on Python 2 and made every candidate fail elsewhere.
    probe = 'A'.encode('ascii')
    for encoding in encodings:
        try:
            probe.decode(encoding)
        except Exception:
            # Any failure just means "not usable".  Unlike a bare ``except``
            # this does not swallow KeyboardInterrupt or SystemExit.
            continue
        return encoding
    return 'utf-8'
18:
19:
20:
# Beta of a full-text splitter for CDLI.

# Lines whose first character is listed here are not split into terms by
# cdliSplitter.process ('&' lines are handled separately there).
ignoreLines = ['$', '@', '#', '&', '>']
separators = ['']

# Commas relevant for graphemes will not be deleted: a comma that directly
# follows s, S, t, T, h or H is kept, a comma after any other character is
# removed (the preceding character is put back via group 1).
komma_exception = r"([^sStThH]),"
komma_exceptionex = re.compile(komma_exception)

# All patterns below are raw strings so that the backslashes reach the regex
# engine unchanged and no invalid-escape warnings are emitted; the resulting
# pattern text is identical to the previous non-raw literals.

# Grapheme boundaries (each match is replaced by a space).
# Earlier variant, kept for reference:
#graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
graphemeBounds = r'\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|"'
# Characters removed from grapheme terms.
# NOTE(review): the alternative "\?\*" only matches the two-character
# sequence "?*"; if "?" and "*" were each meant to be ignored a "|" is
# missing between them -- confirm before changing, it affects the index.
graphemeIgnore = r"<|>|\#|\||\]|\[|\!|\?\*|;"

# Word boundaries (each match is replaced by a space).
# Earlier variant, kept for reference:
#wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
wordBounds = r'_|,|"'
# Characters removed from word terms (same "\?\*" caveat as above).
wordIgnore = r"<|>|\#|\||\]|\[|\!|\?\*|;"
1.5 dwinter 37:
class cdliSplitter:
    """Base class for the CDLI splitters.

    The word and the grapheme splitter share the ``process`` method; they
    differ only in the boundary / ignore patterns set as class attributes.
    """

    default_encoding = "utf-8"
    # pattern source and compiled form of the term boundaries
    bounds = graphemeBounds
    boundsex = re.compile(graphemeBounds)
    # pattern source and compiled form of the characters to drop
    ignore = graphemeIgnore
    ignorex = re.compile(graphemeIgnore)
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split a list of strings into a flat list of terms.

        Every string is cut into lines.  Empty lines and lines starting
        with one of ``ignoreLines`` produce no terms (for '&' lines the
        characters 1..7 are logged as the text number).  On all other
        lines a leading "<line number>. " prefix is dropped, commas are
        removed unless ``komma_exceptionex`` spares them, matches of
        ``boundsex`` become spaces, matches of ``ignorex`` are deleted and
        the remainder is split at spaces.

        :param lst: list of strings (byte strings are decoded with
            ``default_encoding``, undecodable bytes being replaced).
        :returns: list of non-empty, whitespace-stripped terms.
        """
        logging.debug("cdliSplitter: %s", self.indexName)
        result = []

        for t in lst:
            # Normalise line breaks.  str.replace returns a new string, so
            # the result has to be re-bound (it used to be thrown away,
            # which left '\r' line breaks unsplit).
            t = t.replace("\r", "\n")
            for s in t.split("\n"):
                if isinstance(s, str):
                    # not unicode yet (Python 2 byte string)
                    s = unicode(s, self.default_encoding, 'replace')

                if s == '':
                    continue

                if s[0] == '&':
                    # header line: only log the text number
                    pNum = s[1:8]
                    logging.debug("%s processing: %s", self.indexName, pNum)
                    continue

                if s[0] in ignoreLines:
                    continue

                # Regular line: drop the "<line number>. " prefix if there
                # is one.  With no prefix the split yields the whole line
                # as its only (and therefore last) element.
                txt = s.split(". ", 1)[-1]

                # delete commas except those relevant for graphemes
                txt = komma_exceptionex.sub(r"\1", txt)
                # replace boundaries by spaces
                txt = self.boundsex.sub(' ', txt)
                # remove characters to be ignored
                txt = self.ignorex.sub('', txt)
                # split into terms, dropping empty ones
                for w in txt.split(" "):
                    w = w.strip()
                    if w != '':
                        result.append(w)

        return result
1.2 dwinter 100:
101:
class graphemeSplitter(cdliSplitter):
    """Splitter variant that uses the grapheme boundary/ignore patterns."""

    indexName = "graphemeSplitter"

    # pattern sources
    bounds = graphemeBounds
    ignore = graphemeIgnore
    # compiled forms of the two patterns above
    boundsex = re.compile(bounds)
    ignorex = re.compile(ignore)
108:
class wordSplitter(cdliSplitter):
    """Splitter variant that uses the word boundary/ignore patterns."""

    indexName = "wordSplitter"

    # pattern sources
    bounds = wordBounds
    ignore = wordIgnore
    # compiled forms of the two patterns above
    boundsex = re.compile(bounds)
    ignorex = re.compile(ignore)
115:
# Register both splitters in the 'Word Splitter' group of the ZCTextIndex
# pipeline factory.  Only the "already registered" case is tolerated (for
# instance when the module is loaded a second time); any other failure is
# allowed to surface instead of being silently swallowed.
try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI word splitter', wordSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass
1.1 dwinter 129:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>