cdli/cdliSplitter.py - diff

Return to cdliSplitter.py CVS log

Up to [Repository] / cdli

Diff for /cdli/cdliSplitter.py between versions 1.4 and 1.9

-version 1.4, 2007/02/08 12:00:23
+version 1.9, 2008/09/25 12:37:55
  Line 1
  """
- Author splitter
+ CDLI word and grapheme splitter
  """
- import Zope2
- import transaction
- from Products.ZCTextIndex.ISplitter import ISplitter
  from Products.ZCTextIndex.PipelineFactory import element_factory
  import re
- from types import StringType
+ import logging
  def getSupportedEncoding(encodings):
      for encoding in encodings:
- Line 25  def getSupportedEncoding(encodings):
+ Line 21  def getSupportedEncoding(encodings):
  """beta of a fulltext splitter for cdli
  """
- ignoreLines=['$','@','#','&']
+ ignoreLines=['$','@','#','&','>']
  separators=['']
- komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted
+ # commas relevant for graphemes will not be deleted
- delete="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems
+ komma_exception="([^sStThH]),"
- #delete="<|>|\(|\)|_|\#|,|\||\]|\[|!|?" for words
+ komma_exceptionex=re.compile(komma_exception)
+ # grapheme boundaries
+ #graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
+ graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\""
+ graphemeIgnore="<|>|\#|\||\]|\[|\!|\?\*|;"
+ # for words
+ #wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
+ wordBounds="_|,|\""
+ wordIgnore="<|>|\#|\||\]|\[|\!|\?\*|;"
+ class cdliSplitter:
+     """base class for splitter.
+     the difference between word and grapheme splitter
+     is the word boundary list."""
- class graphemeSplitter:
      default_encoding = "utf-8"
+     bounds=graphemeBounds
+     boundsex=re.compile(graphemeBounds)
+     ignore=graphemeIgnore
+     ignorex=re.compile(graphemeIgnore)
+     indexName="cdliSplitter"
      def process(self, lst):
+         """gets a list of strings and returns a list of words"""
+         logging.debug("cdliSplitter: %s"%self.indexName)
          result = []
          pNum=None
          lineNum=None
-         #print "LLLL",lst
          for t in lst:
+             # normalise line breaks
           t.replace("\r","\n")
+             # split lines
           for s in t.split("\n"):
+                 if isinstance(s, str):
-             if type(s) is StringType: # not unicode
+                     # not unicode
                  s = unicode(s, self.default_encoding, 'replace')
-             #ignore lines
+                 if (s!=''):
+                     if s[0]=='&':
-             if (s!="") and (s[0]=="&"): # store pNum
+                         # store pNum
                  pNum=s[1:8]
+                         logging.debug("%s processing: %s"%(self.indexName,pNum))
-             elif (s!="") and (not (s[0] in ignoreLines)):
+                     elif not (s[0] in ignoreLines):
+                         # regular line
+                         lineparts=s.split(". ",1)
-                 #ignore everthing bevor "."
+                         if len(lineparts)==1:
-                 splitted=s.split(".")
+                             # no line number
+                             txt=s
-                 if len(splitted)==1: #kein punkt
-                     txt=splitted[0]
                  else:
-                     txt=splitted[1]
+                             #store line number
-                     lineNum=splitted[0] #store line number
+                             txt=lineparts[1]
+                             lineNum=lineparts[0]
-                 analyse=txt
+                         # delete commas except commas relevant for graphemes
-                 analyse=re.sub(komma_exception,r"\1",analyse) # delete kommata except kommata relevant in graphems
+                         txt = komma_exceptionex.sub(r"\1",txt)
+                         # replace word boundaries by spaces
-                 analyse=re.sub(delete,' ',analyse) # deletions
+                         txt = self.boundsex.sub(' ',txt)
+                         # replace letters to be ignored
-                 splitted = analyse.split(" ")
+                         txt = self.ignorex.sub('',txt)
+                         # split words
-                 for w in splitted:
+                         words = txt.split(" ")
-                     w=w.lstrip().rstrip()
+                         for w in words:
+                             w=w.strip()
                      if not (w==''):
-                         if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline
+                                 result.append(w)
-                             Zope.app().cdliRoot.storeInLineIndex(w.lstrip().strip(),(pNum,lineNum))
-                             transaction.get().commit()
-                         result.append(w.lstrip().rstrip())
+         #logging.debug("split '%s' into %s"%(lst,repr(result)))
          return result
+ class graphemeSplitter(cdliSplitter):
+     bounds=graphemeBounds
+     boundsex=re.compile(graphemeBounds)
+     ignore=graphemeIgnore
+     ignorex=re.compile(graphemeIgnore)
+     indexName="graphemeSplitter"
+ class wordSplitter(cdliSplitter):
+     bounds=wordBounds
+     boundsex=re.compile(wordBounds)
+     ignore=wordIgnore
+     ignorex=re.compile(wordIgnore)
+     indexName="wordSplitter"
  try:
      element_factory.registerFactory('Word Splitter',
            'CDLI grapheme splitter', graphemeSplitter)
- Line 96  except:
+ Line 122  except:
      # in case the splitter is already registered, ValueError is raised
      pass
- if __name__ == '__main__':
+ try:
-    a = 'abc def���ǵĺ� �á�'
+     element_factory.registerFactory('Word Splitter',
-    u = unicode(a, 'gbk')
+           'CDLI word splitter', wordSplitter)
-    s = authorSplitter()
+ except:
-    print s.process([u])
+     # in case the splitter is already registered, ValueError is raised
-    print s.process([u], 1)
+     pass

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>

Removed from v.1.4
changed lines
	Added in v.1.9