"""CDLI word and grapheme splitter.

Splitters for the Zope ZCTextIndex pipeline that tokenize CDLI (ATF)
transliteration lines into words or graphemes.

Reconstructed from a broken merge of revisions 1.7.2.1 (2007/10/06)
and 1.8 (2008/01/21).
"""
|
|
import codecs
import logging
import os
import re
from types import StringType

import Zope2
import transaction

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

try:
    import PyLucene
except:
    print "no Lucene support"
|
|
|
def getSupportedEncoding(encodings):
    """Return the first encoding name in `encodings` that this Python
    installation actually supports; fall back to 'utf-8'.

    NOTE(review): the body of this function was lost in a broken merge;
    this is a conservative reconstruction -- confirm against revision
    history.
    """
    for encoding in encodings:
        try:
            # probe the codec registry; raises LookupError if unknown
            codecs.lookup(encoding)
            return encoding
        except LookupError:
            pass
    return 'utf-8'
# lines starting with one of these ATF control characters are not indexed
ignoreLines = ['$', '@', '#', '&', '>']
separators = ['']

# kommas relevant for graphemes will not be deleted
komma_exception = "([^sStThH]),"
komma_exceptionex = re.compile(komma_exception)

# grapheme deletion / boundary / ignore character classes
deleteGraphems = "\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"  # for graphems
graphemeBounds = "\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\""
graphemeIgnore = "<|>|\#|\||\]|\[|\!|\?\*|;"

# word deletion / boundary / ignore character classes
deleteWords = "<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"  # for words
wordBounds = "_|,|\""
wordIgnore = "<|>|\#|\||\]|\[|\!|\?\*|;"


class IndexLine(object):
    """Index a single line with Lucene (PyLucene).

    Creating an instance opens (or creates) the index at `storeDir`,
    writes one document and closes the writer again.
    """

    def __init__(self, storeDir, analyzer, name, line, content):
        # name/line/content: document id (pNum), line number and the
        # already-normalised text to index
        logging.error("i am here %s %s %s %s %s"%((storeDir, analyzer,name,line,content)))
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        writer.setMaxFieldLength(1048576)
        self.indexDocs(writer, name, line, content)
        writer.optimize()
        writer.close()

    def indexDocs(self, writer, name, line, content):
        """Add one document with fields name/line/contents to `writer`."""
        doc = PyLucene.Document()
        # BUG FIX: the original referenced undefined names `pn` and `i`;
        # use the method parameters instead.
        doc.add(PyLucene.Field("name", name,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
        doc.add(PyLucene.Field("line", str(line),
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
        # NOTE(review): the original stored `line` in the "contents"
        # field; `content` is almost certainly what was meant -- confirm.
        doc.add(PyLucene.Field("contents", content,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))
        writer.addDocument(doc)
|
|
|
class cdliSplitter:
    """Base class for the CDLI splitters.

    The difference between the word and the grapheme splitter is only
    the boundary/ignore character lists (class attributes below,
    overridden by the subclasses).
    """

    default_encoding = "utf-8"
    # deletion pattern (kept for the lucene code path of the older revision)
    delete = deleteGraphems
    # boundary characters are replaced by spaces
    bounds = graphemeBounds
    boundsex = re.compile(graphemeBounds)
    # ignored characters are removed entirely
    ignore = graphemeIgnore
    ignorex = re.compile(graphemeIgnore)
    indexName = "cdliSplitter"

    def process(self, lst):
        """Gets a list of strings and returns a list of words (graphemes)."""
        logging.debug("cdliSplitter: %s"%self.indexName)
        result = []
        pNum = None
        lineNum = None
        for t in lst:
            # normalise line breaks
            # BUG FIX: str.replace returns a new string; the original
            # discarded the result, so "\r" was never normalised.
            t = t.replace("\r", "\n")
            # split lines
            for s in t.split("\n"):
                if type(s) is StringType:
                    # not unicode yet -- decode with replacement
                    s = unicode(s, self.default_encoding, 'replace')
                if s != '':
                    if s[0] == '&':
                        # "&P123456 ..." header line: store pNum
                        pNum = s[1:8]
                        logging.debug("%s processing: %s"%(self.indexName, pNum))
                    elif not (s[0] in ignoreLines):
                        # regular line: split off the line number
                        lineparts = s.split(". ", 1)
                        if len(lineparts) == 1:
                            # no line number
                            txt = s
                        else:
                            txt = lineparts[1]
                            # store line number (used by the lucene path
                            # in the older revision)
                            lineNum = lineparts[0]
                        # delete kommata except kommata relevant for graphemes
                        txt = komma_exceptionex.sub(r"\1", txt)
                        # replace word boundaries by spaces
                        txt = self.boundsex.sub(' ', txt)
                        # remove letters to be ignored
                        txt = self.ignorex.sub('', txt)
                        # split words
                        for w in txt.split(" "):
                            w = w.strip()
                            if w != '':
                                result.append(w)
        return result
|
|
|
|
class graphemeSplitter(cdliSplitter):
    """Splitter that tokenizes on grapheme boundaries."""
    delete = deleteGraphems
    bounds = graphemeBounds
    boundsex = re.compile(graphemeBounds)
    ignore = graphemeIgnore
    ignorex = re.compile(graphemeIgnore)
    indexName = "graphemeSplitter"
|
|
class wordSplitter(cdliSplitter):
    """Splitter that tokenizes on word boundaries."""
    delete = deleteWords
    bounds = wordBounds
    boundsex = re.compile(wordBounds)
    ignore = wordIgnore
    ignorex = re.compile(wordIgnore)
    indexName = "wordSplitter"
|
|
class luceneSplitter(cdliSplitter):
    """Word splitter variant for the Lucene index.

    NOTE(review): the older revision's process() additionally fed each
    line to IndexLine when indexName == "luceneSplitter"; that branch
    was dropped in the newer revision reconstructed here -- confirm
    which behaviour is wanted.
    """
    delete = deleteWords
    indexName = "luceneSplitter"
|
# register the splitters with the ZCTextIndex pipeline factory
try:
    element_factory.registerFactory('Word Splitter',
        'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

try:
    element_factory.registerFactory('Word Splitter',
        'CDLI lucene splitter', luceneSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass
|
if __name__ == '__main__': |
|
a = 'abc def我们的很 好。' |
|
u = unicode(a, 'gbk') |
|
s = authorSplitter() |
|
print s.process([u]) |
|
print s.process([u], 1) |
|