cdli/cdli_files.py - diff

Return to cdli_files.py CVS log

Up to [Repository] / cdli

Diff for /cdli/cdli_files.py between versions 1.80.2.5 and 1.80.2.6

-version 1.80.2.5, 2007/10/26 22:45:12
+version 1.80.2.6, 2007/11/02 15:37:46
  Line 53  def utf8ify(s):
      else:
          return s.encode('utf-8')
- def formatAtfLineHtml(l, nolemma=True):
+ def formatAtfHtml(l):
      """escape special ATF characters for HTML"""
      if not l:
          return ""
-     if nolemma:
-         # ignore lemma lines
-         if l.lstrip().startswith('#lem:'):
-             return ""
      # replace &
      l = l.replace('&','&amp;')
      # replace angular brackets
- Line 69  def formatAtfLineHtml(l, nolemma=True):
+ Line 65  def formatAtfLineHtml(l, nolemma=True):
      l = l.replace('>','&gt;')
      return l
+ def formatAtfLineHtml(l, nolemma=True):
+     """format ATF line for HTML"""
+     if not l:
+         return ""
+     if nolemma:
+         # ignore lemma lines
+         if l.lstrip().startswith('#lem:'):
+             return ""
+     return formatAtfHtml(l)
+ def formatAtfFullLineNum(txt, nolemma=True):
+     """format full line numbers in ATF text"""
+     # surface codes
+     surfaces = {'@obverse':'obv',
+                 '@reverse':'rev',
+                 '@surface':'surface',
+                 '@edge':'edge',
+                 '@left':'left',
+                 '@right':'right',
+                 '@top':'top',
+                 '@bottom':'bottom',
+                 '@face':'face',
+                 '@seal':'seal'}
+     if not txt:
+         return ""
+     ret = []
+     surf = ""
+     col = ""
+     for line in txt.split("\n"):
+         line = unicodify(line)
+         if line and line[0] == '@':
+             # surface or column
+             words = line.split(' ')
+             if words[0] in surfaces:
+                 surf = line.replace(words[0],surfaces[words[0]]).strip()
+             elif words[0] == '@column':
+                 col = words[1]
+         elif line and line[0] in '123456789':
+             # ordinary line -> add line number
+             line = "%s:%s:%s"%(surf,col,line)
+         ret.append(line)
+     return '\n'.join(ret)
  def generateXMLReturn(hash):
      """erzeugt das xml file als returnwert fuer uploadATFRPC"""
- Line 1584  class CDLIFile(extVersionedFile,CatalogA
+ Line 1633  class CDLIFile(extVersionedFile,CatalogA
      content_meta_type = ["CDLI File Object"]
      default_catalog='CDLICatalog'
      security.declareProtected('manage','index_html')
      def getLastVersionData(self):
- Line 1594  class CDLIFile(extVersionedFile,CatalogA
+ Line 1644  class CDLIFile(extVersionedFile,CatalogA
          """get last version data"""
          return self.getContentObject().getFormattedData()
+     def getTextId(self):
+         """returns P-number of text"""
+         # assuming that it's the first 7 characters of the title
+         return self.title[:7]
      #security.declarePublic('history')
      def history(self):
          """history"""
- Line 1796  class CDLIFileFolder(extVersionedFileFol
+ Line 1851  class CDLIFileFolder(extVersionedFileFol
      file_meta_type=['CDLI file']
      folder_meta_type=['CDLI Folder']
-     default_catalog='CDLICatalog'
+     file_catalog='CDLICatalog'
-     defaultFileCatalog=default_catalog #wenn dieses definiert ist, wird beim hinzufuegen einer neuen version eines files dieser catalog neuindiziert
      #downloadCounter=0 # counts how many downloads are currently running for all files; more than 5 are refused.
      tmpStore2={}
- Line 1834  class CDLIFileFolder(extVersionedFileFol
+ Line 1889  class CDLIFileFolder(extVersionedFileFol
          logging.debug("getFile: %s"%repr(fn))
          if not self.hasObject(fn):
              # search deeper
-             founds=self.CDLICatalog.search({'title':fn})
+             founds=getattr(self, self.file_catalog).search({'textid':fn})
              if founds:
                  obj=founds[0].getObject().getContentObject()
              else:
- Line 1937  class CDLIFileFolder(extVersionedFileFol
+ Line 1992  class CDLIFileFolder(extVersionedFileFol
          def sortF(x,y):
              return cmp(x[0],y[0])
-         catalog=getattr(self,self.default_catalog)
+         catalog=getattr(self,self.file_catalog)
          #tf,tfilename=mkstemp()
          if not hasattr(self.temp_folder,'downloadCounter'):
              self.temp_folder.downloadCounter=0
- Line 2069  class CDLIRoot(Folder):
+ Line 2124  class CDLIRoot(Folder):
-     def searchText(self, query, index='words'):
+     def searchText(self, query, index='graphemes'):
          """searches query in the fulltext index and returns a list of file ids/P-numbers"""
+         # see also: http://www.plope.com/Books/2_7Edition/SearchingZCatalog.stx#2-13
          idxQuery = {index:{'query':query}}
          idx = getattr(self, self.file_catalog)
-         results = []
          # do search
-         resultset = idx.search(idxQuery)
+         resultset = idx.search(query_request=idxQuery,sort_index='textid')
-         for res in resultset:
              # put only the P-Number in the result
-             results.append(res.getId[:7])
+         results = [res.getId[:7] for res in resultset]
          return results
-         # from PluginINdexes.common.util.py:parseIndexRequest:
-         #
+     def getFile(self, pnum):
-         #      The class understands the following type of parameters:
+         """get the translit file with the given pnum"""
-         #
+         f = getattr(self, self.file_catalog).search({'textid':pnum})
-         #    - old-style parameters where the query for an index as value inside
+         if not f:
-         #      the request directory where the index name is the name of the key.
+             return ""
-         #      Additional parameters for an index could be passed as index+"_usage" ...
-         #
+         return f[0].getObject().getData()
-         #
-         #    - dictionary-style parameters specify a query for an index as
-         #      an entry in the request dictionary where the key corresponds to the
-         #      name of the index and the key is a dictionary with the parameters
-         #      passed to the index.
-         #
-         #      Allowed keys of the parameter dictionary:
-         #
-         #      'query'  - contains the query (either string, list or tuple) (required)
-         #
-         #      other parameters depend on the the index
-         #
-         #
-         #   - record-style parameters specify a query for an index as instance of the
-         #     Record class. This happens usually when parameters from a web form use
-         #     the "record" type e.g. <input type="text" name="path.query:record:string">.
-         #     All restrictions of the dictionary-style parameters apply to the record-style
-         #     parameters
- Line 2114  class CDLIRoot(Folder):
+ Line 2150  class CDLIRoot(Folder):
          """show a file
          @param fileId: P-Number of the document to be displayed
          """
-         f=self.CDLICatalog({'title':fileId})
+         f=getattr(self, self.file_catalog).search({'textid':fileId})
          if not f:
              return ""
- Line 2128  class CDLIRoot(Folder):
+ Line 2164  class CDLIRoot(Folder):
      def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False,):
          """get lines with word from FileId"""
-         file=self.showFile(fileId)
+         file = formatAtfFullLineNum(self.getFile(fileId))
          ret=[]
-         # search using lowercase
-         word = word.lower()
+         # extend the splitter bounds expression: a bound or whitespace before the word, a bound, whitespace or end of string after it
-         if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
+         bounds = self.splitter[indexName].bounds
-             wordlist=self.findWordRegExp(indexName,word)
+         splitexp = "(%s|\s)(%%s)(%s|\s|\Z)"%(bounds,bounds)
-         else:
+         # compile into regexp objects
-             # split the search term into words according to the corresponding splitter
+         wordlist = [re.compile(splitexp%w,re.IGNORECASE) for w in word.split(' ')]
-             #try:
-             wordlist = self.splitter[indexName].process([word])
-             #except:
-             #    wordlist=[word]
          for line in file.split("\n"):
-             line = formatAtfLineHtml(unicodify(line))
-             if not line:
-                 # formatAtf can produce empty lines
-                 continue
              for word in wordlist:
-                 if line.lower().find(word)>-1:
+                 #logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,line))
+                 if word.search(line):
+                     line = formatAtfLineHtml(line)
                      ret.append(line)
+                     break
          return ret
      def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False):
          """get text with word highlighted from FileId"""
-         file=self.showFile(fileId)
+         file=self.getFile(fileId)
-         tagStr=u'<span class="found">%s</span>'
+         tagStart=u'<span class="found">'
+         tagEnd=u'</span>'
+         tagStr=tagStart + u'%%s' + tagEnd
          ret=[]
-         # search using lowercase
-         word = word.lower()
-         if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
+         # add whitespace to splitter bounds expressions and compile into regexp object
-             wordlist=self.findWordRegExp(indexName,word)
+         bounds = self.splitter[indexName].bounds
-         else:
+         wordsplit = re.compile("(%s|\s)"%bounds)
-             # split the search term into words according to the corresponding splitter
+         # split search terms by blanks
-             #try:
+         words = word.split(' ')
-             wordlist = self.splitter[indexName].process([word])
-             #except:
-             #    wordlist=[word]
          for line in file.split("\n"):
-             line = formatAtfLineHtml(unicodify(line))
+             line = unicodify(line)
-             if not line:
+             # ignore lemma lines
-                 # formatAtf can produce empty lines
+             if line.lstrip().startswith('#lem:'):
                  continue
-             for w in wordlist:
+             # first scan
-                 if line.lower().find(w)>-1:
+             hitwords = []
-                     #word ist gefunden dann makiere
+             for w in words:
-                     line = line.replace(w,tagStr%w)
+                 if line.find(w) > -1:
+                     # word is in line
+                     hitwords.append(w)
+             # examine hits closer
+             if hitwords:
+                 # split line into words
+                 parts = wordsplit.split(line)
+                 line = ""
+                 for p in parts:
+                     # reassemble line
+                     if p in hitwords:
+                         # this part was found
+                         line += tagStart + formatAtfHtml(p) + tagEnd
+                     else:
+                         line += formatAtfHtml(p)
+             else:
+                 # no hits
+                 line = formatAtfHtml(line)
              ret.append(line)
          return u'<br>\n'.join(ret)
      def URLquote(self,str):
          """quote url"""
          return urllib.quote(str)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>

Removed from v.1.80.2.5
changed lines
	Added in v.1.80.2.6