cdli/cdli_files.py - diff

Return to cdli_files.py CVS log

Up to [Repository] / cdli

Diff for /cdli/cdli_files.py between versions 1.80.2.1 and 1.80.2.5

-version 1.80.2.1, 2007/10/06 13:44:46
+version 1.80.2.5, 2007/10/26 22:45:12
  Line 28  import copy
  import codecs
  import sys
+ import cdliSplitter
  def unicodify(s):
      """decode str (utf-8 or latin-1 representation) into unicode object"""
      if not s:
- Line 50  def utf8ify(s):
+ Line 53  def utf8ify(s):
      else:
          return s.encode('utf-8')
+ def formatAtfLineHtml(l, nolemma=True):
+     """escape special ATF characters for HTML"""
+     if not l:
+         return ""
+     if nolemma:
+         # ignore lemma lines
+         if l.lstrip().startswith('#lem:'):
+             return ""
+     # replace &
+     l = l.replace('&','&amp;')
+     # replace angular brackets
+     l = l.replace('<','&lt;')
+     l = l.replace('>','&gt;')
+     return l
  def generateXMLReturn(hash):
      """erzeugt das xml file als returnwert fuer uploadATFRPC"""
- Line 912  class CDLIBasket(Folder,CatalogAware):
+ Line 931  class CDLIBasket(Folder,CatalogAware):
      def searchInBasket(self,indexName,searchStr,regExp=False):
          """searchInBasket"""
-         lst=self.searchInLineIndexDocs(indexName,searchStr,uniq=True,regExp=regExp)
+         lst=self.searchInLineIndexDocs(indexName,searchStr,uniq=True,regExp=regExp) #TODO: fix this
          ret={}
          lv=self.getLastVersion()
- Line 1778  class CDLIFileFolder(extVersionedFileFol
+ Line 1797  class CDLIFileFolder(extVersionedFileFol
      folder_meta_type=['CDLI Folder']
      default_catalog='CDLICatalog'
-     defaultFileCatalog=default_catalog #wenn dieses definiert ist, wird beim hinzufügen einer neuen version eines files dieser catalog neuiniziert
+     defaultFileCatalog=default_catalog #wenn dieses definiert ist, wird beim hinzufuegen einer neuen version eines files dieser catalog neuindiziert
      #downloadCounter=0 # counts how many downloads for all files are currently running; with more than 5, further downloads are refused.
      tmpStore2={}
- Line 1789  class CDLIFileFolder(extVersionedFileFol
+ Line 1808  class CDLIFileFolder(extVersionedFileFol
      def delete(self,ids):
-         """delete this file, i.e. move into a trash folder"""
+         """delete these files"""
-         found=self.ZopeFind(self,obj_ids=['.trash'])
-         if len(found)<1:
-             manage_addCDLIFileFolder(self, '.trash',title="Trash")
-             trash=self._getOb('.trash')
-         else:
-             trash=found[0][1]
          if type(ids) is not ListType:
              ids=[ids]
-         cut=self.manage_cutObjects(ids)
-         trash.manage_pasteObjects(cut)
+         self.manage_delObjects(ids)
      def getVersionNumbersFromIds(self,ids):
          """get the numbers of the current versions of documents described by their ids"""
- Line 1820  class CDLIFileFolder(extVersionedFileFol
+ Line 1831  class CDLIFileFolder(extVersionedFileFol
      def getFile(self,fn):
          """get the content of the file fn"""
+         logging.debug("getFile: %s"%repr(fn))
+         if not self.hasObject(fn):
+             # search deeper
          founds=self.CDLICatalog.search({'title':fn})
-         if not founds:
+             if founds:
+                 obj=founds[0].getObject().getContentObject()
+             else:
              return ""
          else:
-             obj=founds[0].getObject().getContentObject()
+             obj = self[fn].getContentObject()
              return obj.getData()[0:]
      def checkCatalog(self,fn):
          """check if fn is in the catalog"""
          #TODO add checkCatalog
      def findObjectsFromListWithVersion(self,list,author=None):
          """find objects from a list with versions
          @param list: list of tuples  (cdliFile,version)
          """
          #self.REQUEST.SESSION['fileIds']=list#store fieldIds in session for further usage
          #self.REQUEST.SESSION['searchList']=self.REQUEST.SESSION['fileIds']
          pt=getattr(self,'filelistVersioned.html')
          return pt(search=list,author=author)
- Line 2038  class CDLIRoot(Folder):
+ Line 2050  class CDLIRoot(Folder):
      meta_type="CDLIRoot"
      downloadCounterBaskets=0# counts the current basket downloads if counter > 10 no downloads are possible
-     def deleteFiles(self,ids):
+     file_catalog = 'CDLICatalog'
-         """delete files (resp. move into .trash folder)"""
-         # find or generete trash folder
-         found=self.ZopeFind(self,obj_ids=['.trash'])
+     # word splitter for search
+     splitter = {'words':cdliSplitter.wordSplitter(),
+                 'graphemes':cdliSplitter.graphemeSplitter()}
-         if len(found)<1:
-             manage_addCDLIFileFolder(self, '.trash',title="Trash")
-             trash=self._getOb('.trash')
-         else:
-             trash=found[0][1]
+     def deleteFiles(self,ids):
+         """delete files"""
          for id in ids:
              founds=self.CDLICatalog.search({'title':id.split(".")[0]})
              if founds:
-                 logging.info(founds)
+                 logging.debug("deleting %s"%founds)
                  folder=founds[0].getObject().aq_parent #get the parent folder of the object
-                 logging.info(folder)
+                 logging.debug("deleting from %s"%folder)
-                 cut=folder.manage_cutObjects([founds[0].getId]) #cut it out
+                 cut=folder.delete([founds[0].getId]) #cut it out
-                 trash.manage_pasteObjects(cut)  #paste it in the trash
-     def findWordRegExp(self,indexName,searchTerm):
-         """find all words in index which match regexp in SearchTerm
-         @param indexName: name of the index to be searched in
-         @param searchTerm: word to be searched"""
-         ret=[]
-         for x in self.lineIndexes[indexName].iterkeys():
-             if re.match(searchTerm,x):
-                 ret.append(x)
-         return ret
-     def searchRegExpInLineIndexDocs(self,indexName,searchTerm):
-         """search in inLineIndex with regexp
-         @param indexName: name of the index to be searched in
-         @param searchTerm: term to be searched
-         """
-         if not searchTerm:
-             return []
-         ret=[]
-         words=self.findWordRegExp(indexName,searchTerm) # suche nach allen Treffern
-         logging.info("wd:%s"%words)
-         for word in words:
-             ret+=self.searchInLineIndexDocs(indexName,word)
-         x= unique(ret)
-     logging.info("words_done")
-         return x
-     def showInLineIndex(self):
-         """get the index for debug purposes"""
-         print "show"
-         for key in self.lineIndexes.keys():
-             logging.info("index:%s"%key)
-             for x in self.lineIndexes[key].iterkeys():
-                 logging.info("word:%s"%repr(x))
-                 #for y in self.lineIndex[x].iterkeys():
-                 #    print "doc",repr(y),repr(self.lineIndex[x][y])
-         return self.lineIndexes
-     def searchInLineIndexDocs(self,indexName,word,uniq=True,regExp=False):
-         """search occurences in an index
-         @param indexName: name of the index to be searched in
-         @param word: word to be searched
-         @param unique: (optional) unify the list of results
-         @param regExp: (optional) use regular expressions
-         """
-         if regExp:
-             return self.searchRegExpInLineIndexDocs(indexName,word)
-         try:
-                 lst=list(self.lineIndexes[indexName].get(word).keys())
-         except:
-             logging.error("error: searchInLineIndexDocs (%s %s)"%(sys.exc_info()[0:2]))
-             lst=[]
-         if uniq:
-             return unique(lst)
-         else:
-             return lst
-     def getLinesFromIndex(self,indexName,word,doc,regExp=False):
-         """return all lines from a document where word is found
-         @param indexName: Name of the index
-         @param word: word to be searched
-         @param doc: name of the document (usuallay the p-number)
-         @param regExp: (optional) use regExp
-         """
-         if not regExp:
-             return self.lineIndexes[indexName].get(word)[doc]
-         else: # wenn regexp, suche welches word
-             for w in self.findWordRegExp(indexName,word):
-                 if self.lineIndexes[indexName].get(w): # ein word in im dex gefunden
-                     try:
-                         dc=self.lineIndex[indexName].get(word)[doc]
-                         return dc # und ein document dann gib es zurueck
-                     except:
-                          pass #andernfalls weiter
-     def cleanInLineIndex(self,indexName):
-         """empty an InlineIndex
-         @param indexName: name of the index
-         """
-         for x in list(self.lineIndexes[indexName].keys()):
-             del(self.lineIndexes[indexName][x])
-         print [x for x in self.lineIndexes[indexName].keys()]
-         return "ok"
-     def storeInLineIndex(self,indexName,key,value):
-         """store in index, key is normally a word or grapheme
-         and value is a tuple (documentname, line) where the word can be found
-         @param indexName: name of the index
-         @param key: key in index
-         @param value: value in index, value is a tuple (document name, line)
-         """
-         logging.error("indexing: %s %s"%(indexName,key))
-         if (not hasattr(self,'lineIndexes')):
-             self.lineIndexes={}
-         if self.lineIndexes.get(indexName,None) is None:
-             #index exisitiert noch nicht dann anlegen
-             self.lineIndexes[indexName]=OOBTree()
-         lis=self.lineIndexes
-         li=lis[indexName]
-         if li.has_key(key):
+     def searchText(self, query, index='words'):
+         """searches query in the fulltext index and returns a list of file ids/P-numbers"""
- #            if li[key].has_key(value[0]) and (not (value[1] in li[key][value[0]])):
+         idxQuery = {index:{'query':query}}
-             if li[key].has_key(value[0]):
+         idx = getattr(self, self.file_catalog)
-                 tmp=li[key][value[0]]
+         results = []
-                 tmp.append(value[1]) # add it if now in the array
+         # do search
-                 li[key][value[0]]=tmp[0:]
+         resultset = idx.search(idxQuery)
-             else:
+         for res in resultset:
-                 li[key][value[0]]=[value[1]] # new array for lines
+             # put only the P-Number in the result
+             results.append(res.getId[:7])
-         else:
+         return results
-             li[key]=OOBTree()# new btree for lines
+         # from PluginIndexes.common.util.py:parseIndexRequest:
-             li[key][value[0]]=[value[1]]
+         #
+         #      The class understands the following type of parameters:
+         #
+         #    - old-style parameters where the query for an index as value inside
+         #      the request directory where the index name is the name of the key.
+         #      Additional parameters for an index could be passed as index+"_usage" ...
+         #
+         #
+         #    - dictionary-style parameters specify a query for an index as
+         #      an entry in the request dictionary where the key corresponds to the
+         #      name of the index and the value is a dictionary with the parameters
+         #      passed to the index.
+         #
+         #      Allowed keys of the parameter dictionary:
+         #
+         #      'query'  - contains the query (either string, list or tuple) (required)
+         #
+         #      other parameters depend on the index
+         #
+         #
+         #   - record-style parameters specify a query for an index as instance of the
+         #     Record class. This happens usually when parameters from a web form use
+         #     the "record" type e.g. <input type="text" name="path.query:record:string">.
+         #     All restrictions of the dictionary-style parameters apply to the record-style
+         #     parameters
-         self.lineIndexes=lis
-         transaction.get().commit()
      def showFile(self,fileId,wholePage=False):
          """show a file
- Line 2202  class CDLIRoot(Folder):
+ Line 2119  class CDLIRoot(Folder):
              return ""
          if wholePage:
-             logging.info("whole")
+             logging.debug("show whole page")
              return f[0].getObject().getContentObject().view()
          else:
              return f[0].getObject().getLastVersionFormattedData()
-     def showWordInFile(self,fileId,word,lineList=None,regExp=True,indexName=""):
+     def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False,):
          """get lines with word  fromFileId"""
          file=self.showFile(fileId)
-         logging.info("regEXP %s"%regExp)
          ret=[]
+         # search using lowercase
+         word = word.lower()
          if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
              wordlist=self.findWordRegExp(indexName,word)
          else:
-             wordlist=[word]
+             # split the search term into words according to the corresponding splitter
+             #try:
+             wordlist = self.splitter[indexName].process([word])
+             #except:
+             #    wordlist=[word]
          for line in file.split("\n"):
-             found=False
+             line = formatAtfLineHtml(unicodify(line))
+             if not line:
+                 # formatAtf can produce empty lines
+                 continue
              for word in wordlist:
-         try: # just a hack because of possible unicode errors in line
+                 if line.lower().find(word)>-1:
-                  if line.find(word)>-1:
-                         if lineList: #liste of moeglichen Zeilennummern
-                                 num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile
-                                 if num in lineList:
-                                         ret.append(line)
-                         else: # nimm alles ohne line check
                                  ret.append(line)
-                         break;
-         except:
-             pass
          return ret
-     def tagWordInFile(self,fileId,word,lineList=None,regExp=True,indexName=""):
-         """get lines with word  fromFileId"""
+     def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False):
+         """get text with word highlighted from FileId"""
          file=self.showFile(fileId)
          tagStr=u'<span class="found">%s</span>'
          ret=[]
+         # search using lowercase
+         word = word.lower()
          if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
              wordlist=self.findWordRegExp(indexName,word)
          else:
-             wordlist=[word]
+             # split the search term into words according to the corresponding splitter
+             #try:
+             wordlist = self.splitter[indexName].process([word])
+             #except:
+             #    wordlist=[word]
          for line in file.split("\n"):
-             line = unicodify(line)
+             line = formatAtfLineHtml(unicodify(line))
-             found=False
+             if not line:
-             for word in wordlist:
+                 # formatAtf can produce empty lines
-                 if line.find(word)>-1: #word ist gefunden dann makiere und breche die Schleife ab
+                 continue
-                         if lineList: #liste of moeglichen Zeilennummern
-                                 num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile
+             for w in wordlist:
+                 if line.lower().find(w)>-1:
-                                 if num in lineList:
+                     # word was found, so highlight it
+                     line = line.replace(w,tagStr%w)
-                                         ret.append(line.replace(word,tagStr%word))
-                         else: # nimm alles ohne line check
-                                 ret.append(line.replace(word,tagStr%word))
-                         found=True
-                         break
-             if not found: #word wurde nicht gefunden keine makierung
                          ret.append(line)
          return u'<br>\n'.join(ret)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>

Removed from v.1.80.2.1
changed lines
	Added in v.1.80.2.5