--- cdli/cdli_files.py	2007/10/06 13:44:46	1.80.2.1
+++ cdli/cdli_files.py	2007/10/24 20:36:07	1.80.2.4
@@ -28,6 +28,9 @@ import copy
 import codecs
 import sys
 
+import cdliSplitter
+
+
 def unicodify(s):
     """decode str (utf-8 or latin-1 representation) into unicode object"""
     if not s:
@@ -50,6 +53,22 @@ def utf8ify(s):
     else:
         return s.encode('utf-8')
 
+def formatAtfLineHtml(l, nolemma=True):
+    """return the ATF line l with '&', '<' and '>' replaced by HTML entities"""
+    # empty (or otherwise false) input always yields the empty string
+    if not l:
+        return ""
+
+    # with nolemma set, lines whose first non-blank text is '#lem:' are dropped
+    if nolemma and l.lstrip().startswith('#lem:'):
+        return ""
+    # '&' is replaced first so that the '&' of the entities written for
+    # the angular brackets is not escaped a second time
+    escaped = l
+    for char, entity in (('&','&amp;'), ('<','&lt;'), ('>','&gt;')):
+        escaped = escaped.replace(char, entity)
+    return escaped
+
 
 def generateXMLReturn(hash):
     """erzeugt das xml file als returnwert fuer uploadATFRPC"""
@@ -912,7 +931,7 @@ class CDLIBasket(Folder,CatalogAware):
     def searchInBasket(self,indexName,searchStr,regExp=False):
         """searchInBasket"""
 
-        lst=self.searchInLineIndexDocs(indexName,searchStr,uniq=True,regExp=regExp)
+        lst=self.searchInLineIndexDocs(indexName,searchStr,uniq=True,regExp=regExp) #TODO: fix this
         ret={}
         
         lv=self.getLastVersion()
@@ -1778,7 +1797,7 @@ class CDLIFileFolder(extVersionedFileFol
     folder_meta_type=['CDLI Folder']
     
     default_catalog='CDLICatalog'
-    defaultFileCatalog=default_catalog #wenn dieses definiert ist, wird beim hinzufŸgen einer neuen version eines files dieser catalog neuiniziert
+    defaultFileCatalog=default_catalog #wenn dieses definiert ist, wird beim hinzufuegen einer neuen version eines files dieser catalog neuindiziert
     #downloadCounter=0 # counts how many download for all files currently run, be mehr als 5 wird verweigert.
     tmpStore2={}
 
@@ -1789,20 +1808,12 @@ class CDLIFileFolder(extVersionedFileFol
                                         
                                        
     def delete(self,ids):
-        """delete this file, i.e. move into a trash folder"""
-             
-        found=self.ZopeFind(self,obj_ids=['.trash'])
-        
-        if len(found)<1:
-            manage_addCDLIFileFolder(self, '.trash',title="Trash")
-            trash=self._getOb('.trash')
-        else:
-            trash=found[0][1]
-        
+        """delete the objects with the given ids (one id or a list of ids) via manage_delObjects; nothing is moved to a '.trash' folder any more"""
         if type(ids) is not ListType:
             ids=[ids]
-        cut=self.manage_cutObjects(ids)
-        trash.manage_pasteObjects(cut)
+
+        self.manage_delObjects(ids)
+
 
     def getVersionNumbersFromIds(self,ids):
         """get the numbers of the current versions of documents described by their ids"""
@@ -1820,31 +1831,32 @@ class CDLIFileFolder(extVersionedFileFol
     
     def getFile(self,fn):
         """get the content of the file fn"""
-        founds=self.CDLICatalog.search({'title':fn})
-        if not founds:
-            return "" 
+        logging.debug("getFile: %s"%repr(fn))
+        if not self.hasObject(fn):
+            # fn is not reported by self.hasObject: fall back to a title search in CDLICatalog and use the first hit
+            founds=self.CDLICatalog.search({'title':fn})
+            if founds:
+                obj=founds[0].getObject().getContentObject()
+            else:
+                return "" 
         else:
-            obj=founds[0].getObject().getContentObject()
+            obj = self[fn].getContentObject()
 
-            return obj.getData()[0:] 
+        return obj.getData()[0:] 
+ 
     
     def checkCatalog(self,fn):
         """check if fn is in the catalog"""
         #TODO add checkCatalog
-         
         
                                    
     def findObjectsFromListWithVersion(self,list,author=None):
         """find objects from a list with versions
         @param list: list of tuples  (cdliFile,version)
         """
-        
-       
-       
         #self.REQUEST.SESSION['fileIds']=list#store fieldIds in session for further usage
         #self.REQUEST.SESSION['searchList']=self.REQUEST.SESSION['fileIds']
         
-        
         pt=getattr(self,'filelistVersioned.html')
             
         return pt(search=list,author=author)
@@ -2036,162 +2048,67 @@ class CDLIRoot(Folder):
     """main folder for cdli"""
     
     meta_type="CDLIRoot"
-    downloadCounterBaskets=0# counts the current basket downloads if counter > 10 no downloads are possible
+    downloadCounterBaskets=0 # counts the current basket downloads if counter > 10 no downloads are possible
+    
+    file_catalog = 'CDLICatalog'
+    
+    # word splitter for search
+    splitter = {'words':cdliSplitter.wordSplitter(),
+                'graphemes':cdliSplitter.graphemeSplitter()}
+    
     
     def deleteFiles(self,ids):
-        """delete files (resp. move into .trash folder)"""
-        # find or generete trash folder
-        
-        found=self.ZopeFind(self,obj_ids=['.trash'])
-        
-        if len(found)<1:
-            manage_addCDLIFileFolder(self, '.trash',title="Trash")
-            trash=self._getOb('.trash')
-        else:
-            trash=found[0][1]
-        
+        """delete the files named by ids: the part of each id before the first '.' is looked up as title in CDLICatalog"""
         for id in ids:
             founds=self.CDLICatalog.search({'title':id.split(".")[0]})
             if founds:
-                logging.info(founds)
+                logging.debug("deleting %s"%founds)
                 folder=founds[0].getObject().aq_parent #get the parent folder of the object
-                logging.info(folder)
-                cut=folder.manage_cutObjects([founds[0].getId]) #cut it out
-                trash.manage_pasteObjects(cut)  #paste it in the trash
+                logging.debug("deleting from %s"%folder)
+                folder.delete([founds[0].getId]) # only the first catalog hit is deleted; ids without a hit are skipped
 
 
-    def findWordRegExp(self,indexName,searchTerm):
-        """find all words in index which match regexp in SearchTerm
-        @param indexName: name of the index to be searched in
-        @param searchTerm: word to be searched"""
-        
-        ret=[]
-        for x in self.lineIndexes[indexName].iterkeys():
-            if re.match(searchTerm,x):
-                ret.append(x)
-        return ret
-    
-    def searchRegExpInLineIndexDocs(self,indexName,searchTerm):
-        """search in inLineIndex with regexp
-        @param indexName: name of the index to be searched in
-        @param searchTerm: term to be searched
-        """
-        if not searchTerm:
-            return []
-        ret=[]
-        words=self.findWordRegExp(indexName,searchTerm) # suche nach allen Treffern
-        logging.info("wd:%s"%words)
-        for word in words:
-
-            ret+=self.searchInLineIndexDocs(indexName,word)
-	
-
-        x= unique(ret)
-	logging.info("words_done")
-        return x
-
-    def showInLineIndex(self):
-        """get the index for debug purposes"""
-        print "show"
-        for key in self.lineIndexes.keys():
-            logging.info("index:%s"%key)
-            for x in self.lineIndexes[key].iterkeys():
-                logging.info("word:%s"%repr(x))
-                #for y in self.lineIndex[x].iterkeys():
-                #    print "doc",repr(y),repr(self.lineIndex[x][y])
-                    
-        return self.lineIndexes
-        
-    def searchInLineIndexDocs(self,indexName,word,uniq=True,regExp=False):
-        """search occurences in an index
-        @param indexName: name of the index to be searched in
-        @param word: word to be searched
-        @param unique: (optional) unify the list of results
-        @param regExp: (optional) use regular expressions
-        """
 
-        if regExp:
-            return self.searchRegExpInLineIndexDocs(indexName,word)
-        
-        try:    
-                
-                lst=list(self.lineIndexes[indexName].get(word).keys())
-        except:
-            logging.error("error: searchInLineIndexDocs (%s %s)"%(sys.exc_info()[0:2]))
-            lst=[]
-        if uniq:
-            return unique(lst)
-        else:
-            return lst
-        
-    def getLinesFromIndex(self,indexName,word,doc,regExp=False):
-        """return all lines from a document where word is found
-        @param indexName: Name of the index
-        @param word: word to be searched
-        @param doc: name of the document (usuallay the p-number)
-        @param regExp: (optional) use regExp       
-        """
-        
-        if not regExp:
-            return self.lineIndexes[indexName].get(word)[doc]
-        else: # wenn regexp, suche welches word
-            for w in self.findWordRegExp(indexName,word):
-                if self.lineIndexes[indexName].get(w): # ein word in im dex gefunden
-                    try:    
-                        dc=self.lineIndex[indexName].get(word)[doc]
-                        return dc # und ein document dann gib es zurueck
-                    except:
-                         pass #andernfalls weiter
-                     
-    def cleanInLineIndex(self,indexName):
-        """empty an InlineIndex
-        @param indexName: name of the index
-        """
-        for x in list(self.lineIndexes[indexName].keys()):
-            del(self.lineIndexes[indexName][x])
-        print [x for x in self.lineIndexes[indexName].keys()]
-     
-        return "ok"
-    
-    def storeInLineIndex(self,indexName,key,value):
-        """store in index, key is normally a word or grapheme 
-        and value is a tuple (documentname, line) where the word can be found
-        @param indexName: name of the index
-        @param key: key in index
-        @param value: value in index, value is a tuple (document name, line)
-        """
-        logging.error("indexing: %s %s"%(indexName,key))
-        if (not hasattr(self,'lineIndexes')):
-      
-            self.lineIndexes={}
-            
-        if self.lineIndexes.get(indexName,None) is None:
-            #index exisitiert noch nicht dann anlegen
-            
-            self.lineIndexes[indexName]=OOBTree()
-        lis=self.lineIndexes
-        li=lis[indexName]
-        
-        if li.has_key(key):
-
-#            if li[key].has_key(value[0]) and (not (value[1] in li[key][value[0]])):
-            if li[key].has_key(value[0]):
-                tmp=li[key][value[0]]
-                tmp.append(value[1]) # add it if now in the array
-                li[key][value[0]]=tmp[0:]
-            else:
-                li[key][value[0]]=[value[1]] # new array for lines
-                
-        else:
-            
-            li[key]=OOBTree()# new btree for lines
-            li[key][value[0]]=[value[1]] 
-                    
-        
-        self.lineIndexes=lis
-     
-        transaction.get().commit()
-        
+    def searchText(self, query, index='words'):
+        """search query in the fulltext index named index and return a list of file ids/P-numbers"""
+        # dictionary-style catalog query: {index name: {'query': search term}}
+        catalogQuery = {index: {'query': query}}
+        # the catalog is looked up on self by the name stored in file_catalog
+        catalog = getattr(self, self.file_catalog)
+        hits = catalog.search(catalogQuery)
+        # getId is read as an attribute of each result record (not called);
+        # only its first 7 characters, the P-number, go into the result
+        pNumbers = [hit.getId[:7] for hit in hits]
+        return pNumbers
+
+        # from PluginIndexes/common/util.py (class parseIndexRequest):
+        #
+        #      The class understands the following type of parameters:
+        #
+        #    - old-style parameters, where the query for an index is passed as a value
+        #      in the request dictionary and the index name is the name of the key.
+        #      Additional parameters for an index could be passed as index+"_usage" ...
+        #
+        #
+        #    - dictionary-style parameters specify a query for an index as
+        #      an entry in the request dictionary where the key corresponds to the
+        #      name of the index and the value is a dictionary with the parameters
+        #      passed to the index.
+        #
+        #      Allowed keys of the parameter dictionary:
+        #
+        #      'query'  - contains the query (either string, list or tuple) (required)
+        #
+        #      other parameters depend on the index
+        #
+        #
+        #   - record-style parameters specify a query for an index as instance of the
+        #     Record class. This happens usually when parameters from a web form use
+        #     the "record" type e.g. <input type="text" name="path.query:record:string">.
+        #     All restrictions of the dictionary-style parameters apply to the record-style
+        #     parameters
+
+
 
     def showFile(self,fileId,wholePage=False):
         """show a file
@@ -2202,17 +2119,17 @@ class CDLIRoot(Folder):
             return ""
         
         if wholePage:
-            logging.info("whole")
+            logging.debug("show whole page")
             return f[0].getObject().getContentObject().view()
         else:
             return f[0].getObject().getLastVersionFormattedData()
     
 
-    def showWordInFile(self,fileId,word,lineList=None,regExp=True,indexName=""):
-        """get lines with word  fromFileId"""
+    def showWordInFile(self,fileId,word,lineList=None,regExp=False,indexName=""):
+        """return the HTML-escaped lines of file fileId that contain word, restricted to the line numbers in lineList if given"""
         
         file=self.showFile(fileId)
-        logging.info("regEXP %s"%regExp)
+        logging.debug("show word regEXP %s"%regExp)
         ret=[]
         if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
             wordlist=self.findWordRegExp(indexName,word)
@@ -2220,54 +2137,53 @@ class CDLIRoot(Folder):
             wordlist=[word]
         
         for line in file.split("\n"):
+            line = formatAtfLineHtml(unicodify(line))
             found=False
             for word in wordlist:
-		try: # just a hack because of possible unicode errors in line
-                 if line.find(word)>-1:
+                try: # just a hack because of possible unicode errors in line
+                    if line.find(word)>-1:
                         if lineList: #liste of moeglichen Zeilennummern
-                                num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile
-
-                                if num in lineList: 
-
-                                        ret.append(line)
-                        else: # nimm alles ohne line check
+                            num=line.split(".")[0] # the line number is everything before the first "." of the line
+                            if num in lineList:
                                 ret.append(line)
-                        
-                        break;
-		except:
-			pass
+                        else: # no lineList given: keep every line that contains the word
+                            ret.append(line)
+                        break # one hit is enough, a line is added at most once
+                except UnicodeError:
+                    pass # word and line cannot be compared (mixed encodings): treated as no match
         return ret
+    
 
-    def tagWordInFile(self,fileId,word,lineList=None,regExp=True,indexName=""):
-        """get lines with word  fromFileId"""
+    def tagWordInFile(self,fileId,word,indexName='words',regExp=False):
+        """return the HTML-escaped text of file fileId with every occurrence of the words in word wrapped in <span class="found">"""
         
         file=self.showFile(fileId)
         tagStr=u'<span class="found">%s</span>'
         ret=[]
+        # search using lowercase
+        word = word.lower()
         
         if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
             wordlist=self.findWordRegExp(indexName,word)
         else:
-            wordlist=[word]
+            # split the search term into words according to the corresponding splitter
+            wordlist = self.splitter[indexName].process([word])
+        # one case-insensitive pattern for all words; the longest word comes first so that it wins over its own prefixes
+        words = sorted(set([w for w in wordlist if w]), key=len, reverse=True)
+        pattern = re.compile(u'|'.join([re.escape(w) for w in words]), re.IGNORECASE|re.UNICODE)
             
         for line in file.split("\n"):
-            line = unicodify(line)
-            found=False
-            for word in wordlist:
-                if line.find(word)>-1: #word ist gefunden dann makiere und breche die Schleife ab
-                        if lineList: #liste of moeglichen Zeilennummern
-                                num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile
-
-                                if num in lineList: 
-
-                                        ret.append(line.replace(word,tagStr%word))
-                        
-                        else: # nimm alles ohne line check
-                                ret.append(line.replace(word,tagStr%word))
-                        found=True
-                        break
-            if not found: #word wurde nicht gefunden keine makierung
-                        ret.append(line)
+            line = formatAtfLineHtml(unicodify(line))
+            if not line:
+                # formatAtfLineHtml returns "" for empty lines and for lemma lines
+                continue
+
+            if words:
+                # tag every hit in one pass: the matched text keeps its case, and no
+                # word can match inside the markup inserted for another word
+                line = pattern.sub(lambda m: tagStr%m.group(0), line)
+
+            ret.append(line)
                         
         return u'<br>\n'.join(ret)