--- cdli/cdli_files.py	2007/10/22 16:26:40	1.80.2.3
+++ cdli/cdli_files.py	2007/10/24 20:36:07	1.80.2.4
@@ -28,6 +28,9 @@ import copy
 import codecs
 import sys
 
+import cdliSplitter
+
+
 def unicodify(s):
     """decode str (utf-8 or latin-1 representation) into unicode object"""
     if not s:
@@ -50,6 +53,22 @@ def utf8ify(s):
     else:
         return s.encode('utf-8')
 
+def formatAtfLineHtml(l, nolemma=True):
+    """Return the ATF line `l` with &, < and > replaced by HTML entities.
+
+    An empty line gives "". If `nolemma` is true, a line whose first
+    non-blank characters are '#lem:' (a lemma line) gives "" as well.
+    """
+    isLemma = nolemma and l and l.lstrip().startswith('#lem:')
+    if not l or isLemma:
+        return ""
+
+    # '&' goes first, otherwise the entities added for < and > would be escaped again
+    escaped = l
+    for char, entity in (('&','&amp;'), ('<','&lt;'), ('>','&gt;')):
+        escaped = escaped.replace(char, entity)
+    return escaped
+
 
 def generateXMLReturn(hash):
     """erzeugt das xml file als returnwert fuer uploadATFRPC"""
@@ -1812,6 +1831,7 @@ class CDLIFileFolder(extVersionedFileFol
     
     def getFile(self,fn):
         """get the content of the file fn"""
+        logging.debug("getFile: %s"%repr(fn))
         if not self.hasObject(fn):
             # search deeper
             founds=self.CDLICatalog.search({'title':fn})
@@ -2030,6 +2050,13 @@ class CDLIRoot(Folder):
     meta_type="CDLIRoot"
     downloadCounterBaskets=0 # counts the current basket downloads if counter > 10 no downloads are possible
     
+    file_catalog = 'CDLICatalog' # name of the attribute holding the catalog that searchText queries
+    
+    # splitters that break a search term into words, keyed by index name
+    splitter = {'words':cdliSplitter.wordSplitter(),
+                'graphemes':cdliSplitter.graphemeSplitter()}
+    
+    
     def deleteFiles(self,ids):
         """delete files"""
         for id in ids:
@@ -2042,6 +2069,47 @@ class CDLIRoot(Folder):
 
 
 
+    def searchText(self, query, index='words'):
+        """Search `query` in the catalog index `index` and return the P-numbers found.
+
+        The catalog is the attribute of self named by `self.file_catalog`; for
+        every hit the first seven characters of its `getId` value are returned.
+        """
+        # dictionary-style index query: {index name: {'query': search term}}
+        idxQuery = {index: {'query': query}}
+        hits = getattr(self, self.file_catalog).search(idxQuery)
+        # put only the P-Number in the result
+        return [hit.getId[:7] for hit in hits]
+
+        # from PluginIndexes/common/util.py:parseIndexRequest:
+        #
+        #      The class understands the following type of parameters:
+        #
+        #    - old-style parameters where the query for an index as value inside
+        #      the request directory where the index name is the name of the key.
+        #      Additional parameters for an index could be passed as index+"_usage" ...
+        #
+        #
+        #    - dictionary-style parameters specify a query for an index as
+        #      an entry in the request dictionary where the key corresponds to the
+        #      name of the index and the value is a dictionary with the parameters
+        #      passed to the index.
+        #
+        #      Allowed keys of the parameter dictionary:
+        #
+        #      'query'  - contains the query (either string, list or tuple) (required)
+        #
+        #      other parameters depend on the index
+        #
+        #
+        #   - record-style parameters specify a query for an index as instance of the
+        #     Record class. This happens usually when parameters from a web form use
+        #     the "record" type e.g. <input type="text" name="path.query:record:string">.
+        #     All restrictions of the dictionary-style parameters apply to the record-style
+        #     parameters
+
+
+
     def showFile(self,fileId,wholePage=False):
         """show a file
         @param fileId: P-Number of the document to be displayed
@@ -2069,54 +2137,53 @@ class CDLIRoot(Folder):
             wordlist=[word]
         
         for line in file.split("\n"):
+            line = formatAtfLineHtml(unicodify(line))
             found=False
             for word in wordlist:
-		try: # just a hack because of possible unicode errors in line
-                 if line.find(word)>-1:
+                try: # just a hack because of possible unicode errors in line
+                    if line.find(word)>-1:
                         if lineList: #liste of moeglichen Zeilennummern
-                                num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile
-
-                                if num in lineList: 
-
-                                        ret.append(line)
-                        else: # nimm alles ohne line check
+                            num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile
+                            if num in lineList: 
                                 ret.append(line)
-                        
-                        break;
-		except:
-			pass
+                            else: # nimm alles ohne line check
+                                ret.append(line)
+                            break;
+                except:
+                    pass
         return ret
+    
 
-    def tagWordInFile(self,fileId,word,lineList=None,regExp=False,indexName=""):
+    def tagWordInFile(self,fileId,word,indexName='words',regExp=False):
         """get text with word highlighted from FileId"""
+        # Result: the text from showFile, one HTML-escaped line per entry joined with
+        # '<br>'; matches of the search words (in any case) are wrapped in tagStr.
         
         file=self.showFile(fileId)
         tagStr=u'<span class="found">%s</span>'
         ret=[]
+        import re # local import, only needed for the highlighting below
+        word = word.lower() # search using lowercase
         
         if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
             wordlist=self.findWordRegExp(indexName,word)
         else:
-            wordlist=[word]
+            wordlist = self.splitter[indexName].process([word]) # split the term into words
+        # one pattern for all words, longest first so that a short word cannot
+        # shadow a longer one; '(?!)' never matches and covers an empty word list
+        words = sorted([w for w in wordlist if w], key=len, reverse=True)
+        pattern = re.compile(u'(%s)' % (u'|'.join(map(re.escape, words)) or u'(?!)'), re.I | re.U)
             
         for line in file.split("\n"):
-            line = unicodify(line)
-            found=False
-            for word in wordlist:
-                if line.find(word)>-1: #word ist gefunden dann makiere und breche die Schleife ab
-                        if lineList: #liste of moeglichen Zeilennummern
-                                num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile
-
-                                if num in lineList: 
-
-                                        ret.append(line.replace(word,tagStr%word))
-                        
-                        else: # nimm alles ohne line check
-                                ret.append(line.replace(word,tagStr%word))
-                        found=True
-                        break
-            if not found: #word wurde nicht gefunden keine makierung
-                        ret.append(line)
+            line = unicodify(line)
+            if not formatAtfLineHtml(line):
+                # empty lines and lemma lines are left out of the result
+                continue
+            # split before escaping: the search never sees entities or inserted markup
+            parts = [formatAtfLineHtml(p, False) for p in pattern.split(line)]
+            # the pattern has exactly one group, so every odd entry is a matched word
+            parts[1::2] = [tagStr % p for p in parts[1::2]]
+            ret.append(u''.join(parts))
                         
         return u'<br>\n'.join(ret)