--- cdli/cdli_files.py	2007/10/22 16:26:40	1.80.2.3
+++ cdli/cdli_files.py	2007/12/03 21:30:19	1.80.2.10
@@ -28,6 +28,9 @@
 import copy
 import codecs
 import sys
+import cdliSplitter
+
+
 def unicodify(s):
     """decode str (utf-8 or latin-1 representation) into unicode object"""
     if not s:
@@ -50,7 +53,72 @@ def utf8ify(s):
     else:
         return s.encode('utf-8')

+def formatAtfHtml(l):
+    """escape special ATF characters for HTML"""
+    if not l:
+        return ""
+
+    # replace &
+    l = l.replace('&','&amp;')
+    # replace angular brackets
+    l = l.replace('<','&lt;')
+    l = l.replace('>','&gt;')
+    return l
+
+def formatAtfLineHtml(l, nolemma=True):
+    """format ATF line for HTML"""
+    if not l:
+        return ""
+
+    if nolemma:
+        # ignore lemma lines
+        if l.lstrip().startswith('#lem:'):
+            return ""
+
+    return formatAtfHtml(l)
+
+
+def formatAtfFullLineNum(txt, nolemma=True):
+    """format full line numbers in ATF text"""
+    # surface codes
+    surfaces = {'@obverse':'obv',
+                '@reverse':'rev',
+                '@surface':'surface',
+                '@edge':'edge',
+                '@left':'left',
+                '@right':'right',
+                '@top':'top',
+                '@bottom':'bottom',
+                '@face':'face',
+                '@seal':'seal'}
+
+    if not txt:
+        return ""
+
+    ret = []
+    surf = ""
+    col = ""
+    for line in txt.split("\n"):
+        line = unicodify(line)
+        if line and line[0] == '@':
+            # surface or column
+            words = line.split(' ')
+            if words[0] in surfaces:
+                surf = line.replace(words[0],surfaces[words[0]]).strip()
+
+            elif words[0] == '@column':
+                col = words[1]
+
+        elif line and line[0] in '123456789':
+            # ordinary line -> add full line number
+            line = "%s:%s:%s"%(surf,col,line)
+
+        ret.append(line)
+
+    return '\n'.join(ret)
+
+
 def generateXMLReturn(hash):
     """generates the XML file as return value for uploadATFRPC"""
@@ -1565,6 +1633,7 @@ class CDLIFile(extVersionedFile,CatalogA
     content_meta_type = ["CDLI File Object"]

     default_catalog='CDLICatalog'

+    security.declareProtected('manage','index_html')

     def getLastVersionData(self):
@@ -1575,6 +1644,11 @@ class CDLIFile(extVersionedFile,CatalogA
         """get last version data"""
         return self.getContentObject().getFormattedData()

+    def getTextId(self):
+        """returns the P-number of the text"""
+        # assuming that it's the beginning of the title
+        return self.title[:7]
+
     #security.declarePublic('history')
     def history(self):
         """history"""
@@ -1776,9 +1850,9 @@ class CDLIFileFolder(extVersionedFileFol
     meta_type="CDLI Folder"
     file_meta_type=['CDLI file']
     folder_meta_type=['CDLI Folder']
-
-    default_catalog='CDLICatalog'
-    defaultFileCatalog=default_catalog # if this is defined, this catalog is reindexed when a new version of a file is added
+
+    file_catalog='CDLICatalog'
+
     #downloadCounter=0 # counts how many downloads for all files are currently running; more than 5 are refused.
     tmpStore2={}
@@ -1812,9 +1886,10 @@ class CDLIFileFolder(extVersionedFileFol
     def getFile(self,fn):
         """get the content of the file fn"""
+        logging.debug("getFile: %s"%repr(fn))
         if not self.hasObject(fn):
             # search deeper
-            founds=self.CDLICatalog.search({'title':fn})
+            founds=getattr(self, self.file_catalog).search({'textid':fn})
             if founds:
                 obj=founds[0].getObject().getContentObject()
             else:
@@ -1917,7 +1992,7 @@ class CDLIFileFolder(extVersionedFileFol
         def sortF(x,y):
             return cmp(x[0],y[0])

-        catalog=getattr(self,self.default_catalog)
+        catalog=getattr(self,self.file_catalog)
         #tf,tfilename=mkstemp()
         if not hasattr(self.temp_folder,'downloadCounter'):
             self.temp_folder.downloadCounter=0
@@ -2030,6 +2105,13 @@ class CDLIRoot(Folder):
     meta_type="CDLIRoot"
     downloadCounterBaskets=0 # counts the current basket downloads; if counter > 10 no downloads are possible

+    file_catalog = 'CDLICatalog'
+
+    # word splitter for search
+    splitter = {'words':cdliSplitter.wordSplitter(),
+                'graphemes':cdliSplitter.graphemeSplitter()}
+
+
     def deleteFiles(self,ids):
         """delete files"""
         for id in ids:
@@ -2042,11 +2124,38 @@ class CDLIRoot(Folder):


+    def searchText(self, query, index='graphemes'):
+        """searches query in the fulltext index and returns a list of file ids/P-numbers"""
+        # see also: http://www.plope.com/Books/2_7Edition/SearchingZCatalog.stx#2-13
+        logging.debug("searchtext for '%s' in index %s"%(query,index))
+        #import Products.ZCTextIndex.QueryParser
+        #qp = QueryParser.QueryParser()
+        #logging.debug()
+        idxQuery = {index:{'query':query}}
+        idx = getattr(self, self.file_catalog)
+        # do search
+        resultset = idx.search(query_request=idxQuery,sort_index='textid')
+        # put only the P-number in the result
+        results = [res.getId[:7] for res in resultset]
+        logging.debug("searchtext: found %d texts"%len(results))
+        return results
+
+
+    def getFile(self, pnum):
+        """get the translit file with the given pnum"""
+        f = getattr(self, self.file_catalog).search({'textid':pnum})
+        if not f:
+            return ""
+
+        return f[0].getObject().getData()
+
+
     def showFile(self,fileId,wholePage=False):
         """show a file
         @param fileId: P-number of the document to be displayed
         """
-        f=self.CDLICatalog({'title':fileId})
+        f=getattr(self, self.file_catalog).search({'textid':fileId})
         if not f:
             return ""
@@ -2057,69 +2166,108 @@ class CDLIRoot(Folder):
         return f[0].getObject().getLastVersionFormattedData()

-    def showWordInFile(self,fileId,word,lineList=None,regExp=False,indexName=""):
+    def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False):
         """get lines with word from FileId"""
+        logging.debug("showwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId))

-        file=self.showFile(fileId)
-        logging.debug("show word regEXP %s"%regExp)
+        file = formatAtfFullLineNum(self.getFile(fileId))
         ret=[]

-        if regExp: # if regexp, generate all words from the list that match the regexp
-            wordlist=self.findWordRegExp(indexName,word)
-        else:
-            wordlist=[word]
+        # add whitespace before, and whitespace and line-end after, the splitter bounds expressions
+        bounds = self.splitter[indexName].bounds
+        splitexp = "(%s|\s)(%%s)(%s|\s|\Z)"%(bounds,bounds)
+        # clean word expression
+        # TODO: this should use QueryParser itself
+        word = word.replace('"','') # take out double quotes
+        # escape parens for regexp too
+        # compile into regexp objects
+        wordlist = [re.compile(splitexp%re.escape(w)) for w in word.split(' ')]
+
         for line in file.split("\n"):
-            found=False
             for word in wordlist:
-                try: # just a hack because of possible unicode errors in line
-                    if line.find(word)>-1:
-                        if lineList: # list of possible line numbers
-                            num=line.split(".")[0] # the line number is everything before the . in the line
-
-                            if num in lineList:
-                                ret.append(line)
-                        else: # take every line without a line-number check
-                            ret.append(line)
-
-                        break;
-                except:
-                    pass
+                #logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,line))
+                if word.search(line):
+                    line = formatAtfLineHtml(line)
+                    ret.append(line)
+                    break
+
         return ret

-    def tagWordInFile(self,fileId,word,lineList=None,regExp=False,indexName=""):
+
+    def showWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False):
+        """
+        get lines with word from all ids in list FileIds.
+        returns dict with id:lines pairs.
+        """
+        logging.debug("showwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds))
+
+        return dict([(id,self.showWordInFile(id, word, indexName, regExp)) for id in fileIds])
+
+
+    def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False):
         """get text with word highlighted from FileId"""
+        logging.debug("tagwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId))

-        file=self.showFile(fileId)
-        tagStr=u'<span class="found">%s</span>'
+        file=self.getFile(fileId)
+        tagStart=u'<span class="found">'
+        tagEnd=u'</span>'
+        tagStr=tagStart + u'%%s' + tagEnd
         ret=[]
-        if regExp: # if regexp, generate all words from the list that match the regexp
-            wordlist=self.findWordRegExp(indexName,word)
-        else:
-            wordlist=[word]
+        # add whitespace to splitter bounds expressions and compile into regexp object
+        bounds = self.splitter[indexName].bounds
+        wordsplit = re.compile("(%s|\s)"%bounds)
+        # clean word expression
+        # TODO: this should use QueryParser itself
+        word = word.replace('"','') # take out double quotes
+        # split search terms by blanks
+        words = word.split(' ')

         for line in file.split("\n"):
             line = unicodify(line)
-            found=False
-            for word in wordlist:
-                if line.find(word)>-1: # word found: mark it and break the loop
-                    if lineList: # list of possible line numbers
-                        num=line.split(".")[0] # the line number is everything before the . in the line
-
-                        if num in lineList:
-                            ret.append(line.replace(word,tagStr%word))
-
-                    else: # take every line without a line-number check
-                        ret.append(line.replace(word,tagStr%word))
-                    found=True
-                    break
-            if not found: # word not found: no marking
-                ret.append(line)
+            # ignore lemma lines
+            if line.lstrip().startswith('#lem:'):
+                continue
+
+            # first scan
+            hitwords = []
+            for w in words:
+                if line.find(w) > -1:
+                    # word is in line
+                    hitwords.append(w)
+
+            # examine hits closer
+            if hitwords:
+                # split line into words
+                parts = wordsplit.split(line)
+                line = ""
+                for p in parts:
+                    # reassemble line
+                    if p in hitwords:
+                        # this part was found
+                        line += tagStart + formatAtfHtml(p) + tagEnd
+                    else:
+                        line += formatAtfHtml(p)
+
+            else:
+                # no hits
+                line = formatAtfHtml(line)
+
+            ret.append(line)

         return u'<br>\n'.join(ret)
+
+
+    def tagWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False):
+        """
+        get texts with highlighted word from all ids in list FileIds.
+        returns dict with id:text pairs.
+        """
+        logging.debug("tagwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds))
+        return dict([(id,self.tagWordInFile(id, word, indexName, regExp)) for id in fileIds])
+

     def URLquote(self,str):
         """quote url"""
         return urllib.quote(str)
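
Note on formatAtfFullLineNum: it prefixes every numbered ATF line with the current surface and column so a hit can be located on the tablet; the @-lines themselves are kept, and col is not reset when a new surface starts. A small illustration (the ATF content is invented), assuming the function above is importable:

atf = "\n".join([
    "@obverse",
    "@column 1",
    "1. lugal-e",
    "@reverse",
    "1. dub-sar",
])
print(formatAtfFullLineNum(atf))
# @obverse
# @column 1
# obv:1:1. lugal-e
# @reverse
# rev:1:1. dub-sar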
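
The new code imports cdliSplitter and reads a regular-expression fragment from splitter[...].bounds, but the module itself is not part of this diff. A minimal sketch of the interface that showWordInFile and tagWordInFile rely on; the class names follow the import above, while the delimiter sets and the process() method are assumptions:

import re

class wordSplitter:
    """splits ATF text into words (whitespace-delimited tokens)"""
    # regexp fragment matching a single token delimiter (assumed)
    bounds = r"[\s]"

    def process(self, texts):
        """return all tokens of a list of texts (ZCTextIndex splitter protocol)"""
        tokens = []
        for text in texts:
            tokens.extend([t for t in re.split(r"%s|\s" % self.bounds, text) if t])
        return tokens

class graphemeSplitter(wordSplitter):
    """splits ATF text into graphemes: signs additionally
    delimited by '-' and '.' inside a word (delimiter set assumed)"""
    bounds = r"[-.]"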
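
showWordInFile turns each search term into a regular expression that requires a splitter delimiter, whitespace, or the end of the text on both sides, so a term only matches a complete token. The same construction demonstrated standalone (the bounds value is an assumed grapheme delimiter set, the ATF line is invented):

import re

bounds = r"[-.]"  # assumed grapheme delimiters
# same construction as splitexp in showWordInFile
splitexp = r"(%s|\s)(%%s)(%s|\s|\Z)" % (bounds, bounds)

term = re.compile(splitexp % re.escape("ba"))
print(bool(term.search("1. ba-za e2-gal")))   # True: 'ba' is a whole grapheme
print(bool(term.search("1. bar-za e2-gal")))  # False: 'ba' is only part of 'bar'

Because the flanking groups consume a character, this pattern is suited to hit-testing with search(); findall() could miss directly adjacent hits, but the patch only tests for the first match per line, so that does not matter here.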
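
Taken together, the new CDLIRoot methods support a search-and-display workflow. A hypothetical call sequence from trusted code inside the Zope instance; cdli stands for a CDLIRoot object with a populated CDLICatalog, and the query term is invented:

# full-text search in the 'graphemes' index, returns P-numbers
pnums = cdli.searchText('lugal', index='graphemes')   # e.g. ['P100001', 'P100002']
# matching lines per text, with full surface:column:line numbers
hits = cdli.showWordInFiles(pnums, 'lugal')
# complete texts as HTML with hits wrapped in <span class="found">
pages = cdli.tagWordInFiles(pnums, 'lugal')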