--- cdli/cdli_files.py 2007/10/26 22:45:12 1.80.2.5
+++ cdli/cdli_files.py 2007/12/13 19:20:45 1.80.2.11
@@ -53,15 +53,11 @@ def utf8ify(s):
else:
return s.encode('utf-8')
-def formatAtfLineHtml(l, nolemma=True):
+def formatAtfHtml(l):
"""escape special ATF characters for HTML"""
if not l:
return ""
- if nolemma:
- # ignore lemma lines
- if l.lstrip().startswith('#lem:'):
- return ""
# replace &
l = l.replace('&','&amp;')
# replace angular brackets
@@ -69,7 +65,60 @@ def formatAtfLineHtml(l, nolemma=True):
l = l.replace('>','&gt;')
return l
+def formatAtfLineHtml(l, nolemma=True):
+ """format ATF line for HTML"""
+ if not l:
+ return ""
+
+ if nolemma:
+ # ignore lemma lines
+ if l.lstrip().startswith('#lem:'):
+ return ""
+
+ return formatAtfHtml(l)
+
+
+def formatAtfFullLineNum(txt, nolemma=True):
+ """format full line numbers in ATF text"""
+ # surface codes
+ surfaces = {'@obverse':'obv',
+ '@reverse':'rev',
+ '@surface':'surface',
+ '@edge':'edge',
+ '@left':'left',
+ '@right':'right',
+ '@top':'top',
+ '@bottom':'bottom',
+ '@face':'face',
+ '@seal':'seal'}
+
+ if not txt:
+ return ""
+
+ ret = []
+ surf = ""
+ col = ""
+ for line in txt.split("\n"):
+ line = unicodify(line)
+ if line and line[0] == '@':
+ # surface or column
+ words = line.split(' ')
+ if words[0] in surfaces:
+ surf = line.replace(words[0],surfaces[words[0]]).strip()
+
+ elif words[0] == '@column':
+ col = words[1]
+
+ elif line and line[0] in '123456789':
+ # ordinary line -> add line number
+ line = "%s:%s:%s"%(surf,col,line)
+
+ ret.append(line)
+
+ return '\n'.join(ret)
+
+
def generateXMLReturn(hash):
"""erzeugt das xml file als returnwert fuer uploadATFRPC"""
@@ -1584,6 +1633,7 @@ class CDLIFile(extVersionedFile,CatalogA
content_meta_type = ["CDLI File Object"]
default_catalog='CDLICatalog'
+
security.declareProtected('manage','index_html')
def getLastVersionData(self):
@@ -1594,6 +1644,11 @@ class CDLIFile(extVersionedFile,CatalogA
"""get last version data"""
return self.getContentObject().getFormattedData()
+ def getTextId(self):
+ """returns P-number of text"""
+ # assuming that its the beginning of the title
+ return self.title[:7]
+
#security.declarePublic('history')
def history(self):
"""history"""
@@ -1795,9 +1850,9 @@ class CDLIFileFolder(extVersionedFileFol
meta_type="CDLI Folder"
file_meta_type=['CDLI file']
folder_meta_type=['CDLI Folder']
-
- default_catalog='CDLICatalog'
- defaultFileCatalog=default_catalog #wenn dieses definiert ist, wird beim hinzufuegen einer neuen version eines files dieser catalog neuindiziert
+
+ file_catalog='CDLICatalog'
+
#downloadCounter=0 # counts how many download for all files currently run, be mehr als 5 wird verweigert.
tmpStore2={}
@@ -1834,7 +1889,7 @@ class CDLIFileFolder(extVersionedFileFol
logging.debug("getFile: %s"%repr(fn))
if not self.hasObject(fn):
# search deeper
- founds=self.CDLICatalog.search({'title':fn})
+ founds=getattr(self, self.file_catalog).search({'textid':fn})
if founds:
obj=founds[0].getObject().getContentObject()
else:
@@ -1937,7 +1992,7 @@ class CDLIFileFolder(extVersionedFileFol
def sortF(x,y):
return cmp(x[0],y[0])
- catalog=getattr(self,self.default_catalog)
+ catalog=getattr(self,self.file_catalog)
#tf,tfilename=mkstemp()
if not hasattr(self.temp_folder,'downloadCounter'):
self.temp_folder.downloadCounter=0
@@ -2069,52 +2124,38 @@ class CDLIRoot(Folder):
- def searchText(self, query, index='words'):
+ def searchText(self, query, index='graphemes'):
"""searches query in the fulltext index and returns a list of file ids/P-numbers"""
+ # see also: http://www.plope.com/Books/2_7Edition/SearchingZCatalog.stx#2-13
+ logging.debug("searchtext for '%s' in index %s"%(query,index))
+ #import Products.ZCTextIndex.QueryParser
+ #qp = QueryParser.QueryParser()
+ #logging.debug()
idxQuery = {index:{'query':query}}
idx = getattr(self, self.file_catalog)
- results = []
# do search
- resultset = idx.search(idxQuery)
- for res in resultset:
- # put only the P-Number in the result
- results.append(res.getId[:7])
+ resultset = idx.search(query_request=idxQuery,sort_index='textid')
+ # put only the P-Number in the result
+ results = [res.getId[:7] for res in resultset]
+ logging.debug("searchtext: found %d texts"%len(results))
return results
- # from PluginINdexes.common.util.py:parseIndexRequest:
- #
- # The class understands the following type of parameters:
- #
- # - old-style parameters where the query for an index as value inside
- # the request directory where the index name is the name of the key.
- # Additional parameters for an index could be passed as index+"_usage" ...
- #
- #
- # - dictionary-style parameters specify a query for an index as
- # an entry in the request dictionary where the key corresponds to the
- # name of the index and the key is a dictionary with the parameters
- # passed to the index.
- #
- # Allowed keys of the parameter dictionary:
- #
- # 'query' - contains the query (either string, list or tuple) (required)
- #
- # other parameters depend on the the index
- #
- #
- # - record-style parameters specify a query for an index as instance of the
- # Record class. This happens usually when parameters from a web form use
- # the "record" type e.g. <input type="text" name="path.query:record:string">.
- # All restrictions of the dictionary-style parameters apply to the record-style
- # parameters
+ def getFile(self, pnum):
+ """get the translit file with the given pnum"""
+ f = getattr(self, self.file_catalog).search({'textid':pnum})
+ if not f:
+ return ""
+
+ return f[0].getObject().getData()
+
def showFile(self,fileId,wholePage=False):
"""show a file
@param fileId: P-Number of the document to be displayed
"""
- f=self.CDLICatalog({'title':fileId})
+ f=getattr(self, self.file_catalog).search({'textid':fileId})
if not f:
return ""
@@ -2127,64 +2168,118 @@ class CDLIRoot(Folder):
def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False,):
"""get lines with word from FileId"""
+ logging.debug("showwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId))
- file=self.showFile(fileId)
+ file = formatAtfFullLineNum(self.getFile(fileId))
ret=[]
- # search using lowercase
- word = word.lower()
- if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
- wordlist=self.findWordRegExp(indexName,word)
- else:
- # split the search term into words according to the corresponding splitter
- #try:
- wordlist = self.splitter[indexName].process([word])
- #except:
- # wordlist=[word]
+ # add whitespace before and whitespace and line-end to splitter bounds expressions
+ bounds = self.splitter[indexName].bounds
+ splitexp = "(%s|\s)(%%s)(%s|\s|\Z)"%(bounds,bounds)
+ # clean word expression
+ # TODO: this should use QueryParser itself
+ # take out double quotes
+ word = word.replace('"','')
+ # take out ignorable signs
+ ignorable = self.splitter[indexName].ignorex
+ word = ignorable.sub('', word)
+ # compile into regexp objects and escape parens
+ wordlist = [re.compile(splitexp%re.escape(w)) for w in word.split(' ')]
+
for line in file.split("\n"):
- line = formatAtfLineHtml(unicodify(line))
- if not line:
- # formatAtf can produce empty lines
- continue
for word in wordlist:
- if line.lower().find(word)>-1:
+ #logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,ignoreable.sub('',line)))
+ if word.search(ignorable.sub('',line)):
+ line = formatAtfLineHtml(line)
ret.append(line)
+ break
+
return ret
+
+
+ def showWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False):
+ """
+ get lines with word from all ids in list FileIds.
+ returns dict with id:lines pairs.
+ """
+ logging.debug("showwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds))
+
+ return dict([(id,self.showWordInFile(id, word, indexName, regExp)) for id in fileIds])
def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False):
"""get text with word highlighted from FileId"""
+ logging.debug("tagwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId))
- file=self.showFile(fileId)
- tagStr=u'<span class="found">%s</span>'
+ file=self.getFile(fileId)
+ tagStart=u'<span class="found">'
+ tagEnd=u'</span>'
+ tagStr=tagStart + u'%%s' + tagEnd
ret=[]
- # search using lowercase
- word = word.lower()
- if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
- wordlist=self.findWordRegExp(indexName,word)
- else:
- # split the search term into words according to the corresponding splitter
- #try:
- wordlist = self.splitter[indexName].process([word])
- #except:
- # wordlist=[word]
+ # add whitespace to splitter bounds expressions and compile into regexp object
+ bounds = self.splitter[indexName].bounds
+ wordsplit = re.compile("(%s|\s)"%bounds)
+ # clean word expression
+ # TODO: this should use QueryParser itself
+ word = word.replace('"','') # take out double quotes
+ # take out ignoreable signs
+ ignorable = self.splitter[indexName].ignorex
+ word = ignorable.sub('', word)
+ # split search terms by blanks
+ words = word.split(' ')
+ # split search terms again (for grapheme search with words)
+ splitwords = dict(((w,self.splitter[indexName].process([w])) for w in words))
for line in file.split("\n"):
- line = formatAtfLineHtml(unicodify(line))
- if not line:
- # formatAtf can produce empty lines
+ line = unicodify(line)
+ # ignore lemma lines
+ if line.lstrip().startswith('#lem:'):
continue
+
+ # first scan
+ hitwords = []
+ for w in words:
+ if ignorable.sub('',line).find(w) > -1:
+ # word is in line
+ # append split word for grapheme search with words
+ hitwords.extend(splitwords[w])
+ #hitwords.extend(wordsplit.split(w))
+
+ # examine hits closer
+ if hitwords:
+ # split line into words
+ parts = wordsplit.split(line)
+ line = ""
+ for p in parts:
+ #logging.debug("tagwordinfile: searching for %s in %s"%(p,hitwords))
+ # reassemble line
+ if ignorable.sub('', p) in hitwords:
+ #logging.debug("tagwordinfile: found %s in %s"%(p,hitwords))
+ # this part was found
+ line += tagStart + formatAtfHtml(p) + tagEnd
+ else:
+ line += formatAtfHtml(p)
+
+ else:
+ # no hits
+ line = formatAtfHtml(line)
- for w in wordlist:
- if line.lower().find(w)>-1:
- #word ist gefunden dann makiere
- line = line.replace(w,tagStr%w)
-
ret.append(line)
return u' \n'.join(ret)
+
+
+ def tagWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False):
+ """
+ get texts with highlighted word from all ids in list FileIds.
+ returns dict with id:text pairs.
+ """
+ logging.debug("tagwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds))
+ return dict([(id,self.tagWordInFile(id, word, indexName, regExp)) for id in fileIds])
+
+
def URLquote(self,str):
"""quote url"""
return urllib.quote(str)