--- cdli/cdli_files.py	2007/10/22 16:26:40	1.80.2.3
+++ cdli/cdli_files.py	2007/12/03 21:30:19	1.80.2.10
@@ -28,6 +28,9 @@
 import copy
 import codecs
 import sys
+import cdliSplitter
+
+
 def unicodify(s):
     """decode str (utf-8 or latin-1 representation) into unicode object"""
     if not s:
@@ -50,7 +53,72 @@ def utf8ify(s):
     else:
         return s.encode('utf-8')

+def formatAtfHtml(l):
+    """escape special ATF characters for HTML"""
+    if not l:
+        return ""
+
+    # replace &
+    l = l.replace('&','&amp;')
+    # replace angular brackets
+    l = l.replace('<','&lt;')
+    l = l.replace('>','&gt;')
+    return l
+
+def formatAtfLineHtml(l, nolemma=True):
+    """format ATF line for HTML"""
+    if not l:
+        return ""
+
+    if nolemma:
+        # ignore lemma lines
+        if l.lstrip().startswith('#lem:'):
+            return ""
+
+    return formatAtfHtml(l)
+
+
+def formatAtfFullLineNum(txt, nolemma=True):
+    """format full line numbers in ATF text"""
+    # surface codes
+    surfaces = {'@obverse':'obv',
+                '@reverse':'rev',
+                '@surface':'surface',
+                '@edge':'edge',
+                '@left':'left',
+                '@right':'right',
+                '@top':'top',
+                '@bottom':'bottom',
+                '@face':'face',
+                '@seal':'seal'}
+
+    if not txt:
+        return ""
+
+    ret = []
+    surf = ""
+    col = ""
+    for line in txt.split("\n"):
+        line = unicodify(line)
+        if line and line[0] == '@':
+            # surface or column
+            words = line.split(' ')
+            if words[0] in surfaces:
+                surf = line.replace(words[0],surfaces[words[0]]).strip()
+
+            elif words[0] == '@column':
+                col = words[1]
+
+        elif line and line[0] in '123456789':
+            # ordinary line -> add full line number
+            line = "%s:%s:%s"%(surf,col,line)
+
+        ret.append(line)
+
+    return '\n'.join(ret)
+
+
 def generateXMLReturn(hash):
     """generates the XML file as return value for uploadATFRPC"""
@@ -1565,6 +1633,7 @@ class CDLIFile(extVersionedFile,CatalogA
     content_meta_type = ["CDLI File Object"]

     default_catalog='CDLICatalog'

+    security.declareProtected('manage','index_html')

     def getLastVersionData(self):
@@ -1575,6 +1644,11 @@ class CDLIFile(extVersionedFile,CatalogA
         """get last version data"""
         return self.getContentObject().getFormattedData()

+    def getTextId(self):
+        """returns the P-number of the text"""
+        # assuming that it's the beginning of the title
+        return self.title[:7]
+
     #security.declarePublic('history')
     def history(self):
         """history"""
@@ -1776,9 +1850,9 @@ class CDLIFileFolder(extVersionedFileFol
     meta_type="CDLI Folder"
     file_meta_type=['CDLI file']
     folder_meta_type=['CDLI Folder']
-
-    default_catalog='CDLICatalog'
-    defaultFileCatalog=default_catalog # if this is defined, this catalog is reindexed when a new version of a file is added
+
+    file_catalog='CDLICatalog'
+
     #downloadCounter=0 # counts how many downloads for all files are currently running; more than 5 are refused.
     tmpStore2={}
@@ -1812,9 +1886,10 @@ class CDLIFileFolder(extVersionedFileFol
     def getFile(self,fn):
         """get the content of the file fn"""
+        logging.debug("getFile: %s"%repr(fn))
         if not self.hasObject(fn):
             # search deeper
-            founds=self.CDLICatalog.search({'title':fn})
+            founds=getattr(self, self.file_catalog).search({'textid':fn})
             if founds:
                 obj=founds[0].getObject().getContentObject()
             else:
@@ -1917,7 +1992,7 @@ class CDLIFileFolder(extVersionedFileFol
         def sortF(x,y):
             return cmp(x[0],y[0])

-        catalog=getattr(self,self.default_catalog)
+        catalog=getattr(self,self.file_catalog)
         #tf,tfilename=mkstemp()
         if not hasattr(self.temp_folder,'downloadCounter'):
             self.temp_folder.downloadCounter=0
@@ -2030,6 +2105,13 @@ class CDLIRoot(Folder):
     meta_type="CDLIRoot"
     downloadCounterBaskets=0 # counts the current basket downloads; if counter > 10 no downloads are possible

+    file_catalog = 'CDLICatalog'
+
+    # word splitter for search
+    splitter = {'words':cdliSplitter.wordSplitter(),
+                'graphemes':cdliSplitter.graphemeSplitter()}
+
+
     def deleteFiles(self,ids):
         """delete files"""
         for id in ids:
@@ -2042,11 +2124,38 @@ class CDLIRoot(Folder):


+    def searchText(self, query, index='graphemes'):
+        """searches query in the fulltext index and returns a list of file ids/P-numbers"""
+        # see also: http://www.plope.com/Books/2_7Edition/SearchingZCatalog.stx#2-13
+        logging.debug("searchtext for '%s' in index %s"%(query,index))
+        #import Products.ZCTextIndex.QueryParser
+        #qp = QueryParser.QueryParser()
+        #logging.debug()
+        idxQuery = {index:{'query':query}}
+        idx = getattr(self, self.file_catalog)
+        # do search
+        resultset = idx.search(query_request=idxQuery,sort_index='textid')
+        # put only the P-number in the result
+        results = [res.getId[:7] for res in resultset]
+        logging.debug("searchtext: found %d texts"%len(results))
+        return results
+
+
+    def getFile(self, pnum):
+        """get the translit file with the given pnum"""
+        f = getattr(self, self.file_catalog).search({'textid':pnum})
+        if not f:
+            return ""
+
+        return f[0].getObject().getData()
+
+
     def showFile(self,fileId,wholePage=False):
         """show a file
         @param fileId: P-number of the document to be displayed
         """
-        f=self.CDLICatalog({'title':fileId})
+        f=getattr(self, self.file_catalog).search({'textid':fileId})
         if not f:
             return ""
@@ -2057,69 +2166,108 @@ class CDLIRoot(Folder):
         return f[0].getObject().getLastVersionFormattedData()

-    def showWordInFile(self,fileId,word,lineList=None,regExp=False,indexName=""):
+    def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False):
         """get lines with word from FileId"""
+        logging.debug("showwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId))

-        file=self.showFile(fileId)
-        logging.debug("show word regEXP %s"%regExp)
+        file = formatAtfFullLineNum(self.getFile(fileId))
         ret=[]

-        if regExp: # if regexp, generate all words from the list that match the regexp
-            wordlist=self.findWordRegExp(indexName,word)
-        else:
-            wordlist=[word]
+        # add whitespace before, and whitespace and line-end after, the splitter bounds expressions
+        bounds = self.splitter[indexName].bounds
+        splitexp = "(%s|\s)(%%s)(%s|\s|\Z)"%(bounds,bounds)
+        # clean word expression
+        # TODO: this should use QueryParser itself
+        word = word.replace('"','') # take out double quotes
+        # escape parens for regexp too
+        # compile into regexp objects
+        wordlist = [re.compile(splitexp%re.escape(w)) for w in word.split(' ')]
+
         for line in file.split("\n"):
-            found=False
             for word in wordlist:
-                try: # just a hack because of possible unicode errors in line
-                    if line.find(word)>-1:
-                        if lineList: # list of possible line numbers
-                            num=line.split(".")[0] # the line number is everything before the . in the line
-
-                            if num in lineList:
-                                ret.append(line)
-                        else: # take every line without a line-number check
-                            ret.append(line)
-
-                        break;
-                except:
-                    pass
+                #logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,line))
+                if word.search(line):
+                    line = formatAtfLineHtml(line)
+                    ret.append(line)
+                    break
+
         return ret

-    def tagWordInFile(self,fileId,word,lineList=None,regExp=False,indexName=""):
+
+    def showWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False):
+        """
+        get lines with word from all ids in list FileIds.
+        returns dict with id:lines pairs.
+        """
+        logging.debug("showwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds))
+
+        return dict([(id,self.showWordInFile(id, word, indexName, regExp)) for id in fileIds])
+
+
+    def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False):
         """get text with word highlighted from FileId"""
+        logging.debug("tagwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId))

-        file=self.showFile(fileId)
-        tagStr=u'<span class="found">%s</span>'
+        file=self.getFile(fileId)
+        tagStart=u'<span class="found">'
+        tagEnd=u'</span>'
+        tagStr=tagStart + u'%%s' + tagEnd
         ret=[]
-        if regExp: # if regexp, generate all words from the list that match the regexp
-            wordlist=self.findWordRegExp(indexName,word)
-        else:
-            wordlist=[word]
+        # add whitespace to splitter bounds expressions and compile into regexp object
+        bounds = self.splitter[indexName].bounds
+        wordsplit = re.compile("(%s|\s)"%bounds)
+        # clean word expression
+        # TODO: this should use QueryParser itself
+        word = word.replace('"','') # take out double quotes
+        # split search terms by blanks
+        words = word.split(' ')

         for line in file.split("\n"):
             line = unicodify(line)
-            found=False
-            for word in wordlist:
-                if line.find(word)>-1: # word found: mark it and break the loop
-                    if lineList: # list of possible line numbers
-                        num=line.split(".")[0] # the line number is everything before the . in the line
-
-                        if num in lineList:
-                            ret.append(line.replace(word,tagStr%word))
-
-                    else: # take every line without a line-number check
-                        ret.append(line.replace(word,tagStr%word))
-                    found=True
-                    break
-            if not found: # word not found: no marking
-                ret.append(line)
+            # ignore lemma lines
+            if line.lstrip().startswith('#lem:'):
+                continue
+
+            # first scan
+            hitwords = []
+            for w in words:
+                if line.find(w) > -1:
+                    # word is in line
+                    hitwords.append(w)
+
+            # examine hits closer
+            if hitwords:
+                # split line into words
+                parts = wordsplit.split(line)
+                line = ""
+                for p in parts:
+                    # reassemble line
+                    if p in hitwords:
+                        # this part was found
+                        line += tagStart + formatAtfHtml(p) + tagEnd
+                    else:
+                        line += formatAtfHtml(p)
+
+            else:
+                # no hits
+                line = formatAtfHtml(line)
+
+            ret.append(line)

         return u'<br>\n'.join(ret)
+
+
+    def tagWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False):
+        """
+        get texts with highlighted word from all ids in list FileIds.
+        returns dict with id:text pairs.
+        """
+        logging.debug("tagwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds))
+        return dict([(id,self.tagWordInFile(id, word, indexName, regExp)) for id in fileIds])
+

     def URLquote(self,str):
         """quote url"""
         return urllib.quote(str)
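
Note on formatAtfFullLineNum: it prefixes every numbered ATF line with the current surface and column so a hit can be located on the tablet; the @-lines themselves are kept, and col is not reset when a new surface starts. A small illustration (the ATF content is invented), assuming the function above is importable:

atf = "\n".join([
    "@obverse",
    "@column 1",
    "1. lugal-e",
    "@reverse",
    "1. dub-sar",
])
print(formatAtfFullLineNum(atf))
# @obverse
# @column 1
# obv:1:1. lugal-e
# @reverse
# rev:1:1. dub-sar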
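
The new code imports cdliSplitter and reads a regular-expression fragment from splitter[...].bounds, but the module itself is not part of this diff. A minimal sketch of the interface that showWordInFile and tagWordInFile rely on; the class names follow the import above, while the delimiter sets and the process() method are assumptions:

import re

class wordSplitter:
    """splits ATF text into words (whitespace-delimited tokens)"""
    # regexp fragment matching a single token delimiter (assumed)
    bounds = r"[\s]"

    def process(self, texts):
        """return all tokens of a list of texts (ZCTextIndex splitter protocol)"""
        tokens = []
        for text in texts:
            tokens.extend([t for t in re.split(r"%s|\s" % self.bounds, text) if t])
        return tokens

class graphemeSplitter(wordSplitter):
    """splits ATF text into graphemes: signs additionally
    delimited by '-' and '.' inside a word (delimiter set assumed)"""
    bounds = r"[-.]"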
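
showWordInFile turns each search term into a regular expression that requires a splitter delimiter, whitespace, or the end of the text on both sides, so a term only matches a complete token. The same construction demonstrated standalone (the bounds value is an assumed grapheme delimiter set, the ATF line is invented):

import re

bounds = r"[-.]"  # assumed grapheme delimiters
# same construction as splitexp in showWordInFile
splitexp = r"(%s|\s)(%%s)(%s|\s|\Z)" % (bounds, bounds)

term = re.compile(splitexp % re.escape("ba"))
print(bool(term.search("1. ba-za e2-gal")))   # True: 'ba' is a whole grapheme
print(bool(term.search("1. bar-za e2-gal")))  # False: 'ba' is only part of 'bar'

Because the flanking groups consume a character, this pattern is suited to hit-testing with search(); findall() could miss directly adjacent hits, but the patch only tests for the first match per line, so that does not matter here.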
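
Taken together, the new CDLIRoot methods support a search-and-display workflow. A hypothetical call sequence from trusted code inside the Zope instance; cdli stands for a CDLIRoot object with a populated CDLICatalog, and the query term is invented:

# full-text search in the 'graphemes' index, returns P-numbers
pnums = cdli.searchText('lugal', index='graphemes')   # e.g. ['P100001', 'P100002']
# matching lines per text, with full surface:column:line numbers
hits = cdli.showWordInFiles(pnums, 'lugal')
# complete texts as HTML with hits wrapped in <span class="found">
pages = cdli.tagWordInFiles(pnums, 'lugal')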