--- cdli/cdli_files.py	2007/10/19 16:25:06	1.80.2.2
+++ cdli/cdli_files.py	2008/01/18 20:53:49	1.80.2.18
@@ -28,6 +28,9 @@
 import copy
 import codecs
 import sys
+import cdliSplitter
+
+
 def unicodify(s):
     """decode str (utf-8 or latin-1 representation) into unicode object"""
     if not s:
@@ -50,7 +53,72 @@ def utf8ify(s):
     else:
         return s.encode('utf-8')
 
+def formatAtfHtml(l):
+    """escape special ATF characters for HTML"""
+    if not l:
+        return ""
+
+    # replace &
+    l = l.replace('&','&amp;')
+    # replace angular brackets
+    l = l.replace('<','&lt;')
+    l = l.replace('>','&gt;')
+    return l
+
+
+def formatAtfLineHtml(l, nolemma=True):
+    """format ATF line for HTML"""
+    if not l:
+        return ""
+
+    if nolemma:
+        # ignore lemma lines
+        if l.lstrip().startswith('#lem:'):
+            return ""
+
+    return formatAtfHtml(l)
+
+
+
+def formatAtfFullLineNum(txt, nolemma=True):
+    """format full line numbers in ATF text"""
+    # surface codes
+    surfaces = {'@obverse':'obv',
+                '@reverse':'rev',
+                '@surface':'surface',
+                '@edge':'edge',
+                '@left':'left',
+                '@right':'right',
+                '@top':'top',
+                '@bottom':'bottom',
+                '@face':'face',
+                '@seal':'seal'}
+
+    if not txt:
+        return ""
+
+    ret = []
+    surf = ""
+    col = ""
+    for line in txt.splitlines():
+        line = unicodify(line)
+        if line and line[0] == '@':
+            # surface or column
+            words = line.split(' ')
+            if words[0] in surfaces:
+                surf = line.replace(words[0],surfaces[words[0]]).strip()
+
+            elif words[0] == '@column':
+                col = ''.join(words[1:])
+
+        elif line and line[0] in '123456789':
+            # ordinary line -> add line number
+            line = "%s:%s:%s"%(surf,col,line)
+
+        ret.append(line)
+
+    return '\n'.join(ret)
+
+
 def generateXMLReturn(hash):
     """erzeugt das xml file als returnwert fuer uploadATFRPC"""
@@ -76,14 +144,6 @@ def generateXMLReturn(hash):
 
     return ret
 
 
-
-
-
-
-
-
-
-
 def unique(s):
     """Return a list of the elements in s, but without duplicates.
@@ -253,7 +313,7 @@ class uploadATFfinallyThread(Thread):
         self.result+="<p>Start processing</p>"
" #shall I only upload the changed files? - logging.info("uploadATFfinally procedure: %s"%procedure) + logging.debug("uploadATFfinally procedure: %s"%procedure) if procedure=="uploadchanged": changed=[x[0] for x in SESSION.get('changed',[])] uploadFns=changed+SESSION.get('newPs',[]) @@ -267,48 +327,48 @@ class uploadATFfinallyThread(Thread): #or maybe nothing elif procedure=="noupload": - return True + return True else: uploadFns=[] #do first the changed files i=0 for fn in uploadFns: + logging.debug("uploadATFfinally uploadFn=%s"%fn) i+=1 founds=ctx2.CDLICatalog.search({'title':fn}) if len(founds)>0: SESSION['author']=str(username) self.result="

+                logging.debug("uploadatffinallythread changing:%s"%fn+self.result)
                 founds[0].getObject().manage_addCDLIFileObject('',comment,SESSION['author'],file=os.path.join(SESSION['tmpdir'],fn),from_tmp=True)
-                if i==200:
-                    i=0
-                    transaction.get().commit()
-                    logging.info("changing: do commit")
+                if i%200==0:
+                    transaction.get().commit()
+                    logging.debug("uploadatffinallythread changing: do commit")
 
         transaction.get().commit()
-        logging.info("changing: last commit")
+        logging.debug("uploadatffinallythread changing: last commit")
 
         #now add the new files
         newPs=SESSION['newPs']
         if len(newPs)>0:
             tmpDir=SESSION['tmpdir']
-            logging.info("adding start")
+            logging.debug("uploadatffinallythread adding start")
             self.result="<p>Adding files</p>"+self.result
"+self.result #TODO: make this configurable, at the moment base folder for the files has to be cdli_main - ctx2.importFiles(comment=comment,author=str(username) ,folderName=tmpDir, files=newPs,ext=self) - logging.info("adding finished") - + logging.debug("uploadatffinallythread adding finished") #unlock locked files? if unlock: - logging.info("unlocking start") + logging.debug("uploadatffinallythread unlocking start") self.result="

Unlock files

"+self.result unlockFns=[] for x in os.listdir(SESSION['tmpdir']): if not x in SESSION['errors']: unlockFns.append(x) - logging.info("unlocking have now what to unlock") + + logging.debug("unlocking have now what to unlock") for fn in unlockFns: #logging.info("will unlock: %s"%fn) @@ -317,18 +377,18 @@ class uploadATFfinallyThread(Thread): if len(founds)>0: #logging.info("unlock: %s"%founds[0].getObject().getId()) SESSION['author']=str(username) - founds[0].getObject().lockedBy="" - logging.info("unlocking done") + + logging.debug("uploadatffinallythread unlocking done") #if a basketname is given, add files to the basket if not (basketname ==''): - logging.info("add to basket %s"%basketname) + logging.debug("uploadatffinallythread add to basket %s"%basketname) self.result="

Add to basket

"+self.result basketId=ctx2.basketContainer.getBasketIdfromName(basketname) if not basketId: # create new basket - logging.info("create basket %s"%basketname) + logging.debug("uploadatffinallythread create basket %s"%basketname) self.result="

Create a new basket

"+self.result ob=ctx2.basketContainer.addBasket(basketname) basketId=ob.getId() @@ -336,11 +396,11 @@ class uploadATFfinallyThread(Thread): ids=os.listdir(SESSION['tmpdir']) basket.addObjects(ids,deleteOld=True,username=str(username)) + logging.debug("uploadatffinallythread uploadfinally done") + if RESPONSE is not None: RESPONSE.redirect(self.aq_parent.absolute_url()) - - logging.info("uploadfinally done") return True class tmpStore(SimpleItem): @@ -572,7 +632,7 @@ class CDLIBasketContainer(OrderedFolder) ret+=str(object[0].getData())+"\n" elif current=="yes": #search current object - logging.info("crrent: %s"%object[1].getId().split(".")[0]) + logging.debug("current: %s"%object[1].getId().split(".")[0]) founds=self.CDLICatalog.search({'title':object[1].getId().split(".")[0]}) if len(founds)>0: ret+=str(founds[0].getObject().getLastVersion().getData())+"\n" @@ -1452,6 +1512,12 @@ class CDLIFileObject(CatalogAware,extVer security.declareProtected('manage','index_html') + security.declarePublic('view') + view = PageTemplateFile('zpt/viewCDLIFile.zpt', globals()) + + security.declarePublic('editATF') + editATF = PageTemplateFile('zpt/editATFFile.zpt', globals()) + def PrincipiaSearchSource(self): """Return cataloguable key for ourselves.""" return str(self) @@ -1466,11 +1532,12 @@ class CDLIFileObject(CatalogAware,extVer def makeThisVersionCurrent(self,comment,author,RESPONSE=None): """copy this version to current""" parent=self.aq_parent - newversion=parent.manage_addCDLIFileObject('',comment,author) - newversion.manage_upload(self.getData()) + parent.manage_addVersionedFileObject(id=None,vC=comment,author=author,file=self.getData(),RESPONSE=RESPONSE) + #newversion=parent.manage_addCDLIFileObject('',comment,author) + #newversion.manage_upload(self.getData()) - if RESPONSE is not None: - RESPONSE.redirect(self.aq_parent.absolute_url()+'/history') + #if RESPONSE is not None: + # RESPONSE.redirect(self.aq_parent.absolute_url()+'/history') return True @@ -1480,11 +1547,6 @@ class CDLIFileObject(CatalogAware,extVer # return re.sub("\s\#lem"," #lem",data) #remove return vor #lem return re.sub("#lem"," #lem",data) #remove return vor #lem - security.declarePublic('view') - def view(self): - """view file""" - pt=PageTemplateFile(os.path.join(package_home(globals()),'zpt','viewCDLIFile.zpt')).__of__(self) - return pt() security.declarePublic('getPNumber') def getPNumber(self): @@ -1513,6 +1575,7 @@ class CDLIFileObject(CatalogAware,extVer return txt.group(2) except: return "ERROR" + manage_addCDLIFileObjectForm=DTMLFile('dtml/fileAdd', globals(),Kind='CDLIFileObject',kind='CDLIFileObject', version='1') @@ -1546,10 +1609,10 @@ def manage_addCDLIFileObject(self,id,vC= if content_type: fob.content_type=content_type - logging.debug("manage_add: lastversion=%s"%self.getData()) + #logging.debug("manage_add: lastversion=%s"%self.getData()) logging.debug("reindex1: %s in %s"%(repr(self),repr(self.default_catalog))) self.reindex_object() - logging.debug("manage_add: fob_data=%s"%fob.getData()) + #logging.debug("manage_add: fob_data=%s"%fob.getData()) logging.debug("reindex2: %s in %s"%(repr(fob), repr(fob.default_catalog))) fob.index_object() @@ -1565,6 +1628,7 @@ class CDLIFile(extVersionedFile,CatalogA content_meta_type = ["CDLI File Object"] default_catalog='CDLICatalog' + security.declareProtected('manage','index_html') def getLastVersionData(self): @@ -1575,6 +1639,11 @@ class CDLIFile(extVersionedFile,CatalogA """get last version data""" return self.getContentObject().getFormattedData() + def 
+        """returns P-number of text"""
+        # assuming that its the beginning of the title
+        return self.title[:7]
+
     #security.declarePublic('history')
     def history(self):
         """history"""
@@ -1622,6 +1691,7 @@ class CDLIFile(extVersionedFile,CatalogA
 
     def _newContentObject(self, id, title='', versionNumber=0, versionComment=None, time=None, author=None):
         """factory for content objects. to be overridden in derived classes."""
+        logging.debug("_newContentObject(CDLI)")
         return CDLIFileObject(id,title,versionNumber=versionNumber,versionComment=versionComment,time=time,author=author)
 
 
@@ -1691,6 +1761,7 @@ def manage_addCDLIFile(self,id,title,loc
     if RESPONSE is not None:
         RESPONSE.redirect('manage_main')
 
+
 def checkUTF8(data):
     """check utf 8"""
     try:
@@ -1721,6 +1792,7 @@ def splitatf(fh,dir=None,ext=None):
     nf=None
     i=0
 
+    #ROC: why split \n first and then \r???
     if (type(fh) is StringType) or (type(fh) is UnicodeType):
         iter=fh.split("\n")
     else:
@@ -1776,24 +1848,33 @@ class CDLIFileFolder(extVersionedFileFol
     meta_type="CDLI Folder"
     file_meta_type=['CDLI file']
     folder_meta_type=['CDLI Folder']
-
-    default_catalog='CDLICatalog'
-    defaultFileCatalog=default_catalog #wenn dieses definiert ist, wird beim hinzufuegen einer neuen version eines files dieser catalog neuindiziert
+
+    file_catalog='CDLICatalog'
+
     #downloadCounter=0 # counts how many download for all files currently run, be mehr als 5 wird verweigert.
     tmpStore2={}
 
+    def _newVersionedFile(self, id, title='', lockedBy=None, author=None):
+        """factory for versioned files. to be overridden in derived classes."""
+        logging.debug("_newVersionedFile(CDLI)")
+        return CDLIFile(id, title, lockedBy=lockedBy, author=author)
+
     def setTemp(self,name,value):
         """set tmp"""
         setattr(self,name,value)
 
+    deleteFileForm = PageTemplateFile("zpt/doDeleteFile", globals())
 
-    def delete(self,ids):
+    def delete(self,ids,REQUEST=None):
         """delete these files"""
        if type(ids) is not ListType:
            ids=[ids]
 
         self.manage_delObjects(ids)
+
+        if REQUEST is not None:
+            return self.index_html()
 
 
     def getVersionNumbersFromIds(self,ids):
@@ -1812,31 +1893,32 @@ class CDLIFileFolder(extVersionedFileFol
     def getFile(self,fn):
         """get the content of the file fn"""
-        founds=self.CDLICatalog.search({'title':fn})
-        if not founds:
-            return ""
+        logging.debug("getFile: %s"%repr(fn))
+        if not self.hasObject(fn):
+            # search deeper
+            founds=getattr(self, self.file_catalog).search({'textid':fn})
+            if founds:
+                obj=founds[0].getObject().getContentObject()
+            else:
+                return ""
         else:
-            obj=founds[0].getObject().getContentObject()
+            obj = self[fn].getContentObject()
 
-        return obj.getData()[0:]
+        return obj.getData()[0:]
+
 
     def checkCatalog(self,fn):
         """check if fn is in the catalog"""
         #TODO add checkCatalog
 
-
     def findObjectsFromListWithVersion(self,list,author=None):
         """find objects from a list with versions
         @param list: list of tuples (cdliFile,version)
         """
-
-
-
         #self.REQUEST.SESSION['fileIds']=list#store fieldIds in session for further usage
         #self.REQUEST.SESSION['searchList']=self.REQUEST.SESSION['fileIds']
-
         pt=getattr(self,'filelistVersioned.html')
         return pt(search=list,author=author)
@@ -1917,7 +1999,7 @@ class CDLIFileFolder(extVersionedFileFol
         def sortF(x,y):
             return cmp(x[0],y[0])
 
-        catalog=getattr(self,self.default_catalog)
+        catalog=getattr(self,self.file_catalog)
         #tf,tfilename=mkstemp()
         if not hasattr(self.temp_folder,'downloadCounter'):
             self.temp_folder.downloadCounter=0
@@ -2030,6 +2112,13 @@ class CDLIRoot(Folder):
     meta_type="CDLIRoot"
     downloadCounterBaskets=0 # counts the current basket downloads if counter > 10 no downloads are possible
 
+    file_catalog = 'CDLICatalog'
+
+    # word splitter for search
+    splitter = {'words':cdliSplitter.wordSplitter(),
+                'graphemes':cdliSplitter.graphemeSplitter()}
+
+
     def deleteFiles(self,ids):
         """delete files"""
         for id in ids:
@@ -2042,11 +2131,38 @@ class CDLIRoot(Folder):
 
 
 
+    def searchText(self, query, index='graphemes'):
+        """searches query in the fulltext index and returns a list of file ids/P-numbers"""
+        # see also: http://www.plope.com/Books/2_7Edition/SearchingZCatalog.stx#2-13
+        logging.debug("searchtext for '%s' in index %s"%(query,index))
+        #import Products.ZCTextIndex.QueryParser
+        #qp = QueryParser.QueryParser()
+        #logging.debug()
+        idxQuery = {index:{'query':query}}
+        idx = getattr(self, self.file_catalog)
+        # do search
+        resultset = idx.search(query_request=idxQuery,sort_index='textid')
+        # put only the P-Number in the result
+        results = [res.getId[:7] for res in resultset]
+        logging.debug("searchtext: found %d texts"%len(results))
+        return results
+
+
+    def getFile(self, pnum):
+        """get the translit file with the given pnum"""
+        f = getattr(self, self.file_catalog).search({'textid':pnum})
+        if not f:
+            return ""
+
+        return f[0].getObject().getData()
+
+
     def showFile(self,fileId,wholePage=False):
         """show a file
         @param fileId: P-Number of the document to be displayed
         """
-        f=self.CDLICatalog({'title':fileId})
+        f=getattr(self, self.file_catalog).search({'textid':fileId})
         if not f:
             return ""
@@ -2057,69 +2173,129 @@ class CDLIRoot(Folder):
 
         return f[0].getObject().getLastVersionFormattedData()
 
-    def showWordInFile(self,fileId,word,lineList=None,regExp=False,indexName=""):
+    def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False,):
         """get lines with word from FileId"""
+        logging.debug("showwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId))
 
-        file=self.showFile(fileId)
-        logging.debug("show word regEXP %s"%regExp)
+        file = formatAtfFullLineNum(self.getFile(fileId))
         ret=[]
-        if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
-            wordlist=self.findWordRegExp(indexName,word)
-        else:
-            wordlist=[word]
-
-        for line in file.split("\n"):
-            found=False
+
+        # add whitespace before and whitespace and line-end to splitter bounds expressions
+        bounds = self.splitter[indexName].bounds
+        splitexp = "(%s|\s)(%%s)(%s|\s|\Z)"%(bounds,bounds)
+        # clean word expression
+        # TODO: this should use QueryParser itself
+        # take out double quotes
+        word = word.replace('"','')
+        # take out ignorable signs
+        ignorable = self.splitter[indexName].ignorex
+        word = ignorable.sub('', word)
+        # compile into regexp objects and escape parens
+        wordlist = [re.compile(splitexp%re.escape(w)) for w in word.split(' ')]
+
+        for line in file.splitlines():
             for word in wordlist:
-                try: # just a hack because of possible unicode errors in line
-                    if line.find(word)>-1:
-                        if lineList: #liste of moeglichen Zeilennummern
-                            num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile
-
-                            if num in lineList:
-
-                                ret.append(line)
-                        else: # nimm alles ohne line check
-                            ret.append(line)
-
-                        break;
-                except:
-                    pass
+                #logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,ignoreable.sub('',line)))
+                if word.search(ignorable.sub('',line)):
+                    line = formatAtfLineHtml(line)
+                    ret.append(line)
+                    break
+
         return ret
 
+
+    def showWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False):
+        """
+        get lines with word from all ids in list FileIds.
+        returns dict with id:lines pairs.
+        """
+        logging.debug("showwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds))
+
+        return dict([(id,self.showWordInFile(id, word, indexName, regExp)) for id in fileIds])
+
 
-    def tagWordInFile(self,fileId,word,lineList=None,regExp=False,indexName=""):
+    def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False):
         """get text with word highlighted from FileId"""
+        logging.debug("tagwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId))
 
-        file=self.showFile(fileId)
-        tagStr=u'<span class="found">%s</span>'
+        file=self.getFile(fileId)
+        tagStart=u'<span class="found">'
+        tagEnd=u'</span>'
+        tagStr=tagStart + u'%%s' + tagEnd
         ret=[]
-        if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
-            wordlist=self.findWordRegExp(indexName,word)
-        else:
-            wordlist=[word]
 
+        # add whitespace to splitter bounds expressions and compile into regexp object
+        bounds = self.splitter[indexName].bounds
+        wordsplit = re.compile("(%s|\s)"%bounds)
+        # clean word expression
+        # TODO: this should use QueryParser itself
+        word = word.replace('"','') # take out double quotes
+        # take out ignoreable signs
+        ignorable = self.splitter[indexName].ignorex
+        word = ignorable.sub('', word)
+        # split search terms by blanks
+        words = word.split(' ')
+        # split search terms again (for grapheme search with words)
+        splitwords = dict(((w,self.splitter[indexName].process([w])) for w in words))
 
-        for line in file.split("\n"):
+        for line in file.splitlines():
             line = unicodify(line)
-            found=False
 
-            for word in wordlist:
-                if line.find(word)>-1: #word ist gefunden dann makiere und breche die Schleife ab
-                    if lineList: #liste of moeglichen Zeilennummern
-                        num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile
-
-                        if num in lineList:
-
-                            ret.append(line.replace(word,tagStr%word))
-
-                    else: # nimm alles ohne line check
-                        ret.append(line.replace(word,tagStr%word))
-                    found=True
-                    break
-            if not found: #word wurde nicht gefunden keine makierung
-                ret.append(line)
+            # ignore lemma and other lines
+            if line.lstrip().startswith('#lem:'):
+                continue
+            # ignore p-num line
+            if line.startswith('&P'):
+                continue
+            # ignore version lines
+            if line.startswith('#version'):
+                continue
+            # ignore atf type lines
+            if line.startswith('#atf:'):
+                continue
+
+            # first scan
+            hitwords = []
+            for w in words:
+                if ignorable.sub('',line).find(w) > -1:
+                    # word is in line
+                    # append split word for grapheme search with words
+                    hitwords.extend(splitwords[w])
+                    #hitwords.extend(wordsplit.split(w))
+
+            # examine hits closer
+            if hitwords:
+                # split line into words
+                parts = wordsplit.split(line)
+                line = ""
+                for p in parts:
+                    #logging.debug("tagwordinfile: searching for %s in %s"%(p,hitwords))
+                    # reassemble line
+                    if ignorable.sub('', p) in hitwords:
+                        #logging.debug("tagwordinfile: found %s in %s"%(p,hitwords))
+                        # this part was found
+                        line += tagStart + formatAtfHtml(p) + tagEnd
+                    else:
+                        line += formatAtfHtml(p)
+
+            else:
+                # no hits
+                line = formatAtfHtml(line)
+
+            ret.append(line)
 
         return u'<br>\n'.join(ret)
+
+
+    def tagWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False):
+        """
+        get texts with highlighted word from all ids in list FileIds.
+        returns dict with id:text pairs.
+        """
+        logging.debug("tagwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds))
+        return dict([(id,self.tagWordInFile(id, word, indexName, regExp)) for id in fileIds])
+
+
     def URLquote(self,str):
         """quote url"""
         return urllib.quote(str)
@@ -2374,6 +2550,7 @@ class CDLIRoot(Folder):
 
     def importFiles(self,comment="",author="" ,folderName="/Users/dwinter/atf", files=None,ext=None):
         """import files"""
+        logging.debug("importFiles folderName=%s files=%s ext=%s"%(folderName,files,ext))
         root=self.cdli_main
         count=0
         if not files:
@@ -2383,17 +2560,20 @@ class CDLIRoot(Folder):
             folder=f[0:3]
             f2=f[0:5]
             obj=self.ZopeFind(root,obj_ids=[folder])
+            logging.debug("importFiles: folder=%s f2=%s obj=%s"%(folder,f2,obj))
             if ext:
-                ext.result="<p>adding: %s</p>"%f+ext.result
"%f+ext.result + if not obj: manage_addCDLIFileFolder(root,folder,folder) fobj=getattr(root,folder) #transaction.get().commit() + else: fobj=obj[0][1] obj2=fobj.ZopeFind(fobj,obj_ids=[f2]) + logging.debug("importFiles: fobj=%s obj2=%s"%(fobj,obj2)) if not obj2: manage_addCDLIFileFolder(fobj,f2,f2) @@ -2404,22 +2584,15 @@ class CDLIRoot(Folder): file2=os.path.join(folderName,f) id=f - manage_addCDLIFile(fobj2,f,'','') - id=f - ob=fobj2._getOb(f) - ob.title=id - - manage_addCDLIFileObject(ob,id,comment,author,file2,content_type='',from_tmp=True) - self.CDLICatalog.catalog_object(ob) - #self.CDLICatalog.manage_catalogFoundItems(obj_ids=[id],search_sub=1) - #self.CDLICatalog.manage_catalogObject(self.REQUEST, self.REQUEST.RESPONSE, 'CDLICatalog', urlparse.urlparse(ob.absolute_url())[1]) + logging.debug("importFiles: addCDLIFile fobj2=%s, f=%s file2=%s"%(fobj2,repr(f),repr(file2))) + fobj2.addFile(vC='',file=file(file2),author=author,newName=f) count+=1 - if count > 1000: - print "committing" + if count%100==0: + logging.debug("importfiles: committing") transaction.get().commit() - count=0 - transaction.get().commit() + + transaction.get().commit() return "ok"