--- cdli/cdli_files.py 2007/12/03 21:30:19 1.80.2.10 +++ cdli/cdli_files.py 2008/09/25 12:37:54 1.83 @@ -21,15 +21,16 @@ from ZPublisher.HTTPRequest import HTTPR from ZPublisher.HTTPResponse import HTTPResponse from ZPublisher.BaseRequest import RequestContainer import threading -from BTrees.OOBTree import OOBTree +from BTrees.OOBTree import OOBTree, OOTreeSet import logging import transaction import copy import codecs import sys - +from BTrees.IOBTree import IOBTree import cdliSplitter - +from sets import Set +import md5 def unicodify(s): """decode str (utf-8 or latin-1 representation) into unicode object""" @@ -99,7 +100,7 @@ def formatAtfFullLineNum(txt, nolemma=Tr ret = [] surf = "" col = "" - for line in txt.split("\n"): + for line in txt.splitlines(): line = unicodify(line) if line and line[0] == '@': # surface or column @@ -108,7 +109,7 @@ def formatAtfFullLineNum(txt, nolemma=Tr surf = line.replace(words[0],surfaces[words[0]]).strip() elif words[0] == '@column': - col = words[1] + col = ' '.join(words[1:]) elif line and line[0] in '123456789': # ordinary line -> add line number @@ -144,14 +145,6 @@ def generateXMLReturn(hash): return ret - - - - - - - - def unique(s): """Return a list of the elements in s, but without duplicates. @@ -321,7 +314,7 @@ class uploadATFfinallyThread(Thread): self.result+="

Start processing

" #shall I only upload the changed files? - logging.info("uploadATFfinally procedure: %s"%procedure) + logging.debug("uploadATFfinally procedure: %s"%procedure) if procedure=="uploadchanged": changed=[x[0] for x in SESSION.get('changed',[])] uploadFns=changed+SESSION.get('newPs',[]) @@ -335,48 +328,48 @@ class uploadATFfinallyThread(Thread): #or maybe nothing elif procedure=="noupload": - return True + return True else: uploadFns=[] #do first the changed files i=0 for fn in uploadFns: + logging.debug("uploadATFfinally uploadFn=%s"%fn) i+=1 founds=ctx2.CDLICatalog.search({'title':fn}) if len(founds)>0: SESSION['author']=str(username) self.result="

Changing : %s"%fn+self.result + logging.debug("uploadatffinallythread changing:%s"%fn+self.result) founds[0].getObject().manage_addCDLIFileObject('',comment,SESSION['author'],file=os.path.join(SESSION['tmpdir'],fn),from_tmp=True) - if i==200: - i=0 - transaction.get().commit() - logging.info("changing: do commit") + if i%200==0: + transaction.get().commit() + logging.debug("uploadatffinallythread changing: do commit") transaction.get().commit() - logging.info("changing: last commit") + logging.debug("uploadatffinallythread changing: last commit") #now add the new files newPs=SESSION['newPs'] if len(newPs)>0: tmpDir=SESSION['tmpdir'] - logging.info("adding start") + logging.debug("uploadatffinallythread adding start") self.result="

Adding files

"+self.result #TODO: make this configurable, at the moment base folder for the files has to be cdli_main - ctx2.importFiles(comment=comment,author=str(username) ,folderName=tmpDir, files=newPs,ext=self) - logging.info("adding finished") - + logging.debug("uploadatffinallythread adding finished") #unlock locked files? if unlock: - logging.info("unlocking start") + logging.debug("uploadatffinallythread unlocking start") self.result="

Unlock files

"+self.result unlockFns=[] for x in os.listdir(SESSION['tmpdir']): if not x in SESSION['errors']: unlockFns.append(x) - logging.info("unlocking have now what to unlock") + + logging.debug("unlocking have now what to unlock") for fn in unlockFns: #logging.info("will unlock: %s"%fn) @@ -385,18 +378,18 @@ class uploadATFfinallyThread(Thread): if len(founds)>0: #logging.info("unlock: %s"%founds[0].getObject().getId()) SESSION['author']=str(username) - founds[0].getObject().lockedBy="" - logging.info("unlocking done") + + logging.debug("uploadatffinallythread unlocking done") #if a basketname is given, add files to the basket if not (basketname ==''): - logging.info("add to basket %s"%basketname) + logging.debug("uploadatffinallythread add to basket %s"%basketname) self.result="

Add to basket

"+self.result basketId=ctx2.basketContainer.getBasketIdfromName(basketname) if not basketId: # create new basket - logging.info("create basket %s"%basketname) + logging.debug("uploadatffinallythread create basket %s"%basketname) self.result="

Create a new basket

"+self.result ob=ctx2.basketContainer.addBasket(basketname) basketId=ob.getId() @@ -404,11 +397,11 @@ class uploadATFfinallyThread(Thread): ids=os.listdir(SESSION['tmpdir']) basket.addObjects(ids,deleteOld=True,username=str(username)) + logging.debug("uploadatffinallythread uploadfinally done") + if RESPONSE is not None: RESPONSE.redirect(self.aq_parent.absolute_url()) - - logging.info("uploadfinally done") return True class tmpStore(SimpleItem): @@ -640,7 +633,7 @@ class CDLIBasketContainer(OrderedFolder) ret+=str(object[0].getData())+"\n" elif current=="yes": #search current object - logging.info("crrent: %s"%object[1].getId().split(".")[0]) + logging.debug("current: %s"%object[1].getId().split(".")[0]) founds=self.CDLICatalog.search({'title':object[1].getId().split(".")[0]}) if len(founds)>0: ret+=str(founds[0].getObject().getLastVersion().getData())+"\n" @@ -932,10 +925,13 @@ class CDLIBasketContainer(OrderedFolder) """store it""" if not ids: ids=self.REQUEST.SESSION['fileIds'] - - if type(ids) is not ListType: + + if (type(ids) is not ListType) and (not isinstance(ids,Set)): ids=[ids] + if isinstance(ids,Set): + ids=list(ids) + if (submit.lower()=="store in new basket") or (submit.lower()=="new basket"): basketRet=self.addBasket(newBasketName) self.setActiveBasket(basketRet.getId()) @@ -949,7 +945,7 @@ class CDLIBasketContainer(OrderedFolder) if fromFileList: - return self.cdli_main.findObjectsFromList(list=self.REQUEST.SESSION['fileIds'],basketName=basket.title,numberOfObjects=added) + return self.cdli_main.findObjectsFromList(list=ids,basketName=basket.title,numberOfObjects=added) if RESPONSE: @@ -1185,6 +1181,10 @@ class CDLIBasket(Folder,CatalogAware): def addObjects(self,ids,deleteOld=None,username=None): """generate a new version of the basket with objects added""" + + def swap(x): + return (x[1],x[0]) + logging.info("add to basket (%s)"%(self.getId())) lastVersion=self.getLastVersion() @@ -1196,27 +1196,42 @@ class CDLIBasket(Folder,CatalogAware): if deleteOld: oldContent=[] - newContent=[] added=0 - for id in ids: - try: - founds=self.CDLICatalog.search({'title':id}) - except: - founds=[] - - for found in founds: - if found.getObject() not in oldContent: - #TODO: was passiert wenn, man eine Object dazufŸgt, das schon da ist aber eine neuere version - newContent.append((found.getObject().getLastVersion(),found.getObject())) - added+=1 - - content=oldContent+newContent +# for id in ids: +# logging.debug("adding:"+id) +# try: +# founds=self.CDLICatalog.search({'title':id}) +# except: +# founds=[] +# +# for found in founds: +# if found.getObject() not in oldContent: +# #TODO: was passiert wenn, man eine Object dazufŸgt, das schon da ist aber eine neuere version +# newContent.append((found.getObject().getLastVersion(),found.getObject())) +# added+=1 + + hash = md5.new(repr(self.makelist(ids))).digest() # erzeuge hash als identification + #logging.debug("JJJJJJJ:"+repr(self.makelist(ids))) + logging.debug("JJJJJJJ:"+repr(hash)) + + if hasattr(self.cdliRoot,'_v_tmpStore') and self.cdliRoot._v_tmpStore.has_key(hash): + logging.debug("from store!") + newContent=Set(map(swap,self.cdliRoot._v_tmpStore[hash])) + + else: + logging.debug("not from store!") + newContent=Set([(self.getFileObjectLastVersion(x),self.getFileObject(x)) for x in ids]) + + + content=Set(oldContent).union(newContent) + added = len(content)-len(oldContent) if not username: user=self.getActualUserName() else: user = username - - ob=manage_addCDLIBasketVersion(self,user,comment="",basketContent=content) + + #logging.debug("content:"+repr(list(content))) + ob=manage_addCDLIBasketVersion(self,user,comment="",basketContent=list(content)) logging.info("add to basket (%s) done"%(self.getId())) return added @@ -1400,11 +1415,17 @@ class CDLIBasketVersion(Implicit,Persist for object in self.content.getContent(): - + logging.error("ret:"+repr(object[0])) + logging.error(" -"+repr(procedure)) + logging.error(" -"+repr(object[1].lockedBy)) + if (procedure=="downloadAll") or (object[1].lockedBy=='') or (object[1].lockedBy==self.REQUEST['AUTHENTICATED_USER']): + logging.error("ret1") if current=="no": #version as they are in the basket + logging.error("ret2") ret+=str(object[0].getData())+"\n" elif current=="yes": + logging.error("ret3") #search current object founds=self.CDLICatalog.search({'title':object[1].getId().split(".")[0]}) if len(founds)>0: @@ -1520,6 +1541,12 @@ class CDLIFileObject(CatalogAware,extVer security.declareProtected('manage','index_html') + security.declarePublic('view') + view = PageTemplateFile('zpt/viewCDLIFile.zpt', globals()) + + security.declarePublic('editATF') + editATF = PageTemplateFile('zpt/editATFFile.zpt', globals()) + def PrincipiaSearchSource(self): """Return cataloguable key for ourselves.""" return str(self) @@ -1534,11 +1561,12 @@ class CDLIFileObject(CatalogAware,extVer def makeThisVersionCurrent(self,comment,author,RESPONSE=None): """copy this version to current""" parent=self.aq_parent - newversion=parent.manage_addCDLIFileObject('',comment,author) - newversion.manage_upload(self.getData()) + parent.manage_addVersionedFileObject(id=None,vC=comment,author=author,file=self.getData(),RESPONSE=RESPONSE) + #newversion=parent.manage_addCDLIFileObject('',comment,author) + #newversion.manage_upload(self.getData()) - if RESPONSE is not None: - RESPONSE.redirect(self.aq_parent.absolute_url()+'/history') + #if RESPONSE is not None: + # RESPONSE.redirect(self.aq_parent.absolute_url()+'/history') return True @@ -1548,11 +1576,6 @@ class CDLIFileObject(CatalogAware,extVer # return re.sub("\s\#lem"," #lem",data) #remove return vor #lem return re.sub("#lem"," #lem",data) #remove return vor #lem - security.declarePublic('view') - def view(self): - """view file""" - pt=PageTemplateFile(os.path.join(package_home(globals()),'zpt','viewCDLIFile.zpt')).__of__(self) - return pt() security.declarePublic('getPNumber') def getPNumber(self): @@ -1581,6 +1604,7 @@ class CDLIFileObject(CatalogAware,extVer return txt.group(2) except: return "ERROR" + manage_addCDLIFileObjectForm=DTMLFile('dtml/fileAdd', globals(),Kind='CDLIFileObject',kind='CDLIFileObject', version='1') @@ -1614,10 +1638,10 @@ def manage_addCDLIFileObject(self,id,vC= if content_type: fob.content_type=content_type - logging.debug("manage_add: lastversion=%s"%self.getData()) + #logging.debug("manage_add: lastversion=%s"%self.getData()) logging.debug("reindex1: %s in %s"%(repr(self),repr(self.default_catalog))) self.reindex_object() - logging.debug("manage_add: fob_data=%s"%fob.getData()) + #logging.debug("manage_add: fob_data=%s"%fob.getData()) logging.debug("reindex2: %s in %s"%(repr(fob), repr(fob.default_catalog))) fob.index_object() @@ -1696,6 +1720,7 @@ class CDLIFile(extVersionedFile,CatalogA def _newContentObject(self, id, title='', versionNumber=0, versionComment=None, time=None, author=None): """factory for content objects. to be overridden in derived classes.""" + logging.debug("_newContentObject(CDLI)") return CDLIFileObject(id,title,versionNumber=versionNumber,versionComment=versionComment,time=time,author=author) @@ -1765,6 +1790,7 @@ def manage_addCDLIFile(self,id,title,loc if RESPONSE is not None: RESPONSE.redirect('manage_main') + def checkUTF8(data): """check utf 8""" try: @@ -1795,6 +1821,7 @@ def splitatf(fh,dir=None,ext=None): nf=None i=0 + #ROC: why split \n first and then \r??? if (type(fh) is StringType) or (type(fh) is UnicodeType): iter=fh.split("\n") else: @@ -1856,18 +1883,27 @@ class CDLIFileFolder(extVersionedFileFol #downloadCounter=0 # counts how many download for all files currently run, be mehr als 5 wird verweigert. tmpStore2={} + def _newVersionedFile(self, id, title='', lockedBy=None, author=None): + """factory for versioned files. to be overridden in derived classes.""" + logging.debug("_newVersionedFile(CDLI)") + return CDLIFile(id, title, lockedBy=lockedBy, author=author) + def setTemp(self,name,value): """set tmp""" setattr(self,name,value) + deleteFileForm = PageTemplateFile("zpt/doDeleteFile", globals()) - def delete(self,ids): + def delete(self,ids,REQUEST=None): """delete these files""" if type(ids) is not ListType: ids=[ids] self.manage_delObjects(ids) + + if REQUEST is not None: + return self.index_html() def getVersionNumbersFromIds(self,ids): @@ -1924,10 +1960,17 @@ class CDLIFileFolder(extVersionedFileFol return ret - def findObjectsFromList(self,enterList=None,display=False,start=None,upload=None,list=None,basketName=None,numberOfObjects=None,RESPONSE=None): + def expandFile(self,fileId,fileTree): + """wildcard in fileID suche alle Treffer""" + founds=self.CDLICatalog({'title':fileId}) + for found in founds: + fileTree.add(found.getId) + logging.debug("ADDD:"+found.getId) + + def findObjectsFromList(self,enterList=None,display=False,start=None,upload=None,list=None,basketName=None,numberOfObjects=None,RESPONSE=None,REQUEST=None): """findObjectsFromList (, TAB oder LINE separated)""" - + logging.debug("start: findObjectsFromList") if upload: # list from file upload txt=upload.read() @@ -1959,24 +2002,52 @@ class CDLIFileFolder(extVersionedFileFol return pt(basketName=basketName,numberOfObjects=numberOfObjects) if list is not None: # got already a list + + logging.debug(" ----List version") ret=[] + fileTree=Set() + for fileId in list: - if fileId.find("*"): #check for wildcards - fileId=fileId + + if fileId.find("*")>-1: #check for wildcards + self.expandFile(fileId,fileTree) + elif len(fileId.split("."))==1: fileId=fileId+".atf" + fileTree.add(fileId) + #logging.debug(" -----:"+fileId) + #ret+=self.CDLICatalog({'title':fileId}) + #x =self.getFileObject(fileId) + #if x is not None: + # ret.append(x) - ret+=self.CDLICatalog({'title':fileId}) + + + ids = fileTree & self.v_file_ids + + + hash = md5.new(repr(self.makelist(ids))).digest() # erzeuge hash als identification + #TODO: do I need garbage collection for _v_tmpStore ? + if hasattr(self.cdliRoot,'_v_tmpStore') and self.cdliRoot._v_tmpStore.has_key(hash): + return self.cdliRoot._v_tmpStore[hash] + #TODO: get rid of one of these.. - ids=[x.getObject().getId() for x in ret] + #ids=[x.getObject().getId() for x in ret] + ret=[(self.getFileObject(x),self.getFileObjectLastVersion(x)) for x in ids] self.REQUEST.SESSION['fileIds']=ids#store fieldIds in session for further usage self.REQUEST.SESSION['searchList']=self.REQUEST.SESSION['fileIds'] - + if display: pt=getattr(self,'filelist.html') return pt(search=ids) - else: + else: + #self.REQUEST.SESSION['hash'] = ret # store in session + if not hasattr(self,'_v_tmpStore'): + self.cdliRoot._v_tmpStore={} + #logging.debug("HHHHHHNEU:"+repr(self.makelist(ids))) + logging.debug("HHHHHHNEU:"+repr(hash)) + self.cdliRoot._v_tmpStore[hash] = ret # store in session return ret @@ -1984,7 +2055,11 @@ class CDLIFileFolder(extVersionedFileFol if start: RESPONSE.redirect("filelist.html?start:int="+str(start)) - + def makelist(self,mySet): + x = list(mySet) + x.sort() + return x + security.declareProtected('Manage','createAllFilesAsSingleFile') def createAllFilesAsSingleFile(self,RESPONSE=None): """download all files""" @@ -2112,6 +2187,30 @@ class CDLIRoot(Folder): 'graphemes':cdliSplitter.graphemeSplitter()} + def getFileObject(self,fileId): + x=self.v_files.get(fileId) + #logging.debug(x) + return x + + def getFileObjectLastVersion(self,fileId): + x=self.v_files_lastVersion.get(fileId) + #logging.debug(x) + return x + + def generateFileBTree(self): + """erzeuge einen Btree aus allen Files""" + self.v_files = OOBTree() + self.v_files_lastVersion = OOBTree() + self.v_file_ids = Set() + + for x in self.CDLICatalog.searchResults(): + + self.v_files.update({x.getId:x.getObject()}) + self.v_files_lastVersion.update({x.getId:x.getObject().getLastVersion()}) + self.v_file_ids.add(x.getId) + logging.debug("add:"+x.getId+"XXX"+repr(x.getObject())) + + return "done" def deleteFiles(self,ids): """delete files""" for id in ids: @@ -2150,7 +2249,6 @@ class CDLIRoot(Folder): return f[0].getObject().getData() - def showFile(self,fileId,wholePage=False): """show a file @param fileId: P-Number of the document to be displayed @@ -2178,15 +2276,18 @@ class CDLIRoot(Folder): splitexp = "(%s|\s)(%%s)(%s|\s|\Z)"%(bounds,bounds) # clean word expression # TODO: this should use QueryParser itself - word = word.replace('"','') # take out double quotes - # escape parens for regexp too - # compile into regexp objects + # take out double quotes + word = word.replace('"','') + # take out ignorable signs + ignorable = self.splitter[indexName].ignorex + word = ignorable.sub('', word) + # compile into regexp objects and escape parens wordlist = [re.compile(splitexp%re.escape(w)) for w in word.split(' ')] - for line in file.split("\n"): + for line in file.splitlines(): for word in wordlist: - #logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,line)) - if word.search(line): + #logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,ignoreable.sub('',line))) + if word.search(ignorable.sub('',line)): line = formatAtfLineHtml(line) ret.append(line) break @@ -2220,21 +2321,37 @@ class CDLIRoot(Folder): # clean word expression # TODO: this should use QueryParser itself word = word.replace('"','') # take out double quotes + # take out ignoreable signs + ignorable = self.splitter[indexName].ignorex + word = ignorable.sub('', word) # split search terms by blanks words = word.split(' ') + # split search terms again (for grapheme search with words) + splitwords = dict(((w,self.splitter[indexName].process([w])) for w in words)) - for line in file.split("\n"): + for line in file.splitlines(): line = unicodify(line) - # ignore lemma lines + # ignore lemma and other lines if line.lstrip().startswith('#lem:'): continue + # ignore p-num line + if line.startswith('&P'): + continue + # ignore version lines + if line.startswith('#version'): + continue + # ignore atf type lines + if line.startswith('#atf:'): + continue # first scan hitwords = [] for w in words: - if line.find(w) > -1: + if ignorable.sub('',line).find(w) > -1: # word is in line - hitwords.append(w) + # append split word for grapheme search with words + hitwords.extend(splitwords[w]) + #hitwords.extend(wordsplit.split(w)) # examine hits closer if hitwords: @@ -2242,8 +2359,10 @@ class CDLIRoot(Folder): parts = wordsplit.split(line) line = "" for p in parts: + #logging.debug("tagwordinfile: searching for %s in %s"%(p,hitwords)) # reassemble line - if p in hitwords: + if ignorable.sub('', p) in hitwords: + #logging.debug("tagwordinfile: found %s in %s"%(p,hitwords)) # this part was found line += tagStart + formatAtfHtml(p) + tagEnd else: @@ -2268,6 +2387,15 @@ class CDLIRoot(Folder): return dict([(id,self.tagWordInFile(id, word, indexName, regExp)) for id in fileIds]) + def getFileVersionList(self, pnum): + """get the version history as a list for the translit file with the given pnum""" + f = getattr(self, self.file_catalog).search({'textid':pnum}) + if not f: + return [] + + return f[0].getObject().getVersionList() + + def URLquote(self,str): """quote url""" return urllib.quote(str) @@ -2522,6 +2650,7 @@ class CDLIRoot(Folder): def importFiles(self,comment="",author="" ,folderName="/Users/dwinter/atf", files=None,ext=None): """import files""" + logging.debug("importFiles folderName=%s files=%s ext=%s"%(folderName,files,ext)) root=self.cdli_main count=0 if not files: @@ -2531,17 +2660,20 @@ class CDLIRoot(Folder): folder=f[0:3] f2=f[0:5] obj=self.ZopeFind(root,obj_ids=[folder]) + logging.debug("importFiles: folder=%s f2=%s obj=%s"%(folder,f2,obj)) if ext: - ext.result="

adding: %s

"%f+ext.result + if not obj: manage_addCDLIFileFolder(root,folder,folder) fobj=getattr(root,folder) #transaction.get().commit() + else: fobj=obj[0][1] obj2=fobj.ZopeFind(fobj,obj_ids=[f2]) + logging.debug("importFiles: fobj=%s obj2=%s"%(fobj,obj2)) if not obj2: manage_addCDLIFileFolder(fobj,f2,f2) @@ -2552,22 +2684,15 @@ class CDLIRoot(Folder): file2=os.path.join(folderName,f) id=f - manage_addCDLIFile(fobj2,f,'','') - id=f - ob=fobj2._getOb(f) - ob.title=id - - manage_addCDLIFileObject(ob,id,comment,author,file2,content_type='',from_tmp=True) - self.CDLICatalog.catalog_object(ob) - #self.CDLICatalog.manage_catalogFoundItems(obj_ids=[id],search_sub=1) - #self.CDLICatalog.manage_catalogObject(self.REQUEST, self.REQUEST.RESPONSE, 'CDLICatalog', urlparse.urlparse(ob.absolute_url())[1]) + logging.debug("importFiles: addCDLIFile fobj2=%s, f=%s file2=%s"%(fobj2,repr(f),repr(file2))) + fobj2.addFile(vC='',file=file(file2),author=author,newName=f) count+=1 - if count > 1000: - print "committing" + if count%100==0: + logging.debug("importfiles: committing") transaction.get().commit() - count=0 - transaction.get().commit() + + transaction.get().commit() return "ok"