--- cdli/cdli_files.py 2007/09/03 11:10:04 1.80 +++ cdli/cdli_files.py 2007/10/26 22:45:12 1.80.2.5 @@ -28,6 +28,9 @@ import copy import codecs import sys +import cdliSplitter + + def unicodify(s): """decode str (utf-8 or latin-1 representation) into unicode object""" if not s: @@ -50,6 +53,22 @@ def utf8ify(s): else: return s.encode('utf-8') +def formatAtfLineHtml(l, nolemma=True): + """escape special ATF characters for HTML""" + if not l: + return "" + + if nolemma: + # ignore lemma lines + if l.lstrip().startswith('#lem:'): + return "" + # replace & + l = l.replace('&','&') + # replace angular brackets + l = l.replace('<','<') + l = l.replace('>','>') + return l + def generateXMLReturn(hash): """erzeugt das xml file als returnwert fuer uploadATFRPC""" @@ -912,7 +931,7 @@ class CDLIBasket(Folder,CatalogAware): def searchInBasket(self,indexName,searchStr,regExp=False): """searchInBasket""" - lst=self.searchInLineIndexDocs(indexName,searchStr,uniq=True,regExp=regExp) + lst=self.searchInLineIndexDocs(indexName,searchStr,uniq=True,regExp=regExp) #TODO: fix this ret={} lv=self.getLastVersion() @@ -1450,10 +1469,8 @@ class CDLIFileObject(CatalogAware,extVer security=ClassSecurityInfo() - - security.declarePublic('makeThisVersionCurrent') - security.declareProtected('manage','index_html') + def PrincipiaSearchSource(self): """Return cataloguable key for ourselves.""" return str(self) @@ -1463,28 +1480,26 @@ class CDLIFileObject(CatalogAware,extVer pt=PageTemplateFile(os.path.join(package_home(globals()),'zpt','makeThisVersionCurrent.zpt')).__of__(self) return pt() + + security.declarePublic('makeThisVersionCurrent') def makeThisVersionCurrent(self,comment,author,RESPONSE=None): """copy this version to current""" parent=self.aq_parent - - newversion=parent.manage_addCDLIFileObject('',comment,author) newversion.manage_upload(self.getData()) if RESPONSE is not None: RESPONSE.redirect(self.aq_parent.absolute_url()+'/history') - return True - security.declarePublic('view') - def getFormattedData(self): """fromat text""" data=self.getData() # return re.sub("\s\#lem"," #lem",data) #remove return vor #lem return re.sub("#lem"," #lem",data) #remove return vor #lem + security.declarePublic('view') def view(self): """view file""" pt=PageTemplateFile(os.path.join(package_home(globals()),'zpt','viewCDLIFile.zpt')).__of__(self) @@ -1520,10 +1535,10 @@ class CDLIFileObject(CatalogAware,extVer manage_addCDLIFileObjectForm=DTMLFile('dtml/fileAdd', globals(),Kind='CDLIFileObject',kind='CDLIFileObject', version='1') -def manage_addCDLIFileObject(self,id,vC='',author='', file='',title='',precondition='', content_type='', +def manage_addCDLIFileObject(self,id,vC='',author='', file='',title='',versionNumber=0, + precondition='', content_type='', from_tmp=False,REQUEST=None): """Add a new File object. - Creates a new File object 'id' with the contents of 'file'""" id=str(id) @@ -1536,11 +1551,8 @@ def manage_addCDLIFileObject(self,id,vC= self=self.this() # First, we create the file without data: - self._setObject(id, CDLIFileObject(id,title,'',content_type, precondition)) + self._setObject(id, CDLIFileObject(id,title,versionNumber=versionNumber,versionComment=vC,time=time.localtime(),author=author)) fob = self._getOb(id) - fob.versionComment=str(vC) - fob.time=time.localtime() - setattr(fob,'author',author) # Now we "upload" the data. By doing this in two steps, we # can use a database trick to make the upload more efficient. @@ -1548,14 +1560,17 @@ def manage_addCDLIFileObject(self,id,vC= if file and not from_tmp: fob.manage_upload(file) elif file and from_tmp: - fob.manage_upload_from_tmp(file) + fob.manage_file_upload(file) # manage_upload_from_tmp doesn't exist in ExtFile2 + # fob.manage_upload_from_tmp(file) # manage_upload_from_tmp doesn't exist in ExtFile2 if content_type: fob.content_type=content_type - logging.debug("reindex1: %s"%repr(self)) + logging.debug("manage_add: lastversion=%s"%self.getData()) + logging.debug("reindex1: %s in %s"%(repr(self),repr(self.default_catalog))) self.reindex_object() + logging.debug("manage_add: fob_data=%s"%fob.getData()) logging.debug("reindex2: %s in %s"%(repr(fob), repr(fob.default_catalog))) - fob.reindex_object() + fob.index_object() if REQUEST is not None: REQUEST['RESPONSE'].redirect(self.absolute_url()+'/manage_main') @@ -1566,20 +1581,20 @@ class CDLIFile(extVersionedFile,CatalogA security=ClassSecurityInfo() meta_type="CDLI file" + content_meta_type = ["CDLI File Object"] + default_catalog='CDLICatalog' security.declareProtected('manage','index_html') - #security.declarePublic('history') + def getLastVersionData(self): """get last version data""" - return self.getLastVersion().getData() + return self.getData() def getLastVersionFormattedData(self): """get last version data""" - return self.getLastVersion().getFormattedData() + return self.getContentObject().getFormattedData() #security.declarePublic('history') - - def history(self): """history""" @@ -1624,6 +1639,11 @@ class CDLIFile(extVersionedFile,CatalogA #return [x.getObject() for x in context.CDLIBasketCatalog.search({'getFileNamesInLastVersion':self.getId()})] + def _newContentObject(self, id, title='', versionNumber=0, versionComment=None, time=None, author=None): + """factory for content objects. to be overridden in derived classes.""" + return CDLIFileObject(id,title,versionNumber=versionNumber,versionComment=versionComment,time=time,author=author) + + def addCDLIFileObjectForm(self): """add a new version""" @@ -1649,58 +1669,27 @@ class CDLIFile(extVersionedFile,CatalogA except: pass - - if changeName=="yes": - filename=file.filename - self.title=filename[max(filename.rfind('/'), - filename.rfind('\\'), - filename.rfind(':'), - )+1:] + ob = self.addContentObject(id, vC, author, file, title, changeName=changeName, newName=newName, from_tmp=from_tmp, + precondition=precondition, content_type=content_type) - - if not newName=='': - self.title=newName[0:] - - - - - positionVersionNum=getattr(self,'positionVersionNum','front') - - if positionVersionNum=='front': - id="V%i"%self.getVersion()+"_"+self.title - else: - tmp=os.path.splitext(self.title) - if len(tmp)>1: - id=tmp[0]+"_V%i"%self.getVersion()+tmp[1] - else: - id=tmp[0]+"_V%i"%self.getVersion() - - - manage_addCDLIFileObject(self,id,vC,author,file,id,precondition, content_type,from_tmp=from_tmp) - #objs=self.ZopeFind(self,obj_ids=[id])[0][1].setVersionNumber(int(self.getVersion())) - objs=getattr(self,id).setVersionNumber(int(self.getVersion())) try: - #FIXME: wozu ist das gut? - self.REQUEST.SESSION['objID_parent']=self.getId() + #FIXME: wozu ist das gut? + self.REQUEST.SESSION['objID_parent']=self.getId() except: - pass + pass if RESPONSE: - - obj=self.ZopeFind(self,obj_ids=[id])[0][1] - if obj.getSize()==0: - self.REQUEST.SESSION['objID']=obj.getId() + if ob.getSize()==0: + self.REQUEST.SESSION['objID']=ob.getId() pt=PageTemplateFile(os.path.join(package_home(globals()),'zpt','errorUploadFile')).__of__(self) return pt() - else: if come_from and (come_from!=""): - RESPONSE.redirect(come_from+"?change="+self.getId()) + RESPONSE.redirect(come_from+"?change="+self.getId()) else: RESPONSE.redirect(self.REQUEST['URL2']+'?uploaded=%s'%self.title) - else: - return self.ZopeFind(self,obj_ids=[id])[0][1] + return ob def manage_addCDLIFileForm(self): @@ -1714,8 +1703,6 @@ def manage_addCDLIFile(self,id,title,loc tryToggle=True tryCount=0 - - self._setObject(id,newObj) getattr(self,id).reindex_object() @@ -1806,12 +1793,14 @@ class CDLIFileFolder(extVersionedFileFol security=ClassSecurityInfo() meta_type="CDLI Folder" - filesMetaType=['CDLI file'] - folderMetaType=['CDLI Folder'] + file_meta_type=['CDLI file'] + folder_meta_type=['CDLI Folder'] + default_catalog='CDLICatalog' - defaultFileCatalog=default_catalog #wenn dieses definiert ist, wird beim hinzufŸgen einer neuen version eines files dieser catalog neuiniziert + defaultFileCatalog=default_catalog #wenn dieses definiert ist, wird beim hinzufuegen einer neuen version eines files dieser catalog neuindiziert #downloadCounter=0 # counts how many download for all files currently run, be mehr als 5 wird verweigert. tmpStore2={} + def setTemp(self,name,value): """set tmp""" @@ -1819,21 +1808,13 @@ class CDLIFileFolder(extVersionedFileFol def delete(self,ids): - """delete this file, i.e. move into a trash folder""" - - found=self.ZopeFind(self,obj_ids=['.trash']) - - if len(found)<1: - manage_addCDLIFileFolder(self, '.trash',title="Trash") - trash=self._getOb('.trash') - else: - trash=found[0][1] - + """delete these files""" if type(ids) is not ListType: ids=[ids] - cut=self.manage_cutObjects(ids) - trash.manage_pasteObjects(cut) - + + self.manage_delObjects(ids) + + def getVersionNumbersFromIds(self,ids): """get the numbers of the current versions of documents described by their ids""" @@ -1843,38 +1824,39 @@ class CDLIFileFolder(extVersionedFileFol founds=self.CDLICatalog.search({'title':searchStr}) for found in founds: - lastVersion=found.getObject().getLastVersion() + lastVersion=found.getObject().getContentObject() ret.append((found.getId,lastVersion)) return ret def getFile(self,fn): """get the content of the file fn""" - founds=self.CDLICatalog.search({'title':fn}) - if not founds: - return "" + logging.debug("getFile: %s"%repr(fn)) + if not self.hasObject(fn): + # search deeper + founds=self.CDLICatalog.search({'title':fn}) + if founds: + obj=founds[0].getObject().getContentObject() + else: + return "" else: - obj=founds[0].getObject().getLastVersion() + obj = self[fn].getContentObject() - return obj.getData()[0:] + return obj.getData()[0:] + def checkCatalog(self,fn): """check if fn is in the catalog""" #TODO add checkCatalog - def findObjectsFromListWithVersion(self,list,author=None): """find objects from a list with versions @param list: list of tuples (cdliFile,version) """ - - - #self.REQUEST.SESSION['fileIds']=list#store fieldIds in session for further usage #self.REQUEST.SESSION['searchList']=self.REQUEST.SESSION['fileIds'] - pt=getattr(self,'filelistVersioned.html') return pt(search=list,author=author) @@ -1982,7 +1964,7 @@ class CDLIFileFolder(extVersionedFileFol #os.write(tf,obj.getLastVersion().data) if RESPONSE: - RESPONSE.write(obj.getLastVersion().getData()[0:]) + RESPONSE.write(obj.getData()[0:]) RESPONSE.write("\n") self.temp_folder.downloadCounter-=1 self._p_changed=1 @@ -2002,7 +1984,7 @@ class CDLIFileFolder(extVersionedFileFol def hasParent(self): """returns true falls subfolder""" - if self.aq_parent.meta_type in self.folderMetaType: + if self.aq_parent.meta_type in self.folder_meta_type: return True else: return False @@ -2010,11 +1992,11 @@ class CDLIFileFolder(extVersionedFileFol def getFolders(self): """get all subfolders""" ret=[] - folders=self.ZopeFind(self,obj_metatypes=self.folderMetaType) + folders=self.ZopeFind(self,obj_metatypes=self.folder_meta_type) for folder in folders: ret.append((folder[1], - len(self.ZopeFind(folder[1],obj_metatypes=self.folderMetaType)), - len(self.ZopeFind(folder[1],obj_metatypes=self.filesMetaType)) + len(self.ZopeFind(folder[1],obj_metatypes=self.folder_meta_type)), + len(self.ZopeFind(folder[1],obj_metatypes=self.file_meta_type)) )) return ret @@ -2066,164 +2048,67 @@ class CDLIRoot(Folder): """main folder for cdli""" meta_type="CDLIRoot" - downloadCounterBaskets=0# counts the current basket downloads if counter > 10 no downloads are possible + downloadCounterBaskets=0 # counts the current basket downloads if counter > 10 no downloads are possible + + file_catalog = 'CDLICatalog' + + # word splitter for search + splitter = {'words':cdliSplitter.wordSplitter(), + 'graphemes':cdliSplitter.graphemeSplitter()} + def deleteFiles(self,ids): - """delete files (resp. move into .trash folder)""" - # find or generete trash folder - - found=self.ZopeFind(self,obj_ids=['.trash']) - - if len(found)<1: - manage_addCDLIFileFolder(self, '.trash',title="Trash") - trash=self._getOb('.trash') - else: - logging.info(found) - trash=found[0][1] - - + """delete files""" for id in ids: founds=self.CDLICatalog.search({'title':id.split(".")[0]}) if founds: - logging.info(founds) + logging.debug("deleting %s"%founds) folder=founds[0].getObject().aq_parent #get the parent folder of the object - logging.info(folder) - cut=folder.manage_cutObjects([founds[0].getId]) #cut it out - trash.manage_pasteObjects(cut) #paste it in the trash + logging.debug("deleting from %s"%folder) + cut=folder.delete([founds[0].getId]) #cut it out - def findWordRegExp(self,indexName,searchTerm): - """find all words in index which match regexp in SearchTerm - @param indexName: name of the index to be searched in - @param searchTerm: word to be searched""" - - ret=[] - for x in self.lineIndexes[indexName].iterkeys(): - if re.match(searchTerm,x): - ret.append(x) - return ret - - def searchRegExpInLineIndexDocs(self,indexName,searchTerm): - """search in inLineIndex with regexp - @param indexName: name of the index to be searched in - @param searchTerm: term to be searched - """ - if not searchTerm: - return [] - ret=[] - words=self.findWordRegExp(indexName,searchTerm) # suche nach allen Treffern - logging.info("wd:%s"%words) - for word in words: - - ret+=self.searchInLineIndexDocs(indexName,word) - - - x= unique(ret) - logging.info("words_done") - return x - - def showInLineIndex(self): - """get the index for debug purposes""" - print "show" - for key in self.lineIndexes.keys(): - logging.info("index:%s"%key) - for x in self.lineIndexes[key].iterkeys(): - logging.info("word:%s"%repr(x)) - #for y in self.lineIndex[x].iterkeys(): - # print "doc",repr(y),repr(self.lineIndex[x][y]) - - return self.lineIndexes - - def searchInLineIndexDocs(self,indexName,word,uniq=True,regExp=False): - """search occurences in an index - @param indexName: name of the index to be searched in - @param word: word to be searched - @param unique: (optional) unify the list of results - @param regExp: (optional) use regular expressions - """ - if regExp: - return self.searchRegExpInLineIndexDocs(indexName,word) - - try: - - lst=list(self.lineIndexes[indexName].get(word).keys()) - except: - logging.error("error: searchInLineIndexDocs (%s %s)"%(sys.exc_info()[0:2])) - lst=[] - if uniq: - return unique(lst) - else: - return lst - - def getLinesFromIndex(self,indexName,word,doc,regExp=False): - """return all lines from a document where word is found - @param indexName: Name of the index - @param word: word to be searched - @param doc: name of the document (usuallay the p-number) - @param regExp: (optional) use regExp - """ - - if not regExp: - return self.lineIndexes[indexName].get(word)[doc] - else: # wenn regexp, suche welches word - for w in self.findWordRegExp(indexName,word): - if self.lineIndexes[indexName].get(w): # ein word in im dex gefunden - try: - dc=self.lineIndex[indexName].get(word)[doc] - return dc # und ein document dann gib es zurueck - except: - pass #andernfalls weiter - - def cleanInLineIndex(self,indexName): - """empty an InlineIndex - @param indexName: name of the index - """ - for x in list(self.lineIndexes[indexName].keys()): - del(self.lineIndexes[indexName][x]) - print [x for x in self.lineIndexes[indexName].keys()] - - return "ok" - - def storeInLineIndex(self,indexName,key,value): - """store in index, key is normally a word or grapheme - and value is a tuple (documentname, line) where the word can be found - @param indexName: name of the index - @param key: key in index - @param value: value in index, value is a tuple (document name, line) - """ - logging.error("indexing: %s %s"%(indexName,key)) - if (not hasattr(self,'lineIndexes')): - - self.lineIndexes={} - - if self.lineIndexes.get(indexName,None) is None: - #index exisitiert noch nicht dann anlegen - - self.lineIndexes[indexName]=OOBTree() - lis=self.lineIndexes - li=lis[indexName] - - if li.has_key(key): - -# if li[key].has_key(value[0]) and (not (value[1] in li[key][value[0]])): - if li[key].has_key(value[0]): - tmp=li[key][value[0]] - tmp.append(value[1]) # add it if now in the array - li[key][value[0]]=tmp[0:] - else: - li[key][value[0]]=[value[1]] # new array for lines - - else: - - li[key]=OOBTree()# new btree for lines - li[key][value[0]]=[value[1]] - - - self.lineIndexes=lis - - transaction.get().commit() - + def searchText(self, query, index='words'): + """searches query in the fulltext index and returns a list of file ids/P-numbers""" + idxQuery = {index:{'query':query}} + idx = getattr(self, self.file_catalog) + results = [] + # do search + resultset = idx.search(idxQuery) + for res in resultset: + # put only the P-Number in the result + results.append(res.getId[:7]) + return results + + # from PluginINdexes.common.util.py:parseIndexRequest: + # + # The class understands the following type of parameters: + # + # - old-style parameters where the query for an index as value inside + # the request directory where the index name is the name of the key. + # Additional parameters for an index could be passed as index+"_usage" ... + # + # + # - dictionary-style parameters specify a query for an index as + # an entry in the request dictionary where the key corresponds to the + # name of the index and the key is a dictionary with the parameters + # passed to the index. + # + # Allowed keys of the parameter dictionary: + # + # 'query' - contains the query (either string, list or tuple) (required) + # + # other parameters depend on the the index + # + # + # - record-style parameters specify a query for an index as instance of the + # Record class. This happens usually when parameters from a web form use + # the "record" type e.g. . + # All restrictions of the dictionary-style parameters apply to the record-style + # parameters + + def showFile(self,fileId,wholePage=False): """show a file @@ -2234,72 +2119,69 @@ class CDLIRoot(Folder): return "" if wholePage: - logging.info("whole") - return f[0].getObject().getLastVersion().view() + logging.debug("show whole page") + return f[0].getObject().getContentObject().view() else: return f[0].getObject().getLastVersionFormattedData() - def showWordInFile(self,fileId,word,lineList=None,regExp=True,indexName=""): - """get lines with word fromFileId""" + def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False,): + """get lines with word from FileId""" file=self.showFile(fileId) - logging.info("regEXP %s"%regExp) ret=[] + # search using lowercase + word = word.lower() if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen wordlist=self.findWordRegExp(indexName,word) else: - wordlist=[word] + # split the search term into words according to the corresponding splitter + #try: + wordlist = self.splitter[indexName].process([word]) + #except: + # wordlist=[word] for line in file.split("\n"): - found=False + line = formatAtfLineHtml(unicodify(line)) + if not line: + # formatAtf can produce empty lines + continue for word in wordlist: - try: # just a hack because of possible unicode errors in line - if line.find(word)>-1: - if lineList: #liste of moeglichen Zeilennummern - num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile - - if num in lineList: - - ret.append(line) - else: # nimm alles ohne line check - ret.append(line) - - break; - except: - pass + if line.lower().find(word)>-1: + ret.append(line) return ret + - def tagWordInFile(self,fileId,word,lineList=None,regExp=True,indexName=""): - """get lines with word fromFileId""" + def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False): + """get text with word highlighted from FileId""" file=self.showFile(fileId) tagStr=u'%s' ret=[] + # search using lowercase + word = word.lower() if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen wordlist=self.findWordRegExp(indexName,word) else: - wordlist=[word] + # split the search term into words according to the corresponding splitter + #try: + wordlist = self.splitter[indexName].process([word]) + #except: + # wordlist=[word] for line in file.split("\n"): - line = unicodify(line) - found=False - for word in wordlist: - if line.find(word)>-1: #word ist gefunden dann makiere und breche die Schleife ab - if lineList: #liste of moeglichen Zeilennummern - num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile - - if num in lineList: - - ret.append(line.replace(word,tagStr%word)) - - else: # nimm alles ohne line check - ret.append(line.replace(word,tagStr%word)) - found=True - break - if not found: #word wurde nicht gefunden keine makierung - ret.append(line) + line = formatAtfLineHtml(unicodify(line)) + if not line: + # formatAtf can produce empty lines + continue + + for w in wordlist: + if line.lower().find(w)>-1: + #word ist gefunden dann makiere + line = line.replace(w,tagStr%w) + + ret.append(line) return u'
\n'.join(ret)