version 1.78, 2007/04/27 14:42:28
|
version 1.80.2.9, 2007/11/27 10:27:39
|
Line 28 import copy
|
Line 28 import copy
|
import codecs |
import codecs |
import sys |
import sys |
|
|
|
import cdliSplitter |
|
|
|
|
|
def unicodify(s):
    """decode str (utf-8 or latin-1 representation) into unicode object.

    Returns u"" for empty/None input; non-str objects (already unicode)
    are returned unchanged.
    """
    if not s:
        return u""
    if isinstance(s, str):
        try:
            return s.decode('utf-8')
        except UnicodeDecodeError:
            # was a bare except: - narrowed so real errors are not swallowed.
            # latin-1 maps every byte to a code point, so this cannot fail.
            return s.decode('latin-1')
    else:
        return s
|
|
|
def utf8ify(s):
    """encode unicode object or string into byte string in utf-8 representation.

    assumes string objects to be utf-8"""
    # empty/None input yields an empty string
    if not s:
        return ""
    # plain str is assumed to already be utf-8; everything else is encoded
    if isinstance(s, str):
        return s
    return s.encode('utf-8')
|
|
|
def formatAtfHtml(l):
    """escape special ATF characters for HTML.

    Returns "" for empty/None input. The HTML entities in the original
    replacements were destroyed in transit (e.g. replace('&','&') was a
    no-op); restored here per the documented intent.
    """
    if not l:
        return ""

    # replace & first so the entities inserted below are not double-escaped
    l = l.replace('&', '&amp;')
    # replace angular brackets
    l = l.replace('<', '&lt;')
    l = l.replace('>', '&gt;')
    return l
|
|
|
def formatAtfLineHtml(l, nolemma=True):
    """format ATF line for HTML.

    Empty input yields ""; with nolemma (default) lemma lines
    (starting with '#lem:') are suppressed entirely.
    """
    if not l:
        return ""

    # optionally drop lemma annotation lines completely
    if nolemma and l.lstrip().startswith('#lem:'):
        return ""

    return formatAtfHtml(l)
|
|
|
|
|
|
|
def formatAtfFullLineNum(txt, nolemma=True):
    """format full line numbers in ATF text.

    Prefixes every numbered text line with "surface:column:", tracking the
    current surface (@obverse, @reverse, ...) and @column while scanning.
    nolemma is accepted for interface compatibility but not used here.
    """
    # map ATF surface codes to their short names
    surfaces = {'@obverse': 'obv',
                '@reverse': 'rev',
                '@surface': 'surface',
                '@edge': 'edge',
                '@left': 'left',
                '@right': 'right',
                '@top': 'top',
                '@bottom': 'bottom',
                '@face': 'face',
                '@seal': 'seal'}

    if not txt:
        return ""

    out = []
    surf = ""
    col = ""
    for line in txt.split("\n"):
        line = unicodify(line)
        if line and line[0] == '@':
            # surface or column specifier
            words = line.split(' ')
            if words[0] in surfaces:
                surf = line.replace(words[0], surfaces[words[0]]).strip()
            elif words[0] == '@column':
                col = words[1]
        elif line and line[0] in '123456789':
            # ordinary text line -> prepend full position
            line = "%s:%s:%s" % (surf, col, line)
        out.append(line)

    return '\n'.join(out)
|
|
|
|
def generateXMLReturn(hash): |
def generateXMLReturn(hash): |
"""erzeugt das xml file als returnwert fuer uploadATFRPC""" |
"""erzeugt das xml file als returnwert fuer uploadATFRPC""" |
|
|
Line 889 class CDLIBasket(Folder,CatalogAware):
|
Line 980 class CDLIBasket(Folder,CatalogAware):
|
def searchInBasket(self,indexName,searchStr,regExp=False): |
def searchInBasket(self,indexName,searchStr,regExp=False): |
"""searchInBasket""" |
"""searchInBasket""" |
|
|
lst=self.searchInLineIndexDocs(indexName,searchStr,uniq=True,regExp=regExp) |
lst=self.searchInLineIndexDocs(indexName,searchStr,uniq=True,regExp=regExp) #TODO: fix this |
ret={} |
ret={} |
|
|
lv=self.getLastVersion() |
lv=self.getLastVersion() |
Line 1427 class CDLIFileObject(CatalogAware,extVer
|
Line 1518 class CDLIFileObject(CatalogAware,extVer
|
|
|
security=ClassSecurityInfo() |
security=ClassSecurityInfo() |
|
|
|
|
security.declarePublic('makeThisVersionCurrent') |
|
|
|
security.declareProtected('manage','index_html') |
security.declareProtected('manage','index_html') |
|
|
def PrincipiaSearchSource(self): |
def PrincipiaSearchSource(self): |
"""Return cataloguable key for ourselves.""" |
"""Return cataloguable key for ourselves.""" |
return str(self) |
return str(self) |
Line 1440 class CDLIFileObject(CatalogAware,extVer
|
Line 1529 class CDLIFileObject(CatalogAware,extVer
|
|
|
pt=PageTemplateFile(os.path.join(package_home(globals()),'zpt','makeThisVersionCurrent.zpt')).__of__(self) |
pt=PageTemplateFile(os.path.join(package_home(globals()),'zpt','makeThisVersionCurrent.zpt')).__of__(self) |
return pt() |
return pt() |
|
|
|
security.declarePublic('makeThisVersionCurrent') |
def makeThisVersionCurrent(self,comment,author,RESPONSE=None): |
def makeThisVersionCurrent(self,comment,author,RESPONSE=None): |
"""copy this version to current""" |
"""copy this version to current""" |
parent=self.aq_parent |
parent=self.aq_parent |
|
|
|
|
newversion=parent.manage_addCDLIFileObject('',comment,author) |
newversion=parent.manage_addCDLIFileObject('',comment,author) |
newversion.manage_upload(self.getData()) |
newversion.manage_upload(self.getData()) |
|
|
if RESPONSE is not None: |
if RESPONSE is not None: |
RESPONSE.redirect(self.aq_parent.absolute_url()+'/history') |
RESPONSE.redirect(self.aq_parent.absolute_url()+'/history') |
|
|
|
|
return True |
return True |
|
|
security.declarePublic('view') |
|
|
|
def getFormattedData(self): |
def getFormattedData(self): |
"""fromat text""" |
"""fromat text""" |
data=self.getData() |
data=self.getData() |
# return re.sub("\s\#lem"," #lem",data) #remove return vor #lem |
# return re.sub("\s\#lem"," #lem",data) #remove return vor #lem |
return re.sub("#lem"," #lem",data) #remove return vor #lem |
return re.sub("#lem"," #lem",data) #remove return vor #lem |
|
|
|
security.declarePublic('view') |
def view(self): |
def view(self): |
"""view file""" |
"""view file""" |
pt=PageTemplateFile(os.path.join(package_home(globals()),'zpt','viewCDLIFile.zpt')).__of__(self) |
pt=PageTemplateFile(os.path.join(package_home(globals()),'zpt','viewCDLIFile.zpt')).__of__(self) |
Line 1497 class CDLIFileObject(CatalogAware,extVer
|
Line 1584 class CDLIFileObject(CatalogAware,extVer
|
|
|
manage_addCDLIFileObjectForm=DTMLFile('dtml/fileAdd', globals(),Kind='CDLIFileObject',kind='CDLIFileObject', version='1') |
manage_addCDLIFileObjectForm=DTMLFile('dtml/fileAdd', globals(),Kind='CDLIFileObject',kind='CDLIFileObject', version='1') |
|
|
def manage_addCDLIFileObject(self,id,vC='',author='', file='',title='',precondition='', content_type='', |
def manage_addCDLIFileObject(self,id,vC='',author='', file='',title='',versionNumber=0, |
|
precondition='', content_type='', |
from_tmp=False,REQUEST=None): |
from_tmp=False,REQUEST=None): |
"""Add a new File object. |
"""Add a new File object. |
|
|
Creates a new File object 'id' with the contents of 'file'""" |
Creates a new File object 'id' with the contents of 'file'""" |
|
|
id=str(id) |
id=str(id) |
Line 1513 def manage_addCDLIFileObject(self,id,vC=
|
Line 1600 def manage_addCDLIFileObject(self,id,vC=
|
self=self.this() |
self=self.this() |
|
|
# First, we create the file without data: |
# First, we create the file without data: |
self._setObject(id, CDLIFileObject(id,title,'',content_type, precondition)) |
self._setObject(id, CDLIFileObject(id,title,versionNumber=versionNumber,versionComment=vC,time=time.localtime(),author=author)) |
self._getOb(id).versionComment=str(vC) |
fob = self._getOb(id) |
self._getOb(id).time=time.localtime() |
|
|
|
setattr(self._getOb(id),'author',author) |
|
|
|
|
|
# Now we "upload" the data. By doing this in two steps, we |
# Now we "upload" the data. By doing this in two steps, we |
# can use a database trick to make the upload more efficient. |
# can use a database trick to make the upload more efficient. |
|
|
if file and not from_tmp: |
if file and not from_tmp: |
self._getOb(id).manage_upload(file) |
fob.manage_upload(file) |
elif file and from_tmp: |
elif file and from_tmp: |
self._getOb(id).manage_upload_from_tmp(file) |
fob.manage_file_upload(file) # manage_upload_from_tmp doesn't exist in ExtFile2 |
|
# fob.manage_upload_from_tmp(file) # manage_upload_from_tmp doesn't exist in ExtFile2 |
if content_type: |
if content_type: |
self._getOb(id).content_type=content_type |
fob.content_type=content_type |
|
|
|
logging.debug("manage_add: lastversion=%s"%self.getData()) |
|
logging.debug("reindex1: %s in %s"%(repr(self),repr(self.default_catalog))) |
self.reindex_object() |
self.reindex_object() |
self._getOb(id).reindex_object() |
logging.debug("manage_add: fob_data=%s"%fob.getData()) |
|
logging.debug("reindex2: %s in %s"%(repr(fob), repr(fob.default_catalog))) |
|
fob.index_object() |
|
|
if REQUEST is not None: |
if REQUEST is not None: |
REQUEST['RESPONSE'].redirect(self.absolute_url()+'/manage_main') |
REQUEST['RESPONSE'].redirect(self.absolute_url()+'/manage_main') |
|
|
|
|
class CDLIFile(extVersionedFile,CatalogAware): |
class CDLIFile(extVersionedFile,CatalogAware): |
"""CDLI file""" |
"""CDLI file""" |
|
|
security=ClassSecurityInfo() |
security=ClassSecurityInfo() |
meta_type="CDLI file" |
meta_type="CDLI file" |
|
content_meta_type = ["CDLI File Object"] |
|
|
default_catalog='CDLICatalog' |
default_catalog='CDLICatalog' |
|
|
security.declareProtected('manage','index_html') |
security.declareProtected('manage','index_html') |
#security.declarePublic('history') |
|
def getLastVersionData(self): |
def getLastVersionData(self): |
"""get last version data""" |
"""get last version data""" |
return self.getLastVersion().getData() |
return self.getData() |
|
|
def getLastVersionFormattedData(self): |
def getLastVersionFormattedData(self): |
"""get last version data""" |
"""get last version data""" |
return self.getLastVersion().getFormattedData() |
return self.getContentObject().getFormattedData() |
|
|
#security.declarePublic('history') |
|
|
|
|
def getTextId(self): |
|
"""returns P-number of text""" |
|
# assuming that its the beginning of the title |
|
return self.title[:7] |
|
|
|
#security.declarePublic('history') |
def history(self): |
def history(self): |
"""history""" |
"""history""" |
|
|
Line 1599 class CDLIFile(extVersionedFile,CatalogA
|
Line 1694 class CDLIFile(extVersionedFile,CatalogA
|
#return [x.getObject() for x in context.CDLIBasketCatalog.search({'getFileNamesInLastVersion':self.getId()})] |
#return [x.getObject() for x in context.CDLIBasketCatalog.search({'getFileNamesInLastVersion':self.getId()})] |
|
|
|
|
|
def _newContentObject(self, id, title='', versionNumber=0, versionComment=None, time=None, author=None): |
|
"""factory for content objects. to be overridden in derived classes.""" |
|
return CDLIFileObject(id,title,versionNumber=versionNumber,versionComment=versionComment,time=time,author=author) |
|
|
|
|
def addCDLIFileObjectForm(self): |
def addCDLIFileObjectForm(self): |
"""add a new version""" |
"""add a new version""" |
|
|
Line 1624 class CDLIFile(extVersionedFile,CatalogA
|
Line 1724 class CDLIFile(extVersionedFile,CatalogA
|
except: |
except: |
pass |
pass |
|
|
|
ob = self.addContentObject(id, vC, author, file, title, changeName=changeName, newName=newName, from_tmp=from_tmp, |
|
precondition=precondition, content_type=content_type) |
|
|
if changeName=="yes": |
|
filename=file.filename |
|
self.title=filename[max(filename.rfind('/'), |
|
filename.rfind('\\'), |
|
filename.rfind(':'), |
|
)+1:] |
|
|
|
|
|
if not newName=='': |
|
self.title=newName[0:] |
|
|
|
|
|
|
|
|
|
positionVersionNum=getattr(self,'positionVersionNum','front') |
|
|
|
if positionVersionNum=='front': |
|
id="V%i"%self.getVersion()+"_"+self.title |
|
else: |
|
tmp=os.path.splitext(self.title) |
|
if len(tmp)>1: |
|
id=tmp[0]+"_V%i"%self.getVersion()+tmp[1] |
|
else: |
|
id=tmp[0]+"_V%i"%self.getVersion() |
|
|
|
|
|
manage_addCDLIFileObject(self,id,vC,author,file,id,precondition, content_type,from_tmp=from_tmp) |
|
#objs=self.ZopeFind(self,obj_ids=[id])[0][1].setVersionNumber(int(self.getVersion())) |
|
objs=getattr(self,id).setVersionNumber(int(self.getVersion())) |
|
try: |
try: |
#FIXME: wozu ist das gut? |
#FIXME: wozu ist das gut? |
self.REQUEST.SESSION['objID_parent']=self.getId() |
self.REQUEST.SESSION['objID_parent']=self.getId() |
Line 1661 class CDLIFile(extVersionedFile,CatalogA
|
Line 1734 class CDLIFile(extVersionedFile,CatalogA
|
pass |
pass |
|
|
if RESPONSE: |
if RESPONSE: |
|
if ob.getSize()==0: |
obj=self.ZopeFind(self,obj_ids=[id])[0][1] |
self.REQUEST.SESSION['objID']=ob.getId() |
if obj.getSize()==0: |
|
self.REQUEST.SESSION['objID']=obj.getId() |
|
pt=PageTemplateFile(os.path.join(package_home(globals()),'zpt','errorUploadFile')).__of__(self) |
pt=PageTemplateFile(os.path.join(package_home(globals()),'zpt','errorUploadFile')).__of__(self) |
return pt() |
return pt() |
|
|
else: |
else: |
if come_from and (come_from!=""): |
if come_from and (come_from!=""): |
RESPONSE.redirect(come_from+"?change="+self.getId()) |
RESPONSE.redirect(come_from+"?change="+self.getId()) |
else: |
else: |
RESPONSE.redirect(self.REQUEST['URL2']+'?uploaded=%s'%self.title) |
RESPONSE.redirect(self.REQUEST['URL2']+'?uploaded=%s'%self.title) |
|
|
else: |
else: |
return self.ZopeFind(self,obj_ids=[id])[0][1] |
return ob |
|
|
|
|
def manage_addCDLIFileForm(self): |
def manage_addCDLIFileForm(self): |
Line 1690 def manage_addCDLIFile(self,id,title,loc
|
Line 1759 def manage_addCDLIFile(self,id,title,loc
|
tryToggle=True |
tryToggle=True |
tryCount=0 |
tryCount=0 |
|
|
|
|
|
|
self._setObject(id,newObj) |
self._setObject(id,newObj) |
getattr(self,id).reindex_object() |
getattr(self,id).reindex_object() |
|
|
Line 1781 class CDLIFileFolder(extVersionedFileFol
|
Line 1848 class CDLIFileFolder(extVersionedFileFol
|
|
|
security=ClassSecurityInfo() |
security=ClassSecurityInfo() |
meta_type="CDLI Folder" |
meta_type="CDLI Folder" |
filesMetaType=['CDLI file'] |
file_meta_type=['CDLI file'] |
folderMetaType=['CDLI Folder'] |
folder_meta_type=['CDLI Folder'] |
default_catalog='CDLICatalog' |
|
defaultFileCatalog=default_catalog #wenn dieses definiert ist, wird beim hinzufgen einer neuen version eines files dieser catalog neuiniziert |
file_catalog='CDLICatalog' |
|
|
#downloadCounter=0 # counts how many download for all files currently run, be mehr als 5 wird verweigert. |
#downloadCounter=0 # counts how many download for all files currently run, be mehr als 5 wird verweigert. |
tmpStore2={} |
tmpStore2={} |
|
|
def setTemp(self,name,value): |
def setTemp(self,name,value): |
"""set tmp""" |
"""set tmp""" |
|
|
Line 1794 class CDLIFileFolder(extVersionedFileFol
|
Line 1863 class CDLIFileFolder(extVersionedFileFol
|
|
|
|
|
def delete(self,ids): |
def delete(self,ids): |
"""delete this file, i.e. move into a trash folder""" |
"""delete these files""" |
|
|
found=self.ZopeFind(self,obj_ids=['.trash']) |
|
|
|
if len(found)<1: |
|
manage_addCDLIFileFolder(self, '.trash',title="Trash") |
|
trash=self._getOb('.trash') |
|
else: |
|
trash=found[0][1] |
|
|
|
if type(ids) is not ListType: |
if type(ids) is not ListType: |
ids=[ids] |
ids=[ids] |
cut=self.manage_cutObjects(ids) |
|
trash.manage_pasteObjects(cut) |
self.manage_delObjects(ids) |
|
|
|
|
def getVersionNumbersFromIds(self,ids): |
def getVersionNumbersFromIds(self,ids): |
"""get the numbers of the current versions of documents described by their ids""" |
"""get the numbers of the current versions of documents described by their ids""" |
Line 1818 class CDLIFileFolder(extVersionedFileFol
|
Line 1879 class CDLIFileFolder(extVersionedFileFol
|
founds=self.CDLICatalog.search({'title':searchStr}) |
founds=self.CDLICatalog.search({'title':searchStr}) |
|
|
for found in founds: |
for found in founds: |
lastVersion=found.getObject().getLastVersion() |
lastVersion=found.getObject().getContentObject() |
ret.append((found.getId,lastVersion)) |
ret.append((found.getId,lastVersion)) |
|
|
return ret |
return ret |
|
|
def getFile(self,fn): |
def getFile(self,fn): |
"""get the content of the file fn""" |
"""get the content of the file fn""" |
founds=self.CDLICatalog.search({'title':fn}) |
logging.debug("getFile: %s"%repr(fn)) |
if not founds: |
if not self.hasObject(fn): |
|
# search deeper |
|
founds=getattr(self, self.file_catalog).search({'textid':fn}) |
|
if founds: |
|
obj=founds[0].getObject().getContentObject() |
|
else: |
return "" |
return "" |
else: |
else: |
obj=founds[0].getObject().getLastVersion() |
obj = self[fn].getContentObject() |
|
|
return obj.getData()[0:] |
return obj.getData()[0:] |
|
|
|
|
def checkCatalog(self,fn): |
def checkCatalog(self,fn): |
"""check if fn is in the catalog""" |
"""check if fn is in the catalog""" |
#TODO add checkCatalog |
#TODO add checkCatalog |
|
|
|
|
|
|
def findObjectsFromListWithVersion(self,list,author=None): |
def findObjectsFromListWithVersion(self,list,author=None): |
"""find objects from a list with versions |
"""find objects from a list with versions |
@param list: list of tuples (cdliFile,version) |
@param list: list of tuples (cdliFile,version) |
""" |
""" |
|
|
|
|
|
|
#self.REQUEST.SESSION['fileIds']=list#store fieldIds in session for further usage |
#self.REQUEST.SESSION['fileIds']=list#store fieldIds in session for further usage |
#self.REQUEST.SESSION['searchList']=self.REQUEST.SESSION['fileIds'] |
#self.REQUEST.SESSION['searchList']=self.REQUEST.SESSION['fileIds'] |
|
|
|
|
pt=getattr(self,'filelistVersioned.html') |
pt=getattr(self,'filelistVersioned.html') |
|
|
return pt(search=list,author=author) |
return pt(search=list,author=author) |
Line 1930 class CDLIFileFolder(extVersionedFileFol
|
Line 1992 class CDLIFileFolder(extVersionedFileFol
|
def sortF(x,y): |
def sortF(x,y): |
return cmp(x[0],y[0]) |
return cmp(x[0],y[0]) |
|
|
catalog=getattr(self,self.default_catalog) |
catalog=getattr(self,self.file_catalog) |
#tf,tfilename=mkstemp() |
#tf,tfilename=mkstemp() |
if not hasattr(self.temp_folder,'downloadCounter'): |
if not hasattr(self.temp_folder,'downloadCounter'): |
self.temp_folder.downloadCounter=0 |
self.temp_folder.downloadCounter=0 |
Line 1957 class CDLIFileFolder(extVersionedFileFol
|
Line 2019 class CDLIFileFolder(extVersionedFileFol
|
|
|
#os.write(tf,obj.getLastVersion().data) |
#os.write(tf,obj.getLastVersion().data) |
if RESPONSE: |
if RESPONSE: |
RESPONSE.write(obj.getLastVersion().getData()[0:]) |
RESPONSE.write(obj.getData()[0:]) |
RESPONSE.write("\n") |
RESPONSE.write("\n") |
self.temp_folder.downloadCounter-=1 |
self.temp_folder.downloadCounter-=1 |
self._p_changed=1 |
self._p_changed=1 |
Line 1977 class CDLIFileFolder(extVersionedFileFol
|
Line 2039 class CDLIFileFolder(extVersionedFileFol
|
def hasParent(self): |
def hasParent(self): |
"""returns true falls subfolder""" |
"""returns true falls subfolder""" |
|
|
if self.aq_parent.meta_type in self.folderMetaType: |
if self.aq_parent.meta_type in self.folder_meta_type: |
return True |
return True |
else: |
else: |
return False |
return False |
Line 1985 class CDLIFileFolder(extVersionedFileFol
|
Line 2047 class CDLIFileFolder(extVersionedFileFol
|
def getFolders(self): |
def getFolders(self): |
"""get all subfolders""" |
"""get all subfolders""" |
ret=[] |
ret=[] |
folders=self.ZopeFind(self,obj_metatypes=self.folderMetaType) |
folders=self.ZopeFind(self,obj_metatypes=self.folder_meta_type) |
for folder in folders: |
for folder in folders: |
ret.append((folder[1], |
ret.append((folder[1], |
len(self.ZopeFind(folder[1],obj_metatypes=self.folderMetaType)), |
len(self.ZopeFind(folder[1],obj_metatypes=self.folder_meta_type)), |
len(self.ZopeFind(folder[1],obj_metatypes=self.filesMetaType)) |
len(self.ZopeFind(folder[1],obj_metatypes=self.file_meta_type)) |
)) |
)) |
return ret |
return ret |
|
|
Line 2043 class CDLIRoot(Folder):
|
Line 2105 class CDLIRoot(Folder):
|
meta_type="CDLIRoot" |
meta_type="CDLIRoot" |
downloadCounterBaskets=0# counts the current basket downloads if counter > 10 no downloads are possible |
downloadCounterBaskets=0# counts the current basket downloads if counter > 10 no downloads are possible |
|
|
def deleteFiles(self,ids): |
file_catalog = 'CDLICatalog' |
"""delete files (resp. move into .trash folder)""" |
|
# find or generete trash folder |
|
|
|
found=self.ZopeFind(self,obj_ids=['.trash']) |
|
|
|
if len(found)<1: |
# word splitter for search |
manage_addCDLIFileFolder(self, '.trash',title="Trash") |
splitter = {'words':cdliSplitter.wordSplitter(), |
trash=self._getOb('.trash') |
'graphemes':cdliSplitter.graphemeSplitter()} |
else: |
|
logging.info(found) |
|
trash=found[0][1] |
|
|
|
|
|
|
def deleteFiles(self,ids): |
|
"""delete files""" |
for id in ids: |
for id in ids: |
founds=self.CDLICatalog.search({'title':id.split(".")[0]}) |
founds=self.CDLICatalog.search({'title':id.split(".")[0]}) |
if founds: |
if founds: |
logging.info(founds) |
logging.debug("deleting %s"%founds) |
folder=founds[0].getObject().aq_parent #get the parent folder of the object |
folder=founds[0].getObject().aq_parent #get the parent folder of the object |
logging.info(folder) |
logging.debug("deleting from %s"%folder) |
cut=folder.manage_cutObjects([founds[0].getId]) #cut it out |
cut=folder.delete([founds[0].getId]) #cut it out |
trash.manage_pasteObjects(cut) #paste it in the trash |
|
|
|
|
|
def findWordRegExp(self,indexName,searchTerm):
    """find all words in index which match regexp in SearchTerm
    @param indexName: name of the index to be searched in
    @param searchTerm: word to be searched"""
    # collect every key of the index that matches the pattern (at its start)
    hits = []
    for candidate in self.lineIndexes[indexName].iterkeys():
        if re.match(searchTerm, candidate):
            hits.append(candidate)
    return hits
|
|
|
def searchRegExpInLineIndexDocs(self,indexName,searchTerm):
    """search in inLineIndex with regexp
    @param indexName: name of the index to be searched in
    @param searchTerm: term to be searched
    """
    if not searchTerm:
        return []

    # expand the regexp into the list of concrete index words it matches
    matching = self.findWordRegExp(indexName,searchTerm)
    logging.info("wd:%s"%matching)

    # gather the documents of every matching word
    hits=[]
    for w in matching:
        hits+=self.searchInLineIndexDocs(indexName,w)

    result= unique(hits)
    logging.info("words_done")
    return result
|
|
|
def showInLineIndex(self):
    """get the index for debug purposes"""
    sys.stdout.write("show\n")
    # dump every word of every index to the log
    for key in self.lineIndexes.keys():
        logging.info("index:%s"%key)
        for x in self.lineIndexes[key].iterkeys():
            logging.info("word:%s"%repr(x))
        #for y in self.lineIndex[x].iterkeys():
        #    print "doc",repr(y),repr(self.lineIndex[x][y])
    return self.lineIndexes
|
|
|
def searchInLineIndexDocs(self,indexName,word,uniq=True,regExp=False):
    """search occurences in an index
    @param indexName: name of the index to be searched in
    @param word: word to be searched
    @param unique: (optional) unify the list of results
    @param regExp: (optional) use regular expressions
    """
    # regexp searches are delegated entirely
    if regExp:
        return self.searchRegExpInLineIndexDocs(indexName,word)

    try:
        docs=list(self.lineIndexes[indexName].get(word).keys())
    except:
        # deliberate best-effort: missing index/word yields an empty result
        logging.error("error: searchInLineIndexDocs (%s %s)"%(sys.exc_info()[0:2]))
        docs=[]

    if uniq:
        return unique(docs)
    return docs
|
|
|
def getLinesFromIndex(self,indexName,word,doc,regExp=False):
    """return all lines from a document where word is found
    @param indexName: Name of the index
    @param word: word to be searched
    @param doc: name of the document (usuallay the p-number)
    @param regExp: (optional) use regExp
    """
    if not regExp:
        return self.lineIndexes[indexName].get(word)[doc]
    else:
        # with regexp: find which concrete index words match the pattern
        for w in self.findWordRegExp(indexName,word):
            if self.lineIndexes[indexName].get(w): # word found in the index
                try:
                    # BUG FIX: look up the matched word w in lineIndexes
                    # (was self.lineIndex[...].get(word) - nonexistent
                    # attribute and the wrong variable, so the branch
                    # could never return a hit)
                    dc=self.lineIndexes[indexName].get(w)[doc]
                    return dc # document found - return its lines
                except:
                    pass # doc not under this word - try the next match
|
|
|
def cleanInLineIndex(self,indexName):
    """empty an InlineIndex
    @param indexName: name of the index
    """
    # delete every key; iterate over a snapshot since we mutate while looping
    for key in list(self.lineIndexes[indexName].keys()):
        del(self.lineIndexes[indexName][key])

    # debug output: should print an empty list now
    sys.stdout.write("%s\n" % [k for k in self.lineIndexes[indexName].keys()])

    return "ok"
|
|
|
def storeInLineIndex(self,indexName,key,value):
    """store in index, key is normally a word or grapheme
    and value is a tuple (documentname, line) where the word can be found
    @param indexName: name of the index
    @param key: key in index
    @param value: value in index, value is a tuple (document name, line)
    """
    # routine progress message - was wrongly emitted at ERROR level
    logging.debug("indexing: %s %s"%(indexName,key))

    if (not hasattr(self,'lineIndexes')):
        # first use ever: create the container for all indexes
        self.lineIndexes={}

    if self.lineIndexes.get(indexName,None) is None:
        # this index does not exist yet -> create it
        self.lineIndexes[indexName]=OOBTree()

    lis=self.lineIndexes
    li=lis[indexName]

    if key in li:
        if value[0] in li[key]:
            # document already listed: append the line
            tmp=li[key][value[0]]
            tmp.append(value[1])
            # reassign a copy so the persistence machinery notices the change
            li[key][value[0]]=tmp[0:]
        else:
            li[key][value[0]]=[value[1]] # new array for lines
    else:
        li[key]=OOBTree() # new btree mapping document -> lines for this word
        li[key][value[0]]=[value[1]]
|
|
|
|
def searchText(self, query, index='graphemes'):
    """searches query in the fulltext index and returns a list of file ids/P-numbers"""
    # see also: http://www.plope.com/Books/2_7Edition/SearchingZCatalog.stx#2-13
    logging.debug("searchtext for '%s' in index %s"%(query,index))

    # run the fulltext query against the file catalog, sorted by text id
    catalog = getattr(self, self.file_catalog)
    resultset = catalog.search(query_request={index:{'query':query}},sort_index='textid')

    # reduce each hit to its P-number (the first 7 characters of the id)
    return [hit.getId[:7] for hit in resultset]
|
|
|
|
|
def getFile(self, pnum): |
|
"""get the translit file with the given pnum""" |
|
f = getattr(self, self.file_catalog).search({'textid':pnum}) |
|
if not f: |
|
return "" |
|
|
self.lineIndexes=lis |
return f[0].getObject().getData() |
|
|
transaction.get().commit() |
|
|
|
|
|
def showFile(self,fileId,wholePage=False): |
def showFile(self,fileId,wholePage=False): |
"""show a file |
"""show a file |
@param fileId: P-Number of the document to be displayed |
@param fileId: P-Number of the document to be displayed |
""" |
""" |
f=self.CDLICatalog({'title':fileId}) |
f=getattr(self, self.file_catalog).search({'textid':fileId}) |
if not f: |
if not f: |
return "" |
return "" |
|
|
if wholePage: |
if wholePage: |
logging.info("whole") |
logging.debug("show whole page") |
return f[0].getObject().getLastVersion().view() |
return f[0].getObject().getContentObject().view() |
else: |
else: |
return f[0].getObject().getLastVersionFormattedData() |
return f[0].getObject().getLastVersionFormattedData() |
|
|
|
|
def showWordInFile(self,fileId,word,lineList=None,regExp=True,indexName=""): |
def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False,): |
"""get lines with word fromFileId""" |
"""get lines with word fromFileId""" |
|
logging.debug("showwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId)) |
|
|
file=self.showFile(fileId) |
file = formatAtfFullLineNum(self.getFile(fileId)) |
logging.info("regEXP %s"%regExp) |
|
ret=[] |
ret=[] |
if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen |
|
wordlist=self.findWordRegExp(indexName,word) |
# add whitespace before and whitespace and line-end to splitter bounds expressions |
else: |
bounds = self.splitter[indexName].bounds |
wordlist=[word] |
splitexp = "(%s|\s)(%%s)(%s|\s|\Z)"%(bounds,bounds) |
|
# compile into regexp objects |
|
wordlist = [re.compile(splitexp%w) for w in word.split(' ')] |
|
|
for line in file.split("\n"): |
for line in file.split("\n"): |
found=False |
|
for word in wordlist: |
for word in wordlist: |
try: # just a hack because of possible unicode errors in line |
#logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,line)) |
if line.find(word)>-1: |
if word.search(line): |
if lineList: #liste of moeglichen Zeilennummern |
line = formatAtfLineHtml(line) |
num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile |
|
|
|
if num in lineList: |
|
|
|
ret.append(line) |
|
else: # nimm alles ohne line check |
|
ret.append(line) |
ret.append(line) |
|
break |
|
|
break; |
|
except: |
|
pass |
|
return ret |
return ret |
|
|
def tagWordInFile(self,fileId,word,lineList=None,regExp=True,indexName=""): |
|
"""get lines with word fromFileId""" |
|
|
|
file=self.showFile(fileId) |
def showWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False): |
tagStr="""<span class="found">%s</span>""" |
""" |
|
get lines with word from all ids in list FileIds. |
|
returns dict with id:lines pairs. |
|
""" |
|
logging.debug("showwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds)) |
|
|
|
return dict([(id,self.showWordInFile(id, word, indexName, regExp)) for id in fileIds]) |
|
|
|
|
|
def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False): |
|
"""get text with word highlighted from FileId""" |
|
logging.debug("tagwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId)) |
|
|
|
file=self.getFile(fileId) |
|
tagStart=u'<span class="found">' |
|
tagEnd=u'</span>' |
|
tagStr=tagStart + u'%%s' + tagEnd |
ret=[] |
ret=[] |
|
|
if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen |
# add whitespace to splitter bounds expressions and compile into regexp object |
wordlist=self.findWordRegExp(indexName,word) |
bounds = self.splitter[indexName].bounds |
else: |
wordsplit = re.compile("(%s|\s)"%bounds) |
wordlist=[word] |
# split search terms by blanks |
|
words = word.split(' ') |
|
|
for line in file.split("\n"): |
for line in file.split("\n"): |
found=False |
line = unicodify(line) |
for word in wordlist: |
# ignore lemma lines |
if line.find(word)>-1: #word ist gefunden dann makiere und breche die Schleife ab |
if line.lstrip().startswith('#lem:'): |
if lineList: #liste of moeglichen Zeilennummern |
continue |
num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile |
|
|
# first scan |
if num in lineList: |
hitwords = [] |
|
for w in words: |
|
if line.find(w) > -1: |
|
# word is in line |
|
hitwords.append(w) |
|
|
|
# examine hits closer |
|
if hitwords: |
|
# split line into words |
|
parts = wordsplit.split(line) |
|
line = "" |
|
for p in parts: |
|
# reassemble line |
|
if p in hitwords: |
|
# this part was found |
|
line += tagStart + formatAtfHtml(p) + tagEnd |
|
else: |
|
line += formatAtfHtml(p) |
|
|
ret.append(line.replace(word,tagStr%word)) |
else: |
|
# no hits |
|
line = formatAtfHtml(line) |
|
|
else: # nimm alles ohne line check |
|
ret.append(line.replace(word,tagStr%word)) |
|
found=True |
|
break |
|
if not found: #word wurde nicht gefunden keine makierung |
|
ret.append(line) |
ret.append(line) |
|
|
return "<br>\n".join(ret) |
return u'<br>\n'.join(ret) |
|
|
|
|
|
|
|
def tagWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False): |
|
""" |
|
get texts with highlighted word from all ids in list FileIds. |
|
returns dict with id:text pairs. |
|
""" |
|
logging.debug("tagwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds)) |
|
return dict([(id,self.tagWordInFile(id, word, indexName, regExp)) for id in fileIds]) |
|
|
|
|
def URLquote(self,str): |
def URLquote(self,str): |
"""quote url""" |
"""quote url""" |