version 1.80.2.1, 2007/10/06 13:44:46
|
version 1.80.2.5, 2007/10/26 22:45:12
|
Line 28 import copy
|
Line 28 import copy
|
import codecs |
import codecs |
import sys |
import sys |
|
|
|
import cdliSplitter |
|
|
|
|
def unicodify(s): |
def unicodify(s): |
"""decode str (utf-8 or latin-1 representation) into unicode object""" |
"""decode str (utf-8 or latin-1 representation) into unicode object""" |
if not s: |
if not s: |
Line 50 def utf8ify(s):
|
Line 53 def utf8ify(s):
|
else: |
else: |
return s.encode('utf-8') |
return s.encode('utf-8') |
|
|
|
def formatAtfLineHtml(l, nolemma=True):
    """Escape the HTML-special characters of one ATF line.

    @param l: a single line of ATF text; may be empty or None
    @param nolemma: if true (default), lemma lines, i.e. lines that start
        with '#lem:' after leading whitespace, are suppressed
    @return: the escaped line, or "" for empty input and for suppressed
        lemma lines
    """
    if not l:
        return ""

    if nolemma and l.lstrip().startswith('#lem:'):
        # ignore lemma lines
        return ""

    # The ampersand has to be escaped first; otherwise the ampersands of the
    # entities inserted below would be escaped a second time.
    l = l.replace('&', '&amp;')
    # replace angular brackets
    l = l.replace('<', '&lt;')
    l = l.replace('>', '&gt;')
    return l
|
|
|
|
def generateXMLReturn(hash): |
def generateXMLReturn(hash): |
"""erzeugt das xml file als returnwert fuer uploadATFRPC""" |
"""erzeugt das xml file als returnwert fuer uploadATFRPC""" |
Line 912 class CDLIBasket(Folder,CatalogAware):
|
Line 931 class CDLIBasket(Folder,CatalogAware):
|
def searchInBasket(self,indexName,searchStr,regExp=False): |
def searchInBasket(self,indexName,searchStr,regExp=False): |
"""searchInBasket""" |
"""searchInBasket""" |
|
|
lst=self.searchInLineIndexDocs(indexName,searchStr,uniq=True,regExp=regExp) |
lst=self.searchInLineIndexDocs(indexName,searchStr,uniq=True,regExp=regExp) #TODO: fix this |
ret={} |
ret={} |
|
|
lv=self.getLastVersion() |
lv=self.getLastVersion() |
Line 1778 class CDLIFileFolder(extVersionedFileFol
|
Line 1797 class CDLIFileFolder(extVersionedFileFol
|
folder_meta_type=['CDLI Folder'] |
folder_meta_type=['CDLI Folder'] |
|
|
default_catalog='CDLICatalog' |
default_catalog='CDLICatalog' |
defaultFileCatalog=default_catalog #wenn dieses definiert ist, wird beim hinzufgen einer neuen version eines files dieser catalog neuiniziert |
defaultFileCatalog=default_catalog #wenn dieses definiert ist, wird beim hinzufuegen einer neuen version eines files dieser catalog neuindiziert |
#downloadCounter=0 # counts how many download for all files currently run, be mehr als 5 wird verweigert. |
#downloadCounter=0 # counts how many download for all files currently run, be mehr als 5 wird verweigert. |
tmpStore2={} |
tmpStore2={} |
|
|
Line 1789 class CDLIFileFolder(extVersionedFileFol
|
Line 1808 class CDLIFileFolder(extVersionedFileFol
|
|
|
|
|
def delete(self,ids):
    """Delete the objects with the given ids from this folder.

    The objects are removed by a single call to manage_delObjects.

    @param ids: a single object id or a list of object ids
    """
    # a single id is accepted as a convenience and wrapped into a list
    if not isinstance(ids, list):
        ids=[ids]
    self.manage_delObjects(ids)
|
|
|
|
def getVersionNumbersFromIds(self,ids): |
def getVersionNumbersFromIds(self,ids): |
"""get the numbers of the current versions of documents described by their ids""" |
"""get the numbers of the current versions of documents described by their ids""" |
Line 1820 class CDLIFileFolder(extVersionedFileFol
|
Line 1831 class CDLIFileFolder(extVersionedFileFol
|
|
|
def getFile(self,fn):
    """Return the content of the file fn.

    The file is looked up in this folder first; if it is not there the
    CDLICatalog is searched for an entry whose title is fn.

    @param fn: id (in this folder) or catalog title of the file
    @return: the data of the file's content object, or "" if neither the
        folder nor the catalog knows the file
    """
    logging.debug("getFile: %s"%repr(fn))
    if self.hasObject(fn):
        obj = self[fn].getContentObject()
    else:
        # search deeper
        founds=self.CDLICatalog.search({'title':fn})
        if not founds:
            return ""
        obj=founds[0].getObject().getContentObject()

    return obj.getData()[0:]
|
|
|
|
def checkCatalog(self,fn):
    """Check if fn is in the catalog.

    Not implemented yet: the method has no body besides this docstring and
    therefore always returns None.

    @param fn: name of the file to be checked
    """
    #TODO add checkCatalog
|
|
|
|
|
|
def findObjectsFromListWithVersion(self,list,author=None):
    """Render the versioned file list for the given objects.

    @param list: list of tuples (cdliFile,version)
    @param author: (optional) passed through to the template unchanged
    @return: the result of calling the template 'filelistVersioned.html'
        with search=list and author=author
    """
    # the template id contains a dot, so it has to be fetched with getattr
    pt=getattr(self,'filelistVersioned.html')
    return pt(search=list,author=author)
Line 2038 class CDLIRoot(Folder):
|
Line 2050 class CDLIRoot(Folder):
|
meta_type="CDLIRoot" |
meta_type="CDLIRoot" |
downloadCounterBaskets=0# counts the current basket downloads if counter > 10 no downloads are possible |
downloadCounterBaskets=0# counts the current basket downloads if counter > 10 no downloads are possible |
|
|
# id of the catalog that is used for searching files
file_catalog = 'CDLICatalog'

# word splitter for search
splitter = {'words':cdliSplitter.wordSplitter(),
            'graphemes':cdliSplitter.graphemeSplitter()}


def deleteFiles(self,ids):
    """Delete the files with the given ids.

    For every id the part before the first "." is looked up as title in
    CDLICatalog; the first hit is deleted through the delete method of its
    parent folder. Ids without a catalog hit are skipped silently.

    @param ids: iterable of file ids
    """
    for fileId in ids:
        founds=self.CDLICatalog.search({'title':fileId.split(".")[0]})
        if founds:
            logging.debug("deleting %s"%founds)
            folder=founds[0].getObject().aq_parent #get the parent folder of the object
            logging.debug("deleting from %s"%folder)
            # getId is read as an attribute of the catalog result, not called
            folder.delete([founds[0].getId])
|
|
|
|
|
def findWordRegExp(self,indexName,searchTerm):
    """Find all words in an index which match the regexp in searchTerm.

    The expression is applied with match(), i.e. it is anchored at the
    beginning of each word.

    @param indexName: name of the index to be searched in
    @param searchTerm: regular expression to be searched
    @return: list of matching index words; empty if the index does not exist
    @raise re.error: if searchTerm is not a valid regular expression
    """
    # lineIndexes is only created when something is stored for the first time
    index=getattr(self,'lineIndexes',{}).get(indexName)
    if index is None:
        return []

    # compile once instead of once per index key
    matches=re.compile(searchTerm).match
    return [x for x in index.keys() if matches(x)]
|
|
|
def searchRegExpInLineIndexDocs(self,indexName,searchTerm):
    """Search an index with a regular expression.

    @param indexName: name of the index to be searched in
    @param searchTerm: term (regular expression) to be searched
    @return: [] for an empty term, otherwise the result of unique() applied
        to the combined hits of all index words the term matches
    """
    if not searchTerm:
        return []

    # look up all index words the expression matches
    matchingWords=self.findWordRegExp(indexName,searchTerm)
    logging.info("wd:%s"%matchingWords)

    docs=[]
    for matched in matchingWords:
        docs.extend(self.searchInLineIndexDocs(indexName,matched))

    result=unique(docs)
    logging.info("words_done")
    return result
|
|
|
def showInLineIndex(self):
    """Log the names and words of all line indexes (for debug purposes).

    @return: self.lineIndexes
    """
    print("show")
    indexes=self.lineIndexes
    for indexName in indexes.keys():
        logging.info("index:%s"%indexName)
        for entry in indexes[indexName].keys():
            logging.info("word:%s"%repr(entry))

    return self.lineIndexes
|
|
|
def searchInLineIndexDocs(self,indexName,word,uniq=True,regExp=False):
    """Search the documents in which a word occurs.

    @param indexName: name of the index to be searched in
    @param word: word to be searched
    @param uniq: (optional) pass the list of results through unique()
    @param regExp: (optional) treat word as a regular expression; the search
        is then delegated to searchRegExpInLineIndexDocs
    @return: list of document names; empty if the index or the word is
        unknown
    """
    if regExp:
        return self.searchRegExpInLineIndexDocs(indexName,word)

    # An unknown index or word is an ordinary miss, not an error, so it is
    # checked for explicitly instead of being caught and logged as an error.
    docs=None
    index=getattr(self,'lineIndexes',{}).get(indexName)
    if index is not None:
        docs=index.get(word)

    if docs is None:
        lst=[]
    else:
        lst=list(docs.keys())

    if uniq:
        return unique(lst)
    return lst
|
|
|
def getLinesFromIndex(self,indexName,word,doc,regExp=False):
    """Return all lines from a document where word is found.

    @param indexName: name of the index
    @param word: word to be searched
    @param doc: name of the document (usually the p-number)
    @param regExp: (optional) treat word as a regular expression
    @return: the lines stored for word in doc. With regExp the lines of the
        first matching index word that occurs in doc are returned, or None
        if there is none.
    """
    if not regExp:
        # NOTE: this fails with an exception if word or doc is not in the index
        return self.lineIndexes[indexName].get(word)[doc]

    # with a regexp, try every index word the expression matches
    index=self.lineIndexes[indexName]
    for w in self.findWordRegExp(indexName,word):
        docs=index.get(w)
        if docs: # the word is in the index
            try:
                return docs[doc] # ... and occurs in the document
            except KeyError:
                pass # otherwise try the next word
    return None
|
|
|
def cleanInLineIndex(self,indexName):
    """Remove all entries from a line index.

    @param indexName: name of the index
    @return: the string "ok"
    """
    index=self.lineIndexes[indexName]
    # iterate over a copy of the keys because entries are deleted on the way
    for key in list(index.keys()):
        del index[key]

    # debug output of the keys that are left
    print([remaining for remaining in index.keys()])

    return "ok"
|
|
|
def storeInLineIndex(self,indexName,key,value):
    """Store an occurrence in a line index.

    key is normally a word or grapheme and value is a tuple
    (document name, line) where the word can be found. The index and the
    per-word tree are created on first use; the transaction is committed
    after every call.

    @param indexName: name of the index
    @param key: key in index
    @param value: value in index, value is a tuple (document name, line)
    """
    logging.debug("indexing: %s %s"%(indexName,key))
    if not hasattr(self,'lineIndexes'):
        self.lineIndexes={}

    if self.lineIndexes.get(indexName,None) is None:
        # the index does not exist yet, so create it
        self.lineIndexes[indexName]=OOBTree()

    lis=self.lineIndexes
    li=lis[indexName]
    docName=value[0]
    line=value[1]

    if key not in li:
        li[key]=OOBTree() # new btree for lines

    docs=li[key]
    if docName in docs:
        # store a new list instead of appending in place
        # (presumably so that the change is noticed by the persistence
        # machinery - TODO confirm)
        docs[docName]=docs[docName]+[line]
    else:
        docs[docName]=[line] # new array for lines

    self.lineIndexes=lis

    transaction.get().commit()


def searchText(self, query, index='words'):
    """Search query in the fulltext index.

    @param query: the query passed to the index
    @param index: name of the catalog index to be searched in
    @return: list with the first 7 characters of the id of every hit
        (the P-number)
    """
    # from PluginINdexes.common.util.py:parseIndexRequest:
    #
    #      The class understands the following type of parameters:
    #
    #    - old-style parameters where the query for an index as value inside
    #      the request directory where the index name is the name of the key.
    #      Additional parameters for an index could be passed as index+"_usage" ...
    #
    #
    #    - dictionary-style parameters specify a query for an index as
    #      an entry in the request dictionary where the key corresponds to the
    #      name of the index and the key is a dictionary with the parameters
    #      passed to the index.
    #
    #      Allowed keys of the parameter dictionary:
    #
    #      'query'  - contains the query (either string, list or tuple) (required)
    #
    #      other parameters depend on the the index
    #
    #
    #   - record-style parameters specify a query for an index as instance of the
    #     Record class. This happens usually when parameters from a web form use
    #     the "record" type e.g. <input type="text" name="path.query:record:string">.
    #     All restrictions of the dictionary-style parameters apply to the record-style
    #     parameters
    idxQuery = {index:{'query':query}}
    idx = getattr(self, self.file_catalog)
    # do search
    resultset = idx.search(idxQuery)
    # put only the P-Number in the result
    return [res.getId[:7] for res in resultset]
|
|
|
|
|
def showFile(self,fileId,wholePage=False): |
def showFile(self,fileId,wholePage=False): |
"""show a file |
"""show a file |
Line 2202 class CDLIRoot(Folder):
|
Line 2119 class CDLIRoot(Folder):
|
return "" |
return "" |
|
|
if wholePage: |
if wholePage: |
logging.info("whole") |
logging.debug("show whole page") |
return f[0].getObject().getContentObject().view() |
return f[0].getObject().getContentObject().view() |
else: |
else: |
return f[0].getObject().getLastVersionFormattedData() |
return f[0].getObject().getLastVersionFormattedData() |
|
|
|
|
def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False):
    """Get the lines of a file that contain a word.

    The comparison is done in lowercase. Lines are HTML-escaped with
    formatAtfLineHtml; lines that are empty afterwards are left out.

    @param fileId: id of the file, passed to showFile
    @param word: word (or regular expression) to be searched
    @param indexName: name of the splitter/index that is used to break the
        search term into words
    @param regExp: (optional) treat word as a regular expression and search
        for all index words that match it
    @return: list of the (escaped) lines that contain at least one word
    """
    file=self.showFile(fileId)
    ret=[]

    # search using lowercase
    word = word.lower()
    if regExp:
        # with a regexp, collect all index words that match the expression
        wordlist=self.findWordRegExp(indexName,word)
    else:
        # split the search term into words according to the corresponding splitter
        wordlist = self.splitter[indexName].process([word])

    for line in file.split("\n"):
        line = formatAtfLineHtml(unicodify(line))
        if not line:
            # formatAtf can produce empty lines
            continue

        lowered=line.lower()
        for w in wordlist:
            if lowered.find(w)>-1:
                ret.append(line)
                # one hit is enough, do not add the line a second time
                break

    return ret
|
|
def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False):
    """Get the text of a file with a word highlighted.

    Every occurrence of a search word is wrapped in
    <span class="found">...</span>. Matching ignores case and the matched
    text keeps its original case. Lines are HTML-escaped with
    formatAtfLineHtml; lines that are empty afterwards are left out.

    @param fileId: id of the file, passed to showFile
    @param word: word (or regular expression) to be searched
    @param indexName: name of the splitter/index that is used to break the
        search term into words
    @param regExp: (optional) treat word as a regular expression and tag
        all index words that match it
    @return: the lines of the file joined with '<br>' and a newline
    """
    file=self.showFile(fileId)
    tagStr=u'<span class="found">%s</span>'
    ret=[]

    # search using lowercase
    word = word.lower()
    if regExp:
        # with a regexp, collect all index words that match the expression
        wordlist=self.findWordRegExp(indexName,word)
    else:
        # split the search term into words according to the corresponding splitter
        wordlist = self.splitter[indexName].process([word])

    # All words are combined into one case-insensitive pattern that is
    # applied in a single pass. This way a hit is tagged in whatever case it
    # occurs in the line, and no word can match inside the markup inserted
    # for another word. Longer words come first so they win over prefixes.
    words=[unicodify(w) for w in wordlist if w]
    words.sort(key=len, reverse=True)
    pattern=None
    if words:
        pattern=re.compile(u'|'.join([re.escape(w) for w in words]),
                           re.IGNORECASE|re.UNICODE)

    for line in file.split("\n"):
        line = formatAtfLineHtml(unicodify(line))
        if not line:
            # formatAtf can produce empty lines
            continue

        if pattern is not None:
            # the word was found, so mark it
            line = pattern.sub(lambda hit: tagStr%hit.group(0), line)
        ret.append(line)

    return u'<br>\n'.join(ret)