--- cdli/cdli_files.py 2007/10/06 13:44:46 1.80.2.1
+++ cdli/cdli_files.py 2007/10/24 20:36:07 1.80.2.4
@@ -28,6 +28,9 @@ import copy
import codecs
import sys
+import cdliSplitter
+
+
def unicodify(s):
"""decode str (utf-8 or latin-1 representation) into unicode object"""
if not s:
@@ -50,6 +53,22 @@ def utf8ify(s):
else:
return s.encode('utf-8')
+def formatAtfLineHtml(l, nolemma=True):
+    """escape special ATF characters for HTML
+
+    @param l: one line of ATF text; an empty or None value yields ""
+    @param nolemma: if true, lemma lines (lines that start with '#lem:'
+        after optional leading whitespace) are suppressed and "" is returned
+    @return: the line with '&', '<' and '>' replaced by their HTML entities
+    """
+    if not l:
+        return ""
+
+    if nolemma:
+        # ignore lemma lines
+        if l.lstrip().startswith('#lem:'):
+            return ""
+    # replace & first, so that the entities inserted below are not escaped again
+    l = l.replace('&','&amp;')
+    # replace angular brackets
+    l = l.replace('<','&lt;')
+    l = l.replace('>','&gt;')
+    return l
+
def generateXMLReturn(hash):
"""erzeugt das xml file als returnwert fuer uploadATFRPC"""
@@ -912,7 +931,7 @@ class CDLIBasket(Folder,CatalogAware):
def searchInBasket(self,indexName,searchStr,regExp=False):
"""searchInBasket"""
- lst=self.searchInLineIndexDocs(indexName,searchStr,uniq=True,regExp=regExp)
+ lst=self.searchInLineIndexDocs(indexName,searchStr,uniq=True,regExp=regExp) #TODO: fix this
ret={}
lv=self.getLastVersion()
@@ -1778,7 +1797,7 @@ class CDLIFileFolder(extVersionedFileFol
folder_meta_type=['CDLI Folder']
default_catalog='CDLICatalog'
- defaultFileCatalog=default_catalog #wenn dieses definiert ist, wird beim hinzufgen einer neuen version eines files dieser catalog neuiniziert
+ defaultFileCatalog=default_catalog #wenn dieses definiert ist, wird beim hinzufuegen einer neuen version eines files dieser catalog neuindiziert
#downloadCounter=0 # counts how many download for all files currently run, be mehr als 5 wird verweigert.
tmpStore2={}
@@ -1789,20 +1808,12 @@ class CDLIFileFolder(extVersionedFileFol
def delete(self,ids):
- """delete this file, i.e. move into a trash folder"""
-
- found=self.ZopeFind(self,obj_ids=['.trash'])
-
- if len(found)<1:
- manage_addCDLIFileFolder(self, '.trash',title="Trash")
- trash=self._getOb('.trash')
- else:
- trash=found[0][1]
-
+ """delete these files
+
+ The objects are removed from this folder with manage_delObjects;
+ they are not moved into a '.trash' folder.
+
+ @param ids: a single id or a list of ids
+ """
+ # a single id is wrapped into a list
if type(ids) is not ListType:
ids=[ids]
- cut=self.manage_cutObjects(ids)
- trash.manage_pasteObjects(cut)
+
+ self.manage_delObjects(ids)
+
def getVersionNumbersFromIds(self,ids):
"""get the numbers of the current versions of documents described by their ids"""
@@ -1820,31 +1831,32 @@ class CDLIFileFolder(extVersionedFileFol
def getFile(self,fn):
"""get the content of the file fn"""
- founds=self.CDLICatalog.search({'title':fn})
- if not founds:
- return ""
+ logging.debug("getFile: %s"%repr(fn))
+ # try this folder first; the catalog is only asked when hasObject(fn) is false
+ if not self.hasObject(fn):
+ # search deeper: look fn up by title in CDLICatalog
+ founds=self.CDLICatalog.search({'title':fn})
+ if founds:
+ # use the first hit
+ obj=founds[0].getObject().getContentObject()
+ else:
+ # no hit in the catalog either: the content is reported as an empty string
+ return ""
else:
- obj=founds[0].getObject().getContentObject()
+ obj = self[fn].getContentObject()
- return obj.getData()[0:]
+ return obj.getData()[0:]
+
def checkCatalog(self,fn):
"""check if fn is in the catalog"""
#TODO add checkCatalog
-
def findObjectsFromListWithVersion(self,list,author=None):
"""find objects from a list with versions
@param list: list of tuples (cdliFile,version)
"""
-
-
-
#self.REQUEST.SESSION['fileIds']=list#store fieldIds in session for further usage
#self.REQUEST.SESSION['searchList']=self.REQUEST.SESSION['fileIds']
-
pt=getattr(self,'filelistVersioned.html')
return pt(search=list,author=author)
@@ -2036,162 +2048,67 @@ class CDLIRoot(Folder):
"""main folder for cdli"""
meta_type="CDLIRoot"
- downloadCounterBaskets=0# counts the current basket downloads if counter > 10 no downloads are possible
+ downloadCounterBaskets=0 # counts the current basket downloads if counter > 10 no downloads are possible
+
+ file_catalog = 'CDLICatalog'
+
+ # word splitter for search
+ splitter = {'words':cdliSplitter.wordSplitter(),
+ 'graphemes':cdliSplitter.graphemeSplitter()}
+
def deleteFiles(self,ids):
- """delete files (resp. move into .trash folder)"""
- # find or generete trash folder
-
- found=self.ZopeFind(self,obj_ids=['.trash'])
-
- if len(found)<1:
- manage_addCDLIFileFolder(self, '.trash',title="Trash")
- trash=self._getOb('.trash')
- else:
- trash=found[0][1]
-
+ """delete files
+
+ Each id is cut at its first "." and looked up by title in CDLICatalog;
+ the first hit is removed through the delete method of its parent folder.
+ Ids without a catalog entry are skipped.
+
+ @param ids: list of file ids
+ """
for id in ids:
founds=self.CDLICatalog.search({'title':id.split(".")[0]})
if founds:
- logging.info(founds)
+ logging.debug("deleting %s"%founds)
folder=founds[0].getObject().aq_parent #get the parent folder of the object
- logging.info(folder)
- cut=folder.manage_cutObjects([founds[0].getId]) #cut it out
- trash.manage_pasteObjects(cut) #paste it in the trash
+ logging.debug("deleting from %s"%folder)
+ folder.delete([founds[0].getId]) # the result of delete is not needed
- def findWordRegExp(self,indexName,searchTerm):
- """find all words in index which match regexp in SearchTerm
- @param indexName: name of the index to be searched in
- @param searchTerm: word to be searched"""
-
- ret=[]
- for x in self.lineIndexes[indexName].iterkeys():
- if re.match(searchTerm,x):
- ret.append(x)
- return ret
-
- def searchRegExpInLineIndexDocs(self,indexName,searchTerm):
- """search in inLineIndex with regexp
- @param indexName: name of the index to be searched in
- @param searchTerm: term to be searched
- """
- if not searchTerm:
- return []
- ret=[]
- words=self.findWordRegExp(indexName,searchTerm) # suche nach allen Treffern
- logging.info("wd:%s"%words)
- for word in words:
-
- ret+=self.searchInLineIndexDocs(indexName,word)
-
-
- x= unique(ret)
- logging.info("words_done")
- return x
-
- def showInLineIndex(self):
- """get the index for debug purposes"""
- print "show"
- for key in self.lineIndexes.keys():
- logging.info("index:%s"%key)
- for x in self.lineIndexes[key].iterkeys():
- logging.info("word:%s"%repr(x))
- #for y in self.lineIndex[x].iterkeys():
- # print "doc",repr(y),repr(self.lineIndex[x][y])
-
- return self.lineIndexes
-
- def searchInLineIndexDocs(self,indexName,word,uniq=True,regExp=False):
- """search occurences in an index
- @param indexName: name of the index to be searched in
- @param word: word to be searched
- @param unique: (optional) unify the list of results
- @param regExp: (optional) use regular expressions
- """
- if regExp:
- return self.searchRegExpInLineIndexDocs(indexName,word)
-
- try:
-
- lst=list(self.lineIndexes[indexName].get(word).keys())
- except:
- logging.error("error: searchInLineIndexDocs (%s %s)"%(sys.exc_info()[0:2]))
- lst=[]
- if uniq:
- return unique(lst)
- else:
- return lst
-
- def getLinesFromIndex(self,indexName,word,doc,regExp=False):
- """return all lines from a document where word is found
- @param indexName: Name of the index
- @param word: word to be searched
- @param doc: name of the document (usuallay the p-number)
- @param regExp: (optional) use regExp
- """
-
- if not regExp:
- return self.lineIndexes[indexName].get(word)[doc]
- else: # wenn regexp, suche welches word
- for w in self.findWordRegExp(indexName,word):
- if self.lineIndexes[indexName].get(w): # ein word in im dex gefunden
- try:
- dc=self.lineIndex[indexName].get(word)[doc]
- return dc # und ein document dann gib es zurueck
- except:
- pass #andernfalls weiter
-
- def cleanInLineIndex(self,indexName):
- """empty an InlineIndex
- @param indexName: name of the index
- """
- for x in list(self.lineIndexes[indexName].keys()):
- del(self.lineIndexes[indexName][x])
- print [x for x in self.lineIndexes[indexName].keys()]
-
- return "ok"
-
- def storeInLineIndex(self,indexName,key,value):
- """store in index, key is normally a word or grapheme
- and value is a tuple (documentname, line) where the word can be found
- @param indexName: name of the index
- @param key: key in index
- @param value: value in index, value is a tuple (document name, line)
- """
- logging.error("indexing: %s %s"%(indexName,key))
- if (not hasattr(self,'lineIndexes')):
-
- self.lineIndexes={}
-
- if self.lineIndexes.get(indexName,None) is None:
- #index exisitiert noch nicht dann anlegen
-
- self.lineIndexes[indexName]=OOBTree()
- lis=self.lineIndexes
- li=lis[indexName]
-
- if li.has_key(key):
-
-# if li[key].has_key(value[0]) and (not (value[1] in li[key][value[0]])):
- if li[key].has_key(value[0]):
- tmp=li[key][value[0]]
- tmp.append(value[1]) # add it if now in the array
- li[key][value[0]]=tmp[0:]
- else:
- li[key][value[0]]=[value[1]] # new array for lines
-
- else:
-
- li[key]=OOBTree()# new btree for lines
- li[key][value[0]]=[value[1]]
-
-
- self.lineIndexes=lis
-
- transaction.get().commit()
-
+    def searchText(self, query, index='words'):
+        """search the fulltext catalog and return the P-numbers of all hits
+
+        @param query: query handed to the catalog index
+        @param index: (optional) name of the catalog index to search in
+        @return: list with the first seven characters (the P-number) of the
+            id of every hit, in the order the catalog returns them
+        """
+        catalog = getattr(self, self.file_catalog)
+        hits = catalog.search({index: {'query': query}})
+        # keep only the P-number part of each id
+        return [hit.getId[:7] for hit in hits]
+
+ # from Products/PluginIndexes/common/util.py, class parseIndexRequest:
+ #
+ # The class understands the following type of parameters:
+ #
+ # - old-style parameters, where the query for an index is passed as a value
+ # in the request dictionary and the index name is the key.
+ # Additional parameters for an index could be passed as index+"_usage" ...
+ #
+ #
+ # - dictionary-style parameters specify a query for an index as
+ # an entry in the request dictionary where the key corresponds to the
+ # name of the index and the value is a dictionary with the parameters
+ # passed to the index.
+ #
+ # Allowed keys of the parameter dictionary:
+ #
+ # 'query' - contains the query (either string, list or tuple) (required)
+ #
+ # other parameters depend on the index
+ #
+ #
+ # - record-style parameters specify a query for an index as instance of the
+ # Record class. This happens usually when parameters from a web form use
+ # the "record" type e.g. <input type="text" name="path.query:record:string">.
+ # All restrictions of the dictionary-style parameters apply to the record-style
+ # parameters
+
+
def showFile(self,fileId,wholePage=False):
"""show a file
@@ -2202,17 +2119,17 @@ class CDLIRoot(Folder):
return ""
if wholePage:
- logging.info("whole")
+ logging.debug("show whole page")
return f[0].getObject().getContentObject().view()
else:
return f[0].getObject().getLastVersionFormattedData()
- def showWordInFile(self,fileId,word,lineList=None,regExp=True,indexName=""):
- """get lines with word fromFileId"""
+ def showWordInFile(self,fileId,word,lineList=None,regExp=False,indexName=""):
+ """get lines with word from FileId
+
+ @param fileId: id of the file, handed to showFile
+ @param word: word to look for in each line
+ @param lineList: (optional) if given, only lines whose number is in this list are returned
+ @param regExp: (optional) if true, the words to look for are taken from findWordRegExp(indexName,word)
+ @param indexName: name of the index passed to findWordRegExp
+ @return: list of the matching lines
+ """
file=self.showFile(fileId)
- logging.info("regEXP %s"%regExp)
+ logging.debug("show word regEXP %s"%regExp)
ret=[]
if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
+ # NOTE(review): findWordRegExp is deleted from this class by this patch; confirm it is still provided elsewhere
wordlist=self.findWordRegExp(indexName,word)
@@ -2220,54 +2137,53 @@ class CDLIRoot(Folder):
wordlist=[word]
for line in file.split("\n"):
+ line = formatAtfLineHtml(unicodify(line))
found=False
for word in wordlist:
- try: # just a hack because of possible unicode errors in line
- if line.find(word)>-1:
+ try: # guards against unicode errors when word is compared with the line
+ if line.find(word)>-1:
if lineList: #liste of moeglichen Zeilennummern
- num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile
-
- if num in lineList:
-
- ret.append(line)
- else: # nimm alles ohne line check
+ num=line.split(".")[0] # the line number is everything before the first "." in the line
+ if num in lineList:
ret.append(line)
-
- break;
- except:
- pass
+ else: # no lineList given: take the line without checking its number
+ ret.append(line)
+ break
+ except UnicodeError:
+ # best effort: a word that cannot be compared with this line is skipped
+ logging.debug("showWordInFile: cannot compare %s with line"%repr(word))
return ret
+
- def tagWordInFile(self,fileId,word,lineList=None,regExp=True,indexName=""):
- """get lines with word fromFileId"""
+ def tagWordInFile(self,fileId,word,indexName='words',regExp=False):
+ """get text with word highlighted from FileId
+
+ Every line of the file is passed through formatAtfLineHtml; lines that
+ come back empty are dropped. Each occurrence of a search word is wrapped
+ in tagStr. Matching ignores case and the spelling found in the line is kept.
+
+ @param fileId: id of the file, handed to showFile
+ @param word: search term; it is lowercased before use
+ @param indexName: (optional) key into self.splitter that selects how word is split
+ @param regExp: (optional) if true, the words come from findWordRegExp instead of the splitter
+ @return: unicode string of the processed lines, joined with u' \n'
+ """
+ import re # local import: the module's import list is not visible from this hunk
file=self.showFile(fileId)
tagStr=u'%s'
ret=[]
+ # search using lowercase
+ word = word.lower()
if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
+ # NOTE(review): findWordRegExp is deleted from this class by this patch; confirm it is still provided elsewhere
wordlist=self.findWordRegExp(indexName,word)
else:
- wordlist=[word]
+ # split the search term into words according to the corresponding splitter
+ wordlist = self.splitter[indexName].process([word])
+
+ # empty words are dropped because they would match at every position
+ terms = [w for w in wordlist if w]
+ # longer words first, so that a word is not shadowed by one of its own prefixes
+ terms.sort(key=len, reverse=True)
+ # one pattern for all words: a single pass per line never re-scans the
+ # markup that was inserted for an earlier word
+ highlight = re.compile(u'|'.join([re.escape(w) for w in terms]), re.IGNORECASE|re.UNICODE)
+
for line in file.split("\n"):
- line = unicodify(line)
- found=False
- for word in wordlist:
- if line.find(word)>-1: #word ist gefunden dann makiere und breche die Schleife ab
- if lineList: #liste of moeglichen Zeilennummern
- num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile
-
- if num in lineList:
-
- ret.append(line.replace(word,tagStr%word))
-
- else: # nimm alles ohne line check
- ret.append(line.replace(word,tagStr%word))
- found=True
- break
- if not found: #word wurde nicht gefunden keine makierung
- ret.append(line)
+ line = formatAtfLineHtml(unicodify(line))
+ if not line:
+ # formatAtf can produce empty lines
+ continue
+
+ # without a usable search word the line is kept untagged
+ if terms:
+ line = highlight.sub(lambda hit: tagStr%hit.group(0), line)
+
+ ret.append(line)
return u' \n'.join(ret)