def formatAtfLineHtml(l, nolemma=True):
    """Escape the HTML-special characters of a single ATF line.

    @param l: one line of ATF text; None or an empty string yields "".
    @param nolemma: when true (the default), lemma lines -- lines whose
        first non-blank characters are '#lem:' -- are dropped and ""
        is returned for them.
    @return: the line with '&', '<' and '>' replaced by HTML entities.
    """
    if not l:
        return ""

    if nolemma and l.lstrip().startswith('#lem:'):
        # lemma lines are ignored
        return ""

    # '&' has to be replaced first, otherwise the '&' of the entities
    # produced for the angular brackets would be escaped a second time
    l = l.replace('&', '&amp;')
    l = l.replace('<', '&lt;')
    l = l.replace('>', '&gt;')
    return l


def searchText(self, query, index='words'):
    """Search the fulltext index and return a list of file ids/P-numbers.

    Method of CDLIRoot. The catalog is fetched from self under the
    attribute name stored in self.file_catalog.

    @param query: the query, passed to the index under the key 'query'.
    @param index: name of the catalog index to search (default 'words').
    @return: list holding the first seven characters of the getId value
        of every hit, in the order the catalog returned the hits.
    """
    idxQuery = {index: {'query': query}}
    idx = getattr(self, self.file_catalog)
    # getId is read as a plain attribute of the hit, it is not called --
    # presumably a catalog metadata column; verify against the catalog
    # schema. Only the leading seven characters (the P-number) are kept.
    return [res.getId[:7] for res in idx.search(idxQuery)]

# from PluginIndexes.common.util.py:parseIndexRequest:
#
# The class understands the following type of parameters:
#
# - old-style
parameters where the query for an index is passed as the value inside + # the request dictionary where the index name is the name of the key. + # Additional parameters for an index could be passed as index+"_usage" ... + # + # + # - dictionary-style parameters specify a query for an index as + # an entry in the request dictionary where the key corresponds to the + # name of the index and the value is a dictionary with the parameters + # passed to the index. + # + # Allowed keys of the parameter dictionary: + # + # 'query' - contains the query (either string, list or tuple) (required) + # + # other parameters depend on the index + # + # + # - record-style parameters specify a query for an index as an instance of the + # Record class. This usually happens when parameters from a web form use + # the "record" type, e.g. <input type="text" name="path.query:record:string">. + # All restrictions of the dictionary-style parameters apply to the record-style + # parameters + + + def showFile(self,fileId,wholePage=False): """show a file @param fileId: P-Number of the document to be displayed @@ -2069,54 +2137,53 @@ class CDLIRoot(Folder): wordlist=[word] for line in file.split("\n"): + line = formatAtfLineHtml(unicodify(line)) found=False for word in wordlist: - try: # just a hack because of possible unicode errors in line - if line.find(word)>-1: + try: # just a hack because of possible unicode errors in line + if line.find(word)>-1: if lineList: #liste of moeglichen Zeilennummern - num=line.split(".")[0] #Zeilenummer ist alles vor dem .
def tagWordInFile(self, fileId, word, indexName='words', regExp=False):
    """Return the text of a file as HTML with the search words highlighted.

    Method of CDLIRoot.

    @param fileId: id handed to self.showFile to fetch the text.
    @param word: the search term; it is lower-cased before use.
    @param indexName: key into self.splitter selecting the splitter that
        breaks the term into words; also passed to self.findWordRegExp.
    @param regExp: when true, the term is handed to self.findWordRegExp
        and every word it returns is highlighted instead.
    @return: unicode string made of all lines that formatAtfLineHtml
        leaves non-empty, joined with '<br>' plus a newline.
    """
    import re  # local import: the module's import block is outside this span

    text = self.showFile(fileId)
    # NOTE(review): the markup of this template and of the line separator
    # used in the return statement was missing from the reviewed copy;
    # confirm the span class against the page stylesheet.
    tagStr = u'<span class="found">%s</span>'

    # search using lowercase
    word = word.lower()
    if regExp:
        # with a regexp, highlight every word of the index that matches it
        wordlist = self.findWordRegExp(indexName, word)
    else:
        # split the search term into words according to the corresponding
        # splitter
        wordlist = self.splitter[indexName].process([word])

    # Empty words would match at every position, so they are dropped.
    # Longest words come first so that a word wins over its own prefixes.
    words = sorted(set([unicodify(w) for w in wordlist if w]),
                   key=len, reverse=True)
    pattern = None
    if words:
        # One pass over the line with all words at once: text inserted for
        # an earlier word can never be tagged again. The first alternative
        # swallows the entities written by the HTML escaping so that a word
        # like 'lt' cannot tear '&lt;' apart.
        pattern = re.compile(
            u'&(?:amp|lt|gt);|(%s)' % u'|'.join([re.escape(w) for w in words]),
            re.IGNORECASE | re.UNICODE)

    def _tag(match):
        """Wrap a matched word in the tag; leave a matched entity alone."""
        if match.group(1) is None:
            return match.group(0)
        # keep the spelling found in the line, not the lower-cased term
        return tagStr % match.group(1)

    ret = []
    for line in text.split("\n"):
        line = formatAtfLineHtml(unicodify(line))
        if not line:
            # formatAtfLineHtml can produce empty lines
            continue
        if pattern is not None:
            line = pattern.sub(_tag, line)
        ret.append(line)

    return u'<br>\n'.join(ret)