--- cdli/cdli_files.py 2007/10/22 16:26:40 1.80.2.3
+++ cdli/cdli_files.py 2007/10/24 20:36:07 1.80.2.4
@@ -28,6 +28,9 @@ import copy
import codecs
import sys
+import cdliSplitter
+
+
def unicodify(s):
"""decode str (utf-8 or latin-1 representation) into unicode object"""
if not s:
@@ -50,6 +53,22 @@ def utf8ify(s):
else:
return s.encode('utf-8')
+def formatAtfLineHtml(l, nolemma=True):
+    """Escape the HTML-special characters of one ATF line.
+
+    @param l: the ATF line; an empty or None value yields ""
+    @param nolemma: if true, lemma lines (starting with '#lem:') yield ""
+    @return: the line with '&', '<' and '>' replaced by HTML entities
+    """
+    if not l:
+        return ""
+    if nolemma and l.lstrip().startswith('#lem:'):
+        # lemma lines are suppressed entirely
+        return ""
+    # '&' must be escaped first so the entities added below are not re-escaped
+    l = l.replace('&', '&amp;')
+    return l.replace('<', '&lt;').replace('>', '&gt;')
+
def generateXMLReturn(hash):
"""erzeugt das xml file als returnwert fuer uploadATFRPC"""
@@ -1812,6 +1831,7 @@ class CDLIFileFolder(extVersionedFileFol
def getFile(self,fn):
"""get the content of the file fn"""
+ logging.debug("getFile: %s"%repr(fn))
if not self.hasObject(fn):
# search deeper
founds=self.CDLICatalog.search({'title':fn})
@@ -2030,6 +2050,13 @@ class CDLIRoot(Folder):
meta_type="CDLIRoot"
downloadCounterBaskets=0 # counts the current basket downloads if counter > 10 no downloads are possible
+ file_catalog = 'CDLICatalog'
+
+ # word splitter for search
+ splitter = {'words':cdliSplitter.wordSplitter(),
+ 'graphemes':cdliSplitter.graphemeSplitter()}
+
+
def deleteFiles(self,ids):
"""delete files"""
for id in ids:
@@ -2042,6 +2069,47 @@ class CDLIRoot(Folder):
+    def searchText(self, query, index='words'):
+        """Look up query in a fulltext index of the file catalog.
+
+        @param query: the search expression, passed to the index as 'query'
+        @param index: name of the catalog index to search
+        @return: list with the first seven characters of each hit's getId
+        """
+        catalog = getattr(self, self.file_catalog)
+        hits = catalog.search({index: {'query': query}})
+        # keep only the leading P-number part of every id
+        return [hit.getId[:7] for hit in hits]
+
+ # from PluginIndexes.common.util.py:parseIndexRequest:
+ #
+ # The class understands the following type of parameters:
+ #
+ # - old-style parameters, where the query for an index is passed as a value inside
+ # the request dictionary and the index name is the name of the key.
+ # Additional parameters for an index could be passed as index+"_usage" ...
+ #
+ #
+ # - dictionary-style parameters specify a query for an index as
+ # an entry in the request dictionary where the key corresponds to the
+ # name of the index and the value is a dictionary with the parameters
+ # passed to the index.
+ #
+ # Allowed keys of the parameter dictionary:
+ #
+ # 'query' - contains the query (either string, list or tuple) (required)
+ #
+ # other parameters depend on the index
+ #
+ #
+ # - record-style parameters specify a query for an index as an instance of the
+ # Record class. This usually happens when parameters from a web form use
+ # the "record" type, e.g. <input type="text" name="path.query:record:string">.
+ # All restrictions of the dictionary-style parameters apply to the record-style
+ # parameters.
+
+
+
def showFile(self,fileId,wholePage=False):
"""show a file
@param fileId: P-Number of the document to be displayed
@@ -2069,54 +2137,53 @@ class CDLIRoot(Folder):
wordlist=[word]
for line in file.split("\n"):
+ line = formatAtfLineHtml(unicodify(line))
found=False
for word in wordlist:
- try: # just a hack because of possible unicode errors in line
- if line.find(word)>-1:
+ try: # just a hack because of possible unicode errors in line
+ if line.find(word)>-1:
if lineList: #liste of moeglichen Zeilennummern
- num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile
-
- if num in lineList:
-
- ret.append(line)
- else: # nimm alles ohne line check
+ num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile
+ if num in lineList:
ret.append(line)
-
- break;
- except:
- pass
+ else: # nimm alles ohne line check
+ ret.append(line)
+ break;
+ except:
+ pass
return ret
+
- def tagWordInFile(self,fileId,word,lineList=None,regExp=False,indexName=""):
+ def tagWordInFile(self,fileId,word,indexName='words',regExp=False):
"""get text with word highlighted from FileId"""
file=self.showFile(fileId)
tagStr=u'%s'
ret=[]
+ # lowercase the search term; each line is lowercased as well before it is tested below
+ word = word.lower()
if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
wordlist=self.findWordRegExp(indexName,word)
else:
- wordlist=[word]
+ # split the search term with the splitter registered for indexName (presumably yields the list of tokens - confirm)
+ #try:
+ wordlist = self.splitter[indexName].process([word])
+ #except:
+ # wordlist=[word]
for line in file.split("\n"):
- line = unicodify(line)
- found=False
- for word in wordlist:
- if line.find(word)>-1: #word ist gefunden dann makiere und breche die Schleife ab
- if lineList: #liste of moeglichen Zeilennummern
- num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile
-
- if num in lineList:
-
- ret.append(line.replace(word,tagStr%word))
-
- else: # nimm alles ohne line check
- ret.append(line.replace(word,tagStr%word))
- found=True
- break
- if not found: #word wurde nicht gefunden keine makierung
- ret.append(line)
+ line = formatAtfLineHtml(unicodify(line))
+ if not line:
+ # formatAtfLineHtml returns "" for empty lines and, by default, for '#lem:' lines; skip those
+ continue
+
+ for w in wordlist:
+ if line.lower().find(w)>-1:
+ # word found: wrap it with tagStr; NOTE(review): replace() is case-sensitive although the test above lowercases the line - confirm
+ line = line.replace(w,tagStr%w)
+
+ ret.append(line)
return u' \n'.join(ret)