--- cdli/cdli_files.py 2007/10/26 22:45:12 1.80.2.5
+++ cdli/cdli_files.py 2007/12/13 19:20:45 1.80.2.11
@@ -53,15 +53,11 @@ def utf8ify(s):
else:
return s.encode('utf-8')
-def formatAtfLineHtml(l, nolemma=True):
+def formatAtfHtml(l):
"""escape special ATF characters for HTML"""
if not l:
return ""
- if nolemma:
- # ignore lemma lines
- if l.lstrip().startswith('#lem:'):
- return ""
# replace &
l = l.replace('&','&amp;')
# replace angular brackets
@@ -69,7 +65,60 @@ def formatAtfLineHtml(l, nolemma=True):
l = l.replace('>','&gt;')
return l
+def formatAtfLineHtml(l, nolemma=True):
+ """format ATF line for HTML"""
+ if not l:
+ return ""
+
+ if nolemma:
+ # ignore lemma lines
+ if l.lstrip().startswith('#lem:'):
+ return ""
+
+ return formatAtfHtml(l)
+
+
+def formatAtfFullLineNum(txt, nolemma=True):
+ """format full line numbers in ATF text"""
+ # surface codes
+ surfaces = {'@obverse':'obv',
+ '@reverse':'rev',
+ '@surface':'surface',
+ '@edge':'edge',
+ '@left':'left',
+ '@right':'right',
+ '@top':'top',
+ '@bottom':'bottom',
+ '@face':'face',
+ '@seal':'seal'}
+
+ if not txt:
+ return ""
+
+ ret = []
+ surf = ""
+ col = ""
+ for line in txt.split("\n"):
+ line = unicodify(line)
+ if line and line[0] == '@':
+ # surface or column
+ words = line.split(' ')
+ if words[0] in surfaces:
+ surf = line.replace(words[0],surfaces[words[0]]).strip()
+
+ elif words[0] == '@column':
+ col = words[1]
+
+ elif line and line[0] in '123456789':
+ # ordinary line -> add line number
+ line = "%s:%s:%s"%(surf,col,line)
+
+ ret.append(line)
+
+ return '\n'.join(ret)
+
+
def generateXMLReturn(hash):
"""erzeugt das xml file als returnwert fuer uploadATFRPC"""
@@ -1584,6 +1633,7 @@ class CDLIFile(extVersionedFile,CatalogA
content_meta_type = ["CDLI File Object"]
default_catalog='CDLICatalog'
+
security.declareProtected('manage','index_html')
def getLastVersionData(self):
@@ -1594,6 +1644,11 @@ class CDLIFile(extVersionedFile,CatalogA
"""get last version data"""
return self.getContentObject().getFormattedData()
+ def getTextId(self):
+ """returns P-number of text"""
+ # assuming that its the beginning of the title
+ return self.title[:7]
+
#security.declarePublic('history')
def history(self):
"""history"""
@@ -1795,9 +1850,9 @@ class CDLIFileFolder(extVersionedFileFol
meta_type="CDLI Folder"
file_meta_type=['CDLI file']
folder_meta_type=['CDLI Folder']
-
- default_catalog='CDLICatalog'
- defaultFileCatalog=default_catalog #wenn dieses definiert ist, wird beim hinzufuegen einer neuen version eines files dieser catalog neuindiziert
+
+ file_catalog='CDLICatalog'
+
#downloadCounter=0 # counts how many download for all files currently run, be mehr als 5 wird verweigert.
tmpStore2={}
@@ -1834,7 +1889,7 @@ class CDLIFileFolder(extVersionedFileFol
logging.debug("getFile: %s"%repr(fn))
if not self.hasObject(fn):
# search deeper
- founds=self.CDLICatalog.search({'title':fn})
+ founds=getattr(self, self.file_catalog).search({'textid':fn})
if founds:
obj=founds[0].getObject().getContentObject()
else:
@@ -1937,7 +1992,7 @@ class CDLIFileFolder(extVersionedFileFol
def sortF(x,y):
return cmp(x[0],y[0])
- catalog=getattr(self,self.default_catalog)
+ catalog=getattr(self,self.file_catalog)
#tf,tfilename=mkstemp()
if not hasattr(self.temp_folder,'downloadCounter'):
self.temp_folder.downloadCounter=0
@@ -2069,52 +2124,38 @@ class CDLIRoot(Folder):
- def searchText(self, query, index='words'):
+ def searchText(self, query, index='graphemes'):
"""searches query in the fulltext index and returns a list of file ids/P-numbers"""
+ # see also: http://www.plope.com/Books/2_7Edition/SearchingZCatalog.stx#2-13
+ logging.debug("searchtext for '%s' in index %s"%(query,index))
+ #import Products.ZCTextIndex.QueryParser
+ #qp = QueryParser.QueryParser()
+ #logging.debug()
idxQuery = {index:{'query':query}}
idx = getattr(self, self.file_catalog)
- results = []
# do search
- resultset = idx.search(idxQuery)
- for res in resultset:
- # put only the P-Number in the result
- results.append(res.getId[:7])
+ resultset = idx.search(query_request=idxQuery,sort_index='textid')
+ # put only the P-Number in the result
+ results = [res.getId[:7] for res in resultset]
+ logging.debug("searchtext: found %d texts"%len(results))
return results
- # from PluginINdexes.common.util.py:parseIndexRequest:
- #
- # The class understands the following type of parameters:
- #
- # - old-style parameters where the query for an index as value inside
- # the request directory where the index name is the name of the key.
- # Additional parameters for an index could be passed as index+"_usage" ...
- #
- #
- # - dictionary-style parameters specify a query for an index as
- # an entry in the request dictionary where the key corresponds to the
- # name of the index and the key is a dictionary with the parameters
- # passed to the index.
- #
- # Allowed keys of the parameter dictionary:
- #
- # 'query' - contains the query (either string, list or tuple) (required)
- #
- # other parameters depend on the the index
- #
- #
- # - record-style parameters specify a query for an index as instance of the
- # Record class. This happens usually when parameters from a web form use
- # the "record" type e.g. <input type="text" name="path.query:record:string">.
- # All restrictions of the dictionary-style parameters apply to the record-style
- # parameters
+ def getFile(self, pnum):
+ """get the translit file with the given pnum"""
+ f = getattr(self, self.file_catalog).search({'textid':pnum})
+ if not f:
+ return ""
+
+ return f[0].getObject().getData()
+
def showFile(self,fileId,wholePage=False):
"""show a file
@param fileId: P-Number of the document to be displayed
"""
- f=self.CDLICatalog({'title':fileId})
+ f=getattr(self, self.file_catalog).search({'textid':fileId})
if not f:
return ""
@@ -2127,64 +2168,118 @@ class CDLIRoot(Folder):
def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False,):
"""get lines with word from FileId"""
+ logging.debug("showwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId))
- file=self.showFile(fileId)
+ file = formatAtfFullLineNum(self.getFile(fileId))
ret=[]
- # search using lowercase
- word = word.lower()
- if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
- wordlist=self.findWordRegExp(indexName,word)
- else:
- # split the search term into words according to the corresponding splitter
- #try:
- wordlist = self.splitter[indexName].process([word])
- #except:
- # wordlist=[word]
+ # add whitespace before and whitespace and line-end to splitter bounds expressions
+ bounds = self.splitter[indexName].bounds
+ splitexp = "(%s|\s)(%%s)(%s|\s|\Z)"%(bounds,bounds)
+ # clean word expression
+ # TODO: this should use QueryParser itself
+ # take out double quotes
+ word = word.replace('"','')
+ # take out ignorable signs
+ ignorable = self.splitter[indexName].ignorex
+ word = ignorable.sub('', word)
+ # compile into regexp objects and escape parens
+ wordlist = [re.compile(splitexp%re.escape(w)) for w in word.split(' ')]
+
for line in file.split("\n"):
- line = formatAtfLineHtml(unicodify(line))
- if not line:
- # formatAtf can produce empty lines
- continue
for word in wordlist:
- if line.lower().find(word)>-1:
+ #logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,ignoreable.sub('',line)))
+ if word.search(ignorable.sub('',line)):
+ line = formatAtfLineHtml(line)
ret.append(line)
+ break
+
return ret
+
+
+ def showWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False):
+ """
+ get lines with word from all ids in list FileIds.
+ returns dict with id:lines pairs.
+ """
+ logging.debug("showwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds))
+
+ return dict([(id,self.showWordInFile(id, word, indexName, regExp)) for id in fileIds])
def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False):
"""get text with word highlighted from FileId"""
+ logging.debug("tagwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId))
- file=self.showFile(fileId)
- tagStr=u'<span class="found">%s</span>'
+ file=self.getFile(fileId)
+ tagStart=u'<span class="found">'
+ tagEnd=u'</span>'
+ tagStr=tagStart + u'%%s' + tagEnd
ret=[]
- # search using lowercase
- word = word.lower()
- if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
- wordlist=self.findWordRegExp(indexName,word)
- else:
- # split the search term into words according to the corresponding splitter
- #try:
- wordlist = self.splitter[indexName].process([word])
- #except:
- # wordlist=[word]
+ # add whitespace to splitter bounds expressions and compile into regexp object
+ bounds = self.splitter[indexName].bounds
+ wordsplit = re.compile("(%s|\s)"%bounds)
+ # clean word expression
+ # TODO: this should use QueryParser itself
+ word = word.replace('"','') # take out double quotes
+ # take out ignoreable signs
+ ignorable = self.splitter[indexName].ignorex
+ word = ignorable.sub('', word)
+ # split search terms by blanks
+ words = word.split(' ')
+ # split search terms again (for grapheme search with words)
+ splitwords = dict(((w,self.splitter[indexName].process([w])) for w in words))
for line in file.split("\n"):
- line = formatAtfLineHtml(unicodify(line))
- if not line:
- # formatAtf can produce empty lines
+ line = unicodify(line)
+ # ignore lemma lines
+ if line.lstrip().startswith('#lem:'):
continue
+
+ # first scan
+ hitwords = []
+ for w in words:
+ if ignorable.sub('',line).find(w) > -1:
+ # word is in line
+ # append split word for grapheme search with words
+ hitwords.extend(splitwords[w])
+ #hitwords.extend(wordsplit.split(w))
+
+ # examine hits closer
+ if hitwords:
+ # split line into words
+ parts = wordsplit.split(line)
+ line = ""
+ for p in parts:
+ #logging.debug("tagwordinfile: searching for %s in %s"%(p,hitwords))
+ # reassemble line
+ if ignorable.sub('', p) in hitwords:
+ #logging.debug("tagwordinfile: found %s in %s"%(p,hitwords))
+ # this part was found
+ line += tagStart + formatAtfHtml(p) + tagEnd
+ else:
+ line += formatAtfHtml(p)
+
+ else:
+ # no hits
+ line = formatAtfHtml(line)
- for w in wordlist:
- if line.lower().find(w)>-1:
- #word ist gefunden dann makiere
- line = line.replace(w,tagStr%w)
-
ret.append(line)
return u' \n'.join(ret)
+
+
+ def tagWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False):
+ """
+ get texts with highlighted word from all ids in list FileIds.
+ returns dict with id:text pairs.
+ """
+ logging.debug("tagwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds))
+ return dict([(id,self.tagWordInFile(id, word, indexName, regExp)) for id in fileIds])
+
+
def URLquote(self,str):
"""quote url"""
return urllib.quote(str)