--- cdli/cdli_files.py	2007/10/26 22:45:12	1.80.2.5
+++ cdli/cdli_files.py	2007/11/02 15:37:46	1.80.2.6
@@ -53,15 +53,11 @@ def utf8ify(s):
     else:
         return s.encode('utf-8')
 
-def formatAtfLineHtml(l, nolemma=True):
+def formatAtfHtml(l):
     """escape special ATF characters for HTML"""
     if not l:
         return ""
 
-    if nolemma:
-        # ignore lemma lines
-        if l.lstrip().startswith('#lem:'):
-            return ""
     # replace &
     l = l.replace('&','&amp;')
     # replace angular brackets
     l = l.replace('<','&lt;')
@@ -69,7 +65,60 @@ def formatAtfLineHtml(l, nolemma=True):
     l = l.replace('>','&gt;')
     return l
 
+def formatAtfLineHtml(l, nolemma=True):
+    """format ATF line for HTML"""
+    if not l:
+        return ""
+    if nolemma:
+        # ignore lemma lines
+        if l.lstrip().startswith('#lem:'):
+            return ""
+
+    return formatAtfHtml(l)
+
+
+
+def formatAtfFullLineNum(txt, nolemma=True):
+    """format full line numbers in ATF text"""
+    # surface codes
+    surfaces = {'@obverse':'obv',
+                '@reverse':'rev',
+                '@surface':'surface',
+                '@edge':'edge',
+                '@left':'left',
+                '@right':'right',
+                '@top':'top',
+                '@bottom':'bottom',
+                '@face':'face',
+                '@seal':'seal'}
+
+    if not txt:
+        return ""
+
+    ret = []
+    surf = ""
+    col = ""
+    for line in txt.split("\n"):
+        line = unicodify(line)
+        if line and line[0] == '@':
+            # surface or column
+            words = line.split(' ')
+            if words[0] in surfaces:
+                surf = line.replace(words[0],surfaces[words[0]]).strip()
+
+            elif words[0] == '@column':
+                col = words[1]
+
+        elif line and line[0] in '123456789':
+            # ordinary line -> add line number
+            line = "%s:%s:%s"%(surf,col,line)
+
+        ret.append(line)
+
+    return '\n'.join(ret)
+
+
 
 def generateXMLReturn(hash):
     """erzeugt das xml file als returnwert fuer uploadATFRPC"""
@@ -1584,6 +1633,7 @@ class CDLIFile(extVersionedFile,CatalogA
     content_meta_type = ["CDLI File Object"]
 
     default_catalog='CDLICatalog'
+    security.declareProtected('manage','index_html')
 
 
     def getLastVersionData(self):
@@ -1594,6 +1644,11 @@ class CDLIFile(extVersionedFile,CatalogA
         """get last version data"""
        return self.getContentObject().getFormattedData()
 
+    def getTextId(self):
+        """returns P-number of text"""
+        # assuming that its the beginning of the title
+        return self.title[:7]
+
     #security.declarePublic('history')
     def history(self):
         """history"""
@@ -1795,9 +1850,9 @@ class CDLIFileFolder(extVersionedFileFol
     meta_type="CDLI Folder"
     file_meta_type=['CDLI file']
     folder_meta_type=['CDLI Folder']
-
-    default_catalog='CDLICatalog'
-    defaultFileCatalog=default_catalog #wenn dieses definiert ist, wird beim hinzufuegen einer neuen version eines files dieser catalog neuindiziert
+
+    file_catalog='CDLICatalog'
+
     #downloadCounter=0 # counts how many download for all files currently run, be mehr als 5 wird verweigert.
     tmpStore2={}
 
@@ -1834,7 +1889,7 @@ class CDLIFileFolder(extVersionedFileFol
         logging.debug("getFile: %s"%repr(fn))
         if not self.hasObject(fn):
             # search deeper
-            founds=self.CDLICatalog.search({'title':fn})
+            founds=getattr(self, self.file_catalog).search({'textid':fn})
             if founds:
                 obj=founds[0].getObject().getContentObject()
             else:
@@ -1937,7 +1992,7 @@ class CDLIFileFolder(extVersionedFileFol
         def sortF(x,y):
             return cmp(x[0],y[0])
 
-        catalog=getattr(self,self.default_catalog)
+        catalog=getattr(self,self.file_catalog)
         #tf,tfilename=mkstemp()
         if not hasattr(self.temp_folder,'downloadCounter'):
             self.temp_folder.downloadCounter=0
@@ -2069,52 +2124,33 @@ class CDLIRoot(Folder):
 
 
 
-    def searchText(self, query, index='words'):
+    def searchText(self, query, index='graphemes'):
         """searches query in the fulltext index
         and returns a list of file ids/P-numbers"""
+        # see also: http://www.plope.com/Books/2_7Edition/SearchingZCatalog.stx#2-13
         idxQuery = {index:{'query':query}}
         idx = getattr(self, self.file_catalog)
-        results = []
         # do search
-        resultset = idx.search(idxQuery)
-        for res in resultset:
-            # put only the P-Number in the result
-            results.append(res.getId[:7])
+        resultset = idx.search(query_request=idxQuery,sort_index='textid')
+        # put only the P-Number in the result
+        results = [res.getId[:7] for res in resultset]
         return results
 
-    # from PluginINdexes.common.util.py:parseIndexRequest:
-    #
-    # The class understands the following type of parameters:
-    #
-    # - old-style parameters where the query for an index as value inside
-    #   the request directory where the index name is the name of the key.
-    #   Additional parameters for an index could be passed as index+"_usage" ...
-    #
-    #
-    # - dictionary-style parameters specify a query for an index as
-    #   an entry in the request dictionary where the key corresponds to the
-    #   name of the index and the key is a dictionary with the parameters
-    #   passed to the index.
-    #
-    #   Allowed keys of the parameter dictionary:
-    #
-    #   'query' - contains the query (either string, list or tuple) (required)
-    #
-    #   other parameters depend on the the index
-    #
-    #
-    # - record-style parameters specify a query for an index as instance of the
-    #   Record class. This happens usually when parameters from a web form use
-    #   the "record" type e.g. <input type="text" name="path:record">.
-    #
-    #   All restrictions of the dictionary-style parameters apply to the record-style
-    #   parameters
+    def getFile(self, pnum):
+        """get the translit file with the given pnum"""
+        f = getattr(self, self.file_catalog).search({'textid':pnum})
+        if not f:
+            return ""
+
+        return f[0].getObject().getData()
+
     def showFile(self,fileId,wholePage=False):
         """show a file
         @param fileId: P-Number of the document to be displayed
         """
-        f=self.CDLICatalog({'title':fileId})
+        f=getattr(self, self.file_catalog).search({'textid':fileId})
         if not f:
             return ""
@@ -2128,62 +2164,75 @@ class CDLIRoot(Folder):
 
     def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False,):
         """get lines with word from FileId"""
-        file=self.showFile(fileId)
+        file = formatAtfFullLineNum(self.getFile(fileId))
         ret=[]
 
-        # search using lowercase
-        word = word.lower()
-        if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
-            wordlist=self.findWordRegExp(indexName,word)
-        else:
-            # split the search term into words according to the corresponding splitter
-            #try:
-            wordlist = self.splitter[indexName].process([word])
-            #except:
-            #    wordlist=[word]
+        # add whitespace before and whitespace and line-end to splitter bounds expressions
+        bounds = self.splitter[indexName].bounds
+        splitexp = "(%s|\s)(%%s)(%s|\s|\Z)"%(bounds,bounds)
+        # compile into regexp objects
+        wordlist = [re.compile(splitexp%w,re.IGNORECASE) for w in word.split(' ')]
+
         for line in file.split("\n"):
-            line = formatAtfLineHtml(unicodify(line))
-            if not line:
-                # formatAtf can produce empty lines
-                continue
-
             for word in wordlist:
-                if line.lower().find(word)>-1:
+                #logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,line))
+                if word.search(line):
+                    line = formatAtfLineHtml(line)
                     ret.append(line)
+                    break
+
         return ret
 
     def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False):
         """get text with word highlighted from FileId"""
-        file=self.showFile(fileId)
-        tagStr=u'%s'
+        file=self.getFile(fileId)
+        tagStart=u''
+        tagEnd=u''
+        tagStr=tagStart + u'%%s' + tagEnd
         ret=[]
 
-        # search using lowercase
-        word = word.lower()
-        if regExp: # wenn regexp dann generiere alle worte aus der list die der regexp entsprechen
-            wordlist=self.findWordRegExp(indexName,word)
-        else:
-            # split the search term into words according to the corresponding splitter
-            #try:
-            wordlist = self.splitter[indexName].process([word])
-            #except:
-            #    wordlist=[word]
+        # add whitespace to splitter bounds expressions and compile into regexp object
+        bounds = self.splitter[indexName].bounds
+        wordsplit = re.compile("(%s|\s)"%bounds)
+        # split search terms by blanks
+        words = word.split(' ')
 
         for line in file.split("\n"):
-            line = formatAtfLineHtml(unicodify(line))
-            if not line:
-                # formatAtf can produce empty lines
+            line = unicodify(line)
+            # ignore lemma lines
+            if line.lstrip().startswith('#lem:'):
                 continue
+
+            # first scan
+            hitwords = []
+            for w in words:
+                if line.find(w) > -1:
+                    # word is in line
+                    hitwords.append(w)
+
+            # examine hits closer
+            if hitwords:
+                # split line into words
+                parts = wordsplit.split(line)
+                line = ""
+                for p in parts:
+                    # reassemble line
+                    if p in hitwords:
+                        # this part was found
+                        line += tagStart + formatAtfHtml(p) + tagEnd
+                    else:
+                        line += formatAtfHtml(p)
+
+            else:
+                # no hits
+                line = formatAtfHtml(line)
 
-            for w in wordlist:
-                if line.lower().find(w)>-1:
-                    #word ist gefunden dann makiere
-                    line = line.replace(w,tagStr%w)
-            ret.append(line)
+            ret.append(line)
 
         return u'<br>\n'.join(ret)
+
 
     def URLquote(self,str):
         """quote url"""
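
For orientation, the sketch below (not part of the patch) walks through the retrieval flow the new helpers implement: formatAtfFullLineNum-style numbering prefixes each content line with "surface:column:", showWordInFile-style matching only accepts search words delimited by whitespace or grapheme boundaries, and tagWordInFile-style highlighting splits a line at those boundaries and wraps the hits. The BOUNDS expression, the <span class="hit"> markup and the sample ATF text are assumptions made for the sketch; in the product the bounds come from self.splitter[indexName].bounds, the markup from tagStart/tagEnd, and the text from the CDLI file catalog. HTML escaping via formatAtfHtml is left out for brevity.

import re

SURFACES = {'@obverse': 'obv', '@reverse': 'rev', '@surface': 'surface',
            '@edge': 'edge', '@left': 'left', '@right': 'right',
            '@top': 'top', '@bottom': 'bottom', '@face': 'face', '@seal': 'seal'}

def format_full_line_num(txt):
    """prefix numbered ATF lines with 'surface:column:' like formatAtfFullLineNum"""
    ret, surf, col = [], "", ""
    for line in txt.split("\n"):
        if line and line[0] == '@':
            words = line.split(' ')
            if words[0] in SURFACES:
                surf = line.replace(words[0], SURFACES[words[0]]).strip()
            elif words[0] == '@column':
                col = words[1]
        elif line and line[0] in '123456789':
            line = "%s:%s:%s" % (surf, col, line)
        ret.append(line)
    return '\n'.join(ret)

# assumed stand-in for splitter.bounds: characters that delimit graphemes
BOUNDS = r"[-.\[\]#?!]"

def find_lines(atf, term):
    """return numbered lines containing a search word, bounded as in showWordInFile"""
    splitexp = r"(%s|\s)(%%s)(%s|\s|\Z)" % (BOUNDS, BOUNDS)
    patterns = [re.compile(splitexp % re.escape(w), re.IGNORECASE)
                for w in term.split(' ')]
    return [line for line in format_full_line_num(atf).split("\n")
            if any(p.search(line) for p in patterns)]

def tag_words(atf, term, tag_start='<span class="hit">', tag_end='</span>'):
    """highlight search words by splitting each line at bounds, as in tagWordInFile"""
    wordsplit = re.compile(r"(%s|\s)" % BOUNDS)
    words = term.split(' ')
    ret = []
    for line in atf.split("\n"):
        if line.lstrip().startswith('#lem:'):   # lemma lines are skipped
            continue
        if any(line.find(w) > -1 for w in words):
            parts = wordsplit.split(line)       # capturing split keeps the delimiters
            line = "".join(tag_start + p + tag_end if p in words else p
                           for p in parts)
        ret.append(line)
    return '<br>\n'.join(ret)

atf = "@obverse\n@column 1\n1. lugal-e e2 mu-du3\n2. dingir gal-gal"
print(find_lines(atf, "lugal"))   # ['obv:1:1. lugal-e e2 mu-du3']
print(tag_words(atf, "lugal"))    # line 1 comes back with lugal wrapped in the span

The whitespace-or-bounds pattern is what keeps a search for "du" from matching inside "du3" while still matching "du" between grapheme delimiters, which is the point of building the regexps from the splitter's own bounds expression instead of doing a plain substring search.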