--- cdli/cdli_files.py 2006/12/22 11:56:08 1.53 +++ cdli/cdli_files.py 2007/01/08 14:36:28 1.56 @@ -23,6 +23,76 @@ from ZPublisher.BaseRequest import Reque import threading from BTrees.OOBTree import OOBTree +def unique(s): + """Return a list of the elements in s, but without duplicates. + + For example, unique([1,2,3,1,2,3]) is some permutation of [1,2,3], + unique("abcabc") some permutation of ["a", "b", "c"], and + unique(([1, 2], [2, 3], [1, 2])) some permutation of + [[2, 3], [1, 2]]. + + For best speed, all sequence elements should be hashable. Then + unique() will usually work in linear time. + + If not possible, the sequence elements should enjoy a total + ordering, and if list(s).sort() doesn't raise TypeError it's + assumed that they do enjoy a total ordering. Then unique() will + usually work in O(N*log2(N)) time. + + If that's not possible either, the sequence elements must support + equality-testing. Then unique() will usually work in quadratic + time. + (from the python cookbook) + """ + + n = len(s) + if n == 0: + return [] + + # Try using a dict first, as that's the fastest and will usually + # work. If it doesn't work, it will usually fail quickly, so it + # usually doesn't cost much to *try* it. It requires that all the + # sequence elements be hashable, and support equality comparison. + u = {} + try: + for x in s: + u[x] = 1 + except TypeError: + del u # move on to the next method + else: + return u.keys() + + # We can't hash all the elements. Second fastest is to sort, + # which brings the equal elements together; then duplicates are + # easy to weed out in a single pass. + # NOTE: Python's list.sort() was designed to be efficient in the + # presence of many duplicate elements. This isn't true of all + # sort functions in all languages or libraries, so this approach + # is more effective in Python than it may be elsewhere. + try: + t = list(s) + t.sort() + except TypeError: + del t # move on to the next method + else: + assert n > 0 + last = t[0] + lasti = i = 1 + while i < n: + if t[i] != last: + t[lasti] = last = t[i] + lasti += 1 + i += 1 + return t[:lasti] + + # Brute force is all that's left. + u = [] + for x in s: + if x not in u: + u.append(x) + return u + + class BasketContent(SimpleItem): """classe fuer den Inhalt eines Baskets""" @@ -1866,19 +1936,28 @@ class CDLIRoot(Folder): """get the index for debug purposes""" print "show" for x in self.lineIndex.iterkeys(): - print "word:",x - for y in self.lineIndex[x].iterkeys(): - print "doc",y,self.lineIndex[x][y] + print "word:",repr(x) + #for y in self.lineIndex[x].iterkeys(): + # print "doc",repr(y),repr(self.lineIndex[x][y]) return self.lineIndex - def searchInLineIndexDocs(self,word): + def searchInLineIndexDocs(self,word,uniq=True): """search occurences""" - return list(self.lineIndex.get(word.upper()).keys()) - + + try: + lst=list(self.lineIndex.get(word).keys()) + except: + lst=[] + if uniq: + return unique(lst) + else: + return lst + def getLinesFromIndex(self,word,doc): """get lines""" - return self.lineIndex[word][doc] + + return self.lineIndex.get(word)[doc] def cleanInLineIndex(self): """delete InlineIndex""" @@ -1896,9 +1975,12 @@ class CDLIRoot(Folder): li=self.lineIndex if li.has_key(key): - - if li[key].has_key(value[0]) and (not (value[1] in li[key][value[0]])): - li[key][value[0]].append(value[1]) # add it if now in the array + +# if li[key].has_key(value[0]) and (not (value[1] in li[key][value[0]])): + if li[key].has_key(value[0]): + tmp=li[key][value[0]] + tmp.append(value[1]) # add it if now in the array + li[key][value[0]]=tmp[0:] else: li[key][value[0]]=[value[1]] # new array for lines @@ -1920,7 +2002,70 @@ class CDLIRoot(Folder): return "" return f[0].getObject().getLastVersionFormattedData() + + def showLineFromFile(self,fileId,lineNum,word): + """get line lineNum fromFileId""" + file=self.showFile(fileId) + #str="^%s\.[^%s\.]*%s[^\n]*\n"%(lineNum,lineNum,word) + #str="^%s\..*?%s[^\n]*\n"%(lineNum,word) + + #print str + #m=re.search(str,file,flags=re.M|re.DOTALL) + #if m: + # return m.group() + #else: + # return "" + #ret=lineNum+"." + #splitted=file.split(lineNum+".") + #if len(splitted)>1: + #for part in splitted[1:]: + #if part.find(word)>-1: + # for x in part.split("\n"): + #ret+=x + #if x.find(word)>-1: + #break + #break; + #return ret + + def showWordInFile(self,fileId,word,lineList=None): + """get lines with word fromFileId""" + + file=self.showFile(fileId) + + ret=[] + for line in file.split("\n"): + if line.find(word)>-1: + if lineList: #liste of moeglichen Zeilennummern + num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile + + if num in lineList: + + ret.append(line) + else: # nimm alles ohne line check + ret.append(line) + return ret + + def tagWordInFile(self,fileId,word,lineList=None): + """get lines with word fromFileId""" + + file=self.showFile(fileId) + tagStr="""%s""" + ret=[] + for line in file.split("\n"): + if line.find(word)>-1: + if lineList: #liste of moeglichen Zeilennummern + num=line.split(".")[0] #Zeilenummer ist alles vor dem . in der Zeile + + if num in lineList: + + ret.append(line.replace(word,tagStr%word)) + else: # nimm alles ohne line check + ret.append(line.replace(word,tagStr%word)) + else: + ret.append(line) + return "
\n".join(ret) + def URLquote(self,str): """quote url""" return urllib.quote(str)