--- cdli/cdli_files.py 2010/03/19 14:01:41 1.115 +++ cdli/cdli_files.py 2011/04/27 16:19:27 1.117 @@ -306,10 +306,15 @@ def manage_addCDLIFile(self,id,title,loc def checkUTF8(data): """check utf 8""" + if not isinstance(data, str): + logging.error("checkUTF8 data is not string! (%s)"%repr(data)) + try: - data.encode('utf-8') + data.decode('utf-8') + logging.debug("checkUTF8: ok!") return True except: + logging.debug("checkUTF8: false!") return False @@ -335,7 +340,7 @@ def splitatf(fh,dir=None,ext=None): i=0 #ROC: why split \n first and then \r??? - if (type(fh) is StringType) or (type(fh) is UnicodeType): + if isinstance(fh, basestring): iter=fh.split("\n") else: iter=fh.readlines() @@ -369,7 +374,7 @@ def splitatf(fh,dir=None,ext=None): if dir: filename=os.path.join(dir,filename) nf=file(filename,"w") - logging.info("open %s"%filename) + logging.debug("open %s"%filename) if nf: nf.write(line.replace("\n","")+"\n") @@ -378,8 +383,9 @@ def splitatf(fh,dir=None,ext=None): except: pass - if not((type(fh) is StringType) or (type(fh) is UnicodeType)): + if not isinstance(fh, basestring): fh.close() + return ret,len(os.listdir(dir)) @@ -888,8 +894,9 @@ class CDLIRoot(Folder): - def searchText(self, query, index='graphemes'): - """searches query in the fulltext index and returns a list of file ids/P-numbers""" + def searchText(self, query, index='graphemes', resultFilter=None): + """searches query in the fulltext index and returns a list of file IDs/P-numbers + resultFilter is matched against the beginning of the file ID""" # see also: http://www.plope.com/Books/2_7Edition/SearchingZCatalog.stx#2-13 logging.debug("searchtext for '%s' in index %s"%(query,index)) #import Products.ZCTextIndex.QueryParser @@ -899,8 +906,11 @@ class CDLIRoot(Folder): idx = getattr(self, self.file_catalog) # do search resultset = idx.search(query_request=idxQuery,sort_index='textid') - # put only the P-Number in the result - results = [res.getId[:7] for res in resultset] + # put only the P-Number in the result + if resultFilter is None: + results = [res.getId[:7] for res in resultset] + else: + results = [res.getId[:7] for res in resultset if res.getId.startswith(resultFilter)] logging.debug("searchtext: found %d texts"%len(results)) return results