# HG changeset patch # User dwinter # Date 1370446629 -7200 # Node ID bcd8076ff7ec448d0281480c9e4868129cb466ef # Parent 7027fbf1d141de0b428033e3267b0d63a26c5e8c random selection of entries bug fixes diff -r 7027fbf1d141 -r bcd8076ff7ec addDriToIndexMeta.py --- a/addDriToIndexMeta.py Fri May 24 16:53:09 2013 +0200 +++ b/addDriToIndexMeta.py Wed Jun 05 17:37:09 2013 +0200 @@ -7,6 +7,7 @@ import managePurls.manageIndexMetaPURLs as manageIndexMetaPURLs import re from lxml import etree +import sys from os.path import join, getsize @@ -14,14 +15,36 @@ parseErrorFile = file("/tmp/addDRIParseErrors.txt","w") alreadyExistsFile = file("/tmp/addDRIalreadyExists.txt","w") + +def correctAuthor(tree): + """ersetzt in den autor felder "\r" durch ;""" + + + authors = tree.xpath("/resource/meta/bib/author") + for author in authors: + + if author.text is not None: + splitted =author.text.split("\n") + txt = "; ".join(splitted) + + author.text=txt + + + def addPURL(fl,purl,test=False): try: tree = etree.parse(fl) except: parseErrorFile.write("PARSE ERROR:"+fl+"\n") return False + dris = tree.xpath("/resource/meta/dri[@type='mpiwg']") + + correctAuthor(tree) + + + if len(dris)==0: # erzeuge neu newDri = etree.Element("dri",type="mpiwg") newDri.text=purl @@ -34,18 +57,23 @@ else: dris[0].text=purl alreadyExistsFile.write("%s \n"%fl) - return True + #return True print etree.tostring(tree, pretty_print=True) + if not test: try: + os.rename(fl, fl+"_mpiwg_dri") out = etree.tostring(tree, encoding="UTF-8",xml_declaration=False) fo = file(fl,"w") fo.write(out) fo.close except: + + print sys.exc_info()[0] + print sys.exc_info()[1] errorFile.write(fl+"\n") return True @@ -73,4 +101,4 @@ dirs.remove(dir) if __name__ == '__main__': - addDriToIndexMeta("/mpiwg/online/",delpath="/mpiwg/online",test=True) + addDriToIndexMeta("/mpiwg/online/permanent/vlp",delpath="/mpiwg/online",test=False) diff -r 7027fbf1d141 -r bcd8076ff7ec managePurls/manageIndexMetaPURLs.py --- a/managePurls/manageIndexMetaPURLs.py Fri May 24 16:53:09 2013 +0200 +++ b/managePurls/manageIndexMetaPURLs.py Wed Jun 05 17:37:09 2013 +0200 @@ -225,8 +225,35 @@ return ERROR,None - - + def getExistingRandom(self,number): + """gibt zufaellig existierende purls zurueck""" + + qst = "select count(*) from purls" + max = self.purlDB.query(qst)[0].count + + random.seed() + + ret=set() + + while len(ret)