Mercurial > hg > purlService
view addDriToIndexMeta.py @ 35:d3ecbfd21e06
Merge with a25bfc49a068371c555ac034df1a24e349850163
author | dwinter |
---|---|
date | Wed, 23 Oct 2013 12:28:22 +0200 |
parents | 0190f49bce88 |
children | be8640c08d99 |
line wrap: on
line source
'''
Created on 01.11.2012

@author: dwinter

Walks a directory tree and writes a <dri type="mpiwg"> element into every
index.meta file found there.
'''
import os
import managePurls.manageIndexMetaPURLs as manageIndexMetaPURLs
import re
from lxml import etree
import sys
from os.path import join, getsize

# Log files, opened (and truncated) once at import time and shared by the
# functions below.  open() is used instead of the Python-2-only file() builtin.
errorFile = open("/tmp/addDRIErrors.txt", "w")  # files that could not be rewritten
parseErrorFile = open("/tmp/addDRIParseErrors.txt", "w")  # unparsable / no resource/meta
alreadyExistsFile = open("/tmp/addDRIalreadyExists.txt", "w")  # files that already had a DRI


def correctAuthor(tree):
    """Join multi-line author fields into one "; "-separated line.

    The text of every /resource/meta/bib/author element is split at
    newline characters and re-joined with "; ".  Elements without text
    are left alone.

    tree -- parsed lxml document; it is modified in place.

    Returns True if at least one author element was changed, else False.
    """
    changed = False
    for author in tree.xpath("/resource/meta/bib/author"):
        if author.text is None:
            continue
        txt = "; ".join(author.text.split("\n"))
        if txt != author.text:
            author.text = txt
            changed = True
    return changed


def addPURL(fl, purl, test=False):
    """Insert or update the <dri type="mpiwg"> element of one index.meta file.

    fl   -- path of the index.meta file.
    purl -- text to store in the dri element.
    test -- if true, the resulting XML is only printed, nothing is written.

    Author fields are normalised with correctAuthor() as a side effect.
    If the file already contains an mpiwg dri and no author field had to
    be changed, the file is left untouched.  Otherwise the original file
    is renamed to fl + "_mpiwg_dri" and the new XML is written to fl.

    Returns False if the file cannot be parsed or has no /resource/meta
    element (both cases are logged to parseErrorFile).  Returns True in
    all other cases, including a failed write, which is only logged to
    errorFile.
    """
    try:
        tree = etree.parse(fl)
    except Exception:
        # Best effort: log and carry on with the next file.  Catching
        # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
        parseErrorFile.write("PARSE ERROR:" + fl + "\n")
        return False

    dris = tree.xpath("/resource/meta/dri[@type='mpiwg']")
    changed = correctAuthor(tree)

    if len(dris) == 0:
        # No DRI yet: create a new one below /resource/meta.
        newDri = etree.Element("dri", type="mpiwg")
        newDri.text = purl
        metas = tree.xpath("/resource/meta")
        if len(metas) == 0:
            parseErrorFile.write("no resource/meta: %s \n" % fl)
            return False
        metas[0].append(newDri)
    else:
        dris[0].text = purl
        alreadyExistsFile.write("%s \n" % fl)
        # NOTE(review): a DRI whose old text differs from purl is not
        # written back unless an author field changed as well -- confirm
        # that this is intended.
        if not changed:
            # Nothing has to be done.
            return True

    print(etree.tostring(tree, pretty_print=True))

    if not test:
        try:
            # Keep the original file as a backup next to the new one.
            os.rename(fl, fl + "_mpiwg_dri")
            out = etree.tostring(tree, encoding="UTF-8", xml_declaration=False)
            # tostring() with an explicit encoding yields encoded bytes, so
            # write in binary mode; the with-block guarantees the file is
            # closed (the original "fo.close" never called close()).
            with open(fl, "wb") as fo:
                fo.write(out)
        except Exception:
            exc_type, exc_value = sys.exc_info()[:2]
            print(exc_type)
            print(exc_value)
            errorFile.write(fl + "\n")
    return True


def addDriToIndexMeta(path, delpath="", replacepath="", test=False):
    """Add a DRI to every index.meta file below path.

    path        -- root directory of the walk.
    delpath     -- leading part of each file path that is replaced before
                   the PURL lookup.  NOTE(review): it is used unescaped
                   inside a regular expression anchored at the start of
                   the path, so regex metacharacters in it are active.
    replacepath -- replacement for the delpath prefix.
    test        -- passed on to addPURL(); if true nothing is written.

    Directories named "pageimg" and directories whose name starts with "."
    are not descended into.
    """
    md = manageIndexMetaPURLs.IndexMetaPURLManager()
    # Loop-invariant, so compile it once instead of once per index.meta.
    prefixPattern = re.compile("^" + delpath)

    for root, dirs, files in os.walk(path):
        for name in files:
            if name == "index.meta":
                fl = join(root, name)
                shortPath = prefixPattern.sub(replacepath, fl)
                # presumably the PURL registered for shortPath -- getPurl()
                # is defined in manageIndexMetaPURLs, not in this file.
                purl = md.getPurl(shortPath)
                addPURL(fl, purl, test)

        # Prune in place so os.walk() skips these directories.  Slice
        # assignment replaces the original remove()-while-iterating loop,
        # which skipped the entry following each removed one.
        dirs[:] = [d for d in dirs
                   if d != "pageimg" and not d.startswith(".")]


if __name__ == '__main__':
    addDriToIndexMeta("/mpiwg/online/permanent/vlp", delpath="/mpiwg/online", test=False)