# view addDriToIndexMeta.py @ 19:cce127a28fc9
#
# added getpurls
# author dwinter
# date Wed, 21 Nov 2012 15:39:08 +0100
# parents fad73212354b
# children f748e2b684c9
# line wrap: on
# line source

'''
Add MPIWG DRI elements (PURLs) to the index.meta files found below a
directory tree.

Created on 01.11.2012

@author: dwinter
'''
import os
import managePurls.manageIndexMetaPURLs as manageIndexMetaPURLs 
import re
from lxml import etree

from os.path import join, getsize

# Log files, opened (and truncated) when the module is loaded.
# errorFile receives the paths of files whose rewrite failed;
# parseErrorFile receives parse errors and files lacking /resource/meta.
# open() is used instead of the Python-2-only file() constructor.
errorFile = open("/tmp/addDRIErrors.txt","w")
parseErrorFile = open("/tmp/addDRIParseErrors.txt","w")

def addPURL(fl,purl,test=False):
    """Store ``purl`` as the MPIWG DRI of the index.meta file ``fl``.

    The file is parsed with lxml. If ``/resource/meta`` already holds a
    ``<dri type="mpiwg">`` element, its text is replaced by ``purl``;
    otherwise a new element is appended to the first ``/resource/meta``
    node. The resulting document is always printed to stdout.

    Unless ``test`` is true, the original file is renamed to
    ``fl + "_mpiwg_dri"`` (kept as a backup) and the modified tree is
    written to ``fl`` as UTF-8 without an XML declaration.

    :param fl: path of the index.meta file to modify
    :param purl: text to store in the dri element
    :param test: if true, only print the result and leave the file untouched
    :return: False if the file cannot be parsed or has no ``/resource/meta``
        element (both cases are logged to parseErrorFile), True otherwise.
        NOTE: a failure while renaming/writing is only logged to errorFile;
        the function still returns True in that case.
    """
    try:
        tree = etree.parse(fl)
    except Exception:
        # "Exception" rather than a bare except, so that Ctrl-C and
        # SystemExit still abort a long batch run instead of being logged
        # as parse errors.
        parseErrorFile.write("PARSE ERROR:"+fl+"\n")
        return False

    dris = tree.xpath("/resource/meta/dri[@type='mpiwg']")
    if dris:
        # An MPIWG DRI already exists: overwrite its value.
        dris[0].text = purl
    else:
        # No DRI yet: create one below the first /resource/meta node.
        metas = tree.xpath("/resource/meta")
        if not metas:
            parseErrorFile.write("no resource/meta: %s \n"%fl)
            return False
        newDri = etree.Element("dri", type="mpiwg")
        newDri.text = purl
        metas[0].append(newDri)

    # Single-argument print(...) parses identically on Python 2 and 3.
    print(etree.tostring(tree, pretty_print=True))

    if not test:
        try:
            # Keep the untouched original next to the rewritten file.
            os.rename(fl, fl+"_mpiwg_dri")
            out = etree.tostring(tree, encoding="UTF-8", xml_declaration=False)
            # Binary mode: "out" is an encoded byte string. The with-block
            # guarantees the handle is closed (the former "fo.close" lacked
            # the call parentheses and never closed the file).
            with open(fl, "wb") as fo:
                fo.write(out)
        except Exception:
            # Best effort: record the failing path and carry on.
            errorFile.write(fl+"\n")
    return True
    
def addDriToIndexMeta(path,delpath="",replacepath="",test=False):
    """Walk ``path`` and add an MPIWG DRI to every ``index.meta`` found.

    For each file named ``index.meta`` the full path is shortened by
    replacing a leading ``delpath`` with ``replacepath``; the shortened path
    is passed to ``IndexMetaPURLManager.getPurl`` and the value it returns
    is handed to ``addPURL`` together with the full path.

    Directories named ``pageimg`` and directories whose name starts with
    ``.`` are not descended into.

    :param path: root directory of the walk
    :param delpath: literal prefix to strip from each file path (it is
        compared as plain text, not interpreted as a regular expression)
    :param replacepath: text put in place of the stripped prefix
    :param test: forwarded to ``addPURL``; if true no file is modified
    """
    md = manageIndexMetaPURLs.IndexMetaPURLManager()

    for root, dirs, files in os.walk(path):
        if "index.meta" in files:
            fl = join(root, "index.meta")
            # Literal prefix replacement: avoids regex metacharacters in
            # delpath and backslash escapes in replacepath being interpreted.
            if fl.startswith(delpath):
                shortPath = replacepath + fl[len(delpath):]
            else:
                shortPath = fl
            purl = md.getPurl(shortPath)

            addPURL(fl, purl, test)

        # Prune in place so os.walk skips these directories. One slice
        # assignment replaces the old remove-while-iterating loop, which
        # skipped the entry following each removed one.
        dirs[:] = [d for d in dirs
                   if d != "pageimg" and not d.startswith(".")]

if __name__ == '__main__':
    # Script entry point: process everything below /mpiwg/online/
    # (test mode off).
    addDriToIndexMeta(
        "/mpiwg/online/",
        delpath="/mpiwg/online",
        test=False
    )