# Repository changeset header captured with this file (not program code):
# view addDriToIndexMeta.py @ 40:671dd1e4bd09 default tip
#
# minor bug
# author dwinter
# date Wed, 05 Mar 2014 10:20:54 +0100
# parents a33fa2377075
# children
# line wrap: on
# line source

'''
Created on 01.11.2012

@author: dwinter
'''
import os
import managePurls.manageIndexMetaPURLs as manageIndexMetaPURLs 
import re
from lxml import etree
import sys

from os.path import join, getsize



def correctAuthor(tree):
    """Join multi-line author entries into one "; "-separated line.

    Every element matched by /resource/meta/bib/author whose text contains
    newline characters gets each newline replaced by "; ". Elements without
    text, or without a newline in their text, are left untouched.

    Returns True if at least one author element was rewritten, else False.
    """
    modified = False

    for node in tree.xpath("/resource/meta/bib/author"):
        original = node.text
        if original is None:
            continue

        joined = original.replace("\n", "; ")
        if joined == original:
            continue

        node.text = joined
        modified = True

    return modified


def getDRIfromIndexMeta(fl,parseErrorFile=None):
    """Return the MPIWG DRI stored in the index.meta file *fl*.

    Parameters:
        fl: path of the index.meta file to parse.
        parseErrorFile: optional writable file-like object that receives a
            "PARSE ERROR:<path>" line when *fl* cannot be parsed. When
            omitted, the line is appended to /tmp/addDRIParseErrors.txt.

    Returns:
        The text of the first /resource/meta/dri[@type='mpiwg'] element,
        None if the document has no such element, or False if parsing
        failed.
    """
    try:
        tree = etree.parse(fl)
    except Exception:
        # Deliberately broad (this is a best-effort lookup), but unlike a
        # bare "except:" it lets KeyboardInterrupt/SystemExit through.
        message = "PARSE ERROR:"+fl+"\n"
        if parseErrorFile is not None:
            parseErrorFile.write(message)
        else:
            # Open the fallback log only when there is something to report,
            # append instead of truncating, and close it right away.
            # Previously the log was opened with "w" on every call, which
            # wiped earlier entries and leaked the file handle.
            with open("/tmp/addDRIParseErrors.txt","a") as fallbackLog:
                fallbackLog.write(message)
        return False

    dris = tree.xpath("/resource/meta/dri[@type='mpiwg']")

    if not dris:
        return None
    return dris[0].text


def addPURL(fl,purl,efiles,test=False):
    """Store *purl* as the MPIWG DRI of the index.meta file *fl*.

    The file is parsed, author fields are normalised with correctAuthor(),
    and a <dri type="mpiwg"> child of /resource/meta is created or updated
    with *purl*. If the document changed and *test* is false, the original
    file is renamed to fl + "_mpiwg_dri" and the modified document is
    written to *fl*.

    Parameters:
        fl: path of the index.meta file.
        purl: text to store in the dri element.
        efiles: object with the writable file-like attributes
            parseErrorFile, alreadyExistsFile and errorFile, used as logs.
        test: if true, the resulting XML is only printed; nothing is
            renamed or written.

    Returns:
        False if *fl* could not be parsed or has no /resource/meta element.
        True otherwise - also when renaming/writing failed; that failure is
        printed and the path is logged to efiles.errorFile.
    """
    try:
        tree = etree.parse(fl)
    except Exception:
        # Best effort for batch runs; unlike a bare "except:" this still
        # lets KeyboardInterrupt/SystemExit stop the run.
        efiles.parseErrorFile.write("PARSE ERROR:"+fl+"\n")
        return False

    dris = tree.xpath("/resource/meta/dri[@type='mpiwg']")

    changed = correctAuthor(tree)

    if not dris: # create a new dri element
        metas = tree.xpath("/resource/meta")
        if not metas:
            efiles.parseErrorFile.write("no resource/meta: %s \n"%fl)
            return False
        newDri = etree.Element("dri",type="mpiwg")
        newDri.text = purl
        metas[0].append(newDri)
    else:
        efiles.alreadyExistsFile.write("%s \n"%fl)
        if dris[0].text != purl:
            # A differing DRI counts as a change that has to be saved;
            # previously the corrected value was dropped unless an author
            # field happened to change as well.
            dris[0].text = purl
            changed = True
        if not changed: # nothing has to be done
            return True

    print(etree.tostring(tree, pretty_print=True))

    if not test:
        try:
            # Serialise before touching the file system so that a
            # serialisation error cannot leave *fl* renamed away with no
            # replacement.
            out = etree.tostring(tree, encoding="UTF-8",xml_declaration=False)
            os.rename(fl, fl+"_mpiwg_dri")
            # "with" guarantees the file is closed (the former "fo.close"
            # lacked the call parentheses); "wb" because *out* is an
            # encoded byte string.
            with open(fl,"wb") as fo:
                fo.write(out)
        except Exception:
            print(sys.exc_info()[0])
            print(sys.exc_info()[1])
            efiles.errorFile.write(fl+"\n")
    return True
    
def addDriToIndexMeta(path,efiles,delpath="",replacepath="",test=False):
    """Walk *path* and add a DRI to every index.meta file found.

    For each file named "index.meta" the leading *delpath* of its full path
    is replaced by *replacepath*, the result is passed to
    IndexMetaPURLManager.getPurl(), and the returned value is handed to
    addPURL() together with the file path.

    Directories named "pageimg" and directories whose name starts with "."
    are not descended into.

    Parameters:
        path: root directory of the walk.
        efiles: log-file holder passed through to addPURL().
        delpath: literal path prefix to strip before the PURL lookup.
        replacepath: literal text put in place of *delpath*.
        test: passed through to addPURL(); if true nothing is written.
    """
    md=manageIndexMetaPURLs.IndexMetaPURLManager()

    for root, dirs, files in os.walk(path):

        for name in files:
            if name=="index.meta":
                fl=join(root, name)
                # Plain prefix replacement: the former re.sub("^"+delpath, ...)
                # interpreted regex metacharacters in *delpath* and template
                # escapes in *replacepath*.
                if fl.startswith(delpath):
                    shortPath=replacepath+fl[len(delpath):]
                else:
                    shortPath=fl
                purl=md.getPurl(shortPath)

                addPURL(fl,purl,efiles,test)

        # Prune in place so that os.walk skips these directories. Building a
        # new list avoids removing entries from "dirs" while iterating over
        # it, which skipped the entry following each removed one.
        dirs[:] = [subdir for subdir in dirs
                   if subdir != "pageimg" and not subdir.startswith(".")]

if __name__ == '__main__':
    class ef:
        """Holder for the log files that addPURL() writes to."""
        # paths of files whose rename/write step failed
        errorFile = open("/tmp/addDRIErrors.txt","w")
        # unparseable files and files without /resource/meta
        parseErrorFile = open("/tmp/addDRIParseErrors.txt","w")
        # files that already contained an mpiwg dri element
        alreadyExistsFile = open("/tmp/addDRIalreadyExists.txt","w")

    efiles = ef()

    try:
        addDriToIndexMeta("/mpiwg/online/permanent/vlp",efiles,delpath="/mpiwg/online",test=False)
    finally:
        # Close (and thereby flush) the logs even if the walk is aborted.
        for logFile in (efiles.errorFile, efiles.parseErrorFile, efiles.alreadyExistsFile):
            logFile.close()