# view harvestToPurl.py @ 19:cce127a28fc9
#
# added getpurls
# author dwinter
# date Wed, 21 Nov 2012 15:39:08 +0100
# parents 1b2d74f94ca8
# children be8640c08d99
# line wrap: on
# line source

'''
Created on 31.10.2012

@author: dwinter
'''

import managePurls.manageIndexMetaPURLs as manageIndexMetaPURLs



import os
from os.path import join, getsize
import sys
import re
from lxml import etree

def harvestIndexMeta(path,user,delpath="",replacepath="", update=False):
    """Walk ``path`` and register every ``*.meta`` file with the PURL manager.

    For each file whose name ends in ``.meta`` an image path is derived with
    ``createImagePath``; then the leading ``delpath`` of both the file path
    and the image path is replaced by ``replacepath`` and the result is
    handed to ``IndexMetaPURLManager.register``. The outcome of every
    registration is printed.

    Directories named ``pageimg`` and directories whose name starts with
    ``.`` or ``:`` are not descended into.

    :param path: root directory to walk
    :param user: passed through to ``register`` as ``user``
    :param delpath: literal path prefix to strip (empty: nothing is stripped)
    :param replacepath: literal text put in place of ``delpath``
    :param update: passed through to ``register`` as ``update``
    """
    md = manageIndexMetaPURLs.IndexMetaPURLManager()

    def swapPrefix(value):
        # Literal prefix replacement. Building a regex from delpath would
        # misinterpret characters such as "." or "+" in directory names and
        # backslashes in replacepath.
        if value.startswith(delpath):
            return replacepath + value[len(delpath):]
        return value

    for root, dirs, files in os.walk(path):
        for name in files:
            if not name.endswith(".meta"):
                continue

            fl = join(root, name)

            # The image path is derived from the real location on disk,
            # only afterwards are the prefixes rewritten.
            imagePath = swapPrefix(createImagePath(fl, root))
            fl = swapPrefix(fl)

            val, purl = md.register(fl, True, user=user, imagePath=imagePath, update=update)
            try:
                if val == manageIndexMetaPURLs.ALREADY_EXISTS:
                    print("found %s -> %s"%(fl,purl))
                elif val == manageIndexMetaPURLs.UPDATED:
                    print("updated %s -> %s"%(fl,purl))
                else:
                    print("added %s -> %s"%(fl,purl))
            except Exception:
                # Reporting is best effort (e.g. names the console cannot
                # encode); it must not stop the harvest.
                print("cannot print: %s"%purl)

        # Prune in place so os.walk skips these directories. Removing
        # entries from the list while looping over it would skip the entry
        # following each removed one.
        dirs[:] = [d for d in dirs
                   if d != "pageimg" and not d.startswith((".", ":"))]


# Derives an image path when the metadata file has no texttool tag.
def createImagePath(path,root):
    """Return the image folder belonging to the metadata file ``path``.

    The file is parsed as XML. If it cannot be parsed, or if it contains a
    ``texttool`` element anywhere, an empty string is returned. Otherwise
    the first of ``root/pageimg`` and ``root/pages`` that exists is
    returned, or an empty string if neither exists.

    :param path: path of the metadata file to parse
    :param root: directory in which the image folders are looked up
    :return: path of the image folder, or ``""``
    """
    print("parsing: %s"%path)
    try:
        tree = etree.parse(path)
    except Exception:
        # Unreadable or malformed files are reported and skipped.
        # KeyboardInterrupt/SystemExit are deliberately not caught here so
        # that a running harvest can still be aborted.
        print("cannot parse %s"%path)
        return ""

    # A texttool tag is present: no image path is generated.
    if tree.xpath('//texttool'):
        return ""

    # Otherwise use a heuristic: take the first well-known folder that exists.
    for imageFolder in ("pageimg", "pages"):
        candidate = join(root, imageFolder)
        if os.path.exists(candidate):
            return candidate

    return ""
            
    
    

if __name__ == '__main__':
    # Command line: path user [pathPrefixToDelete [replacement]]
    argv = sys.argv[1:]
    argc = len(argv)
    if argc not in (2, 3, 4):
        print("USAGE: python harvestToPurl.py path user (optional)pathPrefixToDelete (optional)replacedeleted")
        sys.exit(2)

    path, user = argv[0], argv[1]

    # Both optional arguments default to the empty string.
    delpath = argv[2] if argc >= 3 else ""
    replacepath = argv[3] if argc == 4 else ""

    if not os.path.exists(path):
        print("ERROR: path %s does not exist!"%path)
        sys.exit(2)

    # Existing entries are always updated when run from the command line.
    harvestIndexMeta(path,user,delpath=delpath,replacepath=replacepath,update=True)