# view createIndexMetas.py @ 0:5e33fa5a2fdc
#
# initial
# author Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
# date Tue, 19 Aug 2014 14:12:52 +0200
# parents
# children 5b7ed10ecbb4
# line wrap: on
# line source

import os
import os.path

from lxml import etree

class indexMetaHandler:
    """Create initial ``index.meta`` files for the entries of a folder.

    Every ``index.meta`` starts from the XML stub ``index.meta.stub.xml``
    (opened relative to the current working directory) and is filled in with
    the entry's name, its archive path and the sub-folders found inside it.
    """

    def createInitialIndexMetas(self,path,start=None,end=None):
        """Write an ``index.meta`` into every sub-directory of *path*.

        Entries are visited in sorted name order and numbered from 0. The
        numbering covers every entry of *path*, including the ones that are
        skipped because they are not directories.

        :param path: folder whose entries are processed.
        :param start: number of the first entry to process; lower-numbered
            entries are reported as ``SKIP``. ``None`` starts at the first.
        :param end: number of the last entry to process (inclusive);
            processing stops at the first higher-numbered entry. ``None``
            runs through the last entry.
        """
        # os.listdir() returns names in arbitrary order; sorting makes the
        # start/end window select the same entries on every run.
        for cnt, fn in enumerate(sorted(os.listdir(path))):
            if start is not None and cnt < start:
                print("SKIP: %s"%fn)
                continue

            if end is not None and cnt > end:
                return

            if not os.path.isdir(os.path.join(path, fn)):
                # A plain file cannot contain an index.meta; trying to write
                # one would raise and abort the remaining entries.
                print("SKIP (not a directory): %s"%fn)
                continue

            print("PROCESS: %s"%fn)
            self.createInitialIndexMeta(path,fn)

    def createInitialIndexMeta(self,path,fn,remove="/Volumes/MPIWG"):
        """Create ``<path>/<fn>/index.meta`` from the stub document.

        The stub is updated as follows before it is written:

        * ``/resource/name`` and ``/resource/meta/bib/title`` are set to *fn*.
        * ``/resource/archive-path`` is set to the joined path with every
          occurrence of *remove* deleted.
        * ``/resource/meta/texttool/image`` is set to ``pages`` when a
          ``pages`` entry exists inside the folder.
        * a ``<text type="pdf">full</text>`` / ``<text type="hocr">hocr</text>``
          child is appended to ``/resource/meta/texttool`` when a ``full`` /
          ``hocr`` entry exists inside the folder.

        :param path: parent folder of the entry.
        :param fn: name of the entry (a folder inside *path*).
        :param remove: substring deleted from the archive path.
        :raises IndexError: if the stub lacks one of the required elements.
        """
        # os.path.join keeps the result correct whether or not *path* ends
        # with a separator.
        resourceDir = os.path.join(path, fn)

        dom = etree.parse("index.meta.stub.xml")  # read the stub

        self._setText(dom, "/resource/name", fn)
        # NOTE(review): str.replace removes *remove* wherever it occurs, not
        # only as a leading prefix - confirm this is the intended behavior.
        self._setText(dom, "/resource/archive-path",
                      resourceDir.replace(remove, ""))

        # check images
        if os.path.exists(os.path.join(resourceDir, "pages")):
            self._setText(dom, "/resource/meta/texttool/image", "pages")

        texttool = dom.xpath("/resource/meta/texttool")[0]
        for folder, textType in (("full", "pdf"), ("hocr", "hocr")):
            if os.path.exists(os.path.join(resourceDir, folder)):
                textElement = etree.Element("text", type=textType)
                textElement.text = folder
                texttool.append(textElement)

        self._setText(dom, "/resource/meta/bib/title", fn)

        txt = etree.tostring(dom, pretty_print=True)

        # etree.tostring() returns bytes here, hence binary mode; the context
        # manager closes the file even if the write fails.
        with open(os.path.join(resourceDir, "index.meta"), "wb") as out:
            out.write(txt)

    def _setText(self, dom, xpath, text):
        """Set the text of the first element matching *xpath* in *dom*."""
        dom.xpath(xpath)[0].text = text

        
        
              
        
              
            
# Script body: runs on import/execution and creates an index.meta for the
# entries found below the hard-coded folder.
ih = indexMetaHandler()
ih.createInitialIndexMetas(path="/Volumes/MPGARCHIV2/struct2/")