Mercurial > hg > ocrHandling
view createIndexMetas.py @ 0:5e33fa5a2fdc
initial
author | Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 19 Aug 2014 14:12:52 +0200 |
parents | |
children | 5b7ed10ecbb4 |
line wrap: on
line source
import os
import os.path

from lxml import etree


class indexMetaHandler:
    """Create initial ``index.meta`` files for the resource folders of an archive."""

    def createInitialIndexMetas(self, path, start=None, end=None):
        """Create an initial index.meta in the sub-folders of *path*.

        The entries of *path* are numbered from 0 in sorted name order and
        the optional window [start, end] (both bounds inclusive) selects
        which of them are processed.

        Args:
            path: Folder whose entries are the resource folders.
            start: Index of the first entry to process; entries before it
                are reported as skipped. ``None`` means no lower bound.
            end: Index of the last entry to process; iteration stops after
                it. ``None`` means no upper bound.

        Raises:
            OSError: If *path* cannot be listed.
        """
        # os.listdir() returns names in arbitrary order; sorting makes the
        # start/end window select the same entries on every run.
        for cnt, fn in enumerate(sorted(os.listdir(path))):
            if start is not None and cnt < start:
                print("SKIP: %s" % fn)
                continue
            if end is not None and cnt > end:
                return
            # Plain files cannot hold an index.meta; writing into them would
            # raise and abort the whole run, so skip them instead.
            if not os.path.isdir(os.path.join(path, fn)):
                print("SKIP (not a directory): %s" % fn)
                continue
            print("PROCESS: %s" % fn)
            self.createInitialIndexMeta(path, fn)

    def createInitialIndexMeta(self, path, fn, remove="/Volumes/MPIWG"):
        """Write ``<path>/<fn>/index.meta`` based on the stub document.

        The stub ``index.meta.stub.xml`` is read from the current working
        directory. It is expected to contain the elements addressed below
        (``/resource/name``, ``/resource/archive-path``,
        ``/resource/meta/texttool`` and ``/resource/meta/bib/title``, plus
        ``/resource/meta/texttool/image`` when a ``pages`` folder exists).

        Args:
            path: Folder containing the resource folder.
            fn: Name of the resource folder; also used as the resource name
                and as the bibliographic title.
            remove: Substring removed from the resource path before it is
                stored as the archive path.

        Raises:
            IndexError: If the stub lacks one of the expected elements.
            OSError: If the stub cannot be read or index.meta cannot be
                written.
        """
        # os.path.join gives the same result as plain concatenation when
        # *path* ends with a separator, and a correct one when it does not.
        resource_dir = os.path.join(path, fn)

        dom = etree.parse("index.meta.stub.xml")  # read the stub

        dom.xpath("/resource/name")[0].text = fn
        dom.xpath("/resource/archive-path")[0].text = resource_dir.replace(remove, "")

        # Check images.
        if os.path.exists(os.path.join(resource_dir, "pages")):
            dom.xpath("/resource/meta/texttool/image")[0].text = "pages"

        # Register each text folder that is present as <text type="...">.
        texttool = dom.xpath("/resource/meta/texttool")[0]
        for folder, text_type in (("full", "pdf"), ("hocr", "hocr")):
            if os.path.exists(os.path.join(resource_dir, folder)):
                text_element = etree.Element("text", type=text_type)
                text_element.text = folder
                texttool.append(text_element)

        dom.xpath("/resource/meta/bib/title")[0].text = fn

        txt = etree.tostring(dom, pretty_print=True)

        # tostring() returns bytes, hence binary mode; the context manager
        # closes the file even if the write fails.
        with open(os.path.join(resource_dir, "index.meta"), "wb") as out:
            out.write(txt)


ih = indexMetaHandler()

if __name__ == "__main__":
    # Only process the archive when run as a script, not on import.
    ih.createInitialIndexMetas("/Volumes/MPGARCHIV2/struct2/")