Mercurial > hg > ocrHandling
diff createIndexMetas.py @ 0:5e33fa5a2fdc
initial
author | Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 19 Aug 2014 14:12:52 +0200 |
parents | |
children | 5b7ed10ecbb4 |
line wrap: on
line diff
"""Create initial ``index.meta`` files for the folders below an archive path.

For every sub-folder of a base folder an ``index.meta`` file is generated from
the XML template ``index.meta.stub.xml``.  The template is opened by its bare
file name, i.e. it is looked up in the current working directory.
"""
import os
import os.path

from lxml import etree


class indexMetaHandler:
    """Generates ``index.meta`` files from the ``index.meta.stub.xml`` stub."""

    def createInitialIndexMetas(self, path, start=None, end=None):
        """Write an ``index.meta`` into each sub-folder of *path*.

        The entries of *path* are numbered from 0 in sorted name order.
        ``os.listdir`` itself returns names in arbitrary order, so sorting is
        what makes a *start*/*end* window reproducible between runs.

        Args:
            path: Base folder whose sub-folders are processed.
            start: Index of the first entry to process; entries with a
                smaller index are reported as skipped.  ``None`` means
                "from the first entry".
            end: Index of the last entry to process (inclusive); iteration
                stops at the first entry with a larger index.  ``None``
                means "up to the last entry".

        Raises:
            OSError: If *path* cannot be listed, or an ``index.meta`` file
                cannot be written.
        """
        for cnt, fn in enumerate(sorted(os.listdir(path))):
            if start is not None and cnt < start:
                print("SKIP: %s" % fn)
                continue

            if end is not None and cnt > end:
                return

            # Plain files cannot contain an index.meta; skip them instead of
            # aborting the whole batch.  They still count towards the index.
            if not os.path.isdir(os.path.join(path, fn)):
                print("SKIP: %s" % fn)
                continue

            print("PROCESS: %s" % fn)
            self.createInitialIndexMeta(path, fn)

    def createInitialIndexMeta(self, path, fn, remove="/Volumes/MPIWG"):
        """Create ``<path>/<fn>/index.meta`` from the stub template.

        The stub is filled as follows:

        * ``/resource/name`` and ``/resource/meta/bib/title`` are set to *fn*.
        * ``/resource/archive-path`` is set to the folder path with every
          occurrence of *remove* deleted.
        * ``/resource/meta/texttool/image`` is set to ``pages`` if the folder
          has a ``pages`` entry.
        * A ``<text type="pdf">full</text>`` and/or
          ``<text type="hocr">hocr</text>`` element is appended to
          ``/resource/meta/texttool`` if the folder has a ``full`` and/or
          ``hocr`` entry.

        Args:
            path: Base folder containing *fn*.
            fn: Name of the document folder inside *path*.
            remove: Substring that is removed from the archive path before
                it is stored in the XML.

        Raises:
            IndexError: If the stub lacks one of the expected elements.
            OSError: If the stub cannot be read or ``index.meta`` cannot be
                written.
        """
        # os.path.join also works when *path* has no trailing separator.
        folder = os.path.join(path, fn)

        # Parse the stub anew for every folder so each file starts from a
        # clean template.
        dom = etree.parse("index.meta.stub.xml")

        dom.xpath("/resource/name")[0].text = fn
        dom.xpath("/resource/archive-path")[0].text = folder.replace(remove, "")

        # Register the page images only if they are present.
        if os.path.exists(os.path.join(folder, "pages")):
            dom.xpath("/resource/meta/texttool/image")[0].text = "pages"

        # Register the available text variants: (sub-folder name, type attribute).
        texttool = dom.xpath("/resource/meta/texttool")[0]
        for subfolder, textType in (("full", "pdf"), ("hocr", "hocr")):
            if os.path.exists(os.path.join(folder, subfolder)):
                textElement = etree.Element("text", type=textType)
                textElement.text = subfolder
                texttool.append(textElement)

        dom.xpath("/resource/meta/bib/title")[0].text = fn

        # tostring() without an explicit encoding yields bytes, hence the
        # binary write mode.
        txt = etree.tostring(dom, pretty_print=True)

        # The context manager closes the file even if the write fails.
        with open(os.path.join(folder, "index.meta"), "wb") as out:
            out.write(txt)


# NOTE(review): the driver below runs whenever the module is loaded, including
# on import; kept as-is to preserve existing behaviour -- consider guarding it
# with ``if __name__ == "__main__":``.
ih = indexMetaHandler()

ih.createInitialIndexMetas("/Volumes/MPGARCHIV2/struct2/")