comparison createIndexMetas.py @ 0:5e33fa5a2fdc

initial
author Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
date Tue, 19 Aug 2014 14:12:52 +0200
parents
children 5b7ed10ecbb4
comparison
equal deleted inserted replaced
-1:000000000000 0:5e33fa5a2fdc
1 import os
2 import os.path
3
4 from lxml import etree
5
class indexMetaHandler:
    """Create initial ``index.meta`` files for the resource folders of an archive.

    Every index.meta is derived from an XML stub document; the folder name is
    filled in as the resource name and as the bibliographic title.
    """

    def createInitialIndexMetas(self, path, start=None, end=None):
        """Write an index.meta into every folder found directly under *path*.

        Entries are visited in sorted name order so that the ``start``/``end``
        window selects the same entries on every run (``os.listdir`` on its
        own returns the names in arbitrary order).

        :param path: directory whose entries are processed.
        :param start: zero-based position of the first entry to process;
            earlier entries are reported as skipped. ``None`` disables the
            lower bound.
        :param end: zero-based position of the last entry to process
            (inclusive); processing stops after it. ``None`` disables the
            upper bound.
        """
        for cnt, fn in enumerate(sorted(os.listdir(path))):
            if start is not None and cnt < start:
                print("SKIP: %s" % fn)
                continue

            if end is not None and cnt > end:
                return

            # index.meta is written *into* the entry, so a plain file cannot be
            # processed. It still counts as a position, keeping start/end
            # aligned with the directory listing.
            if not os.path.isdir(os.path.join(path, fn)):
                print("SKIP (not a directory): %s" % fn)
                continue

            print("PROCESS: %s" % fn)
            self.createInitialIndexMeta(path, fn)

    def createInitialIndexMeta(self, path, fn, remove="/Volumes/MPIWG",
                               stub="index.meta.stub.xml"):
        """Create ``<path>/<fn>/index.meta`` from the stub document.

        The stub is expected to contain the elements ``/resource/name``,
        ``/resource/archive-path``, ``/resource/meta/texttool`` (with an
        ``image`` child when a ``pages`` folder exists) and
        ``/resource/meta/bib/title``; a missing element raises ``IndexError``.
        An existing index.meta is overwritten.

        :param path: directory that contains the resource folder.
        :param fn: name of the resource folder; used as resource name and title.
        :param remove: substring removed from the resource path before it is
            stored as archive path.
        :param stub: file name of the XML stub to start from; a relative name
            is resolved against the current working directory.
        """
        # os.path.join works whether or not `path` ends with a separator.
        resourcePath = os.path.join(path, fn)

        dom = etree.parse(stub)

        dom.xpath("/resource/name")[0].text = fn
        dom.xpath("/resource/archive-path")[0].text = resourcePath.replace(remove, "")

        # Reference the page images only when the folder is really there.
        if os.path.exists(os.path.join(resourcePath, "pages")):
            dom.xpath("/resource/meta/texttool/image")[0].text = "pages"

        # Add one <text type="..."> child per text folder that is present.
        texttool = dom.xpath("/resource/meta/texttool")[0]
        for folder, textType in (("full", "pdf"), ("hocr", "hocr")):
            if os.path.exists(os.path.join(resourcePath, folder)):
                textElement = etree.Element("text", type=textType)
                textElement.text = folder
                texttool.append(textElement)

        dom.xpath("/resource/meta/bib/title")[0].text = fn

        txt = etree.tostring(dom, pretty_print=True)

        # tostring() returns bytes, hence binary mode; `with` closes the file
        # even when the write fails.
        with open(os.path.join(resourcePath, "index.meta"), "wb") as out:
            out.write(txt)
68
69
70
71
72
73
74
# Script part: runs whenever this module is executed or imported and creates
# the index.meta files for the entries below the struct2 archive folder.
ih = indexMetaHandler()

ih.createInitialIndexMetas("/Volumes/MPGARCHIV2/struct2/")