Mercurial > hg > ocrHandling
comparison createIndexMetas.py @ 0:5e33fa5a2fdc
initial
author | Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 19 Aug 2014 14:12:52 +0200 |
parents | |
children | 5b7ed10ecbb4 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5e33fa5a2fdc |
---|---|
1 import os | |
2 import os.path | |
3 | |
4 from lxml import etree | |
5 | |
class indexMetaHandler:
    """Create initial ``index.meta`` files for resource directories.

    Each ``index.meta`` is produced by parsing an XML stub with
    ``lxml.etree``, filling in a handful of elements and writing the
    serialized result into the resource directory.
    """

    def createInitialIndexMetas(self, path, start=None, end=None):
        """Create an ``index.meta`` in every sub-directory of *path*.

        The entries of *path* are visited in sorted order so that the
        positions used by *start* and *end* are reproducible
        (``os.listdir`` alone returns entries in arbitrary order).

        :param path: directory whose entries are the resource directories.
        :param start: if not None, entries whose 0-based position is
            smaller than *start* are skipped.
        :param end: if not None, processing stops at the first entry whose
            0-based position is greater than *end* (i.e. *end* is
            inclusive).
        :returns: None
        """
        for cnt, fn in enumerate(sorted(os.listdir(path))):
            if start is not None and cnt < start:
                print("SKIP: %s" % fn)
                continue

            if end is not None and cnt > end:
                return

            # index.meta is written *inside* the entry, so plain files
            # (e.g. ".DS_Store") cannot be processed; skip them instead
            # of failing when the output file is opened.
            if not os.path.isdir(os.path.join(path, fn)):
                print("SKIP (not a directory): %s" % fn)
                continue

            print("PROCESS: %s" % fn)
            self.createInitialIndexMeta(path, fn)

    @staticmethod
    def _firstElement(dom, xpath):
        """Return the first node matching *xpath* in *dom*.

        :raises IndexError: if nothing matches (same exception type as a
            bare ``[0]`` on the empty result, but naming the XPath).
        """
        found = dom.xpath(xpath)
        if not found:
            raise IndexError("no element matches %s in the stub" % xpath)
        return found[0]

    def createInitialIndexMeta(self, path, fn, remove="/Volumes/MPIWG",
                               stub="index.meta.stub.xml"):
        """Write ``<path>/<fn>/index.meta`` based on the XML stub.

        The stub must contain ``/resource/name``,
        ``/resource/archive-path``, ``/resource/meta/texttool`` and
        ``/resource/meta/bib/title``; ``/resource/meta/texttool/image`` is
        needed only when the resource has a ``pages`` sub-directory.

        :param path: directory that contains the resource directory.
        :param fn: name of the resource directory; also used as the
            resource name and as the bibliographic title.
        :param remove: substring removed from the archive path before it
            is stored.
        :param stub: file name of the XML stub; a relative name is
            resolved against the current working directory.
        :raises IndexError: if a required element is missing in the stub.
        :returns: None
        """
        dom = etree.parse(stub)  # read the stub

        # os.path.join gives the same result as path+fn when path ends
        # with a separator and a correct one when it does not.
        resourceDir = os.path.join(path, fn)

        self._firstElement(dom, "/resource/name").text = fn

        # NOTE(review): str.replace removes every occurrence of `remove`,
        # not only a leading prefix -- confirm this is intended.
        archiveElement = self._firstElement(dom, "/resource/archive-path")
        archiveElement.text = resourceDir.replace(remove, "")

        # check images
        if os.path.exists(os.path.join(resourceDir, "pages")):
            self._firstElement(dom, "/resource/meta/texttool/image").text = "pages"

        # one <text type="..."> child per text folder that exists
        texttool = self._firstElement(dom, "/resource/meta/texttool")
        for folder, textType in (("full", "pdf"), ("hocr", "hocr")):
            if os.path.exists(os.path.join(resourceDir, folder)):
                textElement = etree.Element("text", type=textType)
                textElement.text = folder
                texttool.append(textElement)

        self._firstElement(dom, "/resource/meta/bib/title").text = fn

        txt = etree.tostring(dom, pretty_print=True)

        # tostring() returns bytes, hence binary mode; the with-block
        # closes the file even if the write fails.
        with open(os.path.join(resourceDir, "index.meta"), "wb") as out:
            out.write(txt)
68 | |
69 | |
70 | |
71 | |
72 | |
73 | |
74 | |
# Script driver: runs on import/execution of this module and creates the
# initial index.meta files for every entry below the archive directory.
ih = indexMetaHandler()

ih.createInitialIndexMetas(path="/Volumes/MPGARCHIV2/struct2/")