diff createIndexMetas.py @ 0:5e33fa5a2fdc

initial
author Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
date Tue, 19 Aug 2014 14:12:52 +0200
parents
children 5b7ed10ecbb4
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/createIndexMetas.py	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,77 @@
+import os
+import os.path
+
+from lxml import etree
+
+class indexMetaHandler:
+    """Create initial ``index.meta`` files for resource folders from an XML stub."""
+
+    def createInitialIndexMetas(self,path,start=None,end=None):
+        """Write an ``index.meta`` into the sub-directories of *path*.
+
+        The entries of *path* are numbered from 0 in sorted name order.
+        Entries numbered below *start* are skipped, and processing stops at
+        the first entry numbered above *end* (so *end* itself is still
+        processed).  ``None`` disables the respective bound.  Entries that
+        are not directories keep their number but are skipped.
+        """
+        # os.listdir() returns names in arbitrary order; sorting keeps the
+        # start/end window reproducible between runs.
+        for cnt, fn in enumerate(sorted(os.listdir(path))):
+            if start is not None and cnt < start:
+                print("SKIP: %s"%fn)
+                continue
+
+            if end is not None and cnt > end:
+                return
+
+            # A plain file cannot contain an index.meta; trying to open
+            # "<file>/index.meta" for writing would raise.
+            if not os.path.isdir(os.path.join(path, fn)):
+                print("SKIP: %s"%fn)
+                continue
+
+            print("PROCESS: %s"%fn)
+            self.createInitialIndexMeta(path,fn)
+
+    def createInitialIndexMeta(self,path,fn,remove="/Volumes/MPIWG",stub="index.meta.stub.xml"):
+        """Fill the stub document for folder *fn* below *path* and save it.
+
+        The stub is parsed from *stub* and modified as follows:
+
+        * ``/resource/name`` and ``/resource/meta/bib/title`` are set to *fn*.
+        * ``/resource/archive-path`` is set to the folder path with every
+          occurrence of *remove* deleted.
+        * If the folder contains ``pages``, the text of
+          ``/resource/meta/texttool/image`` is set to ``pages``.
+        * For each of ``full`` and ``hocr`` that exists in the folder, a
+          ``<text>`` element (``type="pdf"`` resp. ``type="hocr"``) holding
+          that name is appended to ``/resource/meta/texttool``.
+
+        The pretty-printed result is written to ``index.meta`` inside the
+        folder, replacing any existing file.
+
+        Raises IndexError if the stub lacks one of the required elements.
+        """
+        # os.path.join also works when *path* has no trailing separator.
+        folder = os.path.join(path, fn)
+
+        dom = etree.parse(stub)  # read the stub
+
+        self._firstNode(dom, "/resource/name").text = fn
+        self._firstNode(dom, "/resource/archive-path").text = folder.replace(remove, "")
+
+        # check images
+        if os.path.exists(os.path.join(folder, "pages")):
+            self._firstNode(dom, "/resource/meta/texttool/image").text = "pages"
+
+        # Looked up unconditionally so a stub without <texttool> is always
+        # reported, whether or not there is anything to append.
+        texttool = self._firstNode(dom, "/resource/meta/texttool")
+        for subFolder, textType in (("full", "pdf"), ("hocr", "hocr")):
+            if os.path.exists(os.path.join(folder, subFolder)):
+                textElement = etree.Element("text", type=textType)
+                textElement.text = subFolder
+                texttool.append(textElement)
+
+        self._firstNode(dom, "/resource/meta/bib/title").text = fn
+
+        txt = etree.tostring(dom, pretty_print=True)
+
+        # tostring() without an encoding returns bytes, hence binary mode;
+        # the with-block closes the file even if the write fails.
+        with open(os.path.join(folder, "index.meta"), "wb") as out:
+            out.write(txt)
+
+    def _firstNode(self, dom, xpath):
+        """Return the first node of *dom* matching *xpath*.
+
+        Raises IndexError naming *xpath* when nothing matches.
+        """
+        nodes = dom.xpath(xpath)
+        if not nodes:
+            raise IndexError("stub document has no element %s" % xpath)
+        return nodes[0]
+
+        
+        
+              
+        
+              
+            
+# Shared handler instance; the class defines no __init__, so creating it
+# does no work.
+ih = indexMetaHandler()
+
+# Only rewrite the archive when run as a script, not when merely imported.
+if __name__ == "__main__":
+    ih.createInitialIndexMetas("/Volumes/MPGARCHIV2/struct2/")
\ No newline at end of file