changeset 0:5e33fa5a2fdc

initial
author Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
date Tue, 19 Aug 2014 14:12:52 +0200
parents
children 5b7ed10ecbb4
files .hgignore .project .pydevproject .settings/org.eclipse.core.resources.prefs copyFiles.py copyFiles2.py createIndexMetas.py doOCR.py index.meta.stub.xml
diffstat 9 files changed, 324 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgignore	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,3 @@
+
+syntax: regexp
+^data$
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.project	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>ocrHandling</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.python.pydev.PyDevBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.python.pydev.pythonNature</nature>
+	</natures>
+</projectDescription>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.pydevproject	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?eclipse-pydev version="1.0"?><pydev_project>
+<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
+<path>/${PROJECT_DIR_NAME}</path>
+</pydev_pathproperty>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 3.0</pydev_property>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">python</pydev_property>
+</pydev_project>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.settings/org.eclipse.core.resources.prefs	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,3 @@
+eclipse.preferences.version=1
+encoding//data/OrdnerMusterJPG/pages/canon_01.txt=ISO-8859-1
+encoding/<project>=UTF-8
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/copyFiles.py	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,38 @@
+"""Copy every PDF below ``start`` into the flat folder ``target``.
+
+Each copy is named ``<parent folder>___<file name>`` with blanks replaced by
+underscores; sources that could not be copied are listed in /tmp/error.out.
+"""
+from _csv import Error
+start ="/Volumes/Folivora/MPG_Archiv/"
+target = "/Volumes/MPGARCHIV/input/"
+
+
+import os
+import shutil
+
+# "with" closes the error log even if the walk is interrupted.
+with open("/tmp/error.out","w",encoding="utf-8") as errorf:
+    # End the marker with a newline so the first logged path is not glued to it.
+    errorf.write("START\n")
+
+    for root, dirs, files in os.walk(start, topdown=False):
+        for name in files:
+            path = os.path.join(root, name)
+            print (name)
+            if os.path.splitext(path)[1] != ".pdf":
+                continue
+
+            fld = os.path.split(root)
+            print (fld)
+            neu = os.path.join(target,fld[1]+"___"+name.replace(" ","_"))
+            if os.path.exists(neu):
+                print("EXISTS:" + path)
+                continue
+
+            try:
+                shutil.copy(path, neu)
+            except OSError:
+                # Best effort: log the failed source and go on with the next file.
+                errorf.write(path+"\n")
+            print (path)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/copyFiles2.py	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,48 @@
+"""Copy every PDF below ``start`` into its own folder below ``target``.
+
+For ``<folder>/<name>.pdf`` the folder ``<target>/<folder>___<name>`` (blanks in
+the name replaced by underscores) is created and the PDF is copied into its
+``full`` subfolder; failed sources are listed in /tmp/error.out.
+"""
+from _csv import Error
+start ="/Volumes/Folivora/MPG_Archiv/"
+target = "/Volumes/MPGARCHIV/struct2/"
+
+
+import os
+import os.path
+import shutil
+
+# "with" closes the error log even if the walk is interrupted.
+with open("/tmp/error.out","w",encoding="utf-8") as errorf:
+    # End the marker with a newline so the first logged path is not glued to it.
+    errorf.write("START\n")
+
+    for root, dirs, files in os.walk(start, topdown=False):
+        for name in files:
+            path = os.path.join(root, name)
+            print (name)
+            if os.path.splitext(path)[1] != ".pdf":
+                continue
+
+            fld = os.path.split(root)
+            print (fld)
+            cleanName = name.replace(" ","_")
+            # Document folder: flat target name without the ".pdf" extension.
+            pathNeu = os.path.splitext(os.path.join(target,fld[1]+"___"+cleanName))[0]
+
+            if os.path.exists(pathNeu):
+                print("EXISTS:" + path)
+                continue
+
+            try:
+                # Inside the try so a folder that cannot be created is logged
+                # like a failed copy instead of aborting the whole run.
+                os.makedirs(pathNeu+"/full")
+                neu = pathNeu+"/full/"+cleanName
+                shutil.copy(path, neu)
+            except OSError:
+                # Best effort: log the failed source and go on with the next file.
+                errorf.write(path+"\n")
+
+            print (path)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/createIndexMetas.py	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,77 @@
+import os
+import os.path
+
+from lxml import etree
+
+class indexMetaHandler:
+    """Creates initial index.meta files for document folders from an XML stub."""
+
+    def createInitialIndexMetas(self,path,start=None,end=None):
+        """Create an index.meta in every subfolder of path.
+
+        path  -- folder whose entries are the document folders
+        start -- optional position (0-based, in os.listdir order) of the first
+                 entry to process; earlier entries are skipped
+        end   -- optional position of the last entry to process (inclusive)
+        """
+        for cnt, fn in enumerate(os.listdir(path)):
+            if start is not None and cnt<start:
+                print("SKIP: %s"%fn)
+                continue
+
+            if end is not None and cnt>end:
+                return
+
+            # Plain files (e.g. .DS_Store) cannot hold an index.meta; writing
+            # one below them would raise and abort the whole run.
+            if not os.path.isdir(os.path.join(path,fn)):
+                print("SKIP: %s"%fn)
+                continue
+
+            print("PROCESS: %s"%fn)
+            self.createInitialIndexMeta(path,fn)
+
+    def createInitialIndexMeta(self,path,fn,remove="/Volumes/MPIWG"):
+        """Write path/fn/index.meta, filled in from index.meta.stub.xml.
+
+        path   -- folder containing the document folder
+        fn     -- name of the document folder; also used as name and title
+        remove -- string removed from the archive path stored in the file
+        """
+        # os.path.join instead of path+fn, so path works without a trailing "/".
+        folder = os.path.join(path,fn)
+
+        dom=etree.parse("index.meta.stub.xml") #read the stub
+
+        dom.xpath("/resource/name")[0].text=fn
+        dom.xpath("/resource/archive-path")[0].text=folder.replace(remove,"")
+        dom.xpath("/resource/meta/bib/title")[0].text=fn
+
+        #check images
+        if os.path.exists(os.path.join(folder,"pages")):
+            dom.xpath("/resource/meta/texttool/image")[0].text="pages"
+
+        # one <text> element per available text representation
+        texttool = dom.xpath("/resource/meta/texttool")[0]
+        for subfolder, texttype in (("full","pdf"),("hocr","hocr")):
+            if os.path.exists(os.path.join(folder,subfolder)):
+                textElement = etree.Element("text",type=texttype)
+                textElement.text=subfolder
+                texttool.append(textElement)
+
+        # serialize the filled-in stub; the result is written in binary mode
+        txt=etree.tostring(dom, pretty_print=True)
+
+        # "with" closes the file even if the write fails
+        with open(os.path.join(folder,"index.meta"),"bw") as out:
+            out.write(txt)
+
+        
+        
+              
+        
+              
+            
+ih = indexMetaHandler()
+# Runs whenever this module is executed: one index.meta per entry of the folder.
+ih.createInitialIndexMetas("/Volumes/MPGARCHIV2/struct2/")
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doOCR.py	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,108 @@
+import shlex
+import subprocess
+import os.path
+import os
+ 
+class ocrManager:
+    """Splits the PDF of a document folder into page images and OCRs them.
+
+    Both steps call external programs: /usr/local/bin/convert and
+    /usr/local/bin/tesseract.
+    """
+
+    def split(self,foldername):
+        """Render foldername/full/<name>.pdf to PNG page images with convert.
+
+        foldername -- document folder named "<prefix>___<name>"; it has to
+                      contain the file full/<name>.pdf
+
+        The images are written to foldername/pages/img-NNN.png; the pages
+        folder is created if it is missing.  The exit status of convert is
+        not checked.
+        """
+        # <name> is the part of the folder name behind the "___" separator
+        filename=foldername.split("/")[-1].split("___")[1]
+
+        outfolder = foldername + "/pages/"
+        if not os.path.exists(outfolder):
+            os.mkdir(outfolder)
+
+        # "%03d" is expanded by convert to the page number.  (The former
+        # "-\%03d.png" literal was an invalid escape sequence that only worked
+        # because shlex.split() dropped the backslash again.)
+        outName = outfolder + "img-%03d.png"
+
+        # Pass an argument list instead of running shlex.split() on one
+        # concatenated string: folder names containing blanks or quotes were
+        # torn apart (or rejected) by the shell-style splitting.
+        listArguments = ["/usr/local/bin/convert", "-verbose", "-density", "300",
+                         foldername + "/full/" + filename + ".pdf", outName]
+        subprocess.call(listArguments)
+
+    def ocr(self,foldername):
+        """Run tesseract (language deu, hocr output) on each file in pages/.
+
+        foldername -- document folder containing the pages folder
+
+        For every file in foldername/pages tesseract is called with
+        foldername/hocr/<file name> as output base; the hocr folder is
+        created if it is missing.  Everything tesseract writes to standard
+        output is printed line by line.
+        """
+        #tesseract -l deu JB1953-17802.png JB1953-17802.tess hocr
+        pagesfolder = foldername + "/pages/"
+        hocrfolder = foldername + "/hocr/"
+
+        if not os.path.exists(hocrfolder):
+            os.mkdir(hocrfolder)
+
+        for fn in os.listdir(pagesfolder):
+            # Argument list instead of shlex.split(), as in split(): keeps
+            # paths with blanks or quotes in one piece.
+            listArguments = ["/usr/local/bin/tesseract", "-l", "deu",
+                             pagesfolder + fn, hocrfolder + fn, "hocr"]
+
+            # The context manager closes the pipe and waits for tesseract to
+            # exit, so no defunct processes or open pipes pile up when many
+            # pages are processed.
+            with subprocess.Popen(listArguments,stdout=subprocess.PIPE) as exeShell:
+                for line in exeShell.stdout:
+                    #the real code does filtering here
+                    print (line.rstrip())
+        
+  
+
+import sys
+# Usage: doOCR.py [start [end]]
+# start and end are 0-based positions in the folder listing below; only the
+# entries from start to end (inclusive) are processed.
+om = ocrManager()
+
+args = sys.argv
+start=None
+end=None
+if len(args) > 1: #number of the first folder
+    start = int(args[1])
+
+if len(args) > 2: #number of the last folder
+    end = int(args[2])
+
+foldername = "/Volumes/MPGARCHIV/struct2/"
+
+for cnt, fn in enumerate(os.listdir(foldername)):
+    if start is not None and cnt<start:
+        print("SKIP: %s"%fn)
+        continue
+
+    if end is not None and cnt>end:
+        sys.exit()
+
+    # Only folders can be split and OCRed; a stray file such as .DS_Store
+    # would make om.split() raise and stop the whole run.
+    if not os.path.isdir(foldername+fn):
+        print("SKIP: %s"%fn)
+        continue
+
+    print("PROCESS: %s"%fn)
+    om.split(foldername+fn)
+    om.ocr(foldername+fn)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/index.meta.stub.xml	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?><resource version="1.1" type="MPIWG">
+<name></name>
+<archive-path></archive-path>
+<archive-creation-date></archive-creation-date>
+<creator>mpiwg</creator>
+<description></description>
+<media-type></media-type>
+<meta>
+<content-type>scanned document</content-type>
+<access-conditions>
+<access type="mpiwg"/>
+</access-conditions>
+<texttool>
+<image></image>
+</texttool>
+<bib type="manuscript">
+<author></author>
+<title></title>
+</bib>
+<dri type=""></dri>
+</meta>
+</resource>