diff doOCR.py @ 0:5e33fa5a2fdc

initial
author Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
date Tue, 19 Aug 2014 14:12:52 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doOCR.py	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,108 @@
+import shlex
+import subprocess
+import os.path
+import os
+ 
+class ocrManager:
+    """Split scanned PDFs into page images and OCR them with Tesseract.
+
+    Both methods take the path of a work folder and shell out to external
+    tools addressed by absolute path under /usr/local/bin.
+    """
+
+    def split(self,foldername):
+        """Render <foldername>/full/<name>.pdf into PNG pages in <foldername>/pages/.
+
+        <name> is the second "___"-separated field of the folder's base name;
+        a base name without "___" raises IndexError.
+
+        Returns the exit status of the convert command.
+        """
+        # Drop trailing slashes so ".../a___b/" resolves like ".../a___b".
+        filename = foldername.rstrip("/").split("/")[-1].split("___")[1]
+
+        outfolder = foldername + "/pages/"
+        if not os.path.exists(outfolder):
+            os.mkdir(outfolder)
+
+        # Build the argument list directly instead of round-tripping a command
+        # string through shlex.split, so paths containing spaces or quotes
+        # reach convert intact.
+        listArguments = [
+            "/usr/local/bin/convert", "-verbose", "-density", "300",
+            foldername + "/full/" + filename + ".pdf",
+            # "%03d" is the page-number placeholder handed to convert; the
+            # backslash the old string carried was only consumed by shlex.
+            outfolder + "img-%03d.png",
+        ]
+        return subprocess.call(listArguments)
+
+    def ocr(self,foldername):
+        """Run Tesseract on every file in <foldername>/pages/.
+
+        Each page is processed with "-l deu" and the "hocr" config, writing to
+        the output base <foldername>/hocr/<page file name>. Every line the
+        tool writes to stdout is printed as it arrives.
+
+        Returns None.
+        """
+        #tesseract -l deu JB1953-17802.png JB1953-17802.tess hocr
+        pagesfolder = foldername + "/pages/"
+        hocrfolder = foldername + "/hocr/"
+
+        if not os.path.exists(hocrfolder):
+            os.mkdir(hocrfolder)
+
+        for fn in os.listdir(pagesfolder):
+            listArguments = [
+                "/usr/local/bin/tesseract", "-l", "deu",
+                pagesfolder + fn, hocrfolder + fn, "hocr",
+            ]
+            exeShell = subprocess.Popen(listArguments,stdout=subprocess.PIPE)
+            try:
+                # readline() yields b'' only at EOF, which ends the iteration.
+                for line in iter(exeShell.stdout.readline, b''):
+                    #the real code does filtering here
+                    print (line.rstrip())
+            finally:
+                # Close the pipe and reap the child; the old loop never waited
+                # on the process once its output ended.
+                exeShell.stdout.close()
+                exeShell.wait()
+        
+  
+
+import sys      
+# Batch driver: python doOCR.py [start [end]]
+# start/end are 0-based, inclusive positions in the directory listing.
+om = ocrManager()
+
+args = sys.argv
+
+# Position of the first folder to process (None = from the beginning).
+start = int(args[1]) if len(args) > 1 else None
+# Position of the last folder to process (None = through the end).
+end = int(args[2]) if len(args) > 2 else None
+
+foldername = "/Volumes/MPGARCHIV/struct2/"
+
+# NOTE(review): os.listdir order is arbitrary, so start/end only select the
+# same folders across runs while the listing order stays stable.
+for cnt, fn in enumerate(os.listdir(foldername)):
+    if start is not None and cnt < start:
+        print("SKIP: %s"%fn)
+        continue
+
+    if end is not None and cnt > end:
+        # Past the requested range; nothing follows the loop, so just stop.
+        break
+
+    # Stray files (e.g. .DS_Store on a macOS volume) lack the "___" field and
+    # would crash split(); they keep their count position but are skipped.
+    if not os.path.isdir(foldername+fn):
+        print("SKIP: %s"%fn)
+        continue
+
+    print("PROCESS: %s"%fn)
+    om.split(foldername+fn)
+    om.ocr(foldername+fn)