changeset 2:90c0df483890 default tip

inital3
author dwinter
date Tue, 19 Aug 2014 14:25:29 +0200
parents 5b7ed10ecbb4
children
files doOCR.py
diffstat 1 files changed, 120 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doOCR.py	Tue Aug 19 14:25:29 2014 +0200
@@ -0,0 +1,120 @@
+import shlex
+import subprocess
+import os.path
+import os
+ 
+class ocrManager:
+    """Split scanned PDFs into page images and OCR each page to hOCR.
+
+    Folder layout, as built by the paths below: foldername/full/NAME.pdf is
+    the input, foldername/pages/ receives the page images and
+    foldername/hocr/ the tesseract output.
+    """
+
+    # External tools and OCR language; override on a subclass if needed.
+    CONVERT_BIN = "/usr/local/bin/convert"
+    TESSERACT_BIN = "/usr/local/bin/tesseract"
+    OCR_LANGUAGE = "deu"
+
+    def split(self,foldername,onExistExit=False):
+        """Convert the folder's PDF into one PNG per page (-density 300).
+
+        The PDF name is the second "___"-separated part of the folder's base
+        name; pages are written to foldername/pages/ as img-NNN.png.
+
+        foldername  -- document folder, without trailing slash
+        onExistExit -- if True and the pages folder already exists, do nothing
+
+        Raises IndexError when the folder name contains no "___".
+        """
+        filename = foldername.split("/")[-1].split("___")[1]
+        outfolder = foldername + "/pages/"
+
+        if not os.path.exists(outfolder):
+            os.mkdir(outfolder)
+        elif onExistExit: #if set don't do anything
+            print (" ---- exists (split)")
+            return
+
+        # Argument list instead of a shlex-split string, so paths with spaces
+        # or quotes reach convert intact.
+        listArguments = [
+            self.CONVERT_BIN, "-verbose", "-density", "300",
+            foldername + "/full/" + filename + ".pdf",
+            outfolder + "img-%03d.png",
+        ]
+        returncode = subprocess.call(listArguments)
+        if returncode != 0:
+            # best effort: report the failure but keep the batch running
+            print (" ---- convert failed (%s): %s" % (returncode, foldername))
+
+    def ocr(self,foldername,onExistExit=False):
+        """Run tesseract on every file in foldername/pages/.
+
+        Per page this runs: tesseract -l deu PAGE foldername/hocr/PAGE hocr
+        and echoes tesseract's stdout line by line.
+
+        foldername  -- document folder
+        onExistExit -- if True and the hocr folder already exists, do nothing
+        """
+        pagesfolder = foldername + "/pages/"
+        hocrfolder = foldername + "/hocr/"
+
+        if not os.path.exists(hocrfolder):
+            os.mkdir(hocrfolder)
+        elif onExistExit: #if set don't do anything
+            print (" ---- exists (ocrj)")
+            return
+
+        for fn in os.listdir(pagesfolder):
+            listArguments = [self.TESSERACT_BIN, "-l", self.OCR_LANGUAGE,
+                             pagesfolder + fn, hocrfolder + fn, "hocr"]
+            exeShell = subprocess.Popen(listArguments,stdout=subprocess.PIPE)
+            try:
+                # readline() returns b'' once the child closes its stdout
+                for line in iter(exeShell.stdout.readline, b''):
+                    #the real code does filtering here
+                    print (line.rstrip())
+            finally:
+                # close the pipe and reap the child (no zombies, no leaked fds)
+                exeShell.stdout.close()
+                exeShell.wait()
+  
+
+import sys
+
+# Batch driver: python doOCR.py [start [end]]
+# start and end are inclusive, zero-based positions in the sorted listing of
+# the archive folder; every entry counts, not only the processed ones.
+om = ocrManager()
+
+args = sys.argv
+
+start=None
+end=None
+if len(args) > 1: #position of the first folder to process
+    start = int(args[1])
+
+if len(args) > 2: #position of the last folder to process
+    end = int(args[2])
+
+
+foldername = "/Volumes/MPGARCHIV/struct2/"
+
+cnt=-1
+# os.listdir() returns entries in arbitrary order; sort so that start/end
+# select the same folders on every run.
+for cnt, fn in enumerate(sorted(os.listdir(foldername))):
+    if "Jahrbuch" not in fn: #only folders with "Jahrbuch" in the name
+        print ("DON't do:"+fn)
+        continue
+    if start is not None and cnt<start:
+        print("SKIP: %s"%fn)
+        continue
+    if end is not None and cnt>end:
+        sys.exit()
+    print("PROCESS: %s"%fn)
+
+    # both steps return early when their output folder already exists
+    om.split(foldername+fn,onExistExit=True)
+    om.ocr(foldername+fn,onExistExit=True)