# view doOCR.py @ 0:5e33fa5a2fdc
#
# initial
# author Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
# date Tue, 19 Aug 2014 14:12:52 +0200
# parents
# children
# line wrap: on
# line source

import shlex
import subprocess
import os.path
import os
 
class ocrManager:
    """Split scanned PDFs into page images and run Tesseract OCR on them.

    Both steps work on one document folder with this layout::

        <foldername>/full/<name>.pdf     input PDF, read by split()
        <foldername>/pages/img-NNN.png   page images, written by split()
        <foldername>/hocr/               OCR output, written by ocr()

    The last path component of <foldername> must look like
    "<prefix>___<name>"; <name> is the base name of the PDF.
    """

    # Locations of the external tools and the OCR language. They are class
    # attributes so they can be overridden without editing the methods.
    # An earlier variant used GraphicsMagick ("/usr/local/bin/gm convert")
    # with the same arguments.
    convertCommand = "/usr/local/bin/convert"
    tesseractCommand = "/usr/local/bin/tesseract"
    ocrLanguage = "deu"

    # Not implemented here: downscaling the page images, e.g.
    #   /opt/local/bin/gm convert <image> -resize 1500x\> <image>

    def split(self, foldername):
        """Split the PDF of a document folder into one PNG per page.

        Reads <foldername>/full/<name>.pdf and writes the pages to
        <foldername>/pages/img-NNN.png, creating the pages folder if needed.

        foldername -- path of the document folder, without trailing slash;
                      its last component must be "<prefix>___<name>".

        Returns the exit status of the convert command (0 on success).
        Raises ValueError if the folder name contains no "___" separator and
        OSError if the convert executable cannot be started.
        """
        folderBase = foldername.split("/")[-1]
        nameParts = folderBase.split("___")
        if len(nameParts) < 2:
            raise ValueError(
                "folder name %r does not match '<prefix>___<name>'" % folderBase)
        filename = nameParts[1]

        outfolder = foldername + "/pages/"
        if not os.path.exists(outfolder):
            os.mkdir(outfolder)

        # The arguments are passed as a list (no shell, no re-tokenising), so
        # paths containing spaces or quotes reach convert unchanged.
        # "%03d" is a placeholder filled in by convert itself (zero-padded
        # page number), not by Python.
        listArguments = [
            self.convertCommand,
            "-verbose",
            "-density", "300",
            foldername + "/full/" + filename + ".pdf",
            outfolder + "img-%03d.png",
        ]
        return subprocess.call(listArguments)

    def ocr(self, foldername):
        """Run Tesseract in hOCR mode on every file in <foldername>/pages/.

        For each page file the equivalent of

            tesseract -l deu <pages>/<file> <hocr>/<file> hocr

        is executed, and whatever Tesseract writes to stdout is echoed line
        by line. The hocr folder is created if it does not exist. Pages are
        processed in sorted name order.

        foldername -- path of the document folder, without trailing slash.

        Raises OSError if the pages folder is missing or the tesseract
        executable cannot be started.
        """
        pagesfolder = foldername + "/pages/"
        hocrfolder = foldername + "/hocr/"

        if not os.path.exists(hocrfolder):
            os.mkdir(hocrfolder)

        for fn in sorted(os.listdir(pagesfolder)):
            listArguments = [
                self.tesseractCommand,
                "-l", self.ocrLanguage,
                pagesfolder + fn,
                hocrfolder + fn,  # output base name; tesseract adds the extension
                "hocr",
            ]
            exeShell = subprocess.Popen(listArguments, stdout=subprocess.PIPE)
            try:
                # readline() returns b'' only at end of stream.
                for line in iter(exeShell.stdout.readline, b''):
                    print(line.rstrip())
            finally:
                # Close the pipe and reap the child so that neither file
                # descriptors nor zombie processes accumulate over many pages.
                exeShell.stdout.close()
                exeShell.wait()
        
  

import sys      
def main(args):
    """Split and OCR the document folders below the archive root.

    args has the layout of sys.argv:
      args[1] (optional) -- index of the first folder to process
      args[2] (optional) -- index of the last folder to process (inclusive)

    Indices refer to the position in the name-sorted listing of the archive
    root, so a given start/end range selects the same folders on every run.
    """
    start = int(args[1]) if len(args) > 1 else None  # first folder index
    end = int(args[2]) if len(args) > 2 else None    # last folder index

    foldername = "/Volumes/MPGARCHIV/struct2/"
    om = ocrManager()

    # os.listdir() returns entries in arbitrary order; sort them so the
    # indices are stable.
    for cnt, fn in enumerate(sorted(os.listdir(foldername))):
        if start is not None and cnt < start:
            print("SKIP: %s"%fn)
            continue

        if end is not None and cnt > end:
            break

        path = foldername + fn
        # Entries that are not document folders (plain files such as
        # .DS_Store, or names without the "___" separator that split()
        # relies on) would abort the whole batch; skip them instead.
        if "___" not in fn or not os.path.isdir(path):
            print("SKIP (not a document folder): %s"%fn)
            continue

        print("PROCESS: %s"%fn)

        om.split(path)
        om.ocr(path)


if __name__ == "__main__":
    main(sys.argv)