Mercurial > hg > ocrHandling
view doOCR.py @ 0:5e33fa5a2fdc
initial
author | Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 19 Aug 2014 14:12:52 +0200 |
parents | |
children |
line wrap: on
line source
"""Batch OCR driver.

For every entry below a fixed archive root this script first splits the
entry's PDF into one PNG per page and then runs tesseract on each page image.

Usage: ``doOCR.py [start [end]]`` where *start* and *end* are 0-based,
inclusive indexes into the sorted listing of the archive root.
"""
import shlex
import subprocess
import os.path
import os
import sys


class ocrManager:
    """Runs the external tools that turn a scanned PDF into per-page hOCR.

    The tool locations are class attributes so that a subclass or an instance
    can point them somewhere else without touching the methods.
    """

    # Absolute path of the PDF-to-image converter (presumably ImageMagick's
    # ``convert``; commented-out variants of the original used GraphicsMagick's
    # ``gm convert`` instead).
    CONVERT_BIN = "/usr/local/bin/convert"
    # Absolute path of the tesseract executable.
    TESSERACT_BIN = "/usr/local/bin/tesseract"

    def _split_command(self, foldername):
        """Return the argument list that splits the PDF of *foldername*."""
        # The last path component is expected to look like "<prefix>___<name>";
        # the PDF is then "<foldername>/full/<name>.pdf".
        parts = foldername.split("/")[-1].split("___")
        if len(parts) < 2:
            raise IndexError(
                "folder name has no '___' separator: %r" % (foldername,))
        pdf_path = foldername + "/full/" + parts[1] + ".pdf"
        # "%03d" is passed through literally; the converter is expected to
        # replace it with the page number.
        page_pattern = foldername + "/pages/" + "img-%03d.png"
        return [self.CONVERT_BIN, "-verbose", "-density", "300",
                pdf_path, page_pattern]

    def _ocr_command(self, pagesfolder, hocrfolder, fn):
        """Return the argument list that OCRs page image *fn*."""
        # Equivalent shell call:
        #   tesseract -l deu <pages>/<fn> <hocr>/<fn> hocr
        return [self.TESSERACT_BIN, "-l", "deu",
                pagesfolder + fn, hocrfolder + fn, "hocr"]

    def split(self, foldername):
        """Split the PDF belonging to *foldername* into page images.

        The PDF is read from ``<foldername>/full/<name>.pdf`` where ``<name>``
        is the part after ``___`` in the last component of *foldername*. The
        images go to ``<foldername>/pages/``, which is created if missing.

        Returns the exit status of the converter process.
        Raises IndexError when the folder name contains no ``___``.
        """
        # Build the command first so a malformed folder name fails before
        # anything is created on disk.
        command = self._split_command(foldername)
        outfolder = foldername + "/pages/"
        if not os.path.exists(outfolder):
            os.mkdir(outfolder)
        # An argument list (instead of a shell-like string re-split with
        # shlex) keeps paths containing blanks or quotes intact.
        return subprocess.call(command)

    def ocr(self, foldername):
        """Run tesseract on every file in ``<foldername>/pages/``.

        For each page the output base name ``<foldername>/hocr/<page file>``
        is handed to tesseract together with the ``hocr`` config. The
        ``hocr`` directory is created if missing. Whatever tesseract writes
        to stdout is echoed line by line.
        """
        pagesfolder = foldername + "/pages/"
        hocrfolder = foldername + "/hocr/"
        if not os.path.exists(hocrfolder):
            os.mkdir(hocrfolder)
        for fn in sorted(os.listdir(pagesfolder)):
            proc = subprocess.Popen(
                self._ocr_command(pagesfolder, hocrfolder, fn),
                stdout=subprocess.PIPE)
            try:
                # readline() returns b'' at end of stream.
                for line in iter(proc.stdout.readline, b''):
                    # the real code does filtering here
                    print(line.rstrip())
            finally:
                # Close the pipe and reap the child so no zombie process or
                # open file descriptor is left behind per page.
                proc.stdout.close()
                proc.wait()


def parse_range(args):
    """Return ``(start, end)`` parsed from the command-line list *args*.

    ``args[1]`` is the index of the first folder to process and ``args[2]``
    the index of the last one (inclusive). A missing value yields ``None``.
    """
    start = int(args[1]) if len(args) > 1 else None
    end = int(args[2]) if len(args) > 2 else None
    return start, end


def main(args=None):
    """Process the folders of the archive root selected by *args*.

    *args* defaults to ``sys.argv``; see :func:`parse_range` for its layout.
    """
    if args is None:
        args = sys.argv
    start, end = parse_range(args)
    om = ocrManager()
    foldername = "/Volumes/MPGARCHIV/struct2/"
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # start/end indexes mean the same thing on every run.
    for cnt, fn in enumerate(sorted(os.listdir(foldername))):
        if start is not None and cnt < start:
            print("SKIP: %s" % fn)
            continue
        if end is not None and cnt > end:
            break
        if "___" not in fn or not os.path.isdir(foldername + fn):
            # Stray files or differently named entries cannot be processed;
            # skip them instead of aborting the whole batch.
            print("SKIP (not a <prefix>___<name> folder): %s" % fn)
            continue
        print("PROCESS: %s" % fn)
        om.split(foldername + fn)
        om.ocr(foldername + fn)


if __name__ == "__main__":
    main()