# HG changeset patch
# User dwinter
# Date 1408451129 -7200
# Node ID 90c0df48389010c98dfdfbce47ffb2a218d363a3
# Parent 5b7ed10ecbb426f02b3c78769c3acc2258f56d23
# inital3
#
# Content of doOCR.py as added by this changeset (diff markers removed).
"""Split scanned PDF volumes into page images and OCR each page.

For every volume folder below ``BASE_FOLDER`` whose name contains
``FOLDER_MARKER`` the script

1. renders ``<folder>/full/<name>.pdf`` into ``<folder>/pages/img-NNN.png``
   with ImageMagick ``convert``, and
2. runs ``tesseract -l deu ... hocr`` on every file in ``<folder>/pages/``,
   writing the results into ``<folder>/hocr/``.

Usage: ``doOCR.py [start [end]]`` where ``start`` and ``end`` are indices into
the sorted listing of ``BASE_FOLDER``.
"""

import os
import os.path
import shlex
import subprocess
import sys

# Absolute paths of the external tools, as used by the original script.
CONVERT_BIN = "/usr/local/bin/convert"
TESSERACT_BIN = "/usr/local/bin/tesseract"

# Root folder holding one sub-folder per volume.
BASE_FOLDER = "/Volumes/MPGARCHIV/struct2/"

# Only folders whose name contains this marker are processed.
FOLDER_MARKER = "Jahrbuch"


class ocrManager:
    """Runs the two processing steps (page split, OCR) on one volume folder."""

    def split(self, foldername, onExistExit=False):
        """Render the volume PDF into one PNG per page.

        The PDF is expected at ``<foldername>/full/<name>.pdf`` where
        ``<name>`` is the part of the folder's base name that follows the
        first ``___``. Pages are written to ``<foldername>/pages/`` as
        ``img-000.png``, ``img-001.png``, ...

        foldername  -- path of the volume folder (a trailing "/" is accepted)
        onExistExit -- if true and the pages folder already exists, print a
                       notice and return without running ``convert``

        Raises IndexError if the folder's base name contains no ``___``.
        Raises OSError if the ``convert`` binary cannot be started.
        """
        # Strip a trailing slash first, otherwise the base name would be "".
        filename = foldername.rstrip("/").split("/")[-1].split("___")[1]

        outfolder = foldername + "/pages/"
        if not os.path.exists(outfolder):
            os.mkdir(outfolder)
        elif onExistExit:  # if set don't do anything
            print(" ---- exists (split)")
            return

        # Pass an argument list instead of a shell-style string so that paths
        # containing whitespace stay intact. "%03d" is expanded by convert
        # itself into the zero-padded page number.
        command = [
            CONVERT_BIN,
            "-verbose",
            "-density",
            "300",
            foldername + "/full/" + filename + ".pdf",
            outfolder + "img-%03d.png",
        ]
        returncode = subprocess.call(command)
        if returncode != 0:
            print(" ---- convert failed with exit code %d (split)" % returncode)

    def ocr(self, foldername, onExistExit=False):
        """Run tesseract (German, hOCR output) on every page image.

        Every file in ``<foldername>/pages/`` is passed to tesseract; the
        output base name is ``<foldername>/hocr/<page file name>``. Whatever
        tesseract writes to stdout is echoed line by line.

        foldername  -- path of the volume folder
        onExistExit -- if true and the hocr folder already exists, print a
                       notice and return without running tesseract

        Raises OSError if the pages folder is missing or the ``tesseract``
        binary cannot be started.
        """
        pagesfolder = foldername + "/pages/"
        hocrfolder = foldername + "/hocr/"

        if not os.path.exists(hocrfolder):
            os.mkdir(hocrfolder)
        elif onExistExit:  # if set don't do anything
            print(" ---- exists (ocrj)")
            return

        # os.listdir() returns entries in arbitrary order; sort so pages are
        # processed in a reproducible sequence.
        for fn in sorted(os.listdir(pagesfolder)):
            command = [
                TESSERACT_BIN,
                "-l",
                "deu",
                pagesfolder + fn,
                hocrfolder + fn,
                "hocr",
            ]
            process = subprocess.Popen(command, stdout=subprocess.PIPE)
            try:
                for line in iter(process.stdout.readline, b""):
                    # the real code does filtering here
                    print(line.rstrip())
            finally:
                # Close the pipe and reap the child so that no zombie process
                # or open file descriptor is left behind per page.
                process.stdout.close()
                returncode = process.wait()
            if returncode != 0:
                print(" ---- tesseract failed with exit code %d (%s)"
                      % (returncode, fn))


def _parse_bound(args, position):
    """Return ``args[position]`` as int, or None if that argument is absent."""
    return int(args[position]) if len(args) > position else None


def main(argv=None, base_folder=BASE_FOLDER):
    """Process all matching volume folders below ``base_folder``.

    argv        -- argument vector, defaults to ``sys.argv``; ``argv[1]`` and
                   ``argv[2]`` are the optional start and end folder numbers
    base_folder -- folder containing the volume folders

    The folder number counts every entry of the sorted listing, including
    entries that are skipped because they lack ``FOLDER_MARKER``.
    """
    args = sys.argv if argv is None else argv
    start = _parse_bound(args, 1)  # start folder number
    end = _parse_bound(args, 2)    # end folder number

    om = ocrManager()

    # Sorted so that start/end refer to the same folders on every run.
    for cnt, fn in enumerate(sorted(os.listdir(base_folder))):
        if FOLDER_MARKER not in fn:
            print("DON't do:" + fn)
            continue
        # NOTE(review): the start/end comparisons were damaged in the patch
        # text; reconstructed as "skip below start, stop above end" - confirm.
        if start is not None and cnt < start:
            continue
        if end is not None and cnt > end:
            break
        print("PROCESS: %s" % fn)

        folder = os.path.join(base_folder, fn)
        om.split(folder, onExistExit=True)
        om.ocr(folder, onExistExit=True)


if __name__ == "__main__":
    main()