Mercurial > hg > ocrHandling
comparison doOCR2.py @ 1:5b7ed10ecbb4
initial2
| author | dwinter |
|---|---|
| date | Tue, 19 Aug 2014 14:24:36 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:5e33fa5a2fdc | 1:5b7ed10ecbb4 |
|---|---|
| 1 import shlex | |
| 2 import subprocess | |
| 3 import os.path | |
| 4 import os | |
| 5 | |
class ocrManager:
    """Split PDFs into page images and run Tesseract on the pages.

    Both steps work on a folder and shell out to external programs.  The
    program locations, OCR language and render density are class attributes
    so they can be overridden on an instance or in a subclass; the defaults
    are the values that used to be hard-coded in the methods.
    """

    convertBinary = "/usr/local/bin/convert"
    tesseractBinary = "/usr/local/bin/tesseract"
    ocrLanguage = "deu"
    density = 300

    def _splitCommand(self, foldername, filename, outfolder):
        """Return the argument list for rendering the PDF to page images."""
        pdfPath = foldername + "/full/" + filename + ".pdf"
        # "%03d" is passed through literally; it is the page-number
        # placeholder in the output name handed to convert.
        outName = outfolder + "img" + "-%03d.png"
        return [self.convertBinary, "-verbose", "-density", str(self.density),
                pdfPath, outName]

    def _ocrCommand(self, pagesfolder, hocrfolder, fn):
        """Return the argument list for running Tesseract on one page file."""
        return [self.tesseractBinary, "-l", self.ocrLanguage,
                pagesfolder + fn, hocrfolder + fn, "hocr"]

    def split(self, foldername, onExistExit=False):
        """Split the PDF belonging to *foldername* into one PNG per page.

        The last path component of *foldername* must contain ``___``; the
        part after the first ``___`` is taken as the PDF base name.  The PDF
        is read from ``<foldername>/full/<name>.pdf`` and the images are
        written to ``<foldername>/pages/``, which is created if missing.

        foldername  -- path of the working folder (no trailing slash)
        onExistExit -- if true and the pages folder already exists, print a
                       note and return without converting anything

        Raises IndexError when the folder name contains no ``___``.
        """
        filename = foldername.split("/")[-1].split("___")[1]
        outfolder = foldername + "/pages/"

        if not os.path.exists(outfolder):
            os.mkdir(outfolder)
        elif onExistExit:  # if set don't do anything
            print(" ---- exists (split)")
            return

        # Pass the arguments as a list so paths containing spaces or quotes
        # reach convert unchanged instead of being re-tokenised.
        subprocess.call(self._splitCommand(foldername, filename, outfolder))

    def ocr(self, foldername, onExistExit=False):
        """Run Tesseract (hocr config) on every file in ``<foldername>/pages/``.

        Output base names are ``<foldername>/hocr/<page file name>``; the
        hocr folder is created if missing.  Each line Tesseract writes to
        stdout is printed with trailing whitespace stripped.

        foldername  -- path of the working folder (no trailing slash)
        onExistExit -- if true and the hocr folder already exists, print a
                       note and return without running OCR
        """
        pagesfolder = foldername + "/pages/"
        hocrfolder = foldername + "/hocr/"

        if not os.path.exists(hocrfolder):
            os.mkdir(hocrfolder)
        elif onExistExit:  # if set don't do anything
            print(" ---- exists (ocrj)")
            return

        # Sorted so pages are processed in a reproducible order.
        for fn in sorted(os.listdir(pagesfolder)):
            proc = subprocess.Popen(
                self._ocrCommand(pagesfolder, hocrfolder, fn),
                stdout=subprocess.PIPE)
            try:
                for line in iter(proc.stdout.readline, b''):
                    # the real code does filtering here
                    print(line.rstrip())
            finally:
                # Close the pipe and reap the child so neither leaks.
                proc.stdout.close()
                proc.wait()
| 81 | |
| 82 | |
| 83 | |
import sys

# Manager instance used for the run at the bottom of the script.
om = ocrManager()

args = sys.argv

# Optional integer command-line arguments (presumably the first and last
# folder number); neither value is used in the code visible here.
start = int(args[1]) if len(args) > 1 else None
end = int(args[2]) if len(args) > 2 else None

foldername = "/Volumes/MPGARCHIV/struct2/"
cnt = -1

# OCR a single fixed folder; nothing is done if its hocr folder exists.
om.ocr("/Volumes/MPGARCHIV/israel", onExistExit=True)
