comparison doOCR2.py @ 1:5b7ed10ecbb4

initial2
author dwinter
date Tue, 19 Aug 2014 14:24:36 +0200
parents
children
comparison
equal deleted inserted replaced
0:5e33fa5a2fdc 1:5b7ed10ecbb4
1 import shlex
2 import subprocess
3 import os.path
4 import os
5
class ocrManager:
    """Run the external programs that turn a PDF into page images and hOCR.

    ``split`` renders a PDF into numbered PNG files with ``convert``;
    ``ocr`` runs ``tesseract`` (language ``deu``, ``hocr`` output) over every
    file found in the page folder.
    """

    # Absolute paths of the external programs that are executed.
    CONVERT_BIN = "/usr/local/bin/convert"
    TESSERACT_BIN = "/usr/local/bin/tesseract"

    def split(self, foldername, onExistExit=False):
        """Render ``<foldername>/full/<name>.pdf`` into numbered PNG files.

        ``<name>`` is the second ``___``-separated field of the last path
        component of *foldername*.  The images are written to
        ``<foldername>/pages/`` using the output pattern ``img-%03d.png``.

        foldername  -- folder that contains the ``full`` sub-folder; a
                       trailing ``/`` is tolerated.
        onExistExit -- if true and the ``pages`` folder already exists, print
                       a notice and return without running anything.

        Raises IndexError when the folder name contains no ``___`` separator
        and OSError when the folder cannot be created or ``convert`` cannot be
        started.
        """
        # Strip a trailing slash so the last path component is never empty.
        filename = foldername.rstrip("/").split("/")[-1].split("___")[1]

        outfolder = os.path.join(foldername, "pages")

        if not os.path.exists(outfolder):
            os.mkdir(outfolder)
        elif onExistExit:  # if set don't do anything
            print(" ---- exists (split)")
            return

        # The argument list is handed to the program as-is (no shell, no
        # shlex round trip), so paths containing spaces or quotes stay intact
        # and the "%03d" page counter needs no escaping.
        command = [
            self.CONVERT_BIN,
            "-verbose",
            "-density", "300",
            os.path.join(foldername, "full", filename + ".pdf"),
            os.path.join(outfolder, "img-%03d.png"),
        ]
        status = subprocess.call(command)
        if status != 0:
            print(" ---- convert failed with exit status %d" % status)

    def ocr(self, foldername, onExistExit=False):
        """Run tesseract on every file in ``<foldername>/pages/``.

        For each page file ``fn`` the command
        ``tesseract -l deu <pages>/fn <hocr>/fn hocr`` is executed and every
        line it writes to stdout is printed.  The ``hocr`` folder is created
        when missing.

        foldername  -- folder that contains the ``pages`` sub-folder.
        onExistExit -- if true and the ``hocr`` folder already exists, print
                       a notice and return without running anything.

        Raises OSError when a folder cannot be created or listed, or when
        ``tesseract`` cannot be started.
        """
        pagesfolder = os.path.join(foldername, "pages")
        hocrfolder = os.path.join(foldername, "hocr")

        if not os.path.exists(hocrfolder):
            os.mkdir(hocrfolder)
        elif onExistExit:  # if set don't do anything
            print(" ---- exists (ocrj)")
            return

        # Sorted so the pages are processed in a reproducible order;
        # os.listdir() returns entries in arbitrary order.
        for fn in sorted(os.listdir(pagesfolder)):
            command = [
                self.TESSERACT_BIN,
                "-l", "deu",
                os.path.join(pagesfolder, fn),
                os.path.join(hocrfolder, fn),
                "hocr",
            ]
            status = self._run_and_echo(command)
            if status != 0:
                print(" ---- tesseract failed with exit status %d on %s"
                      % (status, fn))

    def _run_and_echo(self, command):
        """Run *command*, print each line of its stdout, return its exit status."""
        process = subprocess.Popen(command, stdout=subprocess.PIPE)
        try:
            for line in iter(process.stdout.readline, b''):
                # the real code does filtering here
                print(line.rstrip())
        finally:
            # Close the pipe and reap the child so that no zombie process or
            # open file descriptor is left behind for each page.
            process.stdout.close()
            status = process.wait()
        return status
81
82
83
import sys

# Single manager instance used by the script.
om = ocrManager()

args = sys.argv

# Optional positional arguments: args[1] is the start folder number and
# args[2] the end folder number.  Neither value is used in the visible part
# of this script.
start = int(args[1]) if len(args) > 1 else None
end = int(args[2]) if len(args) > 2 else None

# Base folder and counter; both are unused in the visible part of this script.
foldername = "/Volumes/MPGARCHIV/struct2/"
cnt = -1

# OCR one fixed folder; nothing is done when its hocr folder already exists.
om.ocr("/Volumes/MPGARCHIV/israel", onExistExit=True)