2
|
1 import shlex
|
|
2 import subprocess
|
|
3 import os.path
|
|
4 import os
|
|
5
|
|
class ocrManager:
    """Split scanned PDF volumes into page images and OCR them to hOCR.

    Both steps call external command-line tools (ImageMagick ``convert`` and
    ``tesseract``) and work on a volume folder with this layout::

        <foldername>/full/<name>.pdf   input PDF read by split()
        <foldername>/pages/            page images written by split()
        <foldername>/hocr/             tesseract output written by ocr()
    """

    # Locations of the external tools. Override on the class or on an
    # instance when they are installed somewhere else.
    CONVERT_BIN = "/usr/local/bin/convert"
    TESSERACT_BIN = "/usr/local/bin/tesseract"

    def split(self, foldername, onExistExit=False):
        """Render every page of the volume's PDF to a 300 dpi PNG.

        The PDF name is taken from the folder's base name: the text after
        the first ``___`` separator (an IndexError is raised when the base
        name has no ``___``). The PDF is read from
        ``<foldername>/full/<name>.pdf`` and the pages are written to
        ``<foldername>/pages/img-NNN.png``.

        foldername  -- path of the volume folder, '/'-separated, without a
                       trailing slash
        onExistExit -- when True and the pages folder already exists, print
                       a note and return without converting anything
        """
        filename = foldername.split("/")[-1].split("___")[1]
        outfolder = foldername + "/pages/"

        if not os.path.exists(outfolder):
            os.mkdir(outfolder)
        elif onExistExit:  # if set don't do anything
            print(" ---- exists (split)")
            return

        pdf_path = foldername + "/full/" + filename + ".pdf"
        # %03d is expanded by convert itself to the zero-padded page number.
        out_pattern = outfolder + "img-%03d.png"

        # Pass the arguments as a list instead of building a string and
        # re-splitting it: paths containing spaces or quotes stay intact.
        command = [self.CONVERT_BIN, "-verbose", "-density", "300",
                   pdf_path, out_pattern]
        status = subprocess.call(command)
        if status != 0:
            # Report the failure but keep going, so one bad volume does not
            # abort a whole batch run.
            print(" ---- convert failed (exit code %s): %s" % (status, pdf_path))

    def ocr(self, foldername, onExistExit=False):
        """Run tesseract (German, hOCR output) on every file in the pages folder.

        Equivalent to running, for each entry ``fn`` of
        ``<foldername>/pages/``::

            tesseract -l deu <foldername>/pages/fn <foldername>/hocr/fn hocr

        Everything tesseract writes to stdout is echoed line by line.

        foldername  -- path of the volume folder, without a trailing slash
        onExistExit -- when True and the hocr folder already exists, print
                       a note and return without running tesseract
        """
        pagesfolder = foldername + "/pages/"
        hocrfolder = foldername + "/hocr/"

        if not os.path.exists(hocrfolder):
            os.mkdir(hocrfolder)
        elif onExistExit:  # if set don't do anything
            print(" ---- exists (ocrj)")
            return

        # Sorted so pages are processed (and logged) in a stable order.
        for fn in sorted(os.listdir(pagesfolder)):
            command = [self.TESSERACT_BIN, "-l", "deu",
                       pagesfolder + fn, hocrfolder + fn, "hocr"]
            proc = subprocess.Popen(command, stdout=subprocess.PIPE)
            try:
                # readline() returns b'' at end of stream.
                for line in iter(proc.stdout.readline, b''):
                    # the real code does filtering here
                    print(line.rstrip())
            finally:
                # Close the pipe and reap the child; otherwise every page
                # leaves a zombie process and an open file descriptor.
                proc.stdout.close()
                status = proc.wait()
            if status != 0:
                print(" ---- tesseract failed (exit code %s): %s"
                      % (status, pagesfolder + fn))
|
|
81
|
|
82
|
|
83
|
|
import sys

# Batch driver: split and OCR every "Jahrbuch" volume folder below
# `foldername`.
#
# Usage: script.py [start [end]]
#   start -- index of the first directory entry to process (inclusive)
#   end   -- index of the last directory entry to process (inclusive)
# The indices count ALL entries of the sorted directory listing, including
# the ones that are skipped because they are not "Jahrbuch" folders.
om = ocrManager()

args = sys.argv

start = None
end = None
if len(args) > 1:  # start folder number
    start = int(args[1])

if len(args) > 2:  # end folder number
    end = int(args[2])

foldername = "/Volumes/MPGARCHIV/struct2/"

# os.listdir() returns entries in arbitrary order; sort them so that a given
# start/end range selects the same folders on every run.
cnt = -1
for cnt, fn in enumerate(sorted(os.listdir(foldername))):
    if "Jahrbuch" not in fn:
        print("DON't do:" + fn)
        continue

    if start is not None and cnt < start:
        print("SKIP: %s" % fn)
        continue

    if end is not None and cnt > end:
        # Past the requested range: nothing left to do.
        sys.exit()

    print("PROCESS: %s" % fn)

    # Both steps skip volumes whose output folder already exists, so an
    # interrupted batch can simply be restarted.
    om.split(foldername + fn, onExistExit=True)
    om.ocr(foldername + fn, onExistExit=True)
|