annotate doOCR.py @ 2:90c0df483890 default tip

inital3
author dwinter
date Tue, 19 Aug 2014 14:25:29 +0200
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
90c0df483890 inital3
dwinter
parents:
diff changeset
1 import shlex
90c0df483890 inital3
dwinter
parents:
diff changeset
2 import subprocess
90c0df483890 inital3
dwinter
parents:
diff changeset
3 import os.path
90c0df483890 inital3
dwinter
parents:
diff changeset
4 import os
90c0df483890 inital3
dwinter
parents:
diff changeset
5
90c0df483890 inital3
dwinter
parents:
diff changeset
class ocrManager:
    """Split a scanned PDF into page images and OCR those pages to hOCR.

    Both methods work on a *document folder* with this layout::

        <foldername>/full/<name>.pdf   input PDF (read by split)
        <foldername>/pages/            page PNGs (written by split, read by ocr)
        <foldername>/hocr/             OCR results (written by ocr)

    The external tools are started with an argument list (never through a
    shell string), so folder and file names containing spaces are passed
    through unchanged.
    """

    # External programs and their settings.  The defaults are the values
    # that used to be hard-coded; override them on the class or on an
    # instance when the tools live elsewhere.  (GraphicsMagick's
    # "gm convert" was tried as an alternative to ImageMagick's "convert".)
    CONVERT_BIN = "/usr/local/bin/convert"
    TESSERACT_BIN = "/usr/local/bin/tesseract"
    OCR_LANGUAGE = "deu"
    DENSITY = "300"

    def split(self, foldername, onExistExit=False):
        """Render every page of the folder's PDF to a PNG in ``pages/``.

        The last path component of *foldername* must look like
        ``<prefix>___<name>``; the PDF is then expected at
        ``<foldername>/full/<name>.pdf``.  Pages are written as
        ``<foldername>/pages/img-NNN.png``.

        :param foldername: path of the document folder, "/"-separated,
            without a trailing slash.
        :param onExistExit: if true and ``pages/`` already exists, print a
            notice and return without converting anything.
        :raises IndexError: if the folder name contains no ``___``.
        :raises OSError: if the pages folder cannot be created or the
            convert program cannot be started.
        """
        basename = foldername.split("/")[-1]
        name_parts = basename.split("___")
        if len(name_parts) < 2:
            # Same exception type the bare [1] lookup used to raise, but
            # with a message that names the offending folder.
            raise IndexError(
                "folder name %r does not contain '___'" % basename)
        filename = name_parts[1]

        outfolder = foldername + "/pages/"
        if not os.path.exists(outfolder):
            os.mkdir(outfolder)
        elif onExistExit:  # if set don't do anything
            print (" ---- exists (split)")
            return

        # "%03d" is expanded by convert itself to the page number.
        listArguments = [
            self.CONVERT_BIN,
            "-verbose",
            "-density", self.DENSITY,
            foldername + "/full/" + filename + ".pdf",
            outfolder + "img-%03d.png",
        ]
        status = subprocess.call(listArguments)
        if status != 0:
            # Report the failure instead of silently ignoring it; do not
            # raise, so a batch run carries on with the next folder.
            print (" ---- convert failed (exit status %s): %s"
                   % (status, foldername))

    def ocr(self, foldername, onExistExit=False):
        """Run tesseract on every file in ``pages/`` and store hOCR output.

        Equivalent command line per page, e.g.::

            tesseract -l deu JB1953-17802.png JB1953-17802.tess hocr

        The output base name passed to tesseract is
        ``<foldername>/hocr/<page file name>``; tesseract appends its own
        extension.  Anything tesseract writes to stdout is echoed line by
        line.

        :param foldername: path of the document folder, without a trailing
            slash.
        :param onExistExit: if true and ``hocr/`` already exists, print a
            notice and return without running OCR.
        :raises OSError: if ``pages/`` cannot be listed, ``hocr/`` cannot be
            created, or tesseract cannot be started.
        """
        pagesfolder = foldername + "/pages/"
        hocrfolder = foldername + "/hocr/"

        if not os.path.exists(hocrfolder):
            os.mkdir(hocrfolder)
        elif onExistExit:  # if set don't do anything
            print (" ---- exists (ocrj)")
            return

        # sorted() only makes the processing/log order deterministic.
        for fn in sorted(os.listdir(pagesfolder)):
            listArguments = [
                self.TESSERACT_BIN,
                "-l", self.OCR_LANGUAGE,
                pagesfolder + fn,
                hocrfolder + fn,
                "hocr",
            ]
            proc = subprocess.Popen(listArguments, stdout=subprocess.PIPE)
            try:
                # readline() returns b'' at end of stream.
                for line in iter(proc.stdout.readline, b''):
                    # the real code does filtering here
                    print (line.rstrip())
            finally:
                # Close the pipe and reap the child so that a long batch
                # does not accumulate open descriptors and zombies.
                proc.stdout.close()
                status = proc.wait()
            if status != 0:
                print (" ---- tesseract failed (exit status %s): %s"
                       % (status, pagesfolder + fn))
90c0df483890 inital3
dwinter
parents:
diff changeset
81
90c0df483890 inital3
dwinter
parents:
diff changeset
82
90c0df483890 inital3
dwinter
parents:
diff changeset
83
90c0df483890 inital3
dwinter
parents:
diff changeset
import sys

# Batch driver: split and OCR every "Jahrbuch" folder below `foldername`.
#
# Usage: doOCR.py [start [end]]
#   start  index of the first directory entry to process (inclusive)
#   end    index of the last directory entry to process (inclusive)
# Indices count ALL entries of the sorted directory listing, including the
# ones that are skipped because they are not "Jahrbuch" folders.
om = ocrManager()

args = sys.argv

start=None
end=None
if len(args) > 1: # first folder number
    start = int(args[1])

if len(args) > 2: # last folder number
    end = int(args[2])


foldername = "/Volumes/MPGARCHIV/struct2/"

cnt=-1
# os.listdir() returns entries in arbitrary order; sort them so that a
# given start/end pair selects the same folders on every run.
for cnt, fn in enumerate(sorted(os.listdir(foldername))):
    if "Jahrbuch" not in fn:
        print ("DON't do:"+fn)
        continue
    if start is not None and cnt<start:
        print("SKIP: %s"%fn)
        continue
    if end is not None and cnt>end:
        # Past the requested range: stop the whole run.
        sys.exit()
    print("PROCESS: %s"%fn)

    # Both steps skip folders whose output directory already exists, so an
    # interrupted batch can simply be restarted.
    om.split(foldername+fn,onExistExit=True)
    om.ocr(foldername+fn,onExistExit=True)