annotate doOCR_tmp.py @ 2:90c0df483890 default tip

inital3
author dwinter
date Tue, 19 Aug 2014 14:25:29 +0200
parents 5b7ed10ecbb4
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
5b7ed10ecbb4 initial2
dwinter
parents:
diff changeset
1 import shlex
5b7ed10ecbb4 initial2
dwinter
parents:
diff changeset
2 import subprocess
5b7ed10ecbb4 initial2
dwinter
parents:
diff changeset
3 import os.path
5b7ed10ecbb4 initial2
dwinter
parents:
diff changeset
4 import os
5b7ed10ecbb4 initial2
dwinter
parents:
diff changeset
5
5b7ed10ecbb4 initial2
dwinter
parents:
diff changeset
class ocrManager:
    """Run the two external steps of the OCR pipeline on one document folder.

    ``split`` calls ImageMagick ``convert`` on ``<folder>/full/<name>.pdf`` and
    writes the result to ``<folder>/pages/``; ``ocr`` calls ``tesseract`` on
    every entry of ``<folder>/pages/`` and writes to ``<folder>/hocr/``.

    Both commands are passed to ``subprocess`` as argument lists, so folder and
    file names containing whitespace or quote characters reach the tools intact.
    """

    # Absolute paths of the external tools and their fixed options.  Kept as
    # class attributes so they can be overridden without editing the methods.
    # (An earlier revision had a GraphicsMagick "gm convert" variant of the
    # split command commented out next to the ImageMagick one.)
    CONVERT_BIN = "/usr/local/bin/convert"
    TESSERACT_BIN = "/usr/local/bin/tesseract"
    DENSITY = "300"
    OCR_LANGUAGE = "deu"

    def split(self, foldername, onExistExit=False):
        """Split the PDF belonging to ``foldername`` into page images.

        The last path component of ``foldername`` must have the form
        ``<prefix>___<name>``; the input file is
        ``<foldername>/full/<name>.pdf`` and the output pattern handed to
        ``convert`` is ``<foldername>/pages/img-%03d.png``.

        Args:
            foldername: "/"-separated path of the document folder.
            onExistExit: if true and ``pages/`` already exists, print a note
                and return without running ``convert``.

        Returns:
            None.  A non-zero exit status of ``convert`` is reported on stdout.

        Raises:
            IndexError: if the folder name contains no ``___`` separator.
        """
        # A trailing "/" would make the last path component empty.
        foldername = foldername.rstrip("/")
        filename = foldername.split("/")[-1].split("___")[1]

        outfolder = foldername + "/pages/"

        if not os.path.exists(outfolder):
            os.mkdir(outfolder)
        elif onExistExit:  # if set don't do anything
            print (" ---- exists (split)")
            return

        command = [
            self.CONVERT_BIN,
            "-verbose",
            "-density", self.DENSITY,
            foldername + "/full/" + filename + ".pdf",
            # "%03d" is expanded by convert itself, not by Python.
            outfolder + "img-%03d.png",
        ]
        status = subprocess.call(command)
        if status != 0:
            print (" ---- failed (split): exit status %s" % status)

    def ocr(self, foldername, onExistExit=False):
        """Run tesseract on every entry of ``<foldername>/pages/``.

        For each entry ``fn`` the command is equivalent to
        ``tesseract -l deu <pages>/fn <hocr>/fn hocr``, e.g.
        ``tesseract -l deu JB1953-17802.png JB1953-17802.tess hocr``.
        Whatever tesseract writes to stdout is echoed line by line.

        Args:
            foldername: "/"-separated path of the document folder.
            onExistExit: if true and ``hocr/`` already exists, print a note
                and return without running tesseract.

        Returns:
            None.  A non-zero exit status of tesseract is reported on stdout.
        """
        pagesfolder = foldername + "/pages/"
        hocrfolder = foldername + "/hocr/"

        if not os.path.exists(hocrfolder):
            os.mkdir(hocrfolder)
        elif onExistExit:  # if set don't do anything
            print (" ---- exists (ocrj)")
            return

        # sorted(): os.listdir() gives no ordering guarantee; process the
        # pages in a reproducible order.
        for fn in sorted(os.listdir(pagesfolder)):
            command = [
                self.TESSERACT_BIN,
                "-l", self.OCR_LANGUAGE,
                pagesfolder + fn,
                hocrfolder + fn,
                "hocr",
            ]
            process = subprocess.Popen(command, stdout=subprocess.PIPE)
            try:
                # readline() returns b'' at end of stream.
                for line in iter(process.stdout.readline, b''):
                    # the real code does filtering here
                    print (line.rstrip())
            finally:
                # Close the pipe and reap the child so no zombie process or
                # open file descriptor is left behind for every page.
                process.stdout.close()
                status = process.wait()
            if status != 0:
                print (" ---- failed (ocr): %s exit status %s" % (fn, status))
5b7ed10ecbb4 initial2
dwinter
parents:
diff changeset
81
5b7ed10ecbb4 initial2
dwinter
parents:
diff changeset
82
5b7ed10ecbb4 initial2
dwinter
parents:
diff changeset
83
5b7ed10ecbb4 initial2
dwinter
parents:
diff changeset
import sys

# Batch driver: run split + ocr on the sub-folders of ``foldername``.
#
# Usage: doOCR_tmp.py [start [end]]
#   start  index of the first directory entry to process (optional)
#   end    index of the last directory entry to process, inclusive (optional)
# The index counts every entry of ``foldername`` in sorted order, including
# entries that are skipped because their name lacks "1953-1974".
om = ocrManager()

args = sys.argv

start = None
end = None
if len(args) > 1:  # index of the first folder to process
    start = int(args[1])

if len(args) > 2:  # index of the last folder to process (inclusive)
    end = int(args[2])


foldername = "/Volumes/MPGARCHIV/struct2/"

# sorted(): os.listdir() returns entries in arbitrary order, which would make
# the start/end indices mean different folders from one run to the next.
for cnt, fn in enumerate(sorted(os.listdir(foldername))):
    if fn.find("1953-1974") < 0:
        print ("DON't do:"+fn)
        continue
    if start is not None and cnt < start:
        print("SKIP: %s"%fn)
        continue

    if end is not None and cnt > end:
        # Past the requested range: stop the whole run.
        sys.exit()
    print("PROCESS: %s"%fn)

    om.split(foldername+fn,onExistExit=False)
    om.ocr(foldername+fn,onExistExit=False)