annotate doOCR.py @ 0:5e33fa5a2fdc

initial
author Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
date Tue, 19 Aug 2014 14:12:52 +0200
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
1 import shlex
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
2 import subprocess
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
3 import os.path
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
4 import os
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
5
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
6 class ocrManager:
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
7
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
8 #strCommand = "/opt/local/bin/gm convert " + strImagepath + " -resize 1500x\\> " + strImagepath
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
9
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
10 def split(self,foldername):
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
11 """splits pdf file in foldername, default it assumes a folder full containing a file foldername.pdf"""
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
12
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
13 filename=foldername.split("/")[-1].split("___")[1]
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
14
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
15
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
16
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
17 outfolder = foldername + "/pages/"
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
18
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
19 if not os.path.exists(outfolder):
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
20 os.mkdir(outfolder)
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
21
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
22 outName = outfolder + "img" + "-\%03d.png"
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
23 #outName = outfolder + filename + "-%03d.png"
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
24 strCommand = "/usr/local/bin/convert -verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
25 #strCommand = "/usr/local/bin/gm convert -verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
26
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
27 listArguments = shlex.split(strCommand)
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
28 #listArguments = ["/usr/local/bin/gm","convert", "-verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName]
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
29 exeShell = subprocess.call(listArguments)
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
30
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
31
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
32
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
33 while False:
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
34 line = exeShell.stdout.readline()
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
35 if line != b'':
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
36 #the real code does filtering here
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
37 print (line.rstrip())
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
38 else:
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
39 break
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
40
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
41 def ocr(self,foldername):
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
42 filename=foldername.split("/")[-1]
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
43
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
44 #tesseract -l deu JB1953-17802.png JB1953-17802.tess hocr
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
45
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
46
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
47 pagesfolder = foldername + "/pages/"
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
48 hocrfolder = foldername + "/hocr/"
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
49
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
50 if not os.path.exists(hocrfolder):
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
51 os.mkdir(hocrfolder)
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
52
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
53
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
54
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
55 for fn in os.listdir(pagesfolder):
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
56
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
57 outName = hocrfolder + fn
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
58
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
59
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
60 strCommand = "/usr/local/bin/tesseract -l deu " + pagesfolder + fn +" " + hocrfolder + fn + " hocr"
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
61
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
62 listArguments = shlex.split(strCommand)
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
63 exeShell = subprocess.Popen(listArguments,stdout=subprocess.PIPE)
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
64
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
65 while True:
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
66 line = exeShell.stdout.readline()
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
67 if line != b'':
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
68 #the real code does filtering here
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
69 print (line.rstrip())
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
70 else:
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
71 break
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
72
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
73
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
74
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
75 import sys
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
76 om = ocrManager()
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
77
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
78 args = sys.argv
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
79
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
80 start=None
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
81 end=None
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
82 if len(args) > 1: #start foldernummer
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
83 start = int(args[1])
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
84
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
85 if len(args) > 2: #start foldernummer
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
86 end = int(args[2])
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
87
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
88
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
89 foldername = "/Volumes/MPGARCHIV/struct2/"
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
90
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
91 cnt=-1
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
92 for fn in os.listdir(foldername):
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
93 cnt+=1
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
94
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
95 if start is not None:
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
96 if cnt<start:
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
97 print("SKIP: %s"%fn)
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
98 continue
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
99
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
100 if end is not None:
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
101 if cnt>end:
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
102 sys.exit()
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
103 print("PROCESS: %s"%fn)
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
104
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
105
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
106
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
107 om.split(foldername+fn)
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
108 om.ocr(foldername+fn)