comparison doOCR.py @ 0:5e33fa5a2fdc

initial
author Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
date Tue, 19 Aug 2014 14:12:52 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5e33fa5a2fdc
1 import shlex
2 import subprocess
3 import os.path
4 import os
5
class ocrManager:
    """Run the external tools that turn a scanned PDF into per-page OCR output.

    Each method works on one folder laid out as::

        <foldername>/full/<name>.pdf   input PDF (read by ``split``)
        <foldername>/pages/            page images (written by ``split``)
        <foldername>/hocr/             tesseract output (written by ``ocr``)

    Commands are passed to ``subprocess`` as argument lists, so folder and
    file names containing spaces or shell metacharacters reach the tools
    unchanged.
    """

    # Absolute paths of the external executables.
    CONVERT_BIN = "/usr/local/bin/convert"
    TESSERACT_BIN = "/usr/local/bin/tesseract"

    def _convert_command(self, foldername):
        """Return the argument list that renders the folder's PDF to PNG pages.

        The PDF name is the second ``___``-separated field of the folder's
        base name.

        Raises:
            IndexError: if the base name contains no ``___`` separator.
        """
        filename = os.path.basename(foldername.rstrip("/")).split("___")[1]
        pdf_path = os.path.join(foldername, "full", filename + ".pdf")
        # %03d is expanded by convert itself: one numbered file per page.
        out_pattern = os.path.join(foldername, "pages", "img-%03d.png")
        return [self.CONVERT_BIN, "-verbose", "-density", "300",
                pdf_path, out_pattern]

    def _tesseract_command(self, foldername, page_name):
        """Return the argument list that OCRs one page image into hOCR.

        The output base keeps the full image file name (including its
        extension); tesseract appends its own suffix to it.
        """
        # Equivalent shell call: tesseract -l deu <pages>/<img> <hocr>/<img> hocr
        return [self.TESSERACT_BIN, "-l", "deu",
                os.path.join(foldername, "pages", page_name),
                os.path.join(foldername, "hocr", page_name),
                "hocr"]

    def split(self, foldername):
        """Split ``<foldername>/full/<name>.pdf`` into one PNG per page.

        The images are written to ``<foldername>/pages/`` (created if
        missing) as ``img-NNN.png``. The call blocks until convert has
        finished; its exit status is not inspected.

        Raises:
            IndexError: if the folder's base name contains no ``___``.
            OSError: if the output folder cannot be created or the convert
                executable cannot be started.
        """
        # Build the command first so a malformed folder name fails before
        # anything is created on disk.
        command = self._convert_command(foldername)

        outfolder = os.path.join(foldername, "pages")
        if not os.path.exists(outfolder):
            os.mkdir(outfolder)

        subprocess.call(command)

    def ocr(self, foldername):
        """Run tesseract on every file in ``<foldername>/pages/``.

        Results go to ``<foldername>/hocr/`` (created if missing). Each
        line tesseract writes to stdout is echoed with trailing whitespace
        stripped. Pages are processed in sorted name order.

        Raises:
            OSError: if the pages folder cannot be listed, the output folder
                cannot be created, or tesseract cannot be started.
        """
        pagesfolder = os.path.join(foldername, "pages")
        hocrfolder = os.path.join(foldername, "hocr")

        if not os.path.exists(hocrfolder):
            os.mkdir(hocrfolder)

        for fn in sorted(os.listdir(pagesfolder)):
            proc = subprocess.Popen(self._tesseract_command(foldername, fn),
                                    stdout=subprocess.PIPE)
            try:
                # readline() returns b'' only at EOF, i.e. when the child
                # has closed its stdout.
                for line in iter(proc.stdout.readline, b''):
                    print(line.rstrip())
            finally:
                # Release the pipe and reap the child so that long batch
                # runs do not accumulate open descriptors or zombies.
                proc.stdout.close()
                proc.wait()
72
73
74
import sys

# Driver: run split + OCR over the sub-folders of the archive root.
#
# Usage: doOCR.py [start [end]]
#   start  index of the first folder to process (earlier ones are skipped)
#   end    index of the last folder to process (inclusive)
# Indexes are zero-based positions in the sorted folder listing.
om = ocrManager()

args = sys.argv

start = None
end = None
if len(args) > 1:  # index of the first folder to process
    start = int(args[1])

if len(args) > 2:  # index of the last folder to process (inclusive)
    end = int(args[2])


foldername = "/Volumes/MPGARCHIV/struct2/"

# os.listdir() returns entries in arbitrary order; sorting makes the
# start/end window select the same folders on every run.
cnt = -1
for cnt, fn in enumerate(sorted(os.listdir(foldername))):
    if start is not None and cnt < start:
        print("SKIP: %s" % fn)
        continue

    if end is not None and cnt > end:
        sys.exit()

    print("PROCESS: %s" % fn)

    om.split(foldername + fn)
    om.ocr(foldername + fn)