0
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
1 import shlex
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
2 import subprocess
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
3 import os.path
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
4 import os
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
5
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
6 class ocrManager:
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
7
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
8 #strCommand = "/opt/local/bin/gm convert " + strImagepath + " -resize 1500x\\> " + strImagepath
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
9
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
10 def split(self,foldername):
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
11 """splits pdf file in foldername, default it assumes a folder full containing a file foldername.pdf"""
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
12
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
13 filename=foldername.split("/")[-1].split("___")[1]
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
14
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
15
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
16
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
17 outfolder = foldername + "/pages/"
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
18
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
19 if not os.path.exists(outfolder):
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
20 os.mkdir(outfolder)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
21
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
22 outName = outfolder + "img" + "-\%03d.png"
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
23 #outName = outfolder + filename + "-%03d.png"
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
24 strCommand = "/usr/local/bin/convert -verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
25 #strCommand = "/usr/local/bin/gm convert -verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
26
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
27 listArguments = shlex.split(strCommand)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
28 #listArguments = ["/usr/local/bin/gm","convert", "-verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName]
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
29 exeShell = subprocess.call(listArguments)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
30
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
31
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
32
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
33 while False:
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
34 line = exeShell.stdout.readline()
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
35 if line != b'':
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
36 #the real code does filtering here
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
37 print (line.rstrip())
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
38 else:
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
39 break
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
40
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
41 def ocr(self,foldername):
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
42 filename=foldername.split("/")[-1]
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
43
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
44 #tesseract -l deu JB1953-17802.png JB1953-17802.tess hocr
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
45
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
46
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
47 pagesfolder = foldername + "/pages/"
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
48 hocrfolder = foldername + "/hocr/"
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
49
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
50 if not os.path.exists(hocrfolder):
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
51 os.mkdir(hocrfolder)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
52
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
53
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
54
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
55 for fn in os.listdir(pagesfolder):
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
56
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
57 outName = hocrfolder + fn
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
58
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
59
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
60 strCommand = "/usr/local/bin/tesseract -l deu " + pagesfolder + fn +" " + hocrfolder + fn + " hocr"
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
61
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
62 listArguments = shlex.split(strCommand)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
63 exeShell = subprocess.Popen(listArguments,stdout=subprocess.PIPE)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
64
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
65 while True:
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
66 line = exeShell.stdout.readline()
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
67 if line != b'':
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
68 #the real code does filtering here
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
69 print (line.rstrip())
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
70 else:
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
71 break
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
72
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
73
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
74
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
75 import sys
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
76 om = ocrManager()
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
77
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
78 args = sys.argv
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
79
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
80 start=None
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
81 end=None
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
82 if len(args) > 1: #start foldernummer
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
83 start = int(args[1])
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
84
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
85 if len(args) > 2: #start foldernummer
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
86 end = int(args[2])
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
87
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
88
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
89 foldername = "/Volumes/MPGARCHIV/struct2/"
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
90
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
91 cnt=-1
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
92 for fn in os.listdir(foldername):
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
93 cnt+=1
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
94
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
95 if start is not None:
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
96 if cnt<start:
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
97 print("SKIP: %s"%fn)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
98 continue
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
99
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
100 if end is not None:
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
101 if cnt>end:
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
102 sys.exit()
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
103 print("PROCESS: %s"%fn)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
104
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
105
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
106
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
107 om.split(foldername+fn)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
108 om.ocr(foldername+fn)
|