Mercurial > hg > ocrHandling
comparison doOCR.py @ 0:5e33fa5a2fdc
initial
author | Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 19 Aug 2014 14:12:52 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5e33fa5a2fdc |
---|---|
1 import shlex | |
2 import subprocess | |
3 import os.path | |
4 import os | |
5 | |
class ocrManager:
    """Split scanned PDFs into page images and run tesseract on the pages.

    Folder layout used by the methods, relative to the ``foldername``
    argument: ``full/<name>.pdf`` is the input document, ``pages/``
    receives the page images and ``hocr/`` receives the OCR output.
    """

    # External tools and OCR language; override on an instance or subclass
    # when the binaries live elsewhere.
    convert_bin = "/usr/local/bin/convert"
    tesseract_bin = "/usr/local/bin/tesseract"
    ocr_language = "deu"

    def split(self, foldername):
        """Split the PDF belonging to *foldername* into one PNG per page.

        The last path component of *foldername* must have the form
        ``<prefix>___<name>[___...]``.  The document
        ``<foldername>/full/<name>.pdf`` is converted at density 300 and
        written to ``<foldername>/pages/`` using the output pattern
        ``img-%03d.png``.  The ``pages`` folder is created when missing.

        Raises:
            IndexError: if the folder name contains no ``___`` separator.
        """
        basename = foldername.split("/")[-1]
        parts = basename.split("___")
        if len(parts) < 2:
            # Same exception type as the bare list index used to raise,
            # but with a message that says which folder is wrong.
            raise IndexError(
                "folder name %r does not contain the '___' separator"
                % basename)
        filename = parts[1]

        outfolder = foldername + "/pages/"
        if not os.path.exists(outfolder):
            os.mkdir(outfolder)

        # Pass the arguments as a list instead of building a string and
        # re-splitting it, so paths with blanks or quotes stay intact.
        command = [
            self.convert_bin,
            "-verbose",
            "-density", "300",
            foldername + "/full/" + filename + ".pdf",
            outfolder + "img-%03d.png",
        ]
        returncode = subprocess.call(command)
        if returncode != 0:
            print("FAILED (%s): %s" % (returncode, " ".join(command)))

    def ocr(self, foldername):
        """Run tesseract on every file in ``<foldername>/pages/``.

        For each page, tesseract is called with the page as input,
        ``<foldername>/hocr/<page file name>`` as output base and the
        ``hocr`` config.  The ``hocr`` folder is created when missing.
        Everything tesseract writes to stdout is echoed line by line.
        """
        pagesfolder = foldername + "/pages/"
        hocrfolder = foldername + "/hocr/"

        if not os.path.exists(hocrfolder):
            os.mkdir(hocrfolder)

        # Sorted so the pages are processed in a reproducible order.
        for fn in sorted(os.listdir(pagesfolder)):
            command = [
                self.tesseract_bin,
                "-l", self.ocr_language,
                pagesfolder + fn,
                hocrfolder + fn,
                "hocr",
            ]
            process = subprocess.Popen(command, stdout=subprocess.PIPE)
            try:
                # readline() returns b'' at end of stream.
                for line in iter(process.stdout.readline, b''):
                    print (line.rstrip())
            finally:
                # Close the pipe and reap the child; otherwise every page
                # leaves a zombie process and an open descriptor behind.
                process.stdout.close()
                returncode = process.wait()
            if returncode != 0:
                print("FAILED (%s): %s" % (returncode, " ".join(command)))
72 | |
73 | |
74 | |
import sys

# Batch driver: split and OCR every document folder below `foldername`.
# Usage: doOCR.py [start [end]]
#   start  index of the first folder to process (entries before it are skipped)
#   end    index of the last folder to process (inclusive)
# Indices count the entries of the sorted folder listing, starting at 0.
om = ocrManager()

args = sys.argv

start = None
end = None
if len(args) > 1:  # index of the first folder
    start = int(args[1])

if len(args) > 2:  # index of the last folder
    end = int(args[2])


foldername = "/Volumes/MPGARCHIV/struct2/"

cnt = -1
# os.listdir() returns entries in arbitrary order; sort them so that the
# start/end indices select the same folders on every run.
for fn in sorted(os.listdir(foldername)):
    cnt += 1

    if start is not None and cnt < start:
        print("SKIP: %s" % fn)
        continue

    if end is not None and cnt > end:
        sys.exit()

    # om.split() needs a folder named <prefix>___<name>; plain files and
    # other entries would otherwise abort the whole batch with IndexError.
    if "___" not in fn or not os.path.isdir(foldername + fn):
        print("SKIP: %s" % fn)
        continue

    print("PROCESS: %s" % fn)

    om.split(foldername + fn)
    om.ocr(foldername + fn)