Mercurial > hg > ocrHandling
comparison doOCR2.py @ 1:5b7ed10ecbb4
initial2
author | dwinter |
---|---|
date | Tue, 19 Aug 2014 14:24:36 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:5e33fa5a2fdc | 1:5b7ed10ecbb4 |
---|---|
1 import shlex | |
2 import subprocess | |
3 import os.path | |
4 import os | |
5 | |
class ocrManager:
    """Split scanned PDFs into page images and OCR those pages.

    Both steps shell out to external tools (ImageMagick ``convert`` and
    ``tesseract``).  The tool locations and the OCR language are class
    attributes so they can be overridden on an instance or in a subclass
    without touching the methods.
    """

    # Absolute paths of the external programs that get invoked.
    CONVERT_BIN = "/usr/local/bin/convert"
    TESSERACT_BIN = "/usr/local/bin/tesseract"
    # Language code handed to tesseract via "-l".
    OCR_LANGUAGE = "deu"

    def split(self, foldername, onExistExit=False):
        """Render the PDF below *foldername* into one PNG per page.

        The last path component of *foldername* must contain ``___``; the
        part after the first ``___`` is taken as the PDF base name, and the
        input file is ``<foldername>/full/<name>.pdf``.  Page images are
        written to ``<foldername>/pages/img-NNN.png``.

        Parameters:
            foldername: folder holding the ``full/`` sub-folder with the PDF.
            onExistExit: if true and ``pages/`` already exists, print a note
                and return without converting anything.

        Returns:
            None.

        Raises:
            IndexError: if the folder name contains no ``___`` separator.
            OSError: if the output folder cannot be created or the
                ``convert`` binary cannot be started.
        """
        filename = foldername.split("/")[-1].split("___")[1]
        outfolder = foldername + "/pages/"

        if not os.path.exists(outfolder):
            os.mkdir(outfolder)
        elif onExistExit:  # if set don't do anything
            print(" ---- exists (split)")
            return

        # Pass the arguments as a list instead of building a command string
        # and re-splitting it: paths containing spaces or quotes then reach
        # the tool as one argument each.  "%03d" is expanded by convert into
        # the zero-padded page number.
        listArguments = [
            self.CONVERT_BIN,
            "-verbose",
            "-density", "300",
            foldername + "/full/" + filename + ".pdf",
            outfolder + "img-%03d.png",
        ]
        returncode = subprocess.call(listArguments)
        if returncode != 0:
            # Report the failure instead of dropping the exit status; stay
            # best-effort (no exception) like the original behaviour.
            print(" ---- convert failed (exit code %s)" % returncode)

    def ocr(self, foldername, onExistExit=False):
        """Run tesseract on every file in ``<foldername>/pages/``.

        For each page file ``fn`` the call is equivalent to
        ``tesseract -l <OCR_LANGUAGE> pages/fn hocr/fn hocr``; everything
        tesseract writes to stdout is echoed line by line.

        Parameters:
            foldername: folder containing the ``pages/`` sub-folder.
            onExistExit: if true and ``hocr/`` already exists, print a note
                and return without running OCR.

        Returns:
            None.

        Raises:
            OSError: if ``hocr/`` cannot be created, ``pages/`` cannot be
                listed, or the ``tesseract`` binary cannot be started.
        """
        pagesfolder = foldername + "/pages/"
        hocrfolder = foldername + "/hocr/"

        if not os.path.exists(hocrfolder):
            os.mkdir(hocrfolder)
        elif onExistExit:  # if set don't do anything
            print(" ---- exists (ocrj)")
            return

        # sorted(): os.listdir returns entries in arbitrary order; process
        # the pages in a stable, predictable sequence.
        for fn in sorted(os.listdir(pagesfolder)):
            listArguments = [
                self.TESSERACT_BIN,
                "-l", self.OCR_LANGUAGE,
                pagesfolder + fn,
                hocrfolder + fn,
                "hocr",
            ]
            exeShell = subprocess.Popen(listArguments, stdout=subprocess.PIPE)
            try:
                # readline returns b'' at end of stream.
                for line in iter(exeShell.stdout.readline, b''):
                    print(line.rstrip())
            finally:
                # Close the pipe and reap the child so that a long run does
                # not pile up open descriptors and zombie processes.
                exeShell.stdout.close()
                returncode = exeShell.wait()
            if returncode != 0:
                print(" ---- tesseract failed (exit code %s): %s" % (returncode, fn))
81 | |
82 | |
83 | |
import sys

# Manager instance used for this run.
om = ocrManager()

args = sys.argv

# Optional positional arguments: first and last folder number.
# Both are parsed here but not used anywhere in the visible code.
start = int(args[1]) if len(args) > 1 else None
end = int(args[2]) if len(args) > 2 else None

# Base folder and counter; neither is used in the visible code.
foldername = "/Volumes/MPGARCHIV/struct2/"
cnt = -1

# OCR one fixed folder, skipping it when its hocr/ output already exists.
om.ocr("/Volumes/MPGARCHIV/israel", onExistExit=True)