annotate doOCR_old.py @ 2:90c0df483890 default tip

inital3
author dwinter
date Tue, 19 Aug 2014 14:25:29 +0200
parents 5b7ed10ecbb4
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
5b7ed10ecbb4 initial2
dwinter
parents: 0
diff changeset
"""Copy every PDF found below ``start`` into its own folder below ``target``.

For each ``*.pdf`` file a directory ``<target>/<document>/full`` is created and
the PDF is copied into it, with spaces in all generated names replaced by
underscores.  Source files whose copy fails are listed, one path per line, in
``/tmp/error.out``.
"""
from _csv import Error  # not referenced by this script; kept from the original
import os
import os.path
import shutil

# Alternative source root used previously: "/Volumes/Folivora/MPG_Archiv/"
start = "/Volumes/Folivora/MPG_Archiv/Publikationen der MPG"
target = "/Volumes/MPGARCHIV/struct2/"

pathAdd = True  # prefix the target name with the name of the PDF's parent folder

ERROR_LOG = "/tmp/error.out"


def _document_dir(root, name, target_root, path_add):
    """Return the per-document directory below *target_root* for PDF *name*.

    The directory name is the file name without its extension, with spaces
    turned into underscores.  When *path_add* is true it is prefixed with the
    (underscored) name of the folder containing the file and ``"___"``.
    """
    doc_name = name.replace(" ", "_")
    if path_add:
        fld = os.path.split(root)
        print(fld)
        doc_name = fld[1].replace(" ", "_") + "___" + doc_name
    return os.path.splitext(os.path.join(target_root, doc_name))[0]


def copy_pdfs(source_root, target_root, path_add, error_log):
    """Walk *source_root* bottom-up and copy each PDF into *target_root*.

    Args:
        source_root: Directory tree that is searched for ``.pdf`` files
            (the extension match is case-sensitive).
        target_root: Directory below which ``<document>/full/<file>.pdf`` is
            created for every PDF found.
        path_add: Whether the parent folder name becomes part of the
            document directory name (see ``_document_dir``).
        error_log: Writable text stream; the source path of every PDF whose
            copy raised ``OSError`` is appended to it, one per line.

    Documents whose directory already exists are skipped and reported on
    stdout with an ``EXISTS:`` prefix.
    """
    for root, _dirs, files in os.walk(source_root, topdown=False):
        for name in files:
            path = os.path.join(root, name)
            print(name)
            if os.path.splitext(path)[1] != ".pdf":
                continue

            doc_dir = _document_dir(root, name, target_root, path_add)
            if os.path.exists(doc_dir):
                print("EXISTS:" + path)
                continue

            # makedirs creates doc_dir itself as an intermediate directory.
            full_dir = os.path.join(doc_dir, "full")
            os.makedirs(full_dir)

            try:
                shutil.copy(path, os.path.join(full_dir, name.replace(" ", "_")))
            except OSError:
                # Best effort: record the failure and carry on with the next
                # file.  shutil.Error is a subclass of OSError, so it is
                # covered as well; KeyboardInterrupt is no longer swallowed.
                error_log.write(path + "\n")

            print(path)


# The context manager closes (and flushes) the log even if the walk aborts.
with open(ERROR_LOG, "w", encoding="utf-8") as errorf:
    # Newline keeps the first logged path from being glued onto the marker.
    errorf.write("START\n")
    copy_pdfs(start, target, pathAdd, errorf)
5b7ed10ecbb4 initial2
dwinter
parents: 0
diff changeset
45
0
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
46
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
47
1
5b7ed10ecbb4 initial2
dwinter
parents: 0
diff changeset
48 #for name in dirs:
5b7ed10ecbb4 initial2
dwinter
parents: 0
diff changeset
49 # print(os.path.join(root, name))
0
5e33fa5a2fdc initial
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff changeset
50
1
5b7ed10ecbb4 initial2
dwinter
parents: 0
diff changeset
51