0
|
1 '''
|
|
2 Created on 31.10.2012
|
|
3
|
|
4 @author: dwinter
|
|
5 '''
|
|
6
|
10
|
7 import managePurls.manageIndexMetaPURLs as manageIndexMetaPURLs
|
0
|
8
|
|
9
|
|
10
|
|
11 import os
|
|
12 from os.path import join, getsize
|
|
13 import sys
|
|
14 import re
|
3
|
15 from lxml import etree
|
0
|
16
|
3
|
def _replacePathPrefix(text, prefix, replacement):
    """Return text with a leading literal prefix swapped for replacement."""
    if text.startswith(prefix):
        return replacement + text[len(prefix):]
    return text


def _isSkippedDir(dirName):
    """Tell whether the directory walk must not descend into dirName."""
    return dirName == "pageimg" or dirName.startswith((".", ":"))


def harvestIndexMeta(path,user,delpath="",replacepath="", update=False):
    """Walk path and register every "*.meta" file with the PURL manager.

    For each file whose name ends in ".meta" an image path is derived with
    createImagePath(), the leading delpath of both the file path and the
    image path is replaced by replacepath, and the result is handed to
    IndexMetaPURLManager.register().  The outcome of each registration is
    printed.

    Directories named "pageimg" and directories whose name starts with "."
    or ":" are not descended into.

    Args:
        path: root directory of the walk.
        user: passed through to register() as ``user``.
        delpath: literal path prefix to strip from registered paths.
        replacepath: text put in place of a stripped delpath.
        update: passed through to register() as ``update``.
    """
    md = manageIndexMetaPURLs.IndexMetaPURLManager()

    for root, dirs, files in os.walk(path):
        for name in files:
            if not name.endswith(".meta"):
                continue

            fl = join(root, name)

            # The image path has to be derived from the real on-disk
            # location, so the prefix is rewritten only afterwards.
            imagePath = createImagePath(fl, root)

            # Plain string prefix replacement: delpath and replacepath are
            # file system paths, not regular expressions, so characters
            # such as "." or "+" or "\" must not be interpreted.
            imagePath = _replacePathPrefix(imagePath, delpath, replacepath)
            fl = _replacePathPrefix(fl, delpath, replacepath)

            # NOTE(review): the meaning of the positional True is defined by
            # manageIndexMetaPURLs.register -- confirm there.
            val, purl = md.register(fl, True, user=user, imagePath=imagePath, update=update)

            # Reporting is best effort: a path that cannot be printed must
            # not abort the harvest.  Only ordinary errors are caught so
            # that Ctrl-C and sys.exit() still stop the run.
            try:
                if val == manageIndexMetaPURLs.ALREADY_EXISTS:
                    print("found %s -> %s" % (fl, purl))
                elif val == manageIndexMetaPURLs.UPDATED:
                    print("updated %s -> %s" % (fl, purl))
                else:
                    print("added %s -> %s" % (fl, purl))
            except Exception:
                print("cannot print: %s" % purl)

        # Prune in place so os.walk honours it.  A filtered slice assignment
        # is used because removing entries while iterating over the same
        # list skips the entry that follows each removed one.
        dirs[:] = [d for d in dirs if not _isSkippedDir(d)]
|
|
55
|
0
|
56
|
3
|
# Creates an image path when the file contains no texttool tag.
def createImagePath(path,root):
    """Return the image folder belonging to the metadata file at path.

    The file is parsed as XML.  If it cannot be parsed, or if it contains a
    ``texttool`` element, no image path is produced.  Otherwise the first
    existing entry among ``root/pageimg`` and ``root/pages`` is returned.

    Args:
        path: location of the metadata file to parse.
        root: directory in which the image folders are looked up.

    Returns:
        The path of the image folder that was found, or "" when there is
        none (parse failure, texttool element present, no folder on disk).
    """
    print("parsing: %s" % path)
    try:
        tree = etree.parse(path)
    except Exception:
        # Best effort: an unreadable or malformed file is reported and
        # skipped.  KeyboardInterrupt/SystemExit are deliberately not
        # caught so a running harvest can still be interrupted.
        print("cannot parse %s" % path)
        return ""

    # A texttool tag anywhere in the document means: no image path.
    if tree.xpath('//texttool'):
        return ""

    # Otherwise fall back to a heuristic over the known folder names.
    for imageFolder in ("pageimg", "pages"):
        candidate = join(root, imageFolder)
        if os.path.exists(candidate):  # one of the folders exists
            return candidate

    return ""
|
|
82
|
|
83
|
|
84
|
0
|
85
|
|
if __name__ == '__main__':
    # Command line: path user [pathPrefixToDelete [replacedeleted]]
    args = sys.argv[1:]
    if len(args) not in (2, 3, 4):
        print("USAGE: python harvestToPurl.py path user (optional)pathPrefixToDelete (optional)replacedeleted")
        sys.exit(2)

    path, user = args[0], args[1]

    # The third and fourth arguments are optional and default to "".
    delpath = args[2] if len(args) >= 3 else ""
    replacepath = args[3] if len(args) == 4 else ""

    # Refuse to start when the directory to harvest is missing.
    if not os.path.exists(path):
        print("ERROR: path %s does not exist!" % path)
        sys.exit(2)

    # The command line entry point always runs with update=True.
    harvestIndexMeta(path, user, delpath=delpath, replacepath=replacepath, update=True)
|
0
|
109
|
4
|
110
|