annotate harvestToPurl.py @ 40:671dd1e4bd09 default tip

minor bug
author dwinter
date Wed, 05 Mar 2014 10:20:54 +0100
parents a33fa2377075
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
36
be8640c08d99 Updatedri in harbestToPUrl geändert.
dwinter
parents: 10
diff changeset
1 # -*- coding: utf-8 -*-
0
dwinter
parents:
diff changeset
2 '''
36
be8640c08d99 Updatedri in harbestToPUrl geändert.
dwinter
parents: 10
diff changeset
3
0
dwinter
parents:
diff changeset
4 Created on 31.10.2012
dwinter
parents:
diff changeset
5
dwinter
parents:
diff changeset
6 @author: dwinter
36
be8640c08d99 Updatedri in harbestToPUrl geändert.
dwinter
parents: 10
diff changeset
7
be8640c08d99 Updatedri in harbestToPUrl geändert.
dwinter
parents: 10
diff changeset
8 wesentlich hier ist: harvestIndexMeta
0
dwinter
parents:
diff changeset
9 '''
dwinter
parents:
diff changeset
10
36
be8640c08d99 Updatedri in harbestToPUrl geändert.
dwinter
parents: 10
diff changeset
11
be8640c08d99 Updatedri in harbestToPUrl geändert.
dwinter
parents: 10
diff changeset
12
10
1b2d74f94ca8 repackaging
dwinter
parents: 4
diff changeset
13 import managePurls.manageIndexMetaPURLs as manageIndexMetaPURLs
39
a33fa2377075 outfiles are now set as parameter
dwinter
parents: 37
diff changeset
14 from addDriToIndexMeta import getDRIfromIndexMeta
0
dwinter
parents:
diff changeset
15
dwinter
parents:
diff changeset
16
dwinter
parents:
diff changeset
17 import os
dwinter
parents:
diff changeset
18 from os.path import join, getsize
dwinter
parents:
diff changeset
19 import sys
dwinter
parents:
diff changeset
20 import re
3
caeede0c9464 update and redirector
dwinter
parents: 0
diff changeset
21 from lxml import etree
0
dwinter
parents:
diff changeset
22
3
caeede0c9464 update and redirector
dwinter
parents: 0
diff changeset
def _replacePathPrefix(value, delpath, replacepath):
    """Return value with a leading delpath replaced by replacepath.

    The prefix is compared literally, so characters such as "." or "+" in
    delpath are not interpreted as regular-expression syntax, and
    backslashes in replacepath are not interpreted as group references.
    A value that does not start with delpath is returned unchanged.
    """
    if value.startswith(delpath):
        return replacepath + value[len(delpath):]
    return value


def harvestIndexMeta(path,user,delpath="",replacepath="", update=False):
    """
    Walk path recursively and register every "*.meta" file with the PURL manager.

    For each file whose name ends with ".meta" the DRI is read with
    getDRIfromIndexMeta, an image path is derived with createImagePath, and
    both the file path and the image path are passed to
    IndexMetaPURLManager.register(). The outcome (found / updated / added)
    is printed for every file.

    path        -- directory tree to scan.
    user        -- user name handed through to register().
    delpath     -- optional leading part of the file system path that is
                   removed before the path is looked up, e.g.
                   path="/mpiwg/online/permanent" and delpath="/mpiwg/online"
                   results in lookups for paths starting with "/permanent".
    replacepath -- optional replacement for the part removed by delpath, e.g.
                   path="/Volumes/online_permanent/library",
                   delpath="/Volumes/online_permanent",
                   replacepath="/permanent" results in lookups for paths
                   starting with "/permanent".
    update      -- handed through to register().

    Notes carried over from the original author (behaviour of register() is
    not visible in this file -- verify against managePurls):
    - If no DRI exists, a PURL is created in the database. It is NOT written
      back into the index.meta; the separate script "addDriToIndexMeta" has
      to be run for that.
    - If a DRI exists and update is set, the DRI from the index.meta is
      written to the database for that path.

    Directories named "pageimg" and directories whose name starts with "."
    or ":" are not descended into.
    """
    md = manageIndexMetaPURLs.IndexMetaPURLManager()

    # Open the parse-error log once for the whole run. Re-opening it in "w"
    # mode for every file truncated the log each time and leaked handles.
    # NOTE(review): assumes getDRIfromIndexMeta only writes to the handle and
    # does not close it -- confirm in addDriToIndexMeta.
    with open("/tmp/HarvestToPurlParseErrors.txt","w") as parseErrorFile:
        for root, dirs, files in os.walk(path):

            for name in files:
                if not name.endswith(".meta"):
                    continue

                fl=join(root, name)

                # Both helpers need the real file system path, so they run
                # before the prefix is rewritten.
                driIndexMeta=getDRIfromIndexMeta(fl,parseErrorFile=parseErrorFile)
                imagePath=createImagePath(fl,root)

                # Swap the part of the path that starts with delpath.
                imagePath=_replacePathPrefix(imagePath,delpath,replacepath)
                fl=_replacePathPrefix(fl,delpath,replacepath)

                val,purl = md.register(fl, True, user=user,imagePath=imagePath,driIndexMeta=driIndexMeta,update=update)
                try:
                    if val==manageIndexMetaPURLs.ALREADY_EXISTS:
                        print("found %s -> %s"%(fl,purl))
                    elif val==manageIndexMetaPURLs.UPDATED:
                        print("updated %s -> %s"%(fl,purl))
                    else:
                        print("added %s -> %s"%(fl,purl))
                except Exception:
                    # Best effort: a path that cannot be printed (e.g. an
                    # encoding problem) must not abort the harvest.
                    print("cannot print: %s"%purl)

            # Prune in place so os.walk skips these directories. Building a
            # new list avoids removing entries from the list being iterated,
            # which skipped the element after each removed one.
            dirs[:] = [d for d in dirs
                       if d != "pageimg" and not d.startswith((".", ":"))]
107f13ca333b try except added
dwinter
parents: 3
diff changeset
81
0
dwinter
parents:
diff changeset
82
3
caeede0c9464 update and redirector
dwinter
parents: 0
diff changeset
# Derives an image path for a meta file that has no texttool tag.
def createImagePath(path,root):
    """
    Return the image folder belonging to the meta file at path, or "".

    path -- meta file to parse as XML.
    root -- directory in which the image folders are looked up.

    An empty string is returned when the file cannot be parsed, when the
    document contains a <texttool> element, or when neither "pageimg" nor
    "pages" exists below root. Otherwise the first of these two folders that
    exists is returned, joined onto root.
    """
    print("parsing: %s"%path)
    try:
        tree= etree.parse(path)
    except Exception:
        # Unreadable or malformed files are reported and skipped. Catching
        # Exception instead of a bare except lets Ctrl-C and sys.exit through.
        print("cannot parse %s"%path)
        return ""

    # A texttool tag means no image path is derived here.
    if tree.xpath('//texttool'):
        return ""

    # Otherwise use a heuristic: take the first well-known image folder that
    # exists next to the meta file.
    for imageFolder in ("pageimg","pages"):
        candidate=join(root, imageFolder)
        if os.path.exists(candidate):
            return candidate

    return ""
caeede0c9464 update and redirector
dwinter
parents: 0
diff changeset
108
caeede0c9464 update and redirector
dwinter
parents: 0
diff changeset
109
caeede0c9464 update and redirector
dwinter
parents: 0
diff changeset
110
0
dwinter
parents:
diff changeset
111
dwinter
parents:
diff changeset
if __name__ == '__main__':
    # Command line: path user [pathPrefixToDelete [replacedeleted]]
    cliArgs = sys.argv[1:]
    argCount = len(cliArgs)

    if argCount not in (2, 3, 4):
        print("USAGE: python harvestToPurl.py path user (optional)pathPrefixToDelete (optional)replacedeleted")
        sys.exit(2)

    path, user = cliArgs[0], cliArgs[1]

    # The two optional arguments default to the empty string when omitted.
    delpath = cliArgs[2] if argCount >= 3 else ""
    replacepath = cliArgs[3] if argCount == 4 else ""

    if not os.path.exists(path):
        print("ERROR: path %s does not exist!"%path)
        sys.exit(2)

    # The update flag is always passed as False from the command line.
    harvestIndexMeta(path,user,delpath=delpath,replacepath=replacepath,update=False)
0
dwinter
parents:
diff changeset
135
4
107f13ca333b try except added
dwinter
parents: 3
diff changeset
136