annotate addDriToIndexMeta.py @ 40:671dd1e4bd09 default tip

minor bug
author dwinter
date Wed, 05 Mar 2014 10:20:54 +0100
parents a33fa2377075
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
3ebe37d81071 addDri added
dwinter
parents:
diff changeset
1 '''
3ebe37d81071 addDri added
dwinter
parents:
diff changeset
2 Created on 01.11.2012
3ebe37d81071 addDri added
dwinter
parents:
diff changeset
3
3ebe37d81071 addDri added
dwinter
parents:
diff changeset
4 @author: dwinter
3ebe37d81071 addDri added
dwinter
parents:
diff changeset
5 '''
3ebe37d81071 addDri added
dwinter
parents:
diff changeset
6 import os
10
1b2d74f94ca8 repackaging
dwinter
parents: 8
diff changeset
7 import managePurls.manageIndexMetaPURLs as manageIndexMetaPURLs
5
3ebe37d81071 addDri added
dwinter
parents:
diff changeset
8 import re
3ebe37d81071 addDri added
dwinter
parents:
diff changeset
9 from lxml import etree
30
bcd8076ff7ec random selection of entries
dwinter
parents: 22
diff changeset
10 import sys
5
3ebe37d81071 addDri added
dwinter
parents:
diff changeset
11
3ebe37d81071 addDri added
dwinter
parents:
diff changeset
12 from os.path import join, getsize
3ebe37d81071 addDri added
dwinter
parents:
diff changeset
13
3ebe37d81071 addDri added
dwinter
parents:
diff changeset
14
30
bcd8076ff7ec random selection of entries
dwinter
parents: 22
diff changeset
15
bcd8076ff7ec random selection of entries
dwinter
parents: 22
diff changeset
def correctAuthor(tree):
    """Replace newlines in the author fields of *tree* by "; ".

    tree -- parsed document offering ``xpath()``; every element matched by
        ``/resource/meta/bib/author`` is inspected and updated in place.

    Returns True if the text of at least one author element was modified,
    otherwise False.
    """
    changed = False
    for author in tree.xpath("/resource/meta/bib/author"):
        text = author.text
        if text is None:
            # empty <author/> element: nothing to normalise
            continue

        # NOTE(review): the original (German) docstring spoke of "\r", but
        # the code has only ever replaced "\n" -- confirm which is intended.
        fixed = text.replace("\n", "; ")
        if fixed != text:
            author.text = fixed
            changed = True

    return changed
36
be8640c08d99 Updatedri in harbestToPUrl ge?ndert.
dwinter
parents: 31
diff changeset
34
be8640c08d99 Updatedri in harbestToPUrl ge?ndert.
dwinter
parents: 31
diff changeset
35
39
a33fa2377075 outfiles are now set as parameter
dwinter
parents: 36
diff changeset
def getDRIfromIndexMeta(fl,parseErrorFile=None):
    """Return the MPIWG DRI stored in the index.meta file *fl*.

    fl -- path of the index.meta file to parse.
    parseErrorFile -- optional writable file-like object that receives a
        "PARSE ERROR:<fl>" line when *fl* cannot be parsed.  If omitted,
        the line is appended to /tmp/addDRIParseErrors.txt.

    Returns the text of the first ``/resource/meta/dri[@type='mpiwg']``
    element, None if the document has no such element, or False if the
    file could not be parsed.
    """
    try:
        tree = etree.parse(fl)
    except Exception:
        message = "PARSE ERROR:"+fl+"\n"
        if parseErrorFile is not None:
            parseErrorFile.write(message)
        else:
            # Open the default log only when there is something to report,
            # append so that earlier entries survive, and close it again
            # (previously the file was truncated on every call and leaked).
            with open("/tmp/addDRIParseErrors.txt","a") as log:
                log.write(message)
        return False

    dris = tree.xpath("/resource/meta/dri[@type='mpiwg']")
    if not dris:
        return None
    return dris[0].text
be8640c08d99 Updatedri in harbestToPUrl ge?ndert.
dwinter
parents: 31
diff changeset
53
be8640c08d99 Updatedri in harbestToPUrl ge?ndert.
dwinter
parents: 31
diff changeset
54
39
a33fa2377075 outfiles are now set as parameter
dwinter
parents: 36
diff changeset
def addPURL(fl,purl,efiles,test=False):
    """Store *purl* as MPIWG DRI in the index.meta file *fl*.

    fl -- path of the index.meta file.
    purl -- value written as text of ``/resource/meta/dri[@type='mpiwg']``.
    efiles -- object with the writable attributes ``parseErrorFile``,
        ``errorFile`` and ``alreadyExistsFile`` used for logging.
    test -- if true the resulting document is only printed, never written.

    Newlines in the author fields are normalised through correctAuthor().
    Before *fl* is rewritten the original file is renamed to
    ``fl + "_mpiwg_dri"``.

    Returns False if *fl* cannot be parsed or has no ``/resource/meta``
    element.  Returns True otherwise -- also when renaming or writing
    failed; in that case the path is logged to ``efiles.errorFile``.
    """
    try:
        tree = etree.parse(fl)
    except Exception:
        efiles.parseErrorFile.write("PARSE ERROR:"+fl+"\n")
        return False

    dris = tree.xpath("/resource/meta/dri[@type='mpiwg']")
    changed = correctAuthor(tree)

    if not dris:
        # no DRI yet: create a new element below /resource/meta
        metas = tree.xpath("/resource/meta")
        if not metas:
            efiles.parseErrorFile.write("no resource/meta: %s \n"%fl)
            return False
        newDri = etree.Element("dri",type="mpiwg")
        newDri.text = purl
        metas[0].append(newDri)
    else:
        dris[0].text = purl
        efiles.alreadyExistsFile.write("%s \n"%fl)
        if not changed:
            # Authors untouched and a DRI is already present: skip the
            # rewrite.
            # NOTE(review): an existing DRI that differs from purl is
            # therefore not persisted -- confirm this is intended.
            return True

    print(etree.tostring(tree, pretty_print=True))

    if not test:
        try:
            # keep the untouched original next to the rewritten file
            os.rename(fl, fl+"_mpiwg_dri")
            out = etree.tostring(tree, encoding="UTF-8",xml_declaration=False)
            # tostring() with an explicit encoding yields bytes, hence "wb";
            # the with-block guarantees the file is closed (the former
            # "fo.close" without parentheses never closed it).
            with open(fl,"wb") as fo:
                fo.write(out)
        except Exception:
            excType, excValue = sys.exc_info()[:2]
            print(excType)
            print(excValue)
            efiles.errorFile.write(fl+"\n")
    return True
3ebe37d81071 addDri added
dwinter
parents:
diff changeset
101
39
a33fa2377075 outfiles are now set as parameter
dwinter
parents: 36
diff changeset
def addDriToIndexMeta(path,efiles,delpath="",replacepath="",test=False):
    """Walk *path* and add a DRI to every index.meta file found.

    path -- root directory of the walk.
    efiles -- log file holder, passed through to addPURL().
    delpath -- literal prefix that is cut off the file path before the
        PURL lookup.
    replacepath -- literal string put in place of *delpath*.
    test -- passed through to addPURL(); if true no file is written.

    Directories named "pageimg" and directories whose name starts with "."
    are not descended into.
    """
    md = manageIndexMetaPURLs.IndexMetaPURLManager()

    for root, dirs, files in os.walk(path):
        if "index.meta" in files:
            fl = join(root, "index.meta")
            # Plain prefix replacement: delpath is a path, not a regular
            # expression, so it must not be interpreted as a pattern.
            if fl.startswith(delpath):
                shortPath = replacepath + fl[len(delpath):]
            else:
                shortPath = fl
            purl = md.getPurl(shortPath)

            addPURL(fl,purl,efiles,test)

        # Prune in place so os.walk skips these directories.  Building a
        # new list avoids removing entries from the list being iterated,
        # which used to skip the entry following each removed one.
        dirs[:] = [d for d in dirs
                   if d != "pageimg" and not d.startswith(".")]
3ebe37d81071 addDri added
dwinter
parents:
diff changeset
124
3ebe37d81071 addDri added
dwinter
parents:
diff changeset
125 if __name__ == '__main__':
39
a33fa2377075 outfiles are now set as parameter
dwinter
parents: 36
diff changeset
126 class ef:
a33fa2377075 outfiles are now set as parameter
dwinter
parents: 36
diff changeset
127 errorFile = file("/tmp/addDRIErrors.txt","w")
a33fa2377075 outfiles are now set as parameter
dwinter
parents: 36
diff changeset
128 parseErrorFile = file("/tmp/addDRIParseErrors.txt","w")
a33fa2377075 outfiles are now set as parameter
dwinter
parents: 36
diff changeset
129 alreadyExistsFile = file("/tmp/addDRIalreadyExists.txt","w")
a33fa2377075 outfiles are now set as parameter
dwinter
parents: 36
diff changeset
130
a33fa2377075 outfiles are now set as parameter
dwinter
parents: 36
diff changeset
131 efiles = ef()
a33fa2377075 outfiles are now set as parameter
dwinter
parents: 36
diff changeset
132
a33fa2377075 outfiles are now set as parameter
dwinter
parents: 36
diff changeset
133 addDriToIndexMeta("/mpiwg/online/permanent/vlp",efiles,delpath="/mpiwg/online",test=False)