5
|
1 '''
|
|
2 Created on 01.11.2012
|
|
3
|
|
4 @author: dwinter
|
|
5 '''
|
|
6 import os
|
10
|
7 import managePurls.manageIndexMetaPURLs as manageIndexMetaPURLs
|
5
|
8 import re
|
|
9 from lxml import etree
|
30
|
10 import sys
|
5
|
11
|
|
12 from os.path import join, getsize
|
|
13
|
|
14
|
30
|
15
|
|
def correctAuthor(tree):
    """Collapse multi-line author entries into one "; "-separated line.

    Every element matched by ``/resource/meta/bib/author`` whose text
    contains newline characters gets each newline replaced by "; ".
    The elements are modified in place.

    :param tree: parsed document offering an ``xpath`` method (the callers
        in this file pass the result of ``etree.parse``).
    :return: True if at least one author element was rewritten, else False.
    """
    modified = False
    for node in tree.xpath("/resource/meta/bib/author"):
        original = node.text
        if original is None:
            # empty <author/> element, nothing to normalise
            continue
        joined = original.replace("\n", "; ")
        if joined != original:
            node.text = joined
            modified = True
    return modified
|
36
|
34
|
|
35
|
39
|
def getDRIfromIndexMeta(fl,parseErrorFile=None):
    """Return the text of the first ``<dri type="mpiwg">`` in an index.meta file.

    :param fl: path of the index.meta file to parse.
    :param parseErrorFile: optional writable file-like object that receives
        a "PARSE ERROR" line when *fl* cannot be parsed. When omitted, the
        line is appended to ``/tmp/addDRIParseErrors.txt``.
    :return: the text of the first matching ``dri`` element, ``None`` if
        the document has no such element, or ``False`` if parsing failed.
    """
    try:
        tree = etree.parse(fl)
    except Exception:
        message = "PARSE ERROR:"+fl+"\n"
        if parseErrorFile is None:
            # Open the fallback log only when there is something to report,
            # and close it again. Appending keeps earlier entries instead of
            # truncating the log on every call.
            with open("/tmp/addDRIParseErrors.txt", "a") as errorLog:
                errorLog.write(message)
        else:
            parseErrorFile.write(message)
        return False

    dris = tree.xpath("/resource/meta/dri[@type='mpiwg']")
    if not dris:
        return None
    return dris[0].text
|
|
53
|
|
54
|
39
|
def addPURL(fl,purl,efiles,test=False):
    """Store *purl* in the ``<dri type="mpiwg">`` element of an index.meta file.

    The file is parsed, its author fields are normalised with
    ``correctAuthor`` and the ``dri`` element is created (or its text is
    replaced). Unless *test* is true, the original file is renamed to
    ``<fl>_mpiwg_dri`` and the modified document is written to *fl*.

    If a ``dri`` element already exists and the author fields needed no
    correction, the function returns early and nothing is written to disk.

    :param fl: path of the index.meta file.
    :param purl: value to store as text of the ``dri`` element.
    :param efiles: object with writable ``parseErrorFile``,
        ``alreadyExistsFile`` and ``errorFile`` attributes used for logging.
    :param test: when true, the document is printed but not written.
    :return: ``False`` if the file cannot be parsed or has no
        ``/resource/meta`` element, otherwise ``True`` (also when writing
        the file failed; that failure is only logged to ``errorFile``).
    """
    try:
        tree = etree.parse(fl)
    except Exception:
        efiles.parseErrorFile.write("PARSE ERROR:"+fl+"\n")
        return False

    dris = tree.xpath("/resource/meta/dri[@type='mpiwg']")

    changed = correctAuthor(tree)

    if len(dris) == 0:  # no dri yet: create a new one
        metas = tree.xpath("/resource/meta")
        if len(metas) == 0:
            efiles.parseErrorFile.write("no resource/meta: %s \n" % fl)
            return False
        newDri = etree.Element("dri", type="mpiwg")
        newDri.text = purl
        metas[0].append(newDri)
    else:
        dris[0].text = purl
        efiles.alreadyExistsFile.write("%s \n" % fl)
        if not changed:  # nothing has to be done
            return True

    # Single-argument print(...) behaves the same as the print statement
    # under Python 2 and is also valid Python 3 syntax.
    print(etree.tostring(tree, pretty_print=True))

    if not test:
        try:
            # keep the untouched original next to the rewritten file
            os.rename(fl, fl + "_mpiwg_dri")
            out = etree.tostring(tree, encoding="UTF-8", xml_declaration=False)
            # 'out' is already UTF-8 encoded, so write it in binary mode;
            # the with-block guarantees the file is really closed.
            with open(fl, "wb") as fo:
                fo.write(out)
        except Exception:
            print(sys.exc_info()[0])
            print(sys.exc_info()[1])
            efiles.errorFile.write(fl + "\n")
    return True
|
|
101
|
39
|
def addDriToIndexMeta(path,efiles,delpath="",replacepath="",test=False):
    """Walk *path* and add a PURL to every ``index.meta`` file found.

    For each ``index.meta`` the leading *delpath* of its full path is
    replaced by *replacepath*, the result is handed to
    ``IndexMetaPURLManager.getPurl`` and the returned value is written into
    the file via ``addPURL``. Directories named ``pageimg`` and directories
    whose name starts with "." are not descended into.

    :param path: root directory of the walk.
    :param efiles: log-file holder passed through to ``addPURL``.
    :param delpath: literal path prefix to strip before asking for the PURL.
    :param replacepath: text put in place of the stripped prefix.
    :param test: passed through to ``addPURL``; when true nothing is written.
    """
    md = manageIndexMetaPURLs.IndexMetaPURLManager()

    for root, dirs, files in os.walk(path):
        for name in files:
            if name == "index.meta":
                fl = os.path.join(root, name)
                # Treat delpath as a literal prefix: building a regex from it
                # would let characters such as "." match arbitrary text.
                if fl.startswith(delpath):
                    shortPath = replacepath + fl[len(delpath):]
                else:
                    shortPath = fl
                purl = md.getPurl(shortPath)

                addPURL(fl, purl, efiles, test)

        # Prune in place so os.walk honours the change. Rebuilding the list
        # avoids removing entries from 'dirs' while iterating over it, which
        # skipped the entry following each removed one.
        dirs[:] = [
            subdir for subdir in dirs
            if subdir != "pageimg" and not subdir.startswith(".")
        ]
|
|
124
|
|
if __name__ == '__main__':
    class ef:
        # Log files shared by the whole run; opened when the class body runs.
        # files whose rewrite failed
        errorFile = open("/tmp/addDRIErrors.txt","w")
        # files that could not be parsed or lack /resource/meta
        parseErrorFile = open("/tmp/addDRIParseErrors.txt","w")
        # files that already had a dri element
        alreadyExistsFile = open("/tmp/addDRIalreadyExists.txt","w")

    efiles = ef()

    try:
        addDriToIndexMeta("/mpiwg/online/permanent/vlp",efiles,delpath="/mpiwg/online",test=False)
    finally:
        # make sure everything logged is flushed to disk, even on error
        for logFile in (efiles.errorFile,
                        efiles.parseErrorFile,
                        efiles.alreadyExistsFile):
            logFile.close()
|