0
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
1 '''
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
2 Created on 22.04.2014
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
3
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
4 @author: dwinter
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
5 '''
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
6
|
2
|
7 import os
|
0
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
8 import json
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
9 import urllib.request
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
10
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
11 class Importer:
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
12
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
13 def loadJSON(self,url):
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
14
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
15
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
16 response = urllib.request.urlopen(url)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
17 str_response = response.readall().decode('utf-8')
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
18
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
19 self.data = json.loads(str_response)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
20
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
21
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
22 def loadJSONFromFile(self,fn):
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
23
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
24
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
25 self.data = json.load(open(fn+".json",'r', encoding="utf-8"),encoding="utf-8")
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
26
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
27
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
28 def getEntIdsMentioned(self,kind="tar",filterOC=[]):
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
29 """ holt alle Id entweder als src_id """
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
30
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
31 ents = self.data.get("ents")
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
32
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
33
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
34
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
35 ret=set()
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
36 rels=[]
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
37 if kind=="tar":
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
38 rel_type="tar_rels"
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
39 id_type="src_id"
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
40 oc_type="src_oc"
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
41 else:
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
42 rel_type="src_rels"
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
43 id_type="tar_id"
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
44 oc_type="tar_oc"
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
45
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
46 for ent in ents:
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
47 tar_rels = ent.get(rel_type)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
48
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
49
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
50
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
51 for tar_rel in tar_rels:
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
52
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
53
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
54 if not tar_rel.get(oc_type) in filterOC:
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
55
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
56
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
57 ret.add(str(tar_rel.get(id_type)))
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
58
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
59
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
60 rels.append(tar_rel)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
61
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
62
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
63
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
64 return ret,rels
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
65
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
66
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
67 def loadallEnts(self,kind="tar",filterOC=[]):
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
68
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
69 ids,rels = self.getEntIdsMentioned(kind=kind,filterOC=filterOC)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
70
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
71
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
72 baseUrl="http://openmind-ismi-dev.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?include_content=true&include_romanization=true&method=get_ents"
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
73
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
74 lenId = len(ids)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
75
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
76 portions = int(lenId / 500)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
77
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
78 ents = []
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
79 for p in range(portions+1):
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
80
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
81
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
82 start = p * 500
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
83 end = min(lenId,(p+1)*500)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
84
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
85 idsFrak = list(ids)[start:end]
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
86 idsString = ",".join(idsFrak)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
87
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
88
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
89 qs = baseUrl+"&ids="+idsString
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
90 print (qs)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
91 response = urllib.request.urlopen(qs)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
92 entsJ = json.loads(response.readall().decode('utf-8'));
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
93 ents += entsJ.get("ents")
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
94 #str_response += response.readall().decode('utf-8')
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
95
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
96
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
97 str_response = json.dumps({"ents":ents});
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
98 return str_response,rels
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
99
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
100 def saveallEnts(self,filename,kind="tar",filterOC=[]):
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
101
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
102 ents,rels = self.loadallEnts(kind=kind,filterOC=filterOC)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
103 of = open(filename+".json","wb")
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
104 of.write(ents.encode('utf-8'))
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
105 of.close()
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
106
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
107 of = open(filename+"_rels.json","w")
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
108 json.dump({'rels':rels},of);
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
109 of.close()
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
110
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
111
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
112
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
113 if __name__ == '__main__':
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
114 imp = Importer()
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
115
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
116 # url = """http://openmind-ismi-dev.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?method=get_ents&ids=27543,36745,58453,87298,259646,35093,22863,34870,36882,101488,36696,31794,37240,35014,35583,37025,35960,172492,98286,165721,260111,90980,36316,260120,36241,260129,260138,38860,176694,72545,36185,36575,260146,31672,37739,89861,176778,180743,86328,260150,90658,58423,181058,105948,35526,74078,260158,181096,31606,31568,27872,36938,4836,34668,76866,102230,76888,74070,73757,182685,260162,260170,1102,172888,260174,34806,28088,36713,37323,34551,35943,98095,260178,260182,182770,260186,260190,260194,36114,85003,31630,157290,37153,37213,172952,86871,64406,102590,82615,58245,179791,179550,12419,95861,36429,36099,74237,36065,74822,87549,83765,36733,19259,260198,34986,88041,260202,36550,260206,37228,39880,36318,36597,35035,58328,80831,58354,74277,36529,36380,69450,200246,260222,81178,260226,199952,262557,87212,99059,64270,81811,65785,36645
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
117 # """
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
118 #
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
119
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
120 url = """http://openmind-ismi-dev.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?method=get_public_codices"""
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
121
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
122
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
123 imp.loadJSON(url)
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
124
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
125 #ids= imp.getEntIdsMentioned()
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
126
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
127
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
128 #loadall = imp.loadallEnts()
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
129 #print(loadall.encode('utf-8'))
|
2
|
130
|
|
131 exportDir = '/tmp/ismi_data'
|
|
132 if not os.access(exportDir, os.R_OK):
|
|
133 # dir doesn't exist -> create
|
|
134 os.makedirs(exportDir)
|
0
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
135
|
2
|
136 imp.saveallEnts(exportDir+"/witnesses",kind="tar")
|
0
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
137
|
2
|
138 imp.saveallEnts(exportDir+"/codex_src",kind="src",filterOC=['CODEX','WITNESS'])
|
0
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
139
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
140 #hole jetzt alle relationen an den witnessen
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
141
|
2
|
142 imp.loadJSONFromFile(exportDir+"/witnesses")
|
0
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
143
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
144 #ids= imp.getEntIdsMentioned(kind="src")
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
145
|
2
|
146 imp.saveallEnts(exportDir+"/texts",kind="src",filterOC=['CODEX','WITNESS','PERSON'])
|
0
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
147
|
2
|
148 imp.loadJSONFromFile(exportDir+"/texts")
|
0
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
149
|
2
|
150 imp.saveallEnts(exportDir+"/authors_subjects_src",kind="src",filterOC=['CODEX','WITNESS','TEXT'])
|
0
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
151
|
2
|
152 imp.saveallEnts(exportDir+"/authors_subjects_tar",kind="tar",filterOC=['CODEX','WITNESS','TEXT'])
|
0
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
153
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
154
|
2
|
155 imp.loadJSONFromFile(exportDir+"/authors_subjects_src")
|
0
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
156
|
2
|
157 imp.saveallEnts(exportDir+"/subjects_places",kind="src",filterOC=['CODEX','WITNESS','TEXT','PERSON'])
|
|
158 imp.saveallEnts(exportDir+"/references_places",kind="tar",filterOC=['CODEX','WITNESS','TEXT','PERSON'])
|
0
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
159
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
160
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
161
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
162
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
163
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
164
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
165
|
Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
166
|