Mercurial > hg > drupalISMI
annotate importFromOpenMind/importer/filterISMI.py @ 10:2a786f0d46a7
more comments in the code.
author | casties |
---|---|
date | Fri, 26 Jun 2015 10:59:53 +0200 |
parents | 0ae6145e7c80 |
children |
rev | line source |
---|---|
0 | 1 ''' |
2 Created on 22.04.2014 | |
3 | |
4 @author: dwinter | |
5 ''' | |
6 | |
2
e55656794c82
create and use separate export directory.
root@ismi.rz-berlin.mpg.de
parents:
0
diff
changeset
|
7 import os |
0 | 8 import json |
9 import urllib.request | |
10 | |
class Importer:
    """Fetch entities and relations from the OpenMind JSON interface.

    The most recently loaded JSON document is kept in the ``data`` member;
    the other methods read the entities and relations from there.
    """

    # OpenMind call that returns full entities for a list of ids.
    ENTS_URL = "https://ismi.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?include_content=true&include_romanization=true&method=get_ents"

    # Maximum number of ids sent in one get_ents request
    # (presumably to keep the request URL at a manageable length).
    BATCH_SIZE = 500

    @staticmethod
    def _fetchJSON(url):
        """Download url and return the decoded JSON document."""
        # read() instead of readall(): the response object returned by
        # urlopen() has no readall() method in current Python 3 versions.
        with urllib.request.urlopen(url) as response:
            return json.loads(response.read().decode('utf-8'))

    @staticmethod
    def _chunks(items, size):
        """Yield consecutive slices of items with at most size elements."""
        for start in range(0, len(items), size):
            yield items[start:start + size]

    def loadJSON(self, url):
        """Load JSON from URL.

        Saves JSON in data member.
        """
        print(" loading "+url)
        self.data = self._fetchJSON(url)

    def loadJSONFromFile(self, fn):
        """Load JSON from file fn.json (fn is given without extension).

        Saves JSON in data member.
        """
        print(" loading "+fn+".json")
        # json.load() no longer accepts an "encoding" argument; the file
        # is decoded as UTF-8 when it is opened.
        with open(fn+".json", 'r', encoding="utf-8") as f:
            self.data = json.load(f)

    def getEntIdsMentioned(self, kind="tar", filterOC=()):
        """Extract related entities from data member.

        Checks relations of direction kind: "tar" reads the "tar_rels" of
        every entity and collects the source ids, anything else reads the
        "src_rels" and collects the target ids.
        Skips relations whose related object has a type listed in filterOC.
        Returns a set of ids (as strings) of related objects and a list of
        the relations.
        """
        if kind == "tar":
            rel_type, id_type, oc_type = "tar_rels", "src_id", "src_oc"
        else:
            rel_type, id_type, oc_type = "src_rels", "tar_id", "tar_oc"

        ret = set()
        rels = []
        # "or []": tolerate documents/entities without the expected list.
        for ent in self.data.get("ents") or []:
            for rel in ent.get(rel_type) or []:
                # NOTE(review): filtered relations are left out of both the
                # id set and the relation list -- confirm this is intended.
                if rel.get(oc_type) not in filterOC:
                    ret.add(str(rel.get(id_type)))
                    rels.append(rel)

        return ret, rels

    def loadallEnts(self, kind="tar", filterOC=()):
        """Get related entities from OpenMind.

        Gets all related entities' ids using kind and filterOC via
        getEntIdsMentioned().
        Downloads the entities from OpenMind using the ids, at most
        BATCH_SIZE ids per request. No request is made if there are no ids.
        Returns the entities as JSON-string and a list of relations.
        """
        ids, rels = self.getEntIdsMentioned(kind=kind, filterOC=filterOC)

        ents = []
        # sorted(): fixed request order, so repeated runs are comparable.
        for batch in self._chunks(sorted(ids), self.BATCH_SIZE):
            qs = self.ENTS_URL+"&ids="+",".join(batch)
            print(" loading ents from "+qs)
            ents += self._fetchJSON(qs).get("ents") or []

        return json.dumps({"ents": ents}), rels

    def saveallEnts(self, filename, kind="tar", filterOC=()):
        """Load all related entities and save them as JSON.

        Loads all related entities using kind and filterOC via loadallEnts().
        Saves entities in file filename.json.
        Saves relations in file filename_rels.json.
        """
        ents, rels = self.loadallEnts(kind=kind, filterOC=filterOC)

        print(" writing ", filename+".json")
        with open(filename+".json", "wb") as of:
            of.write(ents.encode('utf-8'))

        print(" writing ", filename+"_rels.json")
        with open(filename+"_rels.json", "w", encoding="utf-8") as of:
            json.dump({'rels': rels}, of)
125 | |
126 | |
if __name__ == '__main__':
    # Export pipeline: starting from the public codices, follow the
    # relations step by step and save every set of related entities
    # (plus the relations themselves) as JSON files in exportDir.
    imp = Importer()

    #
    # load all public codices
    # contains codices with attributes and first-order relations
    #
    url = """https://ismi.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?method=get_public_codices"""

    imp.loadJSON(url)

    # create directory for export files
    # (exist_ok: an existing directory is fine, whatever its permissions)
    exportDir = '/tmp/ismi_data'
    os.makedirs(exportDir, exist_ok=True)

    #
    # load and save all target relations of codices as witnesses.json
    #
    imp.saveallEnts(exportDir+"/witnesses",kind="tar")

    #
    # load and save all source relations of codices except type codex and witness as codex_src.json
    #
    imp.saveallEnts(exportDir+"/codex_src",kind="src",filterOC=['CODEX','WITNESS'])

    # now fetch all relations of the witnesses

    #
    # load the witnesses.json file from above
    #
    imp.loadJSONFromFile(exportDir+"/witnesses")

    #
    # load and save all source relations except type codex, witness, person as texts.json
    #
    imp.saveallEnts(exportDir+"/texts",kind="src",filterOC=['CODEX','WITNESS','PERSON'])

    #
    # load the texts.json file from above
    #
    imp.loadJSONFromFile(exportDir+"/texts")

    #
    # load and save all source relations except type codex, witness and text as authors_subjects_src.json
    #
    imp.saveallEnts(exportDir+"/authors_subjects_src",kind="src",filterOC=['CODEX','WITNESS','TEXT'])

    #
    # load and save all target relations except type codex, witness and text as authors_subjects_tar.json
    #
    imp.saveallEnts(exportDir+"/authors_subjects_tar",kind="tar",filterOC=['CODEX','WITNESS','TEXT'])

    #
    # load the authors_subjects_src.json file from above
    #
    imp.loadJSONFromFile(exportDir+"/authors_subjects_src")

    #
    # load and save all source relations except type codex, witness, text and person as subjects_places.json
    #
    imp.saveallEnts(exportDir+"/subjects_places",kind="src",filterOC=['CODEX','WITNESS','TEXT','PERSON'])

    #
    # load and save all target relations except type codex, witness, text and person as references_places.json
    #
    imp.saveallEnts(exportDir+"/references_places",kind="tar",filterOC=['CODEX','WITNESS','TEXT','PERSON'])
0 | 198 |
199 | |
200 | |
201 | |
202 | |
203 | |
204 | |
205 |