Mercurial > hg > drupalISMI
comparison importFromOpenMind/importer/ismi2model.py @ 19:ca1e02a2a9c4
unfilteredIsmi: openmind to json exporter like filterISMI.
ismi2model: openmind importer like ismi2neo4j that saves networkx pickle file.
author | casties |
---|---|
date | Wed, 09 Sep 2015 17:32:42 +0200 |
parents | |
children | 45a823b5bf33 |
comparison
equal
deleted
inserted
replaced
18:0827156df210 | 19:ca1e02a2a9c4 |
---|---|
1 import urllib.request | |
2 import json | |
3 import networkx | |
4 | |
## configure behaviour

# name of the file the finished graph is written to
output_fn = "ismi_graph.gpickle"

# relations whose other end is one of these object classes are contracted
# into node attributes named after the relation
#contract_relations_into_attributes = ['PLACE', 'ALIAS']
contract_relations_into_attributes = []

# base URL of the OpenMind JSON interface
baseURL = "http://localhost:18080/ismi-richfaces/jsonInterface?"

# request templates built on the base URL; each takes one %-argument
entsURL = baseURL + "method=get_ents&oc=%s"
entsByIdURL = baseURL + "method=get_ents&include_content=True&ids=%s"
entURL = baseURL + "method=get_ent&id=%s&include_content=True"
23 | |
24 | |
def readJSON(url):
    """Load the given URL and return its content parsed as JSON.

    The response body is decoded as UTF-8 before parsing.

    Args:
        url: URL to open
    Returns:
        the parsed JSON value
    """
    #print("JSON loading %s"%url)
    # the with-block closes the response as soon as the body has been read
    with urllib.request.urlopen(url) as wsh:
        txt = wsh.read()

    return json.loads(txt.decode("utf-8"))
30 | |
# ask OpenMind for its definitions
defs_json = readJSON(baseURL + "method=get_defs")

# current list of all definitions
ismi_defs = [definition['ov'] for definition in defs_json['defs']]

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]


# graph that receives the imported nodes and edges
nx_graph = networkx.MultiDiGraph()

# import state shared by the functions below
nx_nodes = {}        # entity id -> node returned by nodeFromEnt
ismi_relations = {}  # relation id -> relation JSON
nx_relations = {}    # relation id -> value returned by add_edge

# attribute names that are not copied onto nodes
ent_exclude_attrs = ['lw', 'node_type', 'nov']
50 | |
def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Return a cleaned-up version of the given name.

    Args:
        name: name to clean up
        is_src_rel: accepted but currently leaves the name unchanged
        is_tar_rel: prefix the name with '<'
        att_from_rel: strip 'is_', 'has_', 'was_' and '_of' from the name
    Returns:
        the cleaned name
    """
    # these are too embarrassing...
    # only the first spelling that occurs in the name is replaced
    for wrong, right in (('FLORUIT', 'FLOURISH'), ('floruit', 'flourish')):
        if wrong in name:
            name = name.replace(wrong, right)
            break

    # is_src_rel: source relations keep their name (no '>' suffix is added)

    if is_tar_rel:
        name = '<' + name

    if att_from_rel:
        # clean up relations as attribute names
        for fragment in ('is_', 'has_', 'was_', '_of'):
            name = name.replace(fragment, '')

    return name
74 | |
75 | |
def nodeFromEnt(ent, etype):
    """Create a networkx node from the given JSON entity.

    Adds the node to the module-level nx_graph under the entity's id and
    returns the node as stored in the graph.

    Args:
        ent: JSON entity (dict) with keys 'oc', 'atts', 'id' and optionally 'ov'
        etype: object class the entity is expected to have
    Returns:
        the node's attribute dict, or None if the entity's 'oc' differs
        from etype
    """
    # reject a mismatching entity before doing any other work
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        # `null` is not a Python name; None is the "no node" result
        return None

    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
            # normal text attribute (assume no content_type is text too...)
            key = att['name']
            val = att['ov']

            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute
            attrs[key] = val

        elif ct == 'num':
            # number attribute
            key = att['name']
            val = att['ov']

            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute, assume num is int
            attrs[key] = int(val)

        elif ct == 'date':
            # date attributes are not imported
            #print("don't know what to do with date: %s=%s"%(key,val))
            continue

        elif ct == 'old':
            # ignore attribute
            continue

        else:
            print("WARN: attribute with unknown content_type: %s"%repr(att))
            # ignore other content types
            continue

    # process base attributes
    attrs['type'] = fixName(oc)

    ismi_id = ent['id']
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id

    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

    # create node with attributes
    nx_graph.add_node(ismi_id, **attrs)
    # NOTE(review): Graph.node is the accessor of older networkx releases;
    # newer releases only provide Graph.nodes -- confirm the target version
    node = nx_graph.node[ismi_id]

    return node
146 | |
147 | |
def relsFromEnt(ent, relations):
    """Collect the relations of a JSON entity.

    Stores each relation from the entity's 'src_rels' and 'tar_rels' lists
    in the dict relations under the relation's id. A relation that differs
    from the one already stored under its id is reported and not stored.

    Args:
        ent: JSON entity (dict)
        relations: dict of relation id -> relation JSON, updated in place
    Returns:
        the relations dict
    """
    outgoing = ent.get('src_rels', [])
    incoming = ent.get('tar_rels', [])
    for rel in outgoing + incoming:
        rel_id = rel['id']
        if rel_id in relations and rel != relations[rel_id]:
            # keep the relation that was seen first
            print("ERROR: relation is different: %s != %s"%(repr(rel), repr(relations[rel_id])))
            continue

        relations[rel_id] = rel

    return relations
166 | |
167 | |
def _unusedAttName(node, att_name):
    """Return att_name, renumbered until node has no value under it."""
    while node.get(att_name, None) is not None:
        # attribute exists
        if att_name[-1].isnumeric():
            # increment last digit
            att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
        else:
            att_name += '2'

    return att_name


def relationsFromRels(rels, nodes):
    """Create edges in nx_graph for the given relations.

    A relation whose source or target id is not a key of nodes is reported
    and skipped. When contract_relations_into_attributes is non-empty, the
    label of a node at one end whose object class is listed is also stored
    as an attribute on the node at the other end.

    Args:
        rels: dict of JSON relations
        nodes: dict of existing nodes, keyed by entity id
    Returns:
        the module-level nx_relations dict (relation id -> value returned
        by add_edge)
    """
    # go through all rels
    print("importing %s relations"%len(rels))
    for cnt, rel in enumerate(rels.values(), start=1):
        if cnt % 100 == 0:
            print(" %s relations"%cnt)

        rel_id = rel['id']
        rel_name = rel['name']
        src_id = rel['src_id']
        tar_id = rel['tar_id']
        if src_id not in nodes:
            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
            continue

        if tar_id not in nodes:
            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
            continue

        if contract_relations_into_attributes:
            # `src` and `tar` were used without ever being assigned; bind
            # them to the attribute dicts the contraction writes to
            src = nx_graph.node[src_id]
            tar = nx_graph.node[tar_id]

            # contract source relations
            tar_type = rel['tar_oc']
            if tar_type in contract_relations_into_attributes:
                # TODO: clean up attribute names
                att_name = _unusedAttName(src, fixName(rel_name, att_from_rel=True))
                # add target node's label as attribute
                #print("contracting tar to attribute %s on id=%s"%(att_name, src_id))
                src[att_name] = tar['label']

            # contract target relations
            src_type = rel['src_oc']
            if src_type in contract_relations_into_attributes:
                # TODO: clean up attribute names
                att_name = _unusedAttName(tar, fixName(rel_name, att_from_rel=True))
                # add source node's label as attribute
                #print("contracting src to attribute %s on id=%s"%(att_name, tar_id))
                tar[att_name] = src['label']

        # create relation with type
        nx_rel = nx_graph.add_edge(src_id, tar_id, type=fixName(rel_name))

        nx_relations[rel_id] = nx_rel

    return nx_relations
238 | |
239 | |
def importEnts(etype):
    """Import all entities of the given type.

    Reads the list of entities of the type, then fetches their full data
    in batches. For each entity a node is created via nodeFromEnt and
    stored in nx_nodes, and its relations are collected in ismi_relations.
    The import of this type stops at the first entity whose id is already
    in nx_nodes.

    Args:
        etype: object class of the entities to import
    """
    # read json for all entities of given type
    # (not named `json`, which would shadow the imported module here)
    ents_json = readJSON(entsURL%etype)
    ents = ents_json['ents']
    print("importing %s %ss"%(len(ents),etype))
    size = 100
    for pos in range(0, len(ents), size):
        batch = ents[pos:pos + size]
        # report the real number of entities covered so far; a short last
        # batch is no longer counted as a full one
        print(" %s %ss"%(pos + len(batch), etype))

        # extract list of ismi ids
        ismi_ids = [str(ent['id']) for ent in batch]

        # fetch full data for list of entities
        ent_json = readJSON(entsByIdURL%','.join(ismi_ids))
        ents_data = ent_json['ents']

        # iterate through results batch
        for ent_data in ents_data:
            ismi_id = ent_data['id']
            if ismi_id in nx_nodes:
                print("ERROR: entity with id=%s exists!"%ismi_id)
                return

            # create graph node
            node = nodeFromEnt(ent_data, etype)
            if node is None:
                # no node was created for this entity, so don't register it
                continue

            # save node reference
            nx_nodes[ismi_id] = node

            # extract relations
            relsFromEnt(ent_data, ismi_relations)
281 | |
282 # In[119]: | |
283 | |
def importAllEnts(etypes):
    """Import the entities of all given types, then create their relations.

    Args:
        etypes: iterable of object class names to import
    """
    for entity_type in etypes:
        importEnts(entity_type)

    # edges are built last, from the relations collected during the import
    relationsFromRels(ismi_relations, nx_nodes)
290 | |
291 | |
292 # In[120]: | |
293 | |
# run the import for every definition reported by OpenMind
importAllEnts(ismi_defs)
#importAllEnts(['TEXT'])

# summarize the resulting graph
graph_info = networkx.info(nx_graph)
print("Graph info: %s"%graph_info)
node_count = networkx.number_of_nodes(nx_graph)
print("Number of nodes: %s"%node_count)
edge_count = networkx.number_of_edges(nx_graph)
print("Number of edges: %s"%edge_count)
#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))

# export pickle
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote file %s"%output_fn)