comparison importFromOpenMind/importer/ismi2model.py @ 19:ca1e02a2a9c4

unfilteredIsmi: OpenMind to JSON exporter like filterISMI. ismi2model: OpenMind importer like ismi2neo4j that saves a networkx pickle file.
author casties
date Wed, 09 Sep 2015 17:32:42 +0200
parents
children 45a823b5bf33
import urllib.request
import json
import networkx

## configure behaviour

# output filename
output_fn = "ismi_graph.gpickle"

# contract relations to these objects into attributes with the relations' name
#contract_relations_into_attributes = ['PLACE', 'ALIAS']
contract_relations_into_attributes = []

# OpenMind base URL
baseURL = "http://localhost:18080/ismi-richfaces/jsonInterface?"


entsURL = baseURL + "method=get_ents&oc=%s"

entsByIdURL = baseURL + "method=get_ents&include_content=True&ids=%s"

entURL = baseURL + "method=get_ent&id=%s&include_content=True"

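# For illustration: the URL templates above are filled in with Python's % operator,
# e.g. entsURL % 'PERSON' (using one of the type names listed further below) expands to
# "http://localhost:18080/ismi-richfaces/jsonInterface?method=get_ents&oc=PERSON".
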
def readJSON(url):
    #print("JSON loading %s"%url)
    wsh = urllib.request.urlopen(url)
    txt = wsh.read()
    return json.loads(txt.decode("utf-8"))

defs_json = readJSON(baseURL + "method=get_defs")

# current list of all definitions
ismi_defs = [atts['ov'] for atts in defs_json['defs']]

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]

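# Note: get_defs is assumed to return a structure like {'defs': [{'ov': 'PERSON', ...}, ...]},
# so ismi_defs ends up as a list of type names similar to the commented-out ismi_types above.
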
nx_graph = networkx.MultiDiGraph()

nx_nodes = {}
ismi_relations = {}
nx_relations = {}

ent_exclude_attrs = [
    'lw',
    'node_type',
    'nov'
]

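# A MultiDiGraph is used because relations are directed and, presumably, the same
# pair of entities can be connected by more than one relation.
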
def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    # these are too embarrassing...
    if 'FLORUIT' in name:
        name = name.replace('FLORUIT', 'FLOURISH')

    elif 'floruit' in name:
        name = name.replace('floruit', 'flourish')

    if is_src_rel:
        #name = name + '>'
        pass

    if is_tar_rel:
        name = '<' + name

    if att_from_rel:
        # clean up relations as attribute names
        name = name.replace('is_', '')
        name = name.replace('has_', '')
        name = name.replace('was_', '')
        name = name.replace('_of', '')

    return name

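# Illustrative examples of fixName (the relation names here are hypothetical):
#   fixName('is_part_of', att_from_rel=True)     -> 'part'
#   fixName('was_created_by', att_from_rel=True) -> 'created_by'
#   fixName('is_exemplar_of', is_tar_rel=True)   -> '<is_exemplar_of'
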
def nodeFromEnt(ent, etype):
    """Create a networkx node from the given JSON entity.

    Adds the node to nx_graph and returns the node.
    """
    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
            # normal text attribute (assume no content_type is text too...)
            key = att['name']
            val = att['ov']

            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute
            attrs[key] = val

        elif ct == 'num':
            # number attribute
            key = att['name']
            val = att['ov']

            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute, assume num is int
            attrs[key] = int(val)

        elif ct == 'date':
            # date attribute
            key = att['name']
            val = att['ov']
            #print("don't know what to do with date: %s=%s"%(key,val))

        elif ct == 'old':
            # ignore attribute
            continue

        else:
            print("WARN: attribute with unknown content_type: %s"%repr(att))
            # ignore other content types
            continue

    # process base attributes
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        return None

    attrs['type'] = fixName(oc)

    ismi_id = ent['id']
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id

    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

    # create node with attributes
    nx_graph.add_node(ismi_id, **attrs)
    node = nx_graph.node[ismi_id]

    return node

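# For reference: nodeFromEnt assumes entities of roughly this shape (keys taken
# from the accesses above, example values hypothetical):
#   {'id': 123, 'oc': 'PERSON', 'ov': 'some label',
#    'atts': [{'name': 'some_att', 'ov': 'some value', 'content_type': 'text'}, ...],
#    'src_rels': [...], 'tar_rels': [...]}
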
def relsFromEnt(ent, relations):
    """Extract all relations from JSON entity.

    Adds JSON to dict relations under relation's id.
    """
    # go through src_rels and tar_rels
    rels = ent.get('src_rels', []) + ent.get('tar_rels', [])
    for rel in rels:
        rel_id = rel['id']
        if rel_id in relations:
            old_rel = relations[rel_id]
            if rel != old_rel:
                print("ERROR: relation is different: %s != %s"%(repr(rel), repr(old_rel)))

            continue

        relations[rel_id] = rel

    return relations

def relationsFromRels(rels, nodes):
    """Create the relations as edges in the networkx graph.

    Args:
        rels: dict of JSON relations
        nodes: dict of existing networkx nodes
    Returns:
        dict of networkx relations
    """
    # go through all rels
    print("importing %s relations"%len(rels))
    cnt = 0
    for rel in rels.values():
        cnt += 1
        if cnt % 100 == 0:
            print(" %s relations"%cnt)

        rel_id = rel['id']
        rel_name = rel['name']
        src_id = rel['src_id']
        tar_id = rel['tar_id']
        if src_id not in nodes:
            print("ERROR: relation %s src node %s missing!"%(rel_id, src_id))
            continue

        if tar_id not in nodes:
            print("ERROR: relation %s tar node %s missing!"%(rel_id, tar_id))
            continue

        if contract_relations_into_attributes:
            # contract source relations
            tar_type = rel['tar_oc']
            if tar_type in contract_relations_into_attributes:
                att_name = fixName(rel_name, att_from_rel=True)
                # TODO: clean up attribute names
                while nx_graph.node[src_id].get(att_name, None) is not None:
                    # attribute exists
                    if att_name[-1].isnumeric():
                        # increment last digit
                        att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
                    else:
                        att_name += '2'

                # add target node's label as attribute of the source node
                #print("contracting tar to attribute %s on id=%s"%(att_name, src_id))
                nx_graph.node[src_id][att_name] = nx_graph.node[tar_id]['label']

            # contract target relations
            src_type = rel['src_oc']
            if src_type in contract_relations_into_attributes:
                att_name = fixName(rel_name, att_from_rel=True)
                # TODO: clean up attribute names
                while nx_graph.node[tar_id].get(att_name, None) is not None:
                    # attribute exists
                    if att_name[-1].isnumeric():
                        # increment last digit
                        att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
                    else:
                        att_name += '2'

                # add source node's label as attribute of the target node
                #print("contracting src to attribute %s on id=%s"%(att_name, tar_id))
                nx_graph.node[tar_id][att_name] = nx_graph.node[src_id]['label']

        # create relation with type
        nx_rel = nx_graph.add_edge(src_id, tar_id, type=fixName(rel_name))

        nx_relations[rel_id] = nx_rel

    return nx_relations

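# For illustration: with contract_relations_into_attributes = ['PLACE'], a relation
# named e.g. 'was_born_in' (hypothetical) pointing at a PLACE node would still become
# an edge, but would additionally write an attribute born_in=<place label> onto the
# source node (numbered born_in2, born_in3, ... if the name is already taken).
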
def importEnts(etype):
    """Import all entities of the given type.
    """
    # read json for all entities of given type (do not shadow the json module)
    ents_json = readJSON(entsURL % etype)
    ents = ents_json['ents']
    print("importing %s %ss"%(len(ents), etype))
    size = 100
    batches = [ents[pos:pos + size] for pos in range(0, len(ents), size)]
    cnt = 0
    for batch in batches:
        cnt += size
        if cnt % 100 == 0:
            print(" %s %ss"%(cnt, etype))

        # extract list of ismi ids
        ismi_ids = [str(ent['id']) for ent in batch]

        # fetch full data for list of entities
        ent_json = readJSON(entsByIdURL % ','.join(ismi_ids))
        ents_data = ent_json['ents']

        # iterate through results batch
        for ent_data in ents_data:
            ismi_id = ent_data['id']
            if ismi_id in nx_nodes:
                print("ERROR: entity with id=%s exists!"%ismi_id)
                return

            # create networkx node
            node = nodeFromEnt(ent_data, etype)

            # save node reference
            nx_nodes[ismi_id] = node

            # extract relations
            relsFromEnt(ent_data, ismi_relations)

        #if cnt >= 100:
        #    return

def importAllEnts(etypes):

    for etype in etypes:
        importEnts(etype)

    relationsFromRels(ismi_relations, nx_nodes)

importAllEnts(ismi_defs)
#importAllEnts(['TEXT'])

print("Graph info: %s"%networkx.info(nx_graph))
print("Number of nodes: %s"%networkx.number_of_nodes(nx_graph))
print("Number of edges: %s"%networkx.number_of_edges(nx_graph))
#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))
# export pickle
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote file %s"%output_fn)