Mercurial > hg > drupalISMI
annotate importFromOpenMind/importer/ismi2neo4j.py @ 28:a9bfd49355f8
updated config for ismi-dev.
author | casties |
---|---|
date | Wed, 18 Nov 2015 15:22:05 +0100 |
parents | ca1e02a2a9c4 |
children |
rev | line source |
---|---|
import json
import urllib.request

from neo4jrestclient.client import GraphDatabase, Node
4 | |
## configure behaviour

# add inverse relations as "<relation"
add_inverse_relations = True

# add relations to these objects as attributes with the relations name
contract_relations_into_attributes = ['PLACE', 'ALIAS']

# label added to all nodes
project_label = '_ismi'

# OpenMind base URL
baseURL = "http://localhost:18080/ismi-richfaces/jsonInterface?"

# neo4j base URL
neo4jBaseURL = "http://localhost:7474/db/data/"

# URL templates of the OpenMind JSON interface; fill in with the % operator
# list of entities of one object class (%s = object class)
entsURL = baseURL + "method=get_ents&oc=%s"
# full content of several entities (%s = comma-separated ids)
entsByIdURL = baseURL + "method=get_ents&include_content=True&ids=%s"
# full content of a single entity (%s = id)
entURL = baseURL + "method=get_ent&id=%s&include_content=True"
28 | |
29 | |
def readJSON(url):
    """Fetch the given URL and return its body parsed as JSON.

    The response body is decoded as UTF-8 before it is parsed.

    Args:
        url: URL to request
    Returns:
        the parsed JSON value
    """
    # the context manager closes the response even if reading fails
    with urllib.request.urlopen(url) as wsh:
        txt = wsh.read()

    return json.loads(txt.decode("utf-8"))
35 | |
# fetch the definitions from OpenMind (this is a request made at import time)
defs_json = readJSON(baseURL + "method=get_defs")

# current list of all definitions
ismi_defs = [definition['ov'] for definition in defs_json['defs']]

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]

# NOTE(review): the Neo4J credentials are hard-coded here -- consider moving
# them into deployment configuration.
gdb = GraphDatabase(neo4jBaseURL, username="neo4j", password="neo5j")

# created Neo4J nodes by ismi_id
n4j_nodes = {}
# JSON relations collected from the entities by relation id
ismi_relations = {}
# created Neo4J relations by relation id
n4j_relations = {}

# names of entity attributes that are not copied to the Neo4J node
ent_exclude_attrs = ['lw', 'node_type', 'nov']
55 | |
def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Return the name to use in Neo4J for an OpenMind name.

    Args:
        name: original name
        is_src_rel: name is used for a source relation (currently no change)
        is_tar_rel: name is used for an inverse relation, gets prefixed by "<"
        att_from_rel: name is a relation name used as attribute name, the
            fragments "is_", "has_", "was_" and "_of" are removed
    Returns:
        the fixed name
    """
    # replace the misnomer; only the first spelling found is replaced
    for wrong, right in (('FLORUIT', 'FLOURISH'), ('floruit', 'flourish')):
        if wrong in name:
            name = name.replace(wrong, right)
            break

    # source relations keep their name as it is (no ">" suffix)

    if is_tar_rel:
        name = '<' + name

    if att_from_rel:
        # clean up relations as attribute names
        for fragment in ('is_', 'has_', 'was_', '_of'):
            name = name.replace(fragment, '')

    return name
79 | |
16 | 80 |
def getNode(ismi_id=None):
    """Look up the Neo4J node with the given ismi_id.

    Args:
        ismi_id: value of the node's ismi_id property
    Returns:
        the first matching node, or None if there is no match or no id is given
    """
    if ismi_id is None:
        return None

    # NOTE(review): the id is interpolated into the query text as it is.
    matches = gdb.query("match (n {ismi_id: %s}) return n"%ismi_id, returns=(Node))
    return matches[0] if len(matches) > 0 else None
88 | |
17 | 89 |
def nodeFromEnt(ent, etype):
    """Create a Neo4J node from the given JSON entity.

    Creates the node in gdb and returns the node. The node gets the entity's
    text and number attributes, the fixed object class as 'type', the id as
    'ismi_id' and the entity's 'ov' (if present) as 'label'. It is labelled
    with project_label and the fixed entity type.

    Args:
        ent: JSON entity (reads 'atts', 'oc', 'id' and the optional 'ov')
        etype: expected object class of the entity
    Returns:
        the created node, or None if the entity's 'oc' is not etype
    """
    text_types = ('text', 'arabic', 'bool', 'boolean', 'url', 'language')
    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        if ct is None or ct.lower() in text_types:
            # normal text attribute (assume no content_type is text too...)
            convert = None

        elif ct == 'num':
            # number attribute, assume num is int
            convert = int

        elif ct == 'date':
            # date attributes are not imported
            continue

        elif ct == 'old':
            # ignore attribute
            continue

        else:
            print("WARN: attribute with unknown content_type: %s"%repr(att))
            # ignore other content types
            continue

        key = att['name']
        val = att['ov']
        if key in ent_exclude_attrs:
            # exclude attribute
            continue

        # keep attribute
        attrs[key] = val if convert is None else convert(val)

    # process base attributes
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        # was "return null", which raises NameError in Python
        return None

    attrs['type'] = fixName(oc)

    # rename id to ismi_id
    attrs['ismi_id'] = ent['id']

    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

    # create node with attributes
    node = gdb.nodes.create(**attrs)
    # add labels
    node.labels.add([project_label, fixName(etype)])
    return node
160 | |
161 | |
def relsFromEnt(ent, relations):
    """Collect the relations of a JSON entity.

    Every relation in the entity's 'src_rels' and 'tar_rels' is stored in the
    dict relations under the relation's id. A relation that differs from the
    one already stored under its id is reported and not stored.

    Args:
        ent: JSON entity
        relations: dict of JSON relations by id (is modified)
    Returns:
        the dict relations
    """
    for rel in ent.get('src_rels', []) + ent.get('tar_rels', []):
        rel_id = rel['id']
        if rel_id in relations and rel != relations[rel_id]:
            print("ERROR: relation is different: %s != %s"%(repr(rel), repr(relations[rel_id])))
            continue

        relations[rel_id] = rel

    return relations
180 | |
181 | |
def _contractRelation(node, rel_name, other):
    """Set the label of other as attribute on node, named after the relation."""
    att_name = fixName(rel_name, att_from_rel=True)
    # TODO: clean up attribute names
    while node.get(att_name, None) is not None:
        # attribute exists
        if att_name[-1].isnumeric():
            # increment last digit
            att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
        else:
            att_name += '2'

    # NOTE(review): other.get('label') has no default -- confirm what happens
    # for nodes that were created without a label.
    node.set(att_name, other.get('label'))


def n4jrelationsFromRels(rels, nodes):
    """Create relations in Neo4J.

    Relations whose source or target node is not in nodes are reported and
    skipped. If the object class of one end of a relation is listed in
    contract_relations_into_attributes, the label of that end's node is also
    set as an attribute on the node at the other end. If
    add_inverse_relations is set, a second relation with a "<" name is
    created from target to source.

    Args:
        rels: dict of JSON relations
        nodes: dict of existing Neo4J nodes
    Returns:
        dict of Neo4J relations (the module-level n4j_relations); the values
        are lists of two relations if add_inverse_relations is set
    """
    # go through all rels
    print("importing %s relations"%len(rels))
    for cnt, rel in enumerate(rels.values(), 1):
        if cnt % 100 == 0:
            print(" %s relations"%cnt)

        rel_id = rel['id']
        rel_name = rel['name']
        src_id = rel['src_id']
        tar_id = rel['tar_id']
        src = nodes.get(src_id, None)
        if src is None:
            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
            continue

        tar = nodes.get(tar_id, None)
        if tar is None:
            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
            continue

        if contract_relations_into_attributes:
            # contract source relations: target's label goes to the source
            if rel['tar_oc'] in contract_relations_into_attributes:
                _contractRelation(src, rel_name, tar)

            # contract target relations: source's label goes to the target
            if rel['src_oc'] in contract_relations_into_attributes:
                _contractRelation(tar, rel_name, src)

        if add_inverse_relations:
            n4j_rel = [gdb.relationships.create(src, fixName(rel_name, is_src_rel=True), tar),
                       gdb.relationships.create(tar, fixName(rel_name, is_tar_rel=True), src)]

        else:
            n4j_rel = gdb.relationships.create(src, fixName(rel_name), tar)

        n4j_relations[rel_id] = n4j_rel

    return n4j_relations
258 | |
259 | |
def importEnts(etype):
    """Import all entities of the given type.

    Reads the list of entities of the type from OpenMind, fetches their full
    content in batches and creates a Neo4J node for each entity. The nodes
    are stored in n4j_nodes by id and the entities' relations are collected
    in ismi_relations.

    Args:
        etype: object class of the entities to import
    """
    # read json for all entities of given type
    # (don't call the result "json", that hides the json module)
    ents = readJSON(entsURL%etype)['ents']
    print("importing %s %ss"%(len(ents),etype))
    size = 100
    cnt = 0
    for pos in range(0, len(ents), size):
        batch = ents[pos:pos + size]

        # extract list of ismi ids
        ismi_ids = [str(ent['id']) for ent in batch]

        # fetch full data for list of entities
        ent_json = readJSON(entsByIdURL%','.join(ismi_ids))

        # iterate through results batch
        for ent_data in ent_json['ents']:
            ismi_id = ent_data['id']
            if ismi_id in n4j_nodes:
                print("ERROR: entity with id=%s exists!"%ismi_id)
                # NOTE(review): this ends the import of the whole type, not
                # just of this entity -- confirm that this is intended.
                return

            # create neo4j node
            node = nodeFromEnt(ent_data, etype)

            # save node reference
            n4j_nodes[ismi_id] = node

            # extract relations
            relsFromEnt(ent_data, ismi_relations)

        # report the number of entities requested so far (the last batch
        # may be smaller than size)
        cnt += len(batch)
        print(" %s %ss"%(cnt, etype))
300 | |
301 | |
302 # In[119]: | |
303 | |
def importAllEnts(etypes):
    """Import the entities of all given types and then their relations.

    Args:
        etypes: list of object classes to import
    """
    for ent_type in etypes:
        importEnts(ent_type)

    # create the relations when all nodes exist
    n4jrelationsFromRels(ismi_relations, n4j_nodes)
310 | |
311 | |
312 # In[120]: | |
313 | |
# run the import for all definitions read from OpenMind
# (alternative: a fixed list of types)
#importAllEnts(ismi_types)
importAllEnts(etypes=ismi_defs)