Mercurial > hg > drupalISMI
comparison importFromOpenMind/importer/ismi2neo4j.py @ 18:0827156df210
added contraction of relations into attributes.
added some docstrings.
author | casties |
---|---|
date | Mon, 07 Sep 2015 16:57:10 +0200 |
parents | 4dfd832e9cd9 |
children | ca1e02a2a9c4 |
comparison
equal
deleted
inserted
replaced
17:4dfd832e9cd9 | 18:0827156df210 |
---|---|
5 ## configure behaviour | 5 ## configure behaviour |
6 | 6 |
7 # add inverse relations as "<relation" | 7 # add inverse relations as "<relation" |
8 add_inverse_relations = True | 8 add_inverse_relations = True |
9 | 9 |
10 # add relations to these objects as attributes with the relations name | |
11 contract_relations_into_attributes = ['PLACE', 'ALIAS'] | |
12 | |
10 # try to find and re-use existing nodes in neo4j (slow!) | 13 # try to find and re-use existing nodes in neo4j (slow!) |
11 keep_nodes = False | 14 keep_nodes = False |
12 | 15 |
13 # label added to all nodes | 16 # label added to all nodes |
14 project_label = '_ismi_inv_rel' | 17 project_label = '_ismi2' |
15 | 18 |
16 # OpenMind base URL | 19 # OpenMind base URL |
17 baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?" | 20 baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?" |
18 | 21 |
19 # neo4j base URL | 22 # neo4j base URL |
48 'lw', | 51 'lw', |
49 'node_type', | 52 'node_type', |
50 'nov' | 53 'nov' |
51 ] | 54 ] |
52 | 55 |
53 def fixName(name, is_src_rel=False, is_tar_rel=False): | 56 def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False): |
54 # these are too embarrasing... | 57 # these are too embarrassing... |
55 if 'FLORUIT' in name: | 58 if 'FLORUIT' in name: |
56 name = name.replace('FLORUIT', 'FLOURISH') | 59 name = name.replace('FLORUIT', 'FLOURISH') |
57 | 60 |
58 elif 'floruit' in name: | 61 elif 'floruit' in name: |
59 name = name.replace('floruit', 'flourish') | 62 name = name.replace('floruit', 'flourish') |
60 | 63 |
61 if is_src_rel: | 64 if is_src_rel: |
62 name = name + '>' | 65 #name = name + '>' |
66 pass | |
63 | 67 |
64 if is_tar_rel: | 68 if is_tar_rel: |
65 name = '<' + name | 69 name = '<' + name |
70 | |
71 if att_from_rel: | |
72 # clean up relations as attribute names | |
73 name = name.replace('is_', '') | |
74 name = name.replace('has_', '') | |
75 name = name.replace('was_', '') | |
76 name = name.replace('_of', '') | |
66 | 77 |
67 return name | 78 return name |
68 | 79 |
69 | 80 |
70 def getNode(ismi_id=None): | 81 def getNode(ismi_id=None): |
75 | 86 |
76 return None | 87 return None |
77 | 88 |
78 | 89 |
79 def nodeFromEnt(ent, etype): | 90 def nodeFromEnt(ent, etype): |
91 """Create a Neo4J node from the given JSON entity. | |
92 | |
93 Creates the node in gdb and returns the node. | |
94 """ | |
80 attrs = {} | 95 attrs = {} |
81 # go through all attributes | 96 # go through all attributes |
82 for att in ent['atts']: | 97 for att in ent['atts']: |
83 ct = att.get('content_type', None) | 98 ct = att.get('content_type', None) |
84 if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']: | 99 if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']: |
142 # add labels | 157 # add labels |
143 node.labels.add([project_label, fixName(etype)]) | 158 node.labels.add([project_label, fixName(etype)]) |
144 return node | 159 return node |
145 | 160 |
146 | 161 |
147 # In[77]: | |
148 | |
149 def relsFromEnt(ent, relations): | 162 def relsFromEnt(ent, relations): |
163 """Extract all relations from JSON entity. | |
164 | |
165 Adds JSON to dict relations under relation's id. | |
166 """ | |
150 # go through src_rels and tar_rels | 167 # go through src_rels and tar_rels |
151 rels = ent.get('src_rels', []) + ent.get('tar_rels', []) | 168 rels = ent.get('src_rels', []) + ent.get('tar_rels', []) |
152 for rel in rels: | 169 for rel in rels: |
153 rel_id = rel['id'] | 170 rel_id = rel['id'] |
154 if rel_id in relations: | 171 if rel_id in relations: |
160 relations[rel_id] = rel | 177 relations[rel_id] = rel |
161 | 178 |
162 return relations | 179 return relations |
163 | 180 |
164 | 181 |
165 # In[110]: | |
166 | |
167 def n4jrelationsFromRels(rels, nodes): | 182 def n4jrelationsFromRels(rels, nodes): |
183 """Create relations in Neo4J. | |
184 | |
185 Args: | |
186 rels: dict of JSON relations | |
187 nodes: dict of existing Neo4J nodes | |
188 Returns: | |
189 dict of Neo4J relations | |
190 """ | |
168 # go through all rels | 191 # go through all rels |
169 print("importing %s relations"%len(rels)) | 192 print("importing %s relations"%len(rels)) |
170 cnt = 0 | 193 cnt = 0 |
171 for rel in rels.values(): | 194 for rel in rels.values(): |
172 cnt += 1 | 195 cnt += 1 |
185 tar = nodes.get(tar_id, None) | 208 tar = nodes.get(tar_id, None) |
186 if tar is None: | 209 if tar is None: |
187 print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id)) | 210 print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id)) |
188 continue | 211 continue |
189 | 212 |
213 if contract_relations_into_attributes: | |
214 # contract source relations | |
215 tar_type = rel['tar_oc'] | |
216 if tar_type in contract_relations_into_attributes: | |
217 att_name = fixName(rel_name, att_from_rel=True) | |
218 # TODO: clean up attribute names | |
219 while src.get(att_name, None) is not None: | |
220 # attribute exists | |
221 if att_name[-1].isnumeric(): | |
222 # increment last digit | |
223 att_name = att_name[:-1] + str(int(att_name[-1]) + 1) | |
224 else: | |
225 att_name += '2' | |
226 | |
227 # add target node's label as attribute | |
228 #print("contracting tar to attribute %s on id=%s"%(att_name, src_id)) | |
229 src.set(att_name, tar.get('label')) | |
230 | |
231 # contract target relations | |
232 src_type = rel['src_oc'] | |
233 if src_type in contract_relations_into_attributes: | |
234 att_name = fixName(rel_name, att_from_rel=True) | |
235 # TODO: clean up attribute names | |
236 while tar.get(att_name, None) is not None: | |
237 # attribute exists | |
238 if att_name[-1].isnumeric(): | |
239 # increment last digit | |
240 att_name = att_name[:-1] + str(int(att_name[-1]) + 1) | |
241 else: | |
242 att_name += '2' | |
243 | |
244 # add target node's label as attribute | |
245 #print("contracting src to attribute %s on id=%s"%(att_name, tar_id)) | |
246 src.set(att_name, src.get('label')) | |
247 | |
190 if add_inverse_relations: | 248 if add_inverse_relations: |
191 n4j_rel = [gdb.relationships.create(src, fixName(rel_name, is_src_rel=True), tar), | 249 n4j_rel = [gdb.relationships.create(src, fixName(rel_name, is_src_rel=True), tar), |
192 gdb.relationships.create(tar, fixName(rel_name, is_tar_rel=True), src)] | 250 gdb.relationships.create(tar, fixName(rel_name, is_tar_rel=True), src)] |
193 | 251 |
194 else: | 252 else: |
197 n4j_relations[rel_id] = n4j_rel | 255 n4j_relations[rel_id] = n4j_rel |
198 | 256 |
199 return n4j_relations | 257 return n4j_relations |
200 | 258 |
201 | 259 |
202 # In[114]: | |
203 | |
204 def importEnts(etype): | 260 def importEnts(etype): |
261 """Import all entities of the given type. | |
262 """ | |
205 # read json for all entities of given type | 263 # read json for all entities of given type |
206 json = readJSON(entsURL%etype) | 264 json = readJSON(entsURL%etype) |
207 ents = json['ents'] | 265 ents = json['ents'] |
208 print("importing %s %ss"%(len(ents),etype)) | 266 print("importing %s %ss"%(len(ents),etype)) |
209 cnt = 0 | 267 cnt = 0 |
222 ent_data = ent_json['ent'] | 280 ent_data = ent_json['ent'] |
223 # create neo4j node | 281 # create neo4j node |
224 if keep_nodes: | 282 if keep_nodes: |
225 node = getNode(ismi_id) | 283 node = getNode(ismi_id) |
226 | 284 |
227 if node is None: | |
228 node = nodeFromEnt(ent_data, etype) | |
229 | |
230 if ismi_id in n4j_nodes: | 285 if ismi_id in n4j_nodes: |
231 print("ERROR: entity with id=%s exists!"%ismi_id) | 286 print("ERROR: entity with id=%s exists!"%ismi_id) |
232 return | 287 return |
233 | 288 |
289 if node is None: | |
290 node = nodeFromEnt(ent_data, etype) | |
291 | |
234 # save node reference | 292 # save node reference |
235 n4j_nodes[ismi_id] = node | 293 n4j_nodes[ismi_id] = node |
236 | 294 |
237 # extract relations | 295 # extract relations |
238 relsFromEnt(ent_data, ismi_relations) | 296 relsFromEnt(ent_data, ismi_relations) |