comparison importFromOpenMind/importer/ismi2neo4j.py @ 18:0827156df210

added contraction of relations into attributes. added some docstrings.
author casties
date Mon, 07 Sep 2015 16:57:10 +0200
parents 4dfd832e9cd9
children ca1e02a2a9c4
comparison
equal deleted inserted replaced
17:4dfd832e9cd9 18:0827156df210
5 ## configure behaviour 5 ## configure behaviour
6 6
7 # add inverse relations as "<relation" 7 # add inverse relations as "<relation"
8 add_inverse_relations = True 8 add_inverse_relations = True
9 9
10 # add relations to these objects as attributes with the relation's name
11 contract_relations_into_attributes = ['PLACE', 'ALIAS']
12
10 # try to find and re-use existing nodes in neo4j (slow!) 13 # try to find and re-use existing nodes in neo4j (slow!)
11 keep_nodes = False 14 keep_nodes = False
12 15
13 # label added to all nodes 16 # label added to all nodes
14 project_label = '_ismi_inv_rel' 17 project_label = '_ismi2'
15 18
16 # OpenMind base URL 19 # OpenMind base URL
17 baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?" 20 baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?"
18 21
19 # neo4j base URL 22 # neo4j base URL
48 'lw', 51 'lw',
49 'node_type', 52 'node_type',
50 'nov' 53 'nov'
51 ] 54 ]
52 55
53 def fixName(name, is_src_rel=False, is_tar_rel=False): 56 def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
54 # these are too embarrasing... 57 # these are too embarrassing...
55 if 'FLORUIT' in name: 58 if 'FLORUIT' in name:
56 name = name.replace('FLORUIT', 'FLOURISH') 59 name = name.replace('FLORUIT', 'FLOURISH')
57 60
58 elif 'floruit' in name: 61 elif 'floruit' in name:
59 name = name.replace('floruit', 'flourish') 62 name = name.replace('floruit', 'flourish')
60 63
61 if is_src_rel: 64 if is_src_rel:
62 name = name + '>' 65 #name = name + '>'
66 pass
63 67
64 if is_tar_rel: 68 if is_tar_rel:
65 name = '<' + name 69 name = '<' + name
70
71 if att_from_rel:
72 # clean up relations as attribute names
73 name = name.replace('is_', '')
74 name = name.replace('has_', '')
75 name = name.replace('was_', '')
76 name = name.replace('_of', '')
66 77
67 return name 78 return name
68 79
69 80
70 def getNode(ismi_id=None): 81 def getNode(ismi_id=None):
75 86
76 return None 87 return None
77 88
78 89
79 def nodeFromEnt(ent, etype): 90 def nodeFromEnt(ent, etype):
91 """Create a Neo4J node from the given JSON entity.
92
93 Creates the node in gdb and returns the node.
94 """
80 attrs = {} 95 attrs = {}
81 # go through all attributes 96 # go through all attributes
82 for att in ent['atts']: 97 for att in ent['atts']:
83 ct = att.get('content_type', None) 98 ct = att.get('content_type', None)
84 if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']: 99 if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
142 # add labels 157 # add labels
143 node.labels.add([project_label, fixName(etype)]) 158 node.labels.add([project_label, fixName(etype)])
144 return node 159 return node
145 160
146 161
147 # In[77]:
148
149 def relsFromEnt(ent, relations): 162 def relsFromEnt(ent, relations):
163 """Extract all relations from JSON entity.
164
165 Adds JSON to dict relations under relation's id.
166 """
150 # go through src_rels and tar_rels 167 # go through src_rels and tar_rels
151 rels = ent.get('src_rels', []) + ent.get('tar_rels', []) 168 rels = ent.get('src_rels', []) + ent.get('tar_rels', [])
152 for rel in rels: 169 for rel in rels:
153 rel_id = rel['id'] 170 rel_id = rel['id']
154 if rel_id in relations: 171 if rel_id in relations:
160 relations[rel_id] = rel 177 relations[rel_id] = rel
161 178
162 return relations 179 return relations
163 180
164 181
165 # In[110]:
166
167 def n4jrelationsFromRels(rels, nodes): 182 def n4jrelationsFromRels(rels, nodes):
183 """Create relations in Neo4J.
184
185 Args:
186 rels: dict of JSON relations
187 nodes: dict of existing Neo4J nodes
188 Returns:
189 dict of Neo4J relations
190 """
168 # go through all rels 191 # go through all rels
169 print("importing %s relations"%len(rels)) 192 print("importing %s relations"%len(rels))
170 cnt = 0 193 cnt = 0
171 for rel in rels.values(): 194 for rel in rels.values():
172 cnt += 1 195 cnt += 1
185 tar = nodes.get(tar_id, None) 208 tar = nodes.get(tar_id, None)
186 if tar is None: 209 if tar is None:
187 print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id)) 210 print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
188 continue 211 continue
189 212
213 if contract_relations_into_attributes:
214 # contract source relations
215 tar_type = rel['tar_oc']
216 if tar_type in contract_relations_into_attributes:
217 att_name = fixName(rel_name, att_from_rel=True)
218 # TODO: clean up attribute names
219 while src.get(att_name, None) is not None:
220 # attribute exists
221 if att_name[-1].isnumeric():
222 # increment last digit
223 att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
224 else:
225 att_name += '2'
226
227 # add target node's label as attribute
228 #print("contracting tar to attribute %s on id=%s"%(att_name, src_id))
229 src.set(att_name, tar.get('label'))
230
231 # contract target relations
232 src_type = rel['src_oc']
233 if src_type in contract_relations_into_attributes:
234 att_name = fixName(rel_name, att_from_rel=True)
235 # TODO: clean up attribute names
236 while tar.get(att_name, None) is not None:
237 # attribute exists
238 if att_name[-1].isnumeric():
239 # increment last digit
240 att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
241 else:
242 att_name += '2'
243
244 # add source node's label as attribute
245 #print("contracting src to attribute %s on id=%s"%(att_name, tar_id))
246 src.set(att_name, src.get('label'))
247
190 if add_inverse_relations: 248 if add_inverse_relations:
191 n4j_rel = [gdb.relationships.create(src, fixName(rel_name, is_src_rel=True), tar), 249 n4j_rel = [gdb.relationships.create(src, fixName(rel_name, is_src_rel=True), tar),
192 gdb.relationships.create(tar, fixName(rel_name, is_tar_rel=True), src)] 250 gdb.relationships.create(tar, fixName(rel_name, is_tar_rel=True), src)]
193 251
194 else: 252 else:
197 n4j_relations[rel_id] = n4j_rel 255 n4j_relations[rel_id] = n4j_rel
198 256
199 return n4j_relations 257 return n4j_relations
200 258
201 259
202 # In[114]:
203
204 def importEnts(etype): 260 def importEnts(etype):
261 """Import all entities of the given type.
262 """
205 # read json for all entities of given type 263 # read json for all entities of given type
206 json = readJSON(entsURL%etype) 264 json = readJSON(entsURL%etype)
207 ents = json['ents'] 265 ents = json['ents']
208 print("importing %s %ss"%(len(ents),etype)) 266 print("importing %s %ss"%(len(ents),etype))
209 cnt = 0 267 cnt = 0
222 ent_data = ent_json['ent'] 280 ent_data = ent_json['ent']
223 # create neo4j node 281 # create neo4j node
224 if keep_nodes: 282 if keep_nodes:
225 node = getNode(ismi_id) 283 node = getNode(ismi_id)
226 284
227 if node is None:
228 node = nodeFromEnt(ent_data, etype)
229
230 if ismi_id in n4j_nodes: 285 if ismi_id in n4j_nodes:
231 print("ERROR: entity with id=%s exists!"%ismi_id) 286 print("ERROR: entity with id=%s exists!"%ismi_id)
232 return 287 return
233 288
289 if node is None:
290 node = nodeFromEnt(ent_data, etype)
291
234 # save node reference 292 # save node reference
235 n4j_nodes[ismi_id] = node 293 n4j_nodes[ismi_id] = node
236 294
237 # extract relations 295 # extract relations
238 relsFromEnt(ent_data, ismi_relations) 296 relsFromEnt(ent_data, ismi_relations)