comparison importFromOpenMind/importer/ismi2neo4j.py @ 16:de0a06eef13b

new neo4j importer for network visualisation frontend.
author casties
date Fri, 28 Aug 2015 17:24:45 +0200
parents
children 4dfd832e9cd9
comparison
equal deleted inserted replaced
15:61767ff5ce2b 16:de0a06eef13b
1 import urllib.request
2 import json
3 from neo4jrestclient.client import GraphDatabase, Node
4
5 # In[111]:
# fixed list of entity type names; in this file it is only referenced by the
# commented-out importAllEnts(ismi_types) call at the bottom
ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]

# base URL of the JSON interface; the "method=..." query string is appended to it
baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?"

# URL template for the list of entities; %s is filled with the entity type (oc)
entsURL=baseURL+"method=get_ents&oc=%s"

# URL template for a single entity including its content; %s is filled with the entity id
entURL=baseURL+"method=get_ent&id=%s&include_content=True"
13
14
def readJSON(url):
    """Fetch *url* and return its body parsed as JSON.

    The response body is decoded as UTF-8 before it is parsed.

    Args:
        url: URL to open with urllib.request.urlopen.

    Returns:
        The value produced by json.loads for the response body.

    Raises:
        urllib.error.URLError: if the URL cannot be opened.
        ValueError: if the body is not valid UTF-8 or not valid JSON.
    """
    # the context manager closes the response even when reading fails,
    # so repeated calls do not leak open connections
    with urllib.request.urlopen(url) as wsh:
        txt = wsh.read()

    return json.loads(txt.decode("utf-8"))
19
# definitions are fetched from the JSON interface when the module is loaded
defs_json = readJSON(baseURL+"method=get_defs")

# the 'ov' value of every definition; passed to importAllEnts at the bottom
# of the file (presumably these are the entity type names -- TODO confirm)
ismi_defs = [atts['ov'] for atts in defs_json['defs']]


# connection to the neo4j REST API
# NOTE(review): URL and credentials are hard-coded here
gdb = GraphDatabase("http://localhost:7474/db/data/", username="neo4j", password="neo5j")

# ismi entity id -> neo4j node (filled by importEnts)
n4j_nodes = {}
# relation id -> relation dict from the JSON interface (filled by relsFromEnt)
ismi_relations = {}
# relation id -> neo4j relationship (filled by n4jrelationsFromRels)
n4j_relations = {}

# when True, importEnts first calls getNode(ismi_id) and only creates a new
# node if that returns None
keep_nodes = False

# attribute names that nodeFromEnt does not copy to the neo4j node
ent_exclude_attrs = [
    'lw',
    'node_type',
    'nov'
]
38
39
def getNode(ismi_id=None):
    """Return the neo4j node whose ismi_id property equals *ismi_id*.

    Args:
        ismi_id: id of the ISMI entity to look up; None skips the query.

    Returns:
        The first matching Node, or None when no id was given or the query
        returned no result.
    """
    if ismi_id is None:
        return None

    # query for the requested id; the query used to contain a fixed id
    # (40635), so every lookup returned the same node
    # NOTE(review): the id is interpolated into the Cypher text like before;
    # presumably ids are integers -- confirm before passing other values
    res = gdb.query("match (n {ismi_id: %s}) return n" % ismi_id, returns=(Node))
    if len(res) > 0:
        return res[0]

    return None
47
def nodeFromEnt(ent, etype):
    """Create a neo4j node from an ISMI entity.

    Attributes with content_type 'text', 'arabic', 'bool' or 'url' are copied
    to the node unless their name is listed in ent_exclude_attrs. Attributes
    of any other content type ('date', 'old', unknown or missing) are not
    imported. In addition the node gets the properties 'type' (the entity's
    'oc'), 'ismi_id' (the entity's 'id') and, if the entity has an 'ov',
    'label'. The node is labelled with 'project_ismi' and *etype*.

    Args:
        ent: entity dict with the keys 'atts', 'oc', 'id' and optionally 'ov'.
        etype: expected entity type; must equal ent['oc'].

    Returns:
        The created node, or None (after printing an error) when the entity
        type does not match *etype*.

    Raises:
        KeyError: if a required key is missing from *ent* or from an imported
            attribute.
    """
    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        if ct not in ['text', 'arabic', 'bool', 'url']:
            # 'date' attributes are not imported yet; 'old' and all other
            # content types are ignored
            continue

        # normal text attribute
        key = att['name']
        val = att['ov']

        if key in ent_exclude_attrs:
            # exclude attribute
            continue

        # keep attribute
        attrs[key] = val

    # process base attributes
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        # None, not the undefined name "null" which raised a NameError
        return None

    attrs['type'] = oc

    # rename id to ismi_id
    attrs['ismi_id'] = ent['id']

    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

    # create node with attributes
    node = gdb.nodes.create(**attrs)
    # add labels
    node.labels.add(['project_ismi', etype])
    return node
102
103
104 # In[77]:
105
def relsFromEnt(ent, relations):
    """Collect the relations of an entity into *relations*.

    The entity's 'src_rels' and 'tar_rels' lists (both optional) are merged
    into the dict *relations*, keyed by relation id. When a relation id is
    already present with different content an error is printed and the
    stored relation is kept.

    Args:
        ent: entity dict, optionally containing 'src_rels' and 'tar_rels'.
        relations: dict of relation id -> relation; updated in place.

    Returns:
        The same *relations* dict.
    """
    outgoing = ent.get('src_rels', [])
    incoming = ent.get('tar_rels', [])
    for relation in outgoing + incoming:
        key = relation['id']
        if key in relations and relation != relations[key]:
            # same id but different content: report and keep the stored one
            known = relations[key]
            print("ERROR: relation is different: %s != %s"%(repr(relation), repr(known)))
            continue

        relations[key] = relation

    return relations
120
121
122 # In[110]:
123
def n4jrelationsFromRels(rels, nodes):
    """Create a neo4j relationship for every relation in *rels*.

    A relation is skipped with an error message when its source or target
    node is not in *nodes*. Created relationships are stored in the
    module-level dict n4j_relations under the relation id. Progress is
    printed every 100 relations.

    Args:
        rels: dict of relation id -> relation dict with the keys 'id',
            'name', 'src_id' and 'tar_id'.
        nodes: dict of entity id -> neo4j node.

    Returns:
        The module-level n4j_relations dict.
    """
    print("importing %s relations"%len(rels))
    for count, relation in enumerate(rels.values(), 1):
        # progress output
        if count % 100 == 0:
            print(" %s relations"%count)

        rid = relation['id']
        label = relation['name']
        source_id = relation['src_id']
        target_id = relation['tar_id']

        # both ends have to be imported already
        source = nodes.get(source_id, None)
        if source is None:
            print("ERROR: relation %s src node %s missing!"%(rid,source_id))
            continue

        target = nodes.get(target_id, None)
        if target is None:
            print("ERROR: relation %s tar node %s missing!"%(rid,target_id))
            continue

        n4j_relations[rid] = gdb.relationships.create(source, label, target)

    return n4j_relations
151
152
153 # In[114]:
154
def importEnts(etype):
    """Import all ISMI entities of type *etype* as neo4j nodes.

    The list of entities is read from entsURL, then every entity is fetched
    in full from entURL. The node for an entity is taken from getNode when
    keep_nodes is set and that call returns a node, otherwise it is created
    with nodeFromEnt. Nodes are stored in n4j_nodes under the entity id and
    the entity's relations are collected into ismi_relations. Progress is
    printed every 100 entities.

    If an entity id is already present in n4j_nodes an error is printed and
    the import of this entity type stops.

    Args:
        etype: entity type (object class) to import.
    """
    # read json for all entities of given type
    # (not called "json" so the local does not shadow the json module)
    ents_json = readJSON(entsURL%etype)
    ents = ents_json['ents']
    print("importing %s %ss"%(len(ents),etype))
    for cnt, ent in enumerate(ents, 1):
        if cnt % 100 == 0:
            print(" %s %ss"%(cnt, etype))

        # extract ismi id
        ismi_id = ent['id']

        # check for duplicates before fetching and creating anything:
        # checking after node creation left an extra node in the database
        if ismi_id in n4j_nodes:
            print("ERROR: entity with id=%s exists!"%ismi_id)
            return

        # fetch full data for entity
        ent_json = readJSON(entURL%ismi_id)
        ent_data = ent_json['ent']

        # reuse or create neo4j node
        node = None
        if keep_nodes:
            node = getNode(ismi_id)

        if node is None:
            node = nodeFromEnt(ent_data, etype)

        # save node reference
        n4j_nodes[ismi_id] = node

        # extract relations
        relsFromEnt(ent_data, ismi_relations)
193
194
195 # In[119]:
196
def importAllEnts(etypes):
    """Import the entities of every type in *etypes*, then their relations.

    Args:
        etypes: iterable of entity types, each passed to importEnts.
    """
    # nodes first: n4jrelationsFromRels skips relations whose nodes are missing
    for entity_type in etypes:
        importEnts(entity_type)

    # every type is imported, now create the collected relations
    n4jrelationsFromRels(ismi_relations, n4j_nodes)
203
204
205 # In[120]:
206
# run the import when the file is executed: every value in ismi_defs is
# passed to importEnts as an entity type
# (disabled alternative: import only the fixed list in ismi_types)
#importAllEnts(ismi_types)
importAllEnts(ismi_defs)