16
|
1 import urllib.request
|
|
2 import json
|
|
3 from neo4jrestclient.client import GraphDatabase, Node
|
|
4
|
17
|
## Behaviour switches

# When True, each relation is stored twice: once from source to target
# (name suffixed with ">") and once from target to source (name prefixed
# with "<").
add_inverse_relations = True

# When True, look for an already existing neo4j node before creating a
# new one (slow!).
keep_nodes = False

# Label attached to every node created by this script.
project_label = '_ismi_inv_rel'

## Service endpoints

# Base URL of the neo4j REST interface.
neo4jBaseURL = "http://localhost:7474/db/data/"

# Base URL of the OpenMind JSON interface.
baseURL = "http://localhost:18080/ismi-richfaces/jsonInterface?"

# URL template listing all entities of one type (fill in the type name).
entsURL = baseURL + "method=get_ents&oc=%s"

# URL template for one full entity including its content (fill in the id).
entURL = baseURL + "method=get_ent&id=%s&include_content=True"
|
|
26
|
|
27
|
|
def readJSON(url):
    """Fetch *url* and return its body parsed as JSON.

    The response body is read completely, decoded as UTF-8 and handed to
    ``json.loads``.

    Args:
        url: the URL to open with ``urllib.request.urlopen``.

    Returns:
        The decoded JSON value.
    """
    # the context manager closes the response even if read() raises
    with urllib.request.urlopen(url) as response:
        body = response.read()

    return json.loads(body.decode("utf-8"))
|
|
32
|
|
# Ask OpenMind for its definitions (this request runs when the module loads).
defs_json = readJSON(baseURL + "method=get_defs")

# current list of all definitions: the 'ov' value of every entry in 'defs'
ismi_defs = [definition['ov'] for definition in defs_json['defs']]

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]


# connection to the neo4j server
gdb = GraphDatabase(neo4jBaseURL, username="neo4j", password="neo5j")

# neo4j nodes handled so far, keyed by ismi id
n4j_nodes = {}
# relation records read from the entities, keyed by relation id
ismi_relations = {}
# neo4j relationships created so far, keyed by relation id
n4j_relations = {}

# attribute names that are never copied onto a neo4j node
ent_exclude_attrs = ['lw', 'node_type', 'nov']
|
|
52
|
17
|
def fixName(name, is_src_rel=False, is_tar_rel=False):
    """Return *name* adjusted for use as a neo4j type or relation name.

    'FLORUIT' is rewritten to 'FLOURISH'; only if the upper-case form does
    not occur is 'floruit' rewritten to 'flourish'.  With is_src_rel a
    trailing '>' is appended, with is_tar_rel a leading '<' is prepended.
    """
    # only the first spelling found is rewritten (upper case wins)
    for old, new in (('FLORUIT', 'FLOURISH'), ('floruit', 'flourish')):
        if old in name:
            name = name.replace(old, new)
            break

    prefix = '<' if is_tar_rel else ''
    suffix = '>' if is_src_rel else ''
    return prefix + name + suffix
|
|
68
|
16
|
69
|
|
def getNode(ismi_id=None):
    """Look up an existing neo4j node by its ``ismi_id`` property.

    Args:
        ismi_id: the id to search for; None skips the lookup.

    Returns:
        The first node returned by the query, or None when *ismi_id* is
        None or the query yields no result.
    """
    if ismi_id is None:
        return None

    # Hand the id over as a query parameter instead of formatting it into
    # the Cypher text, so string ids are not pasted in unquoted.
    # NOTE(review): this uses the `{name}` placeholder form; newer Cypher
    # versions spell it `$ismi_id` -- confirm against the server in use.
    res = gdb.query("match (n {ismi_id: {ismi_id}}) return n",
                    params={'ismi_id': ismi_id}, returns=(Node))
    if len(res) > 0:
        return res[0]

    return None
|
|
77
|
17
|
78
|
16
|
def nodeFromEnt(ent, etype):
    """Create a neo4j node from a full entity record.

    Args:
        ent: entity dict.  Reads 'atts' (attribute dicts with 'name', 'ov'
            and an optional 'content_type'), 'oc', 'id' and the optional
            'ov'.
        etype: the entity type the caller expects; must equal ent['oc'].

    Returns:
        The created node, or None when ent['oc'] differs from *etype*
        (nothing is created in that case).

    Raises:
        ValueError: if a 'num' attribute value cannot be converted by int().
    """
    text_types = ('text', 'arabic', 'bool', 'boolean', 'url', 'language')
    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        # NOTE(review): only the text types are matched case-insensitively;
        # 'num', 'date' and 'old' must be lower case to be recognised.
        if ct is None or ct.lower() in text_types:
            # normal text attribute (assume no content_type is text too...)
            convert = None
        elif ct == 'num':
            # number attribute, assume num is int
            convert = int
        elif ct in ('date', 'old'):
            # date attributes are not imported, old attributes are ignored
            continue
        else:
            print("WARN: attribute with unknown content_type: %s"%repr(att))
            # ignore other content types
            continue

        key = att['name']
        val = att['ov']
        if key in ent_exclude_attrs:
            # exclude attribute
            continue

        # keep attribute
        attrs[key] = val if convert is None else convert(val)

    # process base attributes
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        # was `return null`, which raises NameError in Python
        return None

    attrs['type'] = fixName(oc)

    # rename id to ismi_id
    attrs['ismi_id'] = ent['id']

    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

    # create node with attributes
    node = gdb.nodes.create(**attrs)
    # add labels
    node.labels.add([project_label, fixName(etype)])
    return node
|
|
145
|
|
146
|
|
147 # In[77]:
|
|
148
|
|
def relsFromEnt(ent, relations):
    """Collect the relations of one entity into *relations*.

    Every entry of ent['src_rels'] and ent['tar_rels'] (each optional) is
    stored under its 'id'.  A relation whose id is already present is not
    overwritten; if it differs from the stored one an error is printed.

    Returns the same *relations* dict that was passed in.
    """
    outgoing = ent.get('src_rels', [])
    incoming = ent.get('tar_rels', [])
    for rel in outgoing + incoming:
        rel_id = rel['id']
        if rel_id not in relations:
            # first time this relation is seen
            relations[rel_id] = rel
        elif rel != relations[rel_id]:
            old_rel = relations[rel_id]
            print("ERROR: relation is different: %s != %s"%(repr(rel), repr(old_rel)))

    return relations
|
|
163
|
|
164
|
|
165 # In[110]:
|
|
166
|
|
def n4jrelationsFromRels(rels, nodes):
    """Create neo4j relationships for all relation records in *rels*.

    Args:
        rels: dict of relation records; each record provides 'id', 'name',
            'src_id' and 'tar_id'.
        nodes: dict mapping ids to neo4j nodes.

    A relation whose source or target node is missing from *nodes* is
    reported and skipped.  With add_inverse_relations set, two
    relationships (forward and inverse) are created and stored as a list;
    otherwise a single relationship is stored.

    Returns the module-level n4j_relations dict, keyed by relation id.
    """
    print("importing %s relations"%len(rels))
    for cnt, rel in enumerate(rels.values(), 1):
        # progress report every 100 relations
        if cnt % 100 == 0:
            print(" %s relations"%cnt)

        rel_id, rel_name = rel['id'], rel['name']
        src_id, tar_id = rel['src_id'], rel['tar_id']

        src = nodes.get(src_id, None)
        if src is None:
            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
            continue

        tar = nodes.get(tar_id, None)
        if tar is None:
            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
            continue

        if add_inverse_relations:
            forward = gdb.relationships.create(src, fixName(rel_name, is_src_rel=True), tar)
            inverse = gdb.relationships.create(tar, fixName(rel_name, is_tar_rel=True), src)
            n4j_relations[rel_id] = [forward, inverse]
        else:
            n4j_relations[rel_id] = gdb.relationships.create(src, fixName(rel_name), tar)

    return n4j_relations
|
|
200
|
|
201
|
|
202 # In[114]:
|
|
203
|
|
def importEnts(etype):
    """Import all entities of type *etype* as neo4j nodes.

    Reads the entity list for *etype*, then fetches every entity in full.
    For each one a node is looked up (only when keep_nodes is set) or
    created, remembered in n4j_nodes under the entity id, and the entity's
    relations are collected into ismi_relations.

    If an entity id is already present in n4j_nodes an error is printed and
    the import of this type stops.  The check runs before the entity is
    fetched, so no second node is created for an id that is already known.
    """
    # read json for all entities of given type
    # (not named `json`, so the imported module is not shadowed)
    ents = readJSON(entsURL%etype)['ents']
    print("importing %s %ss"%(len(ents),etype))
    for cnt, ent in enumerate(ents, 1):
        # progress report every 100 entities
        if cnt % 100 == 0:
            print(" %s %ss"%(cnt, etype))

        # extract ismi id
        ismi_id = ent['id']

        # refuse duplicates before anything is fetched or created
        if ismi_id in n4j_nodes:
            print("ERROR: entity with id=%s exists!"%ismi_id)
            return

        # fetch full data for entity
        ent_data = readJSON(entURL%ismi_id)['ent']

        # re-use an existing neo4j node if requested, otherwise create one
        node = getNode(ismi_id) if keep_nodes else None
        if node is None:
            node = nodeFromEnt(ent_data, etype)

        # save node reference
        n4j_nodes[ismi_id] = node

        # extract relations
        relsFromEnt(ent_data, ismi_relations)

        # debugging aid: uncomment to stop after the first 100 entities
        #if cnt >= 100:
        #    return
|
|
242
|
|
243
|
|
244 # In[119]:
|
|
245
|
|
def importAllEnts(etypes):
    """Import the entities of every type in *etypes*, then their relations.

    The relationships are created in a single pass after all types have
    been imported, using the collected ismi_relations and n4j_nodes.
    """
    for entity_type in etypes:
        importEnts(entity_type)

    n4jrelationsFromRels(ismi_relations, n4j_nodes)
|
|
252
|
|
253
|
|
254 # In[120]:
|
|
255
|
|
# Run the import when the script is executed: every definition listed in
# ismi_defs is imported as an entity type.  The commented-out call imports
# the fixed ismi_types list instead.
#importAllEnts(ismi_types)
importAllEnts(ismi_defs)
|