16
|
1 import urllib.request
|
|
2 import json
|
|
3 from neo4jrestclient.client import GraphDatabase, Node
|
|
4
|
|
5 # In[111]:
|
|
# Hard-coded list of ISMI entity classes.
ismi_types = [
    "PERSON",
    "WITNESS",
    "CODEX",
    "PLACE",
    "COLLECTION",
    "REPOSITORY",
]

# Base address of the ISMI JSON interface; query strings are appended to it.
baseURL = "http://localhost:18080/ismi-richfaces/jsonInterface?"

# URL template listing all entities of one class (%s = entity class).
entsURL = baseURL + "method=get_ents&oc=%s"

# URL template fetching one entity including its content (%s = entity id).
entURL = baseURL + "method=get_ent&id=%s&include_content=True"
|
|
13
|
|
14
|
|
def readJSON(url):
    """Fetch *url* and return its body parsed as JSON.

    The response body is decoded as UTF-8 before parsing.

    Args:
        url: address to open with urllib.request.urlopen.

    Returns:
        The decoded JSON value (whatever json.loads produces).
    """
    # The context manager closes the response even if read() fails, so the
    # many requests made during an import do not leak connections.
    with urllib.request.urlopen(url) as response:
        raw = response.read()

    return json.loads(raw.decode("utf-8"))
|
|
19
|
|
# Entity class definitions as reported by the server; their 'ov' values are
# the class names handed to the import below.
defs_json = readJSON(baseURL + "method=get_defs")

ismi_defs = [definition['ov'] for definition in defs_json['defs']]


# Connection to the local neo4j REST endpoint.
# NOTE(review): the credentials are hard-coded here.
gdb = GraphDatabase(
    "http://localhost:7474/db/data/",
    username="neo4j",
    password="neo5j",
)

# ismi_id -> neo4j node, filled by importEnts
n4j_nodes = {}
# relation id -> relation dict, filled by relsFromEnt
ismi_relations = {}
# relation id -> neo4j relationship, filled by n4jrelationsFromRels
n4j_relations = {}

# when True, importEnts looks for an existing node via getNode before
# creating a new one
keep_nodes = False

# attribute names that nodeFromEnt does not copy onto the node
ent_exclude_attrs = ['lw', 'node_type', 'nov']
|
|
38
|
|
39
|
|
def getNode(ismi_id=None):
    """Look up an existing neo4j node by its ismi_id property.

    Args:
        ismi_id: value of the node's ``ismi_id`` property; None skips the
            lookup.

    Returns:
        The first result of the query, or None when ismi_id is None or the
        query returns nothing.
    """
    if ismi_id is None:
        return None

    # Query for the requested id (a fixed literal id was interpolated here
    # before, so every lookup returned the same node).
    # NOTE(review): the id is formatted straight into the Cypher text; this
    # assumes ids are numeric values coming from the ISMI server.
    res = gdb.query("match (n {ismi_id: %s}) return n" % ismi_id, returns=(Node))
    # explicit length check: the result is a library object, not a plain list
    if len(res) > 0:
        return res[0]

    return None
|
|
47
|
|
def nodeFromEnt(ent, etype):
    """Create a neo4j node for one ISMI entity and return it.

    Attributes whose content_type is 'text', 'arabic', 'bool' or 'url' become
    node properties unless their name is listed in ent_exclude_attrs. All
    other content types (including 'date' and 'old') are skipped. The base
    fields are stored as 'type' (ent['oc']), 'ismi_id' (ent['id']) and, when
    present, 'label' (ent['ov']); they overwrite attributes of the same name.

    Args:
        ent: entity dict; reads 'atts', 'oc', 'id' and the optional 'ov'.
        etype: expected entity class; also added as a node label.

    Returns:
        The node created through gdb.nodes.create, or None when ent['oc']
        does not equal etype.
    """
    # check the entity class first so a mismatch creates nothing
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        # explicit None so callers can detect the mismatch
        return None

    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        # only plain-value content types are kept as node properties
        if att.get('content_type') not in ('text', 'arabic', 'bool', 'url'):
            continue

        key = att['name']
        if key in ent_exclude_attrs:
            # exclude attribute
            continue

        # keep attribute
        attrs[key] = att['ov']

    # process base attributes
    attrs['type'] = oc
    # rename id to ismi_id
    attrs['ismi_id'] = ent['id']

    ov = ent.get('ov')
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

    # create node with attributes
    node = gdb.nodes.create(**attrs)
    # add labels
    node.labels.add(['project_ismi', etype])
    return node
|
|
102
|
|
103
|
|
104 # In[77]:
|
|
105
|
|
def relsFromEnt(ent, relations):
    """Collect the relations of one entity into *relations*.

    Every relation listed under the entity's 'src_rels' and 'tar_rels' is
    stored under its 'id'. When an id is already present with different
    content, an error is printed and the stored relation is kept.

    Args:
        ent: entity dict; 'src_rels' and 'tar_rels' are optional lists.
        relations: dict of relation id -> relation, updated in place.

    Returns:
        The same *relations* dict.
    """
    outgoing = ent.get('src_rels', [])
    incoming = ent.get('tar_rels', [])

    for rel in outgoing + incoming:
        rel_id = rel['id']
        conflicting = rel_id in relations and rel != relations[rel_id]
        if conflicting:
            # keep the relation that was seen first
            print("ERROR: relation is different: %s != %s"%(repr(rel), repr(relations[rel_id])))
        else:
            relations[rel_id] = rel

    return relations
|
|
120
|
|
121
|
|
122 # In[110]:
|
|
123
|
|
def n4jrelationsFromRels(rels, nodes):
    """Create a neo4j relationship for every relation in *rels*.

    A relation is skipped, with an error message, when its source or target
    node is not found in *nodes*. Progress is printed every 100 relations.

    Args:
        rels: dict of relation dicts; each provides 'id', 'name', 'src_id'
            and 'tar_id'.
        nodes: dict mapping entity ids to neo4j nodes.

    Returns:
        The module-level n4j_relations dict, which maps relation id to the
        created relationship.
    """
    print("importing %s relations"%len(rels))
    for done, rel in enumerate(rels.values(), start=1):
        if done % 100 == 0:
            print(" %s relations"%done)

        rel_id, rel_name = rel['id'], rel['name']
        src_id, tar_id = rel['src_id'], rel['tar_id']

        # both end points must have been imported as nodes
        src = nodes.get(src_id)
        if src is None:
            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
            continue

        tar = nodes.get(tar_id)
        if tar is None:
            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
            continue

        n4j_relations[rel_id] = gdb.relationships.create(src, rel_name, tar)

    return n4j_relations
|
|
151
|
|
152
|
|
153 # In[114]:
|
|
154
|
|
def importEnts(etype):
    """Import all entities of class *etype* as neo4j nodes.

    For each entity the full record is fetched, a node is created (or, when
    keep_nodes is set, looked up first via getNode), the node is stored in
    n4j_nodes under the entity id, and the entity's relations are collected
    into ismi_relations. Progress is printed every 100 entities.

    The import of this class stops at the first entity whose id is already
    in n4j_nodes.

    Args:
        etype: entity class name, inserted into entsURL.
    """
    # read json for all entities of given type
    ents = readJSON(entsURL%etype)['ents']
    print("importing %s %ss"%(len(ents),etype))
    for cnt, ent in enumerate(ents, start=1):
        if cnt % 100 == 0:
            print(" %s %ss"%(cnt, etype))

        # extract ismi id
        ismi_id = ent['id']

        # Check for duplicates before fetching or creating anything, so an
        # already imported id does not leave a second node in the database.
        if ismi_id in n4j_nodes:
            print("ERROR: entity with id=%s exists!"%ismi_id)
            return

        # fetch full data for entity
        ent_data = readJSON(entURL%ismi_id)['ent']

        # reuse an existing neo4j node if requested, otherwise create one
        node = getNode(ismi_id) if keep_nodes else None
        if node is None:
            node = nodeFromEnt(ent_data, etype)

        # save node reference
        n4j_nodes[ismi_id] = node

        # extract relations
        relsFromEnt(ent_data, ismi_relations)
|
|
190
|
|
191 #if cnt >= 100:
|
|
192 # return
|
|
193
|
|
194
|
|
195 # In[119]:
|
|
196
|
|
def importAllEnts(etypes):
    """Import the entities of every class in *etypes*, then link them.

    Args:
        etypes: iterable of entity class names, each passed to importEnts.
    """
    # nodes first: relationships are only created between imported nodes
    for entity_class in etypes:
        importEnts(entity_class)

    # create the relationships collected while importing the entities
    n4jrelationsFromRels(rels=ismi_relations, nodes=n4j_nodes)
|
|
203
|
|
204
|
|
205 # In[120]:
|
|
206
|
|
# Run the import for every entity class reported by the server's definitions.
# (Pass the hard-coded ismi_types list instead to import only those classes.)
importAllEnts(etypes=ismi_defs)
|