Mercurial > hg > drupalISMI
comparison importFromOpenMind/importer/ismi2neo4j.py @ 16:de0a06eef13b
new neo4j importer for network visualisation frontend.
author | casties |
---|---|
date | Fri, 28 Aug 2015 17:24:45 +0200 |
parents | |
children | 4dfd832e9cd9 |
comparison
equal
deleted
inserted
replaced
15:61767ff5ce2b | 16:de0a06eef13b |
---|---|
1 import urllib.request | |
2 import json | |
3 from neo4jrestclient.client import GraphDatabase, Node | |
4 | |
5 # In[111]: | |
# Fixed subset of ISMI object classes that can be passed to importAllEnts.
ismi_types = [
    "PERSON",
    "WITNESS",
    "CODEX",
    "PLACE",
    "COLLECTION",
    "REPOSITORY",
]

# Root of the JSON interface; each request appends its query string to it.
baseURL = "http://localhost:18080/ismi-richfaces/jsonInterface?"

# Template for listing the entities of one object class (%s = class name).
entsURL = baseURL + "method=get_ents&oc=%s"

# Template for fetching one entity with its content (%s = entity id).
entURL = baseURL + "method=get_ent&id=%s&include_content=True"
13 | |
14 | |
def readJSON(url):
    """Fetch *url* and return its body parsed as JSON.

    The body is decoded as UTF-8 before it is handed to the JSON parser.
    The response is used as a context manager so the underlying connection
    is closed as soon as the body has been read, even if reading fails.

    Errors from urllib.request.urlopen and json.loads are not caught here.
    """
    with urllib.request.urlopen(url) as wsh:
        txt = wsh.read()

    return json.loads(txt.decode("utf-8"))
19 | |
# Fetch the definitions from the server once, when the module is loaded.
defs_json = readJSON(baseURL+"method=get_defs")

# The 'ov' value of every definition; used as the list of types to import.
ismi_defs = [definition['ov'] for definition in defs_json['defs']]


# Connection to the local neo4j REST endpoint.
gdb = GraphDatabase(
    "http://localhost:7474/db/data/",
    username="neo4j",
    password="neo5j",
)

# ismi id -> neo4j node, filled by importEnts
n4j_nodes = dict()
# relation id -> relation dict as read from the entity JSON, filled by relsFromEnt
ismi_relations = dict()
# relation id -> created neo4j relationship, filled by n4jrelationsFromRels
n4j_relations = dict()

# when True, importEnts asks getNode for an existing node before creating one
keep_nodes = False

# attribute names that nodeFromEnt does not copy onto the node
ent_exclude_attrs = ['lw', 'node_type', 'nov']
38 | |
39 | |
def getNode(ismi_id=None):
    """Return the first neo4j node whose ismi_id property equals *ismi_id*.

    Returns None when no ismi_id is given or when the query has no result.
    """
    if ismi_id is None:
        return None

    # Query for the requested id (a hard-coded id here would make every
    # lookup return the same node).
    # NOTE(review): the id is interpolated unquoted into the Cypher text, so
    # this assumes ismi_id is numeric -- confirm against the entity JSON.
    res = gdb.query("match (n {ismi_id: %s}) return n"%ismi_id, returns=(Node))
    if len(res) > 0:
        return res[0]

    return None
47 | |
def nodeFromEnt(ent, etype):
    """Create a neo4j node from the JSON description of one entity.

    ent   -- entity dict; 'oc', 'id' and 'atts' are read, 'ov' is optional
    etype -- expected object class; also added to the node as a label

    Attributes with content_type 'text', 'arabic', 'bool' or 'url' are copied
    onto the node unless their name is listed in ent_exclude_attrs.  All
    other attributes (dates, 'old' and unknown content types) are skipped.
    The entity's 'oc', 'id' and 'ov' are stored as 'type', 'ismi_id' and
    'label' and take precedence over attributes with the same names.

    Returns the created node, or None (after printing an error) when the
    entity's object class differs from etype.
    """
    # check the type first so a mismatch costs no further work
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        return None

    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        if ct not in ['text', 'arabic', 'bool', 'url']:
            # date attributes are not imported yet; 'old' and unknown
            # content types are ignored
            continue

        key = att['name']
        if key in ent_exclude_attrs:
            # exclude attribute
            continue

        # keep attribute
        attrs[key] = att['ov']

    # process base attributes
    attrs['type'] = oc
    # rename id to ismi_id
    attrs['ismi_id'] = ent['id']

    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

    # create node with attributes
    node = gdb.nodes.create(**attrs)
    # add labels
    node.labels.add(['project_ismi', etype])
    return node
102 | |
103 | |
104 # In[77]: | |
105 | |
def relsFromEnt(ent, relations):
    """Collect the relations of one entity into *relations*, keyed by id.

    ent       -- entity dict; its optional 'src_rels' and 'tar_rels' lists
                 are read
    relations -- dict mapping relation id to relation dict; updated in place

    A relation whose id is already present with different content is
    reported and the stored version is kept.

    Returns the same *relations* dict.
    """
    outgoing = ent.get('src_rels', [])
    incoming = ent.get('tar_rels', [])
    for rel in outgoing + incoming:
        key = rel['id']
        if key in relations and rel != relations[key]:
            # conflicting duplicate: report it and keep what we have
            print("ERROR: relation is different: %s != %s"%(repr(rel), repr(relations[key])))
        else:
            relations[key] = rel

    return relations
120 | |
121 | |
122 # In[110]: | |
123 | |
def n4jrelationsFromRels(rels, nodes):
    """Create a neo4j relationship for every relation in *rels*.

    rels  -- dict of relation dicts; each provides 'id', 'name', 'src_id'
             and 'tar_id'
    nodes -- dict mapping ismi id to neo4j node

    Relations whose source or target node is not in *nodes* are reported and
    skipped.  Progress is printed every 100 relations.

    Returns the module-level n4j_relations dict, which maps relation id to
    the created relationship.
    """
    print("importing %s relations"%len(rels))
    for count, rel in enumerate(rels.values(), 1):
        if count % 100 == 0:
            print(" %s relations"%count)

        rel_id = rel['id']
        rel_name = rel['name']
        src_id = rel['src_id']
        tar_id = rel['tar_id']

        source = nodes.get(src_id)
        if source is None:
            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
            continue

        target = nodes.get(tar_id)
        if target is None:
            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
            continue

        n4j_relations[rel_id] = gdb.relationships.create(source, rel_name, target)

    return n4j_relations
151 | |
152 | |
153 # In[114]: | |
154 | |
def importEnts(etype):
    """Import all entities of object class *etype* as neo4j nodes.

    Reads the entity list for the class, then fetches each entity in full,
    creates its node (or reuses an existing one when keep_nodes is set),
    records the node in n4j_nodes and collects the entity's relations into
    ismi_relations.  Progress is printed every 100 entities.

    If an entity id has already been imported, an error is printed and the
    import of this class stops.
    """
    # read json for all entities of given type
    ents_json = readJSON(entsURL%etype)
    ents = ents_json['ents']
    print("importing %s %ss"%(len(ents),etype))
    for cnt, ent in enumerate(ents, 1):
        if cnt % 100 == 0:
            print(" %s %ss"%(cnt, etype))

        # extract ismi id
        ismi_id = ent['id']

        # check for duplicates before touching the server or the database,
        # so a duplicate never leaves an extra node behind
        if ismi_id in n4j_nodes:
            print("ERROR: entity with id=%s exists!"%ismi_id)
            return

        # fetch full data for entity
        ent_json = readJSON(entURL%ismi_id)
        ent_data = ent_json['ent']

        # reuse an existing neo4j node if requested, otherwise create one
        node = getNode(ismi_id) if keep_nodes else None
        if node is None:
            node = nodeFromEnt(ent_data, etype)

        # save node reference
        n4j_nodes[ismi_id] = node

        # extract relations
        relsFromEnt(ent_data, ismi_relations)
193 | |
194 | |
195 # In[119]: | |
196 | |
def importAllEnts(etypes):
    """Import the entities of every object class in *etypes*.

    Once all classes have been imported, the relations collected in
    ismi_relations are created between the nodes recorded in n4j_nodes.
    """
    for object_class in etypes:
        importEnts(object_class)

    # relations come last so that both end nodes can already be known
    n4jrelationsFromRels(rels=ismi_relations, nodes=n4j_nodes)
203 | |
204 | |
205 # In[120]: | |
206 | |
if __name__ == "__main__":
    # Run the import only when executed as a script, not when imported.
    # ismi_defs holds every type reported by the server; pass ismi_types
    # instead to import only the fixed subset.
    importAllEnts(ismi_defs)