Mercurial > hg > drupalISMI
comparison importFromOpenMind/importer/ismi2neo4j.py @ 17:4dfd832e9cd9
added automatic creation of inverse relations.
added more attribute types.
author | casties |
---|---|
date | Thu, 03 Sep 2015 18:48:21 +0200 |
parents | de0a06eef13b |
children | 0827156df210 |
comparison
equal
deleted
inserted
replaced
16:de0a06eef13b | 17:4dfd832e9cd9 |
---|---|
1 import urllib.request | 1 import urllib.request |
2 import json | 2 import json |
3 from neo4jrestclient.client import GraphDatabase, Node | 3 from neo4jrestclient.client import GraphDatabase, Node |
4 | 4 |
5 # In[111]: | 5 ## configure behaviour |
6 ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"] | 6 |
7 | 7 # add inverse relations as "<relation" |
8 add_inverse_relations = True | |
9 | |
10 # try to find and re-use existing nodes in neo4j (slow!) | |
11 keep_nodes = False | |
12 | |
13 # label added to all nodes | |
14 project_label = '_ismi_inv_rel' | |
15 | |
16 # OpenMind base URL | |
8 baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?" | 17 baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?" |
18 | |
19 # neo4j base URL | |
20 neo4jBaseURL = "http://localhost:7474/db/data/" | |
21 | |
9 | 22 |
10 entsURL=baseURL+"method=get_ents&oc=%s" | 23 entsURL=baseURL+"method=get_ents&oc=%s" |
11 | 24 |
12 entURL=baseURL+"method=get_ent&id=%s&include_content=True" | 25 entURL=baseURL+"method=get_ent&id=%s&include_content=True" |
13 | 26 |
17 txt = wsh.read() | 30 txt = wsh.read() |
18 return json.loads(txt.decode("utf-8")) | 31 return json.loads(txt.decode("utf-8")) |
19 | 32 |
20 defs_json = readJSON(baseURL+"method=get_defs") | 33 defs_json = readJSON(baseURL+"method=get_defs") |
21 | 34 |
35 # current list of all definitions | |
22 ismi_defs = [atts['ov'] for atts in defs_json['defs']] | 36 ismi_defs = [atts['ov'] for atts in defs_json['defs']] |
23 | 37 |
24 | 38 #ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"] |
25 gdb = GraphDatabase("http://localhost:7474/db/data/", username="neo4j", password="neo5j") | 39 |
40 | |
41 gdb = GraphDatabase(neo4jBaseURL, username="neo4j", password="neo5j") | |
26 | 42 |
27 n4j_nodes = {} | 43 n4j_nodes = {} |
28 ismi_relations = {} | 44 ismi_relations = {} |
29 n4j_relations = {} | 45 n4j_relations = {} |
30 | |
31 keep_nodes = False | |
32 | 46 |
33 ent_exclude_attrs = [ | 47 ent_exclude_attrs = [ |
34 'lw', | 48 'lw', |
35 'node_type', | 49 'node_type', |
36 'nov' | 50 'nov' |
37 ] | 51 ] |
38 | 52 |
53 def fixName(name, is_src_rel=False, is_tar_rel=False): | |
54 # these are too embarrassing... | |
55 if 'FLORUIT' in name: | |
56 name = name.replace('FLORUIT', 'FLOURISH') | |
57 | |
58 elif 'floruit' in name: | |
59 name = name.replace('floruit', 'flourish') | |
60 | |
61 if is_src_rel: | |
62 name = name + '>' | |
63 | |
64 if is_tar_rel: | |
65 name = '<' + name | |
66 | |
67 return name | |
68 | |
39 | 69 |
40 def getNode(ismi_id=None): | 70 def getNode(ismi_id=None): |
41 if ismi_id is not None: | 71 if ismi_id is not None: |
42 res = gdb.query("match (n {ismi_id: %s}) return n"%40635, returns=(Node)) | 72 res = gdb.query("match (n {ismi_id: %s}) return n"%ismi_id, returns=(Node)) |
43 if len(res) > 0: | 73 if len(res) > 0: |
44 return res[0] | 74 return res[0] |
45 | 75 |
46 return None | 76 return None |
77 | |
47 | 78 |
48 def nodeFromEnt(ent, etype): | 79 def nodeFromEnt(ent, etype): |
49 attrs = {} | 80 attrs = {} |
50 # go through all attributes | 81 # go through all attributes |
51 for att in ent['atts']: | 82 for att in ent['atts']: |
52 ct = att.get('content_type', None) | 83 ct = att.get('content_type', None) |
53 if ct in ['text', 'arabic', 'bool', 'url']: | 84 if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']: |
54 # normal text attribute | 85 # normal text attribute (assume no content_type is text too...) |
55 key = att['name'] | 86 key = att['name'] |
56 val = att['ov'] | 87 val = att['ov'] |
57 | 88 |
58 if key in ent_exclude_attrs: | 89 if key in ent_exclude_attrs: |
59 # exclude attribute | 90 # exclude attribute |
60 continue | 91 continue |
61 | 92 |
62 # keep attribute | 93 # keep attribute |
63 attrs[key] = val | 94 attrs[key] = val |
95 | |
96 elif ct == 'num': | |
97 # number attribute | |
98 key = att['name'] | |
99 val = att['ov'] | |
100 | |
101 if key in ent_exclude_attrs: | |
102 # exclude attribute | |
103 continue | |
104 | |
105 # keep attribute, assume num is int | |
106 attrs[key] = int(val) | |
64 | 107 |
65 elif ct == 'date': | 108 elif ct == 'date': |
66 # date attribute | 109 # date attribute |
67 key = att['name'] | 110 key = att['name'] |
68 val = att['ov'] | 111 val = att['ov'] |
71 elif ct == 'old': | 114 elif ct == 'old': |
72 # ignore attribute | 115 # ignore attribute |
73 continue | 116 continue |
74 | 117 |
75 else: | 118 else: |
76 #print("WARN: attribute with unknown content_type: %s"%repr(att)) | 119 print("WARN: attribute with unknown content_type: %s"%repr(att)) |
77 # ignore other content types | 120 # ignore other content types |
78 continue | 121 continue |
79 | 122 |
80 # process base attributes | 123 # process base attributes |
81 oc = ent['oc'] | 124 oc = ent['oc'] |
82 if oc != etype: | 125 if oc != etype: |
83 print("ERROR: entity type doesn't match!") | 126 print("ERROR: entity type doesn't match!") |
84 return null | 127 return null |
85 | 128 |
86 attrs['type'] = oc | 129 attrs['type'] = fixName(oc) |
87 | 130 |
88 ismi_id = ent['id'] | 131 ismi_id = ent['id'] |
89 # rename id to ismi_id | 132 # rename id to ismi_id |
90 attrs['ismi_id'] = ismi_id | 133 attrs['ismi_id'] = ismi_id |
91 | 134 |
95 attrs['label'] = ov | 138 attrs['label'] = ov |
96 | 139 |
97 # create node with attributes | 140 # create node with attributes |
98 node = gdb.nodes.create(**attrs) | 141 node = gdb.nodes.create(**attrs) |
99 # add labels | 142 # add labels |
100 node.labels.add(['project_ismi', etype]) | 143 node.labels.add([project_label, fixName(etype)]) |
101 return node | 144 return node |
102 | 145 |
103 | 146 |
104 # In[77]: | 147 # In[77]: |
105 | 148 |
142 tar = nodes.get(tar_id, None) | 185 tar = nodes.get(tar_id, None) |
143 if tar is None: | 186 if tar is None: |
144 print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id)) | 187 print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id)) |
145 continue | 188 continue |
146 | 189 |
147 n4j_rel = gdb.relationships.create(src, rel_name, tar) | 190 if add_inverse_relations: |
191 n4j_rel = [gdb.relationships.create(src, fixName(rel_name, is_src_rel=True), tar), | |
192 gdb.relationships.create(tar, fixName(rel_name, is_tar_rel=True), src)] | |
193 | |
194 else: | |
195 n4j_rel = gdb.relationships.create(src, fixName(rel_name), tar) | |
196 | |
148 n4j_relations[rel_id] = n4j_rel | 197 n4j_relations[rel_id] = n4j_rel |
149 | 198 |
150 return n4j_relations | 199 return n4j_relations |
151 | 200 |
152 | 201 |