comparison importFromOpenMind/importer/ismi2neo4j.py @ 17:4dfd832e9cd9

added automatic creation of inverse relations. added more attribute types.
author casties
date Thu, 03 Sep 2015 18:48:21 +0200
parents de0a06eef13b
children 0827156df210
comparison
equal deleted inserted replaced
16:de0a06eef13b 17:4dfd832e9cd9
1 import urllib.request 1 import urllib.request
2 import json 2 import json
3 from neo4jrestclient.client import GraphDatabase, Node 3 from neo4jrestclient.client import GraphDatabase, Node
4 4
5 # In[111]: 5 ## configure behaviour
6 ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"] 6
7 7 # add inverse relations as "<relation" (forward relations are then named "relation>")
8 add_inverse_relations = True
9
10 # try to find and re-use existing nodes in neo4j (slow!)
11 keep_nodes = False
12
13 # label added to all nodes
14 project_label = '_ismi_inv_rel'
15
16 # OpenMind base URL
8 baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?" 17 baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?"
18
19 # neo4j base URL
20 neo4jBaseURL = "http://localhost:7474/db/data/"
21
9 22
10 entsURL=baseURL+"method=get_ents&oc=%s" 23 entsURL=baseURL+"method=get_ents&oc=%s"
11 24
12 entURL=baseURL+"method=get_ent&id=%s&include_content=True" 25 entURL=baseURL+"method=get_ent&id=%s&include_content=True"
13 26
17 txt = wsh.read() 30 txt = wsh.read()
18 return json.loads(txt.decode("utf-8")) 31 return json.loads(txt.decode("utf-8"))
19 32
20 defs_json = readJSON(baseURL+"method=get_defs") 33 defs_json = readJSON(baseURL+"method=get_defs")
21 34
35 # current list of all definitions
22 ismi_defs = [atts['ov'] for atts in defs_json['defs']] 36 ismi_defs = [atts['ov'] for atts in defs_json['defs']]
23 37
24 38 #ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]
25 gdb = GraphDatabase("http://localhost:7474/db/data/", username="neo4j", password="neo5j") 39
40
41 gdb = GraphDatabase(neo4jBaseURL, username="neo4j", password="neo5j")
26 42
27 n4j_nodes = {} 43 n4j_nodes = {}
28 ismi_relations = {} 44 ismi_relations = {}
29 n4j_relations = {} 45 n4j_relations = {}
30
31 keep_nodes = False
32 46
33 ent_exclude_attrs = [ 47 ent_exclude_attrs = [
34 'lw', 48 'lw',
35 'node_type', 49 'node_type',
36 'nov' 50 'nov'
37 ] 51 ]
38 52
53 def fixName(name, is_src_rel=False, is_tar_rel=False):
54 # these are too embarrassing...
55 if 'FLORUIT' in name:
56 name = name.replace('FLORUIT', 'FLOURISH')
57
58 elif 'floruit' in name:
59 name = name.replace('floruit', 'flourish')
60
61 if is_src_rel:
62 name = name + '>'
63
64 if is_tar_rel:
65 name = '<' + name
66
67 return name
68
39 69
40 def getNode(ismi_id=None): 70 def getNode(ismi_id=None):
41 if ismi_id is not None: 71 if ismi_id is not None:
42 res = gdb.query("match (n {ismi_id: %s}) return n"%40635, returns=(Node)) 72 res = gdb.query("match (n {ismi_id: %s}) return n"%ismi_id, returns=(Node))
43 if len(res) > 0: 73 if len(res) > 0:
44 return res[0] 74 return res[0]
45 75
46 return None 76 return None
77
47 78
48 def nodeFromEnt(ent, etype): 79 def nodeFromEnt(ent, etype):
49 attrs = {} 80 attrs = {}
50 # go through all attributes 81 # go through all attributes
51 for att in ent['atts']: 82 for att in ent['atts']:
52 ct = att.get('content_type', None) 83 ct = att.get('content_type', None)
53 if ct in ['text', 'arabic', 'bool', 'url']: 84 if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
54 # normal text attribute 85 # normal text attribute (assume no content_type is text too...)
55 key = att['name'] 86 key = att['name']
56 val = att['ov'] 87 val = att['ov']
57 88
58 if key in ent_exclude_attrs: 89 if key in ent_exclude_attrs:
59 # exclude attribute 90 # exclude attribute
60 continue 91 continue
61 92
62 # keep attribute 93 # keep attribute
63 attrs[key] = val 94 attrs[key] = val
95
96 elif ct == 'num':
97 # number attribute
98 key = att['name']
99 val = att['ov']
100
101 if key in ent_exclude_attrs:
102 # exclude attribute
103 continue
104
105 # keep attribute, assume num is int
106 attrs[key] = int(val)
64 107
65 elif ct == 'date': 108 elif ct == 'date':
66 # date attribute 109 # date attribute
67 key = att['name'] 110 key = att['name']
68 val = att['ov'] 111 val = att['ov']
71 elif ct == 'old': 114 elif ct == 'old':
72 # ignore attribute 115 # ignore attribute
73 continue 116 continue
74 117
75 else: 118 else:
76 #print("WARN: attribute with unknown content_type: %s"%repr(att)) 119 print("WARN: attribute with unknown content_type: %s"%repr(att))
77 # ignore other content types 120 # ignore other content types
78 continue 121 continue
79 122
80 # process base attributes 123 # process base attributes
81 oc = ent['oc'] 124 oc = ent['oc']
82 if oc != etype: 125 if oc != etype:
83 print("ERROR: entity type doesn't match!") 126 print("ERROR: entity type doesn't match!")
84 return null 127 return null
85 128
86 attrs['type'] = oc 129 attrs['type'] = fixName(oc)
87 130
88 ismi_id = ent['id'] 131 ismi_id = ent['id']
89 # rename id to ismi_id 132 # rename id to ismi_id
90 attrs['ismi_id'] = ismi_id 133 attrs['ismi_id'] = ismi_id
91 134
95 attrs['label'] = ov 138 attrs['label'] = ov
96 139
97 # create node with attributes 140 # create node with attributes
98 node = gdb.nodes.create(**attrs) 141 node = gdb.nodes.create(**attrs)
99 # add labels 142 # add labels
100 node.labels.add(['project_ismi', etype]) 143 node.labels.add([project_label, fixName(etype)])
101 return node 144 return node
102 145
103 146
104 # In[77]: 147 # In[77]:
105 148
142 tar = nodes.get(tar_id, None) 185 tar = nodes.get(tar_id, None)
143 if tar is None: 186 if tar is None:
144 print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id)) 187 print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
145 continue 188 continue
146 189
147 n4j_rel = gdb.relationships.create(src, rel_name, tar) 190 if add_inverse_relations:
191 n4j_rel = [gdb.relationships.create(src, fixName(rel_name, is_src_rel=True), tar),
192 gdb.relationships.create(tar, fixName(rel_name, is_tar_rel=True), src)]
193
194 else:
195 n4j_rel = gdb.relationships.create(src, fixName(rel_name), tar)
196
148 n4j_relations[rel_id] = n4j_rel 197 n4j_relations[rel_id] = n4j_rel
149 198
150 return n4j_relations 199 return n4j_relations
151 200
152 201