annotate importFromOpenMind/importer/ismixml2model.py @ 51:5a633e875490

also read normalized fields from xml.
author casties
date Fri, 03 Mar 2017 20:11:06 +0100
parents 6625019a0c96
children b9a6e596ebe4
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
46
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
1 import xml.etree.ElementTree as ET
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
2 import json
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
3 import networkx
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
4 import sys
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
5
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
## behaviour configuration

# name of the output file
output_fn = 'ismi_graph.gpickle'

# name of the input file
input_fn = 'openmind-data.xml'


# entities whose object-class is listed here are skipped on import
exclude_objects_of_type = [
    'DIGITALIZATION',
    'REFERENCE',
]

# attributes with these names are dropped from nodes and relations
exclude_attributes_of_type = ['lw', 'node_type', 'nov', 'notes_old']

# attribute names under which the type of a node / relation is stored
node_type_attribute = '_type'
rel_type_attribute = '_type'

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
30
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
31
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
# graph that the import functions add nodes and edges to
nx_graph = networkx.MultiDiGraph()

# nx_nodes is filled by importEnts (ismi_id -> node); the two relation
# tables are not touched by the code visible in this part of the file
nx_nodes = dict()
ismi_relations = dict()
nx_relations = dict()
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
37
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
# log levels that are actually printed by log()
# alternative settings:
#logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
logLevels = {'INFO', 'ERROR', 'SYSMSG'}


def log(level, message):
    """Print message prefixed with level if level is an active log level.

    Messages with a level that is not contained in logLevels are dropped.
    """
    if level not in logLevels:
        return

    line = "%s: %s" % (level, message)
    print(line)
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
46
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
47
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Return name adapted for use as a type or attribute name.

    is_src_rel: accepted but leaves the name unchanged.
    is_tar_rel: prefix the name with '<'.
    att_from_rel: strip every occurrence of 'is_', 'has_', 'was_' and
        '_of' (in that order) so a relation name reads as an attribute name.
    """
    # is_src_rel is intentionally a no-op (appending '>' is disabled)
    fixed = '<' + name if is_tar_rel else name

    if not att_from_rel:
        return fixed

    # clean up relations as attribute names
    for fragment in ('is_', 'has_', 'was_', '_of'):
        fixed = fixed.replace(fragment, '')

    return fixed
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
64
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
65
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
66
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def parseYear(val):
    """Extract the year from a JSON-encoded date value.

    val is expected to be a JSON object that has either a 'from' or a
    'date' member, each of which is an object with an optional 'year'
    member; 'from' takes precedence over 'date'.

    Returns the value of that 'year' member, or None if val is not valid
    JSON, does not have the expected structure or contains no year.
    A JSON value with neither 'from' nor 'date' is reported as a WARNING.
    """
    year = None
    try:
        date_json = json.loads(val)
        if 'from' in date_json:
            year = date_json['from'].get('year', None)
        elif 'date' in date_json:
            year = date_json['date'].get('year', None)
        else:
            log("WARNING", "don't know what to do with date %s"%(val))

    except (ValueError, TypeError, AttributeError):
        # Best effort: val is not JSON (ValueError), is not a string or
        # decodes to a non-container (TypeError), or 'from'/'date' is not
        # an object (AttributeError). Anything else (KeyboardInterrupt,
        # programming errors) must not be swallowed here.
        return None

    return year
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
82
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
83
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def nodeFromEnt(ent_elem):
    """Create a graph node from the given XML entity element.

    Collects the entity's attributes into a dict, adds a node with the
    entity's integer id to the module-level nx_graph and returns the
    node's attribute dict as stored in the graph.

    Children of the 'attributes' element are handled by 'content-type':
      * missing or one of text, arabic, bool, boolean, url, language, null
        (case-insensitive): the text is kept; text starting with '{' is
        replaced by the year if parseYear() finds one. The text of a
        'norm' child is stored as '_n_<name>'.
      * date: only the year found by parseYear() is kept.
      * num: the text is converted with int() (ValueError propagates).
      * old and anything else: ignored (unknown types log a WARNING).
    Attributes named in exclude_attributes_of_type are skipped.

    Additionally sets the node type (from 'object-class'), 'ismi_id' and,
    if the element has text, 'label' and '_n_label'.
    """
    text_types = ('text', 'arabic', 'bool', 'boolean', 'url', 'language', 'null')

    # text content of entity element
    # NOTE(review): relationFromRel also appends the tail of the
    # 'attributes' child to its own value; this function does not - confirm
    # that entity text always precedes the child elements.
    ov = ent_elem.text or ''

    attrs = {}

    # get attributes element
    atts_elem = ent_elem.find('attributes')

    if atts_elem is None:
        log('DEBUG', "entity has no attributes: %s"%ent_elem)

    else:
        # go through all attributes
        for att_elem in atts_elem:
            ct = att_elem.get('content-type', None)
            name = att_elem.get('name', None)
            if name in exclude_attributes_of_type:
                # exclude attribute
                continue

            if ct is None or ct.lower() in text_types:
                # normal text attribute (assume no content_type is text too...)
                val = att_elem.text

                # startswith() instead of val[0]: safe for empty strings
                if val is not None and val.startswith('{'):
                    # try to parse as date
                    year = parseYear(val)
                    if year is not None:
                        val = year

                if val is not None:
                    # keep attribute
                    attrs[name] = val
                    # check for normalized value
                    nov = att_elem.findtext('norm')
                    if nov is not None:
                        # add normalized value
                        attrs['_n_'+name] = nov

            elif ct == 'date':
                # date attribute
                val = att_elem.text
                if val is not None:
                    # try to parse date object to get the year
                    year = parseYear(val)
                    if year is not None:
                        attrs[name] = year

            elif ct == 'num':
                # number attribute
                val = att_elem.text
                if val is not None:
                    # keep attribute, assume num is int
                    attrs[name] = int(val)

            elif ct == 'old':
                # ignore attribute
                continue

            else:
                # level must be 'WARNING' (not 'WARN') to match the level
                # names used in logLevels, otherwise it is never printed
                log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))
                # ignore other content types
                continue

    # process base attributes
    oc = ent_elem.get('object-class')

    # set type
    attrs[node_type_attribute] = fixName(oc)

    # the XML id is stored as ismi_id and doubles as the graph node key
    ismi_id = int(ent_elem.get('id'))
    attrs['ismi_id'] = ismi_id

    if len(ov) > 0:
        # save ov as label
        attrs['label'] = ov
        # check for normalized value
        nov = ent_elem.findtext('norm')
        if nov is not None:
            # add normalized value
            attrs['_n_label'] = nov

    # create node
    log('DEBUG', "new node(%s, %s)"%(ismi_id, attrs))
    nx_graph.add_node(ismi_id, **attrs)
    # NOTE(review): '.node' looks like the networkx 1.x accessor; newer
    # releases expose this as '.nodes' - confirm the targeted version.
    node = nx_graph.node[ismi_id]

    return node
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
178
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
179
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def relationFromRel(rel_elem):
    """Create a graph relation (edge) from the given XML relation element.

    Reads 'id', 'object-class', 'source-id' and 'target-id' from the
    element and adds an edge from source to target to the module-level
    nx_graph. The edge attributes are the relation's attributes plus the
    relation type and 'ismi_id'.

    Children of the 'attributes' element are handled by 'content-type':
      * missing or one of text, arabic, bool, boolean, url, language, null
        (case-insensitive): the text is kept; text starting with '{' is
        replaced by the year if parseYear() finds one. The text of a
        'norm' child is stored as '_n_<name>'.
      * date: only the year found by parseYear() is kept.
      * num: the text is converted with int() (ValueError propagates).
      * old and anything else: ignored (unknown types log a WARNING).
    Attributes named in exclude_attributes_of_type are skipped.

    Returns the result of nx_graph.add_edge(), or None (after logging a
    WARNING) if the source or target id is not a key of nx_nodes.
    """
    text_types = ('text', 'arabic', 'bool', 'boolean', 'url', 'language', 'null')

    rel_id = int(rel_elem.get('id'))
    rel_name = rel_elem.get('object-class')
    src_id = int(rel_elem.get('source-id'))
    tar_id = int(rel_elem.get('target-id'))
    if src_id not in nx_nodes:
        log("WARNING", "relation %s src node %s missing!"%(rel_id,src_id))
        return None

    if tar_id not in nx_nodes:
        log("WARNING", "relation %s tar node %s missing!"%(rel_id,tar_id))
        return None

    # own value of the relation; collected but currently not stored
    # (see the disabled code below)
    ov = rel_elem.text or ''

    attrs = {}

    # get attributes element
    atts_elem = rel_elem.find('attributes')

    if atts_elem is not None:
        if atts_elem.tail is not None:
            # tail belongs to parent
            ov += atts_elem.tail

        # go through all attributes
        for att_elem in atts_elem:
            ct = att_elem.get('content-type', None)
            name = att_elem.get('name', None)
            if name in exclude_attributes_of_type:
                # exclude attribute
                continue

            if ct is None or ct.lower() in text_types:
                # normal text attribute (assume no content_type is text too...)
                val = att_elem.text

                # startswith() instead of val[0]: safe for empty strings
                if val is not None and val.startswith('{'):
                    # try to parse as date
                    year = parseYear(val)
                    if year is not None:
                        val = year

                if val is not None:
                    # keep attribute
                    attrs[name] = val
                    # check for normalized value
                    nov = att_elem.findtext('norm')
                    if nov is not None:
                        # add normalized value
                        attrs['_n_'+name] = nov

            elif ct == 'date':
                # date attribute
                val = att_elem.text
                if val is not None:
                    # try to parse date object to get the year
                    year = parseYear(val)
                    if year is not None:
                        attrs[name] = year

            elif ct == 'num':
                # number attribute
                val = att_elem.text
                if val is not None:
                    # keep attribute, assume num is int
                    attrs[name] = int(val)

            elif ct == 'old':
                # ignore attribute
                continue

            else:
                # level must be 'WARNING' (not 'WARN') to match the level
                # names used in logLevels, otherwise it is never printed
                log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))
                # ignore other content types
                continue

    #if len(ov) > 0:
    #    # own value of relation is not useful
    #    attrs['ov'] = ov

    attrs[rel_type_attribute] = fixName(rel_name)
    attrs['ismi_id'] = rel_id
    #log('DEBUG', "new edge(%s, %s, %s)"%(src_id, tar_id, attrs))
    # create relation with type
    # NOTE(review): the 'attr_dict' keyword looks like the networkx 1.x
    # add_edge signature - confirm the targeted networkx version.
    nx_rel = nx_graph.add_edge(src_id, tar_id, attr_dict=attrs)

    return nx_rel
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
270
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
271
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def importEnts(ents_elem):
    """Read all entity children of the etree element ents_elem.

    Children whose 'object-class' attribute is listed in
    exclude_objects_of_type are skipped. Every other child is turned into
    a node by nodeFromEnt() and remembered in nx_nodes under its integer
    'id' attribute. When an id is already present in nx_nodes an error is
    logged and the import stops at that element.
    """
    log('INFO', "XML says %s entities"%ents_elem.get('number'))

    # cnt is the 1-based position of the current child (debug aid only)
    for cnt, ent_elem in enumerate(ents_elem, start=1):
        if ent_elem.get('object-class') in exclude_objects_of_type:
            # entity type is excluded from the graph
            continue

        ent_id = int(ent_elem.get('id'))
        log('DEBUG', "reading entity[%s]"%ent_id)

        if ent_id in nx_nodes:
            log("ERROR", "entity with id=%s exists!"%ent_id)
            return

        # create the networkx node and keep a reference to it
        nx_nodes[ent_id] = nodeFromEnt(ent_elem)

        # debug
        #if cnt >= 100:
        #    return
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
304
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
305
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def importRels(rels_elem):
    """Import all relations from the etree element rels_elem.

    Every child of rels_elem is converted by relationFromRel() and the
    result is stored in nx_relations under the child's integer 'id'
    attribute. When an id is already present in nx_relations an error is
    logged and the import stops at that element.
    """
    log('INFO', "XML says %s relations"%rels_elem.get('number'))

    # iterate through relations element
    for rel_elem in rels_elem:
        ismi_id = int(rel_elem.get('id'))
        log('DEBUG', "reading relation[%s]"%ismi_id)

        if ismi_id in nx_relations:
            # report through log() like importEnts does, not a bare print
            log('ERROR', "relation with id=%s exists!"%ismi_id)
            return

        # create networkx relation and save a reference to it
        nx_relations[ismi_id] = relationFromRel(rel_elem)
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
333
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
334
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def importAll():
    """Parse the XML file input_fn and import its entities and relations.

    The 'entities' child of the root element is handed to importEnts(),
    then the 'relations' child is handed to importRels().

    Raises:
        ValueError: if the root element has no 'entities' or no
            'relations' child.
    """
    # parse XML file
    log('INFO', "parsing XML file %s"%input_fn)
    tree = ET.parse(input_fn)
    log('DEBUG', "etree ready")
    root = tree.getroot()

    ents = root.find('entities')
    if ents is None:
        # find() returns None for a missing child; fail with a clear
        # message instead of an AttributeError inside importEnts()
        raise ValueError("no 'entities' element in XML file %s"%input_fn)
    importEnts(ents)

    rels = root.find('relations')
    if rels is None:
        raise ValueError("no 'relations' element in XML file %s"%input_fn)
    importRels(rels)
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
346
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
## main

print("Copy graph from OpenMind-XML to networkx pickle")

# optional command line parameters: input file, then output file
if len(sys.argv) >= 2:
    input_fn = sys.argv[1]

if len(sys.argv) >= 3:
    output_fn = sys.argv[2]

# read the whole XML dump into the graph
print("Reading graph from OpenMind-XML file %s" % input_fn)
if exclude_objects_of_type:
    print(" Skipping objects of type %s" % exclude_objects_of_type)

importAll()

print("Graph info: %s" % networkx.info(nx_graph))
#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))

# write the graph as a networkx pickle
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote networkx pickle file %s" % output_fn)