annotate importFromOpenMind/importer/ismixml2model.py @ 55:5a1a4af235eb

fix fix of transfer of ownvalue to _label attribute.
author casties
date Fri, 21 Apr 2017 19:08:09 +0200
parents b9a6e596ebe4
children be1c7d6814b6
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
46
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
1 import xml.etree.ElementTree as ET
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
2 import json
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
3 import networkx
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
4 import sys
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
5
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
## configure behaviour

# output filename
output_fn = "ismi_graph.gpickle"

# input filename
input_fn = "openmind-data.xml"


# object classes of entities that are left out of the graph
exclude_objects_of_type = [
    'DIGITALIZATION',
    'REFERENCE',
]

# attribute names that are not copied to nodes or relations
exclude_attributes_of_type = ['lw', 'node_type', 'nov', 'notes_old']

# attribute keys under which the type of a node / relation is stored
node_type_attribute = '_type'
rel_type_attribute = '_type'

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]


# the graph the imported nodes and relations are added to
nx_graph = networkx.MultiDiGraph()

# module-level lookup tables; nx_nodes is checked by ismi id to detect
# duplicate entities and relations whose end nodes are missing
nx_nodes = {}
ismi_relations = {}
nx_relations = {}

# log levels that log() actually prints; more verbose alternatives:
#logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
logLevels = {'INFO', 'ERROR', 'SYSMSG'}
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
42
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def log(level, message):
    """Print message prefixed with its level, if that level is active.

    Messages whose level is not contained in the module-level logLevels
    set are dropped silently.
    """
    if level not in logLevels:
        return

    print("%s: %s" % (level, message))
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
46
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
47
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Return name adjusted for use as a type or attribute name.

    name: the original name, e.g. the object-class of an entity or relation.
    is_src_rel: kept for symmetry with is_tar_rel; source relation names
        are currently left unchanged.
    is_tar_rel: if true, '<' is prepended to the name.
    att_from_rel: if true, the filler words 'is_', 'has_', 'was_' and '_of'
        are removed so that a relation name reads like an attribute name
        (e.g. 'is_part_of' becomes 'part').

    Without any flag the name is returned as it is.
    """
    import re

    if is_tar_rel:
        name = '<' + name

    if att_from_rel:
        # clean up relations as attribute names.
        # Only whole words are removed: a plain str.replace() would also
        # cut 'is_' out of 'analysis_of' or '_of' out of '_office'.
        name = re.sub(r'(?<![A-Za-z0-9])(?:is|has|was)_', '', name)
        name = re.sub(r'_of(?![A-Za-z0-9])', '', name)

    return name
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
64
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
65
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
66
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def parseYear(val):
    """Return the year contained in a JSON encoded date value, or None.

    val: JSON text of a date object. The year is read from the 'year'
        member of its 'from' object or, if there is no 'from', of its
        'date' object.

    Returns the value of 'year' as it is stored in the JSON. Returns None
    if val can not be parsed as JSON, does not have the expected structure
    or does not contain a year. A value that parses but has neither 'from'
    nor 'date' is additionally reported with a WARNING log message.
    """
    try:
        date_json = json.loads(val)
        if 'from' in date_json:
            return date_json['from'].get('year', None)

        if 'date' in date_json:
            return date_json['date'].get('year', None)

    except (ValueError, TypeError, AttributeError):
        # best effort: invalid JSON (ValueError), a value that is not text
        # (TypeError) or members that are not objects (TypeError,
        # AttributeError) just mean that there is no year to extract.
        return None

    log("WARNING", "don't know what to do with date %s"%(val))
    return None
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
82
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
83
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def _attrsFromElem(atts_elem):
    """Return a dict with the values of all attribute elements in atts_elem.

    Shared by nodeFromEnt and relationFromRel. Attributes whose name is
    listed in exclude_attributes_of_type, attributes of content-type 'old'
    and attributes without text are left out.

    Raises ValueError if an attribute of content-type 'num' does not
    contain an integer.
    """
    text_types = ('text', 'arabic', 'bool', 'boolean', 'url', 'language', 'null')
    attrs = {}

    for att_elem in atts_elem:
        name = att_elem.get('name', None)
        if name is None:
            # the name is used as key (and later as keyword argument),
            # so a nameless attribute can not be stored
            log("WARNING", "attribute without name: %s"%repr(att_elem))
            continue

        if name in exclude_attributes_of_type:
            # exclude attribute
            continue

        ct = att_elem.get('content-type', None)
        val = att_elem.text

        if ct is None or ct.lower() in text_types:
            # normal text attribute (assume no content_type is text too...)
            if val is None:
                continue

            if val.startswith('{'):
                # looks like JSON: try to parse as date and keep only the year
                year = parseYear(val)
                if year is not None:
                    val = year

            # keep attribute
            attrs[name] = val
            # check for normalized value
            nov = att_elem.findtext('norm')
            if nov is not None:
                # add normalized value
                attrs['_n_'+name] = nov

        elif ct == 'date':
            # date attribute: only kept if a gregorian year can be parsed
            if val is not None:
                year = parseYear(val)
                if year is not None:
                    attrs[name] = year

        elif ct == 'num':
            # number attribute
            if val is not None:
                # keep attribute, assume num is int
                attrs[name] = int(val)

        elif ct != 'old':
            # 'old' attributes are ignored silently, all other content
            # types are reported and ignored
            log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))

    return attrs


def nodeFromEnt(ent_elem):
    """Create a graph node from the given XML entity.

    Adds a node with the id of the entity to nx_graph and returns the
    attribute dict of the new node.

    The node gets the attributes of the entity plus the (fixed)
    object-class as type, the id as 'ismi_id' and, if the entity has an
    own value, that value as '_label' (with its normalized form as
    '_n_label' if present).
    """
    # own value: text content of entity element
    ov = ent_elem.text or ''

    # get attributes element
    atts_elem = ent_elem.find('attributes')

    if atts_elem is None:
        log('DEBUG', "entity has no attributes: %s"%ent_elem)
        attrs = {}

    else:
        # text content of entity element after atts_elem
        ov += atts_elem.tail or ''
        attrs = _attrsFromElem(atts_elem)

    # process base attributes: set type
    attrs[node_type_attribute] = fixName(ent_elem.get('object-class'))

    # rename id to ismi_id
    ismi_id = int(ent_elem.get('id'))
    attrs['ismi_id'] = ismi_id

    if ov:
        # save ov as label
        attrs['_label'] = ov
        # check for normalized value
        nov = ent_elem.findtext('norm')
        if nov is not None:
            # add normalized value
            attrs['_n_label'] = nov

    # create node
    log('DEBUG', "new node(%s, %s)"%(ismi_id, attrs))
    nx_graph.add_node(ismi_id, **attrs)

    try:
        # networkx >= 2.0
        return nx_graph.nodes[ismi_id]
    except TypeError:
        # networkx 1.x: nodes is a method, the dict is called node
        return nx_graph.node[ismi_id]


def relationFromRel(rel_elem):
    """Create graph relation from etree element.

    Adds an edge from the node with the relation's source-id to the node
    with its target-id to nx_graph. The edge gets the attributes of the
    relation plus the (fixed) object-class as type and the id as 'ismi_id'.
    The own value of a relation is not stored.

    Returns the result of nx_graph.add_edge(), or None (after a WARNING
    log message) if the source or target node is not in nx_nodes.
    """
    rel_id = int(rel_elem.get('id'))
    rel_name = rel_elem.get('object-class')
    src_id = int(rel_elem.get('source-id'))
    tar_id = int(rel_elem.get('target-id'))

    if src_id not in nx_nodes:
        log("WARNING", "relation %s src node %s missing!"%(rel_id,src_id))
        return None

    if tar_id not in nx_nodes:
        log("WARNING", "relation %s tar node %s missing!"%(rel_id,tar_id))
        return None

    # get attributes element
    atts_elem = rel_elem.find('attributes')
    attrs = {} if atts_elem is None else _attrsFromElem(atts_elem)

    attrs[rel_type_attribute] = fixName(rel_name)
    attrs['ismi_id'] = rel_id
    #log('DEBUG', "new edge(%s, %s, %s)"%(src_id, tar_id, attrs))

    # create relation with type.
    # Attributes are passed as keywords (like in add_node) because the
    # attr_dict parameter only exists in networkx 1.x.
    # NOTE(review): an attribute named 'key' would be taken as edge key.
    return nx_graph.add_edge(src_id, tar_id, **attrs)
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
273
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
274
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def importEnts(ents_elem):
    """Register a graph node for every entity below ents_elem.

    ents_elem is the etree element whose children are the entity
    elements; its 'number' attribute is only reported in the log.
    Children whose 'object-class' is listed in exclude_objects_of_type
    are ignored. Every other child is converted with nodeFromEnt() and
    the result is stored in the module-level nx_nodes table under the
    integer value of the child's 'id' attribute.

    The import stops at the first id that is already in nx_nodes: an
    error is logged and the function returns without reading the
    remaining children.
    """
    log('INFO', "XML says %s entities" % ents_elem.get('number'))

    # 'seen' counts every child (including skipped ones), starting at 1;
    # it is only needed by the optional debug cut-off below.
    for seen, entity in enumerate(ents_elem, 1):
        if entity.get('object-class') in exclude_objects_of_type:
            # excluded type: do not create a node for this entity
            continue

        ent_id = int(entity.get('id'))
        log('DEBUG', "reading entity[%s]" % ent_id)

        if ent_id in nx_nodes:
            # duplicate id: abort the whole entity import
            log("ERROR", "entity with id=%s exists!" % ent_id)
            return

        # build the networkx node and remember it by its id
        nx_nodes[ent_id] = nodeFromEnt(entity)

        # debug
        #if seen >= 100:
        #    return
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
307
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
308
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def importRels(rels_elem):
    """Import all relations from etree element rels_elem.

    rels_elem is the etree element whose children are the relation
    elements; its 'number' attribute is only reported in the log.
    Every child is converted with relationFromRel() and the result is
    stored in the module-level nx_relations table under the integer
    value of the child's 'id' attribute.

    The import stops at the first id that is already in nx_relations:
    an error is logged and the function returns without reading the
    remaining children.

    NOTE(review): relationFromRel() returns whatever nx_graph.add_edge()
    returns, which may be None depending on the networkx version and
    graph class, so the stored values may only be useful as "seen"
    markers -- confirm before relying on them.
    """
    xml_num = rels_elem.get('number')
    log('INFO', "XML says %s relations" % xml_num)

    # iterate through relations element
    for rel_elem in rels_elem:
        ismi_id = int(rel_elem.get('id'))
        log('DEBUG', "reading relation[%s]" % ismi_id)

        if ismi_id in nx_relations:
            # report through the logger like every other message here
            # (and like importEnts) instead of a bare print()
            log('ERROR', "relation with id=%s exists!" % ismi_id)
            return

        # create networkx relation and save its reference
        nx_relations[ismi_id] = relationFromRel(rel_elem)
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
336
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
337
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def importAll():
    """Parse the XML file input_fn and import entities, then relations.

    The document root's 'entities' child is handed to importEnts() and
    afterwards its 'relations' child is handed to importRels().
    """
    log('INFO', "parsing XML file %s" % input_fn)
    document = ET.parse(input_fn)
    log('DEBUG', "etree ready")
    top = document.getroot()

    # entities are imported before relations; each section is looked
    # up only when its turn comes
    sections = (
        ('entities', importEnts),
        ('relations', importRels),
    )
    for tag, importer in sections:
        importer(top.find(tag))
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
349
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
## main

print("Copy graph from OpenMind-XML to networkx pickle")

# optional command line parameters: [input XML file [output pickle file]]
if len(sys.argv) > 1:
    input_fn = sys.argv[1]

if len(sys.argv) > 2:
    output_fn = sys.argv[2]

# read the whole XML dump into the graph
print("Reading graph from OpenMind-XML file %s" % input_fn)
if exclude_objects_of_type:
    print(" Skipping objects of type %s" % exclude_objects_of_type)

importAll()

print("Graph info: %s" % networkx.info(nx_graph))
#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))

# write the graph as a networkx pickle file
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote networkx pickle file %s" % output_fn)