annotate importFromOpenMind/importer/ismixml2model.py @ 46:f3945ef1e6a4

new importer for OM4XML dump file.
author casties
date Fri, 03 Feb 2017 18:46:16 +0100
parents
children 378dcb66a27f
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
46
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
1 import xml.etree.ElementTree as ET
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
2 import json
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
3 import networkx
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
4 import sys
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
5
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
## configure behaviour

# output filename
output_fn = "ismi_graph.gpickle"

# input filename
input_fn = "openmind-data.xml"


# node types to exclude from the graph
exclude_objects_of_type = ['DIGITALIZATION', 'REFERENCE']

# attributes to exclude (matched against the "name" of an attribute element)
exclude_attributes_of_type = [
    'lw',
    'node_type',
    'nov',
    'notes_old'
]

# name of the attribute that stores the type on nodes and on relations
node_type_attribute = '_type'
rel_type_attribute = '_type'

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
30
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
31
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
# graph that receives every imported node and edge
nx_graph = networkx.MultiDiGraph()

# node attribute dicts keyed by ismi_id (filled by importEnts)
nx_nodes = {}
# NOTE(review): not used in the part of the file visible here -- confirm it is still needed
ismi_relations = {}
# return values of add_edge keyed by the relation's ismi_id (filled by importRels)
nx_relations = {}
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
37
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
# active log levels for logging
#logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
logLevels = {'INFO', 'ERROR', 'SYSMSG'}


def log(level, message):
    """Print message prefixed with its level if the level is active.

    Messages whose level is not a member of logLevels are dropped silently.
    """
    if level in logLevels:
        print("%s: %s"%(level, message))
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
46
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
47
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Return the name to use in the graph for a type or relation name.

    name: original name.
    is_src_rel: accepted for symmetry with is_tar_rel; the name is left
        unchanged (the '>' suffix was disabled).
    is_tar_rel: prefix the name with '<'.
    att_from_rel: strip the fragments 'is_', 'has_', 'was_' and '_of' so
        that a relation name reads well as an attribute name. Every
        occurrence is removed, not only a leading or trailing one.
    """
    if is_tar_rel:
        name = '<' + name

    if att_from_rel:
        # clean up relations as attribute names; order matters only in
        # that each fragment is removed from the result of the previous one
        for fragment in ('is_', 'has_', 'was_', '_of'):
            name = name.replace(fragment, '')

    return name
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
64
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
65
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
66
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def parseYear(val):
    """Extract the year from a JSON-encoded date value.

    val is expected to be a JSON object holding either a 'from' or a
    'date' member that itself has a 'year' member; 'from' wins when both
    are present.

    Returns the year as found in the JSON, or None when val is not valid
    JSON, does not have the expected structure or carries no year.
    """
    year = None
    try:
        date_json = json.loads(val)
        if 'from' in date_json:
            year = date_json['from'].get('year', None)
        elif 'date' in date_json:
            year = date_json['date'].get('year', None)
        else:
            log("WARNING", "don't know what to do with date %s"%(val))

    except (ValueError, TypeError, AttributeError):
        # Parsing is best effort: callers also pass plain text that merely
        # looks like JSON. ValueError = not JSON, TypeError = val or the
        # decoded value has the wrong type, AttributeError = member is not
        # an object. Anything else is a real error and must propagate.
        pass

    return year
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
82
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
83
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def nodeFromEnt(ent_elem):
    """Create a graph node from the given XML entity.

    Reads the children of the entity's 'attributes' element into node
    attributes, adds the entity's object-class (as node_type_attribute),
    its id (as 'ismi_id') and its text content (as 'label'), then adds the
    node to nx_graph under the entity id.

    Attribute handling by content-type:
      - none, text, arabic, bool, boolean, url, language (case-insensitive):
        stored as text; text starting with '{' is replaced by its year if
        parseYear can extract one.
      - date: stored only if parseYear returns a year.
      - num: stored as int (raises ValueError if the text is not an int).
      - old and anything else: skipped.

    Returns the attribute dict of the new node in nx_graph.
    """
    # text content of entity element
    ov = ent_elem.text or ''

    attrs = {}

    # get attributes element
    atts_elem = ent_elem.find('attributes')

    if atts_elem is None:
        log('DEBUG', "entity has no attributes: %s"%ent_elem)

    else:
        # go through all attributes
        for att_elem in atts_elem:
            if att_elem.tail is not None:
                # tail belongs to parent
                ov += att_elem.tail

            ct = att_elem.get('content-type', None)
            name = att_elem.get('name', None)
            if name in exclude_attributes_of_type:
                # exclude attribute
                continue

            if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
                # normal text attribute (assume no content_type is text too...)
                val = att_elem.text

                # startswith is safe for empty text, unlike val[0]
                if val is not None and val.startswith('{'):
                    # try to parse as date
                    year = parseYear(val)
                    if year is not None:
                        val = year

                # keep attribute (also when it has no text, i.e. val is None)
                attrs[name] = val

            elif ct == 'date':
                # date attribute
                val = att_elem.text
                if val is not None:
                    # try to parse date object to get gregorian year
                    year = parseYear(val)
                    if year is not None:
                        attrs[name] = year

            elif ct == 'num':
                # number attribute
                val = att_elem.text
                if val is not None:
                    # keep attribute, assume num is int
                    attrs[name] = int(val)

            elif ct == 'old':
                # ignore attribute
                continue

            else:
                # level must be spelled 'WARNING' to match logLevels
                log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))
                # ignore other content types
                continue

    # process base attributes
    oc = ent_elem.get('object-class')

    # set type
    attrs[node_type_attribute] = fixName(oc)

    ismi_id = ent_elem.get('id')
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id

    if len(ov) > 0:
        # save ov as label
        attrs['label'] = ov

    # create node
    nx_graph.add_node(ismi_id, **attrs)
    # NOTE(review): Graph.node is the networkx 1.x accessor; newer releases
    # spell it Graph.nodes -- confirm against the installed version.
    node = nx_graph.node[ismi_id]

    return node
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
177
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
178
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def relationFromRel(rel_elem):
    """Create a graph edge from the given XML relation element.

    The edge runs from the relation's source-id to its target-id in
    nx_graph. Edge attributes are the children of the relation's
    'attributes' element (handled by content-type as described below), the
    relation's object-class (as rel_type_attribute) and its id (as
    'ismi_id'). The relation's own text value is deliberately not stored.

    Attribute handling by content-type:
      - none, text, arabic, bool, boolean, url, language (case-insensitive):
        stored as text; text starting with '{' is replaced by its year if
        parseYear can extract one.
      - date: stored only if parseYear returns a year.
      - num: stored as int (raises ValueError if the text is not an int).
      - old and anything else: skipped.

    Returns None (after logging an ERROR) when the source or target node
    is not in nx_nodes; otherwise returns whatever nx_graph.add_edge
    returns.
    """
    rel_id = rel_elem.get('id')
    rel_name = rel_elem.get('object-class')
    src_id = rel_elem.get('source-id')
    tar_id = rel_elem.get('target-id')
    if src_id not in nx_nodes:
        log("ERROR", "relation %s src node %s missing!"%(rel_id,src_id))
        return None

    if tar_id not in nx_nodes:
        log("ERROR", "relation %s tar node %s missing!"%(rel_id,tar_id))
        return None

    attrs = {}

    # get attributes element
    atts_elem = rel_elem.find('attributes')

    if atts_elem is not None:
        # go through all attributes
        for att_elem in atts_elem:
            ct = att_elem.get('content-type', None)
            name = att_elem.get('name', None)
            if name in exclude_attributes_of_type:
                # exclude attribute
                continue

            if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
                # normal text attribute (assume no content_type is text too...)
                val = att_elem.text

                # startswith is safe for empty text, unlike val[0]
                if val is not None and val.startswith('{'):
                    # try to parse as date
                    year = parseYear(val)
                    if year is not None:
                        val = year

                # keep attribute (also when it has no text, i.e. val is None)
                attrs[name] = val

            elif ct == 'date':
                # date attribute
                val = att_elem.text
                if val is not None:
                    # try to parse date object to get gregorian year
                    year = parseYear(val)
                    if year is not None:
                        attrs[name] = year

            elif ct == 'num':
                # number attribute
                val = att_elem.text
                if val is not None:
                    # keep attribute, assume num is int
                    attrs[name] = int(val)

            elif ct == 'old':
                # ignore attribute
                continue

            else:
                # level must be spelled 'WARNING' to match logLevels
                log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))
                # ignore other content types
                continue

    attrs[rel_type_attribute] = fixName(rel_name)
    attrs['ismi_id'] = rel_id
    log('DEBUG', "new edge(%s, %s, %s)"%(src_id, tar_id, attrs))
    # create relation with type
    # NOTE(review): attr_dict= is the networkx 1.x calling convention --
    # confirm against the installed version before upgrading networkx.
    nx_rel = nx_graph.add_edge(src_id, tar_id, attr_dict=attrs)

    return nx_rel
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
266
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
267
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def importEnts(ents_elem):
    """Import all entities below the etree element ents_elem.

    Logs the entity count announced by the element's 'number' attribute,
    then creates one graph node per child element via nodeFromEnt and
    records it in nx_nodes under the entity id.

    The import stops at the first entity whose id is already in nx_nodes:
    an ERROR is logged and the remaining entities are not read.
    """
    xml_num = ents_elem.get('number')
    log('INFO', "XML says %s entities"%xml_num)

    # iterate through entities element
    for ent_elem in ents_elem:
        ismi_id = ent_elem.get('id')
        log('DEBUG', "reading entity[%s]"%ismi_id)

        if ismi_id in nx_nodes:
            log("ERROR", "entity with id=%s exists!"%ismi_id)
            return

        # create networkx node and save node reference
        nx_nodes[ismi_id] = nodeFromEnt(ent_elem)
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
294
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
295
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def importRels(rels_elem):
    """Import all relations below the etree element rels_elem.

    Logs the relation count announced by the element's 'number' attribute,
    then creates one graph edge per child element via relationFromRel and
    records its return value in nx_relations under the relation id. That
    value is None when relationFromRel rejected the relation.

    The import stops at the first relation whose id is already in
    nx_relations: an ERROR is logged and the remaining relations are not
    read.
    """
    xml_num = rels_elem.get('number')
    log('INFO', "XML says %s relations"%xml_num)

    # iterate through relations element
    for rel_elem in rels_elem:
        ismi_id = rel_elem.get('id')
        log('DEBUG', "reading relation[%s]"%ismi_id)

        if ismi_id in nx_relations:
            # report through log() like importEnts does, so the message
            # obeys logLevels; the printed text is unchanged
            log("ERROR", "relation with id=%s exists!"%ismi_id)
            return

        # create networkx relation and save relation reference
        nx_relations[ismi_id] = relationFromRel(rel_elem)
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
322
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
323
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
def importAll():
    """Parse the XML file named by input_fn and import its contents.

    The 'entities' child of the document root is handed to importEnts()
    first, then the 'relations' child is handed to importRels().
    """
    log('INFO', "parsing XML file %s"%input_fn)
    root = ET.parse(input_fn).getroot()
    log('DEBUG', "etree ready")

    # entities are imported before relations
    for tag, importer in (('entities', importEnts), ('relations', importRels)):
        importer(root.find(tag))
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
335
f3945ef1e6a4 new importer for OM4XML dump file.
casties
parents:
diff changeset
## main

print("Copy graph from OpenMind-XML to networkx pickle")

# optional command line parameters: [input file [output file]]
if len(sys.argv) >= 2:
    input_fn = sys.argv[1]

if len(sys.argv) >= 3:
    output_fn = sys.argv[2]

# import everything
print("Reading graph from OpenMind-XML file %s"%input_fn)
if exclude_objects_of_type:
    print(" Skipping objects of type %s"%exclude_objects_of_type)

importAll()

print("Graph info: %s"%networkx.info(nx_graph))

# export the graph as a networkx pickle file
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote networkx pickle file %s"%output_fn)