Mercurial > hg > drupalISMI
view importFromOpenMind/importer/filterISMI.py @ 10:2a786f0d46a7
more comments in the code.
author | casties |
---|---|
date | Fri, 26 Jun 2015 10:59:53 +0200 |
parents | 0ae6145e7c80 |
children |
line wrap: on
line source
'''
Created on 22.04.2014

@author: dwinter

Downloads entities and their relations from the OpenMind JSON interface
and saves them as JSON files.
'''
import os
import json
import urllib.request


class Importer:
    """Loads entity JSON and collects/downloads the entities it relates to.

    The most recently loaded JSON document is kept in the ``data`` member;
    the other methods read the list stored under its ``"ents"`` key.
    """

    # Base URL of the get_ents method; "&ids=<comma separated ids>" is appended.
    ENTS_URL = "https://ismi.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?include_content=true&include_romanization=true&method=get_ents"

    # Maximum number of ids sent in a single get_ents request.
    BATCH_SIZE = 500

    def _fetchJSON(self, url):
        """Download url and return the parsed JSON document."""
        # HTTPResponse has no readall() on Python >= 3.5; read() returns
        # the complete body. The context manager closes the connection.
        with urllib.request.urlopen(url) as response:
            return json.loads(response.read().decode('utf-8'))

    def loadJSON(self, url):
        """Load JSON from URL.

        Saves JSON in data member.
        """
        print(" loading "+url)
        self.data = self._fetchJSON(url)

    def loadJSONFromFile(self, fn):
        """Load JSON from file fn + ".json".

        Saves JSON in data member.
        """
        print(" loading "+fn+".json")
        # The encoding is applied when opening the file; json.load() no
        # longer accepts an "encoding" keyword (removed in Python 3.9).
        with open(fn+".json", 'r', encoding="utf-8") as infile:
            self.data = json.load(infile)

    def getEntIdsMentioned(self, kind="tar", filterOC=()):
        """Extract related entities from data member.

        Checks relations of direction kind: for kind "tar" the "tar_rels"
        of every entity are read and the ids/types of their source side
        ("src_id"/"src_oc") are used; for any other kind the "src_rels"
        are read and "tar_id"/"tar_oc" are used.
        Skips relations whose object type is in filterOC.

        Returns a set of ids (as strings) of related objects and a list
        of the relations.
        """
        if kind == "tar":
            rel_type, id_type, oc_type = "tar_rels", "src_id", "src_oc"
        else:
            rel_type, id_type, oc_type = "src_rels", "tar_id", "tar_oc"

        ids = set()
        rels = []
        # "or ()" treats a missing or null entity/relation list as empty
        # instead of failing with a TypeError.
        for ent in self.data.get("ents") or ():
            for rel in ent.get(rel_type) or ():
                if rel.get(oc_type) not in filterOC:
                    ids.add(str(rel.get(id_type)))
                    rels.append(rel)

        return ids, rels

    def loadallEnts(self, kind="tar", filterOC=()):
        """Get related entities from OpenMind.

        Gets all related entities' ids using kind and filterOC via
        getEntIdsMentioned(). Downloads the entities from OpenMind using
        the ids, at most BATCH_SIZE ids per request. No request is made
        when there are no ids.

        Returns the entities as JSON-string and a list of relations.
        """
        ids, rels = self.getEntIdsMentioned(kind=kind, filterOC=filterOC)
        # Sort once so batches (and therefore request URLs) are deterministic.
        id_list = sorted(ids)

        ents = []
        # Stepping by BATCH_SIZE never produces an empty trailing batch,
        # also when the number of ids is an exact multiple of BATCH_SIZE.
        for start in range(0, len(id_list), self.BATCH_SIZE):
            batch = id_list[start:start + self.BATCH_SIZE]
            qs = self.ENTS_URL+"&ids="+",".join(batch)
            print(" loading ents from "+qs)
            ents += self._fetchJSON(qs).get("ents") or []

        return json.dumps({"ents": ents}), rels

    def saveallEnts(self, filename, kind="tar", filterOC=()):
        """Loads all related entities and saves as JSON.

        Loads all related entities using kind and filterOC via
        loadallEnts().
        Saves entities in file filename.json.
        Saves relations in file filename_rels.json.
        """
        ents, rels = self.loadallEnts(kind=kind, filterOC=filterOC)

        print(" writing ", filename+".json")
        with open(filename+".json", "wb") as of:
            of.write(ents.encode('utf-8'))

        print(" writing ", filename+"_rels.json")
        with open(filename+"_rels.json", "w", encoding="utf-8") as of:
            json.dump({'rels': rels}, of)


def main():
    """Export the public codices and their related entities as JSON files."""
    imp = Importer()

    # url = """http://openmind-ismi-dev.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?method=get_ents&ids=27543,36745,58453,87298,259646,35093,22863,34870,36882,101488,36696,31794,37240,35014,35583,37025,35960,172492,98286,165721,260111,90980,36316,260120,36241,260129,260138,38860,176694,72545,36185,36575,260146,31672,37739,89861,176778,180743,86328,260150,90658,58423,181058,105948,35526,74078,260158,181096,31606,31568,27872,36938,4836,34668,76866,102230,76888,74070,73757,182685,260162,260170,1102,172888,260174,34806,28088,36713,37323,34551,35943,98095,260178,260182,182770,260186,260190,260194,36114,85003,31630,157290,37153,37213,172952,86871,64406,102590,82615,58245,179791,179550,12419,95861,36429,36099,74237,36065,74822,87549,83765,36733,19259,260198,34986,88041,260202,36550,260206,37228,39880,36318,36597,35035,58328,80831,58354,74277,36529,36380,69450,200246,260222,81178,260226,199952,262557,87212,99059,64270,81811,65785,36645
    # """

    #
    # load all public codices
    # contains codices with attributes and first-order relations
    #
    url = """https://ismi.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?method=get_public_codices"""
    imp.loadJSON(url)

    # create directory for export files (no error if it already exists)
    exportDir = '/tmp/ismi_data'
    os.makedirs(exportDir, exist_ok=True)

    #
    # load and save all target relations of codices as witnesses.json
    #
    imp.saveallEnts(exportDir+"/witnesses", kind="tar")

    #
    # load and save all source relations of codices except type codex and witness as codex_src.json
    #
    imp.saveallEnts(exportDir+"/codex_src", kind="src", filterOC=['CODEX', 'WITNESS'])

    # now fetch all relations of the witnesses
    #
    # load the witnesses.json file from above
    #
    imp.loadJSONFromFile(exportDir+"/witnesses")

    #
    # load and save all source relations except type codex, witness, person as texts.json
    #
    imp.saveallEnts(exportDir+"/texts", kind="src", filterOC=['CODEX', 'WITNESS', 'PERSON'])

    #
    # load the texts.json file from above
    #
    imp.loadJSONFromFile(exportDir+"/texts")

    #
    # load and save all source relations except type codex, witness and text as authors_subjects_src.json
    #
    imp.saveallEnts(exportDir+"/authors_subjects_src", kind="src", filterOC=['CODEX', 'WITNESS', 'TEXT'])

    #
    # load and save all target relations except type codex, witness and text as authors_subjects_tar.json
    #
    imp.saveallEnts(exportDir+"/authors_subjects_tar", kind="tar", filterOC=['CODEX', 'WITNESS', 'TEXT'])

    #
    # load the authors_subjects_src.json file from above
    #
    imp.loadJSONFromFile(exportDir+"/authors_subjects_src")

    #
    # load and save all source relations except type codex, witness, text and person as subjects_places.json
    #
    imp.saveallEnts(exportDir+"/subjects_places", kind="src", filterOC=['CODEX', 'WITNESS', 'TEXT', 'PERSON'])

    #
    # load and save all target relations except type codex, witness, text and person as references_places.json
    #
    imp.saveallEnts(exportDir+"/references_places", kind="tar", filterOC=['CODEX', 'WITNESS', 'TEXT', 'PERSON'])


if __name__ == '__main__':
    main()