# HG changeset patch # User casties # Date 1435309193 -7200 # Node ID 2a786f0d46a76ccbfae01f3cc6c006020de28d8d # Parent cf772424f725863f086f97421473021dc3cd31e9 more comments in the code. diff -r cf772424f725 -r 2a786f0d46a7 importFromOpenMind/importer/filterISMI.py --- a/importFromOpenMind/importer/filterISMI.py Mon Jun 22 19:02:34 2015 +0200 +++ b/importFromOpenMind/importer/filterISMI.py Fri Jun 26 10:59:53 2015 +0200 @@ -11,8 +11,11 @@ class Importer: def loadJSON(self,url): - - + """Load JSON from URL. + + Saves JSON in data member. + """ + print(" loading "+url) response = urllib.request.urlopen(url) str_response = response.readall().decode('utf-8') @@ -20,18 +23,24 @@ def loadJSONFromFile(self,fn): + """Load JSON from file. - + Saves JSON in data member. + """ + print(" loading "+fn+".json") self.data = json.load(open(fn+".json",'r', encoding="utf-8"),encoding="utf-8") def getEntIdsMentioned(self,kind="tar",filterOC=[]): - """ holt alle Id entweder als src_id """ + """Extract related entities from data member. + + Checks relations of direction kind. + Skips objects of type filterOC. + Returns a set of ids of related objects and a list of the relations. + """ ents = self.data.get("ents") - - ret=set() rels=[] if kind=="tar": @@ -46,29 +55,27 @@ for ent in ents: tar_rels = ent.get(rel_type) - - for tar_rel in tar_rels: - if not tar_rel.get(oc_type) in filterOC: - ret.add(str(tar_rel.get(id_type))) - rels.append(tar_rel) - - return ret,rels def loadallEnts(self,kind="tar",filterOC=[]): + """Get related entities from OpenMind. + + Gets all related entities' ids using kind and filterOC via getEntIdsMentioned(). + Downloads the entities from OpenMind using the ids. + Returns the entities as JSON-string and a list of relations. 
+ """ ids,rels = self.getEntIdsMentioned(kind=kind,filterOC=filterOC) - baseUrl="https://ismi.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?include_content=true&include_romanization=true&method=get_ents" lenId = len(ids) @@ -78,7 +85,6 @@ ents = [] for p in range(portions+1): - start = p * 500 end = min(lenId,(p+1)*500) @@ -87,28 +93,36 @@ qs = baseUrl+"&ids="+idsString - print (qs) + print(" loading ents from "+qs) response = urllib.request.urlopen(qs) entsJ = json.loads(response.readall().decode('utf-8')); ents += entsJ.get("ents") #str_response += response.readall().decode('utf-8') - str_response = json.dumps({"ents":ents}); return str_response,rels + def saveallEnts(self,filename,kind="tar",filterOC=[]): + """Loads all related entities and saves as JSON. + + Loads all related entities using kind and filterOC via loadallEnts(). + Saves entities in file filename.json. + Saves relations in file filename_rels.json. + """ ents,rels = self.loadallEnts(kind=kind,filterOC=filterOC) + + print(" writing ", filename+".json") of = open(filename+".json","wb") of.write(ents.encode('utf-8')) of.close() + print(" writing ", filename+"_rels.json") of = open(filename+"_rels.json","w") json.dump({'rels':rels},of); of.close() - if __name__ == '__main__': imp = Importer() @@ -116,45 +130,70 @@ # url = 
"""http://openmind-ismi-dev.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?method=get_ents&ids=27543,36745,58453,87298,259646,35093,22863,34870,36882,101488,36696,31794,37240,35014,35583,37025,35960,172492,98286,165721,260111,90980,36316,260120,36241,260129,260138,38860,176694,72545,36185,36575,260146,31672,37739,89861,176778,180743,86328,260150,90658,58423,181058,105948,35526,74078,260158,181096,31606,31568,27872,36938,4836,34668,76866,102230,76888,74070,73757,182685,260162,260170,1102,172888,260174,34806,28088,36713,37323,34551,35943,98095,260178,260182,182770,260186,260190,260194,36114,85003,31630,157290,37153,37213,172952,86871,64406,102590,82615,58245,179791,179550,12419,95861,36429,36099,74237,36065,74822,87549,83765,36733,19259,260198,34986,88041,260202,36550,260206,37228,39880,36318,36597,35035,58328,80831,58354,74277,36529,36380,69450,200246,260222,81178,260226,199952,262557,87212,99059,64270,81811,65785,36645 # """ # - + # + # load all public codices + # contains codices with attributes and first-order relations + # url = """https://ismi.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?method=get_public_codices""" - imp.loadJSON(url) - #ids= imp.getEntIdsMentioned() - - - #loadall = imp.loadallEnts() - #print(loadall.encode('utf-8')) - + # create directory for export files exportDir = '/tmp/ismi_data' if not os.access(exportDir, os.R_OK): # dir doesn't exist -> create os.makedirs(exportDir) + # + # load and save all target relations of codices as witnesses.json + # imp.saveallEnts(exportDir+"/witnesses",kind="tar") + # + # load and save all source relations of codices except type codex and witness as codex_src.json + # imp.saveallEnts(exportDir+"/codex_src",kind="src",filterOC=['CODEX','WITNESS']) #hole jetzt alle relationen an den witnessen + # + # load the witnesses.json file from above + # imp.loadJSONFromFile(exportDir+"/witnesses") - #ids= imp.getEntIdsMentioned(kind="src") - + # + # load and save all source relations except type codex, witness, person as 
texts.json + # imp.saveallEnts(exportDir+"/texts",kind="src",filterOC=['CODEX','WITNESS','PERSON']) + # + # load the texts.json file from above + # imp.loadJSONFromFile(exportDir+"/texts") + # + # load and save all source relations except type codex, witness and text as authors_subjects_src.json + # imp.saveallEnts(exportDir+"/authors_subjects_src",kind="src",filterOC=['CODEX','WITNESS','TEXT']) + # + # load and save all target relations except type codex, witness and text as authors_subjects_tar.json + # imp.saveallEnts(exportDir+"/authors_subjects_tar",kind="tar",filterOC=['CODEX','WITNESS','TEXT']) - + # + # load the authors_subjects_src.json file from above + # imp.loadJSONFromFile(exportDir+"/authors_subjects_src") + # + # load and save all source relations except type codex, witness, text and person as subjects_places.json + # imp.saveallEnts(exportDir+"/subjects_places",kind="src",filterOC=['CODEX','WITNESS','TEXT','PERSON']) + + # + # load and save all target relations except type codex, witness, text and person as references_places.json + # imp.saveallEnts(exportDir+"/references_places",kind="tar",filterOC=['CODEX','WITNESS','TEXT','PERSON'])