Mercurial > hg > drupalISMI
view importFromOpenMind/importer/filterISMI.py @ 4:0ae6145e7c80
update urls.
author | root@ismi.rz-berlin.mpg.de |
---|---|
date | Tue, 02 Jun 2015 11:59:43 +0200 |
parents | e55656794c82 |
children | 2a786f0d46a7 |
line wrap: on
line source
'''
Created on 22.04.2014

@author: dwinter

Export helper for the ISMI OpenMind JSON interface.

Starting from an already loaded JSON document of the form
``{"ents": [...]}``, the Importer collects the ids referenced by the
relations of those entities, downloads the referenced entities in batches
and writes them (plus the relations that led to them) to JSON files.
'''
import json
import os
import urllib.request


class Importer:
    """Load entity JSON and export the entities its relations point to.

    The currently loaded document is kept in ``self.data``; it is set by
    ``loadJSON`` or ``loadJSONFromFile`` and read by the other methods.
    """

    # Endpoint used to download full entities for a list of ids.
    ENTS_URL = "https://ismi.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?include_content=true&include_romanization=true&method=get_ents"

    # Maximum number of ids sent in a single request.
    BATCH_SIZE = 500

    def loadJSON(self, url):
        """Fetch ``url``, decode the body as UTF-8 JSON and store it in ``self.data``."""
        # read() instead of readall(): HTTPResponse has no readall() on
        # Python >= 3.5, so the old call raised AttributeError.
        with urllib.request.urlopen(url) as response:
            str_response = response.read().decode('utf-8')
        self.data = json.loads(str_response)

    def loadJSONFromFile(self, fn):
        """Load ``fn + ".json"`` (UTF-8) and store the parsed JSON in ``self.data``."""
        # json.load() lost its ``encoding`` keyword in Python 3.9; the file
        # object already decodes UTF-8, so the keyword was redundant anyway.
        with open(fn + ".json", 'r', encoding="utf-8") as infile:
            self.data = json.load(infile)

    def getEntIdsMentioned(self, kind="tar", filterOC=()):
        """Collect the ids referenced by the relations of the loaded entities.

        For ``kind == "tar"`` each entity's ``tar_rels`` are inspected and the
        ``src_id`` of every relation is collected; for any other ``kind`` the
        ``src_rels`` are inspected and the ``tar_id`` is collected.  Relations
        whose ``src_oc`` / ``tar_oc`` value is contained in ``filterOC`` are
        skipped.

        Returns a tuple ``(ids, rels)``: ``ids`` is a set of the collected ids
        converted to ``str`` and ``rels`` is the list of relations that were
        not filtered out.
        """
        if kind == "tar":
            rel_type, id_type, oc_type = "tar_rels", "src_id", "src_oc"
        else:
            rel_type, id_type, oc_type = "src_rels", "tar_id", "tar_oc"

        ret = set()
        rels = []
        # A missing "ents" list or a missing relation list counts as empty
        # instead of failing with "NoneType is not iterable".
        for ent in self.data.get("ents") or []:
            for rel in ent.get(rel_type) or ():
                if rel.get(oc_type) in filterOC:
                    continue
                ret.add(str(rel.get(id_type)))
                rels.append(rel)
        return ret, rels

    def loadallEnts(self, kind="tar", filterOC=()):
        """Download every entity referenced by the loaded entities' relations.

        The ids from ``getEntIdsMentioned`` are requested from ``ENTS_URL`` in
        batches of at most ``BATCH_SIZE`` ids.  Each request URL is printed
        before it is sent.

        Returns a tuple ``(json_string, rels)`` where ``json_string`` is the
        serialized document ``{"ents": [...]}`` holding all downloaded
        entities and ``rels`` is the relation list from ``getEntIdsMentioned``.
        """
        ids, rels = self.getEntIdsMentioned(kind=kind, filterOC=filterOC)
        # Sort once so batches are reproducible between runs; set order is not.
        ordered_ids = sorted(ids)

        ents = []
        # Stepping by BATCH_SIZE never yields an empty batch, so no request
        # with an empty "ids=" parameter is sent when there are no ids or
        # their count is an exact multiple of BATCH_SIZE.
        for start in range(0, len(ordered_ids), self.BATCH_SIZE):
            batch = ordered_ids[start:start + self.BATCH_SIZE]
            qs = self.ENTS_URL + "&ids=" + ",".join(batch)
            print(qs)
            with urllib.request.urlopen(qs) as response:
                entsJ = json.loads(response.read().decode('utf-8'))
            ents += entsJ.get("ents") or []

        return json.dumps({"ents": ents}), rels

    def saveallEnts(self, filename, kind="tar", filterOC=()):
        """Write the result of ``loadallEnts`` to disk.

        The entities are written UTF-8 encoded to ``filename + ".json"`` and
        the relations to ``filename + "_rels.json"`` as ``{"rels": [...]}``.
        """
        ents, rels = self.loadallEnts(kind=kind, filterOC=filterOC)
        with open(filename + ".json", "wb") as of:
            of.write(ents.encode('utf-8'))
        with open(filename + "_rels.json", "w", encoding="utf-8") as of:
            json.dump({'rels': rels}, of)


def main():
    """Export witnesses, codices, texts, authors/subjects and places to /tmp/ismi_data."""
    imp = Importer()

    # Former development endpoint, kept for reference:
    # url = """http://openmind-ismi-dev.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?method=get_ents&ids=27543,36745,58453,87298,259646,35093,22863,34870,36882,101488,36696,31794,37240,35014,35583,37025,35960,172492,98286,165721,260111,90980,36316,260120,36241,260129,260138,38860,176694,72545,36185,36575,260146,31672,37739,89861,176778,180743,86328,260150,90658,58423,181058,105948,35526,74078,260158,181096,31606,31568,27872,36938,4836,34668,76866,102230,76888,74070,73757,182685,260162,260170,1102,172888,260174,34806,28088,36713,37323,34551,35943,98095,260178,260182,182770,260186,260190,260194,36114,85003,31630,157290,37153,37213,172952,86871,64406,102590,82615,58245,179791,179550,12419,95861,36429,36099,74237,36065,74822,87549,83765,36733,19259,260198,34986,88041,260202,36550,260206,37228,39880,36318,36597,35035,58328,80831,58354,74277,36529,36380,69450,200246,260222,81178,260226,199952,262557,87212,99059,64270,81811,65785,36645"""
    url = """https://ismi.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?method=get_public_codices"""
    imp.loadJSON(url)

    exportDir = '/tmp/ismi_data'
    # Create the export directory if it does not exist yet.
    os.makedirs(exportDir, exist_ok=True)

    imp.saveallEnts(exportDir + "/witnesses", kind="tar")
    imp.saveallEnts(exportDir + "/codex_src", kind="src", filterOC=['CODEX', 'WITNESS'])

    # Now fetch all relations attached to the witnesses.
    imp.loadJSONFromFile(exportDir + "/witnesses")
    imp.saveallEnts(exportDir + "/texts", kind="src", filterOC=['CODEX', 'WITNESS', 'PERSON'])

    imp.loadJSONFromFile(exportDir + "/texts")
    imp.saveallEnts(exportDir + "/authors_subjects_src", kind="src", filterOC=['CODEX', 'WITNESS', 'TEXT'])
    imp.saveallEnts(exportDir + "/authors_subjects_tar", kind="tar", filterOC=['CODEX', 'WITNESS', 'TEXT'])

    imp.loadJSONFromFile(exportDir + "/authors_subjects_src")
    imp.saveallEnts(exportDir + "/subjects_places", kind="src", filterOC=['CODEX', 'WITNESS', 'TEXT', 'PERSON'])
    imp.saveallEnts(exportDir + "/references_places", kind="tar", filterOC=['CODEX', 'WITNESS', 'TEXT', 'PERSON'])


if __name__ == '__main__':
    main()