view importFromOpenMind/importer/filterISMI.py @ 2:e55656794c82

create and use separate export directory.
author root@ismi.rz-berlin.mpg.de
date Tue, 02 Jun 2015 11:09:16 +0200
parents 124ef8f3b22d
children 0ae6145e7c80
line wrap: on
line source

'''
Created on 22.04.2014

@author: dwinter
'''

import os
import json
import urllib.request

class Importer:
    """Load OpenMind JSON documents and export the entities they reference.

    ``self.data`` holds the most recently loaded JSON document (set by
    ``loadJSON`` or ``loadJSONFromFile``).  The remaining methods read its
    ``"ents"`` list, collect the ids referenced by the entities' relations
    and fetch/save the referenced entities.
    """

    # Endpoint queried by loadallEnts; "&ids=<comma separated ids>" is appended.
    BASE_URL = "http://openmind-ismi-dev.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?include_content=true&include_romanization=true&method=get_ents"

    # Number of ids sent per request by loadallEnts.
    CHUNK_SIZE = 500

    @staticmethod
    def _fetchJSON(url):
        """Return the decoded JSON document served at ``url`` (UTF-8)."""
        # HTTPResponse has no readall() on current Python 3; read() returns
        # the whole body.  The context manager closes the connection.
        with urllib.request.urlopen(url) as response:
            return json.loads(response.read().decode('utf-8'))

    def loadJSON(self, url):
        """Fetch the JSON document at ``url`` and store it in ``self.data``."""
        self.data = self._fetchJSON(url)

    def loadJSONFromFile(self, fn):
        """Load ``fn + ".json"`` (UTF-8) and store the result in ``self.data``.

        ``fn`` is the file name WITHOUT the ``.json`` extension.
        """
        # json.load() no longer accepts an ``encoding`` keyword (removed in
        # Python 3.9); the encoding is fixed when the file is opened.
        with open(fn + ".json", 'r', encoding="utf-8") as infile:
            self.data = json.load(infile)

    def getEntIdsMentioned(self, kind="tar", filterOC=None):
        """Collect the ids referenced by the relations of the loaded entities.

        kind     -- "tar": walk each entity's ``tar_rels`` and collect the
                    ``src_id`` values; anything else: walk ``src_rels`` and
                    collect the ``tar_id`` values.
        filterOC -- object classes to skip; a relation whose ``src_oc``
                    (kind "tar") or ``tar_oc`` (otherwise) is contained in
                    it is ignored.  ``None`` means no filtering.

        Returns a tuple ``(ids, rels)``: the set of collected ids as strings
        and the list of the relation dicts that were kept.
        """
        excluded = filterOC if filterOC is not None else ()

        if kind == "tar":
            rel_type, id_type, oc_type = "tar_rels", "src_id", "src_oc"
        else:
            rel_type, id_type, oc_type = "src_rels", "tar_id", "tar_oc"

        ret = set()
        rels = []
        # Entities without the requested relation list (or a document
        # without "ents") contribute nothing instead of raising TypeError.
        for ent in self.data.get("ents") or []:
            for rel in ent.get(rel_type) or []:
                if rel.get(oc_type) not in excluded:
                    ret.add(str(rel.get(id_type)))
                    rels.append(rel)

        return ret, rels

    def loadallEnts(self, kind="tar", filterOC=None):
        """Fetch every entity referenced by the loaded entities' relations.

        The ids returned by ``getEntIdsMentioned`` are requested from
        ``BASE_URL`` in portions of ``CHUNK_SIZE`` ids.  Each request URL is
        printed before it is sent.

        Returns a tuple ``(json_text, rels)``: the fetched entities
        serialised as ``{"ents": [...]}`` and the relation dicts returned by
        ``getEntIdsMentioned``.
        """
        ids, rels = self.getEntIdsMentioned(kind=kind, filterOC=filterOC)

        # Materialise the set once; sorting makes the request order (and so
        # the order of the exported entities) reproducible between runs.
        id_list = sorted(ids)

        ents = []
        # Stepping over the list never yields an empty portion, so no
        # request with an empty "ids=" is sent when there are no ids or
        # their number is an exact multiple of CHUNK_SIZE.
        for start in range(0, len(id_list), self.CHUNK_SIZE):
            portion = id_list[start:start + self.CHUNK_SIZE]
            qs = self.BASE_URL + "&ids=" + ",".join(portion)
            print(qs)
            ents += self._fetchJSON(qs).get("ents") or []

        return json.dumps({"ents": ents}), rels

    def saveallEnts(self, filename, kind="tar", filterOC=None):
        """Fetch the referenced entities and write them to disk.

        Writes ``filename + ".json"`` (the entities, UTF-8) and
        ``filename + "_rels.json"`` (``{"rels": [...]}`` with the relations
        that were followed).  ``kind`` and ``filterOC`` are passed on to
        ``loadallEnts``.
        """
        ents, rels = self.loadallEnts(kind=kind, filterOC=filterOC)

        with open(filename + ".json", "wb") as of:
            of.write(ents.encode('utf-8'))

        with open(filename + "_rels.json", "w", encoding="utf-8") as of:
            json.dump({'rels': rels}, of)
        
        
    
if __name__ == '__main__':
    # Export pipeline: starting from the public codices, repeatedly follow
    # the relations of the entities loaded last and save the referenced
    # entities (plus the followed relations) into the export directory.
    imp = Importer()

    url = """http://openmind-ismi-dev.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?method=get_public_codices"""

    imp.loadJSON(url)

    exportDir = '/tmp/ismi_data'
    # exist_ok=True creates the directory only when it is missing; the
    # previous os.access(..., os.R_OK) test made makedirs() fail with
    # FileExistsError for an existing but unreadable directory.
    os.makedirs(exportDir, exist_ok=True)

    # Entities referenced by the codices' tar_rels / src_rels.
    imp.saveallEnts(exportDir+"/witnesses",kind="tar")

    imp.saveallEnts(exportDir+"/codex_src",kind="src",filterOC=['CODEX','WITNESS'])

    # now fetch all relations on the witnesses
    imp.loadJSONFromFile(exportDir+"/witnesses")

    imp.saveallEnts(exportDir+"/texts",kind="src",filterOC=['CODEX','WITNESS','PERSON'])

    # Continue from the exported texts.
    imp.loadJSONFromFile(exportDir+"/texts")

    imp.saveallEnts(exportDir+"/authors_subjects_src",kind="src",filterOC=['CODEX','WITNESS','TEXT'])

    imp.saveallEnts(exportDir+"/authors_subjects_tar",kind="tar",filterOC=['CODEX','WITNESS','TEXT'])

    # Continue from the entities reached through the texts' src_rels.
    imp.loadJSONFromFile(exportDir+"/authors_subjects_src")

    imp.saveallEnts(exportDir+"/subjects_places",kind="src",filterOC=['CODEX','WITNESS','TEXT','PERSON'])
    imp.saveallEnts(exportDir+"/references_places",kind="tar",filterOC=['CODEX','WITNESS','TEXT','PERSON'])