view importFromOpenMind/importer/unfilteredISMI.py @ 19:ca1e02a2a9c4

unfilteredIsmi: openmind to json exporter like filterISMI. ismi2model: openmind importer like ismi2neo4j that saves networkx pickle file.
author casties
date Wed, 09 Sep 2015 17:32:42 +0200
parents
children a9bfd49355f8
line wrap: on
line source

'''
Created on 22.04.2014

@author: dwinter
'''

import os
import json
import urllib.request

#ismiBaseUrl="https://ismi.mpiwg-berlin.mpg.de/om4-ismi"
ismiBaseUrl="http://localhost:18080/ismi-richfaces"

class Importer:
    """Exporter that downloads OpenMind entities and saves them as JSON.

    The most recently loaded JSON document is kept in the ``data`` member.
    Every entity and relation downloaded by loadallEnts() is additionally
    collected in the ``allents`` and ``allrels`` dicts, keyed by id.
    """

    # maximum number of entity ids sent in one get_ents request
    chunkSize = 500

    def __init__(self):
        # most recently loaded JSON document (see loadJSON/loadJSONFromFile)
        self.data = None
        # per-instance registries (id -> entity / id -> relation); kept on
        # the instance so that separate importers do not share state
        self.allents = {}
        self.allrels = {}


    def loadJSON(self,url):
        """Load JSON from URL.
        
        Saves JSON in data member.
        """
        #print("  loading "+url)
        # read() instead of readall(): HTTPResponse has no readall() in
        # current Python 3 versions
        with urllib.request.urlopen(url) as response:
            str_response = response.read().decode('utf-8')

        self.data = json.loads(str_response)


    def loadJSONFromFile(self,fn):
        """Load JSON from file fn.json.
        
        Saves JSON in data member.
        """
        print("  loading "+fn+".json")
        # the file object does the decoding; json.load() no longer accepts
        # an encoding argument
        with open(fn+".json",'r', encoding="utf-8") as f:
            self.data = json.load(f)


    def getEntIds(self):
        """Extract entities from data member.
        
        Checks all relations.
        Returns a set of ids (as strings) of the entities in data and a list
        of all their src_rels and tar_rels relations.
        """
        # a missing or null "ents" member is treated as "no entities"
        ents = self.data.get("ents") or []

        ret=set()
        rels=[]

        for ent in ents:
            ret.add(str(ent.get('id')))
            if 'src_rels' in ent:
                print("src_rels: %s"%ent.get('src_rels'))
                rels.extend(ent.get('src_rels'))

            if 'tar_rels' in ent:
                print("tar_rels: %s"%ent.get('tar_rels'))
                rels.extend(ent.get('tar_rels'))

        return ret,rels


    def _registerRels(self,rels):
        """Add relations to allrels by id, reporting ids that already exist."""
        for rel in rels:
            rel_id = rel.get('id')
            if rel_id in self.allrels:
                print("relation id=%s exists!"%rel_id)
            else:
                self.allrels[rel_id] = rel


    def _fetchEnts(self,ids):
        """Download the entities with the given ids and return them as a list."""
        qs = (ismiBaseUrl
              + "/jsonInterface?include_content=true&include_romanization=true&method=get_ents"
              + "&ids=" + ",".join(ids))
        #print("  loading ents from "+qs)
        with urllib.request.urlopen(qs) as response:
            entsJ = json.loads(response.read().decode('utf-8'))

        return entsJ.get("ents") or []


    def loadallEnts(self,kind="tar",filterOC=None):
        """Get entities from OpenMind.
        
        Gets the ids of all entities in the data member via getEntIds().
        Downloads the entities from OpenMind using the ids, at most
        chunkSize ids per request, and records the entities and their
        relations in allents and allrels.
        The parameters kind and filterOC are accepted but not used.
        Returns a list of the entities and a list of relations. The list of
        relations is not de-duplicated.
        """
        ids,rels = self.getEntIds()

        # fixed order so that the requests are the same on every run
        idList = sorted(ids)
        lenId = len(idList)
        print("loading %s entities"%lenId)

        ents = []
        # stepping by chunkSize never produces an empty chunk, so no request
        # is sent with an empty id list
        for start in range(0, lenId, self.chunkSize):
            chunk = self._fetchEnts(idList[start:start+self.chunkSize])
            ents += chunk

            # iterate all entities
            for ent in chunk:
                ismi_id = ent.get('id')
                if ismi_id in self.allents:
                    print("entity id=%s exists!"%ismi_id)
                else:
                    self.allents[ismi_id] = ent

                # extract relations
                for key in ('src_rels', 'tar_rels'):
                    if key in ent:
                        entRels = ent.get(key)
                        rels.extend(entRels)
                        self._registerRels(entRels)

        return ents,rels


    def saveallEnts(self,filename,kind="tar",filterOC=None):
        """Loads all entities and saves as JSON.
        
        Loads all entities via loadallEnts(), passing on kind and filterOC.
        Saves entities in file filename.json.
        Saves relations in file filename_rels.json.
        """
        ents,rels = self.loadallEnts(kind=kind,filterOC=filterOC)

        print("  writing ", filename+".json")
        with open(filename+".json","wb") as of:
            of.write(json.dumps({"ents":ents}).encode('utf-8'))

        print("  writing ", filename+"_rels.json")
        with open(filename+"_rels.json","w") as of:
            json.dump({'rels':rels},of)
        
    
if __name__ == '__main__':
    # Export script: downloads the entities of every definition from
    # OpenMind and writes one pair of JSON files per definition plus
    # combined ALL.json / ALL_rels.json files into exportDir.
    imp = Importer()
    
    # get current list of all definitions 
    imp.loadJSON(ismiBaseUrl+"/jsonInterface?method=get_defs")
    ismi_defs = [atts['ov'] for atts in imp.data['defs']]
    
    # create directory for export files; exist_ok avoids the separate
    # existence check and does not fail if the directory is already there
    exportDir = '/tmp/ismi_data'
    os.makedirs(exportDir, exist_ok=True)
    
    for ismi_def in ismi_defs:
        print("loading entities of type %s"%ismi_def)
        #
        # load all entities of type ismi_def
        # contains entities with attributes and first-order relations
        #
        url = ismiBaseUrl+"/jsonInterface?method=get_ents&oc=%s"%ismi_def
        imp.loadJSON(url)
        
        #
        # load and save the entities as <ismi_def>.json and their
        # relations as <ismi_def>_rels.json
        #
        imp.saveallEnts(exportDir+"/%s"%ismi_def)
    
    #
    # save all entities in one file
    #
    print("  writing ", "ALL.json")
    allents = list(imp.allents.values())
    with open(exportDir+"/ALL.json","wb") as of:
        of.write(json.dumps({"ents":allents}).encode('utf-8'))

    #
    # save all relations in one file
    #
    print("  writing ", "ALL_rels.json")
    allrels = list(imp.allrels.values())
    with open(exportDir+"/ALL_rels.json","wb") as of:
        of.write(json.dumps({"rels":allrels}).encode('utf-8'))