changeset 10:2a786f0d46a7

more comments in the code.
author casties
date Fri, 26 Jun 2015 10:59:53 +0200
parents cf772424f725
children 61dd2a96c4e2
files importFromOpenMind/importer/filterISMI.py
diffstat 1 files changed, 68 insertions(+), 29 deletions(-) [+]
line wrap: on
line diff
--- a/importFromOpenMind/importer/filterISMI.py	Mon Jun 22 19:02:34 2015 +0200
+++ b/importFromOpenMind/importer/filterISMI.py	Fri Jun 26 10:59:53 2015 +0200
@@ -11,8 +11,11 @@
 class Importer:
     
     def loadJSON(self,url):
-       
-
+        """Load JSON from URL.
+        
+        Saves JSON in data member.
+        """
+        print("  loading "+url)
         response = urllib.request.urlopen(url)
         str_response = response.readall().decode('utf-8')
     
@@ -20,18 +23,24 @@
         
     
     def loadJSONFromFile(self,fn):
+        """Load JSON from file.
         
-
+        Saves JSON in data member.
+        """
+        print("  loading "+fn+".json")
         self.data = json.load(open(fn+".json",'r', encoding="utf-8"),encoding="utf-8")
         
         
     def getEntIdsMentioned(self,kind="tar",filterOC=[]):
-        """ holt alle Id entweder als src_id """
+        """Extract related entities from data member.
+        
+        Checks relations of direction kind.
+        Skips objects of type filterOC.
+        Returns a set of ids of related objects and a list of the relations.  
+        """
         
         ents = self.data.get("ents")
         
-        
-        
         ret=set()
         rels=[]
         if kind=="tar":
@@ -46,29 +55,27 @@
         for ent in ents:
             tar_rels = ent.get(rel_type)
             
-           
-            
             for tar_rel in tar_rels:
-                
                
                 if not tar_rel.get(oc_type) in filterOC:
-                    
       
                     ret.add(str(tar_rel.get(id_type)))
                 
-                
                     rels.append(tar_rel)
-                
-                
         
         return ret,rels
         
        
     def loadallEnts(self,kind="tar",filterOC=[]):
+        """Get related entities from OpenMind.
+        
+        Gets all related entities' ids using kind and filterOC via getEntIdsMentioned().
+        Downloads the entities from OpenMind using the ids.
+        Returns the entities as JSON-string and a list of relations.
+        """
         
         ids,rels = self.getEntIdsMentioned(kind=kind,filterOC=filterOC)
         
-        
         baseUrl="https://ismi.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?include_content=true&include_romanization=true&method=get_ents"
         
         lenId = len(ids)
@@ -78,7 +85,6 @@
         ents = []
         for p in range(portions+1):
             
-            
             start = p * 500
             end = min(lenId,(p+1)*500)
             
@@ -87,28 +93,36 @@
             
             
             qs = baseUrl+"&ids="+idsString
-            print (qs)
+            print("  loading ents from "+qs)
             response = urllib.request.urlopen(qs)
             entsJ = json.loads(response.readall().decode('utf-8'));
             ents += entsJ.get("ents")
             #str_response += response.readall().decode('utf-8')
         
-        
         str_response = json.dumps({"ents":ents});
         return str_response,rels
     
+    
     def saveallEnts(self,filename,kind="tar",filterOC=[]):
+        """Loads all related entities and saves as JSON.
+        
+        Loads all related entities using kind and filterOC via loadallEnts().
+        Saves entities in file filename.json.
+        Saves relations in file filename_rels.json.
+        """
         
         ents,rels = self.loadallEnts(kind=kind,filterOC=filterOC)
+
+        print("  writing ", filename+".json")
         of = open(filename+".json","wb")
         of.write(ents.encode('utf-8'))
         of.close()
         
+        print("  writing ", filename+"_rels.json")
         of = open(filename+"_rels.json","w")
         json.dump({'rels':rels},of);
         of.close()
         
-        
     
 if __name__ == '__main__':
     imp = Importer()
@@ -116,45 +130,70 @@
 #     url = """http://openmind-ismi-dev.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?method=get_ents&ids=27543,36745,58453,87298,259646,35093,22863,34870,36882,101488,36696,31794,37240,35014,35583,37025,35960,172492,98286,165721,260111,90980,36316,260120,36241,260129,260138,38860,176694,72545,36185,36575,260146,31672,37739,89861,176778,180743,86328,260150,90658,58423,181058,105948,35526,74078,260158,181096,31606,31568,27872,36938,4836,34668,76866,102230,76888,74070,73757,182685,260162,260170,1102,172888,260174,34806,28088,36713,37323,34551,35943,98095,260178,260182,182770,260186,260190,260194,36114,85003,31630,157290,37153,37213,172952,86871,64406,102590,82615,58245,179791,179550,12419,95861,36429,36099,74237,36065,74822,87549,83765,36733,19259,260198,34986,88041,260202,36550,260206,37228,39880,36318,36597,35035,58328,80831,58354,74277,36529,36380,69450,200246,260222,81178,260226,199952,262557,87212,99059,64270,81811,65785,36645
 # """
 #    
-
+    #
+    # load all public codices
+    # contains codices with attributes and first-order relations
+    #
     url = """https://ismi.mpiwg-berlin.mpg.de/om4-ismi/jsonInterface?method=get_public_codices"""
    
-   
     imp.loadJSON(url)
     
-    #ids= imp.getEntIdsMentioned()
-    
-    
-    #loadall = imp.loadallEnts()
-    #print(loadall.encode('utf-8'))
-
+    # create directory for export files
     exportDir = '/tmp/ismi_data'
     if not os.access(exportDir, os.R_OK):
         # dir doesn't exist -> create
         os.makedirs(exportDir)
     
+    #
+    # load and save all target relations of codices as witnesses.json
+    #
     imp.saveallEnts(exportDir+"/witnesses",kind="tar")
     
+    #
+    # load and save all source relations of codices except type codex and witness as codex_src.json
+    # 
     imp.saveallEnts(exportDir+"/codex_src",kind="src",filterOC=['CODEX','WITNESS'])
     
     #hole jetzt alle relationen an den witnessen
     
+    #
+    # load the witnesses.json file from above
+    #
     imp.loadJSONFromFile(exportDir+"/witnesses")
     
-    #ids= imp.getEntIdsMentioned(kind="src")
- 
+    #
+    # load and save all source relations except type codex, witness, person as texts.json
+    # 
     imp.saveallEnts(exportDir+"/texts",kind="src",filterOC=['CODEX','WITNESS','PERSON'])
     
+    #
+    # load the texts.json file from above
+    #
     imp.loadJSONFromFile(exportDir+"/texts")
     
+    #
+    # load and save all source relations except type codex, witness and text as authors_subjects_src.json
+    #
     imp.saveallEnts(exportDir+"/authors_subjects_src",kind="src",filterOC=['CODEX','WITNESS','TEXT'])
     
+    #
+    # load and save all target relations except type codex, witness and text as authors_subjects_tar.json
+    #
     imp.saveallEnts(exportDir+"/authors_subjects_tar",kind="tar",filterOC=['CODEX','WITNESS','TEXT'])
     
-  
+    #
+    # load the authors_subjects_src.json file from above
+    #    
     imp.loadJSONFromFile(exportDir+"/authors_subjects_src")
     
+    #
+    # load and save all source relations except type codex, witness, text and person as subjects_places.json
+    #
     imp.saveallEnts(exportDir+"/subjects_places",kind="src",filterOC=['CODEX','WITNESS','TEXT','PERSON'])
+    
+    #
+    # load and save all target relations except type codex, witness, text and person as references_places.json
+    #
     imp.saveallEnts(exportDir+"/references_places",kind="tar",filterOC=['CODEX','WITNESS','TEXT','PERSON'])