changeset 107:742347ef8410

Changed DivaImportHttp for new URL and JSON format.
author Robert Casties <casties@mpiwg-berlin.mpg.de>
date Mon, 08 Jul 2019 20:20:07 +0200
parents 93c7dbfaf062
children bdd1c3fc0897
files src/main/java/org/mpi/openmind/scripts/DivaImportHttp.java src/main/java/org/mpi/openmind/scripts/DivaImportHttpOld.java
diffstat 2 files changed, 156 insertions(+), 21 deletions(-) [+]
line wrap: on
line diff
--- a/src/main/java/org/mpi/openmind/scripts/DivaImportHttp.java	Fri Apr 26 18:12:23 2019 +0200
+++ b/src/main/java/org/mpi/openmind/scripts/DivaImportHttp.java	Mon Jul 08 20:20:07 2019 +0200
@@ -1,10 +1,9 @@
 package org.mpi.openmind.scripts;
 
+import java.io.InputStreamReader;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import org.apache.http.HttpEntity;
 import org.apache.http.HttpResponse;
@@ -15,11 +14,13 @@
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.apache.log4j.PatternLayout;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.json.JSONTokener;
 import org.mpi.openmind.cache.WrapperService;
 import org.mpi.openmind.repository.bo.Attribute;
 import org.mpi.openmind.repository.bo.Entity;
 import org.mpi.openmind.repository.bo.Node;
-import org.mpi.openmind.repository.bo.Relation;
 import org.mpi.openmind.repository.services.ServiceRegistry;
 import org.mpi.openmind.repository.services.utils.AttributeFilter;
 
@@ -49,9 +50,9 @@
 	public static String userName = "diva-import";
 	
 	/** URL for listing of Diva menifest files */
-    public static String scanListUrl = "https://images.rasi.mcgill.ca/data/";
+    public static String scanListUrl = "https://ismi-imageserver-cc.mpiwg-berlin.mpg.de/iiif/manifests-internal/";
 	
-	public static void execute(){
+	public static void execute() {
 		ServiceRegistry services = new ServiceRegistry();
 		// data model should exist by now
 		//createDataModel(services.getWrapper());
@@ -70,7 +71,7 @@
 		    DefaultHttpClient httpclient = new DefaultHttpClient();
 		    HttpGet httpGet = new HttpGet(scanListUrl);
 		    try {
-	            System.out.println("Reading scan dirs from "+scanListUrl);
+	            System.out.println("Reading scan manifests from "+scanListUrl);
 	            // send HTTP request and read response
 	            HttpResponse response = httpclient.execute(httpGet);
 		        if (response.getStatusLine().getStatusCode() > 200) {
@@ -78,20 +79,20 @@
 		            return;
 		        }
 		        HttpEntity htent = response.getEntity();
-		        String document = EntityUtils.toString(htent);
-		        // brutal HTML parsing by regex ;-(
-		        Pattern lp = Pattern.compile("<a href=\"([\\w_.]+)\\.json\">");
-		        Matcher lm = lp.matcher(document);
+		        JSONTokener jsonReader = new JSONTokener(new InputStreamReader(htent.getContent()));
+		        // parse JSON directory index of manifest files
+		        JSONArray files = new JSONArray(jsonReader);
+		        int numFiles = files.length();
 	            List<Entity> list = new ArrayList<Entity>();
-	            int dirs = 0;
-	            // go through all filenames in the links
-		        while (lm.find()) {
-		            dirs += 1;
-		            String dirName = lm.group(1);
-                    System.out.println("check: "+dirName);
+	            // go through all filenames in the list
+		        for (int i = 0; i < numFiles; ++i) {
+		        	JSONObject file = files.getJSONObject(i);
+		            String filename = file.getString("name");
+                    System.out.println("check: "+filename);
                     // create filter to search OpenMind
+                    String digiName = filename.replace(".json", "");
 		            List<AttributeFilter> filters = new ArrayList<AttributeFilter>();
-		            AttributeFilter filter = new AttributeFilter("name", dirName, "DIGITALIZATION");
+		            AttributeFilter filter = new AttributeFilter("name", digiName, "DIGITALIZATION");
 		            filter.setExactMatch(true);
 					filters.add(filter);
 		            // get matching DIGITALIZATIONs
@@ -100,10 +101,10 @@
                         //System.out.println("  exists: "+res);
                     } else {
                         // no existing DIGITALIZATION - create new Entity
-                        System.out.println(" create: "+dirName);
+                        System.out.println(" create: "+filename);
                         Entity digi = new Entity(Node.TYPE_ABOX, DIGITALIZATION, false);
-                        digi.setOwnValue(dirName);                        
-                        digi.addAttribute(new Attribute("name", "text", dirName));
+                        digi.setOwnValue(filename);                        
+                        digi.addAttribute(new Attribute("name", "text", filename));
                         digi.addAttribute(new Attribute("num_files", "text", "100"));
                         // add to list
                         list.add(digi);
@@ -113,7 +114,7 @@
 		        EntityUtils.consume(htent);
 		        // persist OpenMind entities
 		        omService.saveEntityList(list, userName);
-                System.out.println("Read " + dirs + " directories");
+                System.out.println("Found " + numFiles + " manifests");
 	            System.out.println("Created " + list.size() + " DIGITALIZATIONs");
 	            System.out.println("END");
 		        
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/mpi/openmind/scripts/DivaImportHttpOld.java	Mon Jul 08 20:20:07 2019 +0200
@@ -0,0 +1,134 @@
+package org.mpi.openmind.scripts;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.http.HttpEntity;
+import org.apache.http.HttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.DefaultHttpClient;
+import org.apache.http.util.EntityUtils;
+import org.apache.log4j.ConsoleAppender;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.apache.log4j.PatternLayout;
+import org.mpi.openmind.cache.WrapperService;
+import org.mpi.openmind.repository.bo.Attribute;
+import org.mpi.openmind.repository.bo.Entity;
+import org.mpi.openmind.repository.bo.Node;
+import org.mpi.openmind.repository.bo.Relation;
+import org.mpi.openmind.repository.services.ServiceRegistry;
+import org.mpi.openmind.repository.services.utils.AttributeFilter;
+
+
+/**
+ * Downloads a list of Diva manifest files from the repository over HTTP 
+ * and checks if each manifest has a corresponding DIGITALIZATION object. 
+ * Creates missing DIGITALIZATION objects.
+ * 
+ * @author casties
+ *
+ */
+public class DivaImportHttpOld {
+	// Installs a console appender on the root logger when the class is loaded.
+	static{
+		ConsoleAppender console = new ConsoleAppender(); //create appender
+		  //configure the appender
+		  String PATTERN = "%d [%p|%c|%C{1}] %m%n";
+		  console.setLayout(new PatternLayout(PATTERN)); 
+		  console.setThreshold(Level.INFO);
+		  console.activateOptions();
+		  //add appender to any Logger (here is root)
+		  Logger.getRootLogger().addAppender(console);
+	}
+
+	public static String DIGITALIZATION = "DIGITALIZATION";
+	public static String userName = "diva-import";
+	
+    /** URL of the HTML listing of Diva manifest files */
+    public static String scanListUrl = "https://images.rasi.mcgill.ca/data/";
+	/** Creates a ServiceRegistry and runs the import with its WrapperService. */
+	public static void execute(){
+		ServiceRegistry services = new ServiceRegistry();
+		// data model should exist by now
+		//createDataModel(services.getWrapper());
+		importData(services.getWrapper());
+	}
+	
+	/**
+	 * Fetches the listing at scanListUrl, extracts the base name of every linked
+	 * ".json" file and creates a DIGITALIZATION entity for each name that has no
+	 * exact match on the "name" attribute. Exceptions are caught and printed.
+	 * 
+	 * @param omService service used to search for existing entities and to save new ones
+	 */
+	private static void importData(WrapperService omService){
+		try {
+		    DefaultHttpClient httpclient = new DefaultHttpClient();
+		    HttpGet httpGet = new HttpGet(scanListUrl);
+		    try {
+	            System.out.println("Reading scan dirs from "+scanListUrl);
+	            // send HTTP request and read response; any status above 200 aborts the import
+	            HttpResponse response = httpclient.execute(httpGet);
+		        if (response.getStatusLine().getStatusCode() > 200) {
+		            System.out.println("ERROR reading HTTP response: "+response.getStatusLine());
+		            return;
+		        }
+		        HttpEntity htent = response.getEntity();
+		        String document = EntityUtils.toString(htent);
+		        // scrape the HTML listing by regex: group 1 is the link target without ".json"
+		        Pattern lp = Pattern.compile("<a href=\"([\\w_.]+)\\.json\">");
+		        Matcher lm = lp.matcher(document);
+	            List<Entity> list = new ArrayList<Entity>();
+	            int dirs = 0;
+	            // go through all filenames in the links; dirs counts the matches
+		        while (lm.find()) {
+		            dirs += 1;
+		            String dirName = lm.group(1);
+                    System.out.println("check: "+dirName);
+                    // create exact-match filter on the "name" attribute to search OpenMind
+		            List<AttributeFilter> filters = new ArrayList<AttributeFilter>();
+		            AttributeFilter filter = new AttributeFilter("name", dirName, "DIGITALIZATION");
+		            filter.setExactMatch(true);
+					filters.add(filter);
+		            // get matching DIGITALIZATIONs (-1 presumably means no result limit - confirm)
+                    Map<Entity, Attribute> res = omService.searchEntityByAttributeFilter(filters, -1);
+                    if (res.size() > 0) {
+                        // a DIGITALIZATION with this name already exists - nothing to do
+                    } else {
+                        // no existing DIGITALIZATION - create new Entity
+                        System.out.println(" create: "+dirName);
+                        Entity digi = new Entity(Node.TYPE_ABOX, DIGITALIZATION, false);
+                        digi.setOwnValue(dirName);                        
+                        digi.addAttribute(new Attribute("name", "text", dirName));
+                        digi.addAttribute(new Attribute("num_files", "text", "100"));
+                        // num_files is a hard-coded "100"; collect the entity for the batch save
+                        list.add(digi);
+                    }
+		        }
+		        // ensure http entity is fully consumed
+		        EntityUtils.consume(htent);
+		        // persist all new OpenMind entities in one call, passing userName
+		        omService.saveEntityList(list, userName);
+                System.out.println("Read " + dirs + " directories");
+	            System.out.println("Created " + list.size() + " DIGITALIZATIONs");
+	            System.out.println("END");
+		        
+		    } finally {
+		        httpGet.releaseConnection();
+		    }
+		    
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+	}
+	
+	/** Command-line entry point: runs the import, then exits the JVM with status 0. */
+	public static void main(String[] args){
+		execute();
+		System.exit(0);
+	}
+}