Mercurial > hg > openmind
changeset 107:742347ef8410
Changed DivaImportHttp for new URL and JSON format.
author | Robert Casties <casties@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 08 Jul 2019 20:20:07 +0200 |
parents | 93c7dbfaf062 |
children | bdd1c3fc0897 |
files | src/main/java/org/mpi/openmind/scripts/DivaImportHttp.java src/main/java/org/mpi/openmind/scripts/DivaImportHttpOld.java |
diffstat | 2 files changed, 156 insertions(+), 21 deletions(-) [+] |
line wrap: on
line diff
--- a/src/main/java/org/mpi/openmind/scripts/DivaImportHttp.java Fri Apr 26 18:12:23 2019 +0200 +++ b/src/main/java/org/mpi/openmind/scripts/DivaImportHttp.java Mon Jul 08 20:20:07 2019 +0200 @@ -1,10 +1,9 @@ package org.mpi.openmind.scripts; +import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; @@ -15,11 +14,13 @@ import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.apache.log4j.PatternLayout; +import org.json.JSONArray; +import org.json.JSONObject; +import org.json.JSONTokener; import org.mpi.openmind.cache.WrapperService; import org.mpi.openmind.repository.bo.Attribute; import org.mpi.openmind.repository.bo.Entity; import org.mpi.openmind.repository.bo.Node; -import org.mpi.openmind.repository.bo.Relation; import org.mpi.openmind.repository.services.ServiceRegistry; import org.mpi.openmind.repository.services.utils.AttributeFilter; @@ -49,9 +50,9 @@ public static String userName = "diva-import"; /** URL for listing of Diva menifest files */ - public static String scanListUrl = "https://images.rasi.mcgill.ca/data/"; + public static String scanListUrl = "https://ismi-imageserver-cc.mpiwg-berlin.mpg.de/iiif/manifests-internal/"; - public static void execute(){ + public static void execute() { ServiceRegistry services = new ServiceRegistry(); // data model should exist by now //createDataModel(services.getWrapper()); @@ -70,7 +71,7 @@ DefaultHttpClient httpclient = new DefaultHttpClient(); HttpGet httpGet = new HttpGet(scanListUrl); try { - System.out.println("Reading scan dirs from "+scanListUrl); + System.out.println("Reading scan manifests from "+scanListUrl); // send HTTP request and read response HttpResponse response = httpclient.execute(httpGet); if (response.getStatusLine().getStatusCode() > 200) { @@ -78,20 +79,20 @@ return; } HttpEntity htent = 
response.getEntity(); - String document = EntityUtils.toString(htent); - // brutal HTML parsing by regex ;-( - Pattern lp = Pattern.compile("<a href=\"([\\w_.]+)\\.json\">"); - Matcher lm = lp.matcher(document); + JSONTokener jsonReader = new JSONTokener(new InputStreamReader(htent.getContent())); + // parse JSON directory index of manifest files + JSONArray files = new JSONArray(jsonReader); + int numFiles = files.length(); List<Entity> list = new ArrayList<Entity>(); - int dirs = 0; - // go through all filenames in the links - while (lm.find()) { - dirs += 1; - String dirName = lm.group(1); - System.out.println("check: "+dirName); + // go through all filenames in the list + for (int i = 0; i < numFiles; ++i) { + JSONObject file = files.getJSONObject(i); + String filename = file.getString("name"); + System.out.println("check: "+filename); // create filter to search OpenMind + String digiName = filename.replace(".json", ""); List<AttributeFilter> filters = new ArrayList<AttributeFilter>(); - AttributeFilter filter = new AttributeFilter("name", dirName, "DIGITALIZATION"); + AttributeFilter filter = new AttributeFilter("name", digiName, "DIGITALIZATION"); filter.setExactMatch(true); filters.add(filter); // get matching DIGITALIZATIONs @@ -100,10 +101,10 @@ //System.out.println(" exists: "+res); } else { // no existing DIGITALIZATION - create new Entity - System.out.println(" create: "+dirName); + System.out.println(" create: "+filename); Entity digi = new Entity(Node.TYPE_ABOX, DIGITALIZATION, false); - digi.setOwnValue(dirName); - digi.addAttribute(new Attribute("name", "text", dirName)); + digi.setOwnValue(filename); + digi.addAttribute(new Attribute("name", "text", filename)); digi.addAttribute(new Attribute("num_files", "text", "100")); // add to list list.add(digi); @@ -113,7 +114,7 @@ EntityUtils.consume(htent); // persist OpenMind entities omService.saveEntityList(list, userName); - System.out.println("Read " + dirs + " directories"); + 
System.out.println("Found " + numFiles + " manifests"); System.out.println("Created " + list.size() + " DIGITALIZATIONs"); System.out.println("END");
// Source file: src/main/java/org/mpi/openmind/scripts/DivaImportHttpOld.java
package org.mpi.openmind.scripts;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;
import org.mpi.openmind.cache.WrapperService;
import org.mpi.openmind.repository.bo.Attribute;
import org.mpi.openmind.repository.bo.Entity;
import org.mpi.openmind.repository.bo.Node;
import org.mpi.openmind.repository.bo.Relation;
import org.mpi.openmind.repository.services.ServiceRegistry;
import org.mpi.openmind.repository.services.utils.AttributeFilter;


/**
 * Downloads a list of Diva manifest files from the repository over HTTP
 * and checks if each manifest has a corresponding DIGITALIZATION object.
 * Creates missing DIGITALIZATION objects.
 *
 * <p>This variant reads an HTML directory listing and extracts the manifest
 * names from its {@code <a href="....json">} links.
 *
 * @author casties
 *
 */
public class DivaImportHttpOld {

    static {
        // create the appender
        ConsoleAppender console = new ConsoleAppender();
        // configure the appender
        String PATTERN = "%d [%p|%c|%C{1}] %m%n";
        console.setLayout(new PatternLayout(PATTERN));
        console.setThreshold(Level.INFO);
        console.activateOptions();
        // add appender to any Logger (here is root)
        Logger.getRootLogger().addAppender(console);
    }

    /** Object class name of the entities that are searched for and created. */
    public static String DIGITALIZATION = "DIGITALIZATION";

    /** User name passed to the repository when saving new entities. */
    public static String userName = "diva-import";

    /** URL for listing of Diva manifest files */
    public static String scanListUrl = "https://images.rasi.mcgill.ca/data/";

    /**
     * Matches one link to a JSON manifest in the HTML listing.
     * Group 1 is the file name without the ".json" extension.
     * Compiled once because the pattern never changes.
     */
    private static final Pattern MANIFEST_LINK = Pattern.compile("<a href=\"([\\w_.]+)\\.json\">");

    /**
     * Creates a new ServiceRegistry and runs the import against its wrapper service.
     */
    public static void execute() {
        ServiceRegistry services = new ServiceRegistry();
        // data model should exist by now
        //createDataModel(services.getWrapper());
        importData(services.getWrapper());
    }

    /**
     * Downloads a list of Diva manifest files from the repository and checks if
     * each manifest has a corresponding DIGITALIZATION object. Creates missing
     * DIGITALIZATION objects.
     *
     * <p>Any exception is caught here and its stack trace printed; nothing is
     * propagated to the caller. A response status above 200 is reported on
     * stdout and ends the import without saving anything.
     *
     * @param omService service used to search for and save DIGITALIZATION entities
     */
    private static void importData(WrapperService omService) {
        DefaultHttpClient httpclient = null;
        try {
            httpclient = new DefaultHttpClient();
            HttpGet httpGet = new HttpGet(scanListUrl);
            try {
                System.out.println("Reading scan dirs from "+scanListUrl);
                // send HTTP request and read response
                HttpResponse response = httpclient.execute(httpGet);
                if (response.getStatusLine().getStatusCode() > 200) {
                    System.out.println("ERROR reading HTTP response: "+response.getStatusLine());
                    return;
                }
                HttpEntity htent = response.getEntity();
                String document = EntityUtils.toString(htent);

                // go through all filenames in the links
                List<String> dirNames = extractManifestNames(document);
                List<Entity> list = new ArrayList<Entity>();
                for (String dirName : dirNames) {
                    System.out.println("check: "+dirName);
                    if (!digitalizationExists(omService, dirName)) {
                        // no existing DIGITALIZATION - create new Entity
                        System.out.println(" create: "+dirName);
                        list.add(newDigitalization(dirName));
                    }
                }

                // ensure http entity is fully consumed
                EntityUtils.consume(htent);
                // persist OpenMind entities
                omService.saveEntityList(list, userName);
                System.out.println("Read " + dirNames.size() + " directories");
                System.out.println("Created " + list.size() + " DIGITALIZATIONs");
                System.out.println("END");

            } finally {
                httpGet.releaseConnection();
            }

        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // releaseConnection() only hands the connection back to the pool;
            // the connection manager itself must be shut down to free its sockets.
            if (httpclient != null) {
                httpclient.getConnectionManager().shutdown();
            }
        }
    }

    /**
     * Extracts the manifest names from the HTML listing, in document order.
     * This is HTML parsing by regex, so only links written exactly as
     * {@code <a href="NAME.json">} are found.
     *
     * @param document HTML text of the listing page
     * @return names of all linked JSON files without the ".json" extension
     */
    private static List<String> extractManifestNames(String document) {
        List<String> names = new ArrayList<String>();
        Matcher links = MANIFEST_LINK.matcher(document);
        while (links.find()) {
            names.add(links.group(1));
        }
        return names;
    }

    /**
     * Checks whether a DIGITALIZATION whose "name" attribute exactly matches
     * the given name is found by the service.
     *
     * @param omService service to search in
     * @param name name to look for
     * @return true if the search returned at least one match
     */
    private static boolean digitalizationExists(WrapperService omService, String name) {
        // create filter to search OpenMind
        List<AttributeFilter> filters = new ArrayList<AttributeFilter>();
        AttributeFilter filter = new AttributeFilter("name", name, DIGITALIZATION);
        filter.setExactMatch(true);
        filters.add(filter);
        // get matching DIGITALIZATIONs
        Map<Entity, Attribute> res = omService.searchEntityByAttributeFilter(filters, -1);
        return !res.isEmpty();
    }

    /**
     * Builds a new, unsaved DIGITALIZATION entity with the given name as own
     * value and "name" attribute.
     *
     * @param name name of the digitalization
     * @return the new entity
     */
    private static Entity newDigitalization(String name) {
        Entity digi = new Entity(Node.TYPE_ABOX, DIGITALIZATION, false);
        digi.setOwnValue(name);
        digi.addAttribute(new Attribute("name", "text", name));
        // NOTE(review): "100" is a fixed value, not a real file count - presumably a placeholder; confirm.
        digi.addAttribute(new Attribute("num_files", "text", "100"));
        return digi;
    }


    /**
     * Runs the import and then exits the JVM with status 0.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        execute();
        System.exit(0);
    }
}