annotate src/main/java/org/mpi/openmind/scripts/DivaImportHttp.java @ 112:933d17f95016

new script MigratePrimeAliases to migrate is_prime_alias_X_of.
author Robert Casties <casties@mpiwg-berlin.mpg.de>
date Wed, 14 Aug 2019 20:48:02 +0200
parents 8013b12cecf7
children 0a8facc3d296
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
23
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
1 package org.mpi.openmind.scripts;
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
2
107
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
3 import java.io.InputStreamReader;
23
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
4 import java.util.ArrayList;
108
bdd1c3fc0897 Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 107
diff changeset
5 import java.util.HashMap;
23
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
6 import java.util.List;
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
7 import java.util.Map;
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
8
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
9 import org.apache.http.HttpEntity;
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
10 import org.apache.http.HttpResponse;
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
11 import org.apache.http.client.methods.HttpGet;
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
12 import org.apache.http.impl.client.DefaultHttpClient;
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
13 import org.apache.http.util.EntityUtils;
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
14 import org.apache.log4j.ConsoleAppender;
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
15 import org.apache.log4j.Level;
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
16 import org.apache.log4j.Logger;
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
17 import org.apache.log4j.PatternLayout;
107
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
18 import org.json.JSONArray;
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
19 import org.json.JSONObject;
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
20 import org.json.JSONTokener;
23
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
21 import org.mpi.openmind.cache.WrapperService;
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
22 import org.mpi.openmind.repository.bo.Attribute;
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
23 import org.mpi.openmind.repository.bo.Entity;
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
24 import org.mpi.openmind.repository.bo.Node;
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
25 import org.mpi.openmind.repository.services.ServiceRegistry;
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
26
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
27
47
c9a411c8f742 more comments.
casties
parents: 28
diff changeset
28 /**
c9a411c8f742 more comments.
casties
parents: 28
diff changeset
29 * Downloads a list of Diva manifest files from the repository over HTTP
c9a411c8f742 more comments.
casties
parents: 28
diff changeset
30 * and checks if each manifest has a corresponding DIGITALIZATION object.
c9a411c8f742 more comments.
casties
parents: 28
diff changeset
31 * Creates missing DIGITALIZATION objects.
c9a411c8f742 more comments.
casties
parents: 28
diff changeset
32 *
c9a411c8f742 more comments.
casties
parents: 28
diff changeset
33 * @author casties
c9a411c8f742 more comments.
casties
parents: 28
diff changeset
34 *
c9a411c8f742 more comments.
casties
parents: 28
diff changeset
35 */
23
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
36 public class DivaImportHttp {
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
37
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
38 static{
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
39 ConsoleAppender console = new ConsoleAppender(); //create appender
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
40 //configure the appender
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
41 String PATTERN = "%d [%p|%c|%C{1}] %m%n";
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
42 console.setLayout(new PatternLayout(PATTERN));
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
43 console.setThreshold(Level.INFO);
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
44 console.activateOptions();
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
45 //add appender to any Logger (here is root)
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
46 Logger.getRootLogger().addAppender(console);
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
47 }
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
48
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
49 public static String DIGITALIZATION = "DIGITALIZATION";
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
50 public static String userName = "diva-import";
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
51
47
c9a411c8f742 more comments.
casties
parents: 28
diff changeset
52 /** URL for listing of Diva menifest files */
107
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
53 public static String scanListUrl = "https://ismi-imageserver-cc.mpiwg-berlin.mpg.de/iiif/manifests-internal/";
47
c9a411c8f742 more comments.
casties
parents: 28
diff changeset
54
107
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
55 public static void execute() {
23
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
56 ServiceRegistry services = new ServiceRegistry();
47
c9a411c8f742 more comments.
casties
parents: 28
diff changeset
57 // data model should exist by now
28
bdba8c108183 clean up.
casties
parents: 23
diff changeset
58 //createDataModel(services.getWrapper());
23
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
59 importData(services.getWrapper());
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
60 }
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
61
28
bdba8c108183 clean up.
casties
parents: 23
diff changeset
62 /**
bdba8c108183 clean up.
casties
parents: 23
diff changeset
63 * Downloads a list of Diva manifest files from the repository and checks if
bdba8c108183 clean up.
casties
parents: 23
diff changeset
64 * each manifest has a corresponding DIGITALIZATION object. Creates missing
bdba8c108183 clean up.
casties
parents: 23
diff changeset
65 * DIGITALIZATION objects.
bdba8c108183 clean up.
casties
parents: 23
diff changeset
66 *
bdba8c108183 clean up.
casties
parents: 23
diff changeset
67 * @param omService
bdba8c108183 clean up.
casties
parents: 23
diff changeset
68 */
23
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
69 private static void importData(WrapperService omService){
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
70 try {
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
71 DefaultHttpClient httpclient = new DefaultHttpClient();
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
72 HttpGet httpGet = new HttpGet(scanListUrl);
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
73 try {
107
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
74 System.out.println("Reading scan manifests from "+scanListUrl);
47
c9a411c8f742 more comments.
casties
parents: 28
diff changeset
75 // send HTTP request and read response
c9a411c8f742 more comments.
casties
parents: 28
diff changeset
76 HttpResponse response = httpclient.execute(httpGet);
23
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
77 if (response.getStatusLine().getStatusCode() > 200) {
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
78 System.out.println("ERROR reading HTTP response: "+response.getStatusLine());
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
79 return;
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
80 }
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
81 HttpEntity htent = response.getEntity();
107
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
82 JSONTokener jsonReader = new JSONTokener(new InputStreamReader(htent.getContent()));
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
83 // parse JSON directory index of manifest files
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
84 JSONArray files = new JSONArray(jsonReader);
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
85 int numFiles = files.length();
108
bdd1c3fc0897 Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 107
diff changeset
86 // get all DIGITALIZATION entities
bdd1c3fc0897 Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 107
diff changeset
87 System.out.println("Loading all DIGITALIZATIONs");
bdd1c3fc0897 Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 107
diff changeset
88 List<Entity> digiList = omService.getEntitiesByDef(DIGITALIZATION);
bdd1c3fc0897 Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 107
diff changeset
89 // unpack in Map by name/ownValue
bdd1c3fc0897 Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 107
diff changeset
90 Map<String, Entity> digiMap = new HashMap<String, Entity>(digiList.size());
bdd1c3fc0897 Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 107
diff changeset
91 for (Entity digi : digiList) {
bdd1c3fc0897 Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 107
diff changeset
92 String name = digi.getOwnValue();
bdd1c3fc0897 Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 107
diff changeset
93 digiMap.put(name, digi);
bdd1c3fc0897 Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 107
diff changeset
94 }
bdd1c3fc0897 Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 107
diff changeset
95 List<Entity> saveList = new ArrayList<Entity>();
107
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
96 // go through all filenames in the list
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
97 for (int i = 0; i < numFiles; ++i) {
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
98 JSONObject file = files.getJSONObject(i);
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
99 String filename = file.getString("name");
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
100 System.out.println("check: "+filename);
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
101 String digiName = filename.replace(".json", "");
108
bdd1c3fc0897 Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 107
diff changeset
102 if (digiMap.containsKey(digiName)) {
23
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
103 //System.out.println(" exists: "+res);
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
104 } else {
47
c9a411c8f742 more comments.
casties
parents: 28
diff changeset
105 // no existing DIGITALIZATION - create new Entity
109
8013b12cecf7 fix .json extension on imported digitalizations.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 108
diff changeset
106 System.out.println(" create: "+digiName);
23
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
107 Entity digi = new Entity(Node.TYPE_ABOX, DIGITALIZATION, false);
109
8013b12cecf7 fix .json extension on imported digitalizations.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 108
diff changeset
108 digi.setOwnValue(digiName);
8013b12cecf7 fix .json extension on imported digitalizations.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 108
diff changeset
109 digi.addAttribute(new Attribute("name", "text", digiName));
23
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
110 digi.addAttribute(new Attribute("num_files", "text", "100"));
47
c9a411c8f742 more comments.
casties
parents: 28
diff changeset
111 // add to list
108
bdd1c3fc0897 Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 107
diff changeset
112 saveList.add(digi);
23
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
113 }
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
114 }
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
115 // ensure http entity is fully consumed
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
116 EntityUtils.consume(htent);
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
117 // persist OpenMind entities
108
bdd1c3fc0897 Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 107
diff changeset
118 omService.saveEntityList(saveList, userName);
107
742347ef8410 Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 99
diff changeset
119 System.out.println("Found " + numFiles + " manifests");
108
bdd1c3fc0897 Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents: 107
diff changeset
120 System.out.println("Created " + saveList.size() + " DIGITALIZATIONs");
23
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
121 System.out.println("END");
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
122
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
123 } finally {
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
124 httpGet.releaseConnection();
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
125 }
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
126
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
127 } catch (Exception e) {
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
128 e.printStackTrace();
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
129 }
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
130 }
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
131
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
132
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
133 public static void main(String[] args){
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
134 execute();
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
135 System.exit(0);
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
136 }
d2d4cd129f5e new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff changeset
137 }