Mercurial > hg > openmind
annotate src/main/java/org/mpi/openmind/scripts/DivaImportHttp.java @ 112:933d17f95016
new script MigratePrimeAliases to migrate is_prime_alias_X_of.
author | Robert Casties <casties@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 14 Aug 2019 20:48:02 +0200 |
parents | 8013b12cecf7 |
children | 0a8facc3d296 |
rev | line source |
---|---|
23
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
1 package org.mpi.openmind.scripts; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
2 |
107
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
3 import java.io.InputStreamReader; |
23
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
4 import java.util.ArrayList; |
108
bdd1c3fc0897
Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
107
diff
changeset
|
5 import java.util.HashMap; |
23
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
6 import java.util.List; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
7 import java.util.Map; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
8 |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
9 import org.apache.http.HttpEntity; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
10 import org.apache.http.HttpResponse; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
11 import org.apache.http.client.methods.HttpGet; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
12 import org.apache.http.impl.client.DefaultHttpClient; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
13 import org.apache.http.util.EntityUtils; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
14 import org.apache.log4j.ConsoleAppender; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
15 import org.apache.log4j.Level; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
16 import org.apache.log4j.Logger; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
17 import org.apache.log4j.PatternLayout; |
107
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
18 import org.json.JSONArray; |
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
19 import org.json.JSONObject; |
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
20 import org.json.JSONTokener; |
23
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
21 import org.mpi.openmind.cache.WrapperService; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
22 import org.mpi.openmind.repository.bo.Attribute; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
23 import org.mpi.openmind.repository.bo.Entity; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
24 import org.mpi.openmind.repository.bo.Node; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
25 import org.mpi.openmind.repository.services.ServiceRegistry; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
26 |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
27 |
47 | 28 /** |
29 * Downloads a list of Diva manifest files from the repository over HTTP | |
30 * and checks if each manifest has a corresponding DIGITALIZATION object. | |
31 * Creates missing DIGITALIZATION objects. | |
32 * | |
33 * @author casties | |
34 * | |
35 */ | |
23
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
36 public class DivaImportHttp { |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
37 |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
38 static{ |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
39 ConsoleAppender console = new ConsoleAppender(); //create appender |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
40 //configure the appender |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
41 String PATTERN = "%d [%p|%c|%C{1}] %m%n"; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
42 console.setLayout(new PatternLayout(PATTERN)); |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
43 console.setThreshold(Level.INFO); |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
44 console.activateOptions(); |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
45 //add appender to any Logger (here is root) |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
46 Logger.getRootLogger().addAppender(console); |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
47 } |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
48 |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
49 public static String DIGITALIZATION = "DIGITALIZATION"; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
50 public static String userName = "diva-import"; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
51 |
47 | 52 /** URL for listing of Diva manifest files */ |
107
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
53 public static String scanListUrl = "https://ismi-imageserver-cc.mpiwg-berlin.mpg.de/iiif/manifests-internal/"; |
47 | 54 |
107
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
55 public static void execute() { |
23
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
56 ServiceRegistry services = new ServiceRegistry(); |
47 | 57 // data model should exist by now |
28 | 58 //createDataModel(services.getWrapper()); |
23
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
59 importData(services.getWrapper()); |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
60 } |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
61 |
28 | 62 /** |
63 * Downloads a list of Diva manifest files from the repository and checks if | |
64 * each manifest has a corresponding DIGITALIZATION object. Creates missing | |
65 * DIGITALIZATION objects. | |
66 * | |
67 * @param omService | |
68 */ | |
23
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
69 private static void importData(WrapperService omService){ |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
70 try { |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
71 DefaultHttpClient httpclient = new DefaultHttpClient(); |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
72 HttpGet httpGet = new HttpGet(scanListUrl); |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
73 try { |
107
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
74 System.out.println("Reading scan manifests from "+scanListUrl); |
47 | 75 // send HTTP request and read response |
76 HttpResponse response = httpclient.execute(httpGet); | |
23
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
77 if (response.getStatusLine().getStatusCode() > 200) { |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
78 System.out.println("ERROR reading HTTP response: "+response.getStatusLine()); |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
79 return; |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
80 } |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
81 HttpEntity htent = response.getEntity(); |
107
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
82 JSONTokener jsonReader = new JSONTokener(new InputStreamReader(htent.getContent())); |
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
83 // parse JSON directory index of manifest files |
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
84 JSONArray files = new JSONArray(jsonReader); |
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
85 int numFiles = files.length(); |
108
bdd1c3fc0897
Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
107
diff
changeset
|
86 // get all DIGITALIZATION entities |
bdd1c3fc0897
Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
107
diff
changeset
|
87 System.out.println("Loading all DIGITALIZATIONs"); |
bdd1c3fc0897
Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
107
diff
changeset
|
88 List<Entity> digiList = omService.getEntitiesByDef(DIGITALIZATION); |
bdd1c3fc0897
Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
107
diff
changeset
|
89 // unpack in Map by name/ownValue |
bdd1c3fc0897
Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
107
diff
changeset
|
90 Map<String, Entity> digiMap = new HashMap<String, Entity>(digiList.size()); |
bdd1c3fc0897
Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
107
diff
changeset
|
91 for (Entity digi : digiList) { |
bdd1c3fc0897
Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
107
diff
changeset
|
92 String name = digi.getOwnValue(); |
bdd1c3fc0897
Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
107
diff
changeset
|
93 digiMap.put(name, digi); |
bdd1c3fc0897
Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
107
diff
changeset
|
94 } |
bdd1c3fc0897
Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
107
diff
changeset
|
95 List<Entity> saveList = new ArrayList<Entity>(); |
107
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
96 // go through all filenames in the list |
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
97 for (int i = 0; i < numFiles; ++i) { |
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
98 JSONObject file = files.getJSONObject(i); |
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
99 String filename = file.getString("name"); |
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
100 System.out.println("check: "+filename); |
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
101 String digiName = filename.replace(".json", ""); |
108
bdd1c3fc0897
Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
107
diff
changeset
|
102 if (digiMap.containsKey(digiName)) { |
23
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
103 //System.out.println(" exists: "+res); |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
104 } else { |
47 | 105 // no existing DIGITALIZATION - create new Entity |
109
8013b12cecf7
fix .json extension on imported digitalizations.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
108
diff
changeset
|
106 System.out.println(" create: "+digiName); |
23
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
107 Entity digi = new Entity(Node.TYPE_ABOX, DIGITALIZATION, false); |
109
8013b12cecf7
fix .json extension on imported digitalizations.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
108
diff
changeset
|
108 digi.setOwnValue(digiName); |
8013b12cecf7
fix .json extension on imported digitalizations.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
108
diff
changeset
|
109 digi.addAttribute(new Attribute("name", "text", digiName)); |
23
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
110 digi.addAttribute(new Attribute("num_files", "text", "100")); |
47 | 111 // add to list |
108
bdd1c3fc0897
Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
107
diff
changeset
|
112 saveList.add(digi); |
23
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
113 } |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
114 } |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
115 // ensure http entity is fully consumed |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
116 EntityUtils.consume(htent); |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
117 // persist OpenMind entities |
108
bdd1c3fc0897
Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
107
diff
changeset
|
118 omService.saveEntityList(saveList, userName); |
107
742347ef8410
Changed DivaImportHttp for new URL and JSON format.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
99
diff
changeset
|
119 System.out.println("Found " + numFiles + " manifests"); |
108
bdd1c3fc0897
Much faster DivaImportHttp that puts all entities in a Map first.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
107
diff
changeset
|
120 System.out.println("Created " + saveList.size() + " DIGITALIZATIONs"); |
23
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
121 System.out.println("END"); |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
122 |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
123 } finally { |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
124 httpGet.releaseConnection(); |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
125 } |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
126 |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
127 } catch (Exception e) { |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
128 e.printStackTrace(); |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
129 } |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
130 } |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
131 |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
132 |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
133 public static void main(String[] args){ |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
134 execute(); |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
135 System.exit(0); |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
136 } |
d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
casties
parents:
diff
changeset
|
137 } |