Mercurial > hg > openmind
changeset 23:d2d4cd129f5e
new importer for DIGITALIZATIONs using the Diva manifest HTTP endpoint.
author | casties |
---|---|
date | Mon, 27 Jun 2016 19:11:36 -0400 |
parents | 165b1efb85cd |
children | 1c034e2f7367 |
files | src/main/java/org/mpi/openmind/scripts/DivaImportHttp.java |
diffstat | 1 files changed, 167 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/mpi/openmind/scripts/DivaImportHttp.java Mon Jun 27 19:11:36 2016 -0400 @@ -0,0 +1,167 @@ +package org.mpi.openmind.scripts; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.http.HttpEntity; +import org.apache.http.HttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.DefaultHttpClient; +import org.apache.http.util.EntityUtils; +import org.apache.log4j.ConsoleAppender; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.PatternLayout; +import org.mpi.openmind.cache.WrapperService; +import org.mpi.openmind.repository.bo.Attribute; +import org.mpi.openmind.repository.bo.Entity; +import org.mpi.openmind.repository.bo.Node; +import org.mpi.openmind.repository.bo.Relation; +import org.mpi.openmind.repository.services.ServiceRegistry; +import org.mpi.openmind.repository.services.utils.AttributeFilter; + + +public class DivaImportHttp { + + static{ + ConsoleAppender console = new ConsoleAppender(); //create appender + //configure the appender + String PATTERN = "%d [%p|%c|%C{1}] %m%n"; + console.setLayout(new PatternLayout(PATTERN)); + console.setThreshold(Level.INFO); + console.activateOptions(); + //add appender to any Logger (here is root) + Logger.getRootLogger().addAppender(console); + } + + public static String DIGITALIZATION = "DIGITALIZATION"; + public static String userName = "diva-import"; + + public static void execute(){ + ServiceRegistry services = new ServiceRegistry(); + createDataModel(services.getWrapper()); + importData(services.getWrapper()); + } + + private static void createDataModel(WrapperService ontology){ + + try { + + Entity digi = new Entity(Node.TYPE_TBOX, Node.TYPE_TBOX, false); + digi.setOwnValue(DIGITALIZATION); + + digi = ontology.saveLWDefinition(digi, userName); + + Attribute attName = new Attribute(Node.TYPE_TBOX, "text", "name"); + attName.setSourceId(digi.getId()); + attName.setSourceObjectClass(Node.TYPE_TBOX); + attName.setSourceModif(digi.getModificationTime()); + attName.setSystemStatus(Node.SYS_STATUS_CURRENT_VERSION); + + ontology.saveDefAttribute(attName, userName); + + Attribute num_files = new Attribute(Node.TYPE_TBOX, "text", "num_files"); + num_files.setSourceId(digi.getId()); + num_files.setSourceObjectClass(Node.TYPE_TBOX); + num_files.setSourceModif(digi.getModificationTime()); + num_files.setSystemStatus(Node.SYS_STATUS_CURRENT_VERSION); + + ontology.saveDefAttribute(num_files, userName); + + + //DIGI is_digitalization_of CODEX + Entity codex = ontology.getDefinition("CODEX"); + Relation rel = new Relation(digi, codex, "is_digitalization_of"); + + ontology.saveDefRelation(rel, userName); + + //----------- + Entity witness = ontology.getDefinition("WITNESS"); + + Attribute end_page = new Attribute(Node.TYPE_TBOX, "text", "end_page"); + end_page.setSourceId(witness.getId()); + end_page.setSourceObjectClass(Node.TYPE_TBOX); + end_page.setSourceModif(witness.getModificationTime()); + end_page.setSystemStatus(Node.SYS_STATUS_CURRENT_VERSION); + + ontology.saveDefAttribute(end_page, userName); + + Attribute start_page = new Attribute(Node.TYPE_TBOX, "text", "start_page"); + start_page.setSourceId(witness.getId()); + start_page.setSourceObjectClass(Node.TYPE_TBOX); + start_page.setSourceModif(witness.getModificationTime()); + start_page.setSystemStatus(Node.SYS_STATUS_CURRENT_VERSION); + + ontology.saveDefAttribute(start_page, userName); + + + } catch (Exception e) { + e.printStackTrace(); + } + + + } + + private static void importData(WrapperService omService){ + try { + DefaultHttpClient httpclient = new DefaultHttpClient(); + String scanListUrl = "https://images.rasi.mcgill.ca/data/"; + HttpGet httpGet = new HttpGet(scanListUrl); + System.out.println("Reading scan dirs from "+scanListUrl); + HttpResponse response = httpclient.execute(httpGet); + try { + if (response.getStatusLine().getStatusCode() > 200) { + System.out.println("ERROR reading HTTP response: "+response.getStatusLine()); + return; + } + HttpEntity htent = response.getEntity(); + String document = EntityUtils.toString(htent); + // brutal HTML string parsing ;-( + Pattern lp = Pattern.compile("<a href=\"([\\w_.]+)\\.json\">"); + Matcher lm = lp.matcher(document); + List<Entity> list = new ArrayList<Entity>(); + int dirs = 0; + while (lm.find()) { + dirs += 1; + String dirName = lm.group(1); + System.out.println("check: "+dirName); + List<AttributeFilter> filters = new ArrayList<AttributeFilter>(); + filters.add(new AttributeFilter("name", dirName, "DIGITALIZATION")); + Map<Entity, Attribute> res = omService.searchEntityByAttributeFilter(filters, -1); + if (res.size() > 0) { + //System.out.println(" exists: "+res); + } else { + System.out.println(" create: "+dirName); + Entity digi = new Entity(Node.TYPE_ABOX, DIGITALIZATION, false); + digi.setOwnValue(dirName); + digi.addAttribute(new Attribute("name", "text", dirName)); + digi.addAttribute(new Attribute("num_files", "text", "100")); + list.add(digi); + } + } + // ensure http entity is fully consumed + EntityUtils.consume(htent); + // persist OpenMind entities + omService.saveEntityList(list, userName); + System.out.println("Read " + dirs + " directories"); + System.out.println("Created " + list.size() + " DIGITALIZATIONs"); + System.out.println("END"); + + } finally { + httpGet.releaseConnection(); + } + + } catch (Exception e) { + e.printStackTrace(); + } + } + + + public static void main(String[] args){ + execute(); + System.exit(0); + } +}