comparison src/de/mpiwg/itgroup/eSciDoc/Tools/IngestECHO.java @ 0:c6929e63b0b8

first import
author dwinter
date Wed, 24 Nov 2010 16:52:07 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c6929e63b0b8
1 package de.mpiwg.itgroup.eSciDoc.Tools;
2
3 //TODO: create context for ECHO and content model
4 import java.io.BufferedReader;
5 import java.io.ByteArrayInputStream;
6 import java.io.ByteArrayOutputStream;
7 import java.io.IOException;
8 import java.io.InputStream;
9 import java.io.InputStreamReader;
10 import java.io.PrintStream;
11 import java.io.StringReader;
12 import java.net.MalformedURLException;
13 import java.net.URL;
14 import java.util.ArrayList;
15 import java.util.HashMap;
16 import java.util.List;
17 import java.util.regex.Matcher;
18 import java.util.regex.Pattern;
19
20 import javax.xml.parsers.DocumentBuilder;
21 import javax.xml.parsers.DocumentBuilderFactory;
22 import javax.xml.xpath.XPath;
23 import javax.xml.xpath.XPathConstants;
24 import javax.xml.xpath.XPathFactory;
25
26 import org.apache.http.HttpResponse;
27 import org.apache.http.client.ClientProtocolException;
28 import org.apache.xmlrpc.XmlRpcException;
29 import org.apache.xmlrpc.client.XmlRpcClient;
30 import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;
31 import org.jdom.JDOMException;
32 import org.w3c.dom.Document;
33 import org.w3c.dom.NodeList;
34 import org.xml.sax.InputSource;
35 import org.xml.sax.SAXParseException;
36
37 import sun.misc.Regexp;
38
39 //import fedora.client.FedoraClient;
40 //import fedora.server.access.FedoraAPIA;
41 //import fedora.server.management.FedoraAPIM;
42 //import fedora.server.types.gen.ComparisonOperator;
43 //import fedora.server.types.gen.Condition;
44 //import fedora.server.types.gen.FieldSearchQuery;
45 //import fedora.server.types.gen.FieldSearchResult;
46 //import fedora.server.types.gen.ListSession;
47 //import fedora.server.types.gen.MIMETypedStream;
48 //import fedora.server.types.gen.ObjectFields;
49
50 public class IngestECHO extends Ingestor {
51
52 protected String ECHORESOURCE_TEMPLATE_XML;
53 protected String ECHOCONTAINER_TEMPLATE_XML;
54 private String SERVLETURL;
55 protected String ECHOURL;
56 protected String ECHO_CONTAINER_ID;
57 protected String ECHO_ROOT_ID;
58 protected String MAIN_CONTEXT;
59 private HashMap<String, String> pids;
60
61 protected static String ESCIDOC_SERVER_URL = "euler.mpiwg-berlin.mpg.de";
62 protected static String ZOPEPROVIDER = "http://127.0.0.1:18080";
63
64 private static int PORT = 8080;
65
66 IngestECHO(String user, String password){
67
68 super(ESCIDOC_SERVER_URL, PORT, ZOPEPROVIDER, user, password);
69 ECHORESOURCE_TEMPLATE_XML = "ECHOResourceTemplate.xml";
70
71 SERVLETURL= "http://nausikaa2.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter?fn=";
72
73 ECHOURL = "http://echo.mpiwg-berlin.mpg.de";
74
75 //ZOPEPROVIDER = "http://127.0.0.1:18080";
76
77 ECHO_CONTAINER_ID = "escidoc:3006"; // enthaelt alle ECHO
78 // objecte
79 ECHO_ROOT_ID = "escidoc:3005"; // enthaelt alle Objekte die
80 // keiner ECHO collection
81 // angehoeren
82
83 MAIN_CONTEXT = "escidoc:3002";
84
85 HashMap<String, String> pids = null;
86
87
88 }
89
90 void ingestECHOCollections() throws XmlRpcException, IOException {
91 ArrayList<String> urls = getAllCollections();
92 HashMap<String, String> success = new HashMap<String, String>();
93 HashMap<String, String> nosuccess = new HashMap<String, String>();
94
95 for (String url : urls) {
96
97 try {
98 String id = ingestECHOCollection(url);
99 success.put(id, url);
100 } catch (Exception e) {
101
102 ByteArrayOutputStream out = new ByteArrayOutputStream();
103 PrintStream s = new PrintStream(out);
104 e.printStackTrace(s);
105
106 nosuccess.put(url, out.toString());
107
108 e.printStackTrace();
109 }
110 }
111 System.out.println("SUCCESSFULL INGEST");
112 for (String id : success.keySet())
113 System.out.println("ID:" + id + " URL:" + success.get(id));
114
115 System.out.println("ERRORS:");
116 for (String id : nosuccess.keySet()) {
117 System.out.println("URL:" + id);
118 System.out.println("Message:" + nosuccess.get(id));
119 }
120
121 }
122
123 void organizeECHOCollections() throws XmlRpcException, IOException,
124 JDOMException {
125 ArrayList<String> urls = getAllCollections();
126 HashMap<String, String> success = new HashMap<String, String>();
127 HashMap<String, String> nosuccess = new HashMap<String, String>();
128
129 for (String url : urls) {
130
131 XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl();
132 XmlRpcClient client = new XmlRpcClient();
133 config.setServerURL(new URL(url));
134 client.setConfig(config);
135
136 Object[] params = new Object[] {};
137
138 if (pids == null) {
139 pids = getPIDsAndEscidocIdsOfCollections(ECHO_CONTAINER_ID);
140 }
141
142 try {
143 String parentPid;
144 String pid = (String) client.execute("getPID", params);
145 String contid = pids.get("mpiwg:" + pid);
146 addECHOObjectToCollection(client, contid);
147 success.put(pid, url);
148 } catch (Exception e) {
149
150 ByteArrayOutputStream out = new ByteArrayOutputStream();
151 PrintStream s = new PrintStream(out);
152 e.printStackTrace(s);
153
154 nosuccess.put(url, out.toString());
155
156 e.printStackTrace();
157 }
158 }
159 System.out.println("SUCCESSFULL ORGANIZED");
160 for (String id : success.keySet())
161 System.out.println("ID:" + id + " URL:" + success.get(id));
162
163 System.out.println("ERRORS:");
164 for (String id : nosuccess.keySet()) {
165 System.out.println("URL:" + id);
166 System.out.println("Message:" + nosuccess.get(id));
167 }
168
169 }
170
171 void organizeECHORessources() throws XmlRpcException, IOException,
172 JDOMException {
173 ArrayList<String> urls = getAllResources();
174 HashMap<String, String> success = new HashMap<String, String>();
175 HashMap<String, String> nosuccess = new HashMap<String, String>();
176
177 for (String url : urls) {
178
179 XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl();
180 XmlRpcClient client = new XmlRpcClient();
181 config.setServerURL(new URL(url));
182 client.setConfig(config);
183
184 Object[] params = new Object[] {};
185
186 if (pids == null) {
187 pids = getPIDsAndEscidocIdsOfCollections(ECHO_CONTAINER_ID);
188 }
189
190 try {
191 String parentPid;
192 String pid = (String) client.execute("getPID", params);
193 String contid = getIDfromPID("mpiwg:" + pid);
194 addECHOObjectToCollection(client, contid);
195 success.put(pid, url);
196 } catch (Exception e) {
197
198 ByteArrayOutputStream out = new ByteArrayOutputStream();
199 PrintStream s = new PrintStream(out);
200 e.printStackTrace(s);
201
202 nosuccess.put(url, out.toString());
203
204 e.printStackTrace();
205 }
206 }
207 System.out.println("SUCCESSFULL ORGANIZED");
208 for (String id : success.keySet())
209 System.out.println("ID:" + id + " URL:" + success.get(id));
210
211 System.out.println("ERRORS:");
212 for (String id : nosuccess.keySet()) {
213 System.out.println("URL:" + id);
214 System.out.println("Message:" + nosuccess.get(id));
215 }
216
217 }
218
219
220
221 private String getIDfromPID(String pid) throws ClientProtocolException,
222 IOException {
223 InputStream res = getXMLfromPID(pid,MAIN_CONTEXT);
224 return EScidocBasicHandler.getId(EScidocBasicHandler
225 .convertStreamToString(res));
226 }
227
228 /**
229 * FŸgt die ECHO Collection unter der URL in eScidoc ein. Der Link auf die
230 * Web-Seite wird in einem eigenen item hinterlegt, dass in Collection
231 * eingefŸgt wird.
232 *
233 * @param url
234 * @throws Exception
235 */
236 private String ingestECHOCollection(String url) throws Exception {
237
238 // get a PID for the Collection
239 System.out.println("Processing:" + url);
240
241 HashMap<String, String> dcs = new HashMap<String, String>(); // Store
242 // for
243 // the
244 // metadata
245
246 // Verbinde dich mit der Collection Ÿber XML-rpc
247
248 XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl();
249 XmlRpcClient client = new XmlRpcClient();
250 config.setServerURL(new URL(url));
251 client.setConfig(config);
252
253 String pid = getOrCreatePID(client);
254
255 if (pidAlreadyExists("mpiwg:"+pid))
256 {
257 System.out.println("PID:"+pid);
258 String contid=getIDfromPID("mpiwg:"+pid);
259 System.out.println("------- belongsTo:"+contid);
260 return contid;
261 }
262 Object[] params = new Object[] {};
263
264 eSciDocXmlObject obj = new eSciDocXmlObject("mpiwg:" + pid,
265 ECHOCONTAINER_TEMPLATE_XML);
266
267 String result = (String) client.execute("getDescription", params);
268
269 String x = new String(result.getBytes("UTF-8"), ("UTF-8"));
270 // System.out.println("DESCR"+x);
271 dcs.put("description", x);
272 String title = (String) client.execute("getTitle", params);
273 dcs.put("title", title);
274 obj.insertDC(dcs);
275 obj.addOrigUrlToMPIWGMetaData(url);
276
277 // obj.setRelationship("info:fedora/echo:col1");
278 String xml = obj.printXML();
279 // System.out.println(xml);
280 String ret = ingest("/ir/container", xml);
281 String xr = ingestCollectionWebSite(title, url);
282 // System.out.println(xr);
283 String objid = EScidocBasicHandler.getId(xr);
284 String dateStamp = EScidocBasicHandler.getDateStamp(ret);
285 String addTxt = "<param last-modification-date=\"" + dateStamp + "\">";
286 addTxt += "<id>" + objid + "</id>";
287 addTxt += "</param>";
288
289 String contid = EScidocBasicHandler.getId(ret);
290
291 ByteArrayInputStream stream = new ByteArrayInputStream(addTxt
292 .getBytes("utf-8"));
293
294 eSciDocHandler.eScidocPost("/ir/container/" + contid + "/members/add",
295 stream);
296 // System.out.println(response.getStatusLine());
297 // System.out.println(EScidocBasicHandler.convertStreamToString(response.getEntity().getContent()));
298 System.out.println("Processed:" + url + "------>" + contid);
299
300 addToCollection(ECHO_CONTAINER_ID, contid);
301
302 params = new Object[] { pid };
303 client.execute("setPID", params);
304
305 System.out.println(ret);
306
307 addECHOObjectToCollection(client, contid);
308 return contid;
309
310 }
311
312 public ArrayList<String> findMissingItems() throws XmlRpcException, IOException{
313 return findMissingItemsFromECHOUrls(getAllResources());
314 }
315
316 public ArrayList<String> findMissingCollections() throws XmlRpcException, IOException{
317 return findMissingItemsFromECHOUrls(getAllCollections());
318 }
319
320 public ArrayList<String> findMissingItemsFromECHOUrls(List<String> urls) throws XmlRpcException, IOException{
321 //ArrayList<String> urls = getAllCollections();
322 System.out.println("GOT the collections");
323 ArrayList<String> ret = new ArrayList<String>();
324 for (String url : urls) {
325 System.out.println("checking:"+url);
326 XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl();
327 XmlRpcClient client = new XmlRpcClient();
328
329 config.setServerURL(new URL(url));
330 client.setConfig(config);
331 String pid;
332 try {
333 Object[] parameters = new Object[] {};
334
335 pid = (String) client.execute("getPID", parameters);
336 } catch (Exception e) {
337 pid = null;
338 }
339
340 if (pid == null){
341 ret.add(url);
342 System.out.println(" -- no pid");
343 } else {
344 String id;
345 try {
346 id = getIDfromPID("mpiwg:"+pid);
347 } catch (Exception e) {
348 id = "NO";
349 ret.add(url);
350 }
351
352
353 System.out.println(" -- id:"+id);
354 }
355
356
357
358 }
359 return ret;
360 }
361 private String getOrCreatePID(XmlRpcClient client) throws XmlRpcException,
362 MalformedURLException {
363 Object[] parameters = new Object[] {};
364
365 String pid = null;
366
367 // Hole pid aus ECHO
368 try {
369 pid = (String) client.execute("getPID", parameters);
370 } catch (Exception e) {
371 pid = null;
372 }
373 // Falls dort noch keine ist, erzeuge ein neue
374 if (pid == null)
375 pid = getID();
376 else
377 System.out.println("PID from ECHO:" + pid);
378
379 return pid;
380 }
381
382 private void addECHOObjectToCollection(XmlRpcClient client, String contid)
383 throws ClientProtocolException, IOException, JDOMException {
384 Object[] params;
385 params = new Object[] {};
386
387 if (pids == null) {
388 pids = getPIDsAndEscidocIdsOfCollections(ECHO_CONTAINER_ID);
389 }
390
391 String parentId;
392 String parentPid;
393 try {
394 parentPid = (String) client.execute("getParentPID", params);
395 parentId = pids.get("mpiwg:" + parentPid);
396 } catch (Exception e) {
397 parentId = ECHO_ROOT_ID;
398 }
399 addToCollection(parentId, contid);
400
401 }
402
403 private String ingestCollectionWebSite(String title, String url)
404 throws Exception {
405 String pid = getID();
406 eSciDocXmlObject obj = new eSciDocXmlObject("mpiwg:" + pid,
407 "ECHOCollectionWebRepresentationTemplate.xml");
408
409 HashMap<String, String> dcs = new HashMap<String, String>();
410
411 obj.addWebUrl(url);
412 // obj.setRelationship("info:fedora/echo:col1");
413
414 dcs.put("title", title); // ersatzweise den titel aus der echo
415 // collection
416 obj.insertDC(dcs);
417
418 String xml = obj.printXML();
419 // System.out.println(xml);
420 String res = ingest("/ir/item", xml);
421 return res;
422 }
423
424 public void ingestECHOResources() throws IOException {
425 ingestECHOResources(null);
426
427 }
428 public void ingestECHOResources(Pattern match) throws IOException {
429 ArrayList<String> urls = getAllResources();
430 HashMap<String, String> success = new HashMap<String, String>();
431 HashMap<String, String> nosuccess = new HashMap<String, String>();
432
433 for (String url : urls) {
434
435 try {
436 Boolean ingest=false;
437
438 if (match == null)
439 ingest=true;
440 else {
441 Matcher m = match.matcher(url);
442 if (m.matches())
443 ingest=true;
444
445 }
446 if (ingest){
447 String id = ingestECHOResource(url);
448 success.put(id, url);
449 }
450 } catch (Exception e) {
451
452 ByteArrayOutputStream out = new ByteArrayOutputStream();
453 PrintStream s = new PrintStream(out);
454 e.printStackTrace(s);
455
456 nosuccess.put(url, out.toString());
457
458 e.printStackTrace();
459 }
460 }
461 System.out.println("SUCCESSFULL INGEST");
462 for (String id : success.keySet())
463 System.out.println("ID:" + id + " URL:" + success.get(id));
464
465 System.out.println("ERRORS:");
466 for (String id : nosuccess.keySet()) {
467 System.out.println("URL:" + id);
468 System.out.println("Message:" + nosuccess.get(id));
469 }
470
471 }
472
473 protected ArrayList<String> getAllResources() throws IOException {
474 URL echoUrl = new URL(ECHOURL + "/getResourcesXML");
475 Pattern p = Pattern.compile("echoLink=\"([^\"]*)\"");
476 BufferedReader in = new BufferedReader(new InputStreamReader(echoUrl
477 .openStream()));
478
479 ArrayList<String> ret = new ArrayList<String>();
480 String inputLine;
481 Matcher m;
482 while ((inputLine = in.readLine()) != null) {
483 m = p.matcher(inputLine);
484 String lit;
485 if (m.find()) {
486
487 lit = m.group(1);
488 ret.add(lit);
489 }
490 }
491
492 in.close();
493 return ret;
494
495 }
496
497 protected String ingestECHOResource(String url) throws Exception {
498 return ingestECHOResource(url, false);
499 }
500
501 protected String ingestECHOResource(String url,boolean withfullText) throws Exception {
502
503 System.out.println("Starting:" + url);
504
505 HashMap<String, String> dcs = new HashMap<String, String>();
506
507 XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl();
508 XmlRpcClient client = new XmlRpcClient();
509 config.setServerURL(new URL(url));
510 client.setConfig(config);
511
512 String pid = getOrCreatePID(client);
513
514 if (pidAlreadyExists("mpiwg:"+pid))
515 {
516 System.out.println("PID:"+pid);
517 String contid=getIDfromPID("mpiwg:"+pid);
518 System.out.println("------- belongsTo:"+contid);
519 return contid;
520 }
521 eSciDocXmlObject obj = new eSciDocXmlObject("mpiwg:" + pid,
522 ECHORESOURCE_TEMPLATE_XML);
523 Object[] params = new Object[] {};
524
525 String title = (String) client.execute("getTitle", params);
526 String ml = (String) client.execute("getMetaDataLink", params);
527 if (withfullText){
528 String fulltextURL = url+"/getFullTextXML";
529 obj.addFullText(fulltextURL);
530
531 }
532 ml = correctML(ml);
533
534 obj.addWebUrl(url);
535 obj.addOrigUrlToMPIWGMetaData(url);
536
537 // obj.setRelationship("info:fedora/echo:col1");
538
539 config.setServerURL(new URL(ZOPEPROVIDER + "/metadataMain"));
540 client.setConfig(config);
541 params = new Object[] { ml };
542
543 try {
544 String result = (String) client.execute("getDCFormatted", params);
545 System.out.println("dC:"+result);
546 DocumentBuilderFactory factory = DocumentBuilderFactory
547 .newInstance();
548 factory.setNamespaceAware(true);
549 DocumentBuilder db = factory.newDocumentBuilder();
550
551 InputSource resultStream = new InputSource(new StringReader(result));
552 Document dc = db.parse(resultStream);
553 obj.insertDC(dc);
554
555 Document indexmeta = db.parse(ml);
556
557 XPath xpath = XPathFactory.newInstance().newXPath();
558 xpath.setNamespaceContext(new EScidocNameSpaceContext());
559
560 NodeList test = (NodeList) xpath.evaluate("//meta", indexmeta,
561 XPathConstants.NODESET);
562 if (test.getLength() != 1)
563 {
564 test = (NodeList) xpath.evaluate("//mpiwg:meta", indexmeta,
565 XPathConstants.NODESET);
566
567 if (test.getLength() !=1)
568 throw new Exception();
569 }
570 obj.insertMeta(test.item(0));
571
572 obj.addIndexMetaUrl(ml);
573
574 } catch (XmlRpcException e) {
575 System.err.println("Ressource:" + url);
576 System.err.println("METADATA CANNOT BE PARSED:" + ml);
577 HashMap<String, String> dc = new HashMap<String, String>();
578 dc.put("title", title); // ersatzweise den titel aus der echo
579 // collection
580 obj.insertDC(dc);
581 } catch (SAXParseException e) {
582 System.err.println("METADATA RESULT CANNOT BE PARSED:");
583 HashMap<String, String> dc = new HashMap<String, String>();
584 dc.put("title", title); // ersatzweise den titel aus der echo
585 // collection
586 obj.insertDC(dc);
587 }
588
589 String xml = obj.printXML();
590 System.out.println(xml);
591 return "XXX";
592 String result = ingest("/ir/item", xml);
593 // String contid = EScidocBasicHandler.getId(result);
594 // //String contid="NNNN";
595 // System.out.println("------->" + contid);
596 //
597 // params = new Object[] { pid };
598 // config.setServerURL(new URL(url));
599 // client.setConfig(config);
600 //
601 // client.execute("setPID", params);
602 // addToCollection(ECHO_CONTAINER_ID, contid);
603 //
604 // addECHOObjectToCollection(client, contid);
605 // return contid;
606
607 }
608
609 private boolean pidAlreadyExists(String pid) {
610 String id;
611 try{
612 id = getIDfromPID(pid);
613 } catch (Exception e){
614 return false;
615 }
616 if (!id.equals(""))
617 return true;
618 return false;
619 }
620
621 private String correctML(String ml) {
622 Pattern p = Pattern.compile("experimental/(.*)");
623 Matcher m = p.matcher(ml);
624 String pf;
625 if (m.find())
626 pf = "experimental/" + m.group(1);
627 else {
628 p = Pattern.compile("permanent/(.*)");
629 m = p.matcher(ml);
630 if (m.find())
631 pf = "permanent/" + m.group(1);
632 else
633 return ml;
634 }
635 return SERVLETURL + pf;
636 }
637
638 protected ArrayList<String> getAllCollections() throws XmlRpcException,
639 IOException {
640 System.out.println("ECHO:"+ECHOURL);
641 URL echoUrl = new URL(ECHOURL + "/getCollectionsXML");
642 Pattern p = Pattern.compile("echoLink=\"(.*)\"");
643 BufferedReader in = new BufferedReader(new InputStreamReader(echoUrl
644 .openStream()));
645
646 ArrayList<String> ret = new ArrayList<String>();
647 String inputLine;
648 Matcher m;
649 while ((inputLine = in.readLine()) != null) {
650 m = p.matcher(inputLine);
651 String lit;
652 if (m.find()) {
653
654 lit = m.group(1);
655 ret.add(lit);
656 }
657 }
658
659 in.close();
660 return ret;
661 }
662
663 private void submitAndReleaseAnObject(String href) throws ClientProtocolException,
664 IOException, JDOMException {
665
666 addVersionPid(href);
667 HttpResponse res = submitAnObject(href, "submit");
668 System.out.println(EScidocBasicHandler.convertStreamToString(res
669 .getEntity().getContent()));
670 res = releaseAnObject(href, "first release");
671 System.out.println(EScidocBasicHandler.convertStreamToString(res
672 .getEntity().getContent()));
673
674 }
675
676 void releaseECHORessources() throws XmlRpcException, IOException,
677 JDOMException {
678 ArrayList<String> urls = getAllResources();
679 HashMap<String, String> success = new HashMap<String, String>();
680 HashMap<String, String> nosuccess = new HashMap<String, String>();
681 int numOfUrl= urls.size();
682 int count = 0;
683 for (String url : urls) {
684
685 XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl();
686 XmlRpcClient client = new XmlRpcClient();
687 config.setServerURL(new URL(url));
688 client.setConfig(config);
689
690 Object[] params = new Object[] {};
691
692 if (pids == null) {
693 pids = getPIDsAndEscidocIdsOfCollections(ECHO_CONTAINER_ID);
694 }
695
696 try {
697 String parentPid;
698 String pid = (String) client.execute("getPID", params);
699 String contid = getIDfromPID("mpiwg:" + pid);
700 submitAndReleaseAnObject("/ir/item/"+contid);
701 success.put(pid, url);
702 } catch (Exception e) {
703
704 ByteArrayOutputStream out = new ByteArrayOutputStream();
705 PrintStream s = new PrintStream(out);
706 e.printStackTrace(s);
707
708 nosuccess.put(url, out.toString());
709
710 e.printStackTrace();
711 }
712 count+=1;
713 System.out.println("DONE:"+count+" of "+numOfUrl);
714 }
715 System.out.println("SUCCESSFULL ORGANIZED");
716 for (String id : success.keySet())
717 System.out.println("ID:" + id + " URL:" + success.get(id));
718
719 System.out.println("ERRORS:");
720 for (String id : nosuccess.keySet()) {
721 System.out.println("URL:" + id);
722 System.out.println("Message:" + nosuccess.get(id));
723 }
724
725 }
726 }