comparison software/mpdl-services-new/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/harvester/PathExtractor.java @ 25:e9fe3186670c default tip

letzter Stand eingecheckt
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 21 May 2013 10:19:32 +0200
parents
children
comparison
equal deleted inserted replaced
23:e845310098ba 25:e9fe3186670c
1 package de.mpg.mpiwg.berlin.mpdl.cms.harvester;
2
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
18
19 public class PathExtractor {
20 private List<String> ressourceLoc;
21 String excludes;
22
23 public PathExtractor() {
24
25 }
26
27 public List<String> initExtractor(String startingUri, String excludes) {
28 this.excludes = excludes;
29 ressourceLoc = new ArrayList<String>();
30 // parameter necessery, because it's recursive, thus changing the uri
31 extractDocLocations(startingUri);
32 System.out.println("extracing resource locations done.");
33 return this.ressourceLoc;
34 }
35
36 /**
37 * recursive Method to extract the path of the resources
38 *
39 * @param startUrl
40 */
41 private void extractDocLocations(String startUrl) {
42 HttpClient client = new DefaultHttpClient();
43 HttpGet httpget = new HttpGet(startUrl);
44 HttpResponse resp = null;
45 try {
46 resp = client.execute(httpget);
47 } catch (IOException e) {
48 e.printStackTrace();
49 }
50 HttpEntity entity = resp.getEntity();
51 if (entity != null) {
52 XMLInputFactory iFactory = XMLInputFactory.newInstance();
53 XMLStreamReader reader = null;
54 try {
55 reader = iFactory.createXMLStreamReader(entity.getContent());
56 } catch (IllegalStateException e1) {
57 e1.printStackTrace();
58 } catch (XMLStreamException e1) {
59 e1.printStackTrace();
60 } catch (IOException e1) {
61 e1.printStackTrace();
62 }
63
64 try {
65 while (true) {
66 int event = reader.next();
67 if (event == XMLStreamConstants.END_DOCUMENT) {
68 reader.close();
69 break;
70 }
71 if (event == XMLStreamConstants.START_ELEMENT) {
72 if ((reader.getAttributeValue(null, "name")) != null) {
73 if (reader.getLocalName().equals("collection") && !(startUrl.endsWith(reader.getAttributeValue(null, "name")))) {
74 if(!(this.excludes.contains(reader.getAttributeValue(null, "name").toLowerCase()))){
75 if (reader.getAttributeValue(null, "name").startsWith("/")) {
76 client.getConnectionManager().closeExpiredConnections();
77 extractDocLocations(startUrl + reader.getAttributeValue(null, "name"));
78 } else {
79 client.getConnectionManager().closeExpiredConnections();
80 if (!startUrl.endsWith("/")) {
81 extractDocLocations(startUrl + "/" + reader.getAttributeValue(null, "name"));
82 } else {
83 extractDocLocations(startUrl + reader.getAttributeValue(null, "name"));
84 }
85 }
86 }
87 }
88 if (reader.getLocalName().equals("resource")) {
89 if (!startUrl.endsWith("/")) {
90 ressourceLoc.add(startUrl + "/" + reader.getAttributeValue(null, "name"));
91 } else {
92 ressourceLoc.add(startUrl + reader.getAttributeValue(null, "name"));
93 }
94 }
95 }
96 }
97 if (event == XMLStreamConstants.ATTRIBUTE) {
98 // System.out.println("localName : "+reader.getLocalName());
99 }
100 }
101 } catch (XMLStreamException e) {
102 e.printStackTrace();
103 }
104 }
105 }
106
107 /**
108 * extrahiert ebenso wie extractDocLocations(String startUri) Pfade, tut dies
109 * aber local und nicht über HTTP
110 *
111 * @return
112 */
113 public List<String> extractPathLocally(String startUrl) {
114 List<String> pathList = new ArrayList<String>();
115
116 // home verzeichnis pfad über system variable
117 // String loc = System.getenv("HOME")+"/wsp/configs";
118 // out.println("hom variable + conf datei : "+loc);
119 File f = new File(startUrl);
120 // out.println("readable : "+Boolean.toString(f.canRead()));
121 // out.println("readable : "+f.isDirectory());
122 if (f.isDirectory()) {
123 File[] filelist = f.listFiles();
124 for (File file : filelist) {
125 if (file.getName().toLowerCase().contains("config")) {
126 if (!startUrl.endsWith("/")) {
127 pathList.add(startUrl + "/" + file.getName());
128 } else {
129 pathList.add(startUrl + file.getName());
130 }
131 }
132 }
133 }
134 return pathList;
135 }
136
137 }