Mercurial > hg > mpdl-group
view software/mpdl-services/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/harvester/PathExtractor.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.cms.harvester; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; public class PathExtractor { private List<String> ressourceLoc; String excludes; public PathExtractor() { } public List<String> initExtractor(String startingUri, String excludes) { this.excludes = excludes; ressourceLoc = new ArrayList<String>(); // parameter necessery, because it's recursive, thus changing the uri extractDocLocations(startingUri); System.out.println("extracing resource locations done."); return this.ressourceLoc; } /** * recursive Method to extract the path of the resources * * @param startUrl */ private void extractDocLocations(String startUrl) { HttpClient client = new DefaultHttpClient(); HttpGet httpget = new HttpGet(startUrl); HttpResponse resp = null; try { resp = client.execute(httpget); } catch (IOException e) { e.printStackTrace(); } HttpEntity entity = resp.getEntity(); if (entity != null) { XMLInputFactory iFactory = XMLInputFactory.newInstance(); XMLStreamReader reader = null; try { reader = iFactory.createXMLStreamReader(entity.getContent()); } catch (IllegalStateException e1) { e1.printStackTrace(); } catch (XMLStreamException e1) { e1.printStackTrace(); } catch (IOException e1) { e1.printStackTrace(); } try { while (true) { int event = reader.next(); if (event == XMLStreamConstants.END_DOCUMENT) { reader.close(); break; } if (event == XMLStreamConstants.START_ELEMENT) { if ((reader.getAttributeValue(null, "name")) != null) { if (reader.getLocalName().equals("collection") && !(startUrl.endsWith(reader.getAttributeValue(null, "name")))) { if(!(this.excludes.contains(reader.getAttributeValue(null, "name").toLowerCase()))){ if (reader.getAttributeValue(null, "name").startsWith("/")) { client.getConnectionManager().closeExpiredConnections(); extractDocLocations(startUrl + reader.getAttributeValue(null, "name")); } else { client.getConnectionManager().closeExpiredConnections(); if (!startUrl.endsWith("/")) { extractDocLocations(startUrl + "/" + reader.getAttributeValue(null, "name")); } else { extractDocLocations(startUrl + reader.getAttributeValue(null, "name")); } } } } if (reader.getLocalName().equals("resource")) { if (!startUrl.endsWith("/")) { ressourceLoc.add(startUrl + "/" + reader.getAttributeValue(null, "name")); } else { ressourceLoc.add(startUrl + reader.getAttributeValue(null, "name")); } } } } if (event == XMLStreamConstants.ATTRIBUTE) { // System.out.println("localName : "+reader.getLocalName()); } } } catch (XMLStreamException e) { e.printStackTrace(); } } } /** * extrahiert ebenso wie extractDocLocations(String startUri) Pfade, tut dies * aber local und nicht über HTTP * * @return */ public List<String> extractPathLocally(String startUrl) { List<String> pathList = new ArrayList<String>(); // home verzeichnis pfad über system variable // String loc = System.getenv("HOME")+"/wsp/configs"; // out.println("hom variable + conf datei : "+loc); File f = new File(startUrl); // out.println("readable : "+Boolean.toString(f.canRead())); // out.println("readable : "+f.isDirectory()); if (f.isDirectory()) { File[] filelist = f.listFiles(); for (File file : filelist) { if (file.getName().toLowerCase().contains("config")) { if (!startUrl.endsWith("/")) { pathList.add(startUrl + "/" + file.getName()); } else { pathList.add(startUrl + file.getName()); } } } } return pathList; } }