Mercurial > hg > mpdl-group
diff software/mpdl-services-new/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/harvester/PathExtractor.java @ 25:e9fe3186670c default tip
letzter Stand eingecheckt
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 21 May 2013 10:19:32 +0200 |
parents | |
children |
line wrap: on
line diff
package de.mpg.mpiwg.berlin.mpdl.cms.harvester;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

/**
 * Collects resource paths, either by recursively walking XML collection
 * listings fetched over HTTP ({@link #initExtractor(String, String)}) or by
 * listing a local directory ({@link #extractPathLocally(String)}).
 *
 * <p>The HTTP walk expects each listing to contain elements with the local
 * names {@code collection} and {@code resource}, each carrying a {@code name}
 * attribute. Instances keep the result list as mutable state, so one instance
 * must not be used for several extractions at the same time.
 */
public class PathExtractor {
  /** Resource URLs gathered by the current/last call to initExtractor. */
  private List<String> ressourceLoc;
  /** Exclude string; a collection is skipped when its lower-cased name is a substring of it. */
  String excludes;

  public PathExtractor() {

  }

  /**
   * Walks the collection listing at {@code startingUri} recursively and
   * returns the URLs of all resources found.
   *
   * @param startingUri URL of the root collection listing
   * @param excludes    string of collection names to skip (matched by
   *                    substring against the lower-cased collection name);
   *                    {@code null} means nothing is excluded
   * @return the list of resource URLs; listings that cannot be fetched or
   *         parsed are reported on stderr and contribute no entries
   */
  public List<String> initExtractor(String startingUri, String excludes) {
    this.excludes = excludes;
    ressourceLoc = new ArrayList<String>();
    // the URI is passed as a parameter because the walk is recursive and
    // every level works on its own URI
    extractDocLocations(startingUri);
    System.out.println("extracing resource locations done.");
    return this.ressourceLoc;
  }

  /**
   * Recursive method that extracts the paths of the resources below
   * {@code startUrl} and appends them to {@link #ressourceLoc}.
   *
   * <p>Each level uses its own HTTP client because the parent listing is
   * still being streamed while its sub-collections are visited; the client,
   * the response stream and the XML reader are always released when the
   * level is finished, whether it succeeded or not.
   *
   * @param startUrl URL of the collection listing to read
   */
  private void extractDocLocations(String startUrl) {
    HttpClient client = new DefaultHttpClient();
    InputStream content = null;
    XMLStreamReader reader = null;
    try {
      HttpResponse resp = client.execute(new HttpGet(startUrl));
      HttpEntity entity = resp.getEntity();
      if (entity == null) {
        return;
      }
      try {
        content = entity.getContent();
      } catch (IllegalStateException e) {
        // the entity content could not be obtained as a stream
        System.err.println("no readable content at " + startUrl);
        e.printStackTrace();
        return;
      }
      reader = newXmlInputFactory().createXMLStreamReader(content);
      while (reader.hasNext()) {
        if (reader.next() == XMLStreamConstants.START_ELEMENT) {
          handleStartElement(reader, startUrl);
        }
      }
    } catch (IOException e) {
      // previously this fell through and failed with a NullPointerException
      System.err.println("could not fetch listing from " + startUrl);
      e.printStackTrace();
    } catch (XMLStreamException e) {
      System.err.println("could not parse listing from " + startUrl);
      e.printStackTrace();
    } finally {
      if (reader != null) {
        try {
          reader.close();
        } catch (XMLStreamException ignored) {
          // nothing useful can be done if closing the parser fails
        }
      }
      if (content != null) {
        try {
          content.close();
        } catch (IOException ignored) {
          // the connection is torn down by the shutdown below anyway
        }
      }
      client.getConnectionManager().shutdown();
    }
  }

  /**
   * Creates a StAX factory that neither processes DTDs nor resolves external
   * entities, since the listings are fetched from a remote server.
   */
  private static XMLInputFactory newXmlInputFactory() {
    XMLInputFactory factory = XMLInputFactory.newInstance();
    factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
    factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
    return factory;
  }

  /**
   * Handles one start element of a listing: descends into sub-collections
   * and records resources. Elements without a {@code name} attribute are
   * ignored.
   */
  private void handleStartElement(XMLStreamReader reader, String startUrl) {
    String name = reader.getAttributeValue(null, "name");
    if (name == null) {
      return;
    }
    String element = reader.getLocalName();
    // a collection whose name is the tail of the current URL is the listing's
    // own entry, not a child, so it must not be followed again
    if ("collection".equals(element) && !startUrl.endsWith(name) && !isExcluded(name)) {
      if (name.startsWith("/")) {
        extractDocLocations(startUrl + name);
      } else {
        extractDocLocations(appendSegment(startUrl, name));
      }
    }
    if ("resource".equals(element)) {
      ressourceLoc.add(appendSegment(startUrl, name));
    }
  }

  /** Tells whether the lower-cased collection name occurs in the exclude string. */
  private boolean isExcluded(String collectionName) {
    return excludes != null && excludes.contains(collectionName.toLowerCase(Locale.ROOT));
  }

  /** Appends {@code name} to {@code base}, inserting a "/" unless {@code base} already ends with one. */
  private static String appendSegment(String base, String name) {
    if (base.endsWith("/")) {
      return base + name;
    }
    return base + "/" + name;
  }

  /**
   * Extracts paths like {@link #extractDocLocations(String)} does, but from
   * the local file system instead of over HTTP: returns the paths of all
   * direct entries of the directory {@code startUrl} whose name contains
   * "config" (case-insensitive).
   *
   * @param startUrl path of the local directory to list
   * @return the matching paths, each built as {@code startUrl} plus the entry
   *         name; empty if {@code startUrl} is not a directory or cannot be
   *         listed
   */
  public List<String> extractPathLocally(String startUrl) {
    List<String> pathList = new ArrayList<String>();
    // listFiles() yields null both for non-directories and on I/O errors
    File[] entries = new File(startUrl).listFiles();
    if (entries == null) {
      return pathList;
    }
    for (File entry : entries) {
      String entryName = entry.getName();
      if (entryName.toLowerCase(Locale.ROOT).contains("config")) {
        pathList.add(appendSegment(startUrl, entryName));
      }
    }
    return pathList;
  }

}