view software/mpdl-services/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/harvester/PathExtractor.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.cms.harvester;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

/**
 * Collects resource locations, either by recursively walking XML collection
 * listings fetched over HTTP ({@link #initExtractor(String, String)}) or by
 * scanning a local directory ({@link #extractPathLocally(String)}).
 *
 * <p>Instances keep the state of the current HTTP walk in fields, so a single
 * instance must not run several walks at the same time.
 */
public class PathExtractor {
  /** Resource URLs gathered by the most recent call to {@link #initExtractor}. */
  private List<String> ressourceLoc;

  /**
   * Exclusion text of the current walk: a collection is skipped when its
   * lower-cased name occurs anywhere inside this string. May be
   * <code>null</code>, in which case nothing is excluded.
   */
  String excludes;

  public PathExtractor() {

  }

  /**
   * Walks the collection listing at <code>startingUri</code> and all nested
   * collections, and returns the URLs of every resource found.
   *
   * @param startingUri URL of the top-level collection listing
   * @param excludes text searched for lower-cased collection names; collections
   *          whose name is found in it are not descended into (may be
   *          <code>null</code>)
   * @return the collected resource URLs, in the order they were encountered
   */
  public List<String> initExtractor(String startingUri, String excludes) {
    this.excludes = excludes;
    ressourceLoc = new ArrayList<String>();
    // the URI is passed explicitly because the walk is recursive and extends it
    extractDocLocations(startingUri);
    System.out.println("extracing resource locations done.");
    return this.ressourceLoc;
  }

  /**
   * Fetches the listing at <code>startUrl</code>, records its resources and
   * recurses into its collections.
   *
   * <p>Failures to fetch or parse one listing are reported on stderr and only
   * end the walk of that listing; entries gathered so far are kept.
   *
   * @param startUrl URL of the collection listing to read
   */
  private void extractDocLocations(String startUrl) {
    // One client per recursion level: the parent's response is still being
    // streamed while child listings are fetched, so the connection cannot be
    // shared between levels.
    HttpClient client = new DefaultHttpClient();
    try {
      HttpGet httpget = new HttpGet(startUrl);
      HttpResponse resp;
      try {
        resp = client.execute(httpget);
      } catch (IOException e) {
        System.err.println("could not fetch listing: " + startUrl);
        e.printStackTrace();
        return;
      }
      HttpEntity entity = resp.getEntity();
      if (entity == null) {
        return;
      }

      InputStream content;
      try {
        content = entity.getContent();
      } catch (IllegalStateException e) {
        e.printStackTrace();
        return;
      } catch (IOException e) {
        e.printStackTrace();
        return;
      }
      if (content == null) {
        return;
      }

      try {
        collectFromListing(startUrl, content);
      } catch (XMLStreamException e) {
        System.err.println("could not parse listing: " + startUrl);
        e.printStackTrace();
      } finally {
        try {
          content.close();
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    } finally {
      // releases the connection(s) held by this level's client
      client.getConnectionManager().shutdown();
    }
  }

  /**
   * Streams through one listing document. Every <code>resource</code> element
   * with a <code>name</code> attribute is recorded; every
   * <code>collection</code> element with a <code>name</code> attribute is
   * walked recursively unless it is excluded or <code>startUrl</code> already
   * ends with its name (the listing's entry for the collection itself).
   *
   * @param startUrl URL the listing was fetched from
   * @param content the listing's XML content; not closed by this method
   * @throws XMLStreamException if the content is not well-formed XML
   */
  private void collectFromListing(String startUrl, InputStream content) throws XMLStreamException {
    XMLInputFactory iFactory = XMLInputFactory.newInstance();
    // the listing comes from a remote server: do not process DTDs or resolve
    // external entities (XXE)
    iFactory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
    iFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);

    XMLStreamReader reader = iFactory.createXMLStreamReader(content);
    try {
      while (reader.hasNext()) {
        if (reader.next() != XMLStreamConstants.START_ELEMENT) {
          continue;
        }
        String name = reader.getAttributeValue(null, "name");
        if (name == null) {
          continue;
        }
        String element = reader.getLocalName();
        if ("collection".equals(element)) {
          if (!startUrl.endsWith(name) && !isExcluded(name)) {
            // a name with a leading slash is appended as is
            extractDocLocations(name.startsWith("/") ? startUrl + name : join(startUrl, name));
          }
        } else if ("resource".equals(element)) {
          ressourceLoc.add(join(startUrl, name));
        }
      }
    } finally {
      try {
        reader.close();
      } catch (XMLStreamException e) {
        e.printStackTrace();
      }
    }
  }

  /** Tells whether the lower-cased collection name occurs in {@link #excludes}. */
  private boolean isExcluded(String collectionName) {
    return excludes != null && excludes.contains(collectionName.toLowerCase(Locale.ROOT));
  }

  /** Appends <code>name</code> to <code>base</code>, adding a "/" unless <code>base</code> already ends with one. */
  private static String join(String base, String name) {
    return base.endsWith("/") ? base + name : base + "/" + name;
  }

  /**
   * Like the HTTP walk, this extracts paths, but it reads a local directory
   * instead of going over HTTP. Only direct children of the directory whose
   * name contains "config" (case-insensitively) are returned; the directory
   * is not searched recursively.
   *
   * @param startUrl path of the local directory to scan
   * @return the paths of the matching entries; empty if <code>startUrl</code>
   *         is not a directory or cannot be listed
   */
  public List<String> extractPathLocally(String startUrl) {
    List<String> pathList = new ArrayList<String>();

    // listFiles() yields null both for non-directories and on I/O errors
    File[] entries = new File(startUrl).listFiles();
    if (entries == null) {
      return pathList;
    }
    for (File entry : entries) {
      String fileName = entry.getName();
      // Locale.ROOT keeps the match independent of the platform locale
      if (fileName.toLowerCase(Locale.ROOT).contains("config")) {
        pathList.add(join(startUrl, fileName));
      }
    }
    return pathList;
  }

}