package org.bbaw.wsp.cms.collections;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
/**
* eXist-specific Crawler
* extracts the paths of Documents in a http manner
*
* @author marco juergens
*
*/
public class PathExtractor {
private List<String> ressourceLoc;
private String excludes;
public PathExtractor() {
}
public List<String> initExtractor(String startingUri, String excludes) {
this.excludes = excludes;
ressourceLoc = new ArrayList<String>();
// parameter necessary, because it's recursive, thus changing the uri
extractDocLocations(startingUri);
return this.ressourceLoc;
}
/**
* recursive Method to extract the path of the resources
*
* @param startUrl
*/
private void extractDocLocations(String startUrl) {
HttpClient client = new DefaultHttpClient();
HttpGet httpget = new HttpGet(startUrl);
HttpResponse resp = null;
try {
resp = client.execute(httpget);
} catch (IOException e) {
e.printStackTrace();
}
HttpEntity entity = resp.getEntity();
if (entity != null) {
XMLInputFactory iFactory = XMLInputFactory.newInstance();
XMLStreamReader reader = null;
try {
reader = iFactory.createXMLStreamReader(entity.getContent());
} catch (IllegalStateException e1) {
e1.printStackTrace();
} catch (XMLStreamException e1) {
e1.printStackTrace();
} catch (IOException e1) {
e1.printStackTrace();
}
try {
while (true) {
int event = reader.next();
if (event == XMLStreamConstants.END_DOCUMENT) {
reader.close();
break;
}
if (event == XMLStreamConstants.START_ELEMENT) {
String nameAttributeValue = reader.getAttributeValue(null, "name");
if ((nameAttributeValue) != null) {
if (reader.getLocalName().equals("collection") && !(startUrl.endsWith(nameAttributeValue))) {
if (! isNameExcluded(nameAttributeValue.toLowerCase())) {
if (nameAttributeValue.startsWith("/")) {
client.getConnectionManager().closeExpiredConnections();
extractDocLocations(startUrl + nameAttributeValue);
} else {
client.getConnectionManager().closeExpiredConnections();
if (! startUrl.endsWith("/")) {
extractDocLocations(startUrl + "/" + nameAttributeValue);
} else {
extractDocLocations(startUrl + nameAttributeValue);
}
}
}
}
if (reader.getLocalName().equals("resource")) {
String url = startUrl + "/" + nameAttributeValue;
if (startUrl.endsWith("/"))
url = startUrl + nameAttributeValue;
boolean startUrlIsExcluded = isExcluded(url); // if exclude contains a full file name e.g. verzeichnisse/personenkorrektur.xml
if (! startUrlIsExcluded) {
ressourceLoc.add(url);
}
}
}
}
if (event == XMLStreamConstants.ATTRIBUTE) {
// System.out.println("localName : "+reader.getLocalName());
}
}
} catch (XMLStreamException e) {
e.printStackTrace();
}
}
}
private boolean isExcluded(String url) {
boolean isExcluded = false;
if (excludes != null && url != null) {
String[] exludeArrayStr = excludes.split(" ");
for (int i=0; i<exludeArrayStr.length; i++) {
String exclude = exludeArrayStr[i];
if (url.endsWith(exclude))
return true;
}
}
return isExcluded;
}
private boolean isNameExcluded(String name) {
boolean isExcluded = false;
if (excludes != null && name != null) {
String[] exludeArrayStr = excludes.split(" ");
for (int i=0; i<exludeArrayStr.length; i++) {
String exclude = exludeArrayStr[i];
if (name.equals(exclude))
return true;
}
}
return isExcluded;
}
/**
* extrahiert ebenso wie extractDocLocations(String startUri) Pfade, tut dies
* aber local und nicht über HTTP
*
* @return
*/
public List<String> extractPathLocally(String startUrl) {
List<String> pathList = new ArrayList<String>();
// home verzeichnis pfad über system variable
// String loc = System.getenv("HOME")+"/wsp/configs";
// out.println("hom variable + conf datei : "+loc);
File f = new File(startUrl);
// out.println("readable : "+Boolean.toString(f.canRead()));
// out.println("readable : "+f.isDirectory());
if (f.isDirectory()) {
File[] filelist = f.listFiles();
for (File file : filelist) {
if (file.getName().toLowerCase().contains("config")) {
if (!startUrl.endsWith("/")) {
pathList.add(startUrl + "/" + file.getName());
} else {
pathList.add(startUrl + file.getName());
}
}
}
}
return pathList;
}
}