/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package eu.fusepool.datalifecycle.utils;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
/**
 * Finds the links to data files contained in an HTML page and, optionally, in
 * the pages reachable through links that point below the page's own URL.
 *
 * <p>A link counts as a data link when its {@code href} value ends with
 * {@code .rdf}, {@code .nt}, {@code .ttl} or {@code .xml}. Pages are scanned
 * with a regular expression rather than an HTML parser, so only anchors whose
 * {@code href} value is enclosed in double quotes are recognized.
 *
 * @author Reto
 */
public class LinksRetriever {

    /** Suffixes of the {@code href} values that are handed to a {@link LinkProcessor}. */
    private static final String[] DATA_SUFFIXES = {".rdf", ".nt", ".ttl", ".xml"};

    /**
     * Matches the opening of an anchor tag and captures, as group 1, the
     * double-quoted value of its {@code href} attribute. Other attributes may
     * precede or follow {@code href}; the value itself can never run past its
     * closing quote. Compiled once because the same pattern is applied to
     * every page visited.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\s(?:[^>]*?\\s)?href=\"([^\"]*)\"", Pattern.CASE_INSENSITIVE);

    /**
     * Returns the data links found on the page at the given address, without
     * following any links to other pages.
     *
     * @param urlString the address of the page to scan
     * @return the resolved URLs of the data links, in the order encountered
     * @throws IOException if {@code urlString} is not a valid URL or the page
     *         cannot be read
     */
    public static List<URL> getLinks(String urlString) throws IOException {
        return getLinks(new URL(urlString));
    }

    /**
     * Returns the data links found on the given page, without following any
     * links to other pages.
     *
     * @param url the page to scan
     * @return the resolved URLs of the data links, in the order encountered
     * @throws IOException if the page cannot be read
     */
    public static List<URL> getLinks(URL url) throws IOException {
        return getLinks(url, false);
    }

    /**
     * Returns the data links found on the given page and, if requested, on
     * the pages linked below it.
     *
     * @param url the page to scan
     * @param recurse whether links ending with {@code /} that point below
     *        {@code url} are followed
     * @return the resolved URLs of the data links, in the order encountered
     * @throws IOException if a page cannot be read
     */
    public static List<URL> getLinks(URL url, boolean recurse) throws IOException {
        final List<URL> collected = new LinkedList<URL>();
        processLinks(url, recurse, new LinkProcessor() {
            public boolean process(URL link) {
                collected.add(link);
                // Always ask for more: every data link is wanted.
                return true;
            }
        });
        return collected;
    }

    /**
     * Passes every data link found on the given page (and, if requested, on
     * the pages linked below it) to the processor. Processing stops as soon
     * as the processor returns {@code false}.
     *
     * @param url the page to scan
     * @param recurse whether links ending with {@code /} that point below
     *        {@code url} are followed
     * @param processor the callback invoked for each data link
     * @throws IOException if a page cannot be read
     */
    public static void processLinks(URL url, boolean recurse, LinkProcessor processor) throws IOException {
        // Whether the traversal was cut short is of no interest to the caller.
        interruptibleProcessLinks(url, recurse, processor);
    }

    /**
     * Scans one page, reports its data links to the processor and descends
     * into sub-pages when {@code recurse} is set.
     *
     * <p>The page is read with {@code IOUtils.toString(URL)}; see the Commons
     * IO documentation for the character set that call uses.
     *
     * @param url the page to scan; also the base for resolving relative links
     * @param recurse whether links ending with {@code /} are followed
     * @param processor the callback invoked for each data link
     * @return true if the processing shall continue, false otherwise
     * @throws IOException if the page cannot be read
     */
    private static boolean interruptibleProcessLinks(URL url, boolean recurse, LinkProcessor processor)
            throws IOException {
        final String html = IOUtils.toString(url);
        final String pageForm = url.toString();
        final Matcher matcher = LINK_PATTERN.matcher(html);
        while (matcher.find()) {
            final String linkTarget = matcher.group(1);
            final boolean dataLink = isDataLink(linkTarget);
            final boolean directoryLink = recurse && linkTarget.endsWith("/");
            if (!dataLink && !directoryLink) {
                // Irrelevant link: do not even try to resolve it.
                continue;
            }
            URL targetUrl;
            try {
                targetUrl = new URL(url, linkTarget);
            } catch (MalformedURLException e) {
                // A single unresolvable href must not abort the whole scan.
                continue;
            }
            if (dataLink) {
                if (!processor.process(targetUrl)) {
                    return false;
                }
                continue;
            }
            // Only descend: the target must lie below this page and must not
            // be the page itself. The textual forms are compared because
            // URL.equals may perform a blocking host name lookup.
            final String targetForm = targetUrl.toString();
            if (targetForm.startsWith(pageForm) && !targetForm.equals(pageForm)) {
                if (!interruptibleProcessLinks(targetUrl, recurse, processor)) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Tells whether a link target ends with one of the data file suffixes.
     *
     * @param linkTarget the raw {@code href} value
     * @return true if the value ends with one of {@link #DATA_SUFFIXES}
     */
    private static boolean isDataLink(String linkTarget) {
        for (int i = 0; i < DATA_SUFFIXES.length; i++) {
            if (linkTarget.endsWith(DATA_SUFFIXES[i])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Callback receiving the data links found while scanning pages.
     */
    public static interface LinkProcessor {

        /**
         * Processes the URL of a data link.
         *
         * @param url the resolved URL of the link
         * @return true if the processing shall continue, false otherwise
         */
        boolean process(URL url);
    }
}