package eu.fusepool.datalifecycle.utils;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
/**
* Provides a utility method to retrieve file paths from a local file system or from
* a remote http server. A file name must end with one of the file name extensions supplied
* by the caller (for example .xml or .rdf).
* A url that does not end with one of those extensions is supposed to refer to a folder in a local file
* system (file scheme) or in a remote one (http scheme).
* @author luigi
*
*/
public class FileUtil {

    /**
     * Matches anchors written exactly as {@code <a href="...">}; group 2 holds the href value.
     * Compiled once because the expression never changes.
     */
    private static final Pattern ANCHOR_HREF = Pattern.compile("(<a href=\")(.*?)(\">)");

    /** Charset used to decode the bytes of a remote listing page. */
    private static final String LISTING_CHARSET = "UTF-8";

    /**
     * Lists the files reachable from {@code url} whose names end with one of the given extensions.
     * <p>
     * If the textual form of {@code url} itself ends with one of the extensions, the url is taken
     * to be a single file and is returned as the only element, without touching the file system
     * or the network. Otherwise the url is taken to be a folder:
     * <ul>
     * <li>{@code file} scheme: the directory is listed and every matching entry is returned as
     * {@code file://<url path>/<entry name>};</li>
     * <li>{@code http} / {@code https} scheme: the page behind the url is downloaded, every
     * {@code <a href="...">} value that ends with a matching extension is resolved against
     * the folder url and returned.</li>
     * </ul>
     * Any other scheme, a missing directory or an unreadable directory yields an empty list.
     * Each file is reported once, even when its name ends with several of the extensions.
     *
     * @param url location of a single file or of a folder
     * @param fileNameExtensions accepted name suffixes, for example {@code ".xml"}
     * @return the matching locations as strings, possibly empty, never {@code null}
     * @throws IOException if the remote listing page cannot be read
     */
    public static ArrayList<String> getFileList(URL url, String [] fileNameExtensions) throws IOException {
        ArrayList<String> fileList = new ArrayList<String>();
        String ref = url.toString();

        if (hasAnyExtension(ref, fileNameExtensions)) {
            fileList.add(ref);
            return fileList;
        }

        String scheme = url.getProtocol();
        if ("file".equals(scheme)) {
            collectLocalFiles(url, fileNameExtensions, fileList);
        } else if ("http".equals(scheme) || "https".equals(scheme)) {
            collectRemoteFiles(url, fileNameExtensions, fileList);
        }
        return fileList;
    }

    /** Tells whether {@code name} ends with at least one of {@code extensions}. */
    private static boolean hasAnyExtension(String name, String[] extensions) {
        for (String extension : extensions) {
            if (name.endsWith(extension)) {
                return true;
            }
        }
        return false;
    }

    /** Adds the matching entries of the local directory denoted by {@code url} to {@code fileList}. */
    private static void collectLocalFiles(URL url, String[] extensions, ArrayList<String> fileList) {
        File dir = toLocalFile(url);
        if (!dir.isDirectory()) {
            return;
        }
        // File.list() may still return null when an I/O error occurs while reading the directory.
        String[] names = dir.list();
        if (names == null) {
            return;
        }
        String path = url.getPath();
        if (!path.endsWith("/")) {
            path = path + "/";
        }
        String prefix = url.getProtocol() + "://" + path;
        for (String name : names) {
            // One test per name, so a name matching several extensions is added only once.
            if (hasAnyExtension(name, extensions)) {
                fileList.add(prefix + name);
            }
        }
    }

    /** Converts a file url to a {@link File}, decoding percent-escapes such as {@code %20} when possible. */
    private static File toLocalFile(URL url) {
        try {
            String decodedPath = url.toURI().getPath();
            if (decodedPath != null) {
                return new File(decodedPath);
            }
        } catch (URISyntaxException ignored) {
            // The url holds characters that are not legal in a URI (for example a raw space),
            // so there is nothing to decode: use the path exactly as it was written.
        }
        return new File(url.getPath());
    }

    /** Downloads the listing page at {@code url} and adds every matching link to {@code fileList}. */
    private static void collectRemoteFiles(URL url, String[] extensions, ArrayList<String> fileList)
            throws IOException {
        URL base = asFolderUrl(url);
        Matcher matcher = ANCHOR_HREF.matcher(readContent(url));
        while (matcher.find()) {
            String href = matcher.group(2);
            if (!hasAnyExtension(href, extensions)) {
                continue;
            }
            try {
                // Resolving (instead of concatenating) also copes with absolute and root-relative links.
                fileList.add(new URL(base, href).toString());
            } catch (MalformedURLException ignored) {
                // A link that cannot be turned into a url is skipped rather than failing the whole listing.
            }
        }
    }

    /**
     * Returns {@code url} with a trailing slash appended to its path when it has none, so that
     * relative links resolve inside the folder rather than next to it. Urls carrying a query or
     * a fragment are returned unchanged.
     */
    private static URL asFolderUrl(URL url) throws MalformedURLException {
        boolean plainPath = url.getQuery() == null && url.getRef() == null;
        if (plainPath && !url.getPath().endsWith("/")) {
            return new URL(url.toString() + "/");
        }
        return url;
    }

    /** Reads everything served at {@code url} and decodes it with {@link #LISTING_CHARSET}. */
    private static String readContent(URL url) throws IOException {
        URLConnection connection = url.openConnection();
        InputStream in = connection.getInputStream();
        try {
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int count;
            while ((count = in.read(chunk)) != -1) {
                buffer.write(chunk, 0, count);
            }
            return buffer.toString(LISTING_CHARSET);
        } finally {
            in.close();
        }
    }

    /**
     * Manual check: prints every {@code .nxml} file found at a hard-coded remote folder.
     *
     * @param args not used
     * @throws IOException if the remote folder cannot be read
     */
    public static void main(String [] args) throws IOException {
        //String dataurl = "file:///home/luigi/projects/bfh/fusepool/data_sources/patents/MAREC/rdf/00/";
        //String dataurl = "http://raw.fusepool.info/marec/00/";
        String dataurl = "http://raw.fusepool.info/pmc/Acc_Chem_Res/";
        String [] filenameExtension = {".nxml"};
        URL url = new URL(dataurl);
        for (String file : FileUtil.getFileList(url, filenameExtension)) {
            System.out.println(file);
        }
    }
}