package org.hipi.tools.downloader;

import org.hipi.image.HipiImageHeader.HipiImageFormat;
import org.hipi.imagebundle.HipiImageBundle;
import org.hipi.image.HipiImageHeader;
import org.hipi.image.io.JpegCodec;
import org.hipi.image.io.PngCodec;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.Parser;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.StringReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A MapReduce program that takes a list of image URLs, downloads them, and creates a
 * {@link org.hipi.imagebundle.HipiImageBundle} from them. Also supports downloading the
 * Yahoo/Flickr 100M CC (YFCC100M) dataset.
 *
 * The program takes two positional arguments: a directory containing the source files (one image
 * URL per line, or YFCC100M-formatted records when --yfcc100m is given) and the output path of
 * the HIB that will be created. The optional --num-nodes argument controls how many download
 * nodes are used; it should be chosen with respect to the total bandwidth your particular
 * cluster is able to handle. An example usage would be:
 * <p>
 * hibDownload.jar /path/to/urls/ /path/to/output.hib --num-nodes 10
 * <p>
 * This example forces 10 nodes to download the set of URLs contained in the input files, so if
 * the list contains 100,000 images, each node will be responsible for downloading roughly
 * 10,000 images.
 */
public class Downloader extends Configured implements Tool {

  private static final Options options = new Options();
  private static final Parser parser = (Parser)new BasicParser();

  static {
    options.addOption("f", "force", false, "force overwrite if output HIB already exists");
    options.addOption("y", "yfcc100m", false, "assume input files are in Yahoo/Flickr CC 100M format");
    options.addOption("n", "num-nodes", true, "number of download nodes (default=1) (ignored if --yfcc100m is specified)");
  }

  private static void usage() {
    HelpFormatter formatter = new HelpFormatter();
    formatter.setWidth(148);
    formatter.printHelp("hibDownload.jar <directory containing source files> <output HIB> [-f] [--yfcc100m] [--num-nodes #count]", options);
    System.exit(0);
  }

  private static long uniqueMapperKey = 0; // Ensures temp HIB paths in mapper are unique
  private static long numDownloads = 0;    // Keeps track of number of image downloads

  private final String FLICKR_PREFIX = "yfcc100m_dataset"; // Root name for each of the YFCC100M dataset files

  public static class DownloaderMapper extends Mapper<LongWritable, Text, BooleanWritable, Text> {

    private static Configuration conf;

    @Override
    public void setup(Context context) throws IOException, InterruptedException {
      conf = context.getConfiguration();
    }

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

      // Use line number and a unique key assigned to each map task to generate a unique filename.
      String tempPath = conf.get("downloader.outpath") + key.get() + uniqueMapperKey + ".hib.tmp";

      boolean yfcc100m = conf.getBoolean("downloader.yfcc100m", false);

      // Create new temporary HIB
      HipiImageBundle hib = new HipiImageBundle(new Path(tempPath), conf);
      hib.openForWrite(true);

      // The value argument contains a list of image URLs delimited by '\n'. Set up a buffered
      // reader to allow processing this string line by line.
      BufferedReader lineReader = new BufferedReader(new StringReader(value.toString()));
      String line;

      // Iterate through URLs
      while ((line = lineReader.readLine()) != null) {

        String[] lineFields = null;
        String imageUri = null;

        if (yfcc100m) {
          // Fields within each line are delimited by tabs
          lineFields = line.split("\t");
          if (lineFields[22].equals("1")) { // 0 = image, 1 = video in YFCC100M format
            continue;
          }
          imageUri = lineFields[14];
        } else {
          // Otherwise, assume the entire line is the image URL
          imageUri = line;
        }

        long startTime = System.currentTimeMillis();

        try {

          String type = "";
          URLConnection conn;

          // Attempt to download image at URL using java.net
          try {
            URL link = new URL(imageUri);
            numDownloads++;
            System.out.println("");
            System.out.println("Downloading: " + link.toString());
            System.out.println("Number of downloads: " + numDownloads);
            conn = link.openConnection();
            conn.connect();
            type = conn.getContentType();

            // Check that image format is supported and header is parsable; add to HIB if so
            if (type != null && (type.compareTo("image/jpeg") == 0 || type.compareTo("image/png") == 0)) {

              // Get input stream for URL connection
              InputStream bis = new BufferedInputStream(conn.getInputStream());

              // Mark current location in stream for later reset
              bis.mark(Integer.MAX_VALUE);
              // Attempt to decode the image header
              HipiImageHeader header = (type.compareTo("image/jpeg") == 0 ?
                  JpegCodec.getInstance().decodeHeader(bis) :
                  PngCodec.getInstance().decodeHeader(bis));

              if (header == null) {
                System.out.println("Failed to parse header, image not added to HIB: " + link.toString());
              } else {
                // Passed header decode test, so reset to beginning of stream
                bis.reset();

                if (yfcc100m) {
                  // Capture fields as image metadata for posterity
                  for (int i = 0; i < lineFields.length; i++) {
                    header.addMetaData(String.format("col_%03d", i), lineFields[i]);
                  }
                  header.addMetaData("source", lineFields[14]);
                } else {
                  // Capture source URL as image metadata for posterity
                  header.addMetaData("source", imageUri);
                }

                // Add image to HIB
                hib.addImage(header, bis);
                System.err.println("Added to HIB: " + imageUri);
              }

            } else {
              System.out.println("Unrecognized HTTP content type or unsupported image format [" + type + "], not added to HIB: " + imageUri);
            }

          } catch (Exception e) {
            System.out.println("Connection error while trying to download: " + imageUri);
            e.printStackTrace();
          }

        } catch (Exception e) {
          System.out.println("Network error while trying to download: " + imageUri);
          e.printStackTrace();
          try {
            Thread.sleep(1000);
          } catch (InterruptedException ie) {
            ie.printStackTrace();
          }
        }

        float el = (float) (System.currentTimeMillis() - startTime) / 1000.0f;
        System.out.println("> Time elapsed: " + el + " seconds");

      } // while ((line = lineReader.readLine()) != null)

      try {
        // Output key/value pair to reduce layer consisting of a boolean and the path to the temporary HIB
        context.write(new BooleanWritable(true), new Text(hib.getPath().toString()));
        // Cleanup
        lineReader.close();
        hib.close();
      } catch (Exception e) {
        e.printStackTrace();
      }

      uniqueMapperKey++;
    }

    // Display metadata of the image
    public static void printFlickrImageMetadata(String[] lineArray) {
      System.out.println(" Flickr Image Metadata: ");
      System.out.println(" > Photo/Video Identifier: " + lineArray[0]);
      System.out.println(" > User NSID: " + lineArray[1]);
      System.out.println(" > User Nickname: " + lineArray[2]);
      System.out.println(" > Date Taken: " + lineArray[3]);
      System.out.println(" > Date Uploaded: " + lineArray[4]);
      System.out.println(" > Capture Device: " + lineArray[5]);
      System.out.println(" > Title: " + lineArray[6]);
      System.out.println(" > Description: " + lineArray[7]);
      System.out.println(" > User Tags: " + lineArray[8]);
      System.out.println(" > Machine Tags: " + lineArray[9]);
      System.out.println(" > Longitude: " + lineArray[10]);
      System.out.println(" > Latitude: " + lineArray[11]);
      System.out.println(" > Accuracy: " + lineArray[12]);
      System.out.println(" > Photo/Video Page URL: " + lineArray[13]);
      System.out.println(" > Photo/Video Download URL: " + lineArray[14]);
      System.out.println(" > License Name: " + lineArray[15]);
      System.out.println(" > License URL: " + lineArray[16]);
      System.out.println(" > Photo/Video Server Identifier: " + lineArray[17]);
      System.out.println(" > Photo/Video Farm Identifier: " + lineArray[18]);
      System.out.println(" > Photo/Video Secret: " + lineArray[19]);
      System.out.println(" > Photo/Video Secret Original: " + lineArray[20]);
      System.out.println(" > Extension of the Original Photo: " + lineArray[21]);
      System.out.println(" > Photos/video marker (0 = photo, 1 = video): " + lineArray[22]);
    }

  } // public static class DownloaderMapper
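
  // The run() method below references a DownloaderReducer that is not defined in this excerpt.
  // The class sketched here is one plausible implementation, not the canonical one: it merges the
  // per-mapper temporary HIBs into the final output HIB named by the "downloader.outfile"
  // property and removes the temporary files, assuming HipiImageBundle provides an
  // append(HipiImageBundle) method for concatenating bundles.
  public static class DownloaderReducer extends Reducer<BooleanWritable, Text, BooleanWritable, Text> {

    private static Configuration conf;

    @Override
    public void setup(Context context) throws IOException, InterruptedException {
      conf = context.getConfiguration();
    }

    @Override
    public void reduce(BooleanWritable key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      if (key.get()) {
        // Create the final output HIB at the path recorded in the job configuration
        FileSystem fileSystem = FileSystem.get(conf);
        HipiImageBundle outputHib = new HipiImageBundle(new Path(conf.get("downloader.outfile")), conf);
        outputHib.openForWrite(true);

        // Append each temporary HIB produced by a mapper, then delete its index and data files
        for (Text tempPathText : values) {
          Path tempPath = new Path(tempPathText.toString());
          HipiImageBundle inputHib = new HipiImageBundle(tempPath, conf);
          outputHib.append(inputHib); // assumes HipiImageBundle#append(HipiImageBundle) exists
          fileSystem.delete(tempPath, false);
          fileSystem.delete(new Path(tempPath.toString() + ".dat"), false);
          context.progress();
        }
        outputHib.close();

        // Report the path of the finished HIB
        context.write(new BooleanWritable(true), new Text(outputHib.getPath().toString()));
      }
    }

  } // public static class DownloaderReducer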
  public int run(String[] args) throws Exception {

    // Attempt to parse the command line arguments
    CommandLine line = null;
    try {
      line = parser.parse(options, args);
    } catch (ParseException exp) {
      usage();
    }
    if (line == null) {
      usage();
    }

    String[] leftArgs = line.getArgs();
    if (leftArgs.length != 2) {
      usage();
    }

    String inputDir = leftArgs[0];
    String outputHib = leftArgs[1];

    boolean yfcc100m = line.hasOption("yfcc100m");

    int numDownloadNodes = (yfcc100m ? 1 : (line.hasOption("num-nodes") ? Integer.parseInt(line.getOptionValue("num-nodes")) : 1));
    if (numDownloadNodes < 1) {
      System.err.println("Invalid number of download nodes specified [" + numDownloadNodes + "]");
      System.exit(1);
    }

    boolean overwrite = line.hasOption("force");

    System.out.println("Source directory: " + inputDir);
    System.out.println("Output HIB: " + outputHib);
    System.out.println("Overwrite output HIB if it exists: " + (overwrite ? "true" : "false"));
    System.out.println("YFCC100M format: " + (yfcc100m ? "true" : "false"));
    System.out.println("Number of download nodes: " + numDownloadNodes);

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // Fail if the output HIB already exists and --force was not given; otherwise delete any existing output
    if (!overwrite) {
      if (fs.exists(new Path(outputHib))) {
        System.err.println("HIB [" + outputHib + "] already exists. Use the \"--force\" argument to overwrite.");
        System.exit(1);
      }
    } else { // overwrite
      if (fs.exists(new Path(outputHib))) {
        System.out.println("Found that output HIB already exists, deleting.");
      }
    }

    fs.delete(new Path(outputHib), true);
    fs.delete(new Path(outputHib + ".dat"), true);
    fs.delete(new Path(outputHib + "_output"), true);

    // Scan source directory for list of input files
    FileStatus[] inputFiles = fs.listStatus(new Path(inputDir));
    if (inputFiles == null || inputFiles.length == 0) {
      System.err.println("Failed to find any files in source directory: " + inputDir);
      System.exit(1);
    }

    // Validate list of input files
    ArrayList<Path> sourceFiles = new ArrayList<Path>();
    for (FileStatus file : inputFiles) {

      Path path = file.getPath();

      if (yfcc100m) {
        String[] tokens = path.getName().split("-");
        if (tokens == null || tokens.length == 0) {
          System.out.println(" Skipping source file (does not follow YFCC100M file name convention): " + file.getPath());
          continue;
        }
      }

      try {

        // If it exists, get the relevant compression codec
        CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = codecFactory.getCodec(path);

        FSDataInputStream fis = fs.open(path);

        // If the codec was found, use it to create a decompressed input stream.
        // Otherwise, assume the input stream is already decompressed.
        BufferedReader reader = null;
        if (codec != null) {
          reader = new BufferedReader(new InputStreamReader(codec.createInputStream(fis)));
        } else {
          reader = new BufferedReader(new InputStreamReader(fis));
        }

        String fileLine = reader.readLine();
        String[] lineFields = (yfcc100m ? fileLine.split("\t") : fileLine.split("\\s+"));

        if (yfcc100m) {
          if (lineFields.length != 23) {
            System.out.println(" Skipping source file (does not follow YFCC100M source file format): " + file.getPath());
          } else {
            System.out.println(" Adding source file: " + file.getPath());
            sourceFiles.add(path);
          }
        } else {
          if (lineFields.length != 1) {
            System.out.println(" Skipping source file (contains multiple fields per line where only one is expected): " + file.getPath());
            if (lineFields.length == 23) {
              System.out.println(" Did you mean to use \"--yfcc100m\"?");
            }
          } else {
            System.out.println(" Adding source file: " + file.getPath());
            sourceFiles.add(path);
          }
        }

        reader.close();

      } catch (Exception e) {
        System.err.println(" Skipping source file (unable to open and parse first line): " + file.getPath());
        continue;
      }

    }

    if (sourceFiles.size() == 0) {
      System.err.println("Failed to find any valid files in source directory: " + inputDir);
      System.exit(1);
    }

    // Construct path to directory containing outputHib
    String outputPath = outputHib.substring(0, outputHib.lastIndexOf('/') + 1);

    // Attach job parameters to global Configuration object
    conf.setInt("downloader.nodes", numDownloadNodes);
    conf.setStrings("downloader.outfile", outputHib);
    conf.setStrings("downloader.outpath", outputPath);
    conf.setBoolean("downloader.yfcc100m", yfcc100m);

    Job job = Job.getInstance(conf, "hibDownload");
    job.setJarByClass(Downloader.class);
    job.setMapperClass(DownloaderMapper.class);
    job.setReducerClass(DownloaderReducer.class);
    job.setInputFormatClass(DownloaderInputFormat.class);
    job.setOutputKeyClass(BooleanWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(1);

    FileOutputFormat.setOutputPath(job, new Path(outputHib + "_output"));

    Path[] inputPaths = new Path[sourceFiles.size()];
    inputPaths = sourceFiles.toArray(inputPaths);
    DownloaderInputFormat.setInputPaths(job, inputPaths);

    return job.waitForCompletion(true) ? 0 : 1;
  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Downloader(), args);
    System.exit(res);
  }

}