package com.caseystella.util.common.hadoop.ingest; import com.google.common.base.Predicate; import com.google.common.base.Predicates; import com.google.common.collect.Iterables; import org.apache.commons.cli.*; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import javax.annotation.Nullable; import java.io.*; import java.util.*; /** * Created by cstella on 9/6/14. */ public class DirDiff { private static enum Opts { HELP(OptionBuilder.withLongOpt("help") .withDescription("Print this message") .create("h") , "h" ) ,CONFIG(OptionBuilder.hasArg() .withArgName("file") .withDescription("Config JSON file, conforming to the expectations of the class") .withLongOpt("config") .isRequired() .create("f") , "f" ) ; static Options options = new Options(); static CommandLineParser parser = new PosixParser(); static { for(Opts opt : values()) { options.addOption(opt.option); } } String code; Option option; Opts(Option option, String code) { this.option = option; this.code = code; } public boolean has(CommandLine commandLine) { return commandLine.hasOption(code); } public String get(CommandLine commandLine) { return commandLine.getOptionValue(code); } public static void printHelp(PrintWriter pw) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp( "DirDiff", options , true); formatter.printHelp(pw, HelpFormatter.DEFAULT_WIDTH , "DirDiff" , null, options , HelpFormatter.DEFAULT_LEFT_PAD , HelpFormatter.DEFAULT_DESC_PAD , null , true ); } public static CommandLine parse(String... argv) { try { return parser.parse( options, argv ); } catch(ParseException ex) { printHelp(new PrintWriter(System.err)); return null; } } } public static void main(String... argv) throws IOException { File configFile; Config config; CommandLine line = Opts.parse(argv); if(line == null) { System.exit(-1); } if(Opts.HELP.has(line)) { Opts.printHelp(new PrintWriter(System.out)); return; } { configFile = new File(Opts.CONFIG.get(line)); try { config = Config.load(new FileReader(configFile)); config.initialize(); } catch (Throwable e) { System.err.println("Unable to load config file"); e.printStackTrace(System.err); System.exit(-1); return; } } BufferedReader reader = new BufferedReader(new InputStreamReader(System.in)); Map<String, Map.Entry<Config.Mapping, Set<String>> > sourcePathToHDFS = new LinkedHashMap<String, Map.Entry<Config.Mapping,Set<String>>>(); for(Config.Mapping mapping : config.getMappings()) { File sourceDir = new File(mapping.getSource()); if(!sourceDir.isDirectory()) { continue; } Set<String> hdfsFiles = null; try { hdfsFiles = lsHDFS(mapping.getDestination()); } catch(FileNotFoundException fnfe) { hdfsFiles = new HashSet<String>(); } catch (IOException e) { continue; } sourcePathToHDFS.put(sourceDir.getCanonicalPath(), new AbstractMap.SimpleEntry<Config.Mapping, Set<String>>(mapping,hdfsFiles)); } for(String fileS = null; (fileS = reader.readLine()) != null;) { File inFile = new File(fileS); if(!inFile.exists()) { continue; } String canonicalPath = inFile.getCanonicalPath(); for(Map.Entry<String, Map.Entry<Config.Mapping, Set<String>>> e : sourcePathToHDFS.entrySet()) { if(canonicalPath.startsWith(e.getKey())) { Predicate<File> pred = Predicates.and(config, Predicates.and(e.getValue().getKey(), new PrintFile(e.getValue().getValue()))); if(pred.apply(inFile)) { try { System.out.println("\"" + inFile.getCanonicalPath() + "\" \"" + e.getValue().getKey().getDestination() + "\""); } catch (IOException ioe) { throw new RuntimeException("Cannot canonicalize " + inFile, ioe); } } break; } } } } private static class PrintFile implements Predicate<File> { Set<String> hdfsFiles; PrintFile(Set<String> hdfsFiles) { this.hdfsFiles = hdfsFiles; } @Override public boolean apply(@Nullable File input) { return !hdfsFiles.contains(input.getName()); } } private static Set<String> lsHDFS(String location) throws IOException { FileSystem fs = FileSystem.newInstance(new Configuration()); Set<String> ret = new HashSet<String>(); for(FileStatus stat : fs.listStatus(new Path(location))) { ret.add(stat.getPath().getName()); } return ret; } }