/* Copyright 2011, Lightbox Technologies, Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package com.lightboxtechnologies.spectrum; import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; import java.io.IOException; import java.io.Reader; import java.util.HashSet; import java.util.Set; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.MapWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.hbase.mapreduce.TableInputFormat; import org.apache.hadoop.hbase.util.*; import org.apache.hadoop.hbase.client.Scan; import org.apache.commons.io.IOUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.commons.codec.binary.Hex; import java.io.ByteArrayOutputStream; import java.io.DataOutputStream; import org.sleuthkit.hadoop.SKJobFactory; public class SequenceFileExport { private static final Log LOG = LogFactory.getLog(SequenceFileExport.class); protected static class SequenceFileExportMapper extends Mapper<ImmutableHexWritable,FsEntry,BytesWritable,MapWritable> { private final Set<String> Extensions = new HashSet<String>(); private final BytesWritable OutKey = new BytesWritable(); private final MapWritable Fields = new MapWritable(); private final Text FullPath = new Text(); private final Text Ext = new Text(); private final Text Sha = new Text(); private final Text Md5 = new Text(); // FIXME: IBW instead? private final BytesWritable Vid = new BytesWritable(); private final Text HdfsPath = new Text(); public SequenceFileExportMapper() { Fields.put(new Text("full_path"), FullPath); Fields.put(new Text("extension"), Ext); Fields.put(new Text("sha1"), Sha); Fields.put(new Text("md5"), Md5); Fields.put(new Text("data"), Vid); Fields.put(new Text("hdfs_path"), HdfsPath); } @Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); final Configuration conf = context.getConfiguration(); // get permissible file extensions from the configuration Extensions.clear(); Extensions.addAll(conf.getStringCollection("extensions")); } void encodeHex(Text val, FsEntry entry, String field) { Object o = entry.get(field); if (o != null && o instanceof byte[]) { byte[] b = (byte[])o; val.set(new String(Hex.encodeHex(b))); } else { LOG.warn(entry.fullPath() + " didn't have a hash for " + field); val.set(""); } } @Override public void map(ImmutableHexWritable key, FsEntry value, Context context) throws IOException, InterruptedException { if (Extensions.contains(value.extension())) { FullPath.set(value.fullPath()); Ext.set(value.extension()); encodeHex(Sha, value, "sha1"); encodeHex(Md5, value, "md5"); if (value.isContentHDFS()) { Vid.setSize(0); HdfsPath.set(value.getContentHdfsPath()); } else { final byte[] buf = value.getContentBuffer(); if (buf == null) { LOG.warn(value.fullPath() + " didn't have a content buffer, skipping."); return; } Vid.set(buf, 0, buf.length); HdfsPath.set(""); } byte[] keybytes = key.get(); OutKey.set(keybytes, 0, keybytes.length); context.write(OutKey, Fields); } } } protected static void die() { System.err.println( "Usage: SequenceFileExport <image_id> <friendlyname> <outpath> <ext> [<ext>]...\n" + " SequenceFileExport -f <ext_file> <image_id> <friendlyname> <outpath>" ); System.exit(2); } public static void main(String[] args) throws Exception { final Configuration conf = new Configuration(); final String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); String imageID; String outpath; String friendlyname; final Set<String> exts = new HashSet<String>(); if ("-f".equals(otherArgs[0])) { if (otherArgs.length != 4) { die(); } // load extensions from file final Path extpath = new Path(otherArgs[1]); InputStream in = null; try { in = extpath.getFileSystem(conf).open(extpath); Reader r = null; try { r = new InputStreamReader(in); BufferedReader br = null; try { br = new BufferedReader(r); String line; while ((line = br.readLine()) != null) { exts.add(line.trim().toLowerCase()); } br.close(); } finally { IOUtils.closeQuietly(br); } r.close(); } finally { IOUtils.closeQuietly(r); } in.close(); } finally { IOUtils.closeQuietly(in); } imageID = otherArgs[2]; friendlyname = otherArgs[3]; outpath = otherArgs[4]; } else { if (otherArgs.length < 3) { die(); } // read extensions from trailing args imageID = otherArgs[0]; friendlyname = otherArgs[1]; outpath = otherArgs[2]; // lowercase all file extensions for (int i = 2; i < otherArgs.length; ++i) { exts.add(otherArgs[i].toLowerCase()); } } conf.setStrings("extensions", exts.toArray(new String[exts.size()])); final Job job = SKJobFactory.createJobFromConf(imageID, friendlyname, "SequenceFileExport", conf); job.setJarByClass(SequenceFileExport.class); job.setMapperClass(SequenceFileExportMapper.class); job.setNumReduceTasks(0); job.setOutputKeyClass(BytesWritable.class); job.setOutputValueClass(MapWritable.class); job.setInputFormatClass(FsEntryHBaseInputFormat.class); FsEntryHBaseInputFormat.setupJob(job, imageID); job.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputCompressionType( job, SequenceFile.CompressionType.BLOCK ); FileOutputFormat.setOutputPath(job, new Path(outpath)); System.exit(job.waitForCompletion(true) ? 0 : 1); } }