/*
 * Copyright 2014 DataGenerator Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.finra.datagenerator.samples.distributor.hdfs;

import com.google.gson.Gson;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Logger;
import org.finra.datagenerator.consumer.DataConsumer;
import org.finra.datagenerator.distributor.SearchDistributor;
import org.finra.datagenerator.engine.Frontier;
import org.finra.datagenerator.engine.scxml.SCXMLGapper;

import java.io.IOException;
import java.util.List;
import java.util.Map;

/**
 * A {@link SearchDistributor} that distributes data generation search problems as a Hadoop
 * MapReduce job: each search problem is written as one line of an HDFS input file, and each
 * mapper processes a single line.
 *
 * Created by robbinbr on 3/24/14.
 */
public class HDFSDistributor implements SearchDistributor {

    private static final Logger log = Logger.getLogger(HDFSDistributor.class);
    private static final String ENCODING = "UTF-8";
    private static final Gson GSON = new Gson();

    private String stateMachineText;
    private String hdfsFileRoot;
    private JobConf configuration;
    private Path mapperInputFilePath;
    private Path mapperOutputFilePath;
    private String mapperOutputFileName;
    private long maxNumberOfLines;
    private String reportingHost;
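
    /*
     * Illustrative usage sketch: one way a driver might configure and run this distributor.
     * The Hadoop Configuration, the state machine XML, and the List<Frontier> of search problems
     * are assumed to be supplied by the caller; the paths and host:port values are placeholders.
     *
     *   HDFSDistributor distributor = new HDFSDistributor();
     *   distributor.setFileRoot("hdfs:///tmp/datagen")
     *              .setOutputFileDir("results")
     *              .setReportingHost("somehost:8080")
     *              .setConfiguration(hadoopConfiguration);
     *   distributor.setStateMachineText(stateMachineXml);
     *   distributor.setMaxNumberOfLines(10000);
     *   distributor.distribute(searchProblems); // List<Frontier> produced by the data generation engine
     */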

    // TODO: This method is not actually doing anything?
    /**
     * Set the DataConsumer object
     *
     * @param dataConsumer the DataConsumer which should be used by this Distributor (Writers, Transformers,
     *                     and Reporters should be configured)
     * @return the current object, with DataConsumer set
     */
    public SearchDistributor setDataConsumer(DataConsumer dataConsumer) {
        return this;
    }

    /**
     * Set the file root on HDFS where files (temp and final) can be written
     *
     * @param fileRoot A valid HDFS location with space for the output of the data generation
     * @return An updated HDFSDistributor with hdfsFileRoot set
     */
    public HDFSDistributor setFileRoot(String fileRoot) {
        this.hdfsFileRoot = fileRoot;
        this.mapperInputFilePath = new Path(hdfsFileRoot + "/input.dat");
        if (mapperOutputFileName != null) {
            this.mapperOutputFilePath = new Path(hdfsFileRoot + "/" + mapperOutputFileName);
        }
        return this;
    }

    /**
     * Set the reporting host and port for this Distributor
     *
     * @param hostPort The host and port (host:port) to use when reporting
     * @return An updated HDFSDistributor with the reporting host set
     */
    public HDFSDistributor setReportingHost(String hostPort) {
        this.reportingHost = hostPort;
        return this;
    }

    /**
     * Set the output file directory (to be appended to hdfsFileRoot)
     *
     * @param fileName Path from hdfsFileRoot
     * @return An updated HDFSDistributor with the output file name set
     */
    public HDFSDistributor setOutputFileDir(String fileName) {
        this.mapperOutputFileName = fileName;
        if (hdfsFileRoot != null) {
            this.mapperOutputFilePath = new Path(hdfsFileRoot + "/" + mapperOutputFileName);
        }
        return this;
    }

    /**
     * Set the Hadoop Configuration for this distributor (should be the same instance configured for the
     * MapReduce job by ToolRunner)
     *
     * @param configuration A configuration instance to use for Mapper tasks
     * @return An updated HDFSDistributor with the Configuration object set
     */
    public HDFSDistributor setConfiguration(Configuration configuration) {
        this.configuration = new JobConf(configuration);
        return this;
    }

    /**
     * Set the XML text for the state machine serving as an input model for data generation
     *
     * @param stateMachineText a String containing the state machine XML
     * @return An updated SearchDistributor with state machine text set
     */
    public SearchDistributor setStateMachineText(String stateMachineText) {
        this.stateMachineText = stateMachineText;
        return this;
    }

    /**
     * Set the max number of lines which should be written by this Distributor
     *
     * @param maxNumberOfLines Maximum number of lines to be written by this Distributor
     * @return An updated SearchDistributor with a maximum line count set
     */
    public SearchDistributor setMaxNumberOfLines(long maxNumberOfLines) {
        this.maxNumberOfLines = maxNumberOfLines;
        return this;
    }

    @Override
    public void distribute(List<Frontier> searchProblemList) {
        // We need to write the List out to a file on HDFS;
        // that file will be the input into the MR job.

        // Add variables to the job configuration
        configuration.set("stateMachineText", stateMachineText);
        configuration.setLong("maxNumberOfLines", maxNumberOfLines);

        // Write input problems
        try {
            writeProblemsToHDFS(searchProblemList);
        } catch (IOException e) {
            log.error("Problem writing " + mapperInputFilePath + " prior to MR job execution");
            return;
        }

        // Prepare and submit job
        try {
            Job job = prepareJob();
            job.waitForCompletion(true);
            log.info("DataGen MR job can be tracked at " + job.getTrackingURL());
        } catch (IOException | InterruptedException | ClassNotFoundException e) {
            log.error("DataGen MR job failed", e);
        }

        // Cleanup
    }

    private Job prepareJob() throws IOException {
        // Basic configuration
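        // NLineInputFormat hands each mapper exactly one line of the input file, i.e. one search problem.
        // Both the legacy ("mapred.*") and newer ("mapreduce.*") compression keys are set below so the
        // gzip settings take effect across Hadoop versions.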
        configuration.setInt("mapreduce.input.lineinputformat.linespermap", 1);
        configuration.set("reportingHost", this.reportingHost);
        configuration.setBoolean("mapreduce.map.output.compress", true);
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.setBoolean("mapred.output.compress", true);
        configuration.setClass("mapred.map.output.compression.codec", GzipCodec.class, CompressionCodec.class);
        configuration.setClass("mapred.output.compression.codec", GzipCodec.class, CompressionCodec.class);
        /*
        configuration.setBoolean("mapreduce.output.fileoutputformat.compress", true);
        configuration.setClass("mapreduce.output.fileoutputformat.compress.codec", GzipCodec.class, CompressionCodec.class);
        configuration.setCompressMapOutput(true);
        */
        // configuration.set("mapreduce.output.fileoutputformat.compress", "true");
        // configuration.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.GzipCodec");
        // configuration.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");

        // Job ret = new Job(configuration);
        Job ret = org.apache.hadoop.mapreduce.Job.getInstance(configuration);
        ret.setJarByClass(HDFSDistributor.class);
        ret.setJobName("PATH Test Data Generation");

        // Mapper
        ret.setMapperClass(DataGeneratorMapper.class);

        // Reducer (none)
        ret.setNumReduceTasks(0);

        // Input
        ret.setInputFormatClass(NLineInputFormat.class);
        NLineInputFormat.addInputPath(ret, mapperInputFilePath);

        // Output
        // [BTR] Saw this used in an example w/NLineInputFormatter
        // but not sure what it actually does ...
        // LazyOutputFormat.setOutputFormatClass(ret, TextOutputFormat.class);
        FileOutputFormat.setOutputPath(ret, mapperOutputFilePath);
        // ret.getConfiguration().setBoolean("mapred.output.compress", false);

        return ret;
    }

    /**
     * Convert a set of search problems (Frontier objects) into one pipe-delimited line of text each,
     * writing them to the HDFS location given by the HDFS file root. The written file serves as input
     * to the Mapper tasks (one Mapper per line in the file, which is also one search problem).
     *
     * @param problems A List of search problems to write
     * @throws IOException if the file cannot be written to HDFS
     */
    public void writeProblemsToHDFS(List<Frontier> problems) throws IOException {
        FileSystem fs = FileSystem.get(configuration);
        log.info("hdfsFileRoot = " + hdfsFileRoot);

        StringBuilder sb = new StringBuilder();
        for (Frontier problem : problems) {
            // Decompose each Frontier into its target state and variable assignments, then emit a
            // single "target|variables|" line with embedded line breaks and tabs stripped
            SCXMLGapper gapper = new SCXMLGapper();
            Map<String, String> decomposition = gapper.decompose(problem, stateMachineText);
            String problemString = decomposition.get("target") + "|" + decomposition.get("variables") + "|";
            sb.append(problemString.replace("\n", "").replace("\t", "").replace("\r", ""));
            sb.append("\n");
        }

        try (FSDataOutputStream out = fs.create(mapperInputFilePath)) {
            out.write(sb.toString().getBytes(ENCODING));
        } catch (IOException e) {
            log.error("Problem writing " + mapperInputFilePath + " prior to MR job execution");
        }
    }
}