Java Examples for org.apache.avro.mapreduce.AvroKeyOutputFormat

The following Java examples will help you understand the usage of org.apache.avro.mapreduce.AvroKeyOutputFormat. These source code samples are taken from different open source projects.
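
Before the project samples, here is a minimal sketch of the pattern most of them share: configure a Job, declare the output key schema through AvroJob, and set AvroKeyOutputFormat as the output format so each AvroKey datum is written to an Avro container file. The class name, argument layout, and schema file below are assumptions for illustration only; they are not taken from any of the projects that follow.

import java.io.File;
import org.apache.avro.Schema;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MinimalAvroKeyOutputJob {

    public static void main(String[] args) throws Exception {
        // assumed argument layout: <input dir> <output dir> <schema .avsc file>
        Schema schema = new Schema.Parser().parse(new File(args[2]));

        Job job = Job.getInstance(new Configuration(), "minimal-avro-key-output");
        job.setJarByClass(MinimalAvroKeyOutputJob.class);

        // read existing Avro container files; AvroKeyInputFormat emits (AvroKey<T>, NullWritable)
        job.setInputFormatClass(AvroKeyInputFormat.class);
        AvroJob.setInputKeySchema(job, schema);
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // write the keys back out as Avro container files; the NullWritable value is ignored
        job.setOutputFormatClass(AvroKeyOutputFormat.class);
        AvroJob.setOutputKeySchema(job, schema);
        job.setOutputValueClass(NullWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // map-only job: the default identity Mapper passes each (AvroKey, NullWritable) pair through
        job.setNumReduceTasks(0);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}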

Example 1
Project: camus-master  File: CamusSweeperAvroKeyJob.java
@Override
public void configureJob(String topic, Job job) {
    boolean skipNameValidation = RelaxedSchemaUtils.skipNameValidation(job.getConfiguration());
    if (skipNameValidation) {
        RelaxedAvroSerialization.addToConfiguration(job.getConfiguration());
    }
    // setting up our input format and map output types
    super.configureInput(job, AvroKeyCombineFileInputFormat.class, AvroKeyMapper.class, AvroKey.class, AvroValue.class);
    // setting up our output format and output types
    super.configureOutput(job, skipNameValidation ? RelaxedAvroKeyOutputFormat.class : AvroKeyOutputFormat.class, AvroKeyReducer.class, AvroKey.class, NullWritable.class);
    // finding the newest file from our input. this file will contain the newest version of our avro
    // schema.
    Schema schema;
    try {
        schema = getNewestSchemaFromSource(job);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    // checking if we have a key schema used for deduping. if we don't, then we make this a
    // map-only job and set the key schema to the newest input schema
    String keySchemaStr = getConfValue(job, topic, "camus.sweeper.avro.key.schema");
    Schema keySchema;
    if (job.getConfiguration().getBoolean("camus.sweeper.use.all.attributes", false)) {
        log.info("Using all attributes in the schema (except Map fields) for deduping");
        keySchema = getAllFieldsExceptMap(schema);
    } else if (keySchemaStr == null || keySchemaStr.isEmpty() || job.getConfiguration().getBoolean("second.stage", false)) {
        job.setNumReduceTasks(0);
        keySchema = schema;
    } else {
        keySchema = RelaxedSchemaUtils.parseSchema(keySchemaStr, job.getConfiguration());
        keySchema = duplicateRecord(keySchema, schema);
        if (!validateKeySchema(schema, keySchema)) {
            log.info("topic:" + topic + " key invalid, using map only job");
            job.setNumReduceTasks(0);
            keySchema = schema;
        }
    }
    setupSchemas(topic, job, schema, keySchema);
    // setting the compression level. Only used if compression is enabled. default is 6
    job.getConfiguration().setInt(AvroOutputFormat.DEFLATE_LEVEL_KEY, job.getConfiguration().getInt(AvroOutputFormat.DEFLATE_LEVEL_KEY, 6));
}
Example 2
Project: avro-sorting-master  File: AvroWritableKeySort.java
public boolean runMapReduce(final Job job, Path inputPath, Path outputPath) throws Exception {
    FileInputFormat.setInputPaths(job, inputPath);
    job.setInputFormatClass(AvroKeyInputFormat.class);
    AvroJob.setInputKeySchema(job, Weather.SCHEMA$);
    job.setMapperClass(SortMapper.class);
    AvroJob.setMapOutputValueSchema(job, Weather.SCHEMA$);
    job.setMapOutputKeyClass(WeatherSubset.class);
    job.setReducerClass(SortReducer.class);
    AvroJob.setOutputKeySchema(job, Weather.SCHEMA$);
    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setPartitionerClass(WeatherPartitioner.class);
    job.setGroupingComparatorClass(WeatherSubsetGroupingComparator.class);
    job.setSortComparatorClass(WeatherSubsetSortComparator.class);
    return job.waitForCompletion(true);
}
Example 3
Project: etl-by-example-master  File: Driver.java
@Override
public int run(String[] args) throws Exception {
    Path mrInput, mrOutput;
    if (args.length == 2) {
        mrInput = new Path(args[0]);
        mrOutput = new Path(args[1] + directoryFormat.format(new Date()));
    } else {
        System.err.println("Parameter missing!");
        return 1;
    }
    /** configure Job **/
    Job job = new Job(getConf(), "DataIngest Example");
    job.setJarByClass(Driver.class);
    job.setUserClassesTakesPrecedence(true);
    FileInputFormat.setInputPaths(job, mrInput);
    FileOutputFormat.setOutputPath(job, mrOutput);
    job.setMapperClass(MapperRawToAvro.class);
    job.setReducerClass(ReducerByDateTime.class);
    AvroJob.setMapOutputKeySchema(job, Schema.create(Schema.Type.LONG));
    AvroJob.setMapOutputValueSchema(job, SampleRecord.SCHEMA$);
    AvroKeyOutputFormat.setCompressOutput(job, true);
    AvroKeyOutputFormat.setOutputCompressorClass(job, DeflateCodec.class);
    AvroMultipleOutputs.addNamedOutput(job, "sampleRecord", AvroKeyOutputFormat.class, SampleRecord.SCHEMA$);
    MultipleOutputs.setCountersEnabled(job, true);
    if (job.waitForCompletion(true)) {
        return 0;
    } else {
        return 1;
    }
}
Example 4
Project: cdap-master  File: DynamicPartitioningOutputFormat.java
private boolean isAvroOutputFormat(FileOutputFormat<K, V> fileOutputFormat) {
    String className = fileOutputFormat.getClass().getName();
    // use the class name String in order to avoid having a dependency on the Avro libraries here
    return "org.apache.avro.mapreduce.AvroKeyOutputFormat".equals(className) || "org.apache.avro.mapreduce.AvroKeyValueOutputFormat".equals(className);
}
Example 5
Project: hpg-bigdata-master  File: Bam2AvroMR.java
public static int run(String input, String output, String codecName, boolean adjQuality, Configuration conf) throws Exception {
    // read header, and save sequence index/name in conf
    final Path p = new Path(input);
    final SeekableStream seekableStream = WrapSeekable.openPath(conf, p);
    final SamReader reader = SamReaderFactory.make().open(SamInputResource.of(seekableStream));
    final SAMFileHeader header = reader.getFileHeader();
    int i = 0;
    SAMSequenceRecord sr;
    while ((sr = header.getSequence(i)) != null) {
        conf.set("" + i, sr.getSequenceName());
        i++;
    }
    Job job = Job.getInstance(conf, "Bam2AvroMR");
    job.setJarByClass(Bam2AvroMR.class);
    // Avro problem fix
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");
    job.getConfiguration().set(ADJUST_QUALITY, Boolean.toString(adjQuality));
    // We call setOutputSchema first so we can override the configuration
    // parameters it sets
    AvroJob.setOutputKeySchema(job, ReadAlignment.getClassSchema());
    job.setOutputValueClass(NullWritable.class);
    AvroJob.setMapOutputValueSchema(job, ReadAlignment.getClassSchema());
    // point to input data
    FileInputFormat.setInputPaths(job, new Path(input));
    job.setInputFormatClass(AnySAMInputFormat.class);
    // set the output format
    FileOutputFormat.setOutputPath(job, new Path(output));
    if (codecName != null) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, CompressionUtils.getHadoopCodec(codecName));
    }
    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(Void.class);
    job.setMapperClass(Bam2GaMapper.class);
    job.setNumReduceTasks(0);
    job.waitForCompletion(true);
    // write header
    Path headerPath = new Path(output + "/part-m-00000.avro.header");
    FileSystem fs = FileSystem.get(conf);
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(headerPath, true)));
    br.write(header.getTextHeader());
    br.close();
    return 0;
}
Example 6
Project: iis-master  File: SparkPipeMapReduce.java
//------------------------ LOGIC --------------------------
public static void main(String[] args) throws IOException, ClassNotFoundException {
    SparkPipeMapReduceParameters params = parseParameters(args);
    SparkConf conf = new SparkConf();
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "pl.edu.icm.sparkutils.avro.AvroCompatibleKryoRegistrator");
    Class<? extends GenericRecord> outputAvroClass = Class.forName(params.outputAvroSchemaClass).asSubclass(GenericRecord.class);
    Schema inputSchema = AvroUtils.toSchema(params.inputAvroSchemaClass);
    Schema outputSchema = AvroUtils.toSchema(params.outputAvroSchemaClass);
    Job job = Job.getInstance();
    AvroJob.setInputKeySchema(job, inputSchema);
    AvroJob.setOutputKeySchema(job, outputSchema);
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.addFile(params.mapperScript);
        sc.addFile(params.reducerScript);
        String mapperScriptName = new File(params.mapperScript).getName();
        String reducerScriptName = new File(params.reducerScript).getName();
        SparkPipeExecutor pipeExecutor = new SparkPipeExecutor();
        @SuppressWarnings("unchecked") JavaPairRDD<AvroKey<GenericRecord>, NullWritable> inputRecords = (JavaPairRDD<AvroKey<GenericRecord>, NullWritable>) sc.newAPIHadoopFile(params.inputAvroPath, AvroKeyInputFormat.class, GenericRecord.class, NullWritable.class, job.getConfiguration());
        JavaPairRDD<String, String> mappedRecords = pipeExecutor.doMap(inputRecords, mapperScriptName, params.mapperScriptArgs);
        JavaPairRDD<AvroKey<GenericRecord>, NullWritable> reducedRecords = pipeExecutor.doReduce(mappedRecords, reducerScriptName, params.reducerScriptArgs, outputAvroClass);
        reducedRecords.saveAsNewAPIHadoopFile(params.outputAvroPath, AvroKey.class, NullWritable.class, AvroKeyOutputFormat.class, job.getConfiguration());
    }
}
Example 7
Project: hiped2-master  File: BloomFilterCreator.java
/**
   * The MapReduce driver - set up and launch the job.
   *
   * @param args the command-line arguments
   * @return the process exit code
   * @throws Exception if something goes wrong
   */
public int run(final String[] args) throws Exception {
    Cli cli = Cli.builder().setArgs(args).addOptions(ReplicatedJoin.UserOptions.values()).build();
    int result = cli.runCmd();
    if (result != 0) {
        return result;
    }
    Path usersPath = new Path(cli.getArgValueAsString(ReplicatedJoin.UserOptions.USERS));
    Path outputPath = new Path(cli.getArgValueAsString(ReplicatedJoin.UserOptions.OUTPUT));
    Configuration conf = super.getConf();
    Job job = new Job(conf);
    job.setJarByClass(BloomFilterCreator.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    AvroJob.setOutputKeySchema(job, AvroBytesRecord.SCHEMA);
    job.getConfiguration().set(AvroJob.CONF_OUTPUT_CODEC, SnappyCodec.class.getName());
    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(BloomFilter.class);
    FileInputFormat.setInputPaths(job, usersPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setNumReduceTasks(1);
    return job.waitForCompletion(true) ? 0 : 1;
}
Example 8
Project: KOSHIK-master  File: EnglishPipeline.java
@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path,...").hasArg().withDescription("input path[s]").create(OPTION_INPUTPATHS));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OPTION_OUTPUTPATH));
    CommandLine commandLine;
    CommandLineParser commandLineParser = new GnuParser();
    commandLine = commandLineParser.parse(options, args);
    if (!commandLine.hasOption(OPTION_INPUTPATHS) || !commandLine.hasOption(OPTION_OUTPUTPATH)) {
        HelpFormatter helpFormatter = new HelpFormatter();
        helpFormatter.printHelp(getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String inputPaths = commandLine.getOptionValue(OPTION_INPUTPATHS);
    Path outputPath = new Path(commandLine.getOptionValue(OPTION_OUTPUTPATH));
    LOGGER.info("Utility name: " + this.getClass().getName());
    LOGGER.info(" - input path: " + inputPaths);
    LOGGER.info(" - output path: " + outputPath);
    Job job = new Job(getConf(), getClass().getName());
    job.setJarByClass(getClass());
    FileInputFormat.setInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setInputFormatClass(AvroKeyInputFormat.class);
    //job.setMapperClass(DocumentSelectMapper.class);
    AvroJob.setInputKeySchema(job, AvroDocument.SCHEMA$);
    AvroJob.setMapOutputKeySchema(job, AvroDocument.SCHEMA$);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(EnglishPipelineReducer.class);
    AvroJob.setOutputKeySchema(job, AvroDocument.SCHEMA$);
    job.setOutputValueClass(NullWritable.class);
    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    return job.waitForCompletion(true) ? 0 : 1;
}
Example 9
Project: opencga-master  File: VariantTableExportDriver.java
@Override
protected void initMapReduceJob(String inTable, Job job, Scan scan, boolean addDependencyJar) throws IOException {
    super.initMapReduceJob(inTable, job, scan, addDependencyJar);
    // set Path
    FileOutputFormat.setOutputPath(job, new Path(this.outFile));
    // compression
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    switch(this.type) {
        case AVRO:
            job.setOutputFormatClass(AvroKeyOutputFormat.class);
            // Set schema
            AvroJob.setOutputKeySchema(job, VariantAvro.getClassSchema());
            break;
        case VCF:
            job.setOutputFormatClass(HadoopVcfOutputFormat.class);
            break;
        default:
            throw new IllegalStateException("Type not known: " + this.type);
    }
    job.setNumReduceTasks(0);
}
Example 10
Project: aegisthus-master  File: SSTableExport.java
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(SSTableExport.class);
    CommandLine cl = getOptions(args);
    if (cl == null) {
        return 1;
    }
    // Check all of the paths and load the sstable version from the input filenames
    List<Path> paths = Lists.newArrayList();
    if (cl.hasOption(Feature.CMD_ARG_INPUT_FILE)) {
        for (String input : cl.getOptionValues(Feature.CMD_ARG_INPUT_FILE)) {
            checkVersionFromFilename(input);
            paths.add(new Path(input));
        }
    }
    if (cl.hasOption(Feature.CMD_ARG_INPUT_DIR)) {
        paths.addAll(getDataFiles(job.getConfiguration(), cl.getOptionValue(Feature.CMD_ARG_INPUT_DIR)));
    }
    String avroSchemaString = getAvroSchema(cl.getOptionValue(Feature.CMD_ARG_AVRO_SCHEMA_FILE), job.getConfiguration());
    Schema avroSchema = new Schema.Parser().parse(avroSchemaString);
    // At this point we have the version of sstable that we can use for this run
    job.getConfiguration().set(Aegisthus.Feature.CONF_SSTABLE_VERSION, version.toString());
    if (job.getConfiguration().get(Aegisthus.Feature.CONF_CQL_SCHEMA) != null) {
        setConfigurationFromCql(job.getConfiguration());
    }
    job.setInputFormatClass(AegisthusInputFormat.class);
    job.setMapperClass(CQLMapper.class);
    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    AvroJob.setOutputKeySchema(job, avroSchema);
    // Map-only job
    job.setNumReduceTasks(0);
    TextInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()]));
    FileOutputFormat.setOutputPath(job, new Path(cl.getOptionValue(Feature.CMD_ARG_OUTPUT_DIR)));
    job.submit();
    System.out.println(job.getJobID());
    System.out.println(job.getTrackingURL());
    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
Example 11
Project: white-elephant-master  File: ParseJobConfs.java
public void execute(StagedOutputJobExecutor executor) throws IOException, InterruptedException, ExecutionException {
    for (String clusterName : _clusterNames.split(",")) {
        System.out.println("Processing cluster " + clusterName);
        List<JobStatsProcessing.ProcessingTask> processingTasks = JobStatsProcessing.getTasks(_fs, _logsRoot, clusterName, _confsOutputPathRoot, "xml", _incremental, _numDays, _numDaysForced);
        for (JobStatsProcessing.ProcessingTask task : processingTasks) {
            List<String> inputPaths = new ArrayList<String>();
            inputPaths.add(task.inputPathFormat);
            String outputPath = task.outputPath;
            final StagedOutputJob job = StagedOutputJob.createStagedJob(_props, _name + "-parse-confs-" + task.id, inputPaths, "/tmp" + outputPath, outputPath, _log);
            job.getConfiguration().set("jobs.output.path", _confsOutputPathRoot);
            job.getConfiguration().set("logs.cluster.name", clusterName);
            job.setOutputKeyClass(BytesWritable.class);
            job.setOutputValueClass(NullWritable.class);
            job.setInputFormatClass(CombineDocumentFileFormat.class);
            job.setOutputFormatClass(AvroKeyOutputFormat.class);
            AvroJob.setOutputKeySchema(job, JobConf.SCHEMA$);
            job.setNumReduceTasks(0);
            job.setMapperClass(ParseJobConfs.TheMapper.class);
            executor.submit(job);
        }
        executor.waitForCompletion();
    }
}
Example 12
Project: geowave-master  File: VectorMRExportJobRunner.java
/**
 * Main method to execute the MapReduce analytic.
 */
public int runJob() throws CQLException, IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = super.getConf();
    if (conf == null) {
        conf = new Configuration();
        setConf(conf);
    }
    GeoWaveConfiguratorBase.setRemoteInvocationParams(hdfsHostPort, mrOptions.getResourceManagerHostPort(), conf);
    final QueryOptions options = new QueryOptions();
    final List<String> adapterIds = mrOptions.getAdapterIds();
    final AdapterStore adapterStore = storeOptions.createAdapterStore();
    if ((adapterIds != null) && !adapterIds.isEmpty()) {
        options.setAdapters(Lists.transform(adapterIds, new Function<String, DataAdapter<?>>() {

            @Override
            public DataAdapter<?> apply(final String input) {
                return adapterStore.getAdapter(new ByteArrayId(input));
            }
        }));
    }
    conf.setInt(BATCH_SIZE_KEY, mrOptions.getBatchSize());
    if (mrOptions.getIndexId() != null) {
        final Index index = storeOptions.createIndexStore().getIndex(new ByteArrayId(mrOptions.getIndexId()));
        if (index == null) {
            JCommander.getConsole().println("Unable to find index '" + mrOptions.getIndexId() + "' in store");
            return -1;
        }
        if (index instanceof PrimaryIndex) {
            options.setIndex((PrimaryIndex) index);
        } else {
            JCommander.getConsole().println("Index '" + mrOptions.getIndexId() + "' is not a primary index");
            return -1;
        }
    }
    if (mrOptions.getCqlFilter() != null) {
        if ((adapterIds == null) || (adapterIds.size() != 1)) {
            JCommander.getConsole().println("Exactly one type is expected when using CQL filter");
            return -1;
        }
        final String adapterId = adapterIds.get(0);
        final DataAdapter<?> adapter = storeOptions.createAdapterStore().getAdapter(new ByteArrayId(adapterId));
        if (adapter == null) {
            JCommander.getConsole().println("Type '" + adapterId + "' not found");
            return -1;
        }
        if (!(adapter instanceof GeotoolsFeatureDataAdapter)) {
            JCommander.getConsole().println("Type '" + adapterId + "' does not support vector export");
            return -1;
        }
        GeoWaveInputFormat.setQuery(conf, (DistributableQuery) CQLQuery.createOptimalQuery(mrOptions.getCqlFilter(), (GeotoolsFeatureDataAdapter) adapter, options.getIndex(), null));
    }
    GeoWaveInputFormat.setStoreOptions(conf, storeOptions);
    // the above code is a temporary placeholder until this gets merged with
    // the new commandline options
    GeoWaveInputFormat.setQueryOptions(conf, options);
    final Job job = new Job(conf);
    job.setJarByClass(this.getClass());
    job.setJobName("Exporting to " + hdfsPath);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputPath(job, new Path(hdfsPath));
    job.setMapperClass(VectorExportMapper.class);
    job.setInputFormatClass(GeoWaveInputFormat.class);
    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputKeyClass(AvroKey.class);
    job.setOutputValueClass(NullWritable.class);
    job.setNumReduceTasks(0);
    AvroJob.setOutputKeySchema(job, AvroSimpleFeatureCollection.SCHEMA$);
    AvroJob.setMapOutputKeySchema(job, AvroSimpleFeatureCollection.SCHEMA$);
    GeoWaveInputFormat.setMinimumSplitCount(job.getConfiguration(), mrOptions.getMinSplits());
    GeoWaveInputFormat.setMaximumSplitCount(job.getConfiguration(), mrOptions.getMaxSplits());
    boolean retVal = false;
    try {
        retVal = job.waitForCompletion(true);
    } catch (final IOException ex) {
        LOGGER.error("Error waiting for map reduce tile resize job: ", ex);
    }
    return retVal ? 0 : 1;
}
Example 13
Project: pinot-master  File: DerivedColumnTransformationPhaseJob.java
public Job run() throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName(name);
    job.setJarByClass(DerivedColumnTransformationPhaseJob.class);
    Configuration configuration = job.getConfiguration();
    FileSystem fs = FileSystem.get(configuration);
    // Input Path
    String inputPathDir = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_INPUT_PATH);
    LOGGER.info("Input path dir: " + inputPathDir);
    for (String inputPath : inputPathDir.split(",")) {
        LOGGER.info("Adding input:" + inputPath);
        Path input = new Path(inputPath);
        FileInputFormat.addInputPath(job, input);
    }
    // Topk path
    String topkPath = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH);
    LOGGER.info("Topk path : " + topkPath);
    // Output path
    Path outputPath = new Path(getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH));
    LOGGER.info("Output path dir: " + outputPath.toString());
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    FileOutputFormat.setOutputPath(job, outputPath);
    // Schema
    Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
    LOGGER.info("Schema : {}", avroSchema.toString(true));
    // ThirdEyeConfig
    String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()), props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
    ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
    job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_THIRDEYE_CONFIG.toString(), OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));
    LOGGER.info("ThirdEyeConfig {}", thirdeyeConfig.encode());
    // New schema
    Schema outputSchema = newSchema(thirdeyeConfig);
    job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA.toString(), outputSchema.toString());
    // Map config
    job.setMapperClass(DerivedColumnTransformationPhaseMapper.class);
    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(NullWritable.class);
    AvroJob.setOutputKeySchema(job, outputSchema);
    LazyOutputFormat.setOutputFormatClass(job, AvroKeyOutputFormat.class);
    AvroMultipleOutputs.addNamedOutput(job, "avro", AvroKeyOutputFormat.class, outputSchema);
    job.setNumReduceTasks(0);
    job.waitForCompletion(true);
    return job;
}
Example 14
Project: hadcom.utils-master  File: ConvertEnvMultiTable2MultiAvro2.java
private static void addSchema(String outputPath, Job job, BufferedReader br) throws IOException {
    String line;
    StringBuilder strBuilder = new StringBuilder();
    // read the file
    while ((line = br.readLine()) != null) {
        strBuilder.append(line);
    }
    String schemaStr = strBuilder.toString();
    // get the schema name and make an HDFS directory out of it.
    Schema schema;
    try {
        schema = new Schema.Parser().parse(schemaStr);
    } catch (Exception e) {
        throw new RuntimeException("Unable to parse schema file: " + schemaStr, e);
    }
    getNamedOutputsList(job);
    job.getConfiguration().set(SCHEMA_PRE_CONF + schema.getName(), schemaStr);
    AvroMultipleOutputs.addNamedOutput(job, schema.getName(), AvroKeyOutputFormat.class, schema);
    // clear the builder
    strBuilder.delete(0, strBuilder.length());
}
Example 15
Project: kiji-mapreduce-master  File: AvroKeyMapReduceJobOutput.java
/** {@inheritDoc} */
@Override
protected Class<? extends OutputFormat> getOutputFormatClass() {
    return AvroKeyOutputFormat.class;
}
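
Every example above writes AvroKey records paired with NullWritable values. To read those container files back in a later MapReduce job, several of the examples use AvroKeyOutputFormat's counterpart, AvroKeyInputFormat, together with a mapper whose input key type is AvroKey. A minimal, hypothetical mapper is sketched below; the class name and the Text output are assumptions for illustration, not taken from the projects above.

import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ReadAvroKeysMapper extends Mapper<AvroKey<GenericRecord>, NullWritable, Text, NullWritable> {

    @Override
    protected void map(AvroKey<GenericRecord> key, NullWritable value, Context context)
            throws IOException, InterruptedException {
        // AvroKeyInputFormat delivers each Avro record as the key; the value is always NullWritable
        GenericRecord record = key.datum();
        context.write(new Text(record.toString()), NullWritable.get());
    }
}

The driver for such a job would set job.setInputFormatClass(AvroKeyInputFormat.class) and declare the reader schema with AvroJob.setInputKeySchema, as shown in Examples 2, 8, and 13.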