/**
 * Copyright 2014 IPONWEB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.iponweb.hadoop.streaming.parquet;

import com.google.common.base.Throwables;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.util.Progressable;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.ParquetRecordWriter;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;

/**
 * An output format for Hadoop streaming jobs (old {@code org.apache.hadoop.mapred}
 * API) that takes {@link Text} key/value pairs and writes them out as Parquet,
 * delegating the actual writing to {@link ParquetOutputFormat}. The write schema
 * is taken from the job configuration or from a local schema file.
 */
public class ParquetAsTextOutputFormat extends FileOutputFormat<Text, Text> {

    private static final Log LOG = LogFactory.getLog(ParquetAsTextOutputFormat.class);

    protected ParquetOutputFormat<SimpleGroup> realOutputFormat = new ParquetOutputFormat<>();

    public static void setWriteSupportClass(Configuration configuration, Class<?> writeSupportClass) {
        configuration.set(ParquetOutputFormat.WRITE_SUPPORT_CLASS, writeSupportClass.getName());
    }

    public static void setBlockSize(Configuration configuration, int blockSize) {
        configuration.setInt(ParquetOutputFormat.BLOCK_SIZE, blockSize);
    }

    public static void setPageSize(Configuration configuration, int pageSize) {
        configuration.setInt(ParquetOutputFormat.PAGE_SIZE, pageSize);
    }

    public static void setCompression(Configuration configuration, CompressionCodecName compression) {
        configuration.set(ParquetOutputFormat.COMPRESSION, compression.name());
    }

    public static void setEnableDictionary(Configuration configuration, boolean enableDictionary) {
        configuration.setBoolean(ParquetOutputFormat.ENABLE_DICTIONARY, enableDictionary);
    }

    private static Path getDefaultWorkFile(JobConf conf, String name, String extension) {
        String file = getUniqueName(conf, name) + extension;
        return new Path(getWorkOutputPath(conf), file);
    }

    private static CompressionCodecName getCodec(JobConf conf) {
        CompressionCodecName codec;
        if (ParquetOutputFormat.isCompressionSet(conf)) {
            // explicit parquet config
            codec = ParquetOutputFormat.getCompression(conf);
        } else if (getCompressOutput(conf)) {
            // fall back to the hadoop compression config and map its codec class
            Class<?> codecClass = getOutputCompressorClass(conf, DefaultCodec.class);
            LOG.info("Compression set through hadoop codec: " + codecClass.getName());
            codec = CompressionCodecName.fromCompressionCodec(codecClass);
        } else {
            codec = CompressionCodecName.UNCOMPRESSED;
        }
        LOG.info("Compression: " + codec.name());
        return codec;
    }
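    /**
     * Resolves the Parquet write schema and opens a Parquet writer for this task.
     * The schema is looked up, in order: inline from {@code iow.streaming.output.schema},
     * then from the local file named by {@code iow.streaming.output.schema.file}
     * (default {@code streaming_output_schema}). When
     * {@code iow.streaming.schema.use.prefix} is set, a {@code schema:filename}
     * prefix on the output name selects the schema file per output.
     */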
    @Override
    public RecordWriter<Text, Text> getRecordWriter(FileSystem fs, JobConf job, String name, Progressable progress)
            throws IOException {

        // find and load the output schema
        String writeSchema = job.get("iow.streaming.output.schema");
        if (writeSchema == null) {
            String schemaFile = job.get("iow.streaming.output.schema.file", "streaming_output_schema");
            if (job.getBoolean("iow.streaming.schema.use.prefix", false)) {
                // take the schema file from the output name prefix;
                // format is 'schema:filename', with the special keyword
                // 'default' ('default:filename') keeping the configured file
                String[] str = name.split(":", 2);
                if (!str[0].equals("default"))
                    schemaFile = str[0];
                name = str[1];
            }

            LOG.info("Using schema: " + schemaFile);
            File f = new File(schemaFile);
            try (BufferedReader reader = new BufferedReader(new FileReader(f))) {
                StringBuilder r = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null)
                    r.append(line).append('\n'); // keep line breaks so tokens don't merge
                writeSchema = r.toString();
            } catch (Throwable e) {
                LOG.error("Can't read schema file " + schemaFile);
                Throwables.propagateIfPossible(e, IOException.class);
                throw new RuntimeException(e);
            }
        }

        MessageType s = MessageTypeParser.parseMessageType(writeSchema);
        setWriteSupportClass(job, GroupWriteSupport.class);
        GroupWriteSupport.setSchema(s, job);

        CompressionCodecName codec = getCodec(job);
        String extension = codec.getExtension() + ".parquet";
        Path file = getDefaultWorkFile(job, name, extension);

        ParquetRecordWriter<SimpleGroup> realWriter;
        try {
            realWriter = (ParquetRecordWriter<SimpleGroup>) realOutputFormat.getRecordWriter(job, file, codec);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore the interrupt flag instead of clearing it
            throw new IOException(e);
        }

        return createRecordWriter(realWriter, fs, job, name, progress);
    }

    protected RecordWriter<Text, Text> createRecordWriter(ParquetRecordWriter<SimpleGroup> w,
                                                          FileSystem fs, JobConf job, String name, Progressable p)
            throws IOException {
        return new TextRecordWriterWrapper(w, fs, job, name, p);
    }
}
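
/**
 * Minimal usage sketch, not part of the original class: shows how a classic
 * mapred driver might wire this output format in. The output path, schema
 * text and codec below are illustrative assumptions, not values the library
 * prescribes.
 */
class ParquetAsTextOutputFormatUsageExample {

    public static void main(String[] args) {
        JobConf job = new JobConf();

        // write Text key/value pairs out as Parquet
        job.setOutputFormat(ParquetAsTextOutputFormat.class);
        ParquetAsTextOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
        ParquetAsTextOutputFormat.setEnableDictionary(job, true);

        // inline schema; alternatively ship a schema file with the job and
        // point iow.streaming.output.schema.file at it
        job.set("iow.streaming.output.schema",
                "message example { required binary url (UTF8); required int64 hits; }");

        FileOutputFormat.setOutputPath(job, new Path("/tmp/parquet-out")); // hypothetical path

        // ... configure mapper/reducer and input paths,
        // then submit with JobClient.runJob(job)
    }
}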