ScalaCsvOutputFormat.java example

Explorer
flink-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.flink.api.scala.operators;

import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.InvalidProgramException;
import org.apache.flink.api.common.io.FileOutputFormat;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.io.CsvInputFormat;
import org.apache.flink.api.java.typeutils.InputTypeConfigurable;
import org.apache.flink.core.fs.Path;
import org.apache.flink.types.StringValue;

import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Product;

/**
 * This is an OutputFormat to serialize Scala Tuples to text. The output is
 * structured by record delimiters and field delimiters as common in CSV files.
 * Record delimiter separate records from each other ('\n' is common). Field
 * delimiters separate fields within a record.
 */
@PublicEvolving
public class ScalaCsvOutputFormat<T extends Product> extends FileOutputFormat<T> implements InputTypeConfigurable {
	private static final long serialVersionUID = 1L;

	@SuppressWarnings("unused")
	private static final Logger LOG = LoggerFactory.getLogger(ScalaCsvOutputFormat.class);

	// --------------------------------------------------------------------------------------------

	public static final String DEFAULT_LINE_DELIMITER = CsvInputFormat.DEFAULT_LINE_DELIMITER;

	public static final String DEFAULT_FIELD_DELIMITER = String.valueOf(CsvInputFormat.DEFAULT_FIELD_DELIMITER);

	// --------------------------------------------------------------------------------------------

	private transient Writer wrt;

	private String fieldDelimiter;

	private String recordDelimiter;

	private String charsetName;

	private boolean allowNullValues = true;

	private boolean quoteStrings = false;

	// --------------------------------------------------------------------------------------------
	// Constructors and getters/setters for the configurable parameters
	// --------------------------------------------------------------------------------------------

	/**
	 * Creates an instance of CsvOutputFormat. Lines are separated by the newline character '\n',
	 * fields are separated by ','.
	 *
	 * @param outputPath The path where the CSV file is written.
	 */
	public ScalaCsvOutputFormat(Path outputPath) {
		this(outputPath, DEFAULT_LINE_DELIMITER, DEFAULT_FIELD_DELIMITER);
	}

	/**
	 * Creates an instance of CsvOutputFormat. Lines are separated by the newline character '\n',
	 * fields by the given field delimiter.
	 *
	 * @param outputPath The path where the CSV file is written.
	 * @param fieldDelimiter
	 *            The delimiter that is used to separate fields in a tuple.
	 */
	public ScalaCsvOutputFormat(Path outputPath, String fieldDelimiter) {
		this(outputPath, DEFAULT_LINE_DELIMITER, fieldDelimiter);
	}

	/**
	 * Creates an instance of CsvOutputFormat.
	 *
	 * @param outputPath The path where the CSV file is written.
	 * @param recordDelimiter
	 *            The delimiter that is used to separate the tuples.
	 * @param fieldDelimiter
	 *            The delimiter that is used to separate fields in a tuple.
	 */
	public ScalaCsvOutputFormat(Path outputPath, String recordDelimiter, String fieldDelimiter) {
		super(outputPath);
		if (recordDelimiter == null) {
			throw new IllegalArgumentException("RecordDelmiter shall not be null.");
		}

		if (fieldDelimiter == null) {
			throw new IllegalArgumentException("FieldDelimiter shall not be null.");
		}

		this.fieldDelimiter = fieldDelimiter;
		this.recordDelimiter = recordDelimiter;
		this.allowNullValues = false;
	}

	/**
	 * Configures the format to either allow null values (writing an empty field),
	 * or to throw an exception when encountering a null field.
	 * <p>
	 * by default, null values are allowed.
	 *
	 * @param allowNulls Flag to indicate whether the output format should accept null values.
	 */
	public void setAllowNullValues(boolean allowNulls) {
		this.allowNullValues = allowNulls;
	}

	/**
	 * Sets the charset with which the CSV strings are written to the file.
	 * If not specified, the output format uses the systems default character encoding.
	 *
	 * @param charsetName The name of charset to use for encoding the output.
	 */
	public void setCharsetName(String charsetName) {
		this.charsetName = charsetName;
	}

	/**
	 * Configures whether the output format should quote string values. String values are fields
	 * of type {@link String} and {@link org.apache.flink.types.StringValue}, as well as
	 * all subclasses of the latter.
	 * <p>
	 * By default, strings are not quoted.
	 *
	 * @param quoteStrings Flag indicating whether string fields should be quoted.
	 */
	public void setQuoteStrings(boolean quoteStrings) {
		this.quoteStrings = quoteStrings;
	}

	// --------------------------------------------------------------------------------------------

	@Override
	public void open(int taskNumber, int numTasks) throws IOException {
		super.open(taskNumber, numTasks);
		this.wrt = this.charsetName == null ? new OutputStreamWriter(new BufferedOutputStream(this.stream, 4096)) :
				new OutputStreamWriter(new BufferedOutputStream(this.stream, 4096), this.charsetName);
	}

	@Override
	public void close() throws IOException {
		if (wrt != null) {
			this.wrt.close();
		}
		super.close();
	}

	@Override
	public void writeRecord(T element) throws IOException {
		int numFields = element.productArity();

		for (int i = 0; i < numFields; i++) {
			Object v = element.productElement(i);
			if (v != null) {
				if (i != 0) {
					this.wrt.write(this.fieldDelimiter);
				}

				if (quoteStrings) {
					if (v instanceof String || v instanceof StringValue) {
						this.wrt.write('"');
						this.wrt.write(v.toString());
						this.wrt.write('"');
					} else {
						this.wrt.write(v.toString());
					}
				} else {
					this.wrt.write(v.toString());
				}
			} else {
				if (this.allowNullValues) {
					if (i != 0) {
						this.wrt.write(this.fieldDelimiter);
					}
				} else {
					throw new RuntimeException("Cannot write tuple with <null> value at position: " + i);
				}
			}
		}

		// add the record delimiter
		this.wrt.write(this.recordDelimiter);
	}

	// --------------------------------------------------------------------------------------------
	@Override
	public String toString() {
		return "CsvOutputFormat (path: " + this.getOutputFilePath() + ", delimiter: " + this.fieldDelimiter + ")";
	}

	/**
	 *
	 * The purpose of this method is solely to check whether the data type to be processed
	 * is in fact a tuple type.
	 */
	@Override
	public void setInputType(TypeInformation<?> type, ExecutionConfig executionConfig) {
		if (!type.isTupleType()) {
			throw new InvalidProgramException("The " + ScalaCsvOutputFormat.class.getSimpleName() +
				" can only be used to write tuple data sets.");
		}
	}
}