GraphCsvReader.java example

Explorer
flink-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.graph;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.CsvReader;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.core.fs.Path;
import org.apache.flink.graph.utils.Tuple2ToEdgeMap;
import org.apache.flink.graph.utils.Tuple2ToVertexMap;
import org.apache.flink.types.NullValue;
import org.apache.flink.util.Preconditions;

/**
 * Builds a {@link Graph} from the path(s) of CSV file(s) holding edge data and, optionally, vertex data.
 *
 * <p>The class also configures the CSV readers used to read the edge and vertex data: the field types,
 * the delimiters (row and field), the fields that should be included or skipped, and other flags,
 * such as whether to skip the initial line as the header.
 * The configuration is delegated to the corresponding functions of the {@link CsvReader} class.
 *
 * <p>All configuration methods that target the vertices reader are no-ops when the instance
 * was created without a vertices input file.
 */
public class GraphCsvReader {

	/** Message of the exception thrown when the edge reader is missing. */
	private static final String MISSING_EDGE_INPUT_MESSAGE = "The edge input file cannot be null!";

	/** Message of the exception thrown when neither a vertices file nor a mapper is available. */
	private static final String MISSING_VERTEX_VALUES_MESSAGE =
			"Vertex values have to be specified through a vertices input file "
					+ "or a user-defined map function.";

	@SuppressWarnings("unused")
	private final Path vertexPath, edgePath;
	private final ExecutionEnvironment executionContext;
	protected CsvReader edgeReader;
	protected CsvReader vertexReader;
	protected MapFunction<?, ?> mapper;

	//----------------------------------------------------------------------------------------------------------------
	//  Constructors
	//----------------------------------------------------------------------------------------------------------------

	/**
	 * Creates a reader for a vertices file and an edges file.
	 *
	 * @param vertexPath the path of the CSV file containing the vertices
	 * @param edgePath the path of the CSV file containing the edges
	 * @param context the execution environment the graph is created in
	 */
	public GraphCsvReader(Path vertexPath, Path edgePath, ExecutionEnvironment context) {
		this.vertexPath = vertexPath;
		this.edgePath = edgePath;
		this.vertexReader = new CsvReader(vertexPath, context);
		this.edgeReader = new CsvReader(edgePath, context);
		this.mapper = null;
		this.executionContext = context;
	}

	/**
	 * Creates a reader for an edges file only; no vertices file and no mapper are configured.
	 *
	 * @param edgePath the path of the CSV file containing the edges
	 * @param context the execution environment the graph is created in
	 */
	public GraphCsvReader(Path edgePath, ExecutionEnvironment context) {
		this.vertexPath = null;
		this.edgePath = edgePath;
		this.edgeReader = new CsvReader(edgePath, context);
		this.vertexReader = null;
		this.mapper = null;
		this.executionContext = context;
	}

	/**
	 * Creates a reader for an edges file together with a user-defined map function that is used
	 * in place of a vertices file when vertex values are requested.
	 *
	 * @param edgePath the path of the CSV file containing the edges
	 * @param mapper the user-defined map function supplying the vertex values
	 * @param context the execution environment the graph is created in
	 */
	public <K, VV> GraphCsvReader(Path edgePath, final MapFunction<K, VV> mapper, ExecutionEnvironment context) {
		this.vertexPath = null;
		this.edgePath = edgePath;
		this.edgeReader = new CsvReader(edgePath, context);
		this.vertexReader = null;
		this.mapper = mapper;
		this.executionContext = context;
	}

	/**
	 * Creates a reader for an edges file only, given its path as a String.
	 *
	 * @param edgePath the path of the CSV file containing the edges; must not be {@code null}
	 * @param context the execution environment the graph is created in
	 */
	public GraphCsvReader(String edgePath, ExecutionEnvironment context) {
		this(new Path(Preconditions.checkNotNull(edgePath, "The file path may not be null.")), context);
	}

	/**
	 * Creates a reader for a vertices file and an edges file, given their paths as Strings.
	 *
	 * @param vertexPath the path of the CSV file containing the vertices; must not be {@code null}
	 * @param edgePath the path of the CSV file containing the edges; must not be {@code null}
	 * @param context the execution environment the graph is created in
	 */
	public GraphCsvReader(String vertexPath, String edgePath, ExecutionEnvironment context) {
		this(new Path(Preconditions.checkNotNull(vertexPath, "The file path may not be null.")),
				new Path(Preconditions.checkNotNull(edgePath, "The file path may not be null.")), context);
	}

	/**
	 * Creates a reader for an edges file (given as a String path) together with a user-defined
	 * map function that is used in place of a vertices file when vertex values are requested.
	 *
	 * @param edgePath the path of the CSV file containing the edges; must not be {@code null}
	 * @param mapper the user-defined map function supplying the vertex values
	 * @param context the execution environment the graph is created in
	 */
	public <K, VV> GraphCsvReader(String edgePath, final MapFunction<K, VV> mapper, ExecutionEnvironment context) {
		this(new Path(Preconditions.checkNotNull(edgePath, "The file path may not be null.")), mapper, context);
	}

	//----------------------------------------------------------------------------------------------------------------
	//  Graph creation
	//----------------------------------------------------------------------------------------------------------------

	/**
	 * Creates a Graph from CSV input with vertex values and edge values.
	 * The vertex values are specified through a vertices input file or a user-defined map function.
	 * The vertices input file takes precedence when both are available.
	 *
	 * @param vertexKey the type of the vertex IDs
	 * @param vertexValue the type of the vertex values
	 * @param edgeValue the type of the edge values
	 * @return a Graph with vertex and edge values.
	 * @throws RuntimeException if the edge reader is missing, or if neither a vertices input file
	 *         nor a user-defined map function has been provided
	 */
	public <K, VV, EV> Graph<K, VV, EV> types(Class<K> vertexKey, Class<VV> vertexValue,
			Class<EV> edgeValue) {

		checkEdgeReader();

		// named like the edge sources of the other graph-creating methods of this class
		DataSet<Tuple3<K, K, EV>> edges = edgeReader
			.types(vertexKey, vertexKey, edgeValue)
				.name(GraphCsvReader.class.getName());

		// the vertex value can be provided by an input file or a user-defined mapper
		if (vertexReader != null) {
			DataSet<Tuple2<K, VV>> vertices = vertexReader
				.types(vertexKey, vertexValue)
					.name(GraphCsvReader.class.getName());

			return Graph.fromTupleDataSet(vertices, edges, executionContext);
		}

		if (mapper != null) {
			// the mapper is stored with wildcard types; the caller-supplied classes define K and VV
			@SuppressWarnings("unchecked")
			MapFunction<K, VV> vertexValueMapper = (MapFunction<K, VV>) mapper;

			return Graph.fromTupleDataSet(edges, vertexValueMapper, executionContext);
		}

		throw new RuntimeException(MISSING_VERTEX_VALUES_MESSAGE);
	}

	/**
	 * Creates a Graph from CSV input with edge values, but without vertex values.
	 *
	 * @param vertexKey the type of the vertex IDs
	 * @param edgeValue the type of the edge values
	 * @return a Graph where the edges are read from an edges CSV file (with values).
	 * @throws RuntimeException if the edge reader is missing
	 */
	public <K, EV> Graph<K, NullValue, EV> edgeTypes(Class<K> vertexKey, Class<EV> edgeValue) {

		checkEdgeReader();

		DataSet<Tuple3<K, K, EV>> edges = edgeReader
			.types(vertexKey, vertexKey, edgeValue)
				.name(GraphCsvReader.class.getName());

		return Graph.fromTupleDataSet(edges, executionContext);
	}

	/**
	 * Creates a Graph from CSV input without vertex values or edge values.
	 *
	 * @param vertexKey the type of the vertex IDs
	 * @return a Graph where the vertex IDs are read from the edges input file.
	 * @throws RuntimeException if the edge reader is missing
	 */
	public <K> Graph<K, NullValue, NullValue> keyType(Class<K> vertexKey) {

		checkEdgeReader();

		DataSet<Edge<K, NullValue>> edges = edgeReader
			.types(vertexKey, vertexKey)
				.name(GraphCsvReader.class.getName())
			.map(new Tuple2ToEdgeMap<K>())
				.name("Type conversion");

		return Graph.fromDataSet(edges, executionContext);
	}

	/**
	 * Creates a Graph from CSV input without edge values.
	 * The vertex values are specified through a vertices input file or a user-defined map function.
	 * The vertices input file takes precedence when both are available; if no vertices input file
	 * is provided, the graph is created from the edges input file and the map function.
	 *
	 * @param vertexKey the type of the vertex IDs
	 * @param vertexValue the type of the vertex values
	 * @return a Graph with vertex IDs and vertex values, whose edges carry no values.
	 * @throws RuntimeException if the edge reader is missing, or if neither a vertices input file
	 *         nor a user-defined map function has been provided
	 */
	public <K, VV> Graph<K, VV, NullValue> vertexTypes(Class<K> vertexKey, Class<VV> vertexValue) {

		checkEdgeReader();

		DataSet<Edge<K, NullValue>> edges = edgeReader
			.types(vertexKey, vertexKey)
				.name(GraphCsvReader.class.getName())
			.map(new Tuple2ToEdgeMap<K>())
				.name("To Edge");

		// the vertex value can be provided by an input file or a user-defined mapper
		if (vertexReader != null) {
			DataSet<Vertex<K, VV>> vertices = vertexReader
				.types(vertexKey, vertexValue)
					.name(GraphCsvReader.class.getName())
				.map(new Tuple2ToVertexMap<K, VV>())
					.name("Type conversion");

			return Graph.fromDataSet(vertices, edges, executionContext);
		}

		if (mapper != null) {
			// the mapper is stored with wildcard types; the caller-supplied classes define K and VV
			@SuppressWarnings("unchecked")
			MapFunction<K, VV> vertexValueMapper = (MapFunction<K, VV>) mapper;

			return Graph.fromDataSet(edges, vertexValueMapper, executionContext);
		}

		throw new RuntimeException(MISSING_VERTEX_VALUES_MESSAGE);
	}

	/**
	 * Fails if no edge reader is available; every graph needs the edges input.
	 *
	 * @throws RuntimeException if {@link #edgeReader} is {@code null}
	 */
	private void checkEdgeReader() {
		if (edgeReader == null) {
			throw new RuntimeException(MISSING_EDGE_INPUT_MESSAGE);
		}
	}

	//----------------------------------------------------------------------------------------------------------------
	//  Reader configuration
	//----------------------------------------------------------------------------------------------------------------

	/**
	 * Configures the delimiter that separates rows for the CSV reader used to read the edges
	 * ({@code '\n'}) is used by default.
	 *
	 * @param delimiter The delimiter that separates the rows.
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader lineDelimiterEdges(String delimiter) {
		this.edgeReader.lineDelimiter(delimiter);
		return this;
	}

	/**
	 * Configures the delimiter that separates rows for the CSV reader used to read the vertices
	 * ({@code '\n'}) is used by default.
	 * Has no effect if no vertices input file was configured.
	 *
	 * @param delimiter The delimiter that separates the rows.
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader lineDelimiterVertices(String delimiter) {
		if (this.vertexReader != null) {
			this.vertexReader.lineDelimiter(delimiter);
		}
		return this;
	}

	/**
	 * Configures the delimiter that separates fields in a row for the CSV reader used to read the vertices
	 * ({@code ','}) is used by default.
	 * Has no effect if no vertices input file was configured.
	 *
	 * @param delimiter The delimiter that separates the fields in a row.
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader fieldDelimiterVertices(String delimiter) {
		if (this.vertexReader != null) {
			this.vertexReader.fieldDelimiter(delimiter);
		}
		return this;
	}

	/**
	 * Configures the delimiter that separates fields in a row for the CSV reader used to read the edges
	 * ({@code ','}) is used by default.
	 *
	 * @param delimiter The delimiter that separates the fields in a row.
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader fieldDelimiterEdges(String delimiter) {
		this.edgeReader.fieldDelimiter(delimiter);
		return this;
	}

	/**
	 * Enables quoted String parsing for the edges CSV reader. Field delimiters in quoted Strings are ignored.
	 * A String is parsed as quoted if it starts and ends with a quoting character and as unquoted otherwise.
	 * Leading or trailing whitespace is not allowed.
	 *
	 * @param quoteCharacter The character which is used as quoting character.
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader parseQuotedStringsEdges(char quoteCharacter) {
		this.edgeReader.parseQuotedStrings(quoteCharacter);
		return this;
	}

	/**
	 * Enables quoted String parsing for the vertices CSV reader. Field delimiters in quoted Strings are ignored.
	 * A String is parsed as quoted if it starts and ends with a quoting character and as unquoted otherwise.
	 * Leading or trailing whitespace is not allowed.
	 * Has no effect if no vertices input file was configured.
	 *
	 * @param quoteCharacter The character which is used as quoting character.
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader parseQuotedStringsVertices(char quoteCharacter) {
		if (this.vertexReader != null) {
			this.vertexReader.parseQuotedStrings(quoteCharacter);
		}
		return this;
	}

	/**
	 * Configures the string that starts comments for the vertices CSV reader.
	 * By default comments will be treated as invalid lines.
	 * This function only recognizes comments which start at the beginning of the line!
	 * Has no effect if no vertices input file was configured.
	 *
	 * @param commentPrefix The string that starts the comments.
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader ignoreCommentsVertices(String commentPrefix) {
		if (this.vertexReader != null) {
			this.vertexReader.ignoreComments(commentPrefix);
		}
		return this;
	}

	/**
	 * Configures the string that starts comments for the edges CSV reader.
	 * By default comments will be treated as invalid lines.
	 * This function only recognizes comments which start at the beginning of the line!
	 *
	 * @param commentPrefix The string that starts the comments.
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader ignoreCommentsEdges(String commentPrefix) {
		this.edgeReader.ignoreComments(commentPrefix);
		return this;
	}

	/**
	 * Configures which fields of the CSV file containing vertices data should be included and which should be skipped. The
	 * parser will look at the first {@code n} fields, where {@code n} is the length of the boolean
	 * array. The parser will skip over all fields where the boolean value at the corresponding position
	 * in the array is {@code false}. The result contains the fields where the corresponding position in
	 * the boolean array is {@code true}.
	 * The number of fields in the result is consequently equal to the number of times that {@code true}
	 * occurs in the fields array.
	 * Has no effect if no vertices input file was configured.
	 *
	 * @param vertexFields The array of flags that describes which fields are to be included from the CSV file for vertices.
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader includeFieldsVertices(boolean ... vertexFields) {
		if (this.vertexReader != null) {
			this.vertexReader.includeFields(vertexFields);
		}
		return this;
	}

	/**
	 * Configures which fields of the CSV file containing edges data should be included and which should be skipped. The
	 * parser will look at the first {@code n} fields, where {@code n} is the length of the boolean
	 * array. The parser will skip over all fields where the boolean value at the corresponding position
	 * in the array is {@code false}. The result contains the fields where the corresponding position in
	 * the boolean array is {@code true}.
	 * The number of fields in the result is consequently equal to the number of times that {@code true}
	 * occurs in the fields array.
	 *
	 * @param edgeFields The array of flags that describes which fields are to be included from the CSV file for edges.
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader includeFieldsEdges(boolean ... edgeFields) {
		this.edgeReader.includeFields(edgeFields);
		return this;
	}

	/**
	 * Configures which fields of the CSV file containing vertices data should be included and which should be skipped. The
	 * positions in the string (read from position 0 to its length) define whether the field at
	 * the corresponding position in the CSV schema should be included.
	 * The parser will look at the first {@code n} fields, where {@code n} is the length of the mask string.
	 * The parser will skip over all fields where the character at the corresponding position
	 * in the string is {@code '0'}, {@code 'F'}, or {@code 'f'} (representing the value
	 * {@code false}). The result contains the fields where the character at the corresponding position in
	 * the string is {@code '1'}, {@code 'T'}, or {@code 't'} (representing the value {@code true}).
	 * Has no effect if no vertices input file was configured.
	 *
	 * @param mask The string mask defining which fields to include and which to skip.
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader includeFieldsVertices(String mask) {
		if (this.vertexReader != null) {
			this.vertexReader.includeFields(mask);
		}
		return this;
	}

	/**
	 * Configures which fields of the CSV file containing edges data should be included and which should be skipped. The
	 * positions in the string (read from position 0 to its length) define whether the field at
	 * the corresponding position in the CSV schema should be included.
	 * The parser will look at the first {@code n} fields, where {@code n} is the length of the mask string.
	 * The parser will skip over all fields where the character at the corresponding position
	 * in the string is {@code '0'}, {@code 'F'}, or {@code 'f'} (representing the value
	 * {@code false}). The result contains the fields where the character at the corresponding position in
	 * the string is {@code '1'}, {@code 'T'}, or {@code 't'} (representing the value {@code true}).
	 *
	 * @param mask The string mask defining which fields to include and which to skip.
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader includeFieldsEdges(String mask) {
		this.edgeReader.includeFields(mask);
		return this;
	}

	/**
	 * Configures which fields of the CSV file containing vertices data should be included and which should be skipped. The
	 * bits in the value (read from least significant to most significant) define whether the field at
	 * the corresponding position in the CSV schema should be included.
	 * The parser will look at the first {@code n} fields, where {@code n} is the position of the most significant
	 * non-zero bit.
	 * The parser will skip over all fields where the corresponding bit is zero, and
	 * include the fields where the corresponding bit is one.
	 * Has no effect if no vertices input file was configured.
	 * <p>
	 * Examples:
	 * <ul>
	 *   <li>A mask of {@code 0x7} would include the first three fields.</li>
	 *   <li>A mask of {@code 0x26} (binary {@code 100110}) would skip the first field, include fields
	 *       two and three, skip fields four and five, and include field six.</li>
	 * </ul>
	 *
	 * @param mask The bit mask defining which fields to include and which to skip.
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader includeFieldsVertices(long mask) {
		if (this.vertexReader != null) {
			this.vertexReader.includeFields(mask);
		}
		return this;
	}

	/**
	 * Configures which fields of the CSV file containing edges data should be included and which should be skipped. The
	 * bits in the value (read from least significant to most significant) define whether the field at
	 * the corresponding position in the CSV schema should be included.
	 * The parser will look at the first {@code n} fields, where {@code n} is the position of the most significant
	 * non-zero bit.
	 * The parser will skip over all fields where the corresponding bit is zero, and
	 * include the fields where the corresponding bit is one.
	 * <p>
	 * Examples:
	 * <ul>
	 *   <li>A mask of {@code 0x7} would include the first three fields.</li>
	 *   <li>A mask of {@code 0x26} (binary {@code 100110}) would skip the first field, include fields
	 *       two and three, skip fields four and five, and include field six.</li>
	 * </ul>
	 *
	 * @param mask The bit mask defining which fields to include and which to skip.
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader includeFieldsEdges(long mask) {
		this.edgeReader.includeFields(mask);
		return this;
	}

	/**
	 * Sets the CSV reader for the edges file to ignore the first line. This is useful for files that contain a header line.
	 *
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader ignoreFirstLineEdges() {
		this.edgeReader.ignoreFirstLine();
		return this;
	}

	/**
	 * Sets the CSV reader for the vertices file to ignore the first line. This is useful for files that contain a header line.
	 * Has no effect if no vertices input file was configured.
	 *
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader ignoreFirstLineVertices() {
		if (this.vertexReader != null) {
			this.vertexReader.ignoreFirstLine();
		}
		return this;
	}

	/**
	 * Sets the CSV reader for the edges file to ignore any invalid lines.
	 * This is useful for files that contain an empty line at the end, multiple header lines or comments.
	 * Such lines would throw an exception otherwise.
	 *
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader ignoreInvalidLinesEdges() {
		this.edgeReader.ignoreInvalidLines();
		return this;
	}

	/**
	 * Sets the CSV reader for the vertices file to ignore any invalid lines.
	 * This is useful for files that contain an empty line at the end, multiple header lines or comments.
	 * Such lines would throw an exception otherwise.
	 * Has no effect if no vertices input file was configured.
	 *
	 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
	 */
	public GraphCsvReader ignoreInvalidLinesVertices() {
		if (this.vertexReader != null) {
			this.vertexReader.ignoreInvalidLines();
		}
		return this;
	}
}