/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.examples.java.relational;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.accumulators.Accumulator;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.common.functions.RichFilterFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.configuration.Configuration;

/**
 * This program filters lines from a CSV file with empty fields. In doing so, it counts the number of empty fields per
 * column within a CSV file using a custom accumulator for vectors. In this context, empty fields are those that at
 * most contain whitespace characters like space and tab.
 * <p>
 * The input file is a plain text CSV file with the semicolon as field separator and three columns.
 * See {@link #getDataSet(ExecutionEnvironment, ParameterTool)} for configuration.
 * NOTE(review): earlier documentation also described double quotes as field delimiters, but the reader
 * configuration in {@code getDataSet} does not set up any quote handling - confirm the intended input format.
 * <p>
 * Usage: <code>EmptyFieldsCountAccumulator --input &lt;path&gt; --output &lt;path&gt;</code> <br>
 * <p>
 * This example shows how to use:
 * <ul>
 * <li>custom accumulators
 * <li>tuple data types
 * <li>inline-defined functions
 * <li>naming large tuple types
 * </ul>
 */
@SuppressWarnings("serial")
public class EmptyFieldsCountAccumulator {

    // *************************************************************************
    //     PROGRAM
    // *************************************************************************

    /** Key under which {@link EmptyFieldFilter} registers its accumulator and {@code main} reads the result. */
    private static final String EMPTY_FIELD_ACCUMULATOR = "empty-fields";

    /**
     * Builds and runs the job: reads the input triples, drops those with empty fields, emits the
     * remaining ones (to the {@code --output} path if given, otherwise to stdout) and finally prints
     * the per-column empty-field counts collected by the accumulator.
     *
     * @param args command line arguments, parsed by {@link ParameterTool#fromArgs(String[])};
     *             the keys {@code input} and {@code output} are read
     * @throws Exception if building or executing the program fails
     */
    public static void main(final String[] args) throws Exception {

        final ParameterTool params = ParameterTool.fromArgs(args);

        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // make parameters available in the web interface
        env.getConfig().setGlobalJobParameters(params);

        // get the data set
        final DataSet<StringTriple> file = getDataSet(env, params);

        // filter lines with empty fields
        final DataSet<StringTriple> filteredLines = file.filter(new EmptyFieldFilter());

        // Here, we could do further processing with the filtered lines...
        JobExecutionResult result;
        // output the filtered lines
        if (params.has("output")) {
            filteredLines.writeAsCsv(params.get("output"));
            // execute program
            result = env.execute("Accumulator example");
        } else {
            System.out.println("Printing result to stdout. Use --output to specify output path.");
            filteredLines.print();
            // no explicit execute() on this path; the result of the most recent job is fetched instead
            result = env.getLastJobExecutionResult();
        }

        // get the accumulator result via its registration key
        final List<Integer> emptyFields = result.getAccumulatorResult(EMPTY_FIELD_ACCUMULATOR);
        System.out.format("Number of detected empty fields per column: %s\n", emptyFields);
    }

    // *************************************************************************
    //     UTIL METHODS
    // *************************************************************************

    /**
     * Creates the input data set.
     * <p>
     * If the {@code input} parameter is present, the file at that path is read as CSV with
     * {@code ";"} as field delimiter; otherwise a small built-in example collection is used.
     *
     * @param env    the execution environment used to create the data set
     * @param params the program parameters; only the key {@code input} is read
     * @return the data set of string triples to be filtered
     */
    private static DataSet<StringTriple> getDataSet(ExecutionEnvironment env, ParameterTool params) {
        if (params.has("input")) {
            // StringTriple is a Tuple3 subclass, not a POJO with named fields, so the reader is
            // configured via tupleType(...) rather than pojoType(...) (which expects POJO field names).
            // NOTE(review): no quoted-string parsing is configured here - confirm whether input
            // fields are expected to be wrapped in double quotes.
            return env.readCsvFile(params.get("input"))
                .fieldDelimiter(";")
                .tupleType(StringTriple.class);
        } else {
            System.out.println("Executing EmptyFieldsCountAccumulator example with default input data set.");
            System.out.println("Use --input to specify file input.");
            return env.fromCollection(getExampleInputTuples());
        }
    }

    /**
     * Builds the default example input: five triples, some of which contain {@code null},
     * empty or whitespace-only fields.
     *
     * @return a new, mutable collection holding the example triples
     */
    private static Collection<StringTriple> getExampleInputTuples() {
        Collection<StringTriple> inputTuples = new ArrayList<StringTriple>();
        inputTuples.add(new StringTriple("John", "Doe", "Foo Str."));
        inputTuples.add(new StringTriple("Joe", "Johnson", ""));
        inputTuples.add(new StringTriple(null, "Kate Morn", "Bar Blvd."));
        inputTuples.add(new StringTriple("Tim", "Rinny", ""));
        inputTuples.add(new StringTriple("Alicia", "Jackson", "  "));
        return inputTuples;
    }

    /**
     * This function filters all incoming tuples that have one or more empty fields.
     * In doing so, it also counts the number of empty fields per attribute with an accumulator (registered under
     * {@link EmptyFieldsCountAccumulator#EMPTY_FIELD_ACCUMULATOR}).
     */
    public static final class EmptyFieldFilter extends RichFilterFunction<StringTriple> {

        // create a new accumulator in each filter function instance
        // accumulators can be merged later on
        private final VectorAccumulator emptyFieldCounter = new VectorAccumulator();

        /**
         * Registers this instance's accumulator with the runtime context under
         * {@link EmptyFieldsCountAccumulator#EMPTY_FIELD_ACCUMULATOR}.
         *
         * @param parameters the configuration passed on to the superclass
         * @throws Exception if the superclass initialization or the registration fails
         */
        @Override
        public void open(final Configuration parameters) throws Exception {
            super.open(parameters);

            // register the accumulator instance
            getRuntimeContext().addAccumulator(EMPTY_FIELD_ACCUMULATOR,
                    this.emptyFieldCounter);
        }

        /**
         * Checks every field of the tuple and records the position of each empty one in the accumulator.
         * A field counts as empty if it is {@code null} or contains nothing but characters removed by
         * {@link String#trim()}.
         *
         * @param t the tuple to inspect
         * @return {@code true} if the tuple has no empty field, {@code false} otherwise
         */
        @Override
        public boolean filter(final StringTriple t) {
            boolean containsEmptyFields = false;

            // iterate over the tuple fields looking for empty ones;
            // the loop deliberately does not stop early so that every empty column is counted
            for (int pos = 0; pos < t.getArity(); pos++) {

                final String field = t.getField(pos);
                if (field == null || field.trim().isEmpty()) {
                    containsEmptyFields = true;

                    // if an empty field is encountered, update the
                    // accumulator
                    this.emptyFieldCounter.add(pos);
                }
            }

            return !containsEmptyFields;
        }
    }

    /**
     * This accumulator maintains a vector of counts. Calling {@link #add(Integer)} increments the
     * <i>n</i>-th vector component. The size of the vector is automatically managed.
     */
    public static class VectorAccumulator implements Accumulator<Integer, ArrayList<Integer>> {

        /** Stores the accumulated vector components. */
        private final ArrayList<Integer> resultVector;

        /** Creates an accumulator with an empty result vector. */
        public VectorAccumulator() {
            this(new ArrayList<Integer>());
        }

        /**
         * Creates an accumulator backed by the given list. The list is used directly (not copied),
         * so later changes to it are visible through this accumulator and vice versa.
         *
         * @param resultVector the initial vector components
         */
        public VectorAccumulator(ArrayList<Integer> resultVector) {
            this.resultVector = resultVector;
        }

        /**
         * Increases the result vector component at the specified position by 1.
         *
         * @param position zero-based index of the component to increment
         */
        @Override
        public void add(Integer position) {
            updateResultVector(position, 1);
        }

        /**
         * Increases the result vector component at the specified position by the specified delta.
         * The vector is padded with zeros first if it is too short to contain the position.
         *
         * @param position zero-based index of the component to update
         * @param delta    the amount to add to the component
         */
        private void updateResultVector(int position, int delta) {
            // inflate the vector to contain the given position
            while (this.resultVector.size() <= position) {
                this.resultVector.add(0);
            }

            // increment the component value
            final int component = this.resultVector.get(position);
            this.resultVector.set(position, component + delta);
        }

        /**
         * Returns the vector accumulated so far.
         *
         * @return the internal result vector itself, not a copy
         */
        @Override
        public ArrayList<Integer> getLocalValue() {
            return this.resultVector;
        }

        /** Removes all components from the result vector. */
        @Override
        public void resetLocal() {
            // clear the result vector if the accumulator instance shall be reused
            this.resultVector.clear();
        }

        /**
         * Adds the components of the other accumulator's vector to this accumulator's vector,
         * position by position. This vector grows if the other one is longer.
         *
         * @param other the accumulator whose local value is merged into this one
         */
        @Override
        public void merge(final Accumulator<Integer, ArrayList<Integer>> other) {
            // merge two vector accumulators by adding up their vector components
            final List<Integer> otherVector = other.getLocalValue();
            for (int index = 0; index < otherVector.size(); index++) {
                updateResultVector(index, otherVector.get(index));
            }
        }

        /**
         * Creates an independent accumulator holding a copy of the current result vector.
         *
         * @return a new {@code VectorAccumulator} with the same component values
         */
        @Override
        public Accumulator<Integer, ArrayList<Integer>> clone() {
            return new VectorAccumulator(new ArrayList<Integer>(resultVector));
        }

        /**
         * Renders the vector components joined by commas.
         *
         * @return the comma-separated component values
         */
        @Override
        public String toString() {
            return StringUtils.join(resultVector, ',');
        }
    }

    /**
     * It is recommended to use POJOs (Plain old Java objects) instead of TupleX for
     * data types with many fields. Also, POJOs can be used to give large Tuple-types a name.
     * This class takes the second approach: it gives the type {@code Tuple3<String, String, String>} a name.
     * <a href="https://ci.apache.org/projects/flink/flink-docs-master/apis/best_practices.html#naming-large-tuplex-types">Source (docs)</a>
     */
    public static class StringTriple extends Tuple3<String, String, String> {

        /** Creates a triple without passing any field values. */
        public StringTriple() {}

        /**
         * Creates a triple with the given field values.
         *
         * @param f0 value of the first field
         * @param f1 value of the second field
         * @param f2 value of the third field
         */
        public StringTriple(String f0, String f1, String f2) {
            super(f0, f1, f2);
        }
    }
}