/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.examples.cookbook;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.DefaultValueFactory;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Distinct;
import org.apache.beam.sdk.util.gcsfs.GcsPath;
/**
* This example uses as input Shakespeare's plays as plaintext files, and will remove any
* duplicate lines across all the files. (The output does not preserve any input order).
*
* <p>Concepts: the Distinct transform, and how to wire transforms together.
* Demonstrates {@link org.apache.beam.sdk.io.TextIO.Read}/
* {@link Distinct}/{@link org.apache.beam.sdk.io.TextIO.Write}.
*
* <p>To execute this pipeline locally, specify a local output file or output prefix on GCS:
* --output=[YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PREFIX]
*
* <p>To change the runner, specify:
* <pre>{@code
* --runner=YOUR_SELECTED_RUNNER
* }
* </pre>
* See examples/java/README.md for instructions about how to configure different runners.
*
* <p>The input defaults to {@code gs://apache-beam-samples/shakespeare/*} and can be
* overridden with {@code --input}.
*/
public class DistinctExample {
/**
* Options supported by {@link DistinctExample}.
*
* <p>Inherits standard configuration options.
*/
private interface Options extends PipelineOptions {
@Description("Path to the directory or GCS prefix containing files to read from")
@Default.String("gs://apache-beam-samples/shakespeare/*")
String getInput();
void setInput(String value);
@Description("Path of the file to write to")
@Default.InstanceFactory(OutputFactory.class)
String getOutput();
void setOutput(String value);
/** Returns gs://${TEMP_LOCATION}/"deduped.txt". */
class OutputFactory implements DefaultValueFactory<String> {
@Override
public String create(PipelineOptions options) {
if (options.getTempLocation() != null) {
return GcsPath.fromUri(options.getTempLocation())
.resolve("deduped.txt").toString();
} else {
throw new IllegalArgumentException("Must specify --output or --tempLocation");
}
}
}
}
public static void main(String[] args)
throws Exception {
Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
Pipeline p = Pipeline.create(options);
p.apply("ReadLines", TextIO.read().from(options.getInput()))
.apply(Distinct.<String>create())
.apply("DedupedShakespeare", TextIO.write().to(options.getOutput()));
p.run().waitUntilFinish();
}
}