/**
* Copyright 2016-2017 Seznam.cz, a.s.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cz.seznam.euphoria.hadoop;
import cz.seznam.euphoria.core.client.dataset.Dataset;
import cz.seznam.euphoria.core.client.flow.Flow;
import cz.seznam.euphoria.core.client.io.Context;
import cz.seznam.euphoria.core.client.io.StdoutSink;
import cz.seznam.euphoria.core.client.operator.FlatMap;
import cz.seznam.euphoria.core.client.operator.ReduceByKey;
import cz.seznam.euphoria.core.client.util.Pair;
import cz.seznam.euphoria.core.client.util.Sums;
import cz.seznam.euphoria.core.executor.Executor;
import cz.seznam.euphoria.core.util.Settings;
import cz.seznam.euphoria.hadoop.input.SimpleHadoopTextFileSource;
import cz.seznam.euphoria.inmem.InMemExecutor;
import java.net.URI;
import java.util.regex.Pattern;
/**
 * Implements a very simplistic WordCount over a text input that is read
 * through a Hadoop data source ({@link SimpleHadoopTextFileSource}); the
 * resulting counts are written to {@link StdoutSink}.
 *
 * <p>Usage: {@code ExerciseHadoopIO <input-uri>}
 */
public class ExerciseHadoopIO {

  /** Word separator: one or more whitespace characters. */
  private static final Pattern SPLIT_RE = Pattern.compile("\\s+");

  /**
   * Builds the word-count flow for the given input and runs it on an
   * {@link InMemExecutor}.
   *
   * @param args exactly one element, the URI of the input to read; with any
   *     other argument count a usage line is printed to stderr and the JVM
   *     exits with status 1
   * @throws Exception anything raised while building or running the flow is
   *     propagated to the caller
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 1) {
      // Use the class name, not the Class object: Class#toString() prepends
      // "class ", which does not belong in a usage line.
      System.err.println(
          "Usage: " + ExerciseHadoopIO.class.getName() + " <input-uri>");
      System.exit(1);
    }

    final URI inputUri = URI.create(args[0]);

    Flow flow = Flow.create("WordCount", createSettings());

    // set up the input source
    Dataset<String> lines = flow.createInput(inputUri);

    // produce the output
    countWords(lines).persist(new StdoutSink<>());

    Executor executor = new InMemExecutor();
    // NOTE(review): get() presumably waits for the submitted flow to finish
    // and surfaces its failure, if any - confirm against Executor#submit.
    executor.submit(flow).get();
  }

  /**
   * Creates the flow settings, registering
   * {@link SimpleHadoopTextFileSource.Factory} as the data source factory
   * under the {@code webhdfs}, {@code hdfs} and {@code file} keys.
   */
  private static Settings createSettings() {
    Settings settings = new Settings();
    // NOTE(review): the key suffix looks like the URI scheme of the input -
    // confirm against how Flow#createInput resolves a factory.
    settings.setClass("euphoria.io.datasource.factory.webhdfs",
        SimpleHadoopTextFileSource.Factory.class);
    settings.setClass("euphoria.io.datasource.factory.hdfs",
        SimpleHadoopTextFileSource.Factory.class);
    settings.setClass("euphoria.io.datasource.factory.file",
        SimpleHadoopTextFileSource.Factory.class);
    return settings;
  }

  /**
   * Turns lines of text into (word, count) pairs.
   *
   * @param lines the input lines
   * @return one pair per distinct word, carrying the sum of its occurrences
   */
  private static Dataset<Pair<String, Long>> countWords(Dataset<String> lines) {
    // emit (word, 1) for every non-empty token of every line
    Dataset<Pair<String, Long>> tuples = FlatMap.of(lines)
        .using((String line, Context<Pair<String, Long>> out) ->
            SPLIT_RE.splitAsStream(line)
                .map(String::trim)
                .filter(s -> !s.isEmpty())
                .forEachOrdered(s -> out.collect(Pair.of(s, 1L))))
        .output();

    // reduce it to counts: sum the ones per word
    return ReduceByKey
        .of(tuples)
        .keyBy(Pair::getFirst)
        .valueBy(Pair::getSecond)
        .combineBy(Sums.ofLongs())
        .output();
  }
}