package example1.cascading;
import java.io.IOException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.operation.regex.RegexSplitGenerator;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.pipe.assembly.CountBy;
import cascading.scheme.TextDelimited;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tuple.Fields;
public class WordCount extends Configured implements Tool {

    /**
     * Entry point. Delegates to {@link ToolRunner} so the standard Hadoop
     * generic options ({@code -D}, {@code -conf}, ...) are parsed into the
     * {@code Configuration} before {@link #run(String[])} is invoked.
     */
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new WordCount(), args);
        System.exit(exitCode);
    }

    // Tuple field (column) names shared by the source, pipes, and sink.
    public static final String F_LINE = "line";
    public static final String F_WORD = "word";
    public static final String F_COUNT = "count";

    /**
     * Builds and runs the word-count flow: read lines, split into words,
     * count occurrences per word, write tab-delimited (word, count) pairs.
     *
     * @param args {@code args[0]} = input path, {@code args[1]} = output path
     * @return 0 on success, 1 when the required path arguments are missing
     */
    @Override
    public int run(String[] args) throws Exception {
        // Fail fast with a usage message instead of an
        // ArrayIndexOutOfBoundsException when paths are missing.
        if (args.length < 2) {
            System.err.println("Usage: WordCount <input path> <output path>");
            return 1;
        }

        // Source tap: one tuple per input line, in field "line".
        Tap source = new Hfs(new TextLine(new Fields(F_LINE)),
                makeQualifiedPath(args[0]));
        // Sink tap: tab-delimited (word, count) rows without a header line;
        // SinkMode.REPLACE overwrites any existing output directory.
        Tap sink = new Hfs(new TextDelimited(new Fields(F_WORD, F_COUNT),
                false, "\t"), makeQualifiedPath(args[1]), SinkMode.REPLACE);

        Pipe pipe = new Pipe("wordcount-pipe");
        // Split each line into words on runs of whitespace.
        pipe = new Each(pipe, new RegexSplitGenerator(new Fields(F_WORD),
                "[ \t\n\r\f]+"));
        // Count occurrences per word. CountBy is the composed, map-side
        // partial-aggregation replacement for GroupBy + Every(Count).
        pipe = new CountBy(pipe, new Fields(F_WORD), new Fields(F_COUNT));

        // Connect source/sink to the assembly and run to completion
        // (complete() blocks until the underlying jobs finish).
        FlowConnector flowConnector = new FlowConnector();
        Flow flow = flowConnector.connect("wordcount-cascading", source, sink,
                pipe);
        flow.complete();
        return 0;
    }

    /**
     * Qualifies {@code path} against the default {@link FileSystem} of the
     * current configuration, so relative or scheme-less paths become fully
     * qualified URIs understood by the cluster.
     *
     * @param path a possibly relative or scheme-less path string
     * @return the fully qualified path as a string
     * @throws IOException if the default FileSystem cannot be obtained
     */
    public String makeQualifiedPath(String path) throws IOException {
        FileSystem fs = FileSystem.get(super.getConf());
        return new Path(path).makeQualified(fs).toString();
    }
}