/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package com.manning.hip.ch12.cascading;

import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.operation.Aggregator;
import cascading.operation.aggregator.Count;
import cascading.operation.regex.RegexParser;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tuple.Fields;

import java.util.Properties;

/**
 * Cascading job that parses an Apache access log on HDFS, counts how often
 * each resource (request path) occurs, and writes the per-resource counts
 * back to HDFS as text.
 *
 * <p>Usage: {@code PopularLogResources <inputPath> <outputPath>}
 */
public class PopularLogResources {

  public static void main(String[] args) {
    // Fail fast with a usage message rather than an opaque
    // ArrayIndexOutOfBoundsException when paths are missing.
    if (args.length < 2) {
      System.err.println(
          "Usage: PopularLogResources <inputPath> <outputPath>");
      System.exit(1);
    }

    String inputPath = args[0];
    String outputPath = args[1];

    // Define what the input file looks like; "offset" is the byte offset
    // of each line from the beginning of the file.
    TextLine input = new TextLine(new Fields("offset", "line"));

    // Create a SOURCE tap to read the log file from HDFS.
    Tap logTap = new Hfs(input, inputPath);

    // Declare the field names we will parse out of the log file. Only the
    // request path ("resource") is kept; the other regex groups are dropped.
    Fields apacheFields = new Fields("resource");

    // Regular expression for the Apache "common/combined" log format:
    // host - user [timestamp] "METHOD /resource PROTO" status size ...
    // NOTE: the literal is split with '+' concatenation; a Java string
    // literal cannot span a raw line break.
    String apacheRegex =
        "^([^ ]*) +[^ ]* +[^ ]* +\\[([^]]*)\\] "
            + "+\\\"([^ ]*) ([^ ]*) [^ ]*\\\" ([^ ]*) ([^ ]*).*$";

    // Of the groups captured above, keep only group 4 (the resource path);
    // each kept group is bound, in order, to a field from 'apacheFields'.
    int[] allGroups = {4};

    // Create the parser that turns a raw log line into a "resource" tuple.
    RegexParser parser = new RegexParser(apacheFields, apacheRegex, allGroups);

    // Create the import pipe element, named "import", reading the input
    // argument named "line". Fields.RESULTS replaces the incoming tuple
    // with the parser output: "line" -> parser -> "resource".
    Pipe pipeline = new Each("import", new Fields("line"), parser,
        Fields.RESULTS);

    // Group the tuple stream by the "resource" value.
    pipeline = new GroupBy(pipeline, new Fields("resource"));

    // For every tuple group, count the number of occurrences of each
    // "resource" and store the result in a field named "count".
    Aggregator count = new Count(new Fields("resource"));
    pipeline = new Every(pipeline, count);

    // Create a SINK tap writing to the default filesystem, replacing any
    // previous output; by default TextLine writes all fields out.
    Tap remoteLogTap = new Hfs(new TextLine(), outputPath, SinkMode.REPLACE);

    // Tell Cascading which jar contains this job so it can ship it to the
    // cluster.
    Properties properties = new Properties();
    FlowConnector.setApplicationJarClass(properties,
        PopularLogResources.class);

    // Connect the assembly to the SOURCE and SINK taps.
    Flow parsedLogFlow =
        new FlowConnector(properties).connect(logTap, remoteLogTap, pipeline);

    // Start execution of the flow (either locally or on the cluster).
    parsedLogFlow.start();

    // Block until the flow completes.
    parsedLogFlow.complete();
  }
}