/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.streaming.examples.twitter;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.twitter.TwitterSource;
import org.apache.flink.streaming.examples.twitter.util.TwitterExampleData;
import org.apache.flink.util.Collector;
import java.util.StringTokenizer;
/**
* Implements the "TwitterStream" program that computes a most used word
* occurrence over JSON objects in a streaming fashion.
* <p>
* The input is a Tweet stream from a TwitterSource.
* </p>
* <p>
* Usage: <code>Usage: TwitterExample [--output <path>]
* [--twitter-source.consumerKey <key> --twitter-source.consumerSecret <secret> --twitter-source.token <token> --twitter-source.tokenSecret <tokenSecret>]</code><br>
*
* If no parameters are provided, the program is run with default data from
* {@link TwitterExampleData}.
* </p>
* <p>
* This example shows how to:
* <ul>
* <li>acquire external data,
* <li>use in-line defined functions,
* <li>handle flattened stream inputs.
* </ul>
*/
public class TwitterExample {
// *************************************************************************
// PROGRAM
// *************************************************************************
public static void main(String[] args) throws Exception {
// Checking input parameters
final ParameterTool params = ParameterTool.fromArgs(args);
System.out.println("Usage: TwitterExample [--output <path>] " +
"[--twitter-source.consumerKey <key> --twitter-source.consumerSecret <secret> --twitter-source.token <token> --twitter-source.tokenSecret <tokenSecret>]");
// set up the execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// make parameters available in the web interface
env.getConfig().setGlobalJobParameters(params);
env.setParallelism(params.getInt("parallelism", 1));
// get input data
DataStream<String> streamSource;
if (params.has(TwitterSource.CONSUMER_KEY) &&
params.has(TwitterSource.CONSUMER_SECRET) &&
params.has(TwitterSource.TOKEN) &&
params.has(TwitterSource.TOKEN_SECRET)
) {
streamSource = env.addSource(new TwitterSource(params.getProperties()));
} else {
System.out.println("Executing TwitterStream example with default props.");
System.out.println("Use --twitter-source.consumerKey <key> --twitter-source.consumerSecret <secret> " +
"--twitter-source.token <token> --twitter-source.tokenSecret <tokenSecret> specify the authentication info.");
// get default test text data
streamSource = env.fromElements(TwitterExampleData.TEXTS);
}
DataStream<Tuple2<String, Integer>> tweets = streamSource
// selecting English tweets and splitting to (word, 1)
.flatMap(new SelectEnglishAndTokenizeFlatMap())
// group by words and sum their occurrences
.keyBy(0).sum(1);
// emit result
if (params.has("output")) {
tweets.writeAsText(params.get("output"));
} else {
System.out.println("Printing result to stdout. Use --output to specify output path.");
tweets.print();
}
// execute program
env.execute("Twitter Streaming Example");
}
// *************************************************************************
// USER FUNCTIONS
// *************************************************************************
/**
* Deserialize JSON from twitter source
*
* <p>
* Implements a string tokenizer that splits sentences into words as a
* user-defined FlatMapFunction. The function takes a line (String) and
* splits it into multiple pairs in the form of "(word,1)" ({@code Tuple2<String,
* Integer>}).
*/
public static class SelectEnglishAndTokenizeFlatMap implements FlatMapFunction<String, Tuple2<String, Integer>> {
private static final long serialVersionUID = 1L;
private transient ObjectMapper jsonParser;
/**
* Select the language from the incoming JSON text
*/
@Override
public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
if(jsonParser == null) {
jsonParser = new ObjectMapper();
}
JsonNode jsonNode = jsonParser.readValue(value, JsonNode.class);
boolean isEnglish = jsonNode.has("user") && jsonNode.get("user").has("lang") && jsonNode.get("user").get("lang").asText().equals("en");
boolean hasText = jsonNode.has("text");
if (isEnglish && hasText) {
// message of tweet
StringTokenizer tokenizer = new StringTokenizer(jsonNode.get("text").asText());
// split the message
while (tokenizer.hasMoreTokens()) {
String result = tokenizer.nextToken().replaceAll("\\s*", "").toLowerCase();
if (!result.equals("")) {
out.collect(new Tuple2<>(result, 1));
}
}
}
}
}
}