/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.morphline.twitter;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Collections;
import java.util.Locale;
import java.util.Random;
import java.util.zip.GZIPInputStream;
import com.cloudera.cdk.morphline.api.Command;
import com.cloudera.cdk.morphline.api.CommandBuilder;
import com.cloudera.cdk.morphline.api.MorphlineContext;
import com.cloudera.cdk.morphline.api.Record;
import com.cloudera.cdk.morphline.base.Fields;
import com.cloudera.cdk.morphline.stdio.AbstractParser;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.MappingIterator;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectReader;
import com.typesafe.config.Config;
/**
* JSON parser that extracts search documents from twitter tweets obtained from the twitter 1% sample firehose with the delimited=length option.
* For background see https://dev.twitter.com/docs/api/1.1/get/statuses/sample.
*
* The JSON input format is documented at https://dev.twitter.com/docs/platform-objects/tweets
*/
public final class ReadJsonTestTweetsBuilder implements CommandBuilder {
//public static final String MEDIA_TYPE = "mytwittertest/json+delimited+length";
@Override
public Collection<String> getNames() {
return Collections.singletonList("readJsonTestTweets");
}
@Override
public Command build(Config config, Command parent, Command child, MorphlineContext context) {
return new ReadJsonTestTweets(this, config, parent, child, context);
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
private static final class ReadJsonTestTweets extends AbstractParser {
private final boolean isLengthDelimited;
private String idPrefix;
private final ObjectReader reader = new ObjectMapper().reader(JsonNode.class);
// Fri May 14 02:52:55 +0000 2010
private SimpleDateFormat formatterFrom = new SimpleDateFormat("EEE MMM dd HH:mm:ss Z yyyy", Locale.US);
private SimpleDateFormat formatterTo = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.US);
public ReadJsonTestTweets(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
super(builder, config, parent, child, context);
this.isLengthDelimited = getConfigs().getBoolean(config, "isLengthDelimited", true);
this.idPrefix = getConfigs().getString(config, "idPrefix", null);
if ("random".equals(idPrefix)) {
idPrefix = String.valueOf(new Random().nextInt());
} else if (idPrefix == null) {
idPrefix = "";
}
validateArguments();
}
@Override
protected boolean doProcess(Record record, InputStream in) throws IOException {
String name = (String) record.getFirstValue(Fields.ATTACHMENT_NAME);
if (name != null && name.endsWith(".gz")) {
in = new GZIPInputStream(in, 64 * 1024);
}
long numRecords = 0;
BufferedReader bufferedReader = null;
MappingIterator<JsonNode> iter = null;
if (isLengthDelimited) {
bufferedReader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
} else {
iter = reader.readValues(in);
}
try {
while (true) {
JsonNode rootNode;
if (isLengthDelimited) {
String json = nextLine(bufferedReader);
if (json == null) {
break;
}
try {
// src can be a File, URL, InputStream, etc
rootNode = reader.readValue(json);
} catch (JsonParseException e) {
LOG.info("json parse exception after " + numRecords + " records");
LOG.debug("json parse exception after " + numRecords + " records", e);
break;
}
} else {
if (!iter.hasNext()) {
break;
}
rootNode = iter.next();
}
Record doc = new Record();
JsonNode user = rootNode.get("user");
JsonNode idNode = rootNode.get("id_str");
if (idNode == null || idNode.textValue() == null) {
continue; // skip
}
doc.put("id", idPrefix + idNode.textValue());
tryAddDate(doc, "created_at", rootNode.get("created_at"));
tryAddString(doc, "source", rootNode.get("source"));
tryAddString(doc, "text", rootNode.get("text"));
tryAddInt(doc, "retweet_count", rootNode.get("retweet_count"));
tryAddBool(doc, "retweeted", rootNode.get("retweeted"));
tryAddLong(doc, "in_reply_to_user_id", rootNode.get("in_reply_to_user_id"));
tryAddLong(doc, "in_reply_to_status_id", rootNode.get("in_reply_to_status_id"));
tryAddString(doc, "media_url_https", rootNode.get("media_url_https"));
tryAddString(doc, "expanded_url", rootNode.get("expanded_url"));
tryAddInt(doc, "user_friends_count", user.get("friends_count"));
tryAddString(doc, "user_location", user.get("location"));
tryAddString(doc, "user_description", user.get("description"));
tryAddInt(doc, "user_statuses_count", user.get("statuses_count"));
tryAddInt(doc, "user_followers_count", user.get("followers_count"));
tryAddString(doc, "user_screen_name", user.get("screen_name"));
tryAddString(doc, "user_name", user.get("name"));
incrementNumRecords();
LOG.debug("tweetdoc: {}", doc);
if (!getChild().process(doc)) {
return false;
}
numRecords++;
}
} finally {
if (iter != null) {
iter.close();
}
LOG.debug("processed {} records", numRecords);
}
return true;
}
private String nextLine(BufferedReader reader) throws IOException {
String line;
while ((line = reader.readLine()) != null) {
if (line.length() > 0)
break; // ignore empty lines
}
if (line == null)
return null;
Integer.parseInt(line); // sanity check
while ((line = reader.readLine()) != null) {
if (line.length() > 0)
break; // ignore empty lines
}
return line;
}
private void tryAddDate(Record doc, String solr_field, JsonNode node) {
if (node == null)
return;
String val = node.asText();
if (val == null) {
return;
}
try {
// String tmp = formatterTo.format(formatterFrom.parse(val.trim()));
doc.put(solr_field, formatterTo.format(formatterFrom.parse(val.trim())));
} catch (Exception e) {
LOG.error("Could not parse date " + val);
// ++exceptionCount;
}
}
private void tryAddLong(Record doc, String solr_field, JsonNode node) {
if (node == null)
return;
Long val = node.asLong();
if (val == null) {
return;
}
doc.put(solr_field, val);
}
private void tryAddInt(Record doc, String solr_field, JsonNode node) {
if (node == null)
return;
Integer val = node.asInt();
if (val == null) {
return;
}
doc.put(solr_field, val);
}
private void tryAddBool(Record doc, String solr_field, JsonNode node) {
if (node == null)
return;
Boolean val = node.asBoolean();
if (val == null) {
return;
}
doc.put(solr_field, val);
}
private void tryAddString(Record doc, String solr_field, JsonNode node) {
if (node == null)
return;
String val = node.asText();
if (val == null) {
return;
}
doc.put(solr_field, val);
}
}
}