/* * Copyright 2013 Cloudera Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.cloudera.cdk.morphline.twitter; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.text.SimpleDateFormat; import java.util.Collection; import java.util.Collections; import java.util.Locale; import java.util.Random; import java.util.zip.GZIPInputStream; import com.cloudera.cdk.morphline.api.Command; import com.cloudera.cdk.morphline.api.CommandBuilder; import com.cloudera.cdk.morphline.api.MorphlineContext; import com.cloudera.cdk.morphline.api.Record; import com.cloudera.cdk.morphline.base.Fields; import com.cloudera.cdk.morphline.stdio.AbstractParser; import com.fasterxml.jackson.core.JsonParseException; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.MappingIterator; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; import com.typesafe.config.Config; /** * JSON parser that extracts search documents from twitter tweets obtained from the twitter 1% sample firehose with the delimited=length option. * For background see https://dev.twitter.com/docs/api/1.1/get/statuses/sample. * * The JSON input format is documented at https://dev.twitter.com/docs/platform-objects/tweets */ public final class ReadJsonTestTweetsBuilder implements CommandBuilder { //public static final String MEDIA_TYPE = "mytwittertest/json+delimited+length"; @Override public Collection<String> getNames() { return Collections.singletonList("readJsonTestTweets"); } @Override public Command build(Config config, Command parent, Command child, MorphlineContext context) { return new ReadJsonTestTweets(this, config, parent, child, context); } /////////////////////////////////////////////////////////////////////////////// // Nested classes: /////////////////////////////////////////////////////////////////////////////// private static final class ReadJsonTestTweets extends AbstractParser { private final boolean isLengthDelimited; private String idPrefix; private final ObjectReader reader = new ObjectMapper().reader(JsonNode.class); // Fri May 14 02:52:55 +0000 2010 private SimpleDateFormat formatterFrom = new SimpleDateFormat("EEE MMM dd HH:mm:ss Z yyyy", Locale.US); private SimpleDateFormat formatterTo = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.US); public ReadJsonTestTweets(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) { super(builder, config, parent, child, context); this.isLengthDelimited = getConfigs().getBoolean(config, "isLengthDelimited", true); this.idPrefix = getConfigs().getString(config, "idPrefix", null); if ("random".equals(idPrefix)) { idPrefix = String.valueOf(new Random().nextInt()); } else if (idPrefix == null) { idPrefix = ""; } validateArguments(); } @Override protected boolean doProcess(Record record, InputStream in) throws IOException { String name = (String) record.getFirstValue(Fields.ATTACHMENT_NAME); if (name != null && name.endsWith(".gz")) { in = new GZIPInputStream(in, 64 * 1024); } long numRecords = 0; BufferedReader bufferedReader = null; MappingIterator<JsonNode> iter = null; if (isLengthDelimited) { bufferedReader = new BufferedReader(new InputStreamReader(in, "UTF-8")); } else { iter = reader.readValues(in); } try { while (true) { JsonNode rootNode; if (isLengthDelimited) { String json = nextLine(bufferedReader); if (json == null) { break; } try { // src can be a File, URL, InputStream, etc rootNode = reader.readValue(json); } catch (JsonParseException e) { LOG.info("json parse exception after " + numRecords + " records"); LOG.debug("json parse exception after " + numRecords + " records", e); break; } } else { if (!iter.hasNext()) { break; } rootNode = iter.next(); } Record doc = new Record(); JsonNode user = rootNode.get("user"); JsonNode idNode = rootNode.get("id_str"); if (idNode == null || idNode.textValue() == null) { continue; // skip } doc.put("id", idPrefix + idNode.textValue()); tryAddDate(doc, "created_at", rootNode.get("created_at")); tryAddString(doc, "source", rootNode.get("source")); tryAddString(doc, "text", rootNode.get("text")); tryAddInt(doc, "retweet_count", rootNode.get("retweet_count")); tryAddBool(doc, "retweeted", rootNode.get("retweeted")); tryAddLong(doc, "in_reply_to_user_id", rootNode.get("in_reply_to_user_id")); tryAddLong(doc, "in_reply_to_status_id", rootNode.get("in_reply_to_status_id")); tryAddString(doc, "media_url_https", rootNode.get("media_url_https")); tryAddString(doc, "expanded_url", rootNode.get("expanded_url")); tryAddInt(doc, "user_friends_count", user.get("friends_count")); tryAddString(doc, "user_location", user.get("location")); tryAddString(doc, "user_description", user.get("description")); tryAddInt(doc, "user_statuses_count", user.get("statuses_count")); tryAddInt(doc, "user_followers_count", user.get("followers_count")); tryAddString(doc, "user_screen_name", user.get("screen_name")); tryAddString(doc, "user_name", user.get("name")); incrementNumRecords(); LOG.debug("tweetdoc: {}", doc); if (!getChild().process(doc)) { return false; } numRecords++; } } finally { if (iter != null) { iter.close(); } LOG.debug("processed {} records", numRecords); } return true; } private String nextLine(BufferedReader reader) throws IOException { String line; while ((line = reader.readLine()) != null) { if (line.length() > 0) break; // ignore empty lines } if (line == null) return null; Integer.parseInt(line); // sanity check while ((line = reader.readLine()) != null) { if (line.length() > 0) break; // ignore empty lines } return line; } private void tryAddDate(Record doc, String solr_field, JsonNode node) { if (node == null) return; String val = node.asText(); if (val == null) { return; } try { // String tmp = formatterTo.format(formatterFrom.parse(val.trim())); doc.put(solr_field, formatterTo.format(formatterFrom.parse(val.trim()))); } catch (Exception e) { LOG.error("Could not parse date " + val); // ++exceptionCount; } } private void tryAddLong(Record doc, String solr_field, JsonNode node) { if (node == null) return; Long val = node.asLong(); if (val == null) { return; } doc.put(solr_field, val); } private void tryAddInt(Record doc, String solr_field, JsonNode node) { if (node == null) return; Integer val = node.asInt(); if (val == null) { return; } doc.put(solr_field, val); } private void tryAddBool(Record doc, String solr_field, JsonNode node) { if (node == null) return; Boolean val = node.asBoolean(); if (val == null) { return; } doc.put(solr_field, val); } private void tryAddString(Record doc, String solr_field, JsonNode node) { if (node == null) return; String val = node.asText(); if (val == null) { return; } doc.put(solr_field, val); } } }