/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.contrib.tweetinputformat.io; import org.apache.flink.contrib.tweetinputformat.model.tweet.Contributors; import org.apache.flink.contrib.tweetinputformat.model.tweet.Tweet; import org.apache.flink.contrib.tweetinputformat.model.tweet.entities.HashTags; import org.json.simple.parser.ContentHandler; import org.json.simple.parser.ParseException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; public class TweetHandler implements ContentHandler { private static final Logger logger = LoggerFactory.getLogger(TweetHandler.class); protected Tweet reuse; private int nesting = 0; private ObjectState objectState = ObjectState.TWEET; private EntryState entryState = EntryState.UNEXPECTED; private boolean sameHashTag = false; // to handle the coordinates special case of nesting primitive types private int coordinatesCounter = 0; private double coordinatesTemp = 0.0d; @Override public void startJSON() throws ParseException, IOException { sameHashTag = true; } @Override public void endJSON() throws ParseException, IOException { } @Override public boolean startObject() throws ParseException, IOException { nesting++; return true; } @Override public boolean endObject() throws ParseException, IOException { nesting--; if (this.nesting == 1) { this.objectState = ObjectState.TWEET; } // The handler in JSONParser checks for the "!contentHandler.endObject()", so we should // return false if its not the end of the object. return nesting > 0; } @Override public boolean startObjectEntry(String key) throws ParseException, IOException { if ((key.equals("contributors") || key.equals("user") || key.equals("geo") || key.equals("place") || key.equals("attributes") || key.equals("bounding_box"))) { objectState = ObjectState.valueOf(key.toUpperCase()); } else if (key.equals("hashtags") && nesting == 2) { objectState = ObjectState.valueOf(key.toUpperCase()); } else if (key.equals("coordinates") && (this.nesting == 1)) { objectState = ObjectState.valueOf(key.toUpperCase()); } else { try { entryState = EntryState.valueOf(key.toUpperCase()); } catch (IllegalArgumentException e) { logger.debug(e.getMessage()); } } return true; } @Override public boolean endObjectEntry() throws ParseException, IOException { if (objectState == ObjectState.CONTRIBUTORS && nesting == 1) { objectState = ObjectState.TWEET; } return true; } @Override public boolean startArray() throws ParseException, IOException { return true; } @Override public boolean endArray() throws ParseException, IOException { if (objectState == ObjectState.COORDINATES) { coordinatesCounter = 0; coordinatesTemp = 0.0d; } // Some tweets have HashTags twice, this condition to read only one of them if (objectState == ObjectState.HASHTAGS && entryState == EntryState.INDICES && nesting == 2) { sameHashTag = false; } return true; } @Override public boolean primitive(Object value) throws ParseException, IOException { try { if (objectState == ObjectState.TWEET) { tweetObjectStatePrimitiveHandler(value); } else if (objectState == ObjectState.USER) { userObjectStatePrimitiveHandler(value); } else if (objectState == ObjectState.GEO) { return true; } else if (objectState == ObjectState.COORDINATES) { coordinatesObjectStatePrimitiveHandler(value); } else if (objectState == ObjectState.PLACE) { placeObjectStatePrimitiveHandler(value); } else if (objectState == ObjectState.GEO) { return true; } else if (objectState == ObjectState.ATTRIBUTES) { placeAttributesObjectStatePrimitiveHandler(value); } else if (objectState == ObjectState.CONTRIBUTORS) { contributorsObjectStatePrimitiveHandler(value); } else if (objectState == ObjectState.HASHTAGS && entryState == EntryState.TEXT && sameHashTag) { hashTagsObjectStatePrimitiveHandler(value); } } catch (Exception e) { logger.debug("Error in primitive type: " + e.getMessage()); } return true; } public void tweetObjectStatePrimitiveHandler(Object value) { switch (entryState) { case CREATED_AT: if (value != null) { reuse.setCreated_at((String) value); } break; case TEXT: if (value != null) { reuse.setText((String) value); } break; case ID: if (value != null) { reuse.setId((Long) value); } break; case ID_STR: if (value != null) { reuse.setId_str((String) value); } break; case SOURCE: if (value != null) { reuse.setSource((String) value); } break; case TRUNCATED: if (value != null) { reuse.setTruncated((Boolean) value); } break; case IN_REPLY_TO_STATUS_ID: if (value != null) { reuse.setIn_reply_to_status_id((Long) value); } break; case IN_REPLY_TO_STATUS_ID_STR: if (value != null) { reuse.setIn_reply_to_status_id_str((String) value); } break; case IN_REPLY_TO_USER_ID: if (value != null) { reuse.setIn_reply_to_user_id((Long) value); } break; case IN_REPLY_TO_USER_ID_STR: if (value != null) { reuse.setIn_reply_to_user_id_str((String) value); } break; case IN_REPLY_TO_SCREEN_NAME: if (value != null) { reuse.setIn_reply_to_screen_name((String) value); } break; case RETWEET_COUNT: if (value != null) { reuse.setRetweet_count((Long) value); } break; case FAVORITE_COUNT: if (value != null) { reuse.setFavorite_count((Long) value); } break; case FAVORITED: if (value != null) { reuse.setFavorited((Boolean) value); } break; case RETWEETED: if (value != null) { reuse.setRetweeted((Boolean) value); } break; case POSSIBLY_SENSITIVE: if (value != null) { reuse.setPossibly_sensitive((Boolean) value); } break; case FILTER_LEVEL: if (value != null) { reuse.setFilter_level((String) value); } break; case LANG: if (value != null) { reuse.setLang((String) value); } break; } } public void userObjectStatePrimitiveHandler(Object value) { switch (entryState) { case ID: if (value != null) { // handle format exception caused by wrong values in the "id" field in the // tweets. if (value instanceof String) { try { reuse.getUser().setId(Long.parseLong((String) value)); } catch (NumberFormatException e) { reuse.getUser().setId(0L); logger.debug("This Tweet_ID is not a numeric type : " + (String) value); } } else { reuse.getUser().setId((Long) value); } } break; case ID_STR: if (value != null) { reuse.getUser().setId_str((String) value); } break; case NAME: if (value != null) { reuse.getUser().setName((String) value); } break; case SCREEN_NAME: if (value != null) { reuse.getUser().setScreen_name((String) value); } break; case LOCATION: if (value != null) { reuse.getUser().setLocation((String) value); } break; case URL: if (value != null) { reuse.getUser().setUrl((String) value); } break; case DESCRIPTION: if (value != null) { reuse.getUser().setDescription((String) value); } break; case PROTECTED: if (value != null) { reuse.getUser().setProtected_tweet((Boolean) value); } break; case VERIFIED: if (value != null) { reuse.getUser().setVerified((Boolean) value); } break; case FOLLOWERS_COUNT: if (value != null) { reuse.getUser().setFollowers_count((Long) value); } break; case FRIENDS_COUNT: if (value != null) { reuse.getUser().setFriends_count((Long) value); } break; case LISTED_COUNT: if (value != null) { reuse.getUser().setListed_count((Long) value); } break; case FAVOURITES_COUNT: if (value != null) { reuse.getUser().setFavourites_count((Long) value); } break; case STATUSES_COUNT: if (value != null) { reuse.getUser().setStatuses_count((Long) value); } break; case CREATED_AT: if (value != null) { reuse.getUser().setCreated_at((String) value); } break; case UTC_OFFSET: if (value != null) { reuse.getUser().setUtc_offset((Long) value); } break; case TIME_ZONE: if (value != null) { reuse.getUser().setTime_zone((String) value); } break; case GEO_ENABLED: if (value != null) { reuse.getUser().setGeo_enabled((Boolean) value); } break; case LANG: if (value != null) { reuse.getUser().setLang((String) value); } break; case CONTRIBUTORS_ENABLED: if (value != null) { reuse.getUser().setContributors_enabled((Boolean) value); } break; case IS_TRANSLATOR: if (value != null) { reuse.getUser().setIs_translator((Boolean) value); } break; case PROFILE_BACKGROUND_COLOR: if (value != null) { reuse.getUser().setProfile_background_color((String) value); } break; case PROFILE_BACKGROUND_IMAGE_URL: if (value != null) { reuse.getUser().setProfile_background_image_url((String) value); } break; case PROFILE_BACKGROUND_IMAGE_URL_HTTPS: if (value != null) { reuse.getUser().setProfile_background_image_url_https((String) value); } break; case PROFILE_BACKGROUND_TILE: if (value != null) { reuse.getUser().setProfile_background_tile((Boolean) value); } break; case PROFILE_LINK_COLOR: if (value != null) { reuse.getUser().setProfile_link_color((String) value); } break; case PROFILE_SIDEBAR_BORDER_COLOR: if (value != null) { reuse.getUser().setProfile_sidebar_border_color((String) value); } break; case PROFILE_SIDEBAR_FILL_COLOR: if (value != null) { reuse.getUser().setProfile_sidebar_fill_color((String) value); } break; case PROFILE_TEXT_COLOR: if (value != null) { reuse.getUser().setProfile_text_color((String) value); } break; case PROFILE_USE_BACKGROUND_IMAGE: if (value != null) { reuse.getUser().setProfile_use_background_image((Boolean) value); } break; case PROFILE_IMAGE_URL: if (value != null) { reuse.getUser().setProfile_image_url((String) value); } break; case PROFILE_IMAGE_URL_HTTPS: if (value != null) { reuse.getUser().setProfile_image_url_https((String) value); } break; case PROFILE_BANNER_URL: if (value != null) { reuse.getUser().setProfile_banner_url((String) value); } break; case DEFAULT_PROFILE: if (value != null) { reuse.getUser().setDefault_profile((Boolean) value); } break; case DEFAULT_PROFILE_IMAGE: if (value != null) { reuse.getUser().setDefault_profile_image((Boolean) value); } break; case FOLLOWING: if (value != null) { reuse.getUser().setFollowing((Boolean) value); } break; case FOLLOW_REQUEST_SENT: if (value != null) { reuse.getUser().setFollow_request_sent((Boolean) value); } break; case NOTIFICATIONS: if (value != null) { reuse.getUser().setNotifications((Boolean) value); } break; } } public void coordinatesObjectStatePrimitiveHandler(Object value) { switch (entryState) { case COORDINATES: if (value != null && this.coordinatesCounter == 0) { coordinatesTemp = (Double) value; this.coordinatesCounter++; } else if (value != null && this.coordinatesCounter == 1) { reuse.getCoordinates().setCoordinates(coordinatesTemp, (Double) value); } else { reuse.getCoordinates().setCoordinates(0.0d, 0.0d); } break; } } public void placeObjectStatePrimitiveHandler(Object value) { switch (entryState) { case ID: if (value != null) { reuse.getPlace().setId((String) value); } break; case URL: if (value != null) { reuse.getPlace().setUrl((String) value); } break; case PLACE_TYPE: if (value != null) { reuse.getPlace().setPlace_type((String) value); } break; case NAME: if (value != null) { reuse.getPlace().setName((String) value); } break; case FULL_NAME: if (value != null) { reuse.getPlace().setFull_name((String) value); } break; case COUNTRY_CODE: if (value != null) { reuse.getPlace().setCountry_code((String) value); } break; case COUNTRY: if (value != null) { reuse.getPlace().setCountry((String) value); } break; // Skipped BoundingBox -- Not Required } } public void placeAttributesObjectStatePrimitiveHandler(Object value) { switch (entryState) { case STREET_ADDRESS: if (value != null) { reuse.getPlace().getAttributes().setStreet_address((String) value); } break; case LOCALITY: if (value != null) { reuse.getPlace().getAttributes().setLocality((String) value); } break; case REGION: if (value != null) { reuse.getPlace().getAttributes().setRegion((String) value); } break; case ISO3: if (value != null) { reuse.getPlace().getAttributes().setIso3((String) value); } break; case POSTAL_CODE: if (value != null) { reuse.getPlace().getAttributes().setPostal_code((String) value); } break; case PHONE: if (value != null) { reuse.getPlace().getAttributes().setPhone((String) value); } break; case URL: if (value != null) { reuse.getPlace().getAttributes().setUrl((String) value); } break; case APP_ID: if (value != null) { reuse.getPlace().getAttributes().setAppId((String) value); } break; // Skipped BoundingBox -- Not Required } } public void contributorsObjectStatePrimitiveHandler(Object value) { // to handle the case of the null as contributors is an array in the Twitter documentation // && if it is not null we initialize the object and fill it with the data, if (value == null) { reuse.getContributors().add(new Contributors()); } else { Contributors contributor = new Contributors(); switch (entryState) { case ID: if (value != null) { contributor.setId((Long) value); } break; case ID_STR: if (value != null) { contributor.setId_str((String) value); } break; case TWEET_CONTRIBUTORS_SCREEN_NAME: if (value != null) { contributor.setScreenName((String) value); } break; } reuse.getContributors().add(contributor); } } public void hashTagsObjectStatePrimitiveHandler(Object value) { HashTags hashTag = new HashTags(); if (value == null) { return; } else if (entryState == EntryState.TEXT && value != null) { hashTag.setText((String) value, false); reuse.getEntities().getHashtags().add(hashTag); } } private static enum ObjectState { TWEET, CONTRIBUTORS, USER, GEO, COORDINATES, PLACE, ATTRIBUTES, BOUNDING_BOX, HASHTAGS; } private static enum EntryState { TEXT, CREATED_AT, ID, ID_STR, SOURCE, TRUNCATED, IN_REPLY_TO_STATUS_ID, IN_REPLY_TO_STATUS_ID_STR, IN_REPLY_TO_USER_ID, IN_REPLY_TO_USER_ID_STR, IN_REPLY_TO_SCREEN_NAME, RETWEET_COUNT, FAVORITE_COUNT, FAVORITED, RETWEETED, POSSIBLY_SENSITIVE, FILTER_LEVEL, TWEET_CONTRIBUTORS_SCREEN_NAME, SCREEN_NAME, LOCATION, DESCRIPTION, PROTECTED, VERIFIED, FOLLOWERS_COUNT, FRIENDS_COUNT, LISTED_COUNT, FAVOURITES_COUNT, STATUSES_COUNT, UTC_OFFSET, TIME_ZONE, GEO_ENABLED, LANG, CONTRIBUTORS_ENABLED, IS_TRANSLATOR, PROFILE_BACKGROUND_COLOR, PROFILE_BACKGROUND_IMAGE_URL, PROFILE_BACKGROUND_IMAGE_URL_HTTPS, PROFILE_BACKGROUND_TILE, PROFILE_LINK_COLOR, PROFILE_SIDEBAR_BORDER_COLOR, PROFILE_SIDEBAR_FILL_COLOR, PROFILE_TEXT_COLOR, PROFILE_USE_BACKGROUND_IMAGE, PROFILE_IMAGE_URL, PROFILE_IMAGE_URL_HTTPS, PROFILE_BANNER_URL, DEFAULT_PROFILE, DEFAULT_PROFILE_IMAGE, FOLLOWING, FOLLOW_REQUEST_SENT, NOTIFICATIONS, TYPE, COORDINATES, PLACE_TYPE, NAME, FULL_NAME, COUNTRY_CODE, COUNTRY, BOUNDING_BOX, ATTRIBUTES, STREET_ADDRESS, LOCALITY, REGION, ISO3, POSTAL_CODE, PHONE, URL, ENTITIES, HASHTAGS, TRENDS, URLS, USER_MENTIONS, SYMBOLS, MEDIA, INDICES, MEDIA_URL, MEDIA_URL_HTTPS, DISPLAY_URL, EXPANDED_URL, SIZES, LARGE, W, H, RESIZE, SMALL, THUMB, MEDIUM, RETWEETED_STATUS, SOURCE_STATUS_ID, SOURCE_STATUS_ID_STR, SCOPES, FOLLOWERS, APP_ID, UNEXPECTED; } }