/**
 * This software is licensed to you under the Apache License, Version 2.0 (the
 * "Apache License").
 *
 * LinkedIn's contributions are made under the Apache License. If you contribute
 * to the Software, the contributions will be deemed to have been made under the
 * Apache License, unless you expressly indicate otherwise. Please do not make any
 * contributions that would be inconsistent with the Apache License.
 *
 * You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, this software
 * distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
 * License for the specific language governing permissions and limitations for the
 * software governed under the Apache License.
 *
 * © 2012 LinkedIn Corp. All Rights Reserved.
 */
package com.senseidb.example.tweets.gateway;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.json.JSONObject;

import proj.zoie.api.DataConsumer.DataEvent;
import proj.zoie.impl.indexing.StreamDataProvider;

/**
 * Stream data provider that reads the Twitter sample stream line by line and
 * turns every status into a flat JSON document.
 *
 * <p>Each emitted document carries the fields {@code id}, {@code time},
 * {@code tweeter}, {@code hashtags} (only when at least one hashtag was found),
 * {@code contents} and {@code tweet} (the original status object).
 */
public class TwitterSampleStreamer extends StreamDataProvider<JSONObject> {

  private static final Logger logger = Logger.getLogger(TwitterSampleStreamer.class);

  /** Endpoint the constructor connects to. */
  private static final String SAMPLE_STREAM_URL = "https://stream.twitter.com/1/statuses/sample.json";

  /** Layout of the {@code created_at} field, e.g. {@code Wed Aug 27 13:08:45 +0000 2008}. */
  private static final String CREATED_AT_FORMAT = "EEE MMM dd HH:mm:ss Z yyyy";

  // following hashtag extraction logic taken from twitter-text-java: https://github.com/twitter/twitter-text-java
  private static final String LATIN_ACCENTS_CHARS = "\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u00ff\\u015f";
  private static final String HASHTAG_ALPHA_CHARS = "a-z" + LATIN_ACCENTS_CHARS
      + "\\u0400-\\u04ff\\u0500-\\u0527" // Cyrillic
      // The range separators below must be ASCII hyphens; an en dash would be
      // taken as a literal member of the character class instead of a range.
      + "\\u2de0-\\u2dff\\ua640-\\ua69f" // Cyrillic Extended A/B
      + "\\u1100-\\u11ff\\u3130-\\u3185\\uA960-\\uA97F\\uAC00-\\uD7AF\\uD7B0-\\uD7FF" // Hangul (Korean)
      + "\\p{InHiragana}\\p{InKatakana}" // Japanese Hiragana and Katakana
      + "\\p{InCJKUnifiedIdeographs}" // Japanese Kanji / Chinese Han
      + "\\u3005\\u303b" // Kanji/Han iteration marks
      + "\\uff21-\\uff3a\\uff41-\\uff5a" // full width Alphabet
      + "\\uff66-\\uff9f" // half width Katakana
      + "\\uffa1-\\uffdc"; // half width Hangul (Korean)
  private static final String HASHTAG_ALPHA_NUMERIC_CHARS = "0-9\\uff10-\\uff19_" + HASHTAG_ALPHA_CHARS;
  private static final String HASHTAG_ALPHA = "[" + HASHTAG_ALPHA_CHARS + "]";
  private static final String HASHTAG_ALPHA_NUMERIC = "[" + HASHTAG_ALPHA_NUMERIC_CHARS + "]";

  /**
   * Matches a hashtag: group 1 is the preceding boundary, group 2 the hash sign
   * (ASCII or full width), group 3 the tag text, which must contain at least one
   * non-numeric hashtag character.
   */
  public static final Pattern AUTO_LINK_HASHTAGS = Pattern.compile(
      "(^|[^&/" + HASHTAG_ALPHA_NUMERIC_CHARS + "])(#|\uFF03)("
          + HASHTAG_ALPHA_NUMERIC + "*" + HASHTAG_ALPHA + HASHTAG_ALPHA_NUMERIC + "*)",
      Pattern.CASE_INSENSITIVE);

  /**
   * Matches text that invalidates a preceding hashtag candidate: another hash
   * sign (ASCII or full width, mirroring {@link #AUTO_LINK_HASHTAGS}) or
   * {@code ://}.
   */
  public static final Pattern HASHTAG_MATCH_END = Pattern.compile("^(?:[#\uFF03]|://)");

  private final BufferedReader _tweetReader;

  /**
   * Opens the sample stream using HTTP basic authentication.
   *
   * @param config            must contain the keys {@code username} and {@code password}
   * @param versionComparator passed through to the superclass
   * @throws IllegalArgumentException if a credential is missing from {@code config}
   * @throws Exception                if the connection cannot be opened
   */
  public TwitterSampleStreamer(Map<String, String> config, Comparator<String> versionComparator) throws Exception {
    super(versionComparator);

    String username = config.get("username");
    String password = config.get("password");
    if (username == null || password == null) {
      throw new IllegalArgumentException("config must provide both 'username' and 'password'");
    }

    URLConnection uc = new URL(SAMPLE_STREAM_URL).openConnection();

    // java.util.Base64 (Java 8+) replaces the internal sun.misc.BASE64Encoder.
    // The basic encoder emits no line separators, so long credentials cannot
    // break the header value, and the charset is pinned rather than
    // platform-dependent.
    String userPassword = username + ":" + password;
    String encoding = Base64.getEncoder().encodeToString(userPassword.getBytes(StandardCharsets.UTF_8));
    uc.setRequestProperty("Authorization", "Basic " + encoding);

    InputStream in = uc.getInputStream();
    BufferedReader reader;
    try {
      reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
    } catch (RuntimeException e) {
      in.close(); // do not leak the connection if wrapping fails
      throw e;
    }
    _tweetReader = reader;
  }

  /**
   * Collects the text of every hashtag found in {@code text}.
   *
   * @param text text to scan; {@code null} is treated as empty
   * @return hashtags without the leading hash sign, in order of appearance; never {@code null}
   */
  private static List<String> extractHashtags(String text) {
    List<String> extracted = new ArrayList<String>();
    if (text == null) {
      return extracted;
    }
    Matcher matcher = AUTO_LINK_HASHTAGS.matcher(text);
    while (matcher.find()) {
      // A candidate directly followed by another hash sign or "://" is rejected.
      String after = text.substring(matcher.end());
      if (!HASHTAG_MATCH_END.matcher(after).find()) {
        extracted.add(matcher.group(3));
      }
    }
    return extracted;
  }

  /**
   * Reads the next line from the stream and converts it into an event.
   *
   * @return the event, or {@code null} when the stream has ended, the line is
   *         blank, the message carries no {@code id_str}, or the line could not
   *         be processed (the failure is logged)
   */
  @Override
  public DataEvent<JSONObject> next() {
    try {
      String tweet = _tweetReader.readLine();
      if (tweet == null) {
        // End of stream: nothing to parse, so skip the JSON error path.
        logger.warn("tweet stream reached end of input");
        return null;
      }
      if (tweet.trim().length() == 0) {
        // Blank lines carry no status (NOTE(review): presumably keep-alives).
        return null;
      }

      logger.info("tweet: " + tweet);
      JSONObject jsonObj = new JSONObject(tweet);
      String id = jsonObj.optString("id_str", null);
      if (id == null) {
        // Messages without an id are not indexed.
        return null;
      }
      return toEvent(id, jsonObj);
    } catch (Exception e) {
      logger.error(e.getMessage(), e);
      return null;
    }
  }

  /** Builds the flat document for one status and wraps it in an event versioned by the current time. */
  private static DataEvent<JSONObject> toEvent(String id, JSONObject jsonObj) throws Exception {
    JSONObject tweetJSON = new JSONObject();
    tweetJSON.put("id", Long.parseLong(id));

    String textString = jsonObj.optString("text", "");
    tweetJSON.put("time", parseCreatedAt(jsonObj.getString("created_at")));

    // "user" may be absent; fall back to an empty screen name instead of failing.
    JSONObject user = jsonObj.optJSONObject("user");
    String screenName = (user == null) ? "" : user.optString("screen_name", "");
    tweetJSON.put("tweeter", screenName);

    List<String> hashtags = extractHashtags(textString);

    // "contents" is text, screen name and every hashtag, each on its own line.
    StringBuilder contentBuilder = new StringBuilder();
    contentBuilder.append(textString).append("\n");
    contentBuilder.append(screenName).append("\n");

    if (!hashtags.isEmpty()) {
      StringBuilder buf = new StringBuilder();
      for (String tag : hashtags) {
        if (buf.length() > 0) {
          buf.append(",");
        }
        buf.append(tag);
        contentBuilder.append(tag).append("\n");
      }
      tweetJSON.put("hashtags", buf.toString());
    }

    tweetJSON.put("contents", contentBuilder.toString());
    tweetJSON.put("tweet", jsonObj);

    if (logger.isDebugEnabled()) {
      logger.debug("event: " + tweetJSON.toString());
    }
    return new DataEvent<JSONObject>(tweetJSON, String.valueOf(System.currentTimeMillis()));
  }

  /**
   * Parses a {@code created_at} value into epoch milliseconds.
   *
   * <p>The formatter is created per call because {@link SimpleDateFormat} is not
   * thread-safe, and it is pinned to an English locale because the day and month
   * names in the field are English regardless of the JVM default locale.
   */
  private static long parseCreatedAt(String createdAt) throws java.text.ParseException {
    return new SimpleDateFormat(CREATED_AT_FORMAT, Locale.ENGLISH).parse(createdAt).getTime();
  }

  /** No-op: this provider ignores the requested starting version. */
  @Override
  public void setStartingOffset(String version) {
  }

  /** No-op: this provider has nothing to reset. */
  @Override
  public void reset() {
  }

  /** Closes the stream reader, logging any failure, and then always delegates to the superclass. */
  @Override
  public void stop() {
    try {
      _tweetReader.close();
    } catch (Exception e) {
      logger.error(e.getMessage(), e);
    } finally {
      super.stop();
    }
  }
}