/**
* TwitterScraper
* Copyright 22.02.2015 by Michael Peter Christen, @0rb1t3r
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package org.loklak.harvester;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.eclipse.jetty.util.log.Log;
import org.loklak.data.DAO;
import org.loklak.data.IncomingMessageBuffer;
import org.loklak.http.ClientConnection;
import org.loklak.objects.MessageEntry;
import org.loklak.objects.ProviderType;
import org.loklak.objects.SourceType;
import org.loklak.objects.Timeline;
import org.loklak.objects.UserEntry;
public class TwitterScraper {
/**
 * Shared worker pool with a fixed size of 40 threads. The scraper submits those tweets to it
 * whose postprocessing is expected to be slow (see TwitterTweet.willBeTimeConsuming()).
 */
public final static ExecutorService executor = Executors.newFixedThreadPool(40);
/**
 * Scrape the twitter search page for the given query and wait a bounded time for the
 * postprocessing of the found messages.
 * Messages which finished their postprocessing in time are merged into the returned timeline;
 * messages which are still being processed when the time is over are left out.
 * @param query the search query
 * @param order the order of the resulting timeline
 * @param writeToIndex passed through to every scraped tweet, see TwitterTweet.run()
 * @param writeToBackend passed through to every scraped tweet, see TwitterTweet.run()
 * @param jointime the time in milliseconds to wait in total for messages in postprocessing;
 *        every single message is waited for at least 10 milliseconds
 * @return the timeline of finished messages, never null
 */
public static Timeline search(
        final String query,
        final Timeline.Order order,
        final boolean writeToIndex,
        final boolean writeToBackend,
        int jointime) {
    Timeline[] tl = search(query, order, writeToIndex, writeToBackend);
    if (tl == null) {
        // the scraper could not produce any result (i.e. no response stream); answer with an empty timeline
        // instead of failing with a NullPointerException
        Timeline empty = new Timeline(order);
        empty.setScraperInfo("local");
        return empty;
    }
    long timeout = System.currentTimeMillis() + jointime;
    for (MessageEntry me : tl[1]) {
        assert me instanceof TwitterTweet;
        TwitterTweet tt = (TwitterTweet) me;
        // never wait less than 10 ms, even if the join time is already used up
        long remainingWait = Math.max(10, timeout - System.currentTimeMillis());
        if (tt.waitReady(remainingWait)) tl[0].add(tt, tt.getUser()); // double additions are detected
    }
    return tl[0];
}
/**
 * Compute the url of the twitter search page for a query.
 * '+' characters in the query are treated as spaces. For the modifiers "since:" and "until:"
 * everything starting with the first '_' is cut off.
 * A query without any term is replaced by "*".
 * check
 * https://twitter.com/search-advanced for a better syntax
 * https://support.twitter.com/articles/71577-how-to-use-advanced-twitter-search#
 * @param query the search query, must not be null
 * @return the https url of the search page
 */
private static String prepareSearchURL(final String query) {
    StringBuilder t = new StringBuilder(query.length());
    for (String s : query.replace('+', ' ').split(" ")) {
        t.append(' '); // every term is prefixed with a space; the first one is cut away below
        if (s.startsWith("since:") || s.startsWith("until:")) {
            int u = s.indexOf('_');
            t.append(u < 0 ? s : s.substring(0, u));
        } else {
            t.append(s);
        }
    }
    String q;
    try {
        // "".split(" ") returns one empty term while " ".split(" ") returns none: in both cases nothing
        // but (at most) the leading space was collected and we fall back to the wildcard
        q = t.length() <= 1 ? "*" : URLEncoder.encode(t.substring(1), "UTF-8");
    } catch (UnsupportedEncodingException e) {
        // cannot happen, every java platform must support UTF-8; an empty url is returned as it was done before
        return "";
    }
    //https://twitter.com/search?f=tweets&vertical=default&q=kaffee&src=typd
    return "https://twitter.com/search?f=tweets&vertical=default&q=" + q + "&src=typd";
}
/**
 * Load the twitter search page for the given query and scrape the messages from it.
 * Failures to connect or to read are not reported to the caller; two empty timelines are returned then.
 * @param query the search query
 * @param order the order of the resulting timelines
 * @param writeToIndex passed through to every scraped tweet
 * @param writeToBackend passed through to every scraped tweet
 * @return two timelines in one array, never null: Timeline[0] is the one which is finished to be used,
 *         Timeline[1] contains messages which are in postprocessing
 */
private static Timeline[] search(
        final String query,
        final Timeline.Order order,
        final boolean writeToIndex,
        final boolean writeToBackend) {
    String https_url = prepareSearchURL(query);
    Timeline[] timelines = null;
    try {
        ClientConnection connection = new ClientConnection(https_url);
        // without a response stream there is nothing to parse; this is handled like a failed connection
        // (empty timelines) because all callers dereference the result
        if (connection.inputStream != null) {
            try {
                BufferedReader br = new BufferedReader(new InputStreamReader(connection.inputStream, StandardCharsets.UTF_8));
                timelines = search(br, order, writeToIndex, writeToBackend);
            } catch (IOException e) {
                Log.getLog().warn(e);
            } finally {
                connection.close();
            }
        }
    } catch (IOException e) {
        // this could mean that twitter rejected the connection (DoS protection?) or we are offline (we should be silent then)
        // Log.getLog().warn(e);
    }
    if (timelines == null) {
        // nothing was scraped: answer with empty timelines
        timelines = new Timeline[]{new Timeline(order), new Timeline(order)};
    }
    if (timelines[0] != null) timelines[0].setScraperInfo("local");
    if (timelines[1] != null) timelines[1].setScraperInfo("local");
    return timelines;
}
/**
 * Scrape messages from a file which contains a stored twitter search page (read as UTF-8).
 * A failure to read the file is logged; two empty timelines are returned then.
 * @param file the file with the html of a search result page
 * @param order the order of the resulting timelines
 * @param writeToIndex passed through to every scraped tweet
 * @param writeToBackend passed through to every scraped tweet
 * @return two timelines in one array, never null: Timeline[0] is the one which is finished to be used,
 *         Timeline[1] contains messages which are in postprocessing
 */
private static Timeline[] parse(
        final File file,
        final Timeline.Order order,
        final boolean writeToIndex,
        final boolean writeToBackend) {
    Timeline[] timelines = null;
    // try-with-resources: the file handle is released also if the parser stops with an exception
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
        timelines = search(br, order, writeToIndex, writeToBackend);
    } catch (IOException e) {
        Log.getLog().warn(e);
    }
    if (timelines == null) timelines = new Timeline[]{new Timeline(order), new Timeline(order)};
    if (timelines[0] != null) timelines[0].setScraperInfo("local");
    if (timelines[1] != null) timelines[1].setScraperInfo("local");
    return timelines;
}
/**
 * scrape messages from the reader stream: this already checks if a message is new. There are only new messages returned
 * The html is processed line by line: every line is tested against a list of markers and the values
 * found are collected until all ten properties of a tweet are known; then the tweet is created and the
 * collected values are reset for the next tweet.
 * A non-numeric time stamp or counter value stops the parser with a NumberFormatException (from Long.parseLong).
 * The reader is closed when the parser has reached the end of the stream.
 * @param br the reader for the html of a search result page
 * @param order the order of the resulting timelines
 * @param writeToIndex passed through to every scraped tweet
 * @param writeToBackend passed through to every scraped tweet
 * @return two timelines in one array: Timeline[0] is the one which is finished to be used, Timeline[1] contains messages which are in postprocessing
 * @throws IOException if reading fails or if a tweet has a malformed status url
 */
private static Timeline[] search(
        final BufferedReader br,
        final Timeline.Order order,
        final boolean writeToIndex,
        final boolean writeToBackend) throws IOException {
    Timeline timelineReady = new Timeline(order);
    Timeline timelineWorking = new Timeline(order);
    String input;
    Map<String, prop> props = new HashMap<String, prop>();
    Set<String> images = new LinkedHashSet<>();
    Set<String> videos = new LinkedHashSet<>();
    String place_id = "", place_name = "";
    boolean parsing_favourite = false, parsing_retweet = false;
    int line = 0; // first line is 1, according to emacs which numbers the first line also as 1
    boolean debuglog = false;
    while ((input = br.readLine()) != null) {
        line++;
        input = input.trim();
        if (input.length() == 0) continue;

        // debug
        if (debuglog) System.out.println(line + ": " + input);

        // parse
        int p;
        if ((p = input.indexOf("=\"account-group")) > 0) {
            props.put("userid", new prop(input, p, "data-user-id"));
            continue;
        }
        if ((p = input.indexOf("class=\"avatar js-action-profile-avatar")) > 0) {
            props.put("useravatarurl", new prop(input, p, "src"));
            continue;
        }
        if ((p = input.indexOf("class=\"fullname js-action-profile-name")) > 0) {
            props.put("userfullname", new prop(input, p, null));
            continue;
        }
        if ((p = input.indexOf("class=\"username js-action-profile-name")) > 0) {
            props.put("usernickname", new prop(input.replace("<s>@</s>", "").replace("<b>", "").replace("</b>", ""), p, null));
            continue;
        }
        if ((p = input.indexOf("class=\"tweet-timestamp")) > 0) {
            props.put("tweetstatusurl", new prop(input, 0, "href"));
            props.put("tweettimename", new prop(input, p, "title"));
            // don't continue here because "class=\"_timestamp" is in the same line
        }
        if ((p = input.indexOf("class=\"_timestamp")) > 0) {
            props.put("tweettimems", new prop(input, p, "data-time-ms"));
            continue;
        }
        if ((p = input.indexOf("class=\"ProfileTweet-action--retweet")) > 0) {
            // the counter itself follows in a later "ProfileTweet-actionCount" line
            parsing_retweet = true;
            continue;
        }
        if ((p = input.indexOf("class=\"ProfileTweet-action--favorite")) > 0) {
            // the counter itself follows in a later "ProfileTweet-actionCount" line
            parsing_favourite = true;
            continue;
        }
        if ((p = input.indexOf("class=\"TweetTextSize")) > 0) {
            // read until closing p tag to account for new lines in tweets
            while (input.lastIndexOf("</p>") == -1) {
                String next = br.readLine();
                // a truncated document has no closing tag: stop at the end of the stream instead of
                // appending "null" forever
                if (next == null) break;
                line++;
                input = input + ' ' + next;
            }
            prop tweettext = new prop(input, p, null);
            props.put("tweettext", tweettext);
            continue;
        }
        if ((p = input.indexOf("class=\"ProfileTweet-actionCount")) > 0) {
            if (parsing_retweet) {
                prop tweetretweetcount = new prop(input, p, "data-tweet-stat-count");
                props.put("tweetretweetcount", tweetretweetcount);
                parsing_retweet = false;
            }
            if (parsing_favourite) {
                props.put("tweetfavouritecount", new prop(input, p, "data-tweet-stat-count"));
                parsing_favourite = false;
            }
            continue;
        }
        // get images; a value of null means that the attribute was not found in the line and is not collected
        if ((p = input.indexOf("class=\"media media-thumbnail twitter-timeline-link media-forward is-preview")) > 0 ||
            (p = input.indexOf("class=\"multi-photo")) > 0) {
            String image = new prop(input, p, "data-resolved-url-large").value;
            if (image != null) images.add(image);
            continue;
        }
        // we have two opportunities to get video thumbnails == more images; images in the presence of video content should be treated as thumbnail for the video
        if ((p = input.indexOf("class=\"animated-gif-thumbnail\"")) > 0) {
            String thumbnail = new prop(input, 0, "src").value;
            if (thumbnail != null) images.add(thumbnail);
            continue;
        }
        if ((p = input.indexOf("class=\"animated-gif\"")) > 0) {
            String poster = new prop(input, p, "poster").value;
            if (poster != null) images.add(poster);
            continue;
        }
        if ((p = input.indexOf("<source video-src")) >= 0 && input.indexOf("type=\"video/") > p) {
            String video = new prop(input, p, "video-src").value;
            if (video != null) videos.add(video);
            continue;
        }
        if ((p = input.indexOf("class=\"Tweet-geo")) > 0) {
            prop place_name_prop = new prop(input, p, "title");
            place_name = place_name_prop.value;
            continue;
        }
        if ((p = input.indexOf("class=\"ProfileTweet-actionButton u-linkClean js-nav js-geo-pivot-link")) > 0) {
            prop place_id_prop = new prop(input, p, "data-place-id");
            place_id = place_id_prop.value;
            continue;
        }
        if (props.size() == 10 || (debuglog && props.size() > 4 && input.indexOf("stream-item") > 0 /* li class="js-stream-item" starts a new tweet */)) {
            // the tweet is complete, evaluate the result
            if (debuglog) System.out.println("*** line " + line + " propss.size() = " + props.size());
            prop userid = props.get("userid"); if (userid == null) {if (debuglog) System.out.println("*** line " + line + " MISSING value userid"); continue;}
            prop usernickname = props.get("usernickname"); if (usernickname == null) {if (debuglog) System.out.println("*** line " + line + " MISSING value usernickname"); continue;}
            prop useravatarurl = props.get("useravatarurl"); if (useravatarurl == null) {if (debuglog) System.out.println("*** line " + line + " MISSING value useravatarurl"); continue;}
            prop userfullname = props.get("userfullname"); if (userfullname == null) {if (debuglog) System.out.println("*** line " + line + " MISSING value userfullname"); continue;}
            UserEntry user = new UserEntry(
                    userid.value,
                    usernickname.value,
                    useravatarurl.value,
                    MessageEntry.html2utf8(userfullname.value)
            );
            ArrayList<String> imgs = new ArrayList<String>(images.size()); imgs.addAll(images);
            ArrayList<String> vids = new ArrayList<String>(videos.size()); vids.addAll(videos);
            prop tweettimems = props.get("tweettimems"); if (tweettimems == null) {if (debuglog) System.out.println("*** line " + line + " MISSING value tweettimems"); continue;}
            prop tweetretweetcount = props.get("tweetretweetcount"); if (tweetretweetcount == null) {if (debuglog) System.out.println("*** line " + line + " MISSING value tweetretweetcount"); continue;}
            prop tweetfavouritecount = props.get("tweetfavouritecount"); if (tweetfavouritecount == null) {if (debuglog) System.out.println("*** line " + line + " MISSING value tweetfavouritecount"); continue;}
            TwitterTweet tweet = new TwitterTweet(
                    user.getScreenName(),
                    Long.parseLong(tweettimems.value),
                    props.get("tweettimename").value,
                    props.get("tweetstatusurl").value,
                    props.get("tweettext").value,
                    Long.parseLong(tweetretweetcount.value),
                    Long.parseLong(tweetfavouritecount.value),
                    imgs, vids, place_name, place_id,
                    user, writeToIndex, writeToBackend
            );
            if (DAO.messages == null || !DAO.messages.existsCache(tweet.getIdStr())) {
                // checking against the exist cache is incomplete. A false negative would just cause that a tweet is
                // indexed again.
                if (tweet.willBeTimeConsuming()) {
                    executor.execute(tweet);
                    // the postprocessing may already be finished when we get here
                    if (tweet.isReady()) {
                        timelineReady.add(tweet, user);
                    } else {
                        timelineWorking.add(tweet, user);
                    }
                } else {
                    // no additional thread needed, run the postprocessing in the current thread
                    tweet.run();
                    timelineReady.add(tweet, user);
                }
            }
            // reset everything that was collected for this tweet; otherwise videos and the place of this tweet
            // would be attached to all following tweets as well
            images.clear();
            videos.clear();
            place_name = "";
            place_id = "";
            props.clear();
            continue;
        }
    }
    br.close();
    return new Timeline[]{timelineReady, timelineWorking};
}
private static class prop {
public String key, value = null;
public prop(String value) {
this.key = null;
this.value = value;
}
public prop(String line, int start, String key) {
this.key = key;
if (key == null) {
int p = line.indexOf('>', start);
if (p > 0) {
int c = 1;
int q = p + 1;
while (c > 0 && q < line.length()) {
char a = line.charAt(q);
if (a == '<') {
if (line.charAt(q+1) != 'i') {
if (line.charAt(q+1) == '/') c--; else c++;
}
}
q++;
}
assert p >= -1;
assert q > 0;
try {
value = line.substring(p + 1, q - 1);
} catch (StringIndexOutOfBoundsException e) {
Log.getLog().debug(e);
}
}
} else {
int p = line.indexOf(key + "=\"", start);
if (p > 0) {
int q = line.indexOf('"', p + key.length() + 2);
if (q > 0) {
value = line.substring(p + key.length() + 2, q);
}
}
}
}
@SuppressWarnings("unused")
public boolean success() {
return value != null;
}
public String toString() {
return this.key + "=" + (this.value == null ? "unknown" : this.value);
}
}
// patterns used by unshorten() and TwitterTweet.willBeTimeConsuming() to rewrite the html of a tweet text

/** a hashtag link; group 1 is the hashtag without the leading '#' */
final static Pattern hashtag_pattern = Pattern.compile("<a href=\"/hashtag/.*?\".*?class=\"twitter-hashtag.*?\".*?><s>#</s><b>(.*?)</b></a>");
/** a link inside of a tweet; group 1 is the href without "https://", group 2 the data-expanded-url attribute, group 3 the title attribute */
final static Pattern timeline_link_pattern = Pattern.compile("<a href=\"https://(.*?)\".*? data-expanded-url=\"(.*?)\".*?twitter-timeline-link.*?title=\"(.*?)\".*?>.*?</a>");
/** a link to an embedded picture; group 1 is the t.co url, group 2 the path behind pic.twitter.com/ */
final static Pattern timeline_embed_pattern = Pattern.compile("<a href=\"(https://t.co/\\w+)\" class=\"twitter-timeline-link.*?>pic.twitter.com/(.*?)</a>");
/** an emoji image; group 1 is the alt attribute of the image */
final static Pattern emoji_pattern = Pattern.compile("<img .*?class=\"Emoji Emoji--forText\".*?alt=\"(.*?)\".*?>");
/** a run of two or more spaces; a single space must not match, replacing it with a single space would change nothing */
final static Pattern doublespace_pattern = Pattern.compile(" {2,}");
/** markup which is removed from the tweet text without replacement */
final static Pattern cleanup_pattern = Pattern.compile(
    "</?(s|b|strong)>|" +
    "<a href=\"/hashtag.*?>|" +
    "<a.*?class=\"twitter-atreply.*?>|" +
    "<span.*?span>"
);
/**
 * A message which was scraped from a twitter search result page.
 * The object is created with the raw html of the tweet text; the postprocessing of the text is done
 * in run(), which can be executed concurrently. isReady() and waitReady() tell if run() has finished.
 */
public static class TwitterTweet extends MessageEntry implements Runnable {

    // opened exactly once at the end of run(); in contrast to a semaphore, waiting does not consume anything,
    // so any number of callers can wait any number of times
    private final CountDownLatch ready;
    private UserEntry user;
    private boolean writeToIndex, writeToBackend;

    /**
     * @param user_screen_name_raw the screen name of the author
     * @param created_at_raw the creation time in milliseconds since the epoch
     * @param created_at_name_raw not used here but should be compared to created_at_raw
     * @param status_id_url_raw the path of the status url (appended to "https://twitter.com"); the part
     *        behind the last '/' becomes the id of the message
     * @param text_raw the html of the tweet text
     * @param retweets the retweet count
     * @param favourites the favourites count
     * @param images urls of images, copied in iteration order
     * @param videos urls of videos, copied in iteration order
     * @param place_name the name of the place
     * @param place_id the id of the place
     * @param user the author
     * @param writeToIndex if true, run() hands the message over to the IncomingMessageBuffer
     * @param writeToBackend if true, run() transmits the message with DAO.outgoingMessages
     * @throws MalformedURLException if no url can be made from the status url path
     */
    public TwitterTweet(
            final String user_screen_name_raw,
            final long created_at_raw,
            final String created_at_name_raw, // not used here but should be compared to created_at_raw
            final String status_id_url_raw,
            final String text_raw,
            final long retweets,
            final long favourites,
            final Collection<String> images,
            final Collection<String> videos,
            final String place_name,
            final String place_id,
            final UserEntry user,
            final boolean writeToIndex,
            final boolean writeToBackend) throws MalformedURLException {
        super();
        this.source_type = SourceType.TWITTER;
        this.provider_type = ProviderType.SCRAPED;
        this.screen_name = user_screen_name_raw;
        this.created_at = new Date(created_at_raw);
        this.status_id_url = new URL("https://twitter.com" + status_id_url_raw);
        int p = status_id_url_raw.lastIndexOf('/');
        this.id_str = p >= 0 ? status_id_url_raw.substring(p + 1) : "-1";
        this.retweet_count = retweets;
        this.favourites_count = favourites;
        this.place_name = place_name;
        this.place_id = place_id;
        this.images = new LinkedHashSet<>(); for (String image: images) this.images.add(image);
        this.videos = new LinkedHashSet<>(); for (String video: videos) this.videos.add(video);
        this.text = text_raw;
        this.user = user;
        this.writeToIndex = writeToIndex;
        this.writeToBackend = writeToBackend;
        // note: an attempt to reverse-engineer geo coordinates from the place_id (two hex numbers of 8 digits) failed

        // this.text MUST be analysed with analyse(); this is not done here because it should be started concurrently; run run();
        this.ready = new CountDownLatch(1);
    }

    /**
     * @return the author of the message
     */
    public UserEntry getUser() {
        return this.user;
    }

    /**
     * @return true if the text contains links which unshorten() resolves with the RedirectUnshortener
     */
    public boolean willBeTimeConsuming() {
        return timeline_link_pattern.matcher(this.text).find() || timeline_embed_pattern.matcher(this.text).find();
    }

    /**
     * Do the postprocessing: clean the text with unshorten(), enrich the message and, depending on the
     * flags given to the constructor, write it to the index and to the backend.
     * Any failure is logged; the message is marked as ready in every case.
     */
    @Override
    public void run() {
        try {
            this.text = unshorten(this.text);
            this.enrich();
            if (this.writeToIndex) IncomingMessageBuffer.addScheduler(this, this.user, true);
            if (this.writeToBackend) DAO.outgoingMessages.transmitMessage(this, this.user);
        } catch (Throwable e) {
            Log.getLog().warn(e);
        } finally {
            this.ready.countDown();
        }
    }

    /**
     * @return true if run() has finished
     */
    public boolean isReady() {
        return this.ready.getCount() == 0;
    }

    /**
     * Wait until run() has finished.
     * @param millis the maximum time to wait in milliseconds
     * @return true if run() has finished, false if the time is over or if the waiting thread was interrupted
     */
    public boolean waitReady(long millis) {
        try {
            return this.ready.await(millis, TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
            // keep the interrupt visible for the caller
            Thread.currentThread().interrupt();
            return false;
        }
    }
}
/**
 * Convert the html of a tweet text into plain text.
 * Emoji images are replaced by their alt text, hashtag links by the hashtag, links by the result of the
 * RedirectUnshortener for their expanded url and embedded picture links by a pic.twitter.com url.
 * Then the remaining markup named in cleanup_pattern is removed, html entities are decoded with
 * MessageEntry.html2utf8 and the text is trimmed.
 * If one of the replacement steps fails, the failure is logged and the replacement phase ends; the
 * cleaning is done in any case.
 * @param text the html of a tweet text
 * @return the cleaned text
 */
public static String unshorten(String text) {
    // Every round does at most one replacement and then starts again with the first pattern.
    // All replacement texts come from the scraped page or from the net and are quoted: an unquoted
    // '$' or '\' would be taken as group reference or escape by Matcher.replaceFirst and make it fail.
    while (true) {
        try {
            Matcher m = emoji_pattern.matcher(text);
            if (m.find()) {
                String emoji = m.group(1);
                text = m.replaceFirst(Matcher.quoteReplacement(emoji));
                continue;
            }
        } catch (Throwable e) {
            Log.getLog().warn(e);
            break;
        }
        try {
            Matcher m = hashtag_pattern.matcher(text);
            if (m.find()) {
                // the extra spaces are needed because twitter removes them if the hashtag is followed with a link
                text = m.replaceFirst(Matcher.quoteReplacement(" #" + m.group(1) + " "));
                continue;
            }
        } catch (Throwable e) {
            Log.getLog().warn(e);
            break;
        }
        try {
            Matcher m = timeline_link_pattern.matcher(text);
            if (m.find()) {
                String expanded = RedirectUnshortener.unShorten(m.group(2));
                text = m.replaceFirst(Matcher.quoteReplacement(" " + expanded));
                continue;
            }
        } catch (Throwable e) {
            Log.getLog().warn(e);
            break;
        }
        try {
            Matcher m = timeline_embed_pattern.matcher(text);
            if (m.find()) {
                String shorturl = RedirectUnshortener.unShorten(m.group(2));
                text = m.replaceFirst(Matcher.quoteReplacement(" https://pic.twitter.com/" + shorturl + " "));
                continue;
            }
        } catch (Throwable e) {
            Log.getLog().warn(e);
            break;
        }
        // no pattern matches any more
        break;
    }
    text = cleanup_pattern.matcher(text).replaceAll("");
    text = MessageEntry.html2utf8(text);
    text = doublespace_pattern.matcher(text).replaceAll(" ");
    text = text.trim();
    return text;
}
/**
 * Command line test of the scraper: prints all messages which are found for a query.
 * Usage: java org.loklak.harvester.TwitterScraper [query | file]
 * If the argument starts with "/" it is taken as path to a file with a stored search result page,
 * otherwise as a query for a search at twitter.
 * The process is terminated with System.exit when the work is done.
 *
 * @param args search query or file path
 */
public static void main(String[] args) {
    //wget --no-check-certificate "https://twitter.com/search?q=eifel&src=typd&f=realtime"
    if (args == null || args.length == 0) {
        System.err.println("usage: java org.loklak.harvester.TwitterScraper <query | /path/to/stored/search/page>");
        System.exit(1);
        return;
    }
    Timeline[] result = null;
    if (args[0].startsWith("/"))
        result = parse(new File(args[0]),Timeline.Order.CREATED_AT, true, true);
    else
        result = TwitterScraper.search(args[0], Timeline.Order.CREATED_AT, true, true);
    if (result == null) {
        System.err.println("no result: the search page could not be loaded");
        System.exit(1);
        return;
    }
    int all = 0;
    for (int x = 0; x < 2; x++) {
        if (x == 0) System.out.println("Timeline[0] - finished to be used:");
        if (x == 1) System.out.println("Timeline[1] - messages which are in postprocessing");
        all += result[x].size();
        for (MessageEntry tweet : result[x]) {
            if (tweet instanceof TwitterTweet) {
                // give the postprocessing up to ten seconds before the text is printed
                ((TwitterTweet) tweet).waitReady(10000);
            }
            System.out.println(tweet.getCreatedAt().toString() + " from @" + tweet.getScreenName() + " - " + tweet.getText());
        }
    }
    System.out.println("count: " + all);
    System.exit(0);
}
}