package edu.umd.rhsmith.diads.meater.modules.tweater.storage.legacy;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.LinkedList;
import java.util.List;
import javax.sql.DataSource;
import twitter4j.HashtagEntity;
import twitter4j.URLEntity;
import twitter4j.UserMentionEntity;
import edu.umd.rhsmith.diads.meater.core.app.MEaterConfigurationException;
import edu.umd.rhsmith.diads.meater.modules.tweater.media.UserStatusData;
import edu.umd.rhsmith.diads.meater.modules.tweater.queries.QueryFollow;
import edu.umd.rhsmith.diads.meater.modules.tweater.queries.QueryItem;
import edu.umd.rhsmith.diads.meater.modules.tweater.queries.QueryLocation;
import edu.umd.rhsmith.diads.meater.modules.tweater.queries.QueryPhrase;
import edu.umd.rhsmith.diads.meater.modules.tweater.queries.QueryTrack;
import edu.umd.rhsmith.diads.meater.util.ControlException;
import edu.umd.rhsmith.diads.meater.util.Util;
/**
* This class persists statuses to a MySQL database.
*
* @author dmonner
*/
public class MySqlStatusEater extends StatusEater {
private final String dbName;
/**
* The handle to the MySQL database connection pool
*/
private DataSource ds;
/**
* The maximum number of times to try a transaction before giving up
*/
private final int MAX_TRIES = 5;
public MySqlStatusEater(MySqlStatusEaterInitializer init)
throws MEaterConfigurationException {
super(init);
this.dbName = init.getDbName();
}
/*
* --------------------------------
* Control methods
* --------------------------------
*/
@Override
protected void doInitRoutine() throws MEaterConfigurationException {
this.ds = this.getComponentManager().getMain().getSqlManager()
.getDataSource(dbName);
if (this.ds == null) {
throw new MEaterConfigurationException(this.messageString(
MSG_ERR_NOSOURCE_FMT, dbName));
}
}
@Override
protected void doStartupRoutine() throws ControlException {
}
@Override
protected void doShutdownRoutine() {
}
/**
* Searches the database for a particular status.
*
* @param status_id
* @return <code>true</code> iff the database contains the status with the
* given ID.
*/
public boolean has(final long status_id) {
logFinest("Checking database for status id " + status_id + ".");
final Connection conn = connect();
Statement stmt = null;
boolean have = false;
try {
// Run a SELECT COUNT query to see if we have the status with the
// given ID
stmt = conn.createStatement();
final ResultSet result = stmt
.executeQuery("SELECT COUNT(*) FROM status WHERE status_id = "
+ status_id + ";");
// If we do have it, we will have one row of results with a single
// int > 0
if (result.first() && result.getInt(1) > 0) {
have = true;
}
} catch (final SQLException ex) {
logSevere( //
"SQLState: " + ex.getSQLState() + "\n" + //
"VendorError: " + ex.getErrorCode() + "\n" + //
Util.traceMessage(ex));
} finally {
try {
if (stmt != null) {
stmt.close();
}
} catch (final SQLException ex) {
logSevere( //
"SQLState: " + ex.getSQLState() + "\n" + //
"VendorError: " + ex.getErrorCode() + "\n" + //
Util.traceMessage(ex));
}
disconnect(conn);
}
logFinest("Database " + (have ? "contains" : "does not contain")
+ " status id " + status_id + ".");
return have;
}
/*
* (non-Javadoc)
*
* @see edu.umd.cs.dmonner.tweater.StatusEater#persist(java.util.List,
* twitter4j.Status)
*/
@Override
public void persist(final List<QueryItem> matches,
final UserStatusData status) {
logFinest("Entering persist() for status id " + status.getStatusId());
// get sentiment information
final double sentiment = status.getSentiment();
// get location information
final double lat = status.getStatusLatitude();
final double lon = status.getStatusLongitude();
final String locStr = scrub(status.getUserLocation());
final String lang = scrub(status.getUserLanguage());
// get retweet information
final boolean rt = status.isStatusRetweet();
final long rtct = rt ? status.getStatusRetweetCount() : 0;
final long rtid = rt ? status.getStatusRetweetedStatusId() : -1;
// get hashtags and links
final HashtagEntity[] tags = status.getHashtagEntities();
final URLEntity[] urls = status.getURLEntities();
final UserMentionEntity[] mentions = status.getUserMentionEntities();
final List<String> sqls = new LinkedList<String>();
// SQL to insert the user
sqls.add("INSERT INTO user(" + //
"user_id, " + //
"user_info_from_status, " + //
"user_name, " + //
"user_join_date, " + //
"user_status_count, " + //
"user_followers, " + //
"user_friends, " + //
"user_listed, " + //
"user_verified, " + //
"user_lang, " + //
"user_location, " + //
"user_utc_offset) " + //
"VALUES (" + //
status.getUserId() + ", " + //
status.getStatusId() + ", '" + //
status.getUserScreenName() + "', " + //
status.getUserCreatedAt().getTime() + ", " + //
status.getUserStatusesCount() + ", " + //
status.getUserFollowersCount() + ", " + //
status.getUserFriendsCount() + ", " + //
status.getUserListedCount() + ", " + //
(status.isUserVerified() ? 1 : 0) + ", '" + //
lang + "', '" + //
locStr + "', "//
+ status.getUserUtcOffset() + ");");
// SQL to insert the status
sqls.add("INSERT INTO status(" + //
"status_id, " + //
"user_id, " + //
"status_date, " + //
"status_text, " + //
"status_sentiment, " + //
"status_is_retweet, " + //
"status_retweet_of, " + //
"status_retweet_count, " + //
"status_latitude, " + //
"status_longitude) " + //
" VALUES (" + //
status.getStatusId() + ", " + //
status.getUserId() + ", " + //
status.getStatusCreatedAt().getTime() + ", '" + //
scrub(status.getStatusText()) + "', " + //
sentiment + ", " + //
rt + ", " + //
rtid + ", " + //
rtct + ", " + //
lat + ", " + //
lon + ");");
// SQL to insert QueryItem matches
for (final QueryItem match : matches) {
String table = null;
if (match instanceof QueryTrack)
table = "track_match(query_track_no, status_id)";
else if (match instanceof QueryPhrase)
table = "phrase_match(query_phrase_no, status_id)";
else if (match instanceof QueryFollow)
table = "follow_match(query_follow_no, status_id)";
else if (match instanceof QueryLocation)
table = "location_match(query_location_no, status_id)";
else
logWarning("Unhandled match type: " + match);
if (table != null)
sqls.add("INSERT INTO " + table + " VALUES ("
+ match.getQueryId() + ", " + status.getStatusId()
+ ");");
}
// SQL to insert hashtag entities
for (final HashtagEntity tag : tags) {
final String text = scrub(tag.getText()).toLowerCase();
sqls.add("INSERT INTO hashtag(" + //
"hashtag_text) " + //
"VALUES ('" + //
text + "') " + //
"ON DUPLICATE KEY UPDATE hashtag_no = LAST_INSERT_ID(hashtag_no);");
sqls.add("INSERT INTO hashtag_match(" + //
"hashtag_no, " + //
"status_id, " + //
"hashtag_startidx, " + //
"hashtag_endidx) " + //
"VALUES " + //
"(LAST_INSERT_ID(), " + //
status.getStatusId() + ", " + //
tag.getStart() + ", " + //
tag.getEnd() + ");");
}
// SQL to expand and insert URL entities
for (final URLEntity url : urls) {
if (url != null && url.getURL() != null) {
final String u = url.getURL().toString().trim();
if (!u.equals("")) {
final String urlStr = scrub(u);
final String expandedStr = scrub(Util.expandUrl(u));
sqls.add("INSERT INTO expanded_link(" + //
"expanded_link_url) " + //
"VALUES ('" + //
expandedStr + "') " + //
"ON DUPLICATE KEY UPDATE expanded_link_no = LAST_INSERT_ID(expanded_link_no);");
sqls.add("INSERT INTO link(" + //
"link_url, " + //
"expanded_link_no) " + //
"VALUES ('" + //
urlStr + "', " + //
"LAST_INSERT_ID()) " + //
"ON DUPLICATE KEY UPDATE link_no = LAST_INSERT_ID(link_no);");
sqls.add("INSERT INTO link_match(" + //
"link_no, " + //
"status_id, " + //
"link_startidx, " + //
"link_endidx) " + //
"VALUES " + //
"(LAST_INSERT_ID(), " + //
status.getStatusId() + ", " + //
url.getStart() + ", " + //
url.getEnd() + ");");
}
}
}
// SQL to insert mentions
for (final UserMentionEntity mention : mentions) {
sqls.add("INSERT INTO mention(" + //
"status_id, " + //
"mention_startidx, " + //
"mention_endidx, " + //
"mention_by, " + //
"mention_of) " + //
"VALUES (" + //
status.getStatusId() + ", " + //
mention.getStart() + ", " + //
mention.getEnd() + ", " + //
status.getUserId() + ", " + //
mention.getId() + ");");
}
logFinest("Attempting " + sqls.size()
+ " database inserts for status id " + status.getStatusId()
+ ".");
// Run the whole insert as a single batch; this greatly reduces load on
// the database compared to
// individual insert commands
final Connection conn = connect();
execute(conn, status.getStatusId(), sqls);
disconnect(conn);
}
/*
* --------------------------------
* Misc. utilities
* --------------------------------
*/
/**
* Obtains a database connection safely while logging errors; convenience
* method.
*
* @return An open connection to the database.
*/
private Connection connect() {
Connection conn = null;
while (conn == null) {
try {
conn = ds.getConnection();
if (conn == null) {
try {
Thread.sleep((int) (500 * Math.random()));
} catch (final InterruptedException ex) {
}
}
} catch (final SQLException ex) {
logSevere( //
"SQLState: " + ex.getSQLState() + "\n" + //
"VendorError: " + ex.getErrorCode() + "\n" + //
Util.traceMessage(ex));
}
}
return conn;
}
/**
* Closes the passed-in connection safely while logging errors; convenience
* method.
*
* @param conn
*/
private void disconnect(final Connection conn) {
try {
if (conn != null) {
conn.close();
}
} catch (final SQLException ex) {
logSevere( //
"SQLState: " + ex.getSQLState() + "\n" + //
"VendorError: " + ex.getErrorCode() + "\n" + //
Util.traceMessage(ex));
}
}
/**
* Runs a list of SQL statements as a single transaction over the given
* connection. In the even of
* a busy database, this method will retry the transaction up to
* <code>MAX_TRIES</code> times.
*
* @param conn
* @param id
* @param sqls
*/
private void execute(final Connection conn, final long id,
final List<String> sqls) {
execute(conn, id, sqls, MAX_TRIES);
}
/**
* Runs a list of SQL statements as a single transaction over the given
* connection, allowing <code>tries</code> retries if the database is too
* busy.
*
* @param conn
* @param id
* @param sqls
*/
private void execute(final Connection conn, final long id,
final List<String> sqls, final int tries) {
Statement stmt = null;
try {
// Tell the connection to wait to commit the transaction until we're
// done
conn.setAutoCommit(false);
stmt = conn.createStatement();
// Add all the SQL statements to a batch
for (final String sql : sqls) {
stmt.addBatch(sql);
}
// Execute the batch
logFinest("Executing MySQL statement batch for status id " + id
+ ".");
stmt.executeBatch();
// Commit the results, assuming we succeed
logFinest("Committing inserts for status id " + id + ".");
conn.commit();
logFinest("Database inserts committed for status id " + id + ".");
} catch (final SQLException ex) {
// This will happen if Twitter sends us a duplicate status, as
// happens occasionally
if (ex.getMessage().contains("Duplicate entry")) {
// Roll back the transaction and log the duplicate error
logFine("Duplicate entry into database for status id " + id
+ "; transaction aborted.");
logFinest(ex.getMessage() + "\nQuery = \n"
+ Util.prettyPrintList(sqls) + "\nStatus id: " + id);
rollback(conn);
}
// This happens when the database is too busy to process our
// transaction
else if (ex.getMessage().contains("try restarting transaction")) {
// We roll back what we have, if anything
rollback(conn);
// Error message recommends restarting the transaction/batch
if (tries > 0) {
// Try again if we have any tries left
logWarning("Restarting transaction for status id " + id
+ "...");
execute(conn, id, sqls, tries - 1);
} else {
// If we don't, log the transaction failure
logSevere("Failed after several tries; aborting transaction for status id "
+ id + ".");
logSevere(Util.traceMessage(ex));
}
}
// This happens when the text (usually a URL) is too long for our
// database field
else if (ex.getMessage().contains("Data truncation")) {
// This doesn't signify a failure or error, so log as INFO
logInfo(ex.getMessage() + "\nStatus id: " + id);
logFinest("Query = \n" + Util.prettyPrintList(sqls)
+ "\nStatus id: " + id);
}
// This happens if people place certain crazy unicode characters in
// their tweets, I guess?
else if (ex.getMessage().contains("Incorrect string value")) {
// Rollback the transaction and log the failure
rollback(conn);
logInfo(ex.getMessage() + "\nStatus id: " + id);
logFinest("Query = \n" + Util.prettyPrintList(sqls)
+ "\nStatus id: " + id);
}
// Otherwise, the error is of an unknown type
else {
// Log it as a severe failure
logSevere( //
"SQLState: " + ex.getSQLState() + "\n" + //
"VendorError: " + ex.getErrorCode() + "\n" + //
Util.traceMessage(ex));
logSevere("Query = \n" + Util.prettyPrintList(sqls)
+ "\nStatus id: " + id);
}
} finally {
try {
if (stmt != null) {
stmt.close();
}
} catch (final SQLException ex) {
logSevere( //
"SQLState: " + ex.getSQLState() + "\n" + //
"VendorError: " + ex.getErrorCode() + "\n" + //
Util.traceMessage(ex));
logFinest("Query = \n" + Util.prettyPrintList(sqls)
+ "\nStatus id: " + id);
}
}
}
/**
* Roll back the last transaction on this connection; convenience method.
*
* @param conn
*/
private void rollback(final Connection conn) {
try {
logFine("Rolling back transaction because of error...");
conn.rollback();
} catch (final SQLException ex) {
logSevere( //
"SQLState: " + ex.getSQLState() + "\n" + //
"VendorError: " + ex.getErrorCode() + "\n" + //
Util.traceMessage(ex));
}
}
/**
* Sanitizes a string for use as a field in a MySQL database by escaping all
* backslashes and
* single-quotes.
*
* @param in
* @return The input string, sanitized for insertion into a SQL database.
*/
public static String scrub(final String in) {
/*
* first one actually replaces \ with escaped \\, second replaces ' with
* \';\ in regex is expressed as \\, which is java string literal is
* \\\\, and same applies to the replacement pattern; we don't need
*/
return in.replaceAll("\\\\", "\\\\\\\\").replaceAll("'", "\\\\'");
}
/*
* --------------------------------
* Messages
* --------------------------------
*/
private static final String MSG_ERR_NOSOURCE_FMT = "Couldn't get data source from manager with name '%s'";
}