/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.sql;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.sql.Timestamp;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;

import org.apache.storm.metric.api.MultiCountMetric;
import org.apache.storm.spout.Scheme;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.digitalpebble.stormcrawler.util.StringTabScheme;
@SuppressWarnings("serial")
/**
 * Spout which reads URLs to fetch from a SQL table. Rows whose
 * {@code nextfetchdate} is due are pulled in batches into an in-memory buffer
 * and emitted one per call to {@link #nextTuple()}. A minimum delay between
 * queries throttles the database; a set of in-flight URLs prevents the same
 * URL from being emitted twice while the table contains only a few rows.
 */
public class SQLSpout extends BaseRichSpout {

    public static final Logger LOG = LoggerFactory.getLogger(SQLSpout.class);

    /** Turns "url\tkey=val..." lines into tuples; also declares the output fields. */
    private static final Scheme SCHEME = new StringTabScheme();

    private SpoutOutputCollector _collector;

    private String tableName;

    private Connection connection;

    /** Max number of rows fetched per SQL query. */
    private int bufferSize = 100;

    /** Rows fetched from the table, waiting to be emitted. */
    private Queue<List<Object>> buffer = new LinkedList<>();

    /**
     * Keeps track of the URLs in flight so that we don't add them more than
     * once when the table contains just a few URLs
     **/
    private Set<String> beingProcessed = new HashSet<>();

    private boolean active;

    private MultiCountMetric eventCounter;

    /** Throttle: minimum delay between two SQL queries, in milliseconds. */
    private int minWaitBetweenQueriesMSec = 5000;

    private long lastQueryTime = System.currentTimeMillis();

    /**
     * if more than one instance of the spout exist, each one is in charge of a
     * separate bucket value. This is used to ensure a good diversity of URLs.
     **/
    private int bucketNum = -1;

    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    public void open(Map conf, TopologyContext context,
            SpoutOutputCollector collector) {
        _collector = collector;

        this.eventCounter = context.registerMetric("SQLSpout",
                new MultiCountMetric(), 10);

        bufferSize = ConfUtils.getInt(conf,
                Constants.MYSQL_BUFFERSIZE_PARAM_NAME, 100);
        minWaitBetweenQueriesMSec = ConfUtils.getInt(conf,
                Constants.MYSQL_MIN_QUERY_INTERVAL_PARAM_NAME, 5000);

        tableName = ConfUtils.getString(conf, Constants.MYSQL_TABLE_PARAM_NAME);

        try {
            connection = SQLUtil.getConnection(conf);
        } catch (SQLException ex) {
            LOG.error(ex.getMessage(), ex);
            throw new RuntimeException(ex);
        }

        // determine bucket this spout instance will be in charge of:
        // with a single task there is no partitioning (bucketNum stays -1)
        int totalTasks = context
                .getComponentTasks(context.getThisComponentId()).size();
        if (totalTasks > 1) {
            bucketNum = context.getThisTaskIndex();
        }
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(SCHEME.getOutputFields());
    }

    @Override
    public void nextTuple() {
        if (!active)
            return;

        // drain the buffer first; the URL doubles as the message id so that
        // ack/fail can clear it from the in-flight set
        if (!buffer.isEmpty()) {
            List<Object> fields = buffer.remove();
            String url = fields.get(0).toString();
            this._collector.emit(fields, url);
            beingProcessed.add(url);
            return;
        }

        // re-populate the buffer, but no more often than the configured
        // minimum interval between queries
        long now = System.currentTimeMillis();
        long allowed = lastQueryTime + minWaitBetweenQueriesMSec;
        if (now > allowed) {
            populateBuffer();
            lastQueryTime = now;
        }
    }

    /**
     * Queries the table for up to {@link #bufferSize} due URLs and fills the
     * internal buffer. Uses a {@link PreparedStatement} so that the timestamp
     * and bucket values are bound rather than concatenated into the SQL; the
     * table name comes from the topology configuration and cannot be bound as
     * a parameter. SQL errors are logged and the buffer is simply left as-is.
     */
    private void populateBuffer() {
        // identifiers can't be bound as parameters: the table name is
        // trusted configuration, not user input
        StringBuilder query = new StringBuilder("SELECT * FROM ")
                .append(tableName).append(" WHERE nextfetchdate <= ?");
        // constraint on bucket num when the spout is partitioned
        if (bucketNum >= 0) {
            query.append(" AND bucket = ?");
        }
        query.append(" LIMIT ").append(this.bufferSize);

        try (PreparedStatement st = this.connection
                .prepareStatement(query.toString())) {
            st.setTimestamp(1, new Timestamp(System.currentTimeMillis()));
            if (bucketNum >= 0) {
                st.setInt(2, bucketNum);
            }
            // execute the query, and get a java resultset
            try (ResultSet rs = st.executeQuery()) {
                eventCounter.scope("SQL queries").incrBy(1);
                // iterate through the java resultset
                while (rs.next()) {
                    String url = rs.getString("url");
                    // already processed? skip
                    if (beingProcessed.contains(url)) {
                        continue;
                    }
                    String metadata = rs.getString("metadata");
                    if (metadata == null) {
                        metadata = "";
                    } else if (!metadata.startsWith("\t")) {
                        metadata = "\t" + metadata;
                    }
                    String URLMD = url + metadata;
                    // explicit charset: the scheme expects UTF-8 bytes and the
                    // platform default may differ
                    List<Object> v = SCHEME.deserialize(ByteBuffer
                            .wrap(URLMD.getBytes(StandardCharsets.UTF_8)));
                    buffer.add(v);
                }
            }
        } catch (SQLException e) {
            LOG.error("Exception while querying table", e);
        }
    }

    @Override
    public void activate() {
        super.activate();
        active = true;
    }

    @Override
    public void deactivate() {
        super.deactivate();
        active = false;
    }

    @Override
    public void ack(Object msgId) {
        super.ack(msgId);
        // msgId is the URL used when emitting; the URL can now be re-emitted
        beingProcessed.remove(msgId);
    }

    @Override
    public void fail(Object msgId) {
        super.fail(msgId);
        // no replay here: the URL simply becomes eligible for re-selection
        beingProcessed.remove(msgId);
    }

    @Override
    public void close() {
        super.close();
        try {
            // guard against open() having failed before the connection
            // was established
            if (connection != null) {
                connection.close();
            }
        } catch (SQLException e) {
            LOG.error("Exception caught while closing SQL connection", e);
        }
    }
}