/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.commoncrawl.service.crawler;

import static org.fusesource.leveldbjni.JniDBFactory.factory;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.Tuples.Triple;
import org.iq80.leveldb.DB;
import org.iq80.leveldb.DBComparator;
import org.iq80.leveldb.DBIterator;
import org.iq80.leveldb.Options;
import org.iq80.leveldb.ReadOptions;
import org.iq80.leveldb.Snapshot;

import redis.clients.jedis.Jedis;

/**
 * Queue used to manage on-demand remote parsing of select crawled documents.
 *
 * <p>Item payloads are stored in a LevelDB database keyed by
 * (domainId, state, sequenceId). Redis holds the scheduling state derived from
 * that database: a hash of queued-item counts per domain and two sorted sets
 * of domain ids, one for domains with at most 10 queued items and one for
 * domains with more than 10.
 *
 * <p>NOTE(review): apart from the atomic sequence counter no synchronization
 * is visible in this class; presumably it is meant to be driven from a single
 * thread -- confirm against callers.
 *
 * @author rana
 */
public class ParseQueue {

  public static final Log LOG = LogFactory.getLog(ParseQueue.class);

  public static final String QUEUE_DB = "parse_queue_db";
  public static final String ENTRY_DB = "parse_entry_db";

  public static final byte[] REDIS_DOMAIN_COUNTS_KEY = "dc".getBytes();
  public static final byte[] REDIS_PRIORITY_QUEUE_LT10_KEY = "pq_lt10".getBytes();
  public static final byte[] REDIS_PRIORITY_QUEUE_GT10_KEY = "pq_gt10".getBytes();

  public static final int ITEM_STATE_QUEUED = 2;
  public static final int ITEM_STATE_ACTIVE = 1;

  /** Domains with at most this many queued items live in the LT10 sorted set. */
  private static final int SMALL_DOMAIN_MAX_ITEMS = 10;

  private DB entryDB;
  Jedis redis;

  /** Reference point for redis scores: scores are seconds relative to this instant. */
  private final long queueEpoch = System.currentTimeMillis();
  /** Source of per-item sequence ids; seeded with the wall clock at construction. */
  private final AtomicLong sequenceNo = new AtomicLong(System.currentTimeMillis());
  /** Maximum number of items scheduled by a single {@link #fillQueue()} call. */
  private final int workUnitsPerTimespan;

  /** Items already moved to the ACTIVE state and waiting to be handed out. */
  private final LinkedList<Item> _scheduledItems = new LinkedList<Item>();

  /**
   * A unit of work: an opaque payload attached to a domain.
   */
  public static class Item {

    public long _domainId;
    public long _sequenceId;
    public byte[] _data;

    public Item() {
    }

    /**
     * create new Item for insertion into queue
     *
     * @param domainId id of the domain the payload belongs to
     * @param data opaque payload bytes
     */
    public Item(long domainId, byte[] data) {
      _domainId = domainId;
      _sequenceId = 0;
      _data = data;
    }

    /**
     * used by queue to populate item during pop
     *
     * @param domainId id of the domain the payload belongs to
     * @param sequenceId sequence id taken from the database key
     * @param data opaque payload bytes
     */
    Item(long domainId, long sequenceId, byte[] data) {
      _domainId = domainId;
      _sequenceId = sequenceId;
      _data = data;
    }

    @Override
    public String toString() {
      return "Item[domainId=" + _domainId + ", sequenceId=" + _sequenceId + ", dataLength="
          + ((_data != null) ? _data.length : 0) + "]";
    }
  }

  /**
   * Opens (creating if necessary) the entry database under {@code dbPath},
   * connects to redis on localhost, wipes redis via FLUSHALL, flips any records
   * left in the ACTIVE state back to QUEUED and rebuilds the redis state from
   * the database.
   *
   * @param dbPath directory that holds the LevelDB database
   * @param redisPort port of the redis server on localhost
   * @param workUnitsPerTimespan maximum number of items scheduled per refill
   * @throws IOException if the database cannot be opened or read
   */
  public ParseQueue(File dbPath, int redisPort, int workUnitsPerTimespan) throws IOException {
    this.workUnitsPerTimespan = workUnitsPerTimespan;

    LOG.info("Database Path is:" + dbPath);
    dbPath.mkdirs();

    Options options = new Options();
    options.createIfMissing(true);
    options.comparator(new DomainStateAndTimestampKey());
    options.paranoidChecks(true);

    boolean initialized = false;
    try {
      entryDB = factory.open(new File(dbPath, ENTRY_DB), options);
      redis = new Jedis("localhost", redisPort);

      // clear redis
      LOG.info("CLEARING REDIS");
      redis.flushAll();
      LOG.info("RELOAD ORPHANED RECORDS");
      reloadOrphanedRecords();
      LOG.info("REBUILD REDIS STATE ");
      loadRedis();
      initialized = true;
    } finally {
      if (!initialized) {
        // do not leak the database handle / redis socket when start-up fails
        try {
          closeResources();
        } catch (Exception e) {
          LOG.error(CCStringUtils.stringifyException(e));
        }
      }
    }
  }

  /**
   * Closes the entry database and disconnects from redis.
   *
   * @throws IOException if closing the database fails
   */
  public void close() throws IOException {
    closeResources();
  }

  /** Releases the database and the redis connection; redis is released even if the database close fails. */
  private void closeResources() throws IOException {
    try {
      if (entryDB != null) {
        entryDB.close();
      }
    } finally {
      if (redis != null) {
        redis.disconnect();
      }
    }
  }

  /** Decodes 8 bytes starting at {@code offset} as a big-endian long. */
  private static long readLong(byte[] source, int offset) {
    long value = 0;
    for (int i = 0; i < 8; ++i) {
      value = (value << 8) | (source[offset + i] & 0xffL);
    }
    return value;
  }

  /** Encodes {@code value} as 8 big-endian bytes starting at {@code offset}. */
  private static void writeLong(long value, byte[] target, int offset) {
    for (int i = 7; i >= 0; --i) {
      target[offset + i] = (byte) (value & 0xff);
      value >>>= 8;
    }
  }

  /** Encodes a domain id as the 8 byte big-endian form used for redis members / hash fields. */
  private static byte[] domainIdToBytes(long domainId) {
    byte[] bytes = new byte[8];
    writeLong(domainId, bytes, 0);
    return bytes;
  }

  /** Decodes the domain id stored in the first 8 bytes of {@code bytes}. */
  private static long bytesToDomainId(byte[] bytes) {
    return readLong(bytes, 0);
  }

  /** Converts a millisecond timestamp into a redis score: seconds relative to {@link #queueEpoch}. */
  private double scoreFor(long timestampMillis) {
    return (double) (timestampMillis - queueEpoch) / 1000;
  }

  /** Picks the sorted set a domain with {@code itemCount} queued items belongs to. */
  private static byte[] queueForCount(long itemCount) {
    return (itemCount <= SMALL_DOMAIN_MAX_ITEMS) ? REDIS_PRIORITY_QUEUE_LT10_KEY
        : REDIS_PRIORITY_QUEUE_GT10_KEY;
  }

  /**
   * Stores the item in the database in the QUEUED state and updates the redis
   * counters / priority queues for its domain.
   *
   * @param item item to enqueue; its {@code _sequenceId} is ignored
   * @return the sequence id assigned to the stored item
   * @throws IOException on database failure
   */
  public long insertItemIntoQueue(Item item) throws IOException {
    // create id ...
    long sequenceId = sequenceNo.incrementAndGet();
    // create composite key
    byte[] key = DomainStateAndTimestampKey.createCompositeKey(item._domainId, ITEM_STATE_QUEUED,
        sequenceId);

    entryDB.put(key, item._data);

    // talk to redis
    byte[] domainIdBytes = domainIdToBytes(item._domainId);
    long itemCount = redis.hincrBy(REDIS_DOMAIN_COUNTS_KEY, domainIdBytes, 1);
    LOG.info("insertItemIntoQueue for domainId:" + item._domainId + " adding entry to DB. itemCount:"
        + itemCount);

    if (itemCount == 1) {
      // first queued item for this domain: it enters the small-domain queue
      redis.zadd(REDIS_PRIORITY_QUEUE_LT10_KEY, scoreFor(System.currentTimeMillis()), domainIdBytes);
    } else if (itemCount == SMALL_DOMAIN_MAX_ITEMS + 1) {
      // domain just crossed the threshold: move it to the large-domain queue, keeping its score
      Double oldScore = redis.zscore(REDIS_PRIORITY_QUEUE_LT10_KEY, domainIdBytes);
      // zscore returns null when the member is missing; fall back to a fresh score instead of an NPE
      double score = (oldScore != null) ? oldScore.doubleValue()
          : scoreFor(System.currentTimeMillis());
      redis.zrem(REDIS_PRIORITY_QUEUE_LT10_KEY, domainIdBytes);
      redis.zadd(REDIS_PRIORITY_QUEUE_GT10_KEY, score, domainIdBytes);
    }
    return sequenceId;
  }

  /**
   * Returns the next scheduled item, refilling the in-memory schedule from the
   * database when it is empty. The returned item has already been flipped to
   * the ACTIVE state in the database; call {@link #deleteItem(Item)} once it
   * has been processed.
   *
   * @return the next item, or null if nothing could be scheduled
   * @throws IOException on database failure
   */
  public Item popItemIntoFromQueue() throws IOException {
    if (_scheduledItems.isEmpty()) {
      // load the queue ...
      fillQueue();
    }
    // poll() yields null when the refill produced nothing
    return _scheduledItems.poll();
  }

  /**
   * delete the previously pop'ed item from the database
   *
   * @param item an item previously returned by {@link #popItemIntoFromQueue()}
   * @throws IOException on database failure
   */
  public void deleteItem(Item item) throws IOException {
    entryDB.delete(DomainStateAndTimestampKey.createCompositeKey(item._domainId, ITEM_STATE_ACTIVE,
        item._sequenceId));
  }

  /**
   * Flips every record found in the ACTIVE state back to QUEUED. Iterates over
   * a snapshot, so the rewrites performed here are not seen by the scan itself.
   */
  private void reloadOrphanedRecords() throws IOException {
    Snapshot snapshot = entryDB.getSnapshot();
    try {
      ReadOptions options = new ReadOptions();
      options.snapshot(snapshot);
      DBIterator iterator = entryDB.iterator(options);
      try {
        for (iterator.seekToFirst(); iterator.hasNext(); iterator.next()) {
          Map.Entry<byte[], byte[]> entry = iterator.peekNext();
          Triple<Long, Long, Integer> oldKey = DomainStateAndTimestampKey.fromBytes(entry.getKey());
          if (oldKey.e2 == ITEM_STATE_ACTIVE) {
            LOG.info("Found Orphaned Record for Domain:" + oldKey.e0);
            // flip to inactive state
            byte[] newKey = DomainStateAndTimestampKey.createCompositeKey(oldKey.e0,
                ITEM_STATE_QUEUED, oldKey.e1);
            // delete using old key ...
            entryDB.delete(entry.getKey());
            // reinsert using new key ...
            entryDB.put(newKey, entry.getValue());
          }
        }
      } finally {
        iterator.close();
      }
    } finally {
      snapshot.close();
    }
  }

  /**
   * Rebuilds the redis domain counts and priority queues from the database.
   * Relies on the key ordering (domain id first) so that all records of a
   * domain are adjacent. Every record is counted; this method is invoked right
   * after {@link #reloadOrphanedRecords()}, which flips ACTIVE records back to
   * QUEUED.
   */
  private void loadRedis() throws IOException {
    DBIterator iterator = entryDB.iterator();
    try {
      // explicit flag rather than a sentinel id, so that domain id 0 is handled too
      boolean haveDomain = false;
      long lastDomainId = 0;
      long lastDomainCount = 0;
      long firstTimestamp = 0;

      for (iterator.seekToFirst(); iterator.hasNext(); iterator.next()) {
        byte[] key = iterator.peekNext().getKey();
        long currentDomainId = bytesToDomainId(key);

        if (haveDomain && currentDomainId == lastDomainId) {
          // another record of the domain currently being tallied
          lastDomainCount++;
        } else {
          if (haveDomain) {
            publishDomainToRedis(lastDomainId, lastDomainCount, firstTimestamp);
          }
          haveDomain = true;
          lastDomainId = currentDomainId;
          lastDomainCount = 1;
          firstTimestamp = DomainStateAndTimestampKey.getTimestampFromKey(key);
        }
      }
      if (haveDomain) {
        publishDomainToRedis(lastDomainId, lastDomainCount, firstTimestamp);
      }
    } finally {
      iterator.close();
    }
  }

  /** Records a domain's item count in redis and adds it to the priority queue matching that count. */
  private void publishDomainToRedis(long domainId, long itemCount, long firstTimestamp) {
    double score = scoreFor(firstTimestamp);
    byte[] domainIdBytes = domainIdToBytes(domainId);
    LOG.info("Inserting DomainId:" + domainId + " score:" + score);
    redis.hincrBy(REDIS_DOMAIN_COUNTS_KEY, domainIdBytes, itemCount);
    redis.zadd(queueForCount(itemCount), score, domainIdBytes);
  }

  /**
   * Moves up to {@link #workUnitsPerTimespan} items from the database into the
   * in-memory schedule. Two passes are made: first over the small-domain queue
   * (limited to a third of the budget unless the large-domain queue is empty),
   * then over the large-domain queue with whatever budget is left. Within a
   * pass, domains are visited round-robin, one item per domain per round.
   */
  private void fillQueue() throws IOException {
    LOG.info("In fillQueue");

    int unitsForSmallDomains = workUnitsPerTimespan / 3;
    // figure out timespan units
    int unitsRemaining = workUnitsPerTimespan;

    // make two passes to populate queues ...
    for (int pass = 0; pass < 2; ++pass) {
      if (unitsRemaining <= 0) {
        // budget exhausted: nothing can be acquired, skip the redis round trips
        break;
      }
      // figure out queue name based on pass
      byte[] queueName = (pass == 0) ? REDIS_PRIORITY_QUEUE_LT10_KEY
          : REDIS_PRIORITY_QUEUE_GT10_KEY;
      // get up to max possible keys ... (lowest scores first)
      Set<byte[]> keys = redis.zrange(queueName, 0, unitsRemaining);
      // figure out units to try and acquire
      int unitsToAcquire = Math.min(unitsRemaining,
          (pass == 0) ? unitsForSmallDomains : unitsRemaining);
      // special case .. if LT queue, see if go all out if large queue is empty ...
      if (pass == 0 && redis.zcard(REDIS_PRIORITY_QUEUE_GT10_KEY) == 0) {
        unitsToAcquire = unitsRemaining;
      }

      LOG.info("Pass:" + pass + " redisSetSize: " + keys.size() + " unitsToAcquire:"
          + unitsToAcquire);

      // keep counts by domain. NOTE: byte[] keys hash by identity; that is fine here
      // because lookups always use the very same array instances held by 'keys'.
      HashMap<byte[], Integer> counts = new HashMap<byte[], Integer>();
      HashSet<byte[]> emptyDomainSet = new HashSet<byte[]>();

      int itemsAcquiredThisPass = 0;

      while (unitsToAcquire > 0 && emptyDomainSet.size() != keys.size()) {
        // walk keys ...
        for (byte[] domainKey : keys) {
          if (emptyDomainSet.contains(domainKey)) {
            continue;
          }
          // pop item from database ...
          Item item = popNextItemFromDatabase(domainKey);
          if (item == null) {
            // nothing left for this domain
            emptyDomainSet.add(domainKey);
            continue;
          }
          // schedule it...
          _scheduledItems.add(item);
          // decrement aggregate count ...
          unitsToAcquire--;
          itemsAcquiredThisPass++;
          // increment localized count ..
          Integer existingCount = counts.get(domainKey);
          counts.put(domainKey, (existingCount == null) ? 1 : existingCount.intValue() + 1);

          if (unitsToAcquire == 0) {
            break;
          }
        }
      }
      unitsRemaining -= itemsAcquiredThisPass;
      LOG.info("update RedisQueues for pass:" + pass + " itemsAcquired:" + itemsAcquiredThisPass
          + " unitsRemaining:" + unitsRemaining);
      // ok clear redis counts for domains operated on ..
      updateRedisQueues(queueName, counts);
    }
  }

  /**
   * Applies the per-domain item counts consumed during a pass to redis:
   * decrements the domain counters, removes drained domains, moves domains that
   * dropped to 10 or fewer items into the small-domain queue and re-scores
   * every domain that still has queued items.
   *
   * @param queueName sorted set the domains were read from
   * @param counts number of items consumed per domain id
   */
  private void updateRedisQueues(byte[] queueName, HashMap<byte[], Integer> counts)
      throws IOException {
    // compare queue names by content rather than by array identity
    boolean sourceIsSmallQueue = java.util.Arrays.equals(queueName, REDIS_PRIORITY_QUEUE_LT10_KEY);

    for (Map.Entry<byte[], Integer> countEntry : counts.entrySet()) {
      byte[] domainKey = countEntry.getKey();
      // decrement redis count
      long newCount = redis.hincrBy(REDIS_DOMAIN_COUNTS_KEY, domainKey, -countEntry.getValue());

      if (newCount <= 0) {
        LOG.info("Count for Domain:" + bytesToDomainId(domainKey)
            + " is zero. Removing from sets/maps");
        // ok the domain is empty ...remove from both counts map and queue
        redis.hdel(REDIS_DOMAIN_COUNTS_KEY, domainKey);
        // delete from source queue as well
        redis.zrem(queueName, domainKey);
        continue;
      }

      if (newCount <= SMALL_DOMAIN_MAX_ITEMS && !sourceIsSmallQueue) {
        LOG.info("Count for Domain:" + bytesToDomainId(domainKey)
            + " LTEQ 10 but in wrong queue. moving");
        // need to remove from high priority queue ...
        redis.zrem(REDIS_PRIORITY_QUEUE_GT10_KEY, domainKey);
      }

      // the domain still has work: (re)add it with a fresh score
      byte[] finalQueueName = queueForCount(newCount);
      double score = scoreFor(System.currentTimeMillis());
      redis.zadd(finalQueueName, score, domainKey);
      LOG.info("Count for Domain:" + bytesToDomainId(domainKey) + " is:" + newCount
          + " assinging Scroe:" + score + " Queue:" + new String(finalQueueName));
    }
  }

  /**
   * Takes the QUEUED record with the smallest sequence id for the given domain,
   * rewrites it under an ACTIVE key and returns it.
   *
   * @param targetIdBytes 8 byte big-endian domain id
   * @return the popped item, or null if the domain has no queued record
   */
  private Item popNextItemFromDatabase(byte[] targetIdBytes) throws IOException {
    Item itemOut = null;

    DBIterator iterator = entryDB.iterator();
    try {
      // position on the first QUEUED key of the domain (or whatever follows it)
      iterator.seek(DomainStateAndTimestampKey.createCompositeKey(targetIdBytes, ITEM_STATE_QUEUED,
          Long.MIN_VALUE));

      if (iterator.hasNext()) {
        // read key and value once, before the record is rewritten
        Map.Entry<byte[], byte[]> entry = iterator.peekNext();
        byte[] oldKey = entry.getKey();
        byte[] value = entry.getValue();

        long targetId = bytesToDomainId(targetIdBytes);
        Triple<Long, Long, Integer> compositeKey = DomainStateAndTimestampKey.fromBytes(oldKey);

        if (compositeKey.e0.longValue() == targetId
            && compositeKey.e2.intValue() == ITEM_STATE_QUEUED) {
          // create item using domain id and timestamp from composite key and value bytes ...
          itemOut = new Item(compositeKey.e0, compositeKey.e1, value);
          // delete it from the database
          entryDB.delete(oldKey);
          // and then reinsert with a new key ...
          entryDB.put(DomainStateAndTimestampKey.createCompositeKey(compositeKey.e0,
              ITEM_STATE_ACTIVE, compositeKey.e1), value);
        }
      }
    } finally {
      iterator.close();
    }
    LOG.info("popNextItemFromDatabase for domainId:" + bytesToDomainId(targetIdBytes)
        + " returned Item:" + itemOut);

    return itemOut;
  }

  /**
   * LevelDB comparator and codec for the 17 byte composite key:
   * 8 bytes domain id, 1 byte state, 8 bytes timestamp / sequence id, all
   * big-endian. Keys order by domain id, then state, then timestamp; the two
   * long components are compared as signed Java longs.
   */
  static class DomainStateAndTimestampKey implements DBComparator {

    private static final int DOMAIN_OFFSET = 0;
    private static final int STATE_OFFSET = 8;
    private static final int TIMESTAMP_OFFSET = 9;

    public static final int COMPOSITE_KEY_SIZE = 8 + 1 + 8;

    /** Three-way comparison of two signed longs. */
    private static int compareLongs(long left, long right) {
      return (left < right) ? -1 : ((left > right) ? 1 : 0);
    }

    @Override
    public int compare(byte[] key1, byte[] key2) {
      int result = compareLongs(readLong(key1, DOMAIN_OFFSET), readLong(key2, DOMAIN_OFFSET));
      if (result != 0) {
        return result;
      }
      int state1 = 0xff & key1[STATE_OFFSET];
      int state2 = 0xff & key2[STATE_OFFSET];
      if (state1 != state2) {
        return (state1 < state2) ? -1 : 1;
      }
      return compareLongs(readLong(key1, TIMESTAMP_OFFSET), readLong(key2, TIMESTAMP_OFFSET));
    }

    @Override
    public byte[] findShortestSeparator(byte[] start, byte[] limit) {
      // no key shortening: keys are fixed width
      return start;
    }

    @Override
    public byte[] findShortSuccessor(byte[] key) {
      return key;
    }

    @Override
    public String name() {
      return "WorkQueue_Comparator";
    }

    /**
     * Writes a composite key into {@code targetArray}, allocating a new array
     * when it is null.
     *
     * @param targetArray destination of at least {@link #COMPOSITE_KEY_SIZE} bytes, or null
     * @param domainId domain id component
     * @param state state component (low 8 bits are stored)
     * @param timestamp timestamp / sequence id component
     * @return the array that was written to
     * @throws IOException if {@code targetArray} is too small
     */
    public static byte[] createCompositeKey(byte[] targetArray, long domainId, int state,
        long timestamp) throws IOException {
      if (targetArray == null) {
        targetArray = new byte[COMPOSITE_KEY_SIZE];
      } else if (targetArray.length < COMPOSITE_KEY_SIZE) {
        throw new IOException("Invalid Target Array Size!");
      }
      writeLong(domainId, targetArray, DOMAIN_OFFSET);
      targetArray[STATE_OFFSET] = (byte) (state & 0xff);
      writeLong(timestamp, targetArray, TIMESTAMP_OFFSET);
      return targetArray;
    }

    /**
     * Builds a new composite key.
     *
     * @param domainId domain id component
     * @param state state component (low 8 bits are stored)
     * @param timestamp timestamp / sequence id component
     * @return a freshly allocated 17 byte key
     */
    public static byte[] createCompositeKey(long domainId, int state, long timestamp) {
      byte[] key = new byte[COMPOSITE_KEY_SIZE];
      writeLong(domainId, key, DOMAIN_OFFSET);
      key[STATE_OFFSET] = (byte) (state & 0xff);
      writeLong(timestamp, key, TIMESTAMP_OFFSET);
      return key;
    }

    /**
     * Builds a new composite key from an already encoded domain id.
     *
     * @param domainId at least 8 bytes; the first 8 are copied verbatim
     * @param state state component (low 8 bits are stored)
     * @param timestamp timestamp / sequence id component
     * @return a freshly allocated 17 byte key
     */
    public static byte[] createCompositeKey(byte[] domainId, int state, long timestamp) {
      byte[] key = new byte[COMPOSITE_KEY_SIZE];
      System.arraycopy(domainId, 0, key, DOMAIN_OFFSET, 8);
      key[STATE_OFFSET] = (byte) (state & 0xff);
      writeLong(timestamp, key, TIMESTAMP_OFFSET);
      return key;
    }

    /**
     * @param key a composite key
     * @return the timestamp / sequence id component of the key
     */
    public static long getTimestampFromKey(byte[] key) {
      return readLong(key, TIMESTAMP_OFFSET);
    }

    /**
     * Decodes a composite key.
     *
     * @param key a composite key
     * @return (domainId, timestamp, state) -- note the order of the components
     */
    public static Triple<Long, Long, Integer> fromBytes(byte[] key) {
      long domainId = readLong(key, DOMAIN_OFFSET);
      int state = 0xff & key[STATE_OFFSET];
      long timestamp = readLong(key, TIMESTAMP_OFFSET);
      return new Triple<Long, Long, Integer>(domainId, timestamp, state);
    }
  }

  /**
   * Builds a test string consisting of {@code index} repeated {@code count + 1} times.
   *
   * @param index number to repeat
   * @param count number of repetitions minus one
   * @return the concatenated decimal representations
   */
  public static String createTestValue(int index, int count) {
    StringBuilder buf = new StringBuilder();
    for (int i = 0; i < count + 1; ++i) {
      buf.append(index);
    }
    return buf.toString();
  }

  /**
   * Manual smoke test: inserts 10000 items spread over random domains, then
   * drains and deletes them. Requires a redis server on localhost:6379, which
   * is wiped by the queue constructor.
   */
  public static void main(String[] args) throws IOException {
    ParseQueue queue = new ParseQueue(new File("/home/rana/ccprod/data/ParseQueue_Test"), 6379, 20);

    try {
      for (int i = 0; i < 10000; ++i) {
        long domainId = (long) (Math.random() * 1000);
        long timestamp = System.currentTimeMillis();
        String dataStr = Long.toString(domainId) + ":" + Long.toString(timestamp);
        LOG.info("Inserting:" + dataStr);
        queue.insertItemIntoQueue(new Item(domainId, dataStr.getBytes()));
      }

      Item itemOut = null;
      while ((itemOut = queue.popItemIntoFromQueue()) != null) {
        LOG.info("Got Item:" + new String(itemOut._data) + " DELETING IT");
        queue.deleteItem(itemOut);
      }
      LOG.info("Done");
    } catch (Exception e) {
      LOG.error(CCStringUtils.stringifyException(e));
    } finally {
      LOG.info("Closing Database");
      queue.close();
      LOG.info("Closed Database");
    }
    LOG.info("Sleeping");
    try {
      Thread.sleep(20000);
    } catch (InterruptedException e) {
      // preserve the interrupt for whoever is running this thread
      LOG.warn(CCStringUtils.stringifyException(e));
      Thread.currentThread().interrupt();
    }
    LOG.info("Done Sleeping");
  }
}