/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.util; import java.io.Serializable; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import org.apache.storm.generated.GlobalStreamId; import org.apache.storm.grouping.CustomStreamGrouping; import org.apache.storm.shade.org.apache.commons.lang.StringUtils; import org.apache.storm.task.WorkerTopologyContext; import com.digitalpebble.stormcrawler.Constants; import com.digitalpebble.stormcrawler.Metadata; @SuppressWarnings("serial") /** * Directs tuples to a specific bolt instance based on the URLPartitioner, e.g. * byIP, byDomain or byHost. * * Use as follows with Flux : * * <pre> * {@code * streams: * - from: "spout" * to: "status" * grouping: * type: CUSTOM * customClass: * className: "com.digitalpebble.stormcrawler.util.URLStreamGrouping" * constructorArgs: * - "byDomain" * } * </pre> **/ public class URLStreamGrouping implements CustomStreamGrouping, Serializable { private int numTasks = 0; private URLPartitioner partitioner; private String partitionMode; public URLStreamGrouping() { } public URLStreamGrouping(String mode) { partitionMode = mode; } @Override public void prepare(WorkerTopologyContext context, GlobalStreamId stream, List<Integer> targetTasks) { numTasks = targetTasks.size(); partitioner = new URLPartitioner(); if (StringUtils.isNotBlank(partitionMode)) { Map<String, String> conf = new HashMap<>(); conf.put(Constants.PARTITION_MODEParamName, partitionMode); partitioner.configure(conf); } } @Override public List<Integer> chooseTasks(int taskId, List<Object> values) { List<Integer> boltIds = new LinkedList<>(); // optimisation : single target if (numTasks == 1) { boltIds.add(0); return boltIds; } if (values.size() < 2) { // TODO log! return boltIds; } // the first value is always the URL // and the second the metadata String url = (String) values.get(0); Metadata metadata = (Metadata) values.get(1); String partitionKey = partitioner.getPartition(url, metadata); if (StringUtils.isBlank(partitionKey)) { // TODO log! return boltIds; } // hash on the key int partition = Math.abs(partitionKey.hashCode() % numTasks); boltIds.add(partition); return boltIds; } }