/**
 * Copyright 2012 - CommonCrawl Foundation
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.mapred.pipelineV3.crawllistgen;

import java.util.Set;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;
import org.commoncrawl.util.ByteArrayUtils;
import org.commoncrawl.util.FlexBuffer;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.SuperDomainList;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLUtils;

import com.google.common.collect.Sets;

/**
 * Helpers for building and parsing partition keys.
 *
 * <p>
 * A partition key is a byte string laid out as {@code domain:type:url}. Keys
 * built from a bare domain have an empty URL component ({@code domain:type:}).
 * The partition of a key is derived from the {@code domain} component only.
 *
 * <p>
 * All parsing methods locate delimiters with a single internal scan that works
 * in positions relative to the key's offset, so they behave the same whether or
 * not the backing array of the {@link TextBytes} starts at index zero.
 *
 * @author rana
 */
public class PartitionUtils {

  /**
   * Partitioner that assigns records to partitions based on the domain
   * component of the partition key.
   *
   * <p>
   * The scratch buffer is an instance field that is overwritten on every call,
   * so a single instance should not be used by several threads at once.
   */
  public static class PartitionKeyPartitioner implements Partitioner<TextBytes, Writable> {

    FlexBuffer scratchBuffer = new FlexBuffer();

    /** No configuration is required. */
    @Override
    public void configure(JobConf job) {
    }

    /**
     * @throws IllegalArgumentException
     *           if the key contains no {@code ':'} delimiter
     */
    @Override
    public int getPartition(TextBytes key, Writable value, int numPartitions) {
      return getPartitionGivenPartitionKey(key, scratchBuffer, numPartitions);
    }
  }

  /**
   * Delimiter as a one-byte search pattern. No longer used inside this class;
   * kept because it is package-visible and may be referenced elsewhere.
   */
  static byte pattern[] = { ':' };

  /** Byte that separates the domain, type and URL components of a key. */
  private static final byte DELIMITER = ':';

  /**
   * Finds the first delimiter at or after {@code fromIndex}.
   *
   * @param partitionKey
   *          key to scan
   * @param fromIndex
   *          position, relative to the key's offset, at which to start
   * @return position of the delimiter, relative to the key's offset
   * @throws IllegalArgumentException
   *           if no delimiter exists within the key's length
   */
  private static int requireDelimiter(TextBytes partitionKey, int fromIndex) {
    byte[] bytes = partitionKey.getBytes();
    int offset = partitionKey.getOffset();
    int length = partitionKey.getLength();
    // bound the scan by the key's logical length, not the backing array size
    for (int i = fromIndex; i < length; i++) {
      if (bytes[offset + i] == DELIMITER) {
        return i;
      }
    }
    throw new IllegalArgumentException("Malformed partition key (missing ':' delimiter): " + partitionKey);
  }

  /**
   * Builds a key of the form {@code rootDomain:type:} (empty URL component).
   *
   * @param superDomainIdList
   *          not used by this method
   * @param rootDomain
   *          domain component of the key
   * @param type
   *          numeric type component of the key
   * @param partitionKeyOut
   *          receives the generated key
   */
  public static void generatePartitionKeyGivenDomain(Set<Long> superDomainIdList, String rootDomain, int type,
      TextBytes partitionKeyOut) {
    partitionKeyOut.set(rootDomain + ":" + type + ":");
  }

  /**
   * Builds a key of the form {@code domain:type:canonicalURL} for a URL.
   *
   * <p>
   * The domain component is the URL's full host when the root domain's
   * fingerprint is contained in {@code superDomainIdList}; otherwise it is the
   * root domain.
   *
   * @param superDomainIdList
   *          fingerprints of root domains whose hosts are keyed individually
   * @param urlObject
   *          the URL to build a key for
   * @param type
   *          numeric type component of the key
   * @param partitionKeyOut
   *          receives the generated key (untouched when {@code false} is
   *          returned)
   * @return {@code true} if a key was generated, {@code false} if no root
   *         domain could be extracted from the host
   */
  public static boolean generatePartitionKeyGivenURL(Set<Long> superDomainIdList, GoogleURL urlObject, int type,
      TextBytes partitionKeyOut) {

    String domain = urlObject.getHost();
    String rootDomain = URLUtils.extractRootDomainName(domain);
    if (rootDomain == null) {
      return false;
    }

    long domainFP = SuperDomainList.domainFingerprintGivenName(rootDomain);
    if (!superDomainIdList.contains(domainFP)) {
      domain = rootDomain;
    }
    partitionKeyOut.set(domain + ":" + type + ":" + urlObject.getCanonicalURL());
    return true;
  }

  /**
   * Parses {@code urlKey} as a URL and, if valid, builds its partition key.
   *
   * @return {@code true} if a key was generated, {@code false} if the URL is
   *         invalid or no root domain could be extracted
   * @see #generatePartitionKeyGivenURL(Set, GoogleURL, int, TextBytes)
   */
  public static boolean generatePartitionKeyGivenURL(Set<Long> superDomainIdList, TextBytes urlKey, int type,
      TextBytes partitionKeyOut) {
    GoogleURL urlObject = new GoogleURL(urlKey.toString());
    if (!urlObject.isValid()) {
      return false;
    }
    return generatePartitionKeyGivenURL(superDomainIdList, urlObject, type, partitionKeyOut);
  }

  /**
   * Extracts the domain component (everything before the first {@code ':'}).
   *
   * @param partitionKey
   *          key to parse
   * @param domainOut
   *          receives the domain bytes
   * @return {@code domainOut}
   * @throws IllegalArgumentException
   *           if the key contains no {@code ':'} delimiter
   */
  public static TextBytes getDomainGivenPartitionKey(TextBytes partitionKey, TextBytes domainOut) {
    // the relative position of the first delimiter is the domain's length
    int domainLength = requireDelimiter(partitionKey, 0);
    domainOut.set(partitionKey.getBytes(), partitionKey.getOffset(), domainLength);
    return domainOut;
  }

  /**
   * Computes the partition for a key from the hash of its domain component.
   *
   * @param partitionKey
   *          key to parse
   * @param scratchBuffer
   *          reusable buffer; overwritten with the domain bytes
   * @param numParitions
   *          total number of partitions
   * @return a partition number in {@code [0, numParitions)}
   * @throws IllegalArgumentException
   *           if the key contains no {@code ':'} delimiter
   */
  public static int getPartitionGivenPartitionKey(TextBytes partitionKey, FlexBuffer scratchBuffer, int numParitions) {
    int domainLength = requireDelimiter(partitionKey, 0);
    scratchBuffer.set(partitionKey.getBytes(), partitionKey.getOffset(), domainLength);
    // mask the sign bit so the modulo result is never negative
    return (scratchBuffer.hashCode() & Integer.MAX_VALUE) % numParitions;
  }

  /**
   * Extracts the numeric type component (between the first and second
   * {@code ':'}), parsed as a base-10 number.
   *
   * @param partitionKey
   *          key to parse
   * @return the type value
   * @throws IllegalArgumentException
   *           if the key does not contain two {@code ':'} delimiters
   */
  public static int getTypeGivenPartitionKey(TextBytes partitionKey) {
    int typeStart = requireDelimiter(partitionKey, 0) + 1;
    int typeEnd = requireDelimiter(partitionKey, typeStart);
    return (int) ByteArrayUtils.parseLong(partitionKey.getBytes(), partitionKey.getOffset() + typeStart,
        typeEnd - typeStart, 10);
  }

  /**
   * Extracts the URL component (everything after the second {@code ':'}).
   *
   * @param partitionKey
   *          key to parse
   * @param urlOut
   *          receives the URL bytes, or is cleared when the URL component is
   *          empty
   * @throws IllegalArgumentException
   *           if the key does not contain two {@code ':'} delimiters
   */
  public static void getURLGivenPartitionKey(TextBytes partitionKey, TextBytes urlOut) {
    int typeStart = requireDelimiter(partitionKey, 0) + 1;
    int urlStart = requireDelimiter(partitionKey, typeStart) + 1;
    int urlLength = partitionKey.getLength() - urlStart;

    if (urlLength > 0) {
      urlOut.set(partitionKey.getBytes(), partitionKey.getOffset() + urlStart, urlLength);
    } else {
      urlOut.clear();
    }
  }

  /**
   * Manual smoke test: builds one URL-based key and one domain-based key and
   * prints each parsed component to standard output.
   */
  public static void main(String[] args) {
    TextBytes partitionKeyOut = new TextBytes();
    Set<Long> emptySet = Sets.newHashSet();
    FlexBuffer scratchBuffer = new FlexBuffer();
    TextBytes urlOut = new TextBytes();
    TextBytes domainBytes = new TextBytes();

    generatePartitionKeyGivenURL(emptySet, new TextBytes("http://www.google.com/someurl"), 0, partitionKeyOut);
    System.out.println("ParitiionKey:" + partitionKeyOut.toString());
    System.out.println("Parition:" + getPartitionGivenPartitionKey(partitionKeyOut, scratchBuffer, 10));
    System.out.println("Domain:" + getDomainGivenPartitionKey(partitionKeyOut, domainBytes));
    System.out.println("Type:" + getTypeGivenPartitionKey(partitionKeyOut));
    getURLGivenPartitionKey(partitionKeyOut, urlOut);
    System.out.println("URL:" + urlOut.toString());

    generatePartitionKeyGivenDomain(emptySet, "google.com", 0, partitionKeyOut);
    System.out.println("ParitiionKey:" + partitionKeyOut.toString());
    System.out.println("Parition:" + getPartitionGivenPartitionKey(partitionKeyOut, scratchBuffer, 10));
    System.out.println("Domain:" + getDomainGivenPartitionKey(partitionKeyOut, domainBytes));
    System.out.println("Type:" + getTypeGivenPartitionKey(partitionKeyOut));
    getURLGivenPartitionKey(partitionKeyOut, urlOut);
    System.out.println("URL:" + urlOut.toString());
  }
}