/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 **/

package org.commoncrawl.util;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Collection;
import java.util.Map;
import java.util.Set;

import junit.framework.Assert;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.WritableUtils;
import org.commoncrawl.protocol.URLFP;
import org.commoncrawl.util.BitUtils.BitStream;
import org.commoncrawl.util.BitUtils.BitStreamReader;

import com.google.common.collect.TreeMultimap;

/**
 * A compressed list of url fingerprints.
 * 
 * <p>
 * Stream layout, as produced by {@link Builder#flush(OutputStream)} and
 * consumed by {@link Reader}:
 * 
 * <pre>
 *   byte   version ({@link #FPList_Version})
 *   vint   stream flags (FLAG_* bit mask)
 *   int    number of control bits
 *   byte[] control bits, one per root domain
 *          (1 = every url sits directly on the root domain,
 *           0 = urls are grouped by sub domain)
 *   vint   root domain count
 *   per root domain:
 *     vint root domain hash
 *     [vint sub domain count]        (only when control bit is 0)
 *     per url group:
 *       [vint sub domain hash]       (only when control bit is 0)
 *       vint url count
 *       [url flag presence bits]     (only with FLAG_SERIALIZE_URLFP_FLAGS)
 *       [link count presence bits]   (only with FLAG_SERIALIZE_LINK_COUNT)
 *       per url: vlong url hash, then the optional fields
 * </pre>
 * 
 * @author rana
 * 
 */
public class CompressedURLFPList {

  private static final Log LOG = LogFactory.getLog(CompressedURLFPList.class);

  public static final int FPList_Version = 1;

  // store the segment id as part of the urlfingerprint stream
  public static final int FLAG_ARCHIVE_SEGMENT_ID = 1;
  // this stream contains a single permanent redirect url entry
  public static final int FLAG_IS_PERMANENT_REDIRECT = 2;
  // serialize individual flag information per urlfp if present
  public static final int FLAG_SERIALIZE_URLFP_FLAGS = 4;
  // serialize link count variant
  public static final int FLAG_SERIALIZE_LINK_COUNT = 8;
  // serialize timestamp
  public static final int FLAG_SERIALIZE_TIMESTAMP = 16;

  /**
   * The builder, used to build the list
   * 
   * @author rana
   * 
   */
  public static class Builder {

    /** fingerprints grouped (and sorted) by root domain hash */
    private TreeMultimap<Integer, URLFP> _links = TreeMultimap.create();
    /** FLAG_* bit mask selecting which optional fields get serialized */
    private int _flags = 0;

    /**
     * construct a link builder
     * 
     * @param flags
     *          bit mask of the FLAG_* constants declared on
     *          {@link CompressedURLFPList}; it is written to the stream
     *          header and controls which optional per-url fields are
     *          serialized
     */
    public Builder(int flags) {
      _flags = flags;
    }

    /** add a fingerprint item to the builder **/
    public void addLink(URLFP linkItem) {
      _links.put(linkItem.getRootDomainHash(), linkItem);
    }

    /** get the link map **/
    public TreeMultimap<Integer, URLFP> getLinkMap() {
      return _links;
    }

    /** do we need to serialize urlfp flags **/
    public boolean serializeURLFPFlags() {
      return (_flags & FLAG_SERIALIZE_URLFP_FLAGS) != 0;
    }

    /** do we need to serialize the per-url link count **/
    public boolean serializeLinkCount() {
      return (_flags & FLAG_SERIALIZE_LINK_COUNT) != 0;
    }

    /** serialize the fingerprint's timestamp value **/
    public boolean serializeTimestamp() {
      return (_flags & FLAG_SERIALIZE_TIMESTAMP) != 0;
    }

    /**
     * write all the items that share one root domain hash
     * 
     * @param rootDomainHash
     *          the root domain hash shared by every entry in items
     * @param items
     *          the fingerprints to write
     * @param dataOut
     *          destination for the variable length payload
     * @param bitStream
     *          control stream; receives exactly one bit per call
     * @param tempMap
     *          scratch map used to regroup items by sub domain; it is cleared
     *          here before use so a single instance can be shared by calls
     * @throws IOException
     */
    private void flushRootDomainItems(int rootDomainHash,
        Collection<URLFP> items, DataOutputStream dataOut,
        BitStream bitStream, TreeMultimap<Integer, URLFP> tempMap)
        throws IOException {

      boolean allItemsRootItems = true;
      boolean allItemsBelongToSameDomain = true;
      URLFP firstSubDomainItem = null;

      // fast scan for special edge cases
      for (URLFP item : items) {
        if (firstSubDomainItem == null) {
          firstSubDomainItem = item;
        }
        if (allItemsRootItems && item.getDomainHash() != rootDomainHash) {
          allItemsRootItems = false;
        }
        if (allItemsBelongToSameDomain && item != firstSubDomainItem
            && item.getDomainHash() != firstSubDomainItem.getDomainHash()) {
          allItemsBelongToSameDomain = false;
          allItemsRootItems = false;
        }
      }

      if (allItemsRootItems) {
        // bit indicating all items belong to root domain ..
        bitStream.addbit(1);
        WritableUtils.writeVInt(dataOut, rootDomainHash);
        writeURLGroup(dataOut, items);
      } else if (allItemsBelongToSameDomain) {
        // bit indicating items belong to sub-domains ..
        bitStream.addbit(0);
        WritableUtils.writeVInt(dataOut, rootDomainHash);
        // sub domain count
        WritableUtils.writeVInt(dataOut, 1);
        // sub domain hash
        WritableUtils.writeVInt(dataOut, firstSubDomainItem.getDomainHash());
        writeURLGroup(dataOut, items);
      } else {
        // ok the long path ...
        // the scratch map is shared across root domains, so drop whatever a
        // previous root domain left behind before regrouping
        tempMap.clear();
        for (URLFP item : items) {
          // add to sorted multi-map
          tempMap.put(item.getDomainHash(), item);
        }

        // bit indicating items belong to sub-domains ..
        bitStream.addbit(0);
        WritableUtils.writeVInt(dataOut, rootDomainHash);

        Set<Map.Entry<Integer, Collection<URLFP>>> entrySet = tempMap.asMap()
            .entrySet();
        // sub domain count
        WritableUtils.writeVInt(dataOut, entrySet.size());
        // iterate set ...
        for (Map.Entry<Integer, Collection<URLFP>> entry : entrySet) {
          // sub domain hash
          WritableUtils.writeVInt(dataOut, entry.getKey());
          writeURLGroup(dataOut, entry.getValue());
        }
      }
    }

    /**
     * write one group of urls: the url count, the optional presence bit
     * streams and then each url's hash followed by its optional fields
     */
    private void writeURLGroup(DataOutputStream dataOut,
        Collection<URLFP> items) throws IOException {

      // url count
      WritableUtils.writeVInt(dataOut, items.size());

      if (serializeURLFPFlags()) {
        serializeURLFPFlagStates(dataOut, items);
      }
      if (serializeLinkCount()) {
        serializeLinkCountFlagStates(dataOut, items);
      }

      for (URLFP item : items) {
        WritableUtils.writeVLong(dataOut, item.getUrlHash());
        if ((_flags & FLAG_ARCHIVE_SEGMENT_ID) != 0) {
          WritableUtils.writeVInt(dataOut, item.getParseSegmentId());
        }
        // serialize flags if necessary (only the bits covered by
        // Short.MAX_VALUE are written)
        int maskedFlags = item.getFlags() & Short.MAX_VALUE;
        if (serializeURLFPFlags() && maskedFlags != 0) {
          WritableUtils.writeVInt(dataOut, maskedFlags);
        }
        if (serializeLinkCount() && item.getLinkCount() != 0) {
          WritableUtils.writeVInt(dataOut, item.getLinkCount());
        }
        if (serializeTimestamp()) {
          WritableUtils.writeVLong(dataOut, item.getTimestamp());
        }
      }
    }

    /**
     * flush the link stream to the specified output stream
     * 
     * @param os
     *          destination stream; it is flushed but not closed
     * @throws IOException
     **/
    public void flush(OutputStream os) throws IOException {

      TreeMultimap<Integer, URLFP> subDomainMap = TreeMultimap.create();
      DataOutputBuffer dataOut = new DataOutputBuffer(_links.size() * 24);
      BitStream bitStream = new BitStream();

      // get the root domain entry set
      Set<Map.Entry<Integer, Collection<URLFP>>> entrySet = _links.asMap()
          .entrySet();
      WritableUtils.writeVInt(dataOut, entrySet.size());
      // iterate entries ...
      for (Map.Entry<Integer, Collection<URLFP>> entry : entrySet) {
        // flush items in this root domain ...
        flushRootDomainItems(entry.getKey(), entry.getValue(), dataOut,
            bitStream, subDomainMap);
      }

      DataOutputStream finalDataOut = new DataOutputStream(os);
      // write out header ...
      finalDataOut.writeByte(FPList_Version);
      // write out the stream flags
      WritableUtils.writeVInt(finalDataOut, _flags);
      // flush bit control stream ...
      finalDataOut.writeInt(bitStream.nbits);
      finalDataOut.write(bitStream.bits, 0, (bitStream.nbits + 7) / 8);
      // and write out remaining data stream
      finalDataOut.write(dataOut.getData(), 0, dataOut.size());
      finalDataOut.flush();
    }

    /**
     * write the presence bits that tell the reader which urls in the group
     * carry a flags value. A single zero byte is written when no url has
     * any (masked) flag set.
     */
    private void serializeURLFPFlagStates(DataOutputStream dataOut,
        Collection<URLFP> itemList) throws IOException {
      // ok first pass see if there is a need to serialize flag states ...
      boolean serializeFlagStates = false;
      for (URLFP item : itemList) {
        if ((item.getFlags() & Short.MAX_VALUE) != 0) {
          serializeFlagStates = true;
          break;
        }
      }

      if (!serializeFlagStates) {
        dataOut.write(0);
      } else {
        BitStream bitStream = new BitStream();
        // add first single bit to set a non-zero first byte
        bitStream.addbit(1);
        for (URLFP item : itemList) {
          bitStream.addbit(((item.getFlags() & Short.MAX_VALUE) != 0) ? 1 : 0);
        }
        dataOut.write(bitStream.bits, 0, (bitStream.nbits + 7) / 8);
      }
    }

    /**
     * write the presence bits that tell the reader which urls in the group
     * carry a link count. A single zero byte is written when every url has
     * a zero link count.
     */
    private void serializeLinkCountFlagStates(DataOutputStream dataOut,
        Collection<URLFP> itemList) throws IOException {
      // ok first pass see if there is a need to serialize link count states
      boolean serializeLinkCountStates = false;
      for (URLFP item : itemList) {
        if (item.getLinkCount() != 0) {
          serializeLinkCountStates = true;
          break;
        }
      }

      if (!serializeLinkCountStates) {
        dataOut.write(0);
      } else {
        BitStream bitStream = new BitStream();
        // add first single bit to set a non-zero first byte
        bitStream.addbit(1);
        for (URLFP item : itemList) {
          bitStream.addbit(item.getLinkCount() != 0 ? 1 : 0);
        }
        dataOut.write(bitStream.bits, 0, (bitStream.nbits + 7) / 8);
      }
    }
  }

  /**
   * reader - used to read compressed fingerprint list ...
   * 
   * <p>
   * Usage pattern: call {@link #hasNext()} exactly once before every call to
   * {@link #next()}. {@link #hasNext()} is what advances the cursor, so
   * calling it twice in a row skips an item.
   * 
   * @author rana
   * 
   */
  public static final class Reader {

    private InputStream _in = null;
    private DataInputStream _din = null;
    private int _flags;
    private int _currentRootDomainHash;
    private int _currentSubDomainHash;
    private int _currentRootIdx = -1;
    private int _rootIdxCount = -1;
    private BitStream _bitStream;
    private BitStreamReader _bitReader;
    private boolean _currentDomainHasSubDomains = false;
    private int _currentSubDomainIdx = -1;
    private int _currentSubDomainCount = -1;
    private int _currentURLIdx = -1;
    private int _currentURLCount = -1;
    private BitStreamReader _flagStateBitStreamReader = null;
    private BitStreamReader _linkCountStateBitStreamReader = null;

    /**
     * initialize a reader to read and decode the link data stream
     * 
     * @param stream
     *          the previously encoded link data stream
     * @throws IOException
     *           if the header cannot be read in full
     */
    public Reader(InputStream stream) throws IOException {
      _in = stream;
      _din = new DataInputStream(stream);
      // skip version
      _din.read();
      // read flags
      _flags = WritableUtils.readVInt(_din);
      // read bit stream ...
      _bitStream = new BitStream();
      _bitStream.nbits = _din.readInt();
      _bitStream.bits = new byte[(_bitStream.nbits + 7) / 8];
      // readFully: a plain read() may legally return fewer bytes than asked
      _din.readFully(_bitStream.bits);
      _bitReader = new BitStreamReader(_bitStream);
      // read root index count...
      _rootIdxCount = WritableUtils.readVInt(_din);
      // reset current root index ...
      _currentRootIdx = -1;
    }

    /** get the stream flags **/
    public int getStreamFlags() {
      return _flags;
    }

    /**
     * read one presence bit stream (url flags or link counts) for a group of
     * urlCount urls
     * 
     * @return a reader positioned on the first url's bit, or null when the
     *         writer emitted the single zero byte meaning "nothing present"
     */
    private BitStreamReader readStateBitStream(int urlCount)
        throws IOException {
      // read first byte ...
      byte firstByte = _din.readByte();
      if (firstByte == 0) {
        return null;
      }
      // ok there is an embedded bit stream: one marker bit plus one bit per
      // url. figure out how many more bytes we need to read
      BitStream stateBitStream = new BitStream();
      stateBitStream.bits = new byte[(urlCount + 1 + 7) / 8];
      stateBitStream.bits[0] = firstByte;
      if (stateBitStream.bits.length > 1) {
        // read remaining bytes ...
        _din.readFully(stateBitStream.bits, 1,
            stateBitStream.bits.length - 1);
      }
      // and initialize reader ...
      BitStreamReader stateReader = new BitStreamReader(stateBitStream);
      // and skip first (marker) bit
      stateReader.getbit();
      return stateReader;
    }

    /** load the url flag presence bits for the current url group */
    private final void readURLFPFlagStates(int urlCount) throws IOException {
      _flagStateBitStreamReader = readStateBitStream(urlCount);
    }

    /**
     * load the link count presence bits for the current url group. The
     * reader is (re)assigned for every group, including groups whose bit
     * stream fits in a single byte.
     */
    private final void readLinkCountStates(int urlCount) throws IOException {
      _linkCountStateBitStreamReader = readStateBitStream(urlCount);
    }

    /**
     * checks to see if there is more data in the stream. This call advances
     * the reader to the next url (reading any root domain / sub domain
     * headers on the way), so it must be followed by a call to
     * {@link #next()} before being invoked again.
     * 
     * @return true if another item can be read from the stream
     * @throws IOException
     */
    public boolean hasNext() throws IOException {

      while (_currentRootIdx < _rootIdxCount) {
        if (++_currentURLIdx < _currentURLCount) {
          return true;
        } else if (++_currentSubDomainIdx < _currentSubDomainCount) {
          _currentURLIdx = -1;
          if (_currentDomainHasSubDomains) {
            // read sub domain hash
            _currentSubDomainHash = WritableUtils.readVInt(_din);
          } else {
            _currentSubDomainHash = _currentRootDomainHash;
          }
          _currentURLCount = WritableUtils.readVInt(_din);
          // if this stream has urlfp flags
          if (hasURLFPFlags()) {
            readURLFPFlagStates(_currentURLCount);
          }
          if (hasLinkCount()) {
            readLinkCountStates(_currentURLCount);
          }
        } else {
          if (++_currentRootIdx < _rootIdxCount) {
            _currentSubDomainIdx = -1;
            _currentURLIdx = -1;
            _currentURLCount = -1;
            _currentDomainHasSubDomains = (_bitReader.getbit() == 0);
            _currentRootDomainHash = WritableUtils.readVInt(_din);
            if (_currentDomainHasSubDomains) {
              _currentSubDomainCount = WritableUtils.readVInt(_din);
            } else {
              _currentSubDomainCount = 1;
            }
          }
        }
      }
      return false;
    }

    /**
     * does this list have serialized flag information per url
     * 
     */
    private boolean hasURLFPFlags() {
      return (_flags & FLAG_SERIALIZE_URLFP_FLAGS) != 0;
    }

    /** does this list have serialized link count information per url */
    private boolean hasLinkCount() {
      return (_flags & FLAG_SERIALIZE_LINK_COUNT) != 0;
    }

    /** does this list have a serialized timestamp per url */
    private boolean hasTimestamp() {
      return (_flags & FLAG_SERIALIZE_TIMESTAMP) != 0;
    }

    /**
     * reads and returns the next URLFP object in the data stream
     * 
     * @return URLFP object
     * @throws IOException
     */
    public URLFP next() throws IOException {

      URLFP urlFPOut = new URLFP();

      urlFPOut.setRootDomainHash(_currentRootDomainHash);
      if (!_currentDomainHasSubDomains) {
        urlFPOut.setDomainHash(_currentRootDomainHash);
      } else {
        urlFPOut.setDomainHash(_currentSubDomainHash);
      }

      urlFPOut.setUrlHash(WritableUtils.readVLong(_din));
      if ((_flags & FLAG_ARCHIVE_SEGMENT_ID) != 0) {
        urlFPOut.setParseSegmentId(WritableUtils.readVInt(_din));
      }
      // if stream has urlfp flags && this subset has individual flag state
      // info
      if (hasURLFPFlags() && _flagStateBitStreamReader != null) {
        if (_flagStateBitStreamReader.getbit() == 1) {
          urlFPOut.setFlags(WritableUtils.readVInt(_din));
        }
      }
      if (hasLinkCount() && _linkCountStateBitStreamReader != null) {
        if (_linkCountStateBitStreamReader.getbit() == 1) {
          urlFPOut.setLinkCount(WritableUtils.readVInt(_din));
        }
      }
      if (hasTimestamp()) {
        urlFPOut.setTimestamp(WritableUtils.readVLong(_din));
      }
      return urlFPOut;
    }

    /** close the underlying input stream */
    public void close() throws IOException {
      _in.close();
    }
  }

  /** fingerprint the url (with flags of zero) and add it to the map */
  private static URLFP insertURLFPItem(TreeMultimap<Integer, URLFP> map,
      String url, int parseSegmentId) {
    return insertURLFPItem(map, url, parseSegmentId, 0);
  }

  /**
   * fingerprint the url and add it to the map, keyed by root domain hash
   * 
   * @return the new fingerprint, or null if no host name or root domain name
   *         could be extracted from the url (nothing is inserted then)
   */
  private static URLFP insertURLFPItem(TreeMultimap<Integer, URLFP> map,
      String url, int parseSegmentId, int flags) {

    URLFP fpOut = new URLFP();

    String hostName = URLUtils.fastGetHostFromURL(url);
    String rootName = null;
    if (hostName != null) {
      rootName = URLUtils.extractRootDomainName(hostName);
    }
    if (hostName != null && rootName != null) {
      fpOut.setRootDomainHash(URLFingerprint.generate32BitHostFP(rootName));
      fpOut.setDomainHash(URLFingerprint.generate32BitHostFP(hostName));
      fpOut.setUrlHash(URLFingerprint.generate64BitURLFPrint(url));
      fpOut.setParseSegmentId(parseSegmentId);
      fpOut.setFlags(flags);
      map.put(fpOut.getRootDomainHash(), fpOut);
      return fpOut;
    }
    return null;
  }

  /** add every fingerprint in the map to the builder */
  private static void addMapToBuilder(Builder builder,
      TreeMultimap<Integer, URLFP> map) {
    for (Map.Entry<Integer, URLFP> entry : map.entries()) {
      builder.addLink(entry.getValue());
    }
  }

  /**
   * flush the builder into byteStream, then decode the bytes again and add
   * every decoded fingerprint to destMap (keyed by root domain hash)
   */
  private static void roundTrip(Builder builder,
      ByteArrayOutputStream byteStream, TreeMultimap<Integer, URLFP> destMap)
      throws IOException {
    // flush to byte stream ...
    builder.flush(byteStream);
    // now set up to read the stream
    ByteArrayInputStream inputStream = new ByteArrayInputStream(byteStream
        .toByteArray(), 0, byteStream.size());
    Reader reader = new Reader(inputStream);
    try {
      while (reader.hasNext()) {
        URLFP fp = reader.next();
        destMap.put(fp.getRootDomainHash(), fp);
      }
    } finally {
      reader.close();
    }
  }

  /** print every fingerprint in the map, one per line, prefixed by label */
  private static void dumpMap(String label, TreeMultimap<Integer, URLFP> map) {
    for (Integer rootDomain : map.keySet()) {
      for (URLFP urlfp : map.get(rootDomain)) {
        System.out.println(label + " Root:" + urlfp.getRootDomainHash()
            + " Domain:" + urlfp.getDomainHash() + " URL:"
            + urlfp.getUrlHash());
      }
    }
  }

  public static final String DOMAIN_1_SUBDOMAIN_1_URL_1 = "http://news.google.com/foo/bar/z";
  public static final String DOMAIN_1_SUBDOMAIN_1_URL_2 = "http://news.google.com/zzz";
  public static final String DOMAIN_2_SUBDOMAIN_1_URL_1 = "http://cnn.com/foo/bar/z";
  public static final String DOMAIN_2_SUBDOMAIN_1_URL_2 = "http://cnn.com/zzzz";
  public static final String DOMAIN_3_SUBDOMAIN_1_URL_1 = "http://news.abc.com/url1";
  public static final String DOMAIN_3_SUBDOMAIN_1_URL_2 = "http://news.abc.com/url2";
  public static final String DOMAIN_3_SUBDOMAIN_2_URL_1 = "http://cartoons.abc.com/url1";
  public static final String DOMAIN_3_SUBDOMAIN_2_URL_2 = "http://cartoons.abc.com/url2";

  /** run all of the validation routines in this class */
  public static void main(String[] args) {
    validateReallyBigList();
    validateURLFPSerializationRootDomain();
    validateURLFPSerializationSingleSubDomain();
    validateURLFPSerializationMultiDomain();
    validateURLFPFlagSerializationRootDomain();
    validateURLFPFlagSerializationMultipleSubDomains();
    validateURLFPFlagSerializationOneSubDomain();
  }

  /**
   * build a list of 1000 root domains x 1000 urls and print the encoded size
   */
  public static void validateReallyBigList() {
    Builder builder = new Builder(FLAG_ARCHIVE_SEGMENT_ID
        | FLAG_SERIALIZE_URLFP_FLAGS);

    for (int rootDomain = 0; rootDomain < 1000; ++rootDomain) {
      for (long urlfphash = 0; urlfphash < 1000; ++urlfphash) {
        URLFP foo = new URLFP();
        foo.setDomainHash(rootDomain);
        foo.setRootDomainHash(rootDomain);
        foo.setUrlHash(urlfphash);
        builder.addLink(foo);
      }
    }

    ByteBufferOutputStream bufferOut = new ByteBufferOutputStream();
    try {
      builder.flush(bufferOut);
      System.out.println("Buffer Size:" + bufferOut._buffer.getCount()
          + " URLCount:" + builder.getLinkMap().size());
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  /** round trip urls (with flags) that all live on a single sub domain */
  public static void validateURLFPFlagSerializationOneSubDomain() {
    TreeMultimap<Integer, URLFP> sourceMap = TreeMultimap.create();
    TreeMultimap<Integer, URLFP> destMap = TreeMultimap.create();

    ByteArrayOutputStream byteStream = new ByteArrayOutputStream();

    Builder firstBuilder = new Builder(FLAG_ARCHIVE_SEGMENT_ID
        | FLAG_SERIALIZE_URLFP_FLAGS);

    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_1 + "0", 1, 255);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_1 + "1", 2, 255);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_1 + "2", 3, 255);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_1 + "3", 4, 0);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_1 + "4", 5, 0);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_1 + "5", 6, 0);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_1 + "6", 7, 255);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_1 + "7", 8, 255);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_1 + "8", 9, 255);

    addMapToBuilder(firstBuilder, sourceMap);

    try {
      roundTrip(firstBuilder, byteStream, destMap);
      Assert.assertTrue(sourceMap.equals(destMap));
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  /** round trip urls (with flags) that all live directly on a root domain */
  public static void validateURLFPFlagSerializationRootDomain() {
    TreeMultimap<Integer, URLFP> sourceMap = TreeMultimap.create();
    TreeMultimap<Integer, URLFP> destMap = TreeMultimap.create();

    ByteArrayOutputStream byteStream = new ByteArrayOutputStream();

    Builder firstBuilder = new Builder(FLAG_ARCHIVE_SEGMENT_ID
        | FLAG_SERIALIZE_URLFP_FLAGS);

    for (int i = 0; i < 12; ++i) {
      insertURLFPItem(sourceMap, DOMAIN_2_SUBDOMAIN_1_URL_1 + "0_" + i, 1,
          (255 | (65535 << 16)));
      insertURLFPItem(sourceMap, DOMAIN_2_SUBDOMAIN_1_URL_1 + "1_" + i, 2,
          (255 | (65535 << 16)));
      insertURLFPItem(sourceMap, DOMAIN_2_SUBDOMAIN_1_URL_1 + "2_" + i, 3,
          (255 | (65535 << 16)));
      insertURLFPItem(sourceMap, DOMAIN_2_SUBDOMAIN_1_URL_1 + "3_" + i, 4, 0);
      insertURLFPItem(sourceMap, DOMAIN_2_SUBDOMAIN_1_URL_1 + "4_" + i, 5, 0);
      insertURLFPItem(sourceMap, DOMAIN_2_SUBDOMAIN_1_URL_1 + "5_" + i, 6, 0);
      insertURLFPItem(sourceMap, DOMAIN_2_SUBDOMAIN_1_URL_1 + "6_" + i, 7,
          (255 | (65535 << 16)));
      insertURLFPItem(sourceMap, DOMAIN_2_SUBDOMAIN_1_URL_1 + "7_" + i, 8,
          255);
    }
    insertURLFPItem(sourceMap, DOMAIN_2_SUBDOMAIN_1_URL_1 + "8", 8, 255);

    addMapToBuilder(firstBuilder, sourceMap);

    try {
      roundTrip(firstBuilder, byteStream, destMap);
      Assert.assertTrue(sourceMap.equals(destMap));
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  /** round trip urls (with flags) spread over two sub domains of one root */
  public static void validateURLFPFlagSerializationMultipleSubDomains() {
    TreeMultimap<Integer, URLFP> sourceMap = TreeMultimap.create();
    TreeMultimap<Integer, URLFP> destMap = TreeMultimap.create();

    ByteArrayOutputStream byteStream = new ByteArrayOutputStream();

    Builder firstBuilder = new Builder(FLAG_ARCHIVE_SEGMENT_ID
        | FLAG_SERIALIZE_URLFP_FLAGS);

    insertURLFPItem(sourceMap, DOMAIN_3_SUBDOMAIN_1_URL_1 + "0", 1, 255);
    insertURLFPItem(sourceMap, DOMAIN_3_SUBDOMAIN_2_URL_1 + "1", 2, 255);
    insertURLFPItem(sourceMap, DOMAIN_3_SUBDOMAIN_1_URL_1 + "2", 3, 255);
    insertURLFPItem(sourceMap, DOMAIN_3_SUBDOMAIN_2_URL_1 + "3", 4, 0);
    insertURLFPItem(sourceMap, DOMAIN_3_SUBDOMAIN_1_URL_1 + "4", 1, 255);
    insertURLFPItem(sourceMap, DOMAIN_3_SUBDOMAIN_2_URL_1 + "5", 2, 255);
    insertURLFPItem(sourceMap, DOMAIN_3_SUBDOMAIN_1_URL_1 + "6", 3, 255);
    insertURLFPItem(sourceMap, DOMAIN_3_SUBDOMAIN_2_URL_1 + "7", 4, 0);
    insertURLFPItem(sourceMap, DOMAIN_3_SUBDOMAIN_1_URL_1 + "8", 1, 255);
    insertURLFPItem(sourceMap, DOMAIN_3_SUBDOMAIN_2_URL_1 + "9", 2, 255);
    insertURLFPItem(sourceMap, DOMAIN_3_SUBDOMAIN_1_URL_1 + "10", 3, 255);
    insertURLFPItem(sourceMap, DOMAIN_3_SUBDOMAIN_2_URL_1 + "11", 4, 0);

    addMapToBuilder(firstBuilder, sourceMap);

    try {
      roundTrip(firstBuilder, byteStream, destMap);
      Assert.assertTrue(sourceMap.equals(destMap));
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  /** round trip a mix of root domain, single and multi sub domain urls */
  public static void validateURLFPSerializationMultiDomain() {
    ByteArrayOutputStream byteStream = new ByteArrayOutputStream();

    Builder firstBuilder = new Builder(FLAG_ARCHIVE_SEGMENT_ID);

    TreeMultimap<Integer, URLFP> sourceMap = TreeMultimap.create();
    TreeMultimap<Integer, URLFP> destMap = TreeMultimap.create();

    // single top level domain with one sub domain and two urls
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_1, 1);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_2, 1);

    // top level domain with matching sub domain and two urls
    insertURLFPItem(sourceMap, DOMAIN_2_SUBDOMAIN_1_URL_1, 1);
    insertURLFPItem(sourceMap, DOMAIN_2_SUBDOMAIN_1_URL_2, 1);

    // two sub domains with two urls each
    insertURLFPItem(sourceMap, DOMAIN_3_SUBDOMAIN_1_URL_1, 1);
    insertURLFPItem(sourceMap, DOMAIN_3_SUBDOMAIN_1_URL_2, 1);
    insertURLFPItem(sourceMap, DOMAIN_3_SUBDOMAIN_2_URL_1, 1);
    insertURLFPItem(sourceMap, DOMAIN_3_SUBDOMAIN_2_URL_2, 1);

    addMapToBuilder(firstBuilder, sourceMap);

    try {
      roundTrip(firstBuilder, byteStream, destMap);

      // dump both lists
      dumpMap("SourceFP", sourceMap);
      dumpMap("DestFP", destMap);

      Assert.assertTrue(sourceMap.equals(destMap));
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  /** round trip urls that all live on a single sub domain */
  public static void validateURLFPSerializationSingleSubDomain() {
    ByteArrayOutputStream byteStream = new ByteArrayOutputStream();

    Builder firstBuilder = new Builder(FLAG_ARCHIVE_SEGMENT_ID);

    TreeMultimap<Integer, URLFP> sourceMap = TreeMultimap.create();
    TreeMultimap<Integer, URLFP> destMap = TreeMultimap.create();

    // single top level domain with one sub domain and ten urls
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_1 + "0", 1);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_2 + "1", 1);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_1 + "2", 1);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_2 + "3", 1);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_1 + "4", 1);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_2 + "5", 1);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_1 + "6", 1);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_2 + "7", 1);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_1 + "8", 1);
    insertURLFPItem(sourceMap, DOMAIN_1_SUBDOMAIN_1_URL_2 + "9", 1);

    addMapToBuilder(firstBuilder, sourceMap);

    try {
      roundTrip(firstBuilder, byteStream, destMap);

      // dump both lists
      dumpMap("SourceFP", sourceMap);
      dumpMap("DestFP", destMap);

      Assert.assertTrue(sourceMap.equals(destMap));
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  /** round trip urls that all live directly on a root domain */
  public static void validateURLFPSerializationRootDomain() {
    ByteArrayOutputStream byteStream = new ByteArrayOutputStream();

    Builder firstBuilder = new Builder(FLAG_ARCHIVE_SEGMENT_ID);

    TreeMultimap<Integer, URLFP> sourceMap = TreeMultimap.create();
    TreeMultimap<Integer, URLFP> destMap = TreeMultimap.create();

    // top level domain with matching sub domain and six urls
    insertURLFPItem(sourceMap, DOMAIN_2_SUBDOMAIN_1_URL_1 + "0", 1);
    insertURLFPItem(sourceMap, DOMAIN_2_SUBDOMAIN_1_URL_2 + "1", 1);
    insertURLFPItem(sourceMap, DOMAIN_2_SUBDOMAIN_1_URL_1 + "2", 1);
    insertURLFPItem(sourceMap, DOMAIN_2_SUBDOMAIN_1_URL_2 + "3", 1);
    insertURLFPItem(sourceMap, DOMAIN_2_SUBDOMAIN_1_URL_1 + "4", 1);
    insertURLFPItem(sourceMap, DOMAIN_2_SUBDOMAIN_1_URL_2 + "5", 1);

    addMapToBuilder(firstBuilder, sourceMap);

    try {
      roundTrip(firstBuilder, byteStream, destMap);

      // dump both lists
      dumpMap("SourceFP", sourceMap);
      dumpMap("DestFP", destMap);

      System.out.println("Buffer Size:" + byteStream.size() + " URLCount:"
          + firstBuilder.getLinkMap().size());

      Assert.assertTrue(sourceMap.equals(destMap));
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}