/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.hadoop.io; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.util.Arrays; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.InputSplit; /** * A map reduce input split for gzipped ARC files. * * @author Albert Chern */ public class ARCSplit implements InputSplit { /** * The {@link ARCResource}s for this split. */ private ARCResource[] resources; /** * The total size of this split in bytes. */ private long size; /** * The hosts that the resources are located on. */ private String[] hosts; /** * Default constructor for Hadoop. */ public ARCSplit() { } /** * Constructs an <tt>ARCSplit</tt> for {@link ARCResources}. * * @param resources * the resource identifiers for this split's ARC files */ public ARCSplit(ARCResource[] resources) { this(resources, new String[0]); } /** * Constructs an <tt>ARCSplit</tt> for {@link ARCResource}s whose locations * are known. * * @param resources * the {@link ARCResource}s for this split's ARC files * @param hosts * the hosts that the resources are located on */ public ARCSplit(ARCResource[] resources, String[] hosts) { this.resources = resources; this.hosts = hosts; for (ARCResource resource : resources) { size += resource.getSize(); } } /** * @inheritDoc */ public long getLength() throws IOException { return size; } /** * @inheritDoc */ public String[] getLocations() throws IOException { return hosts; } /** * Returns the resources for this split. */ public ARCResource[] getResources() { return resources; } /** * @inheritDoc */ public void readFields(DataInput in) throws IOException { int nResources = in.readInt(); resources = new ARCResource[nResources]; for (int i = 0; i < nResources; i++) { resources[i] = new ARCResource(Text.readString(in), in.readLong()); } size = in.readLong(); hosts = null; } /** * @inheritDoc */ @Override public String toString() { return "Resources: " + Arrays.toString(resources) + " Size: " + size + " Hosts: " + Arrays.toString(hosts); } /** * @inheritDoc */ public void write(DataOutput out) throws IOException { out.writeInt(resources.length); for (ARCResource resource : resources) { resource.write(out); } out.writeLong(size); // The hosts are only used on the client side, so don't serialize them } }