/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.hadoop.io;
import java.io.IOException;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
/**
* Calculates splits based on the desired number of files per split and the
* desired size of each split.
*
* <p>
* Concrete implementations should override {@link #getARCResources}.
*
* @author Albert Chern
*/
public abstract class ARCSplitCalculator implements JobConfigurable {

  /**
   * {@code arc.split.calculator.files.per.split} - the property where the
   * number of files per input split is stored.
   *
   * @see #setFilesPerSplit
   */
  public static final String P_FILES_PER_SPLIT = "arc.split.calculator.files.per.split";

  /**
   * {@code arc.split.calculator.mb.per.split} - the property where the desired
   * size in megabytes of the split is stored.
   *
   * @see #setMegabytesPerSplit
   */
  public static final String P_MB_PER_SPLIT = "arc.split.calculator.mb.per.split";

  /** Number of bytes in one megabyte, used to convert {@link #P_MB_PER_SPLIT}. */
  private static final long BYTES_PER_MB = 1024L * 1024L;

  /**
   * Sets the desired number of files per input split.
   *
   * <p>
   * Default is 1. The value must be at least 1; {@link #configure} rejects
   * anything smaller.
   *
   * @param job
   *          the job to set the number of files per split for
   * @param filesPerSplit
   *          the desired number of ARC files per split
   *
   * @see #P_FILES_PER_SPLIT
   */
  public static final void setFilesPerSplit(JobConf job, int filesPerSplit) {
    job.setInt(P_FILES_PER_SPLIT, filesPerSplit);
  }

  /**
   * Sets the desired number of megabytes per split.
   *
   * <p>
   * New files will be added to a split until the total size of the split
   * reaches or exceeds this threshold. Default is no limit.
   *
   * @param job
   *          the job to set the number of megabytes per split for
   * @param mbPerSplit
   *          the desired number of megabytes per split
   *
   * @see #P_MB_PER_SPLIT
   */
  public static final void setMegabytesPerSplit(JobConf job, int mbPerSplit) {
    job.setInt(P_MB_PER_SPLIT, mbPerSplit);
  }

  // Both limits start at their documented defaults so that getARCSplits()
  // behaves sensibly even if configure() has not been called yet.
  private int filesPerSplit = 1;
  private long bytesPerSplit = Long.MAX_VALUE;

  /**
   * Converts a megabyte count to bytes, saturating at the {@code long} range
   * instead of silently wrapping around on overflow.
   *
   * @param megabytes
   *          the size in megabytes
   *
   * @return the size in bytes, clamped to
   *         [{@link Long#MIN_VALUE}, {@link Long#MAX_VALUE}]
   */
  private static long megabytesToBytes(long megabytes) {
    if (megabytes > Long.MAX_VALUE / BYTES_PER_MB) {
      return Long.MAX_VALUE;
    }
    if (megabytes < Long.MIN_VALUE / BYTES_PER_MB) {
      return Long.MIN_VALUE;
    }
    return megabytes * BYTES_PER_MB;
  }

  /**
   * Turns the pending resources into a new split, appends it to
   * <tt>splits</tt> and empties the pending buffer. Does nothing when there
   * are no pending resources, so no empty split is ever produced.
   *
   * @param splits
   *          the list of splits built so far
   * @param pending
   *          the resources collected for the next split; cleared on return
   */
  private static void addSplit(List<ARCSplit> splits, List<ARCResource> pending) {
    if (!pending.isEmpty()) {
      splits.add(new ARCSplit(pending.toArray(new ARCResource[pending.size()])));
      pending.clear();
    }
  }

  /**
   * Reads the split limits from the job and then delegates to
   * {@link #configureImpl} for subclass-specific configuration.
   *
   * <p>
   * {@link #P_FILES_PER_SPLIT} defaults to 1 when unset. When
   * {@link #P_MB_PER_SPLIT} is unset there is no size limit.
   *
   * @param job
   *          the {@link JobConf} of the job
   *
   * @throws IllegalArgumentException
   *           if the configured number of files per split is less than 1
   * @throws NumberFormatException
   *           if {@link #P_MB_PER_SPLIT} is set but is not a valid
   *           <tt>long</tt>
   */
  public final void configure(JobConf job) {
    int configuredFiles = job.getInt(P_FILES_PER_SPLIT, 1);
    if (configuredFiles < 1) {
      throw new IllegalArgumentException(
          P_FILES_PER_SPLIT + " must be at least 1, but was " + configuredFiles);
    }
    filesPerSplit = configuredFiles;

    // Look the property up once; absence means "no size limit".
    String mbPerSplit = job.get(P_MB_PER_SPLIT);
    bytesPerSplit = (mbPerSplit == null)
        ? Long.MAX_VALUE
        : megabytesToBytes(Long.parseLong(mbPerSplit));

    configureImpl(job);
  }

  /**
   * Hook for subclass configuration. The default implementation does nothing.
   *
   * @param job
   *          the {@link JobConf} of the job
   *
   * @see JobConfigurable#configure
   */
  protected void configureImpl(JobConf job) {
  }

  /**
   * Given a job, returns the {@link ARCResource}s it should process.
   *
   * @param job
   *          the job for which to get the {@link ARCResource}s
   *
   * @return the {@link ARCResource}s to process
   *
   * @throws IOException
   *           if an IO error occurs
   */
  protected abstract Collection<ARCResource> getARCResources(JobConf job) throws IOException;

  /**
   * Groups the resources returned by {@link #getARCResources} into splits.
   *
   * <p>
   * Resources are taken in iteration order. A split is closed as soon as it
   * holds the configured number of files, or its accumulated size reaches the
   * configured number of bytes, whichever happens first. Any remaining
   * resources form a final, smaller split. If {@link #configure} has not been
   * called, the defaults apply: one file per split and no size limit.
   *
   * @param job
   *          the job for which to calculate the splits
   *
   * @return the calculated splits; empty if there are no resources
   *
   * @throws IOException
   *           if an IO error occurs while listing the resources
   */
  public ARCSplit[] getARCSplits(JobConf job) throws IOException {
    List<ARCSplit> splits = new LinkedList<ARCSplit>();

    // A growable buffer is used rather than an array of filesPerSplit slots,
    // so a very large file limit does not force a very large allocation.
    List<ARCResource> pending = new LinkedList<ARCResource>();
    long pendingBytes = 0;

    for (ARCResource resource : getARCResources(job)) {
      pending.add(resource);
      pendingBytes += resource.getSize();

      // When the split is big enough, add it
      if (pending.size() >= filesPerSplit || pendingBytes >= bytesPerSplit) {
        addSplit(splits, pending);
        pendingBytes = 0;
      }
    }

    // Add the final split
    addSplit(splits, pending);
    return splits.toArray(new ARCSplit[splits.size()]);
  }
}