/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.hadoop.io; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.Collection; import java.util.HashMap; import java.util.Map; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.JobConfigurable; import org.commoncrawl.util.EscapeUtils; /** * An {@link ARCSource} for local files. * * @author Albert Chern */ public class LocalARCSource extends ARCSplitCalculator implements ARCSource, JobConfigurable { /** * <tt>local.arc.source.inputs</tt> - the property where the list of inputs is * stored. * * @see #setInputs * @see #getInputs */ public static final String P_INPUTS = "local.arc.source.inputs"; /** * Returns the list of inputs set by {@link setInputs}. * * @param job * the job to get the inputs from * * @return the list of inputs, or <tt>null</tt> if not set */ public static String[] getInputs(JobConf job) { String inputs = job.get(P_INPUTS); return inputs == null ? null : EscapeUtils.split(',', inputs); } /** * Sets the list of inputs that will be processed. * * <p> * Paths to add should either be for gzipped ARC files, or directories * containing gzipped ARC files. * * @param job * the job to set the inputs for * @param paths * the paths to set as inputs * * @see #P_INPUTS */ public static void setInputs(JobConf job, String... paths) { job.set(P_INPUTS, EscapeUtils.concatenate(',', paths)); } /** * @inheritDoc */ @Override protected Collection<ARCResource> getARCResources(JobConf job) throws IOException { String[] inputs = getInputs(job); if (inputs == null) { throw new IllegalArgumentException("No inputs set"); } Map<String, ARCResource> resources = new HashMap<String, ARCResource>(); for (String input : inputs) { File file = new File(input); File[] files = file.isDirectory() ? file.listFiles() : new File[] { file }; for (File f : files) { String path = f.getCanonicalPath(); resources.put(path, new ARCResource(path, f.length())); } } return resources.values(); } /** * @inheritDoc */ public InputStream getStream(String resource, long streamPosition, Throwable lastError, int previousFailures) throws Throwable { if (lastError != null || previousFailures > 0) { // Don't retry...local IO failures are not expected return null; } if (streamPosition != 0) { // This shouldn't happen, but we'll check just in case throw new RuntimeException("Non-zero position requested"); } return new FileInputStream(resource); } }