/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.common;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public final class HadoopUtil {

  private static final Logger log = LoggerFactory.getLogger(HadoopUtil.class);

  private HadoopUtil() { }

  /**
   * Create a map-only Hadoop Job out of the passed in parameters.  Does not set the
   * Job name.
   *
   * @see #getCustomJobName(String, org.apache.hadoop.mapreduce.JobContext, Class, Class)
   */
  public static Job prepareJob(Path inputPath,
                               Path outputPath,
                               Class<? extends InputFormat> inputFormat,
                               Class<? extends Mapper> mapper,
                               Class<? extends Writable> mapperKey,
                               Class<? extends Writable> mapperValue,
                               Class<? extends OutputFormat> outputFormat,
                               Configuration conf) throws IOException {

    Job job = new Job(new Configuration(conf));
    Configuration jobConf = job.getConfiguration();

    if (mapper.equals(Mapper.class)) {
      throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
    }
    job.setJarByClass(mapper);

    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath.toString());

    job.setMapperClass(mapper);
    job.setMapOutputKeyClass(mapperKey);
    job.setMapOutputValueClass(mapperValue);
    job.setOutputKeyClass(mapperKey);
    job.setOutputValueClass(mapperValue);
    jobConf.setBoolean("mapred.compress.map.output", true);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath.toString());

    return job;
  }
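  // Usage sketch for the map-only prepareJob variant (illustrative only; MyMapper, the key/value
  // classes and the paths below are hypothetical and not part of this class). The caller is still
  // responsible for naming and submitting the job:
  //
  //   Configuration conf = new Configuration();
  //   Job job = HadoopUtil.prepareJob(new Path("input"), new Path("output"),
  //       SequenceFileInputFormat.class, MyMapper.class, Text.class, VectorWritable.class,
  //       SequenceFileOutputFormat.class, conf);
  //   job.setJobName("my-map-only-job");
  //   boolean succeeded = job.waitForCompletion(true);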
  /**
   * Create a map and reduce Hadoop job.  Does not set the name on the job.
   * @param inputPath The input {@link org.apache.hadoop.fs.Path}
   * @param outputPath The output {@link org.apache.hadoop.fs.Path}
   * @param inputFormat The {@link org.apache.hadoop.mapreduce.InputFormat}
   * @param mapper The {@link org.apache.hadoop.mapreduce.Mapper} class to use
   * @param mapperKey The {@link org.apache.hadoop.io.Writable} key class.  If the Mapper is a no-op,
   *                  this value may be null
   * @param mapperValue The {@link org.apache.hadoop.io.Writable} value class.  If the Mapper is a no-op,
   *                    this value may be null
   * @param reducer The {@link org.apache.hadoop.mapreduce.Reducer} to use
   * @param reducerKey The reducer key class.
   * @param reducerValue The reducer value class.
   * @param outputFormat The {@link org.apache.hadoop.mapreduce.OutputFormat}.
   * @param conf The {@link org.apache.hadoop.conf.Configuration} to use.
   * @return The {@link org.apache.hadoop.mapreduce.Job}.
   * @throws IOException if there is a problem with the IO.
   *
   * @see #getCustomJobName(String, org.apache.hadoop.mapreduce.JobContext, Class, Class)
   * @see #prepareJob(org.apache.hadoop.fs.Path, org.apache.hadoop.fs.Path, Class, Class, Class, Class, Class,
   *      org.apache.hadoop.conf.Configuration)
   */
  public static Job prepareJob(Path inputPath,
                               Path outputPath,
                               Class<? extends InputFormat> inputFormat,
                               Class<? extends Mapper> mapper,
                               Class<? extends Writable> mapperKey,
                               Class<? extends Writable> mapperValue,
                               Class<? extends Reducer> reducer,
                               Class<? extends Writable> reducerKey,
                               Class<? extends Writable> reducerValue,
                               Class<? extends OutputFormat> outputFormat,
                               Configuration conf) throws IOException {

    Job job = new Job(new Configuration(conf));
    Configuration jobConf = job.getConfiguration();

    if (reducer.equals(Reducer.class)) {
      if (mapper.equals(Mapper.class)) {
        throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
      }
      job.setJarByClass(mapper);
    } else {
      job.setJarByClass(reducer);
    }

    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath.toString());

    job.setMapperClass(mapper);
    if (mapperKey != null) {
      job.setMapOutputKeyClass(mapperKey);
    }
    if (mapperValue != null) {
      job.setMapOutputValueClass(mapperValue);
    }

    jobConf.setBoolean("mapred.compress.map.output", true);

    job.setReducerClass(reducer);
    job.setOutputKeyClass(reducerKey);
    job.setOutputValueClass(reducerValue);

    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath.toString());

    return job;
  }

  public static String getCustomJobName(String className, JobContext job,
                                        Class<? extends Mapper> mapper,
                                        Class<? extends Reducer> reducer) {
    StringBuilder name = new StringBuilder(100);
    String customJobName = job.getJobName();
    if (customJobName == null || customJobName.trim().isEmpty()) {
      name.append(className);
    } else {
      name.append(customJobName);
    }
    name.append('-').append(mapper.getSimpleName());
    name.append('-').append(reducer.getSimpleName());
    return name.toString();
  }
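  // Usage sketch combining the map/reduce prepareJob variant with getCustomJobName (illustrative
  // only; MyDriver, MyMapper and MyReducer are hypothetical user classes, not part of Mahout):
  //
  //   Job job = HadoopUtil.prepareJob(inputPath, outputPath,
  //       SequenceFileInputFormat.class,
  //       MyMapper.class, Text.class, IntWritable.class,
  //       MyReducer.class, Text.class, IntWritable.class,
  //       SequenceFileOutputFormat.class, conf);
  //   job.setJobName(HadoopUtil.getCustomJobName("MyDriver", job, MyMapper.class, MyReducer.class));
  //   job.waitForCompletion(true);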
  public static void delete(Configuration conf, Iterable<Path> paths) throws IOException {
    if (conf == null) {
      conf = new Configuration();
    }
    for (Path path : paths) {
      FileSystem fs = path.getFileSystem(conf);
      if (fs.exists(path)) {
        log.info("Deleting {}", path);
        fs.delete(path, true);
      }
    }
  }

  public static void delete(Configuration conf, Path... paths) throws IOException {
    delete(conf, Arrays.asList(paths));
  }

  public static long countRecords(Path path, Configuration conf) throws IOException {
    long count = 0;
    Iterator<?> iterator = new SequenceFileValueIterator<Writable>(path, true, conf);
    while (iterator.hasNext()) {
      iterator.next();
      count++;
    }
    return count;
  }

  /**
   * Count all the records in a directory using a
   * {@link org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator}
   *
   * @param path The {@link org.apache.hadoop.fs.Path} to count
   * @param pt The {@link org.apache.mahout.common.iterator.sequencefile.PathType}
   * @param filter Apply the {@link org.apache.hadoop.fs.PathFilter}.  May be null
   * @param conf The Hadoop {@link org.apache.hadoop.conf.Configuration}
   * @return The number of records
   * @throws IOException if there was an IO error
   */
  public static long countRecords(Path path, PathType pt, PathFilter filter, Configuration conf) throws IOException {
    long count = 0;
    Iterator<?> iterator = new SequenceFileDirValueIterator<Writable>(path, pt, filter, null, true, conf);
    while (iterator.hasNext()) {
      iterator.next();
      count++;
    }
    return count;
  }

  public static InputStream openStream(Path path, Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(path.toUri(), conf);
    return fs.open(path.makeQualified(path.toUri(), path));
  }

  public static FileStatus[] getFileStatus(Path path, PathType pathType, PathFilter filter,
                                           Comparator<FileStatus> ordering, Configuration conf) throws IOException {
    FileStatus[] statuses;
    FileSystem fs = path.getFileSystem(conf);
    if (filter == null) {
      statuses = pathType == PathType.GLOB ? fs.globStatus(path) : listStatus(fs, path);
    } else {
      statuses = pathType == PathType.GLOB ? fs.globStatus(path, filter) : listStatus(fs, path, filter);
    }
    if (ordering != null) {
      Arrays.sort(statuses, ordering);
    }
    return statuses;
  }

  public static FileStatus[] listStatus(FileSystem fs, Path path) throws IOException {
    try {
      return fs.listStatus(path);
    } catch (FileNotFoundException e) {
      return new FileStatus[0];
    }
  }

  public static FileStatus[] listStatus(FileSystem fs, Path path, PathFilter filter) throws IOException {
    try {
      return fs.listStatus(path, filter);
    } catch (FileNotFoundException e) {
      return new FileStatus[0];
    }
  }

  public static void cacheFiles(Path fileToCache, Configuration conf) {
    DistributedCache.setCacheFiles(new URI[]{fileToCache.toUri()}, conf);
  }

  /**
   * Returns the first cached file in the list.
   * @param conf - MapReduce Configuration
   * @return Path of Cached file
   * @throws IOException - IO Exception
   * @throws IllegalStateException if no cached files are found
   */
  public static Path getSingleCachedFile(Configuration conf) throws IOException {
    return getCachedFiles(conf)[0];
  }
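  // Usage sketch for the DistributedCache helpers (illustrative only; the path below is hypothetical):
  //
  //   // Driver side: register a single file before submitting the job.
  //   HadoopUtil.cacheFiles(new Path("/models/dictionary.seq"), job.getConfiguration());
  //
  //   // Mapper or Reducer setup(): resolve the cached copy of that file.
  //   Path dictionary = HadoopUtil.getSingleCachedFile(context.getConfiguration());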
  /**
   * Retrieves paths to cached files.
   * @param conf - MapReduce Configuration
   * @return Path[] of Cached Files
   * @throws IOException - IO Exception
   * @throws IllegalStateException if no cache files are found
   */
  public static Path[] getCachedFiles(Configuration conf) throws IOException {
    LocalFileSystem localFs = FileSystem.getLocal(conf);
    Path[] cacheFiles = DistributedCache.getLocalCacheFiles(conf);

    URI[] fallbackFiles = DistributedCache.getCacheFiles(conf);

    // fallback for local execution
    if (cacheFiles == null) {

      Preconditions.checkState(fallbackFiles != null, "Unable to find cached files!");

      cacheFiles = new Path[fallbackFiles.length];
      for (int n = 0; n < fallbackFiles.length; n++) {
        cacheFiles[n] = new Path(fallbackFiles[n].getPath());
      }
    } else {

      for (int n = 0; n < cacheFiles.length; n++) {
        cacheFiles[n] = localFs.makeQualified(cacheFiles[n]);
        // fallback for local execution
        if (!localFs.exists(cacheFiles[n])) {
          cacheFiles[n] = new Path(fallbackFiles[n].getPath());
        }
      }
    }

    Preconditions.checkState(cacheFiles.length > 0, "Unable to find cached files!");

    return cacheFiles;
  }

  public static void setSerializations(Configuration configuration) {
    configuration.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
  }

  public static void writeInt(int value, Path path, Configuration configuration) throws IOException {
    FileSystem fs = FileSystem.get(path.toUri(), configuration);
    try (FSDataOutputStream out = fs.create(path)) {
      out.writeInt(value);
    }
  }

  public static int readInt(Path path, Configuration configuration) throws IOException {
    FileSystem fs = FileSystem.get(path.toUri(), configuration);
    try (FSDataInputStream in = fs.open(path)) {
      return in.readInt();
    }
  }

  /**
   * Builds a comma-separated list of input directories (directories that directly contain files).
   * @param fs - File System
   * @param fileStatus - File Status
   * @return list of directories as a comma-separated String
   * @throws IOException - IO Exception
   */
  public static String buildDirList(FileSystem fs, FileStatus fileStatus) throws IOException {
    boolean containsFiles = false;
    List<String> directoriesList = new ArrayList<>();
    for (FileStatus childFileStatus : fs.listStatus(fileStatus.getPath())) {
      if (childFileStatus.isDir()) {
        String subDirectoryList = buildDirList(fs, childFileStatus);
        directoriesList.add(subDirectoryList);
      } else {
        containsFiles = true;
      }
    }

    if (containsFiles) {
      directoriesList.add(fileStatus.getPath().toUri().getPath());
    }
    return Joiner.on(',').skipNulls().join(directoriesList.iterator());
  }

  /**
   * Builds a comma-separated list of input directories (directories that directly contain files),
   * listing children through the given path filter.
   * @param fs - File System
   * @param fileStatus - File Status
   * @param pathFilter - path filter
   * @return list of directories as a comma-separated String
   * @throws IOException - IO Exception
   */
  public static String buildDirList(FileSystem fs, FileStatus fileStatus, PathFilter pathFilter) throws IOException {
    boolean containsFiles = false;
    List<String> directoriesList = new ArrayList<>();
    for (FileStatus childFileStatus : fs.listStatus(fileStatus.getPath(), pathFilter)) {
      if (childFileStatus.isDir()) {
        String subDirectoryList = buildDirList(fs, childFileStatus);
        directoriesList.add(subDirectoryList);
      } else {
        containsFiles = true;
      }
    }

    if (containsFiles) {
      directoriesList.add(fileStatus.getPath().toUri().getPath());
    }
    return Joiner.on(',').skipNulls().join(directoriesList.iterator());
  }
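  // Usage sketch for buildDirList (illustrative only; the root path below is hypothetical).
  // The returned comma-separated string can be passed to FileInputFormat.setInputPaths(job, dirList):
  //
  //   FileSystem fs = FileSystem.get(conf);
  //   FileStatus root = fs.getFileStatus(new Path("/data/vectors"));
  //   String dirList = HadoopUtil.buildDirList(fs, root);
  //   FileInputFormat.setInputPaths(job, dirList);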
  /**
   * Calculates the path of {@code filePath} relative to the configured {@code baseinputpath}.
   * @param configuration - configuration
   * @param filePath - Input File Path
   * @return relative file Path
   * @throws IOException - IO Exception
   */
  public static String calcRelativeFilePath(Configuration configuration, Path filePath) throws IOException {
    FileSystem fs = filePath.getFileSystem(configuration);
    FileStatus fst = fs.getFileStatus(filePath);
    String currentPath = fst.getPath().toString().replaceFirst("file:", "");

    String basePath = configuration.get("baseinputpath");
    if (!basePath.endsWith("/")) {
      basePath += "/";
    }
    basePath = basePath.replaceFirst("file:", "");

    String[] parts = currentPath.split(basePath);

    if (parts.length == 2) {
      return parts[1];
    } else if (parts.length == 1) {
      return parts[0];
    }
    return currentPath;
  }

  /**
   * Finds a file in the DistributedCache
   *
   * @param partOfFilename a substring of the file name
   * @param localFiles holds references to files stored in distributed cache
   * @return Path to first matched file or null if nothing was found
   */
  public static Path findInCacheByPartOfFilename(String partOfFilename, URI[] localFiles) {
    for (URI distCacheFile : localFiles) {
      log.info("trying to find a file in distributed cache containing [{}] in its name", partOfFilename);
      if (distCacheFile != null && distCacheFile.toString().contains(partOfFilename)) {
        log.info("found file [{}] containing [{}]", distCacheFile.toString(), partOfFilename);
        return new Path(distCacheFile.getPath());
      }
    }
    return null;
  }
}
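// Usage sketch for findInCacheByPartOfFilename (illustrative only; the file name fragment is
// hypothetical), typically called from a Mapper's setup() method:
//
//   URI[] localFiles = DistributedCache.getCacheFiles(context.getConfiguration());
//   Path model = HadoopUtil.findInCacheByPartOfFilename("model.seq", localFiles);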