/*
 * Copyright [2012-2014] PayPal Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ml.shifu.shifu.fs;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Scanner;
import java.util.zip.GZIPInputStream;

import ml.shifu.shifu.container.obj.ColumnConfig;
import ml.shifu.shifu.container.obj.EvalConfig;
import ml.shifu.shifu.container.obj.RawSourceData.SourceType;
import ml.shifu.shifu.util.CommonUtils;
import ml.shifu.shifu.util.Constants;
import ml.shifu.shifu.util.HDFSUtils;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xerial.snappy.SnappyInputStream;

/**
 * ShifuFileUtils encapsulates the file system interface away from other components.
 * It provides functions for all kinds of file operations.
 * <p>
 * Callers need to pass the file path and SourceType to do file operations.
 */
public class ShifuFileUtils {

    private static final Logger log = LoggerFactory.getLogger(ShifuFileUtils.class);

    // prevent users from creating instances
    private ShifuFileUtils() {
    }
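    /**
     * Illustrative usage only, not part of the original API: a minimal sketch of creating a working
     * directory and an empty marker file inside it. The "tmp/shifu-demo" path is hypothetical.
     */
    @SuppressWarnings("unused")
    private static void createDirAndFileExample() throws IOException {
        // creating the directory is a no-op if it already exists
        createDirIfNotExists("tmp/shifu-demo", SourceType.LOCAL);
        // returns false when the file is already there
        boolean created = createFileIfNotExists("tmp/shifu-demo/_READY", SourceType.LOCAL);
        log.info("Marker file newly created: {}", created);
    }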
    /**
     * Create an empty file if the file doesn't exist.
     * If the file already exists, this method does nothing and just returns false.
     *
     * @param path
     *            - file path to create
     * @param sourceType
     *            - where to create the file
     * @return - true if a file was created, or false
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    public static boolean createFileIfNotExists(String path, SourceType sourceType) throws IOException {
        return getFileSystemBySourceType(sourceType).createNewFile(new Path(path));
    }

    /**
     * Create a directory if the directory doesn't exist.
     * If the directory already exists, this method does nothing.
     *
     * @param sourceFile
     *            - source file
     * @return operation status
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    public static boolean createDirIfNotExists(SourceFile sourceFile) throws IOException {
        return createDirIfNotExists(sourceFile.getPath(), sourceFile.getSourceType());
    }

    /**
     * Create a directory if the directory doesn't exist.
     * If the directory already exists, this method does nothing.
     *
     * @param path
     *            - directory path
     * @param sourceType
     *            - local/hdfs
     * @return operation status
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    public static boolean createDirIfNotExists(String path, SourceType sourceType) throws IOException {
        return getFileSystemBySourceType(sourceType).mkdirs(new Path(path));
    }

    /**
     * Get a buffered writer with <code>{@link Constants#DEFAULT_CHARSET}</code> for the source file.
     * !!! Notice: if the file exists, it will be overwritten.
     * !!! Warning: the writer instance should be closed by the caller.
     *
     * @param sourceFile
     *            - source file
     * @return buffered writer with <code>{@link Constants#DEFAULT_CHARSET}</code>
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    public static BufferedWriter getWriter(SourceFile sourceFile) throws IOException {
        return getWriter(sourceFile.getPath(), sourceFile.getSourceType());
    }

    /**
     * Get a buffered writer with <code>{@link Constants#DEFAULT_CHARSET}</code> for the specified file.
     * !!! Notice: if the file exists, it will be overwritten.
     * !!! Warning: the writer instance should be closed by the caller.
     *
     * @param path
     *            - file path
     * @param sourceType
     *            - local/hdfs
     * @return buffered writer with <code>{@link Constants#DEFAULT_CHARSET}</code>
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    public static BufferedWriter getWriter(String path, SourceType sourceType) throws IOException {
        return new BufferedWriter(new OutputStreamWriter(
                getFileSystemBySourceType(sourceType).create(new Path(path)), Constants.DEFAULT_CHARSET));
    }

    /**
     * Get a buffered reader with <code>{@link Constants#DEFAULT_CHARSET}</code> for the specified file.
     * <p>
     * !!! Warning: the reader instance should be closed by the caller.
     *
     * @param sourceFile
     *            - source file with <code>{@link Constants#DEFAULT_CHARSET}</code>
     * @return buffered reader
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    public static BufferedReader getReader(SourceFile sourceFile) throws IOException {
        return getReader(sourceFile.getPath(), sourceFile.getSourceType());
    }

    /**
     * Get a buffered reader with <code>{@link Constants#DEFAULT_CHARSET}</code> for the specified file.
     * <p>
     * !!! Warning: the reader instance should be closed by the caller.
     *
     * @param path
     *            - file path
     * @param sourceType
     *            - local/hdfs
     * @return buffered reader with <code>{@link Constants#DEFAULT_CHARSET}</code>
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    public static BufferedReader getReader(String path, SourceType sourceType) throws IOException {
        try {
            return new BufferedReader(new InputStreamReader(getCompressInputStream(
                    getFileSystemBySourceType(sourceType).open(new Path(path)), new Path(path)),
                    Constants.DEFAULT_CHARSET));
        } catch (IOException e) {
            // To manually work around an issue where the FileSystem is closed unexpectedly, renew the
            // FileSystem object here so that reads still go through in such cases.
            if(e.getMessage() != null && e.getMessage().toLowerCase().indexOf("filesystem closed") >= 0
                    && sourceType == SourceType.HDFS) {
                return new BufferedReader(new InputStreamReader(HDFSUtils.renewFS().open(new Path(path)),
                        Constants.DEFAULT_CHARSET));
            }
            throw e;
        }
    }

    private static InputStream getCompressInputStream(FSDataInputStream fdis, Path path) throws IOException {
        String name = path.getName();
        if(name.toLowerCase().endsWith(".gz")) {
            return new GZIPInputStream(fdis);
        } else if(name.toLowerCase().endsWith(".bz2")) {
            return new BZip2CompressorInputStream(fdis);
        } else if(name.toLowerCase().endsWith(".snappy")) {
            return new SnappyInputStream(fdis);
        } else {
            return fdis;
        }
    }
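    /**
     * Illustrative usage only, not part of the original API: a minimal sketch of the write-then-read
     * round trip with {@link #getWriter(String, SourceType)} and {@link #getReader(String, SourceType)}.
     * The "tmp/shifu-demo/data.txt" path is hypothetical; note that getWriter overwrites an existing
     * file and both instances must be closed by the caller.
     */
    @SuppressWarnings("unused")
    private static void readerWriterExample() throws IOException {
        BufferedWriter writer = getWriter("tmp/shifu-demo/data.txt", SourceType.LOCAL);
        try {
            writer.write("col1|col2|col3");
            writer.newLine();
        } finally {
            writer.close();
        }

        BufferedReader reader = getReader("tmp/shifu-demo/data.txt", SourceType.LOCAL);
        try {
            // compression is detected from the file suffix (.gz/.bz2/.snappy), so plain text is read as-is
            log.info("First line: {}", reader.readLine());
        } finally {
            reader.close();
        }
    }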
    /**
     * Get the data scanners for a list of specified paths.
     * If a path is a directory, get scanners for all of its normal sub-files;
     * if a path is a normal file, get its scanner.
     * !!! Notice: all hidden files (file names starting with ".") will be skipped.
     * !!! Warning: scanner instances should be closed by the caller.
     *
     * @param paths
     *            - file paths to get the scanners for
     * @param sourceType
     *            - local/hdfs
     * @return scanners for the specified paths
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    public static List<Scanner> getDataScanners(List<String> paths, SourceType sourceType) throws IOException {
        if(paths == null || sourceType == null) {
            throw new IllegalArgumentException("paths should not be null, sourceType should not be null.");
        }
        List<Scanner> scanners = new ArrayList<Scanner>();
        for(String path: paths) {
            scanners.addAll(getDataScanners(path, sourceType));
        }
        return scanners;
    }

    public static List<Scanner> getDataScanners(String path, SourceType sourceType) throws IOException {
        return getDataScanners(path, sourceType, null);
    }

    /**
     * Get the data scanners for the specified path.
     * If the path is a directory, get scanners for all of its normal sub-files;
     * if the path is a normal file, get its scanner.
     * !!! Notice: all hidden files (file names starting with ".") will be skipped.
     * !!! Warning: scanner instances should be closed by the caller.
     *
     * @param path
     *            - file path to get the scanner for
     * @param sourceType
     *            - local/hdfs
     * @param pathFilter
     *            the path filter
     * @return scanners for the specified path
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    @SuppressWarnings("deprecation")
    public static List<Scanner> getDataScanners(String path, SourceType sourceType, final PathFilter pathFilter)
            throws IOException {
        FileSystem fs = getFileSystemBySourceType(sourceType);

        FileStatus[] listStatus;
        Path p = new Path(path);
        if(fs.getFileStatus(p).isDir()) {
            // for folders we need to filter pig header files
            listStatus = fs.listStatus(p, new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    boolean hiddenOrSuccessFile = path.getName().startsWith(Constants.HIDDEN_FILES)
                            || path.getName().equalsIgnoreCase("_SUCCESS");
                    if(pathFilter != null) {
                        return !hiddenOrSuccessFile && pathFilter.accept(path);
                    } else {
                        return !hiddenOrSuccessFile;
                    }
                }
            });
        } else {
            listStatus = new FileStatus[] { fs.getFileStatus(p) };
        }

        if(listStatus.length > 1) {
            Arrays.sort(listStatus, new Comparator<FileStatus>() {
                @Override
                public int compare(FileStatus f1, FileStatus f2) {
                    return f1.getPath().getName().compareTo(f2.getPath().getName());
                }
            });
        }

        List<Scanner> scanners = new ArrayList<Scanner>();
        for(FileStatus f: listStatus) {
            String filename = f.getPath().getName();
            if(f.isDir()) {
                log.warn("Skip - {}, since it's a directory, please check your configuration.", filename);
                continue;
            }

            log.debug("Creating Scanner for file: {} ", filename);
            if(filename.endsWith(Constants.GZ_SUFFIX)) {
                scanners.add(new Scanner(new GZIPInputStream(fs.open(f.getPath())), Constants.DEFAULT_CHARSET));
            } else if(filename.endsWith(Constants.BZ2_SUFFIX)) {
                scanners.add(new Scanner(new BZip2CompressorInputStream(fs.open(f.getPath())),
                        Constants.DEFAULT_CHARSET));
            } else {
                scanners.add(new Scanner(new BufferedInputStream(fs.open(f.getPath())), Constants.DEFAULT_CHARSET));
            }
        }

        return scanners;
    }
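    /**
     * Illustrative usage only, not part of the original API: a minimal sketch of consuming data scanners
     * and closing every one of them afterwards. The "tmp/shifu-demo/normalized" path is hypothetical.
     */
    @SuppressWarnings("unused")
    private static void dataScannerExample() throws IOException {
        List<Scanner> scanners = getDataScanners("tmp/shifu-demo/normalized", SourceType.LOCAL);
        try {
            for(Scanner scanner: scanners) {
                while(scanner.hasNextLine()) {
                    log.info(scanner.nextLine());
                }
            }
        } finally {
            // getDataScanners does not close the scanners, so the caller must close them
            for(Scanner scanner: scanners) {
                scanner.close();
            }
        }
    }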
    /**
     * Get the data scanners for the specified source file.
     * If the path is a directory, get scanners for all of its normal sub-files;
     * if the path is a normal file, get its scanner.
     * !!! Notice: all hidden files (file names starting with ".") will be skipped.
     * !!! Warning: scanner instances should be closed by the caller.
     *
     * @param sourceFile
     *            - source file
     * @return scanners for the source file
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    public static List<Scanner> getDataScanners(SourceFile sourceFile) throws IOException {
        return getDataScanners(sourceFile.getPath(), sourceFile.getSourceType());
    }

    /**
     * Copy the src file to the dst file in the same FileSystem, such as copying a local source to a local
     * destination, or an hdfs source to an hdfs destination.
     *
     * @param srcPath
     *            - source file to copy
     * @param destPath
     *            - destination file
     * @param sourceType
     *            - local/hdfs
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    public static void copy(String srcPath, String destPath, SourceType sourceType) throws IOException {
        if(StringUtils.isEmpty(srcPath) || StringUtils.isEmpty(destPath) || sourceType == null) {
            throw new IllegalArgumentException(String.format(
                    "Null or empty parameters srcDataPath:%s, dstDataPath:%s, sourceType:%s", srcPath, destPath,
                    sourceType));
        }

        FileSystem fs = getFileSystemBySourceType(sourceType);
        // delete everything at the destination first, since dstDataPath may be a folder with different contents
        if(!fs.delete(new Path(destPath), true)) {
            // it's ok to ignore a failed delete
        }

        FileUtil.copy(fs, new Path(srcPath), fs, new Path(destPath), false, new Configuration());
    }
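    /**
     * Illustrative usage only, not part of the original API: a minimal sketch of copying a file within
     * one file system after checking that the source exists. Both paths are hypothetical.
     */
    @SuppressWarnings("unused")
    private static void copyExample() throws IOException {
        if(isFileExists("tmp/shifu-demo/model.nn", SourceType.LOCAL)) {
            // the destination is deleted first, so the copy effectively overwrites it
            copy("tmp/shifu-demo/model.nn", "tmp/shifu-demo/model-backup.nn", SourceType.LOCAL);
        }
    }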
    /**
     * Check whether the path is a directory or not; the SourceType is used to find the file system.
     *
     * @param sourceFile
     *            - source file
     * @return - true if the file is a directory, or false
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    public static boolean isDir(SourceFile sourceFile) throws IOException {
        return isDir(sourceFile.getPath(), sourceFile.getSourceType());
    }

    /**
     * Check whether the path is a directory or not; the SourceType is used to find the file system.
     *
     * @param path
     *            - the path of the source file
     * @param sourceType
     *            - SourceType to find the file system
     * @return - true if the file is a directory, or false
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    @SuppressWarnings("deprecation")
    public static boolean isDir(String path, SourceType sourceType) throws IOException {
        FileSystem fs = getFileSystemBySourceType(sourceType);
        FileStatus status = fs.getFileStatus(new Path(path));
        return status.isDir();
    }

    /**
     * Check whether the file exists, according to the SourceType.
     *
     * @param sourceFile
     *            - source file
     * @return - true if the file exists, or false
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    public static boolean isFileExists(SourceFile sourceFile) throws IOException {
        return isFileExists(sourceFile.getPath(), sourceFile.getSourceType());
    }

    /**
     * Check whether the file exists, according to the SourceType.
     *
     * @param path
     *            - path of the source file
     * @param sourceType
     *            - local/hdfs
     * @return - true if the file exists, or false
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    public static boolean isFileExists(String path, SourceType sourceType) throws IOException {
        return isFileExists(new Path(path), sourceType);
    }

    /**
     * Check whether the file exists, according to the SourceType.
     *
     * @param path
     *            - {@link Path} of the source file
     * @param sourceType
     *            - local/hdfs
     * @return - true if the file exists, or false
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    public static boolean isFileExists(Path path, SourceType sourceType) throws IOException {
        FileSystem fs = getFileSystemBySourceType(sourceType);
        FileStatus[] fileStatusArr = fs.globStatus(path);
        return !(fileStatusArr == null || fileStatusArr.length == 0);
    }

    /**
     * Delete the file or directory recursively.
     *
     * @param sourceFile
     *            - source file to check
     * @return operation status
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    public static boolean deleteFile(SourceFile sourceFile) throws IOException {
        return deleteFile(sourceFile.getPath(), sourceFile.getSourceType());
    }

    /**
     * Delete the file or directory recursively.
     *
     * @param path
     *            - file or directory
     * @param sourceType
     *            - file source [local/HDFS]
     * @return operation status
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    public static boolean deleteFile(String path, SourceType sourceType) throws IOException {
        FileSystem fs = getFileSystemBySourceType(sourceType);
        return fs.delete(new Path(path), true);
    }
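    /**
     * Illustrative usage only, not part of the original API: a minimal sketch of a guarded cleanup.
     * Since isFileExists uses globStatus internally, the path may contain a glob pattern. The
     * "tmp/shifu-demo/tmp-output" path is hypothetical.
     */
    @SuppressWarnings("unused")
    private static void deleteExample() throws IOException {
        if(isFileExists("tmp/shifu-demo/tmp-output/part-*", SourceType.LOCAL)) {
            // recursively deletes the whole directory
            deleteFile("tmp/shifu-demo/tmp-output", SourceType.LOCAL);
        }
    }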
    /**
     * Expand the file path, allowing the user to use globs just like with `hadoop fs`.
     * According to the glob rules, patterns like "{2,3}" and "*" are allowed.
     *
     * @param rawPath
     *            - the raw file path that may contain a glob pattern
     * @param sourceType
     *            - file source [local/HDFS]
     * @return - the file path list after expansion
     * @throws IOException
     *             - if any I/O exception occurs in processing
     */
    public static List<String> expandPath(String rawPath, SourceType sourceType) throws IOException {
        FileSystem fs = getFileSystemBySourceType(sourceType);
        FileStatus[] fsArr = fs.globStatus(new Path(rawPath));

        List<String> filePathList = new ArrayList<String>();
        if(fsArr != null) {
            for(FileStatus fileStatus: fsArr) {
                filePathList.add(fileStatus.getPath().toString());
            }
        }

        return filePathList;
    }

    /**
     * Get the FileSystem according to the source type.
     *
     * @param sourceType
     *            - which kind of file system
     * @return - file system handler
     */
    public static FileSystem getFileSystemBySourceType(SourceType sourceType) {
        if(sourceType == null) {
            throw new IllegalArgumentException("sourceType should not be null.");
        }

        switch(sourceType) {
            case HDFS:
                return HDFSUtils.getFS();
            case LOCAL:
                return HDFSUtils.getLocalFS();
            default:
                throw new IllegalStateException(String.format("No such source type - %s.", sourceType));
        }
    }

    public static List<ColumnConfig> searchColumnConfig(EvalConfig config, List<ColumnConfig> configList)
            throws IOException {
        String path = config.getModelsPath();
        if(StringUtils.isNotEmpty(path)) {
            FileSystem fs = ShifuFileUtils.getFileSystemBySourceType(config.getDataSet().getSource());
            // walk up the model path, looking for the nearest ColumnConfig.json
            while(path.indexOf("/") > 0) {
                path = path.substring(0, path.lastIndexOf("/"));
                Path columnConfigFile = new Path(path + "/ColumnConfig.json");
                if(fs.exists(columnConfigFile)) {
                    log.info("Using column config file: {}", columnConfigFile.toString());
                    return CommonUtils.loadColumnConfigList(columnConfigFile.toString(), config.getDataSet()
                            .getSource());
                }
            }
        }
        return configList;
    }

    public static List<String> readFilePartsIntoList(String filePath, SourceType sourceType) throws IOException {
        List<String> lines = new ArrayList<String>();

        FileSystem fs = getFileSystemBySourceType(sourceType);
        FileStatus[] fileStatsArr = getFilePartStatus(filePath, sourceType);

        CompressionCodecFactory compressionFactory = new CompressionCodecFactory(new Configuration());
        for(FileStatus fileStatus: fileStatsArr) {
            InputStream is = null;
            CompressionCodec codec = compressionFactory.getCodec(fileStatus.getPath());
            if(codec != null) {
                is = codec.createInputStream(fs.open(fileStatus.getPath()));
            } else {
                is = fs.open(fileStatus.getPath());
            }

            lines.addAll(IOUtils.readLines(is));
            IOUtils.closeQuietly(is);
        }

        return lines;
    }

    public static FileStatus[] getFilePartStatus(String filePath, SourceType sourceType) throws IOException {
        FileSystem fs = getFileSystemBySourceType(sourceType);
        // only "part*" files are matched; other files like _SUCCESS are skipped
        FileStatus[] fileStatsArr = fs.listStatus(new Path(filePath), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().startsWith("part");
            }
        });
        return fileStatsArr;
    }

    public static int getFilePartCount(String filePath, SourceType sourceType) throws IOException {
        FileStatus[] fileStatsArr = getFilePartStatus(filePath, sourceType);
        return fileStatsArr.length;
    }
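    /**
     * Illustrative usage only, not part of the original API: a minimal sketch of expanding a glob pattern
     * and counting the "part*" files of a job output folder. Both paths are hypothetical.
     */
    @SuppressWarnings("unused")
    private static void globAndPartExample() throws IOException {
        // expands the glob just like `hadoop fs -ls` would
        List<String> matched = expandPath("tmp/shifu-demo/output/part-*", SourceType.LOCAL);
        log.info("Matched {} files.", matched.size());

        // getFilePartCount lists the files starting with "part" under the folder
        int parts = getFilePartCount("tmp/shifu-demo/output", SourceType.LOCAL);
        log.info("Found {} part files.", parts);
    }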
    public static long getFileOrDirectorySize(String filePath, SourceType sourceType) throws IOException {
        long size = 0;
        FileStatus[] fileStatsArr = getFilePartStatus(filePath, sourceType);
        for(FileStatus fileStats: fileStatsArr) {
            size += fileStats.getLen();
        }
        return size;
    }
}