HdfsUtils.java example

Explorer

flamingo-mapreduce-master
- src
  - main
    - java
      - org
        openflamingo
        mapreduce
        aggregator
        Aggregator.java
        BooleanAndAggregator.java
        BooleanOrAggregator.java
        BooleanOverwriteAggregator.java
        DoubleAverageAggregator.java
        DoubleMaxAggregator.java
        DoubleMinAggregator.java
        DoubleOverwriteAggregator.java
        DoubleProductAggregator.java
        DoubleSumAggregator.java
        FloatAverageAggregator.java
        FloatMaxAggregator.java
        FloatMinAggregator.java
        FloatOverwriteAggregator.java
        FloatProductAggregator.java
        FloatSumAggregator.java
        IntMaxAggregator.java
        IntMinAggregator.java
        IntOverwriteAggregator.java
        IntProductAggregator.java
        IntSumAggregator.java
        LongMaxAggregator.java
        LongMinAggregator.java
        LongOverwriteAggregator.java
        LongProductAggregator.java
        LongSumAggregator.java
        compress
        PooledStreamCompressor.java
        core
        AbstractJob.java
        Constants.java
        Delimiter.java
        PigDriver.java
        WorkflowJob.java
        etl
        MapReduceDriver.java
        accounting
        AccountingDriver.java
        AccountingMapper.java
        aggregate
        AggregateDriver.java
        AggregateMapper.java
        clean
        CleanDriver.java
        CleanMapper.java
        filter
        FilterDriver.java
        FilterMapper.java
        filters
        EmptyColumnFilter.java
        EndWithFilter.java
        EqualNumberFilter.java
        EqualStringFilter.java
        Filter.java
        FilterRegistry.java
        FilterSupport.java
        GreaterThanEqualFilter.java
        GreaterThanFilter.java
        LessThanEqualFilter.java
        LessThanFilter.java
        NotEmptyColumnFilter.java
        NotEqualNumberFilter.java
        NotEqualStringFilter.java
        StartWithFilter.java
        generate
        GenerateKeyDriver.java
        GenerateSequenceMapper.java
        GenerateType.java
        grep
        GrepColumnMapper.java
        GrepDriver.java
        GrepRowMapper.java
        RegEx.java
        groupby
        GroupByDriver.java
        GroupByMapper.java
        GroupByReducer.java
        linecount
        LineCountMapper.java
        rank
        RankDriver.java
        RankMapper.java
        RankReducer.java
        replace
        column
        DefaultReplacer.java
        ReplaceColumnDriver.java
        ReplaceColumnMapper.java
        Replacer.java
        delimiter
        ReplaceDelimiterDriver.java
        ReplaceDelimiterMapper.java
        sort
        OrderBuilder.java
        SortDriver.java
        statics
        StaticsDriver.java
        StaticsMapper.java
        StaticsReducer.java
        SumMapper.java
        SumReducer.java
        statics
        AverageModule.java
        DataTypeRegistry.java
        DeviationModule.java
        MaxModule.java
        MinMaxValueSelector.java
        MinModule.java
        StandardDeviationModule.java
        Statics.java
        StaticsModule.java
        StaticsModuleRegistry.java
        SumModule.java
        VariantModule.java
        parser
        CsvRowParser.java
        type
        DataType.java
        TextArrayWritable.java
        util
        ArrayUtils.java
        ComparisonUtils.java
        CounterUtils.java
        DateUtils.java
        EmptyIterable.java
        ExpressionUtils.java
        FileSelector.java
        FileSelectorChain.java
        FileSizeChecker.java
        FileUtils.java
        HadoopMetrics.java
        HdfsUtils.java
        MapReduceUtils.java
        MathUtils.java
        MemoryUtils.java
        ReflectionUtils.java
        StartEndWithFileSelector.java
        StringUtils.java
        TimestampUtils.java
        WritableUtils.java
  - test
    - java
      - org
        openflamingo
        mapreduce
        aggregator
        LongSumAggregatorTest.java
        common
        AccountingExpressionFileReadTest.java
        etl
        HdfsInputsParseTestDriver.java
        accounting
        AccountingMapTest.java
        ExpressionHandleTest.java
        aggregate
        AggregateMapperTest.java
        clean
        CleanMapperTest.java
        filter
        FilterMapperTest.java
        generate
        GenerateSequenceMapperTest.java
        GenerateTypeTest.java
        grep
        GrepColumnMapperTest.java
        GrepRowMapperTest.java
        groupby
        GroupByMapReduceTest.java
        mvel
        MVELTest.java
        rank
        RankMapReduceTest.java
        replace
        ReplaceColumnMapperTest.java
        ReplaceDelimiterMapperTest.java
        statics
        StaticsMapperReducerTest.java
        SumMapperTest.java
        SumReducerTest.java
        parser
        CsvRowParserTest.java
        util
        HdfsUtilsTest.java
        MathUtilsTest.java
        StringUtilsTest.java
    - resources
      - org
        openflamingo
        mapreduce
        etl
        sort
        OrderBuilderTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.openflamingo.mapreduce.util;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
import org.apache.hadoop.mapreduce.InputSplit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * HDFS Utility.
 *
 * @author Edward KIM
 * @author Seo Ji Hye
 * @since 0.1
 */
public class HdfsUtils {

	/**
	 * SLF4J Logging
	 */
	private static final Logger logger = LoggerFactory.getLogger(HdfsUtils.class);

	/**
	 * Value written to the <tt>hadoop.job.ugi</tt> configuration property by helpers in this class.
	 */
	public static final String DEFAULT_UGI = "hadoop,hadoop";

	/**
	 * Scheme prefix of an HDFS URL.
	 */
	public static final String HDFS_URL_PREFIX = "hdfs://";

	/**
	 * Creates a DFS client for the NameNode addressed by an HDFS URL.
	 * <p>
	 * The URL must have the form <tt>hdfs://host:port</tt>. Anything after the first
	 * <tt>/</tt> that follows the authority (a trailing slash or a path) is ignored.
	 *
	 * @param hdfsUrl HDFS URL
	 * @return DFS Client
	 * @throws java.io.IOException      if the DFS client cannot be created
	 * @throws IllegalArgumentException if the URL is <tt>null</tt>, does not start with
	 *                                  <tt>hdfs://</tt>, or does not contain a host and a numeric port
	 */
	public static DFSClient createDFSClient(String hdfsUrl) throws IOException {
		String invalidUrlMessage = "HDFS URL이 잘못되었습니다. 요청한 HDFS URL [" + hdfsUrl + "]";
		if (hdfsUrl == null || !hdfsUrl.startsWith(HDFS_URL_PREFIX)) {
			throw new IllegalArgumentException(invalidUrlMessage);
		}

		// Keep only "host:port"; a trailing "/" or path would otherwise end up in the port text.
		String authority = hdfsUrl.substring(HDFS_URL_PREFIX.length());
		int pathStart = authority.indexOf('/');
		if (pathStart >= 0) {
			authority = authority.substring(0, pathStart);
		}

		String[] parts = authority.split(":");
		if (parts.length < 2 || parts[0].length() == 0) {
			// Without this check a missing port surfaced as ArrayIndexOutOfBoundsException.
			throw new IllegalArgumentException(invalidUrlMessage);
		}

		int port;
		try {
			port = Integer.parseInt(parts[1]);
		} catch (NumberFormatException ex) {
			throw new IllegalArgumentException(invalidUrlMessage, ex);
		}
		return createDFSClient(parts[0], port);
	}

	/**
	 * Creates a DFS client connected to the given NameNode address, using a
	 * freshly constructed {@link Configuration}.
	 *
	 * @param namenodeIp   Namenode IP
	 * @param namenodePort Namenode Port
	 * @return DFS Client
	 * @throws java.io.IOException if the DFS client cannot be created
	 */
	public static DFSClient createDFSClient(String namenodeIp, int namenodePort) throws IOException {
		Configuration hadoopConf = new Configuration();
		return new DFSClient(new InetSocketAddress(namenodeIp, namenodePort), hadoopConf);
	}

	/**
	 * Deletes the given path if it exists.
	 *
	 * @param client    DFS Client
	 * @param path      path to delete
	 * @param recursive whether to delete recursively
	 * @return the result of the delete call, or <tt>false</tt> when the path does not exist
	 * @throws java.io.IOException if the path cannot be deleted
	 */
	public static boolean remove(DFSClient client, String path, boolean recursive) throws IOException {
		// Nothing to delete: report it and bail out early.
		if (!client.exists(path)) {
			logger.info("요청한 [{}] 파일이 존재하지 않습니다.", path);
			return false;
		}

		logger.info("요청한 [{}] 파일이 존재하므로 삭제합니다. Recursive 여부 [{}]", path, recursive);
		boolean deleted = client.delete(path, recursive);
		return deleted;
	}

	/**
	 * Returns the list of files under the given path.
	 * <p>
	 * NOTE(review): the listing logic below is commented out, so this method currently
	 * always returns an empty list and never touches <tt>client</tt> or <tt>path</tt>.
	 *
	 * @param client DFS Client
	 * @param path   path to list
	 * @return list of file paths (currently always empty)
	 * @throws java.io.IOException declared for HDFS IO failures; not thrown by the current body
	 */
	public static List<String> list(DFSClient client, String path) throws IOException {
		List<String> list = new ArrayList<String>();
		// Disabled implementation: it added "path/name" for every non-directory entry
		// returned by client.listPaths(path). TODO confirm why it was disabled and restore it.
/*
        FileStatus[] statuses = client.listPaths(path);
        if (statuses != null) {
            for (FileStatus status : statuses) {
                if (!status.isDir()) {
                    String fullyQualifiedHDFSFilename = path + "/" + status.getPath().getName();
                    list.add(fullyQualifiedHDFSFilename);
                }
            }
        }
*/
		return list;
	}

	/**
	 * Opens an output stream to an HDFS file through the DFS client.
	 *
	 * @param client    DFS Client
	 * @param filename  file name
	 * @param overwrite whether an existing file may be overwritten
	 * @return output stream
	 * @throws java.io.IOException if HDFS IO cannot be performed
	 */
	public static OutputStream getOutputStream(DFSClient client, String filename, boolean overwrite) throws IOException {
		OutputStream created = client.create(filename, overwrite);
		return created;
	}

	/**
	 * Opens an input stream on an HDFS file through the DFS client.
	 *
	 * @param client   DFS Client
	 * @param filename file path
	 * @return input stream
	 * @throws java.io.IOException if HDFS IO cannot be performed
	 */
	public static InputStream getInputStream(DFSClient client, String filename) throws IOException {
		InputStream opened = client.open(filename);
		return opened;
	}

	/**
	 * Closes the output stream. A <tt>null</tt> stream is ignored.
	 *
	 * @param outputStream output stream, may be <tt>null</tt>
	 * @throws java.io.IOException if the output stream cannot be closed
	 */
	public static void closeOuputStream(OutputStream outputStream) throws IOException {
		// Tolerate null so callers can close unconditionally from a finally block.
		if (outputStream != null) {
			outputStream.close();
		}
	}

	/**
	 * Closes the input stream. A <tt>null</tt> stream is ignored.
	 *
	 * @param inputStream input stream, may be <tt>null</tt>
	 * @throws java.io.IOException if the input stream cannot be closed
	 */
	public static void closeInputStream(InputStream inputStream) throws IOException {
		// Tolerate null so callers can close unconditionally from a finally block.
		if (inputStream != null) {
			inputStream.close();
		}
	}

	/**
	 * Returns the file name of an Input Split.
	 * <p>
	 * The string form of the split is expected to look like
	 * <tt>file + ":" + start + "+" + length</tt>; everything from the first <tt>:</tt>
	 * of the extracted name onward is dropped. If the name contains no <tt>:</tt>
	 * it is returned unchanged.
	 *
	 * @param inputSplit Input Split
	 * @return file name
	 */
	public static String getFilename(InputSplit inputSplit) {
		// NOTE(review): FileUtils.getFilename presumably strips the directory part - confirm.
		String filename = org.openflamingo.mapreduce.util.FileUtils.getFilename(inputSplit.toString());
		int separator = filename.indexOf(':');
		// Without a separator substring(0, -1) would throw StringIndexOutOfBoundsException.
		return separator < 0 ? filename : filename.substring(0, separator);
	}

	/**
	 * Obtains a {@link FileSystem} from a new {@link Configuration} whose
	 * <tt>fs.default.name</tt> property is set to the given URL.
	 *
	 * @param hdfsUrl HDFS URL
	 * @return the file system returned by {@link FileSystem#get(Configuration)}
	 * @throws java.io.IOException if the file system cannot be obtained
	 */
	public static FileSystem getFileSystem(String hdfsUrl) throws IOException {
		Configuration hdfsConf = new Configuration();
		hdfsConf.set("fs.default.name", hdfsUrl);
		return FileSystem.get(hdfsConf);
	}

	/**
	 * Checks whether the given path exists.
	 *
	 * @param client DFS Client
	 * @param path   path to check
	 * @return <tt>true</tt> if the path exists
	 * @throws java.io.IOException if HDFS IO cannot be performed
	 */
	public static boolean exists(DFSClient client, String path) throws IOException {
		boolean present = client.exists(path);
		return present;
	}

	/**
	 * Checks whether the given path is a file.
	 *
	 * @param client DFS Client
	 * @param path   path to check
	 * @return <tt>true</tt> if the path exists and is not a directory;
	 *         <tt>false</tt> if it is a directory or no file information is available
	 * @throws java.io.IOException if HDFS IO cannot be performed
	 */
	public static boolean isFile(DFSClient client, String path) throws IOException {
		HdfsFileStatus status = client.getFileInfo(path);
		// getFileInfo yields null for a missing path; treat that as "not a file" instead of an NPE.
		return status != null && !status.isDir();
	}

	/**
	 * Checks whether the given path is a directory.
	 *
	 * @param fs   {@link org.apache.hadoop.fs.FileSystem}
	 * @param path path to check
	 * @return <tt>true</tt> if the path is a directory; <tt>false</tt> if it is not,
	 *         or if the status lookup reports {@link FileNotFoundException}
	 * @throws java.io.IOException if HDFS IO cannot be performed
	 */
	public static boolean isDirectory(FileSystem fs, String path) throws IOException {
		FileStatus found;
		try {
			found = fs.getFileStatus(new Path(path));
		} catch (FileNotFoundException missing) {
			// A missing path is simply "not a directory".
			return false;
		}
		return found.isDir();
	}

	/**
	 * Saves a string to the given file, overwriting any existing file.
	 * <p>
	 * NOTE(review): the content is encoded with the platform default charset
	 * ({@link String#getBytes()}); confirm whether a fixed charset is intended.
	 *
	 * @param client  DFS Client
	 * @param path    absolute path of the file to write
	 * @param content string content to write
	 * @throws java.io.IOException if HDFS IO cannot be performed
	 */
	public static void saveFile(DFSClient client, String path, String content) throws IOException {
		OutputStream outputStream = getOutputStream(client, path, true);
		try {
			org.openflamingo.mapreduce.util.FileUtils.copy(content.getBytes(), outputStream);
		} finally {
			// Close even when the copy fails so the HDFS stream is not leaked.
			outputStream.close();
		}
	}

	/**
	 * Retrieves the file information of the given path from the DFS client.
	 *
	 * @param client DFS Client
	 * @param path   path whose information is requested
	 * @return file information as returned by the client
	 * @throws java.io.IOException if HDFS IO cannot be performed
	 */
	public static HdfsFileStatus getFileInfo(DFSClient client, String path) throws IOException {
		HdfsFileStatus info = client.getFileInfo(path);
		return info;
	}

	/**
	 * Uploads a file from the local file system to HDFS.
	 * <p>
	 * The destination is <tt>hdfsPath + "/" + filename</tt>. The DFS client created
	 * for the upload is always closed, even when the copy fails.
	 *
	 * @param hdfsUrl          HDFS URL
	 * @param filename         file name to store under the HDFS path
	 * @param hdfsPath         HDFS path
	 * @param downloadFilePath path of the (downloaded) file on the local file system
	 * @throws java.io.IOException if the HDFS operation fails
	 */
	public static void uploadToHdfs(String hdfsUrl, String filename, String hdfsPath, String downloadFilePath) throws IOException {
		String hdfsFullPath = hdfsPath + "/" + filename;
		File inputFile = new File(downloadFilePath);
		DFSClient dfsClient = HdfsUtils.createDFSClient(hdfsUrl);
		try {
			copyFromLocalFileToHdfsFile(inputFile, dfsClient, hdfsFullPath);
		} finally {
			// Release the client even if the copy throws.
			dfsClient.close();
		}
	}

	/**
	 * Copies a file from the local file system to HDFS, overwriting the target.
	 * <p>
	 * The local file is opened before the HDFS file is created, so a missing or
	 * unreadable local file no longer creates or truncates the HDFS target. Both
	 * streams are closed here whether or not the copy succeeds.
	 *
	 * @param inputFile input file on the local file system
	 * @param client    DFSClient
	 * @param hdfsPath  output file path on HDFS
	 * @throws java.io.IOException if the file cannot be copied
	 */
	public static void copyFromLocalFileToHdfsFile(File inputFile, DFSClient client, String hdfsPath) throws IOException {
		InputStream inputStream = new FileInputStream(inputFile);
		try {
			OutputStream outputStream = HdfsUtils.getOutputStream(client, hdfsPath, true);
			try {
				// NOTE(review): FileUtils.copy may already close the streams; closing again below is intended to be harmless.
				org.openflamingo.mapreduce.util.FileUtils.copy(inputStream, outputStream);
			} finally {
				outputStream.close();
			}
		} finally {
			inputStream.close();
		}
	}

	/**
	 * Moves every entry of one HDFS directory into another directory.
	 * <p>
	 * The target directory is created when it does not exist, but only if the
	 * source contains at least one entry. A failed rename is logged as a warning
	 * and the remaining entries are still processed.
	 *
	 * @param hdfsUrl         HDFS URL
	 * @param sourceDirectory source directory
	 * @param targetDirectory target directory
	 * @throws java.io.IOException if the files cannot be moved
	 */
	public static void moveFilesToDirectory(String hdfsUrl, String sourceDirectory, String targetDirectory) throws IOException {
		Configuration conf = new Configuration();
		conf.set("fs.default.name", hdfsUrl);
		conf.set("hadoop.job.ugi", DEFAULT_UGI);
		FileSystem fileSystem = FileSystem.get(conf);

		FileStatus[] statuses = fileSystem.listStatus(new Path(sourceDirectory));
		// Guard against a null listing (presumably returned by some Hadoop versions for a missing path).
		if (statuses == null || statuses.length == 0) {
			return;
		}

		// The target check does not depend on the entry being moved, so do it once up front.
		Path target = new Path(targetDirectory);
		if (!isDirectory(fileSystem, targetDirectory)) {
			logger.info("HDFS에 [{}] 디렉토리가 존재하지 않아서 생성합니다.", targetDirectory);
			fileSystem.mkdirs(target);
		}

		for (FileStatus fileStatus : statuses) {
			// rename reports failure through its return value; do not log success blindly.
			if (fileSystem.rename(fileStatus.getPath(), target)) {
				logger.info("HDFS의 파일 [{}]을 [{}] 디렉토리로 이동했습니다.", fileStatus.getPath(), targetDirectory);
			} else {
				logger.warn("Failed to move HDFS file [{}] to directory [{}].", fileStatus.getPath(), targetDirectory);
			}
		}
	}

	/**
	 * Creates the directory on HDFS unless it already exists as a directory.
	 *
	 * @param directory directory to create
	 * @param hdfsUrl   HDFS URL
	 * @throws java.io.IOException if the HDFS operation fails
	 */
	public static void makeDirectoryIfNotExists(String directory, String hdfsUrl) throws IOException {
		Configuration hdfsConf = new Configuration();
		hdfsConf.set("fs.default.name", hdfsUrl);
		hdfsConf.set("hadoop.job.ugi", DEFAULT_UGI);
		FileSystem hdfs = FileSystem.get(hdfsConf);

		// Already a directory: nothing to do.
		if (isDirectory(hdfs, directory)) {
			return;
		}

		logger.info("HDFS에 [{}] 디렉토리가 존재하지 않아서 생성합니다.", directory);
		Path toCreate = new Path(directory);
		hdfs.mkdirs(toCreate);
	}

	/**
	 * Returns all files found in the given HDFS directories.
	 * <p>
	 * Each directory is listed through {@link #list(DFSClient, String)} and the
	 * results are concatenated in iteration order. The DFS client created for the
	 * lookup is always closed, even when listing fails.
	 *
	 * @param hdfsUrl         HDFS URL
	 * @param hdfsDirectories list of HDFS directories
	 * @return all files contained in the given HDFS directories
	 * @throws java.io.IOException if HDFS cannot be accessed or the files cannot be listed
	 */
	public static String[] getHdfsFiles(String hdfsUrl, List<String> hdfsDirectories) throws IOException {
		List<String> filesInDirectories = new ArrayList<String>();
		DFSClient client = HdfsUtils.createDFSClient(hdfsUrl);
		try {
			for (String hdfsDirectory : hdfsDirectories) {
				filesInDirectories.addAll(HdfsUtils.list(client, hdfsDirectory));
			}
		} finally {
			// Release the client even if a listing throws.
			client.close();
		}
		return StringUtils.toStringArray(filesInDirectories);
	}

	/**
	 * Returns the files with the given extension under an HDFS path.
	 * <p>
	 * NOTE(review): the listing logic is disabled, so this method currently only
	 * ensures that <tt>path</tt> exists as a directory and returns an empty array.
	 * The disabled code listed <tt>path</tt> via <tt>client.listPaths(path)</tt> and
	 * collected the names of non-directory entries ending with <tt>ext</tt>.
	 * TODO confirm why it was disabled and restore it.
	 *
	 * @param hdfsUrl HDFS URL
	 * @param ext     extension (e.g. <tt>.dat</tt>)
	 * @param path    path to search
	 * @return names of files with the given extension (currently always empty)
	 * @throws java.io.IOException if the HDFS operation fails
	 */
	public static String[] getHdfsFiles(String hdfsUrl, String ext, String path) throws IOException {
		List<String> files = new ArrayList<String>();
		DFSClient client = HdfsUtils.createDFSClient(hdfsUrl);
		try {
			makeDirectoryIfNotExists(path, hdfsUrl);
		} finally {
			// Release the client even if the directory cannot be created.
			client.close();
		}
		return StringUtils.toStringArray(files);
	}

	/**
	 * Checks whether a file exists at the given path.
	 * <p>
	 * Directories do not count: the result is <tt>true</tt> only when file
	 * information is available and it does not describe a directory. The DFS
	 * client created for the check is always closed.
	 *
	 * @param hdfsUrl HDFS URL
	 * @param path    absolute path to check
	 * @return <tt>true</tt> if a file exists at the path
	 * @throws java.io.IOException if existence cannot be determined or HDFS cannot be accessed
	 */
	public static boolean isExist(String hdfsUrl, String path) throws IOException {
		DFSClient client = HdfsUtils.createDFSClient(hdfsUrl);
		try {
			HdfsFileStatus status = client.getFileInfo(path);
			boolean fileExists = status != null && !status.isDir();
			if (fileExists) {
				logger.info("파일 [{}{}]이 HDFS에 존재합니다.", hdfsUrl, path);
			} else {
				logger.info("파일 [{}{}]이 HDFS이 존재하지 않습니다.", hdfsUrl, path);
			}
			return fileExists;
		} finally {
			// Single close path; also covers the case where getFileInfo throws.
			client.close();
		}
	}

	/**
	 * Recursively deletes every HDFS path matched by the given path or glob pattern.
	 * <p>
	 * Does nothing when the pattern matches no path.
	 *
	 * @param hdfsUrl       HDFS URL
	 * @param hdfsDirectory HDFS path or glob pattern to delete
	 * @throws java.io.IOException if the paths cannot be deleted
	 */
	public static void deleteFromHdfs(String hdfsUrl, String hdfsDirectory) throws IOException {
		Configuration conf = new Configuration();
		conf.set("fs.default.name", hdfsUrl);
		FileSystem fs = FileSystem.get(conf);

		FileStatus[] statuses = fs.globStatus(new Path(hdfsDirectory));
		// globStatus returns null for a non-glob path that does not exist.
		if (statuses == null) {
			return;
		}
		for (FileStatus fileStatus : statuses) {
			fs.delete(fileStatus.getPath(), true);
		}
	}

	/**
	 * Merges all files under an HDFS directory into a single file at the same path.
	 * <p>
	 * The files are merged into <tt>path + "_temporary"</tt>, the source directory
	 * is deleted, and the temporary file is renamed to <tt>path</tt>. If
	 * <tt>path</tt> is already a file, nothing is done. Each step is checked: a
	 * failure raises an {@link IOException} and leaves the data already written in
	 * place, so a failed rename no longer discards the merged file.
	 *
	 * @param hdfsUrl HDFS URL
	 * @param path    HDFS Path
	 * @throws java.io.IOException if the merge, the delete, or the rename fails
	 */
	public static void merge(String hdfsUrl, String path) throws IOException {
		Configuration conf = new Configuration();
		conf.set("fs.default.name", hdfsUrl);
		FileSystem fileSystem = FileSystem.get(conf);

		Path source = new Path(path);
		if (!fileSystem.getFileStatus(source).isDir()) {
			// Already a single file; there is nothing to merge.
			return;
		}

		// Merge into a temporary file first; keep the source until the merge has succeeded.
		Path temporary = new Path(path + "_temporary");
		if (!FileUtil.copyMerge(fileSystem, source, fileSystem, temporary, false, conf, null)) {
			throw new IOException("Failed to merge files under [" + source + "] into [" + temporary + "]");
		}

		// Remove the original directory so that the merged file can take its name.
		if (!fileSystem.delete(source, true)) {
			throw new IOException("Failed to delete [" + source + "]; merged data remains at [" + temporary + "]");
		}

		// Replace the original path with the merged file. On failure the temporary file is kept.
		if (!fileSystem.rename(temporary, source)) {
			throw new IOException("Failed to rename [" + temporary + "] to [" + source + "]; merged data remains at [" + temporary + "]");
		}
	}
}