/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.metastore;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.concurrent.BasicThreadFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.Trash;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Change manager for replication: instead of deleting table/partition data
 * outright, managed-table files are "recycled" into a checksum-addressed
 * staging directory ({@code cmroot}) so a replication consumer can still
 * retrieve the original file content after it has been dropped or overwritten.
 * A background {@link CMClearer} task removes recycled files once they exceed
 * the configured retention period.
 */
public class ReplChangeManager {
  private static final Logger LOG = LoggerFactory.getLogger(ReplChangeManager.class);

  // Lazily-created process-wide singleton; guarded by synchronized getInstance().
  private static ReplChangeManager instance;

  // One-shot static initialization flag; set by the first constructor call.
  private static boolean inited = false;
  // True only when hive.repl.cmrootdir-based change management is enabled in config.
  private static boolean enabled = false;
  // Root directory holding recycled files, named by their checksum.
  private static Path cmroot;
  private static HiveConf hiveConf;
  // Identity the metastore runs as; recycled files are chown'ed to this user/group.
  private String msUser;
  private String msGroup;
  private FileSystem fs;

  // xattr recording the last known original location of a recycled file.
  public static final String ORIG_LOC_TAG = "user.original-loc";
  // xattr marking a recycled file that should go to Trash (not be purged) on expiry.
  public static final String REMAIN_IN_TRASH_TAG = "user.remain-in-trash";
  public static final String URI_FRAGMENT_SEPARATOR = "#";

  /**
   * Returns the process-wide ReplChangeManager, creating it on first use.
   *
   * <p>Synchronized because the original unsynchronized check-then-act on the
   * static {@code instance} field was racy: two concurrent first callers could
   * each construct an instance and interleave the static initialization in the
   * constructor.
   *
   * @param hiveConf configuration used only for first-time initialization
   * @return the singleton instance
   * @throws MetaException if cmroot cannot be created or accessed
   */
  public static synchronized ReplChangeManager getInstance(HiveConf hiveConf) throws MetaException {
    if (instance == null) {
      instance = new ReplChangeManager(hiveConf);
    }
    return instance;
  }

  /**
   * Performs one-time static initialization: resolves cmroot from config,
   * creates it with restrictive permissions if absent, and records the
   * metastore user/group identity. No-op after the first successful call.
   *
   * @throws MetaException wrapping any IOException from the filesystem
   */
  ReplChangeManager(HiveConf hiveConf) throws MetaException {
    try {
      if (!inited) {
        if (hiveConf.getBoolVar(HiveConf.ConfVars.REPLCMENABLED)) {
          ReplChangeManager.enabled = true;
          ReplChangeManager.cmroot = new Path(hiveConf.get(HiveConf.ConfVars.REPLCMDIR.varname));
          ReplChangeManager.hiveConf = hiveConf;

          fs = cmroot.getFileSystem(hiveConf);
          // Create cmroot with permission 700 if not exist: only the metastore
          // user may read the recycled data.
          if (!fs.exists(cmroot)) {
            fs.mkdirs(cmroot);
            fs.setPermission(cmroot, new FsPermission("700"));
          }
          UserGroupInformation usergroupInfo = UserGroupInformation.getCurrentUser();
          msUser = usergroupInfo.getShortUserName();
          msGroup = usergroupInfo.getPrimaryGroupName();
        }
        inited = true;
      }
    } catch (IOException e) {
      throw new MetaException(StringUtils.stringifyException(e));
    }
  }

  // Filter files starts with ".". Note Hadoop consider files starts with
  // "." or "_" as hidden file. However, we need to replicate files starts
  // with "_". We find at least 2 use cases:
  // 1. For har files, _index and _masterindex is required files
  // 2. _success file is required for Oozie to indicate availability of data source
  private static final PathFilter hiddenFileFilter = new PathFilter() {
    @Override
    public boolean accept(Path p) {
      return !p.getName().startsWith(".");
    }
  };

  /***
   * Move a path into cmroot. If the path is a directory (of a partition, or table if nonpartitioned),
   * recursively move files inside directory to cmroot. Note the table must be managed table
   * @param path a single file or directory
   * @param ifPurge if the file should skip Trash when delete
   * @return number of files actually moved into cmroot (0 when CM is disabled,
   *         or when every file already had an identical-content copy there)
   * @throws MetaException wrapping any IOException from the filesystem
   */
  public int recycle(Path path, boolean ifPurge) throws MetaException {
    if (!enabled) {
      return 0;
    }
    try {
      int count = 0;
      if (fs.isDirectory(path)) {
        FileStatus[] files = fs.listStatus(path, hiddenFileFilter);
        for (FileStatus file : files) {
          count += recycle(file.getPath(), ifPurge);
        }
      } else {
        Path cmPath = getCMPath(path, hiveConf, getChksumString(path, fs));
        LOG.debug("Moving {} to {}", path, cmPath);

        // set timestamp before moving to cmroot, so we can
        // avoid race condition CM remove the file before setting
        // timestamp
        long now = System.currentTimeMillis();
        fs.setTimes(path, now, now);

        boolean succ = fs.rename(path, cmPath);
        // Ignore if a file with same content already exist in cmroot
        // We might want to setXAttr for the new location in the future
        if (!succ) {
          LOG.debug("A file with the same content of {} already exists, ignore", path);
          // Need to extend the tenancy if we saw a newer file with the same content
          fs.setTimes(cmPath, now, now);
        } else {
          // set the file owner to hive (or the id metastore run as)
          fs.setOwner(cmPath, msUser, msGroup);

          // tag the original file name so we know where the file comes from
          // Note we currently only track the last known trace as
          // xattr has limited capacity. We shall revisit and store all original
          // locations if orig-loc becomes important
          try {
            // Explicit charset: the original getBytes() used the platform default.
            fs.setXAttr(cmPath, ORIG_LOC_TAG, path.toString().getBytes(StandardCharsets.UTF_8));
          } catch (UnsupportedOperationException e) {
            LOG.warn("Error setting xattr for {}", path);
          }
          count++;
        }

        // Tag if we want to remain in trash after deletion.
        // If multiple files share the same content, then
        // any file claim remain in trash would be granted
        if (!ifPurge) {
          try {
            fs.setXAttr(cmPath, REMAIN_IN_TRASH_TAG, new byte[]{0});
          } catch (UnsupportedOperationException e) {
            LOG.warn("Error setting xattr for {}", cmPath);
          }
        }
      }
      return count;
    } catch (IOException e) {
      throw new MetaException(StringUtils.stringifyException(e));
    }
  }

  /**
   * Get checksum of a file.
   *
   * @param path file to checksum
   * @param fs filesystem the file lives on
   * @return hex string of the filesystem checksum, or null when the
   *         filesystem does not provide one
   * @throws IOException on filesystem errors
   */
  public static String getChksumString(Path path, FileSystem fs) throws IOException {
    // TODO: fs checksum only available on hdfs, need to
    // find a solution for other fs (eg, local fs, s3, etc)
    String checksumString = null;
    FileChecksum checksum = fs.getFileChecksum(path);
    if (checksum != null) {
      checksumString = StringUtils.byteToHexString(
          checksum.getBytes(), 0, checksum.getLength());
    }
    return checksumString;
  }

  /** Overrides the cmroot location; visible for testing/configuration. */
  public static void setCmRoot(Path cmRoot) {
    ReplChangeManager.cmroot = cmRoot;
  }

  /***
   * Convert a path of file inside a partition or table (if non-partitioned)
   * to a deterministic location of cmroot. So user can retrieve the file back
   * with the original location plus checksum.
   * @param path original path inside partition or table
   * @param conf configuration, consulted for the max filename component length
   * @param chksum checksum of the file, can be retrieved by
   *        {@link #getChksumString(Path, FileSystem)}
   * @return path under cmroot whose name is the (possibly truncated) checksum
   * @throws IOException declared for callers; not thrown by the current body
   * @throws MetaException declared for callers; not thrown by the current body
   */
  public static Path getCMPath(Path path, Configuration conf, String chksum)
      throws IOException, MetaException {
    // NOTE(review): throws NPE if chksum is null (possible on non-HDFS
    // filesystems per getChksumString) — callers must pass a non-null checksum.
    String newFileName = chksum;
    int maxLength = conf.getInt(DFSConfigKeys.DFS_NAMENODE_MAX_COMPONENT_LENGTH_KEY,
        DFSConfigKeys.DFS_NAMENODE_MAX_COMPONENT_LENGTH_DEFAULT);

    if (newFileName.length() > maxLength) {
      newFileName = newFileName.substring(0, maxLength - 1);
    }

    return new Path(cmroot, newFileName);
  }

  /***
   * Get original file specified by src and chksumString. If the file exists and checksum
   * matches, return the file; otherwise, use chksumString to retrieve it from cmroot
   * @param src Original file location
   * @param chksumString Checksum of the original file
   * @param conf configuration used to resolve the filesystem
   * @return Corresponding FileStatus object
   * @throws MetaException wrapping any IOException from the filesystem
   */
  public static FileStatus getFileStatus(Path src, String chksumString,
      HiveConf conf) throws MetaException {
    try {
      FileSystem srcFs = src.getFileSystem(conf);
      if (chksumString == null) {
        return srcFs.getFileStatus(src);
      }

      if (!srcFs.exists(src)) {
        return srcFs.getFileStatus(getCMPath(src, conf, chksumString));
      }

      String currentChksumString = getChksumString(src, srcFs);
      // A null current checksum means the fs cannot compute one; fall back to
      // trusting the file at its original location.
      if (currentChksumString == null || chksumString.equals(currentChksumString)) {
        return srcFs.getFileStatus(src);
      } else {
        // Original has been rewritten with different content; fetch from cmroot.
        return srcFs.getFileStatus(getCMPath(src, conf, chksumString));
      }
    } catch (IOException e) {
      throw new MetaException(StringUtils.stringifyException(e));
    }
  }

  /***
   * Concatenate filename and checksum with "#"
   * @param fileUriStr Filename string
   * @param fileChecksum Checksum string
   * @return Concatenated Uri string
   */
  // TODO: this needs to be enhanced once change management based filesystem is implemented
  // Currently using fileuri#checksum as the format
  public static String encodeFileUri(String fileUriStr, String fileChecksum) {
    if (fileChecksum != null) {
      return fileUriStr + URI_FRAGMENT_SEPARATOR + fileChecksum;
    } else {
      return fileUriStr;
    }
  }

  /***
   * Split uri with fragment into file uri and checksum
   * @param fileURIStr uri with fragment
   * @return two-element array: [0] the file uri, [1] the checksum or null
   *         when the input carried no fragment
   */
  public static String[] getFileWithChksumFromURI(String fileURIStr) {
    String[] uriAndFragment = fileURIStr.split(URI_FRAGMENT_SEPARATOR);
    String[] result = new String[2];
    result[0] = uriAndFragment[0];
    if (uriAndFragment.length > 1) {
      result[1] = uriAndFragment[1];
    }
    return result;
  }

  /**
   * Thread to clear old files of cmroot recursively
   */
  static class CMClearer implements Runnable {
    private Path cmroot;
    // Retention period in seconds; files older than this are removed.
    private long secRetain;
    private HiveConf hiveConf;

    CMClearer(String cmrootString, long secRetain, HiveConf hiveConf) {
      this.cmroot = new Path(cmrootString);
      this.secRetain = secRetain;
      this.hiveConf = hiveConf;
    }

    @Override
    public void run() {
      try {
        LOG.info("CMClearer started");

        long now = System.currentTimeMillis();
        FileSystem fs = cmroot.getFileSystem(hiveConf);
        FileStatus[] files = fs.listStatus(cmroot);

        for (FileStatus file : files) {
          long modifiedTime = file.getModificationTime();
          if (now - modifiedTime > secRetain * 1000) {
            try {
              // Files tagged remain-in-trash go to Trash; all others are
              // deleted permanently.
              if (fs.getXAttrs(file.getPath()).containsKey(REMAIN_IN_TRASH_TAG)) {
                boolean succ = Trash.moveToAppropriateTrash(fs, file.getPath(), hiveConf);
                if (succ) {
                  LOG.debug("Move {} to trash", file);
                } else {
                  LOG.warn("Fail to move {} to trash", file);
                }
              } else {
                boolean succ = fs.delete(file.getPath(), false);
                if (succ) {
                  LOG.debug("Remove {}", file);
                } else {
                  LOG.warn("Fail to remove {}", file);
                }
              }
            } catch (UnsupportedOperationException e) {
              LOG.warn("Error getting xattr for {}", file.getPath());
            }
          }
        }
      } catch (IOException e) {
        LOG.error("Exception when clearing cmroot:" + StringUtils.stringifyException(e));
      }
    }
  }

  /**
   * Schedule CMClearer thread. Will be invoked by metastore.
   * No-op when change management is disabled. The executor uses a single
   * daemon thread so it never blocks JVM shutdown.
   */
  public static void scheduleCMClearer(HiveConf hiveConf) {
    if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.REPLCMENABLED)) {
      ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor(
          new BasicThreadFactory.Builder()
              .namingPattern("cmclearer-%d")
              .daemon(true)
              .build());
      executor.scheduleAtFixedRate(new CMClearer(hiveConf.get(HiveConf.ConfVars.REPLCMDIR.varname),
          hiveConf.getTimeVar(ConfVars.REPLCMRETIAN, TimeUnit.SECONDS), hiveConf),
          0, hiveConf.getTimeVar(ConfVars.REPLCMINTERVAL, TimeUnit.SECONDS), TimeUnit.SECONDS);
    }
  }
}