/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.plan;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.plan.Explain.Level;
/**
 * Marker work for Replication - behaves similarly to CopyWork, but maps to ReplCopyTask,
 * which will have mechanics to list the files in source to write to the destination,
 * instead of copying them, if specified, falling back to copying if needed.
 */
@Explain(displayName = "Copy for Replication", explainLevels = { Level.USER, Level.DEFAULT, Level.EXTENDED })
public class ReplCopyWork extends CopyWork {

  /** Exact name of the metadata file that is always copied and never merely listed. */
  private static final String METADATA_FILE_NAME = "_metadata";

  /**
   * Governs copy-or-list-files behaviour.
   * If set to true, behaves identically to a CopyWork.
   * If set to false, ReplCopyTask does a file-list of the things to be copied instead,
   * and puts them in a file called _files.
   * Default is set to mimic CopyTask, with the intent that any Replication code will
   * explicitly flip this.
   *
   * NOTE(review): nothing in this class reads or writes this field after initialisation;
   * it is kept because it is protected and may be used elsewhere.
   */
  protected boolean copyFiles = true;

  /**
   * TODO : Refactor
   *
   * There is an upcoming patch that refactors this bit of code. Currently, the idea is the following:
   *
   * By default, ReplCopyWork will behave similarly to CopyWork, and simply copy
   * along data from the source to destination. If, however, listFilesOnOutput is set,
   * then, instead of copying the individual files to the destination, it simply creates
   * a file called _files on destination that contains the list of the original files
   * that were intended to be copied. Thus, we do not actually copy the files at CopyWork
   * time.
   *
   * The flip side of this behaviour happens when, instead, readListFromInput is set. This
   * flag, if set, changes the source behaviour of this CopyTask, and instead of copying
   * explicit files, this will then fall back to a behaviour wherein an _files is read from
   * the source, and the files specified by the _files are then copied to the destination.
   *
   * This allows us a lazy-copy-on-source and a pull-from destination semantic that we want
   * to use from replication.
   *
   * ==
   *
   * The refactor intent, however, is to simplify this, so that we have only 1 flag that we set,
   * called isLazy. If isLazy is set, then this is the equivalent of the current listFilesOnOutput,
   * and will generate a _files file.
   *
   * As to the input, we simply decide on whether to use the lazy mode or not depending on the
   * presence of a _files file on the input. If we see a _files on the input, we simply expand it
   * to copy as needed. If we do not, we copy as normal.
   *
   * ==
   *
   * listFilesOnOutput governs copy-or-list-files behaviour.
   * If set to true, it'll iterate over input files, and for each file in the input,
   * it'll write out an additional line in a _files file in the output.
   * If set to false, it'll behave as a traditional CopyTask.
   */
  protected boolean listFilesOnOutput = false;

  /**
   * Governs remote-fetch-input behaviour.
   * If set to true, we'll assume that the input has a _files file present which lists
   * the actual input files to copy, and we'll pull each of those on read.
   * If set to false, it'll behave as a traditional CopyTask.
   */
  protected boolean readListFromInput = false;

  /** Creates a work item with all flags left at their defaults. */
  public ReplCopyWork() {
  }

  /**
   * Creates a work item for the given source and destination, passing {@code true}
   * as the third argument of the CopyWork constructor.
   *
   * @param fromPath source path
   * @param toPath destination path
   */
  public ReplCopyWork(final Path fromPath, final Path toPath) {
    super(fromPath, toPath, true);
  }

  /**
   * Creates a work item for the given source and destination.
   *
   * @param fromPath source path
   * @param toPath destination path
   * @param errorOnSrcEmpty forwarded unchanged to the CopyWork constructor
   */
  public ReplCopyWork(final Path fromPath, final Path toPath, boolean errorOnSrcEmpty) {
    super(fromPath, toPath, errorOnSrcEmpty);
  }

  /**
   * Sets whether input files are listed in a _files file on the output instead of being copied.
   *
   * @param listFilesOnOutput the new value of the flag
   */
  public void setListFilesOnOutputBehaviour(boolean listFilesOnOutput) {
    this.listFilesOnOutput = listFilesOnOutput;
  }

  /**
   * @return the current value of the listFilesOnOutput flag
   */
  public boolean getListFilesOnOutputBehaviour() {
    return this.listFilesOnOutput;
  }

  /**
   * Sets whether the input is expected to carry a _files file naming the real files to copy.
   *
   * @param readListFromInput the new value of the flag
   */
  public void setReadListFromInput(boolean readListFromInput) {
    this.readListFromInput = readListFromInput;
  }

  /**
   * @return the current value of the readListFromInput flag
   */
  public boolean getReadListFromInput() {
    return this.readListFromInput;
  }

  /**
   * Specialization of {@link #getListFilesOnOutputBehaviour()} with a FileStatus argument.
   *
   * We can default to the default getListFilesOnOutputBehaviour behaviour,
   * or, we can do additional pattern matching to decide that certain files
   * should not be listed, and copied instead, _metadata files, for instance.
   * Currently, we use this to skip _metadata files, but we might decide that
   * this is not the right place for it later on.
   *
   * Only the final component of the path is compared, and it must equal the metadata
   * file name exactly. Matching on the whole path string would also catch ordinary data
   * files that merely live under a directory whose name contains "_metadata", forcing
   * them to be copied even when listing was requested.
   *
   * @param f status of the file being considered
   * @return {@code false} if the file is named _metadata (it must always be copied),
   *         otherwise the current value of the listFilesOnOutput flag
   */
  public boolean getListFilesOnOutputBehaviour(FileStatus f) {
    if (METADATA_FILE_NAME.equals(f.getPath().getName())) {
      return false; // always copy _metadata files
    }
    return this.listFilesOnOutput;
  }
}