/* (c) 2014 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the
* License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
*/
package com.linkedin.cubert.utils;
import java.io.IOException;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.GlobPattern;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.node.ArrayNode;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import static com.linkedin.cubert.utils.JsonUtils.getText;
/**
* Utility methods to enumerate paths in the file system.
*
* @author Maneesh Varshney
*
*/
public class FileSystemUtils
{
public static Path getFirstMatch(FileSystem fs, Path path, String globPatternStr, boolean recursive)
throws IOException
{
RemoteIterator<LocatedFileStatus> files = fs.listFiles(path, recursive);
GlobPattern globPattern = new GlobPattern(globPatternStr);
while (files.hasNext())
{
Path aFile = files.next().getPath();
if(globPattern.matches(aFile.getName()))
return aFile;
}
return null;
}
public static List<Path> getPaths(FileSystem fs, JsonNode json, JsonNode params) throws IOException {
return getPaths(fs, json, false, params);
}
public static List<Path> getPaths(FileSystem fs, JsonNode json,
boolean schemaOnly, JsonNode params) throws IOException
{
if (json.isArray())
{
List<Path> paths = new ArrayList<Path>();
// If the specified input is array, recursively get paths for each item in the
// array
ArrayNode anode = (ArrayNode) json;
for (int i = 0; i < anode.size(); i++)
{
paths.addAll(getPaths(fs, json.get(i), params));
}
return paths;
}
else if (json.isTextual())
{
return getPaths(fs, new Path(json.getTextValue()));
}
else
{
List<Path> paths = new ArrayList<Path>();
Path root = new Path(getText(json, "root"));
Path basePath = root;
JsonNode startDateJson = json.get("startDate");
if (schemaOnly && json.get("origStartDate") != null)
startDateJson = json.get("origStartDate");
JsonNode endDateJson = json.get("endDate");
if(startDateJson == null || endDateJson == null)
{
throw new IllegalArgumentException("StartDate and endDate need to be specified");
}
String startDuration, endDuration;
if(startDateJson.isTextual())
{
startDuration = startDateJson.getTextValue();
endDuration = endDateJson.getTextValue();
}
else
{
startDuration = startDateJson.toString();
endDuration = endDateJson.toString();
}
boolean errorOnMissing = false;
JsonNode errorOnMissingJson = params.get("errorOnMissing");
if(errorOnMissingJson != null)
errorOnMissing = Boolean.parseBoolean(errorOnMissingJson.getTextValue());
boolean useHourlyForMissingDaily = false;
JsonNode useHourlyForMissingDailyJson = params.get("useHourlyForMissingDaily");
if(useHourlyForMissingDailyJson != null)
useHourlyForMissingDaily = Boolean.parseBoolean(useHourlyForMissingDailyJson.getTextValue());
DateTimeFormatter dtf = DateTimeFormat.forPattern("yyyyMMdd");
DateTimeFormatter dtfwHour = DateTimeFormat.forPattern("yyyyMMddHH");
DateTime startDate, endDate;
boolean isDaily;
int hourStep;
if (startDuration.length() == 8)
{
if (endDuration.length() != 8)
throw new IllegalArgumentException("EndDate " + endDuration
+ " is not consistent with StartDate " + startDuration);
startDate = dtf.parseDateTime(startDuration);
endDate = dtf.parseDateTime(endDuration);
isDaily = true;
hourStep = 24;
}
else if (startDuration.length() == 10)
{
if (endDuration.length() != 10)
throw new IllegalArgumentException("EndDate " + endDuration
+ " is not consistent with StartDate " + startDuration);
startDate = dtfwHour.parseDateTime(startDuration);
endDate = dtfwHour.parseDateTime(endDuration);
isDaily = false;
hourStep = 1;
}
else
{
throw new IllegalArgumentException("Cannot parse StartDate "
+ startDuration + " as daily or hourly duration");
}
for(Path path: getPaths(fs,root))
{
if(isDaily)
{
if(path.getName().equals("daily"))
basePath = path;
else
basePath = new Path(path, "daily");
}
else
{
if(path.getName().equals("hourly"))
basePath = path;
else
basePath = new Path(path, "hourly");
}
//If daily folder itself doesn't exist
if (!fs.exists(basePath) && isDaily && useHourlyForMissingDaily &&
fs.exists(new Path(basePath.getParent(), "hourly"))) {
basePath = new Path(basePath.getParent(), "hourly");
endDate = endDate.plusHours(23);
isDaily = false;
hourStep = 1;
}
paths.addAll(getDurationPaths(fs,
basePath,
startDate,
endDate,
isDaily,
hourStep,
errorOnMissing,
useHourlyForMissingDaily));
}
if (paths.isEmpty() && schemaOnly)
throw new IOException(String.format("No input files at %s from %s to %s",
basePath.toString(),
startDuration,
endDuration));
return paths;
}
}
private static Path generateDatedPath(Path base, int year, int month, int day) {
return generateDatedPath(base, year, month, day, -1);
}
private static Path generateDatedPath(Path base, int year, int month, int day, int hour) {
NumberFormat nf2 = new DecimalFormat("00");
return new Path(base, hour != -1 ? nf2.format(year) + "/" + nf2.format(month) + "/" + nf2.format(day) + "/"
+ nf2.format(hour) : nf2.format(year) + "/" + nf2.format(month) + "/" + nf2.format(day));
}
public static List<Path> getPaths(FileSystem fs, Path path) throws IOException
{
List<Path> paths = new ArrayList<Path>();
String pathStr = path.toString();
if (pathStr.contains("*"))
{
for (Path p : getGlobPaths(fs, path))
{
paths.add(getLatestPath(fs, p));
}
}
else
{
paths.add(getLatestPath(fs, path));
}
return paths;
}
public static List<Path> getGlobPaths(FileSystem fs, Path path) throws IOException
{
List<Path> paths = new ArrayList<Path>();
FileStatus[] fileStatus = fs.globStatus(path);
if (fileStatus == null)
throw new IOException("Cannot determine paths at " + path.toString());
for (FileStatus status : fileStatus)
{
paths.add(status.getPath());
}
return paths;
}
public static Path getLatestPath(FileSystem fs, Path path) throws IOException
{
String pathStr = path.toString();
// Return the same path, if there is no "#LATEST" within it
if (!pathStr.contains("#LATEST"))
return path;
// replace all #LATEST with glob "*"
pathStr = pathStr.replaceAll("#LATEST", "*");
FileStatus[] fileStatus = fs.globStatus(new Path(pathStr));
if (fileStatus == null || fileStatus.length == 0)
throw new IOException("Cannot determine paths at " + pathStr);
String latestPath = null;
for (FileStatus status : fileStatus)
{
String thisPath = status.getPath().toString();
if (latestPath == null || thisPath.compareTo(latestPath) > 0)
latestPath = thisPath;
}
return new Path(latestPath);
}
public static List<Path> getDurationPaths(FileSystem fs,
Path root,
DateTime startDate,
DateTime endDate,
boolean isDaily,
int hourStep,
boolean errorOnMissing,
boolean useHourlyForMissingDaily) throws IOException
{
List<Path> paths = new ArrayList<Path>();
while (endDate.compareTo(startDate) >= 0) {
Path loc;
if (isDaily)
loc = generateDatedPath(root, endDate.getYear(), endDate.getMonthOfYear(), endDate.getDayOfMonth());
else
loc = generateDatedPath(root, endDate.getYear(), endDate.getMonthOfYear(), endDate.getDayOfMonth(),
endDate.getHourOfDay());
// Check that directory exists, and contains avro files.
if (fs.exists(loc) && fs.globStatus(new Path(loc, "*" + "avro")).length > 0) {
paths.add(loc);
}
else {
loc = generateDatedPath(new Path(root.getParent(),"hourly"), endDate.getYear(),
endDate.getMonthOfYear(), endDate.getDayOfMonth());
if(isDaily && useHourlyForMissingDaily && fs.exists(loc))
{
for (FileStatus hour: fs.listStatus(loc)) {
paths.add(hour.getPath());
}
}
else if (errorOnMissing) {
throw new RuntimeException("Missing directory " + loc.toString());
}
}
if (hourStep ==24)
endDate = endDate.minusDays(1);
else
endDate = endDate.minusHours(hourStep);
}
return paths;
}
}