/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.pig.piggybank.storage.partition;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
*
* It is sometimes convenient to partition logs by date or by other values,
* e.g. country, city etc.<br/>
* A daydate partitioned hdfs directory might look something like:<br/>
*
* <pre>
* /logs/repo/mylog/
* daydate=2010-01-01
* daydate=2010-01-02
* </pre>
*
* This class accepts a path like /logs/repo/mylog and returns the partition
* keys (and, when parsed from the path string, their values) found in it.
*/
public class PathPartitioner {

    /**
     * Extracts the <code>key=value</code> pairs from a path string without
     * accessing the file system. The string is split on '/' and every
     * component of the form <code>key=value</code> contributes one entry.
     *
     * @param location
     *            String path to parse e.g.
     *            /logs/repo/mylog/year=2010/month=07; a null value yields
     *            an empty map
     * @return Map of partition key to partition value. Entries are ordered
     *         by the first appearance of each key in the path, i.e. for
     *         /logs/repo/year=2010/month=07 the first entry is year=2010
     *         and the second month=07. If a key occurs more than once the
     *         last value wins.
     * @throws IOException
     *             never thrown by this implementation; kept in the
     *             signature for compatibility with existing callers
     */
    public Map<String, String> getPathPartitionKeyValues(String location)
            throws IOException {
        // use LinkedHashMap because order is important here.
        Map<String, String> partitionKeys = new LinkedHashMap<String, String>();

        if (location == null) {
            return partitionKeys;
        }

        for (String pathSplitItem : location.split("/")) {
            parseAndPutKeyValue(pathSplitItem, partitionKeys);
        }

        return partitionKeys;
    }

    /**
     * Searches for the partition keys of the <code>key=value</code>
     * directories found at and below the path given by the location
     * parameter.
     * <p>
     * The last component of location is parsed first. The directory tree
     * is then walked downwards: at every level the first entry returned by
     * {@link FileSystem#listStatus(Path)} that is a directory and whose
     * name does not start with "_" is followed and its name is parsed for
     * a key. The walk stops at the first level that has no such directory.
     *
     * @param location
     *            String root path in hdfs e.g. /user/hive/warehouse or
     *            /logs/repo
     * @param conf
     *            Configuration used to resolve the FileSystem of location
     * @return Set of String. The order is maintained as per the directory
     *         tree, i.e. if /logs/repo/year=2010/month=07 exists the first
     *         item in the set will be year and the second month.
     * @throws IOException
     *             if the file system cannot be obtained or listed
     */
    public Set<String> getPartitionKeys(String location, Configuration conf)
            throws IOException {
        Path current = new Path(location);
        FileSystem fs = current.getFileSystem(conf);

        // use LinkedHashSet because order is important here.
        Set<String> partitionKeys = new LinkedHashSet<String>();

        // the location itself may already end in a key=value directory.
        parseAndPutKeyValue(location, partitionKeys);

        while (current != null) {
            FileStatus[] children = fs.listStatus(current);
            if (children == null || children.length == 0) {
                break;
            }

            // Pick the first visible sub directory. Hidden directories
            // (prefix "_") and plain files are never descended into; if the
            // listing holds nothing else (this is also the case when
            // current is a file) nextDir stays null and the walk ends.
            Path nextDir = null;
            for (FileStatus child : children) {
                Path childPath = child.getPath();
                if (child.isDir() && !childPath.getName().startsWith("_")) {
                    nextDir = childPath;
                    break;
                }
            }

            if (nextDir != null) {
                parseAndPutKeyValue(nextDir.getName(), partitionKeys);
            }
            current = nextDir;
        }

        return partitionKeys;
    }

    /**
     * Parses pathName and, if it holds a key=value pair, stores the pair in
     * the map.
     */
    private void parseAndPutKeyValue(String pathName,
            Map<String, String> partitionKeys) {
        String[] keyValue = parsePathKeyValue(pathName);
        if (keyValue != null) {
            partitionKeys.put(keyValue[0], keyValue[1]);
        }
    }

    /**
     * Parses pathName and, if it holds a key=value pair, adds the key to
     * the set.
     */
    private void parseAndPutKeyValue(String pathName,
            Set<String> partitionKeys) {
        String[] keyValue = parsePathKeyValue(pathName);
        if (keyValue != null) {
            partitionKeys.add(keyValue[0]);
        }
    }

    /**
     * Will look for a key=value pair in the last component of the path,
     * for example: /user/hive/warehouse/mylogs/year=2010/month=07 gives
     * [month, 07]. Trailing '/' characters are ignored.
     *
     * @param path
     *            path or single path component to parse; may be null
     * @return String[] [0]= key [1] = value, or null if path is null or
     *         its last component does not split on '=' into exactly two
     *         parts (e.g. "month", "month=" or "a=b=c")
     */
    public String[] parsePathKeyValue(String path) {
        if (path == null) {
            return null;
        }

        // ignore trailing separators so that "a/year=2010/" parses like
        // "a/year=2010".
        int end = path.length();
        while (end > 0 && path.charAt(end - 1) == '/') {
            end--;
        }

        // lastIndexOf returns -1 when no separator is left, so start
        // becomes 0 and the whole remaining string is used. The +1 keeps
        // the separator itself out of the key.
        int start = path.lastIndexOf('/', end - 1) + 1;
        String lastComponent = path.substring(start, end);

        String[] split = lastComponent.split("=");
        return (split.length == 2) ? split : null;
    }
}