/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.pig.piggybank.storage.allloader;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.pig.FuncSpec;
import org.apache.pig.impl.logicalLayer.FrontendException;
/**
 * Contains the logic for finding a LoadFunc (as a {@link FuncSpec}) for a
 * file, based on the mappings defined in the
 * <code>file.extension.loaders</code> configuration property.
 * <p>
 * A loader is looked up first by the file name extension(s) and, when that
 * yields nothing, by the first three bytes of the file content (magic
 * number).
 * <p>
 * Only <code>file.extension.loaders</code> is read by this class.
 */
public class LoadFuncHelper {

    public static final String FILE_EXTENSION_LOADERS = "file.extension.loaders";

    /**
     * Number of leading bytes of a file that are compared against the known
     * magic numbers.
     */
    private static final int MAGIC_LENGTH = 3;

    /**
     * Maps the first three bytes of a file to an extension (seq, lzo, gz,
     * bz2). It is consulted when no loader could be determined from the file
     * name. The resulting extension must have an entry in
     * file.extension.loaders. If the content does not match any entry in this
     * map, plain text is assumed.
     */
    private static final Map<MagicNumber, String> magicNumberExtensionMap = buildMagicNumberExtensionMap();

    Configuration conf;

    FileSystem fileSystem;

    /**
     * Stores the load function definitions. Keys have one of the forms
     * <code>extension:</code> (no tag), <code>extension:tag</code> or
     * <code>extension:tag:sequenceFileKeyClass</code>.
     */
    Map<String, String> loadFunctionExtensionTagMap;

    /**
     * Stores the extension = tags pairs. Extensions configured without a tag
     * have no entry in this map.
     */
    Map<String, Set<String>> extensionTagsMap;

    /**
     * Parses the file.extension.loaders property of the configuration into
     * the extension/tag lookup maps.
     * <p>
     * Entries are separated by <code>),</code> and the fields of an entry by
     * <code>:</code>. Supported entry forms are:
     * <ul>
     * <li>extension:loader</li>
     * <li>extension:pathTag:loader</li>
     * <li>extension:pathTag:sequenceFileKeyClass:loader</li>
     * </ul>
     *
     * @param conf
     *            configuration used to obtain the FileSystem and the
     *            file.extension.loaders property
     * @throws IOException
     *             if the FileSystem cannot be obtained; a FrontendException
     *             is thrown if an entry does not have 2, 3 or 4 fields
     */
    public LoadFuncHelper(Configuration conf) throws IOException {
        this.conf = conf;
        fileSystem = FileSystem.get(conf);

        loadFunctionExtensionTagMap = new HashMap<String, String>();
        extensionTagsMap = new HashMap<String, Set<String>>();

        String fileExtensionLoaders = conf.get(FILE_EXTENSION_LOADERS);
        if (fileExtensionLoaders == null) {
            // nothing configured, both maps stay empty
            return;
        }

        // NOTE: splitting on ")," consumes the closing parenthesis of every
        // entry except the last one.
        for (String entry : fileExtensionLoaders.split("\\),")) {
            String[] fields = entry.split(":");

            if (fields.length == 2) {
                // extension:loader, registered under the EMPTY TAG
                loadFunctionExtensionTagMap.put(fields[0].trim() + ":",
                        fields[1].trim());
            } else if (fields.length == 3 || fields.length == 4) {
                // extension:pathTag:loader, registered under the TAG
                String ext = fields[0].trim();
                String tag = fields[1].trim();
                String key = ext + ":" + tag;
                String loadFunc = fields[2].trim();

                if (fields.length == 4) {
                    // with four fields the third one is the sequence file
                    // key class and the fourth one is the loader
                    key += ":" + loadFunc;
                    loadFunc = fields[3].trim();
                }

                loadFunctionExtensionTagMap.put(key, loadFunc);

                Set<String> tags = extensionTagsMap.get(ext);
                if (tags == null) {
                    tags = new TreeSet<String>();
                    extensionTagsMap.put(ext, tags);
                }
                tags.add(tag);
            } else {
                throw new FrontendException(
                        "Bad formatted file.extension.loaders string, format is <extension>:<loader>,<extenion><loader>");
            }
        }
    }

    /**
     * Builds the magic number to extension mappings for SEQ, LZO, GZ and BZ2
     * files.
     *
     * @return Map with the first three bytes of a file as key and the
     *         extension as value
     */
    private static Map<MagicNumber, String> buildMagicNumberExtensionMap() {
        Map<MagicNumber, String> extensionsByMagic = new HashMap<MagicNumber, String>();

        // 'S' 'E' 'Q'
        extensionsByMagic.put(new MagicNumber(new byte[] { 83, 69, 81 }),
                "seq");
        extensionsByMagic.put(new MagicNumber(new byte[] { -119, 76, 90 }),
                "lzo");
        extensionsByMagic.put(new MagicNumber(new byte[] { 31, -117, 8 }),
                "gz");
        // 'B' 'Z' 'h'
        extensionsByMagic.put(new MagicNumber(new byte[] { 66, 90, 104 }),
                "bz2");

        return extensionsByMagic;
    }

    /**
     * If location is a directory the first file found is returned, otherwise
     * the location itself is returned as a Path.
     *
     * @param location
     *            file or directory
     * @return Path of the file
     * @throws IOException
     *             if no file is found a FrontendException is thrown
     */
    public Path determineFirstFile(String location) throws IOException {
        Path path = new Path(location);
        FileStatus status = fileSystem.getFileStatus(path);

        if (!status.isDir()) {
            return path;
        }

        // keep the directory in its own variable so that it can be reported
        // when no file is found
        Path firstFile = getFirstFile(fileSystem, path);
        if (firstFile == null) {
            throw new FrontendException(path + " has no files");
        }

        return firstFile;
    }

    /**
     * If location is a directory the first file found in the directory is
     * used.<br/>
     * The loader is then determined as described in
     * {@link #determineFunction(String, Path)}.
     *
     * @param location
     *            file or directory
     * @return FuncSpec
     * @throws IOException
     */
    public FuncSpec determineFunction(String location) throws IOException {
        Path firstFile = determineFirstFile(location);
        return determineFunction(location, firstFile);
    }

    /**
     * The file extension of the file will be searched against the
     * file.extension.loaders mappings. If none is found the loader is
     * determined from the first bytes of the file content.
     *
     * @param location
     *            not used by this method
     * @param path
     *            the file for which a loader is searched
     * @return FuncSpec
     * @throws IOException
     */
    public FuncSpec determineFunction(String location, Path path)
            throws IOException {
        FuncSpec byExtension = getLoadPerExtension(path.getName(), path);
        if (byExtension != null) {
            return byExtension;
        }

        // look for loaders by the content definition
        return getFuncSpecFromContent(path);
    }

    /**
     * Tries to identify the extension and thereby the loader from the content
     * type.<br/>
     * If the file has less than three bytes or the first three bytes do not
     * match a known magic number, plain text is assumed and PigStorage() is
     * returned.
     *
     * @param path
     *            the file to inspect
     * @return FuncSpec
     * @throws IOException
     *             if the file cannot be read; a RuntimeException is thrown if
     *             the content type is recognized but no loader is mapped to
     *             it
     */
    private FuncSpec getFuncSpecFromContent(Path path) throws IOException {
        byte[] magic = new byte[MAGIC_LENGTH];
        int read = readLeadingBytes(path, magic);

        String extensionMapping = null;
        if (read == magic.length) {
            extensionMapping = magicNumberExtensionMap.get(new MagicNumber(
                    magic));
        }

        if (extensionMapping == null) {
            // assume plain text
            return new FuncSpec("PigStorage()");
        }

        // an extension mapping was found. i.e. this is a GZ, BZ2, LZO or
        // SEQ file
        String applicableTag = getApplicableTag(extensionMapping, path);
        String loadFuncDefinition = null;

        if (extensionMapping.equals("seq")) {
            // if this is a sequence file we look for a mapping that includes
            // the key class first
            loadFuncDefinition = loadFunctionExtensionTagMap
                    .get(extensionMapping + ":" + applicableTag + ":"
                            + getSequenceFileKeyClass(path));
        }

        // this is done for sequence files also, because the mapping of a
        // sequence file might have been defined without a key class, in which
        // case the loader defined for the extension and tag only is used.
        if (loadFuncDefinition == null) {
            loadFuncDefinition = loadFunctionExtensionTagMap
                    .get(extensionMapping + ":" + applicableTag);
        }

        if (loadFuncDefinition == null) {
            throw new RuntimeException("Cannot find loader for " + path
                    + " extension mapping " + extensionMapping);
        }

        return new FuncSpec(loadFuncDefinition);
    }

    /**
     * Reads bytes from the start of the file until the buffer is full or the
     * end of the file is reached.
     *
     * @param path
     *            the file to read from
     * @param buffer
     *            receives the bytes read
     * @return int the number of bytes stored in buffer
     * @throws IOException
     */
    private int readLeadingBytes(Path path, byte[] buffer) throws IOException {
        FSDataInputStream dataIn = null;
        int total = 0;

        try {
            dataIn = fileSystem.open(path, buffer.length);

            // a single read call is allowed to return fewer bytes than
            // requested, so read until the buffer is full or EOF is reached
            while (total < buffer.length) {
                int count = dataIn.read(buffer, total, buffer.length - total);
                if (count < 0) {
                    break;
                }
                total += count;
            }
        } finally {
            // dataIn is null when open failed; closing it unguarded would
            // hide the original exception behind a NullPointerException
            if (dataIn != null) {
                dataIn.close();
            }
        }

        return total;
    }

    /**
     * Open a SequenceFile.Reader instance and return the keyClassName. If the
     * class name contains a '$' after its first character, only the part
     * before the '$' is returned.
     *
     * @param path
     *            the sequence file
     * @return String the key class name
     * @throws IOException
     */
    private String getSequenceFileKeyClass(Path path) throws IOException {
        SequenceFile.Reader reader = new SequenceFile.Reader(fileSystem, path,
                conf);

        try {
            String keyClassName = reader.getKeyClassName();
            int nestedMarker = keyClassName.indexOf("$");

            return (nestedMarker > 0) ? keyClassName.substring(0,
                    nestedMarker) : keyClassName;
        } finally {
            reader.close();
        }
    }

    /**
     * Search for the correct loader based on the extension and tags mappings.
     * <br/>
     * Extensions are tried from the last one to the first one, e.g. for
     * file.rc.gz first gz is tried and then rc. The search stops without a
     * result as soon as the file name has no extension left or an extension
     * has tags of which none applies to the path.
     *
     * @param fileName
     *            the name of the file
     * @param path
     *            the path of the file, used for the tag matching
     * @return FuncSpec null if no loader was found
     */
    private FuncSpec getLoadPerExtension(String fileName, Path path) {
        String remainingName = fileName;

        while (remainingName != null) {
            String extension = getExtension(remainingName);
            if (extension == null) {
                return null;
            }

            String applicableTag = getApplicableTag(extension, path);
            if (applicableTag == null) {
                return null;
            }

            String loadFuncDefinition = loadFunctionExtensionTagMap
                    .get(extension + ":" + applicableTag);
            if (loadFuncDefinition != null) {
                return new FuncSpec(loadFuncDefinition);
            }

            // no loader for this extension, try the next one to the left
            remainingName = cutExtension(remainingName);
        }

        return null;
    }

    /**
     * Searches in the path for the first occurrence of the tags associated
     * with the extension.<br/>
     * If this extension has no tags an empty string is returned.<br/>
     * If it has tags and no tag is found in the path null is returned.<br/>
     *
     * @param extension
     *            the extension for which the tags are looked up
     * @param path
     *            the path in which the tags are searched
     * @return String the first tag, in the iteration order of the tag set,
     *         that is contained in the path
     */
    private String getApplicableTag(String extension, Path path) {
        Set<String> tags = extensionTagsMap.get(extension);
        if (tags == null) {
            return "";
        }

        String fullPathName = path.toUri().toString();
        for (String tag : tags) {
            if (fullPathName.contains(tag)) {
                return tag;
            }
        }

        return null;
    }

    /**
     * @param fileName
     * @return String return the file name without the last extension e.g.
     *         file.rc.gz will return file.rc. null is returned if the file
     *         name contains no '.' or only at the first position.
     */
    private static String cutExtension(String fileName) {
        int index = fileName.lastIndexOf('.');

        // lastIndexOf never returns a value beyond the last character, so
        // only the lower bound needs to be checked
        if (index <= 0) {
            return null;
        }

        return fileName.substring(0, index);
    }

    /**
     * @param fileName
     * @return String return the last file name extension e.g. file.rc.gz will
     *         return gz. null is returned if the file name contains no '.',
     *         starts with its only '.' or ends with a '.'.
     */
    private static String getExtension(String fileName) {
        int index = fileName.lastIndexOf('.');
        int pos = index + 1;

        if (index <= 0 || pos >= fileName.length()) {
            return null;
        }

        return fileName.substring(pos);
    }

    /**
     * Looks for and returns the first file it can find. Entries are visited
     * in sorted order and directories are searched recursively. Entries with
     * a name that starts with '.' or '_' are skipped.
     *
     * @param fileSystem
     *            the file system used for listing
     * @param path
     *            the directory to search in
     * @return Path null if no file was found
     * @throws IOException
     */
    private static Path getFirstFile(FileSystem fileSystem, Path path)
            throws IOException {
        FileStatus[] children = fileSystem.listStatus(path);
        if (children == null) {
            // defensive: treat a missing listing as no files
            return null;
        }

        Arrays.sort(children);

        for (FileStatus child : children) {
            Path childPath = child.getPath();
            String childName = childPath.getName();

            // if hidden file skip.
            if (childName.startsWith(".") || childName.startsWith("_")) {
                continue;
            }

            if (!child.isDir()) {
                // first file found return.
                return childPath;
            }

            // stop at the first directory that contains a file, otherwise a
            // later entry would replace the file that was already found
            Path nested = getFirstFile(fileSystem, childPath);
            if (nested != null) {
                return nested;
            }
        }

        return null;
    }

    /**
     * Wraps a byte array so that it can be used as a Map key; equals and
     * hashCode are based on the content of the array.
     */
    static class MagicNumber {

        byte[] magic;

        public MagicNumber(byte[] magic) {
            super();
            this.magic = magic;
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            return prime + Arrays.hashCode(magic);
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            return Arrays.equals(magic, ((MagicNumber) obj).magic);
        }
    }
}