/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.piggybank.storage;

import java.io.IOException;
import java.io.OutputStream;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.pig.Expression;
import org.apache.pig.LoadMetadata;
import org.apache.pig.StoreMetadata;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.backend.datastorage.ContainerDescriptor;
import org.apache.pig.backend.datastorage.DataStorage;
import org.apache.pig.backend.datastorage.ElementDescriptor;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.backend.hadoop.datastorage.HDataStorage;
import org.apache.pig.backend.hadoop.datastorage.HDirectory;
import org.apache.pig.backend.hadoop.datastorage.HFile;
import org.apache.pig.impl.io.FileLocalizer;
import org.codehaus.jackson.JsonGenerationException;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.map.JsonMappingException;
import org.codehaus.jackson.map.ObjectMapper;

/**
 * Reads and writes metadata using JSON in metafiles stored next to the data.
 */
public class JsonMetadata implements LoadMetadata, StoreMetadata {

    private static final Log log = LogFactory.getLog(JsonMetadata.class);

    // These are not static final because we may want to make them adjustable by users.
    private String schemaFileName = ".pig_schema";
    private String headerFileName = ".pig_header";
    private String statFileName = ".pig_stats";

    private boolean printHeaders = true;

    private byte fieldDel;
    private byte recordDel;

    public JsonMetadata() {
    }
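    // For orientation, a sketch of the on-disk layout that findMetaFile below
    // resolves against (paths are illustrative only):
    //
    //   /data/input/part-00000               a data file
    //   /data/input/.pig_schema.part-00000   per-file schema, checked first
    //   /data/input/.pig_schema              directory-wide schema, the fallback
    //   /data/input/.pig_header              header line, written when printHeaders is true
    //   /data/input/.pig_stats               optional JSON statistics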
    /**
     * Given a path, which may represent a glob pattern, a directory, or a file, this method
     * finds the set of relevant metadata files on the storage system. The algorithm for finding
     * the metadata file is as follows:
     * <p>
     * For each file represented by the path (either directly, or via a glob):
     * if parentPath/prefix.fileName exists, use that as the metadata file;
     * else, if parentPath/prefix exists, use that as the metadata file.
     * <p>
     * Resolving conflicts, merging the metadata, etc., is not handled by this method and should
     * be taken care of by downstream code.
     * <p>
     * This can go into a util package if metadata files are considered a general enough pattern.
     *
     * @param path Path, as passed in to a LoadFunc (may be a Hadoop glob)
     * @param prefix Metadata file designation, such as .pig_schema or .pig_stats
     * @param conf configuration object
     * @return Set of element descriptors for all metadata files associated with the files on the path.
     */
    protected Set<ElementDescriptor> findMetaFile(String path, String prefix, Configuration conf)
            throws IOException {
        DataStorage storage = new HDataStorage(ConfigurationUtil.toProperties(conf));
        String fullPath = FileLocalizer.fullPath(path, storage);
        Set<ElementDescriptor> metaFileSet = new HashSet<ElementDescriptor>();
        if (storage.isContainer(fullPath)) {
            ElementDescriptor metaFilePath = storage.asElement(fullPath, prefix);
            if (metaFilePath.exists()) {
                metaFileSet.add(metaFilePath);
            }
        } else {
            ElementDescriptor[] descriptors = storage.asCollection(path);
            for (ElementDescriptor descriptor : descriptors) {
                String fileName = null, parentName = null;
                ContainerDescriptor parentContainer = null;
                // only HFile descriptors are expected here; extract the file name
                // and its parent directory
                if (descriptor instanceof HFile) {
                    Path descriptorPath = ((HFile) descriptor).getPath();
                    fileName = descriptorPath.getName();
                    Path parent = descriptorPath.getParent();
                    parentName = parent.toString();
                    parentContainer = new HDirectory((HDataStorage) storage, parent);
                }
                ElementDescriptor metaFilePath = storage.asElement(parentName, prefix + "." + fileName);

                // if the file has its own metadata file (prefix.fileName), use it
                if (metaFilePath.exists()) {
                    metaFileSet.add(metaFilePath);
                    continue;
                }
                // otherwise, fall back to the directory-wide metadata file
                metaFilePath = storage.asElement(parentContainer, prefix);
                if (metaFilePath.exists()) {
                    metaFileSet.add(metaFilePath);
                }
            }
        }
        return metaFileSet;
    }

    //------------------------------------------------------------------------
    // Implementation of LoadMetadata interface

    @Override
    public String[] getPartitionKeys(String location, Job job) {
        return null;
    }

    @Override
    public void setPartitionFilter(Expression partitionFilter) throws IOException {
    }

    /**
     * For JsonMetadata, the schema is considered optional.
     * This method suppresses (and logs) errors if they are encountered.
     *
     * TODO the location and conf params are ignored in favor of initialization data
     */
    @Override
    public ResourceSchema getSchema(String location, Job job) throws IOException {
        Configuration conf = job.getConfiguration();
        Set<ElementDescriptor> schemaFileSet = null;
        try {
            schemaFileSet = findMetaFile(location, schemaFileName, conf);
        } catch (IOException e) {
            log.warn("Could not find schema file for " + location);
            return null;
        }

        // TODO we assume that all schemas are the same. The question of merging
        // schemas is left open for now.
        ElementDescriptor schemaFile = null;
        if (!schemaFileSet.isEmpty()) {
            schemaFile = schemaFileSet.iterator().next();
        } else {
            log.warn("Could not find schema file for " + location);
            return null;
        }
        log.info("Found schema file: " + schemaFile.toString());
        ResourceSchema resourceSchema = null;
        try {
            resourceSchema = new ObjectMapper().readValue(schemaFile.open(), ResourceSchema.class);
        } catch (JsonParseException e) {
            log.warn("Unable to load Resource Schema for " + location, e);
        } catch (JsonMappingException e) {
            log.warn("Unable to load Resource Schema for " + location, e);
        } catch (IOException e) {
            log.warn("Unable to load Resource Schema for " + location, e);
        }
        return resourceSchema;
    }
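    // For reference, a .pig_schema file as consumed above might look roughly like
    // the following (a hypothetical example; the numeric types are
    // org.apache.pig.data.DataType codes, e.g. 55 for CHARARRAY, 10 for INTEGER):
    //
    //   {"fields":[{"name":"name","type":55,"description":null,"schema":null},
    //              {"name":"age","type":10,"description":null,"schema":null}],
    //    "version":0,"sortKeys":null,"sortKeyOrders":null}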
    /**
     * For JsonMetadata, statistics are considered optional.
     * This method suppresses (and logs) errors if they are encountered.
     *
     * @see org.apache.pig.LoadMetadata#getStatistics(String, Job)
     */
    @Override
    public ResourceStatistics getStatistics(String location, Job job) throws IOException {
        Configuration conf = job.getConfiguration();
        Set<ElementDescriptor> statFileSet = null;
        try {
            statFileSet = findMetaFile(location, statFileName, conf);
        } catch (IOException e) {
            log.warn("Could not find stat file for " + location);
            return null;
        }
        ElementDescriptor statFile = null;
        if (!statFileSet.isEmpty()) {
            statFile = statFileSet.iterator().next();
        } else {
            log.warn("Could not find stat file for " + location);
            return null;
        }
        log.info("Found stat file " + statFile.toString());
        ResourceStatistics resourceStats = null;
        try {
            resourceStats = new ObjectMapper().readValue(statFile.open(), ResourceStatistics.class);
        } catch (JsonParseException e) {
            log.warn("Unable to load Resource Statistics for " + location, e);
        } catch (JsonMappingException e) {
            log.warn("Unable to load Resource Statistics for " + location, e);
        } catch (IOException e) {
            log.warn("Unable to load Resource Statistics for " + location, e);
        }
        return resourceStats;
    }

    //------------------------------------------------------------------------
    // Implementation of StoreMetadata interface

    @Override
    public void storeStatistics(ResourceStatistics stats, String location, Job job)
            throws IOException {
        Configuration conf = job.getConfiguration();
        DataStorage storage = new HDataStorage(ConfigurationUtil.toProperties(conf));
        ElementDescriptor statFilePath = storage.asElement(location, statFileName);
        if (!statFilePath.exists() && stats != null) {
            try {
                new ObjectMapper().writeValue(statFilePath.create(), stats);
            } catch (JsonGenerationException e) {
                log.warn("Unable to write Resource Statistics for " + location, e);
            } catch (JsonMappingException e) {
                log.warn("Unable to write Resource Statistics for " + location, e);
            }
        }
    }

    @Override
    public void storeSchema(ResourceSchema schema, String location, Job job) throws IOException {
        Configuration conf = job.getConfiguration();
        DataStorage storage = new HDataStorage(ConfigurationUtil.toProperties(conf));
        ElementDescriptor schemaFilePath = storage.asElement(location, schemaFileName);
        if (!schemaFilePath.exists() && schema != null) {
            try {
                new ObjectMapper().writeValue(schemaFilePath.create(), schema);
            } catch (JsonGenerationException e) {
                log.warn("Unable to write Resource Schema for " + location, e);
            } catch (JsonMappingException e) {
                log.warn("Unable to write Resource Schema for " + location, e);
            }
        }
        if (printHeaders && schema != null) {
            ElementDescriptor headerFilePath = storage.asElement(location, headerFileName);
            if (!headerFilePath.exists()) {
                OutputStream os = headerFilePath.create();
                try {
                    // write a single header record: field names separated by
                    // fieldDel, terminated by recordDel
                    String[] names = schema.fieldNames();
                    for (int i = 0; i < names.length; i++) {
                        os.write(names[i].getBytes("UTF-8"));
                        if (i < names.length - 1) {
                            os.write(fieldDel);
                        } else {
                            os.write(recordDel);
                        }
                    }
                } finally {
                    os.close();
                }
            }
        }
    }

    public void setFieldDel(byte fieldDel) {
        this.fieldDel = fieldDel;
    }

    public void setRecordDel(byte recordDel) {
        this.recordDel = recordDel;
    }
}
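// A minimal usage sketch (hypothetical caller; a storage func such as
// PigStorageSchema in this package delegates to JsonMetadata similarly):
//
//   JsonMetadata metadata = new JsonMetadata();
//   metadata.setFieldDel((byte) '\t');
//   metadata.setRecordDel((byte) '\n');
//   ResourceSchema schema = metadata.getSchema(location, job); // reads .pig_schema
//   metadata.storeSchema(schema, location, job);               // writes .pig_schema and .pig_header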