/* * Copyright 2011 10gen Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.mongodb.hadoop.pig; import com.mongodb.BasicDBObjectBuilder; import com.mongodb.hadoop.BSONFileOutputFormat; import com.mongodb.hadoop.pig.udf.types.PigBoxedBSONValue; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.OutputFormat; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.pig.LoadFunc; import org.apache.pig.ResourceSchema; import org.apache.pig.ResourceSchema.ResourceFieldSchema; import org.apache.pig.ResourceStatistics; import org.apache.pig.StoreFunc; import org.apache.pig.StoreMetadata; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataType; import org.apache.pig.data.Tuple; import org.apache.pig.impl.util.UDFContext; import org.apache.pig.impl.util.Utils; import org.joda.time.DateTime; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; import java.util.Properties; public class BSONStorage extends StoreFunc implements StoreMetadata { private static final Log LOG = LogFactory.getLog(MongoStorage.class); static final String SCHEMA_SIGNATURE = "bson.pig.output.schema"; //CHECKSTYLE:OFF protected ResourceSchema schema = null; //CHECKSTYLE:ON private RecordWriter out; private String udfcSignature = null; private String idField = null; private final BSONFileOutputFormat outputFormat = new BSONFileOutputFormat(); public BSONStorage() { } public BSONStorage(final String idField) { this.idField = idField; } /** * Returns object more suited for BSON storage. Object o corresponds to a field value in pig. * * @param o object representing pig type to convert to BSON-like object * @param field field to place o in * @param toIgnore name of field in Object o to ignore * @return an Object that can be stored as BSON. * @throws IOException if no schema is available from the field */ public static Object getTypeForBSON(final Object o, final ResourceFieldSchema field, final String toIgnore) throws IOException { byte dataType; ResourceSchema fieldInnerSchema = null; if (null == o) { return null; } if (null == field || DataType.UNKNOWN == field.getType()) { dataType = DataType.findType(o); } else { dataType = field.getType(); fieldInnerSchema = field.getSchema(); } if (dataType == DataType.BYTEARRAY && o instanceof Map) { dataType = DataType.MAP; } switch (dataType) { case DataType.NULL: return null; case DataType.INTEGER: case DataType.LONG: case DataType.FLOAT: case DataType.DOUBLE: return o; case DataType.BYTEARRAY: if (o instanceof PigBoxedBSONValue) { return ((PigBoxedBSONValue) o).getObject(); } return o.toString(); case DataType.CHARARRAY: return o; case DataType.DATETIME: return ((DateTime) o).toDate(); //Given a TUPLE, create a Map so BSONEncoder will eat it case DataType.TUPLE: // If there is no inner schema, just return the Tuple. // BasicBSONEncoder will consume it as an Iterable. if (fieldInnerSchema == null) { return o; } // If there was an inner schema, create a Map from the Tuple. ResourceFieldSchema[] fs = fieldInnerSchema.getFields(); // check if fs[0] should be 'unnamed', in which case, we create // an array of 'inner' elements. // For example, {("a"),("b")} becomes ["a","b"] if/ // unnamedStr == "t" and schema for bag is {<*>:(t:chararray)}/ // <*> -> can be any string since the field name of the tuple in // a bag should be ignored if (1 == fs.length && fs[0].getName().equals(toIgnore)) { return getTypeForBSON(((Tuple) o).get(0), fs[0], toIgnore); } // If there is more than one field in the tuple or no fields // to ignore, treat the Tuple as a Map. Map<String, Object> m = new LinkedHashMap<String, Object>(); for (int j = 0; j < fs.length; j++) { m.put(fs[j].getName(), getTypeForBSON(((Tuple) o).get(j), fs[j], toIgnore)); } return m; // Given a BAG, create an Array so BSONEncoder will eat it. case DataType.BAG: // If there is no inner schema, just return the Bag. // BasicBSONEncoder will consume it as an Iterable. if (null == fieldInnerSchema) { return o; } fs = fieldInnerSchema.getFields(); ArrayList<Object> bagList = new ArrayList<Object>(); for (Tuple t : (DataBag) o) { bagList.add(getTypeForBSON(t, fs[0], toIgnore)); } return bagList; case DataType.MAP: if (o == null) { return null; } Map map = (Map) o; Map<String, Object> out = new HashMap<String, Object>(map.size()); for (Object key : map.keySet()) { out.put(key.toString(), getTypeForBSON(map.get(key), null, toIgnore)); } return out; default: return o; } } @SuppressWarnings("unchecked") protected void writeField(final BasicDBObjectBuilder builder, final ResourceFieldSchema field, final Object d) throws IOException { Object convertedType = getTypeForBSON(d, field, null); String fieldName = field != null ? field.getName() : "value"; if (convertedType instanceof Map) { for (Map.Entry<String, Object> mapentry : ((Map<String, Object>) convertedType).entrySet()) { String addKey = mapentry.getKey().equals(idField) ? "_id" : mapentry.getKey(); builder.add(addKey, mapentry.getValue()); } } else { builder.add(fieldName, convertedType); } } @Override public void checkSchema(final ResourceSchema schema) throws IOException { this.schema = schema; UDFContext context = UDFContext.getUDFContext(); Properties p = context.getUDFProperties(getClass(), new String[]{udfcSignature}); p.setProperty(SCHEMA_SIGNATURE, schema.toString()); } @Override public void storeSchema(final ResourceSchema schema, final String location, final Job job) { // not implemented } @Override public void storeStatistics(final ResourceStatistics stats, final String location, final Job job) { // not implemented } @Override public void putNext(final Tuple tuple) throws IOException { try { final BasicDBObjectBuilder builder = BasicDBObjectBuilder.start(); ResourceFieldSchema[] fields = null; if (schema != null) { fields = schema.getFields(); } if (fields != null) { for (int i = 0; i < fields.length; i++) { writeField(builder, fields[i], tuple.get(i)); } } else { for (int i = 0; i < tuple.size(); i++) { writeField(builder, null, tuple.get(i)); } } out.write(null, builder.get()); } catch (Exception e) { throw new IOException("Couldn't convert tuple to bson: ", e); } } @Override public void prepareToWrite(final RecordWriter writer) throws IOException { out = writer; if (out == null) { throw new IOException("Invalid Record Writer"); } UDFContext udfc = UDFContext.getUDFContext(); Properties p = udfc.getUDFProperties(getClass(), new String[]{udfcSignature}); String strSchema = p.getProperty(SCHEMA_SIGNATURE); if (strSchema == null) { LOG.warn("Could not find schema in UDF context!"); LOG.warn("Will attempt to write records without schema."); } try { // Parse the schema from the string stored in the properties object. schema = new ResourceSchema(Utils.getSchemaFromString(strSchema)); } catch (Exception e) { schema = null; LOG.warn(e.getMessage()); } } @Override public OutputFormat getOutputFormat() throws IOException { return outputFormat; } @Override public String relToAbsPathForStoreLocation(final String location, final Path curDir) throws IOException { return LoadFunc.getAbsolutePath(location, curDir); } @Override public void setStoreLocation(final String location, final Job job) throws IOException { final Configuration config = job.getConfiguration(); // Old property. config.set("mapred.output.dir", location); // Modern property. config.set("mapreduce.output.fileoutputformat.outputdir", location); } @Override public void setStoreFuncUDFContextSignature(final String signature) { udfcSignature = signature; } }