/*
 * Copyright 2011 10gen Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.mongodb.hadoop.pig;

import com.mongodb.BasicDBObjectBuilder;
import com.mongodb.MongoClientURI;
import com.mongodb.hadoop.MongoOutputFormat;
import com.mongodb.hadoop.output.MongoRecordWriter;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.StoreFunc;
import org.apache.pig.StoreMetadata;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;

import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

public class MongoStorage extends StoreFunc implements StoreMetadata {
    private static final Log LOG = LogFactory.getLog(MongoStorage.class);

    // Pig specific settings
    static final String PIG_OUTPUT_SCHEMA = "mongo.pig.output.schema";
    static final String PIG_OUTPUT_SCHEMA_UDF_CONTEXT = "mongo.pig.output.schema.udf_context";
    //CHECKSTYLE:OFF
    protected ResourceSchema schema = null;
    //CHECKSTYLE:ON
    private final MongoStorageOptions options;
    private String udfContextSignature = null;
    private MongoRecordWriter recordWriter = null;

    public MongoStorage() {
        this.options = null;
    }

    /**
     * <p>
     * Takes a list of arguments of two types:
     * </p>
     * <ul>
     *   <li>A single set of keys to base updating on in the format:
     *   <code>'update [time, user]'</code> or <code>'multi [time, user]'</code> for multi updates</li>
     *   <li>Multiple indexes to ensure in the format:
     *   <code>'{time: 1, user: 1},{unique: true}'</code>
     *   (The syntax is exactly like db.col.ensureIndex())</li>
     * </ul>
     * <p>
     * Example:
     * </p>
     * <pre><code>
     * STORE Result INTO '$db'
     *     USING com.mongodb.hadoop.pig.MongoStorage(
     *         'update [time, servername, hostname]',
     *         '{time : 1, servername : 1, hostname : 1}, {unique:true, dropDups: true}'
     *     )
     * </code></pre>
     *
     * @param args storage arguments
     * @throws ParseException if the arguments cannot be parsed
     */
    public MongoStorage(final String... args) throws ParseException {
        this.options = MongoStorageOptions.parseArguments(args);
    }
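    // Usage sketch (illustrative, not part of the original source): the
    // argument strings below follow the two formats documented in the
    // constructor Javadoc above; the field names are made up for the example.
    //
    //   MongoStorage storage = new MongoStorage(
    //       "update [time, servername]",
    //       "{time: 1, servername: 1}, {unique: true}");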
    public void checkSchema(final ResourceSchema schema) throws IOException {
        LOG.info("checking schema " + schema.toString());
        this.schema = schema;
        final Properties properties =
            UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[]{udfContextSignature});
        properties.setProperty(PIG_OUTPUT_SCHEMA_UDF_CONTEXT, schema.toString());
    }

    public void storeSchema(final ResourceSchema schema, final String location, final Job job) {
        // not implemented
    }

    public void storeStatistics(final ResourceStatistics stats, final String location, final Job job) {
        // not implemented
    }

    public void putNext(final Tuple tuple) throws IOException {
        if (LOG.isDebugEnabled()) {
            LOG.debug("writing " + tuple.toString());
        }
        final BasicDBObjectBuilder builder = BasicDBObjectBuilder.start();
        ResourceFieldSchema[] fields = this.schema.getFields();
        for (int i = 0; i < fields.length; i++) {
            writeField(builder, fields[i], tuple.get(i));
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("writing out:" + builder.get().toString());
        }
        //noinspection unchecked
        recordWriter.write(null, builder.get());
    }
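    // Worked example (illustrative, not part of the original source): given
    // the Pig schema (time: long, user: chararray) and the tuple
    // (1300000000L, "alice"), putNext() builds and writes the document
    //
    //   { "time" : 1300000000, "user" : "alice" }
    //
    // writeField(), below, handles the per-field conversion: nested tuples
    // become sub-documents and bags of tuples become arrays.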
    protected void writeField(final BasicDBObjectBuilder builder,
                              final ResourceSchema.ResourceFieldSchema field,
                              final Object d) throws IOException {
        // If the field is missing or the value is null, write a null
        if (d == null) {
            builder.add(field.getName(), null);
            return;
        }

        ResourceSchema s = field.getSchema();

        // Based on the field's type, write it out
        byte i = field.getType();
        if (i == DataType.INTEGER) {
            builder.add(field.getName(), d);
        } else if (i == DataType.LONG) {
            builder.add(field.getName(), d);
        } else if (i == DataType.FLOAT) {
            builder.add(field.getName(), d);
        } else if (i == DataType.DOUBLE) {
            builder.add(field.getName(), d);
        } else if (i == DataType.BYTEARRAY) {
            builder.add(field.getName(), d.toString());
        } else if (i == DataType.CHARARRAY) {
            builder.add(field.getName(), d);
        } else if (i == DataType.TUPLE) {
            // Given a TUPLE, create a Map so BSONEncoder will eat it
            if (s == null) {
                throw new IOException("Schemas must be fully specified to use this storage function. "
                                      + "No schema found for field " + field.getName());
            }
            ResourceFieldSchema[] fs = s.getFields();
            Map<String, Object> m = new LinkedHashMap<String, Object>();
            for (int j = 0; j < fs.length; j++) {
                m.put(fs[j].getName(), ((Tuple) d).get(j));
            }
            builder.add(field.getName(), (Map) m);
        } else if (i == DataType.BAG) {
            // Given a BAG, create an Array so BSONEncoder will eat it.
            ResourceFieldSchema[] fs;
            if (s == null) {
                throw new IOException("Schemas must be fully specified to use this storage function. "
                                      + "No schema found for field " + field.getName());
            }
            fs = s.getFields();
            if (fs.length != 1 || fs[0].getType() != DataType.TUPLE) {
                throw new IOException("Found a bag without a tuple inside!");
            }
            // Drill down the next level to the tuple's schema.
            s = fs[0].getSchema();
            if (s == null) {
                throw new IOException("Schemas must be fully specified to use this storage function. "
                                      + "No schema found for field " + field.getName());
            }
            fs = s.getFields();
            List<Map<String, Object>> a = new ArrayList<Map<String, Object>>();
            for (Tuple t : (DataBag) d) {
                Map<String, Object> ma = new LinkedHashMap<String, Object>();
                for (int j = 0; j < fs.length; j++) {
                    ma.put(fs[j].getName(), t.get(j));
                }
                a.add(ma);
            }
            builder.add(field.getName(), a);
        } else if (i == DataType.MAP) {
            // Note: map entries are added at the top level of the output
            // document, not nested under the field's name.
            Map map = (Map) d;
            for (Object key : map.keySet()) {
                builder.add(key.toString(), map.get(key));
            }
        }
    }

    public void prepareToWrite(final RecordWriter writer) throws IOException {
        recordWriter = (MongoRecordWriter) writer;
        LOG.info("Preparing to write to " + recordWriter);
        if (recordWriter == null) {
            throw new IOException("Invalid Record Writer");
        }
        // Parse the schema from the string stored in the properties object.
        UDFContext udfc = UDFContext.getUDFContext();
        Properties p = udfc.getUDFProperties(this.getClass(), new String[]{udfContextSignature});
        String strSchema = p.getProperty(PIG_OUTPUT_SCHEMA_UDF_CONTEXT);
        if (strSchema == null) {
            throw new IOException("Could not find schema in UDF context");
        }

        try {
            this.schema = new ResourceSchema(Utils.getSchemaFromString(strSchema));
        } catch (Exception e) {
            LOG.error(e.getMessage(), e);
        }

        if (options != null) {
            // If we are ensuring any indexes, do so now:
            for (MongoStorageOptions.Index in : options.getIndexes()) {
                recordWriter.ensureIndex(in.index, in.options);
            }
        }
    }

    public OutputFormat getOutputFormat() throws IOException {
        return new MongoOutputFormat();
    }

    public String relToAbsPathForStoreLocation(final String location, final Path curDir) throws IOException {
        // Don't convert anything - override to keep base from messing with URI
        return location;
    }

    public void setStoreLocation(final String location, final Job job) throws IOException {
        final Configuration config = job.getConfiguration();
        if (!location.startsWith("mongodb://")) {
            throw new IllegalArgumentException("Invalid URI Format. URIs must begin with a mongodb:// protocol string.");
        }
        MongoClientURI locURI = new MongoClientURI(location);
        LOG.info(String.format(
            "Store location config: %s; for namespace: %s.%s; hosts: %s",
            config, locURI.getDatabase(), locURI.getCollection(), locURI.getHosts()));
        MongoConfigUtil.setOutputURI(config, locURI);
        final Properties properties =
            UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[]{udfContextSignature});
        config.set(PIG_OUTPUT_SCHEMA, properties.getProperty(PIG_OUTPUT_SCHEMA_UDF_CONTEXT));
    }

    public void setStoreFuncUDFContextSignature(final String signature) {
        udfContextSignature = signature;
    }
}
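/*
 * End-to-end usage sketch (illustrative; the jar names and input path are
 * assumptions, not part of this source). From a Pig script:
 *
 *   REGISTER mongo-java-driver.jar;
 *   REGISTER mongo-hadoop-core.jar;
 *   REGISTER mongo-hadoop-pig.jar;
 *
 *   logs = LOAD 'input' AS (time: long, servername: chararray, hostname: chararray);
 *   STORE logs INTO 'mongodb://localhost:27017/demo.logs'
 *       USING com.mongodb.hadoop.pig.MongoStorage();
 *
 * setStoreLocation() requires the store location to be a mongodb:// URI, as
 * enforced in the class above.
 */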