/** * Copyright 2010 TransPac Software, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * Based on cascading.jdbc code released into the public domain by * Concurrent, Inc. */ package com.bixolabs.simpledb; import java.io.IOException; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.OutputCollector; import org.apache.log4j.Logger; import cascading.scheme.Scheme; import cascading.tap.Tap; import cascading.tap.TapException; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; /** * The SimpleDBScheme class is a {@link Scheme} subclass. It is used in conjunction with the {@SimpleDBTap} to * allow for the reading and writing of data to and from Amazon's SimpleDB service. * * @see SimpleDBTap */ @SuppressWarnings("serial") public class SimpleDBScheme extends Scheme { private static final Logger LOGGER = Logger.getLogger(SimpleDBScheme.class); private Fields _schemeFields; private String _itemFieldName; private String _query; private int _selectLimit; public SimpleDBScheme(Fields schemeFields, Fields itemField) { this(schemeFields, itemField, ""); } public SimpleDBScheme(Fields schemeFields, Fields itemField, String query) { this(schemeFields, itemField, query, SimpleDBUtils.NO_SELECT_LIMIT); } public SimpleDBScheme(Fields schemeFields, Fields itemField, String query, int selectLimit) { super(schemeFields, schemeFields); if (schemeFields.size() == 0) { throw new IllegalArgumentException("There must be at least one field"); } if (itemField.size() != 1) { throw new IllegalArgumentException("There can only be one item field, found: " + itemField.print()); } if (!schemeFields.contains(itemField)) { throw new IllegalArgumentException("Scheme fields must include the item field"); } _schemeFields = schemeFields; // TODO KKr - is this OK to assume that I'll always be able to use it as a String? // TODO KKr - should I get the position of this in the scheme field, and save/use that? _itemFieldName = itemField.get(0).toString(); setQuery(query); _selectLimit = selectLimit; } public void setQuery(String query) { query = query.trim(); if (query.startsWith("where ")) { throw new IllegalArgumentException("Query should not contain the `where ` portion of the expression: " + query); } _query = query; } public String getQuery() { return _query; } public void setSelectLimit(int selectLimit) { _selectLimit = selectLimit; } public int getSelectLimit() { return _selectLimit; } @Override public void sinkInit(Tap tap, JobConf conf) throws IOException { conf.setOutputKeyClass(NullWritable.class); conf.setOutputValueClass(Tuple.class); conf.setOutputFormat(SimpleDBOutputFormat.class); SimpleDBConfiguration sdbConf = new SimpleDBConfiguration(conf); sdbConf.setSchemeFields(_schemeFields); sdbConf.setItemFieldName(_itemFieldName); LOGGER.info(String.format("Initializing SimpleDB sink tap - scheme field: %s and item field: %s", _schemeFields, _itemFieldName)); } public void sourceInit(Tap tap, JobConf conf) throws IOException { conf.setInputFormat(SimpleDBInputFormat.class); SimpleDBConfiguration sdbConf = new SimpleDBConfiguration(conf); sdbConf.setSchemeFields(_schemeFields); sdbConf.setItemFieldName(_itemFieldName); sdbConf.setQuery(_query); sdbConf.setSelectLimit(_selectLimit); LOGGER.info(String.format("Initializing SimpleDB source tap - scheme field: %s and item field: %s", _schemeFields, _itemFieldName)); } @SuppressWarnings("unchecked") @Override public void sink(TupleEntry tupleEntry, OutputCollector outputCollector) throws IOException { String itemValue = tupleEntry.getString(_itemFieldName); if ((itemValue == null) || (itemValue.length() == 0)) { throw new TapException("Tuple passed to sink does not have a valid (not null, not empty) value for the item field (" + _itemFieldName + ")"); } Tuple result = getSinkFields() != null ? tupleEntry.selectTuple(getSinkFields()) : tupleEntry.getTuple(); outputCollector.collect(NullWritable.get(), result); } @Override public Tuple source(Object key, Object value) { return (Tuple)value; } @Override public int hashCode() { final int prime = 31; int result = super.hashCode(); result = prime * result + ((_itemFieldName == null) ? 0 : _itemFieldName.hashCode()); result = prime * result + ((_query == null) ? 0 : _query.hashCode()); return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (!super.equals(obj)) return false; if (getClass() != obj.getClass()) return false; SimpleDBScheme other = (SimpleDBScheme) obj; if (_itemFieldName == null) { if (other._itemFieldName != null) return false; } else if (!_itemFieldName.equals(other._itemFieldName)) return false; if (_query == null) { if (other._query != null) return false; } else if (!_query.equals(other._query)) return false; return true; } }