/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.hadoop.cql3;

import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.util.*;

import com.google.common.collect.AbstractIterator;
import com.google.common.collect.Iterables;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.db.marshal.CompositeType;
import org.apache.cassandra.db.marshal.LongType;
import org.apache.cassandra.db.marshal.TypeParser;
import org.apache.cassandra.dht.IPartitioner;
import org.apache.cassandra.exceptions.ConfigurationException;
import org.apache.cassandra.exceptions.SyntaxException;
import org.apache.cassandra.hadoop.ColumnFamilySplit;
import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.thrift.*;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.thrift.TException;
import org.apache.thrift.transport.TTransport;

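/*
 * Illustrative usage sketch: roughly how a Hadoop job might be configured so that
 * CqlPagingInputFormat hands out this record reader.  The setter names below are assumed from
 * the corresponding ConfigHelper/CqlConfigHelper getters used in initialize(); verify them
 * against the API of your Cassandra version before relying on them.
 *
 *     Configuration conf = new Configuration();
 *     Job job = new Job(conf, "cql-paging-example");              // hypothetical job name
 *     job.setInputFormatClass(CqlPagingInputFormat.class);
 *     ConfigHelper.setInputInitialAddress(job.getConfiguration(), "127.0.0.1");
 *     ConfigHelper.setInputRpcPort(job.getConfiguration(), "9160");
 *     ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
 *     ConfigHelper.setInputColumnFamily(job.getConfiguration(), "my_keyspace", "my_table");
 *     CqlConfigHelper.setInputCQLPageRowSize(job.getConfiguration(), "1000"); // page size
 *     CqlConfigHelper.setInputColumns(job.getConfiguration(), "v1,v2");       // optional projection
 *     CqlConfigHelper.setInputWhereClauses(job.getConfiguration(), "v1 > 0"); // optional extra WHERE
 */
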
/**
 * Hadoop RecordReader that reads the values returned by a CQL query.
 * It uses CQL key range queries to page through wide rows.
 * <p/>
 * Returns a Map<String, ByteBuffer> of the partition and cluster key columns as the key,
 * and a Map<String, ByteBuffer> of column name to column value as the value.
 */
public class CqlPagingRecordReader extends RecordReader<Map<String, ByteBuffer>, Map<String, ByteBuffer>>
        implements org.apache.hadoop.mapred.RecordReader<Map<String, ByteBuffer>, Map<String, ByteBuffer>>
{
    private static final Logger logger = LoggerFactory.getLogger(CqlPagingRecordReader.class);

    public static final int DEFAULT_CQL_PAGE_LIMIT = 1000; // TODO: find a number large enough but that does not OOM

    private ColumnFamilySplit split;
    private RowIterator rowIterator;

    private Pair<Map<String, ByteBuffer>, Map<String, ByteBuffer>> currentRow;
    private int totalRowCount; // total number of rows to fetch
    private String keyspace;
    private String cfName;
    private Cassandra.Client client;
    private ConsistencyLevel consistencyLevel;

    // partition keys -- key aliases
    private List<BoundColumn> partitionBoundColumns = new ArrayList<BoundColumn>();

    // cluster keys -- column aliases
    private List<BoundColumn> clusterColumns = new ArrayList<BoundColumn>();

    // map prepared query type to item id
    private Map<Integer, Integer> preparedQueryIds = new HashMap<Integer, Integer>();

    // cql query select columns
    private String columns;

    // the number of cql rows per page
    private int pageRowSize;

    // user defined where clauses
    private String userDefinedWhereClauses;

    private IPartitioner partitioner;

    private AbstractType<?> keyValidator;

    public CqlPagingRecordReader()
    {
        super();
    }

    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException
    {
        this.split = (ColumnFamilySplit) split;
        Configuration conf = context.getConfiguration();
        totalRowCount = (this.split.getLength() < Long.MAX_VALUE)
                      ? (int) this.split.getLength()
                      : ConfigHelper.getInputSplitSize(conf);
        cfName = ConfigHelper.getInputColumnFamily(conf);
        consistencyLevel = ConsistencyLevel.valueOf(ConfigHelper.getReadConsistencyLevel(conf));
        keyspace = ConfigHelper.getInputKeyspace(conf);
        columns = CqlConfigHelper.getInputcolumns(conf);
        userDefinedWhereClauses = CqlConfigHelper.getInputWhereClauses(conf);

        try
        {
            pageRowSize = Integer.parseInt(CqlConfigHelper.getInputPageRowSize(conf));
        }
        catch (NumberFormatException e)
        {
            pageRowSize = DEFAULT_CQL_PAGE_LIMIT;
        }

        partitioner = ConfigHelper.getInputPartitioner(context.getConfiguration());

        try
        {
            if (client != null)
                return;

            // create connection using thrift
            String location = getLocation();
            int port = ConfigHelper.getInputRpcPort(conf);
            client = CqlPagingInputFormat.createAuthenticatedClient(location, port, conf);

            // retrieve partition keys and cluster keys from system.schema_columnfamilies table
            retrieveKeys();

            client.set_keyspace(keyspace);
        }
        catch (Exception e)
        {
            throw new RuntimeException(e);
        }

        rowIterator = new RowIterator();
        logger.debug("created {}", rowIterator);
    }

    public void close()
    {
        if (client != null)
        {
            TTransport transport = client.getOutputProtocol().getTransport();
            if (transport.isOpen())
                transport.close();
            client = null;
        }
    }

    public Map<String, ByteBuffer> getCurrentKey()
    {
        return currentRow.left;
    }

    public Map<String, ByteBuffer> getCurrentValue()
    {
        return currentRow.right;
    }

    public float getProgress()
    {
        if (!rowIterator.hasNext())
            return 1.0F;

        // the progress is likely to be reported slightly off the actual, but close enough
        float progress = ((float) rowIterator.totalRead / totalRowCount);
        return progress > 1.0F ? 1.0F : progress;
    }

    public boolean nextKeyValue() throws IOException
    {
        if (!rowIterator.hasNext())
        {
            logger.debug("Finished scanning {} rows (estimate was: {})", rowIterator.totalRead, totalRowCount);
            return false;
        }

        try
        {
            currentRow = rowIterator.next();
        }
        catch (Exception e)
        {
            // rethrow as IOException, so the caller can catch it and handle it on its side
            IOException ioe = new IOException(e.getMessage());
            ioe.initCause(e);
            throw ioe;
        }
        return true;
    }

    // we don't use endpointsnitch since we are trying to support hadoop nodes that are
    // not necessarily on Cassandra machines, too.  This should be adequate for single-DC clusters, at least.
    private String getLocation()
    {
        Collection<InetAddress> localAddresses = FBUtilities.getAllLocalAddresses();

        for (InetAddress address : localAddresses)
        {
            for (String location : split.getLocations())
            {
                InetAddress locationAddress;
                try
                {
                    locationAddress = InetAddress.getByName(location);
                }
                catch (UnknownHostException e)
                {
                    throw new AssertionError(e);
                }
                if (address.equals(locationAddress))
                {
                    return location;
                }
            }
        }
        return split.getLocations()[0];
    }

    // Because the old Hadoop API wants us to write to the key and value
    // and the new one asks for them, we need to copy the output of the new API
    // to the old.  Thus, expect a small performance hit.
    // And obviously this wouldn't work for wide rows.  But since ColumnFamilyInputFormat
    // and ColumnFamilyRecordReader don't support them, it should be fine for now.
    public boolean next(Map<String, ByteBuffer> keys, Map<String, ByteBuffer> value) throws IOException
    {
        if (nextKeyValue())
        {
            value.clear();
            value.putAll(getCurrentValue());
            keys.clear();
            keys.putAll(getCurrentKey());
            return true;
        }
        return false;
    }

    public long getPos() throws IOException
    {
        return (long) rowIterator.totalRead;
    }

    public Map<String, ByteBuffer> createKey()
    {
        return new LinkedHashMap<String, ByteBuffer>();
    }

    public Map<String, ByteBuffer> createValue()
    {
        return new LinkedHashMap<String, ByteBuffer>();
    }

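    /*
     * Paging overview (illustrative note; assumes a table with partition key "k" and clustering
     * key "c"): each page is a prepared SELECT limited to pageRowSize rows.  When a page is
     * exhausted, the key values of the last row read (say k = 5, c = 17) are kept in
     * partitionBoundColumns/clusterColumns, and the next query resumes after that position
     * (token(k) = token(5) AND c > 17).  When such a resumed query returns nothing, setTailNull()
     * clears the trailing clustering value and the iterator falls back to the wider range query
     * (token(k) > token(5) AND token(k) <= end_token) until the split's token range is exhausted.
     */
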
    /** CQL row iterator */
    private class RowIterator extends AbstractIterator<Pair<Map<String, ByteBuffer>, Map<String, ByteBuffer>>>
    {
        protected int totalRead = 0;          // total number of cf rows read
        protected Iterator<CqlRow> rows;
        private int pageRows = 0;             // the number of cql rows read of this page
        private String previousRowKey = null; // previous CF row key
        private String partitionKeyString;    // keys in <key1>, <key2>, <key3> string format
        private String partitionKeyMarkers;   // question marks in ?, ?, ? format which matches the number of keys

        public RowIterator()
        {
            // initial page
            executeQuery();
        }

        protected Pair<Map<String, ByteBuffer>, Map<String, ByteBuffer>> computeNext()
        {
            if (rows == null)
                return endOfData();

            int index = -2;
            // check whether there are more pages to read
            while (!rows.hasNext())
            {
                // no more data
                if (index == -1 || emptyPartitionKeyValues())
                {
                    logger.debug("no more data");
                    return endOfData();
                }

                index = setTailNull(clusterColumns);
                logger.debug("set tail to null, index: {}", index);
                executeQuery();
                pageRows = 0;

                if (rows == null || !rows.hasNext() && index < 0)
                {
                    logger.debug("no more data");
                    return endOfData();
                }
            }

            Map<String, ByteBuffer> valueColumns = createValue();
            Map<String, ByteBuffer> keyColumns = createKey();
            int i = 0;
            CqlRow row = rows.next();
            for (Column column : row.columns)
            {
                String columnName = stringValue(ByteBuffer.wrap(column.getName()));
                logger.debug("column: {}", columnName);

                if (i < partitionBoundColumns.size() + clusterColumns.size())
                    keyColumns.put(stringValue(column.name), column.value);
                else
                    valueColumns.put(stringValue(column.name), column.value);

                i++;
            }

            // increase the number of CQL rows read for this page
            pageRows++;

            // increase the number of CF rows read
            if (newRow(keyColumns, previousRowKey))
                totalRead++;

            // a full page has been read; remember the last position and fetch the next page
            if (pageRows >= pageRowSize || !rows.hasNext())
            {
                Iterator<String> newKeys = keyColumns.keySet().iterator();
                for (BoundColumn column : partitionBoundColumns)
                    column.value = keyColumns.get(newKeys.next());

                for (BoundColumn column : clusterColumns)
                    column.value = keyColumns.get(newKeys.next());

                executeQuery();
                pageRows = 0;
            }

            return Pair.create(keyColumns, valueColumns);
        }

        /** check whether we start to read a new CF row, by comparing the partition keys */
        private boolean newRow(Map<String, ByteBuffer> keyColumns, String previousRowKey)
        {
            if (keyColumns.isEmpty())
                return false;

            String rowKey = "";
            if (keyColumns.size() == 1)
            {
                rowKey = partitionBoundColumns.get(0).validator.getString(keyColumns.get(partitionBoundColumns.get(0).name));
            }
            else
            {
                Iterator<ByteBuffer> iter = keyColumns.values().iterator();
                for (BoundColumn column : partitionBoundColumns)
                    rowKey = rowKey + column.validator.getString(ByteBufferUtil.clone(iter.next())) + ":";
            }

            logger.debug("previous RowKey: {}, new row key: {}", previousRowKey, rowKey);
            if (previousRowKey == null)
            {
                this.previousRowKey = rowKey;
                return true;
            }

            if (rowKey.equals(previousRowKey))
                return false;

            this.previousRowKey = rowKey;
            return true;
        }

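        /*
         * Illustrative note on setTailNull() below: with clusterColumns holding (c1 = 5, c2 = 17),
         * the call nulls c2 and returns 0; called again with (c1 = 5, c2 = null), it nulls c1 and
         * returns -1, which computeNext() then treats as "no more data" for the current position.
         */
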
        /** set the last non-null key value to null, and return the previous index */
        private int setTailNull(List<BoundColumn> values)
        {
            if (values.isEmpty())
                return -1;

            Iterator<BoundColumn> iterator = values.iterator();
            int previousIndex = -1;
            BoundColumn current;
            while (iterator.hasNext())
            {
                current = iterator.next();
                if (current.value == null)
                {
                    int index = previousIndex > 0 ? previousIndex : 0;
                    BoundColumn column = values.get(index);
                    logger.debug("set key {} value to null", column.name);
                    column.value = null;
                    return previousIndex - 1;
                }

                previousIndex++;
            }

            BoundColumn column = values.get(previousIndex);
            logger.debug("set key {} value to null", column.name);
            column.value = null;
            return previousIndex - 1;
        }

        /** serialize the prepared query; pair.left is the query id, pair.right is the query string */
        private Pair<Integer, String> composeQuery(String columns)
        {
            Pair<Integer, String> clause = whereClause();
            if (columns == null)
            {
                columns = "*";
            }
            else
            {
                // add keys in the front in order
                String partitionKey = keyString(partitionBoundColumns);
                String clusterKey = keyString(clusterColumns);

                columns = withoutKeyColumns(columns);
                columns = (clusterKey == null || "".equals(clusterKey))
                        ? partitionKey + "," + columns
                        : partitionKey + "," + clusterKey + "," + columns;
            }

            String whereStr = userDefinedWhereClauses == null ? "" : " AND " + userDefinedWhereClauses;
            return Pair.create(clause.left,
                               String.format("SELECT %s FROM %s%s%s LIMIT %d ALLOW FILTERING",
                                             columns, quote(cfName), clause.right, whereStr, pageRowSize));
        }

        /** remove key columns from the column string */
        private String withoutKeyColumns(String columnString)
        {
            Set<String> keyNames = new HashSet<String>();
            for (BoundColumn column : Iterables.concat(partitionBoundColumns, clusterColumns))
                keyNames.add(column.name);

            String[] columns = columnString.split(",");
            String result = null;
            for (String column : columns)
            {
                String trimmed = column.trim();
                if (keyNames.contains(trimmed))
                    continue;

                String quoted = quote(trimmed);
                result = result == null ? quoted : result + "," + quoted;
            }
            return result;
        }

        /** serialize the where clause */
        private Pair<Integer, String> whereClause()
        {
            if (partitionKeyString == null)
                partitionKeyString = keyString(partitionBoundColumns);

            if (partitionKeyMarkers == null)
                partitionKeyMarkers = partitionKeyMarkers();

            // initial query token(k) >= start_token and token(k) <= end_token
            if (emptyPartitionKeyValues())
                return Pair.create(0, String.format(" WHERE token(%s) > ? AND token(%s) <= ?", partitionKeyString, partitionKeyString));

            // query token(k) > token(pre_partition_key) and token(k) <= end_token
            if (clusterColumns.size() == 0 || clusterColumns.get(0).value == null)
                return Pair.create(1,
                                   String.format(" WHERE token(%s) > token(%s) AND token(%s) <= ?",
                                                 partitionKeyString, partitionKeyMarkers, partitionKeyString));

            // query token(k) = token(pre_partition_key) and m = pre_cluster_key_m and n > pre_cluster_key_n
            Pair<Integer, String> clause = whereClause(clusterColumns, 0);
            return Pair.create(clause.left,
                               String.format(" WHERE token(%s) = token(%s) %s", partitionKeyString, partitionKeyMarkers, clause.right));
        }

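        /*
         * Examples of the generated clauses (illustrative; assumes partition key "k" and
         * clustering keys "c1", "c2", with identifier quoting omitted for readability):
         *   type 0 (start of the split):     WHERE token(k) > ? AND token(k) <= ?
         *   type 1 (resume at a partition):  WHERE token(k) > token(?) AND token(k) <= ?
         *   type 2+ (resume within a row):   WHERE token(k) = token(?) AND c1 > ?
         *                                    WHERE token(k) = token(?) AND c1 = ? AND c2 > ?
         * composeQuery() then wraps the clause into something like
         *   SELECT "k","c1","c2","v" FROM "cf" WHERE token("k") > ? AND token("k") <= ? LIMIT 1000 ALLOW FILTERING
         */
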
        /** recursively serialize the where clause */
        private Pair<Integer, String> whereClause(List<BoundColumn> column, int position)
        {
            if (position == column.size() - 1 || column.get(position + 1).value == null)
                return Pair.create(position + 2, String.format(" AND %s > ? ", quote(column.get(position).name)));

            Pair<Integer, String> clause = whereClause(column, position + 1);
            return Pair.create(clause.left, String.format(" AND %s = ? %s", quote(column.get(position).name), clause.right));
        }

        /** check whether all key values are null */
        private boolean emptyPartitionKeyValues()
        {
            for (BoundColumn column : partitionBoundColumns)
            {
                if (column.value != null)
                    return false;
            }
            return true;
        }

        /** serialize the partition key string in the format of <key1>, <key2>, <key3> */
        private String keyString(List<BoundColumn> columns)
        {
            String result = null;
            for (BoundColumn column : columns)
                result = result == null ? quote(column.name) : result + "," + quote(column.name);

            return result == null ? "" : result;
        }

        /** serialize the question marks for the partition key string in the format of ?, ?, ? */
        private String partitionKeyMarkers()
        {
            String result = null;
            for (BoundColumn column : partitionBoundColumns)
                result = result == null ? "?" : result + ",?";

            return result;
        }

        /** serialize the query binding variables; pair.left is the query id, pair.right is the binding variables */
        private Pair<Integer, List<ByteBuffer>> preparedQueryBindValues()
        {
            List<ByteBuffer> values = new LinkedList<ByteBuffer>();

            // initial query token(k) >= start_token and token(k) <= end_token
            if (emptyPartitionKeyValues())
            {
                values.add(partitioner.getTokenValidator().fromString(split.getStartToken()));
                values.add(partitioner.getTokenValidator().fromString(split.getEndToken()));
                return Pair.create(0, values);
            }
            else
            {
                for (BoundColumn partitionBoundColumn : partitionBoundColumns)
                    values.add(partitionBoundColumn.value);

                if (clusterColumns.size() == 0 || clusterColumns.get(0).value == null)
                {
                    // query token(k) > token(pre_partition_key) and token(k) <= end_token
                    values.add(partitioner.getTokenValidator().fromString(split.getEndToken()));
                    return Pair.create(1, values);
                }
                else
                {
                    // query token(k) = token(pre_partition_key) and m = pre_cluster_key_m and n > pre_cluster_key_n
                    int type = preparedQueryBindValues(clusterColumns, 0, values);
                    return Pair.create(type, values);
                }
            }
        }

        /** recursively serialize the query binding variables */
        private int preparedQueryBindValues(List<BoundColumn> column, int position, List<ByteBuffer> bindValues)
        {
            if (position == column.size() - 1 || column.get(position + 1).value == null)
            {
                bindValues.add(column.get(position).value);
                return position + 2;
            }
            else
            {
                bindValues.add(column.get(position).value);
                return preparedQueryBindValues(column, position + 1, bindValues);
            }
        }

        /** get the prepared query item id */
        private int prepareQuery(int type) throws InvalidRequestException, TException
        {
            Integer itemId = preparedQueryIds.get(type);
            if (itemId != null)
                return itemId;

            Pair<Integer, String> query = composeQuery(columns);
            logger.debug("type: {}, query: {}", query.left, query.right);
            CqlPreparedResult cqlPreparedResult = client.prepare_cql3_query(ByteBufferUtil.bytes(query.right), Compression.NONE);
            preparedQueryIds.put(query.left, cqlPreparedResult.itemId);
            return cqlPreparedResult.itemId;
        }

        /** quoting for working with uppercase identifiers */
        private String quote(String identifier)
        {
            return "\"" + identifier.replaceAll("\"", "\"\"") + "\"";
        }

        /** execute the prepared query */
        private void executeQuery()
        {
            Pair<Integer, List<ByteBuffer>> bindValues = preparedQueryBindValues();
            logger.debug("query type: {}", bindValues.left);

            // check whether we have reached the end of the range for a type 1 query; see CASSANDRA-5573
            if (bindValues.left == 1 && reachEndRange())
            {
                rows = null;
                return;
            }

            int retries = 0;
            // only try three times for TimedOutException and UnavailableException
            while (retries < 3)
            {
                try
                {
                    CqlResult cqlResult = client.execute_prepared_cql3_query(prepareQuery(bindValues.left),
                                                                             bindValues.right,
                                                                             consistencyLevel);
                    if (cqlResult != null && cqlResult.rows != null)
                        rows = cqlResult.rows.iterator();
                    return;
                }
                catch (TimedOutException e)
                {
                    retries++;
                    if (retries >= 3)
                    {
                        rows = null;
                        RuntimeException rte = new RuntimeException(e.getMessage());
                        rte.initCause(e);
                        throw rte;
                    }
                }
                catch (UnavailableException e)
                {
                    retries++;
                    if (retries >= 3)
                    {
                        rows = null;
                        RuntimeException rte = new RuntimeException(e.getMessage());
                        rte.initCause(e);
                        throw rte;
                    }
                }
                catch (Exception e)
                {
                    rows = null;
                    RuntimeException rte = new RuntimeException(e.getMessage());
                    rte.initCause(e);
                    throw rte;
                }
            }
        }
    }

    /** retrieve the partition keys and cluster keys from the system.schema_columnfamilies table */
    private void retrieveKeys() throws Exception
    {
        String query = "select key_aliases," +
                       "column_aliases, " +
                       "key_validator, " +
                       "comparator " +
                       "from system.schema_columnfamilies " +
                       "where keyspace_name='%s' and columnfamily_name='%s'";
        String formatted = String.format(query, keyspace, cfName);
        CqlResult result = client.execute_cql3_query(ByteBufferUtil.bytes(formatted), Compression.NONE, ConsistencyLevel.ONE);

        CqlRow cqlRow = result.rows.get(0);
        String keyString = ByteBufferUtil.string(ByteBuffer.wrap(cqlRow.columns.get(0).getValue()));
        logger.debug("partition keys: {}", keyString);
        List<String> keys = FBUtilities.fromJsonList(keyString);
        for (String key : keys)
            partitionBoundColumns.add(new BoundColumn(key));

        keyString = ByteBufferUtil.string(ByteBuffer.wrap(cqlRow.columns.get(1).getValue()));
        logger.debug("cluster columns: {}", keyString);
        keys = FBUtilities.fromJsonList(keyString);
        for (String key : keys)
            clusterColumns.add(new BoundColumn(key));

        Column rawKeyValidator = cqlRow.columns.get(2);
        String validator = ByteBufferUtil.string(ByteBuffer.wrap(rawKeyValidator.getValue()));
        logger.debug("row key validator: {}", validator);
        keyValidator = parseType(validator);

        if (keyValidator instanceof CompositeType)
        {
            List<AbstractType<?>> types = ((CompositeType) keyValidator).types;
            for (int i = 0; i < partitionBoundColumns.size(); i++)
                partitionBoundColumns.get(i).validator = types.get(i);
        }
        else
        {
            partitionBoundColumns.get(0).validator = keyValidator;
        }
    }

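    /*
     * Illustrative example of the schema metadata consumed above: for a table such as
     *   CREATE TABLE ks.cf (k1 text, k2 int, c1 int, v text, PRIMARY KEY ((k1, k2), c1))
     * the schema row is expected to carry key_aliases = ["k1","k2"], column_aliases = ["c1"]
     * and a key_validator naming a CompositeType of the two partition key types, so
     * partitionBoundColumns becomes [k1, k2] (each assigned its validator) and
     * clusterColumns becomes [c1].
     */
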
    /** check whether the current row is at the end of the range */
    private boolean reachEndRange()
    {
        // current row key
        ByteBuffer rowKey;
        if (keyValidator instanceof CompositeType)
        {
            ByteBuffer[] keys = new ByteBuffer[partitionBoundColumns.size()];
            for (int i = 0; i < partitionBoundColumns.size(); i++)
                keys[i] = partitionBoundColumns.get(i).value.duplicate();

            rowKey = CompositeType.build(keys);
        }
        else
        {
            rowKey = partitionBoundColumns.get(0).value;
        }

        String endToken = split.getEndToken();
        String currentToken = partitioner.getToken(rowKey).toString();
        logger.debug("End token: {}, current token: {}", endToken, currentToken);

        return endToken.equals(currentToken);
    }

    private static AbstractType<?> parseType(String type) throws IOException
    {
        try
        {
            // always treat counters like longs, specifically CCT.serialize is not what we need
            if (type != null && type.equals("org.apache.cassandra.db.marshal.CounterColumnType"))
                return LongType.instance;
            return TypeParser.parse(type);
        }
        catch (ConfigurationException e)
        {
            throw new IOException(e);
        }
        catch (SyntaxException e)
        {
            throw new IOException(e);
        }
    }

    private static class BoundColumn
    {
        final String name;
        ByteBuffer value;
        AbstractType<?> validator;

        public BoundColumn(String name)
        {
            this.name = name;
        }
    }

    /** get a string from a ByteBuffer; catch the checked exception and rethrow it as a runtime exception */
    private static String stringValue(ByteBuffer value)
    {
        try
        {
            return ByteBufferUtil.string(value);
        }
        catch (CharacterCodingException e)
        {
            throw new RuntimeException(e);
        }
    }
}