/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.ExecutionException;
import org.apache.cassandra.config.ConfigurationException;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.filter.IdentityQueryFilter;
import org.apache.cassandra.db.filter.NamesQueryFilter;
import org.apache.cassandra.db.filter.QueryFilter;
import org.apache.cassandra.db.filter.QueryPath;
import org.apache.cassandra.db.marshal.BytesType;
import org.apache.cassandra.dht.IPartitioner;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.locator.IEndPointSnitch;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.log4j.Logger;
public class SystemTable
{
// Logger used for the informational and warning messages emitted by this class.
private static Logger logger = Logger.getLogger(SystemTable.class);
// Name of the column family that every row read or written by this class lives in.
public static final String STATUS_CF = "LocationInfo"; // keep the old CF string for backwards-compatibility
// Row key holding this node's token/generation/cluster name/partitioner columns,
// plus one column per remote endpoint address mapping to that endpoint's token.
private static final String LOCATION_KEY = "L";
// Row key holding one column per endpoint address with its serialized gossip state.
private static final String GOSSIP_KEY = "G"; // persistent gossiper
// Row key holding one column per keyspace with its comma-separated replication settings.
private static final String REPLICATION_KEY = "R"; // persistent replication configuration
// Row key holding the single bootstrap flag column.
private static final String BOOTSTRAP_KEY = "Bootstrap";
// Column names (UTF-8 encoded) used under the row keys above.
private static final byte[] BOOTSTRAP = utf8("B");
private static final byte[] TOKEN = utf8("Token");
private static final byte[] GENERATION = utf8("Generation");
private static final byte[] CLUSTERNAME = utf8("ClusterName");
// NOTE(review): "Partioner" is misspelled, but this literal is the column name that gets
// written to and read from storage; presumably it must stay as-is so existing data stays readable.
private static final byte[] PARTITIONER = utf8("Partioner");
// Cached result of initMetadata(); null until initMetadata() has completed once.
private static StorageMetadata metadata;
/**
 * Encodes the given string as UTF-8 bytes.
 * A failure to resolve the "UTF-8" charset is rethrown as an unchecked exception.
 */
private static byte[] utf8(String str)
{
    byte[] encoded;
    try
    {
        encoded = str.getBytes("UTF-8");
    }
    catch (UnsupportedEncodingException unsupported)
    {
        throw new RuntimeException(unsupported);
    }
    return encoded;
}
/**
 * Persists the token used by another node.
 * The token is stored under the location row, in a column named by the
 * endpoint's raw address and timestamped with the current wall-clock time.
 *
 * @param ep    address of the node owning the token
 * @param token token to record for that node
 */
public static synchronized void updateToken(InetAddress ep, Token token)
{
    IPartitioner partitioner = StorageService.getPartitioner();
    ColumnFamily locationInfo = ColumnFamily.create(Table.SYSTEM_TABLE, STATUS_CF);
    byte[] columnName = ep.getAddress();
    byte[] tokenBytes = partitioner.getTokenFactory().toByteArray(token);
    locationInfo.addColumn(new Column(columnName, tokenBytes, System.currentTimeMillis()));

    RowMutation mutation = new RowMutation(Table.SYSTEM_TABLE, LOCATION_KEY);
    mutation.add(locationInfo);
    try
    {
        mutation.apply();
    }
    catch (IOException ioe)
    {
        // a failed write to the system table is surfaced as an unchecked write error
        throw new FSWriteError(ioe);
    }
}
/**
 * Record last seen gossip information about node, to preserve this information across restarts.
 * The state is written under the gossip row, in a column named by the endpoint's raw address.
 * The column timestamp is derived from generation and version.
 *
 * @param ep            endpoint whose state is being recorded
 * @param endpointState serialized endpoint state, stored verbatim as the column value
 * @param generation    gossip generation of the endpoint; forms the high bits of the column timestamp
 * @param version       gossip version of the state; must be non-negative, forms the low bits of the timestamp
 */
public static synchronized void updateEndpointState(InetAddress ep, byte[] endpointState, int generation, int version)
{
    assert version >= 0;
    // generation is shifted left by 30 bits and version is OR-ed into the low bits.
    // yes, we steal 1 bit from version (this is sign bit and version is >0)
    long timestamp = (((long) generation) << 30) | version;
    assert timestamp > 0 : "Generation: "+generation+", version: "+version+" merge = "+timestamp;

    ColumnFamily cf = ColumnFamily.create(Table.SYSTEM_TABLE, STATUS_CF);
    cf.addColumn(new Column(ep.getAddress(), endpointState, timestamp));
    RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, GOSSIP_KEY);
    rm.add(cf);
    try
    {
        rm.apply();
    }
    catch (IOException e)
    {
        throw new FSWriteError(e);
    }
}
/**
 * Removes the persisted gossip state of a single endpoint.
 * Does nothing when no live state column exists for the endpoint. Otherwise the column
 * is deleted with a timestamp one greater than the stored column's timestamp.
 *
 * @param ep endpoint whose persisted state should be removed
 */
public static synchronized void removeEndpointState(InetAddress ep)
{
    try
    {
        IColumn col = loadEndpointState(ep);
        if (col == null)
            return; // already removed

        // the delete must be newer than the stored column to take effect
        long timestamp = col.timestamp() + 1;
        assert timestamp > 0;

        RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, GOSSIP_KEY);
        rm.delete(new QueryPath(STATUS_CF, null, ep.getAddress()), timestamp);
        rm.apply();
    }
    catch (IOException e)
    {
        throw new FSWriteError(e);
    }
}
/**
 * Removes all persisted endpoint states.
 * The whole gossip row is deleted with the maximum possible timestamp, then the
 * status column family is flushed and major-compacted.
 * If the calling thread is interrupted while waiting, its interrupt flag is restored
 * before the failure is rethrown as a RuntimeException.
 */
public static synchronized void removeEndpointStates()
{
    try
    {
        RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, GOSSIP_KEY);
        // Long.MAX_VALUE so the delete is newer than any stored endpoint state column
        rm.delete(new QueryPath(STATUS_CF, null, null), Long.MAX_VALUE);
        rm.apply();

        ColumnFamilyStore statusCF = Table.open(Table.SYSTEM_TABLE).getColumnFamilyStore(SystemTable.STATUS_CF);
        statusCF.forceBlockingFlush();
        statusCF.forceMajorCompaction();
    }
    catch (ExecutionException e)
    {
        throw new RuntimeException(e);
    }
    catch (InterruptedException e)
    {
        // keep the interruption visible to callers further up the stack
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
    }
    catch (IOException e)
    {
        throw new FSWriteError(e);
    }
}
/**
 * Reads the persisted gossip state column of one endpoint.
 *
 * @param ep endpoint to look up
 * @return the live state column, or null when the row or column is absent or the column is not live
 * @throws IOException if the system table cannot be opened or read
 */
private static IColumn loadEndpointState(InetAddress ep) throws IOException
{
    Table systemTable;
    try
    {
        systemTable = Table.open(Table.SYSTEM_TABLE);
    }
    catch (AssertionError failure)
    {
        // this happens when a user switches from OPP to RP.
        IOException wrapped = new IOException("Could not read system table. Did you change partitioners?");
        wrapped.initCause(failure);
        throw wrapped;
    }

    // ask only for the single column named by the endpoint's raw address
    SortedSet<byte[]> wanted = new TreeSet<byte[]>(new BytesType());
    wanted.add(ep.getAddress());
    QueryFilter byName = new NamesQueryFilter(GOSSIP_KEY, new QueryPath(STATUS_CF), wanted);
    ColumnFamily gossipRow = systemTable.getColumnFamilyStore(STATUS_CF).getColumnFamily(byName);
    if (gossipRow == null)
        return null;

    IColumn state = gossipRow.getColumn(ep.getAddress());
    return (state != null && state.isLive()) ? state : null;
}
/**
 * Loads every gossip endpoint state that was persisted earlier.
 * Columns that are not live are skipped.
 *
 * @return map from endpoint address to its serialized state; empty when the gossip row does not exist
 * @throws IOException if the system table cannot be opened or read
 */
public static Map<InetAddress,byte[]> loadEndpointStates() throws IOException
{
    Table systemTable;
    try
    {
        systemTable = Table.open(Table.SYSTEM_TABLE);
    }
    catch (AssertionError failure)
    {
        // this happens when a user switches from OPP to RP.
        IOException wrapped = new IOException("Could not read system table. Did you change partitioners?");
        wrapped.initCause(failure);
        throw wrapped;
    }

    QueryFilter wholeRow = new IdentityQueryFilter(GOSSIP_KEY, new QueryPath(STATUS_CF));
    ColumnFamily gossipRow = systemTable.getColumnFamilyStore(STATUS_CF).getColumnFamily(wholeRow);
    if (gossipRow == null)
        return Collections.emptyMap();

    Map<InetAddress, byte[]> states = new HashMap<InetAddress, byte[]>();
    for (IColumn stateColumn : gossipRow.getSortedColumns())
    {
        if (stateColumn == null || !stateColumn.isLive())
            continue;
        // the column name is the endpoint's raw address, the value its serialized state
        states.put(InetAddress.getByAddress(stateColumn.name()), stateColumn.value());
    }
    return states;
}
/**
 * Stores a new token for this node in the system table and, once the write
 * succeeded, mirrors it into the cached metadata.
 * Requires that the metadata has already been initialised.
 *
 * @param token the new token of the local node
 */
public static synchronized void updateToken(Token token)
{
    assert metadata != null;

    IPartitioner partitioner = StorageService.getPartitioner();
    ColumnFamily locationInfo = ColumnFamily.create(Table.SYSTEM_TABLE, STATUS_CF);
    byte[] tokenBytes = partitioner.getTokenFactory().toByteArray(token);
    locationInfo.addColumn(new Column(SystemTable.TOKEN, tokenBytes, System.currentTimeMillis()));

    RowMutation mutation = new RowMutation(Table.SYSTEM_TABLE, LOCATION_KEY);
    mutation.add(locationInfo);
    try
    {
        mutation.apply();
    }
    catch (IOException ioe)
    {
        throw new FSWriteError(ioe);
    }

    // only reached when the write went through
    metadata.setToken(token);
}
/**
 * One of three things will happen if you try to read the system table:
 * 1. files are present and you can read them: great
 * 2. no files are there: great (new node is assumed)
 * 3. files are present but you can't read them: bad (suspect that the partitioner was changed).
 *
 * A data file location that does not exist, is not a directory, or cannot be listed
 * is treated as containing no system table files.
 *
 * @throws IOException if the system table cannot be opened, if data files exist but the
 *         location row cannot be read, if token or generation is missing, or if the stored
 *         partitioner class name differs from the configured one
 */
public static void checkHealth() throws IOException
{
    Table table = null;
    try
    {
        table = Table.open(Table.SYSTEM_TABLE);
    }
    catch (AssertionError err)
    {
        // this happens when a user switches from OPP to RP.
        IOException ex = new IOException("Could not read system table. Did you change partitioners?");
        ex.initCause(err);
        throw ex;
    }

    SortedSet<byte[]> cols = new TreeSet<byte[]>(new BytesType());
    cols.add(TOKEN);
    cols.add(GENERATION);
    cols.add(PARTITIONER);
    QueryFilter filter = new NamesQueryFilter(LOCATION_KEY, new QueryPath(STATUS_CF), cols);
    ColumnFamily cf = table.getColumnFamilyStore(STATUS_CF).getColumnFamily(filter);

    if (cf == null)
    {
        // this is either a brand new node (there will be no files), or the partitioner was changed from RP to OPP.
        for (String path : DatabaseDescriptor.getAllDataFileLocationsForTable("system"))
        {
            File[] dbContents = new File(path).listFiles(new FilenameFilter()
            {
                public boolean accept(File dir, String name)
                {
                    return name.endsWith(".db");
                }
            });
            // listFiles returns null when the path is not a directory or an I/O error occurs;
            // such a location cannot hold data files, so it is skipped instead of failing with an NPE.
            if (dbContents != null && dbContents.length > 0)
                throw new IOException("Found system table files, but they couldn't be loaded. Did you change the partitioner?");
        }
        // no system files. data is either in the commit log or this is a new node.
        return;
    }

    // token and generation should *always* be there. If either are missing, we can assume that the partitioner has
    // been switched.
    if (cf.getColumnCount() > 0 && (cf.getColumn(GENERATION) == null || cf.getColumn(TOKEN) == null))
        throw new IOException("Couldn't read system generation or token. Did you change the partitioner?");

    IColumn partitionerCol = cf.getColumn(PARTITIONER);
    if (partitionerCol == null)
    {
        // a missing partitioner column is tolerated and only logged
        logger.info("Did not see a partitioner in system storage.");
        return;
    }

    String configured = DatabaseDescriptor.getPartitioner().getClass().getName();
    String stored = new String(partitionerCol.value(), "UTF-8");
    if (!configured.equals(stored))
        throw new IOException("Detected partitioner mismatch! Did you change the partitioner?");
}
/**
 * Reads the system table and retrieves the metadata associated with this storage
 * instance. The metadata lives in the LocationInfo column family under the location
 * row, in the columns "Token", "Generation", "ClusterName" and the partitioner column.
 * The token is what gets gossiped around and the generation info is used for FD.
 *
 * When the row does not exist, a token is chosen (the configured initial token, or a
 * random one), a generation is derived from the current time in seconds, and all four
 * columns are written. When the row exists, the stored token is reused, the generation
 * is advanced, any missing cluster name / partitioner column is filled in, and the
 * persisted keyspace definitions are validated. In both cases the column family is
 * flushed before the result is cached and returned.
 *
 * @return the metadata of this node; the same cached instance on every later call
 * @throws IOException if reading or writing the system table fails
 * @throws ConfigurationException if validating the persisted keyspace definitions fails
 */
public static synchronized StorageMetadata initMetadata() throws IOException, ConfigurationException
{
    if (metadata != null) // guard to protect against being called twice
        return metadata;

    /* Read the system table to retrieve the storage ID and the generation */
    Table table = Table.open(Table.SYSTEM_TABLE);
    SortedSet<byte[]> columns = new TreeSet<byte[]>(new BytesType());
    columns.add(TOKEN);
    columns.add(GENERATION);
    columns.add(CLUSTERNAME);
    columns.add(PARTITIONER);
    QueryFilter filter = new NamesQueryFilter(LOCATION_KEY, new QueryPath(STATUS_CF), columns);
    ColumnFamily cf = table.getColumnFamilyStore(STATUS_CF).getColumnFamily(filter);
    String partitioner = DatabaseDescriptor.getPartitioner().getClass().getName();

    IPartitioner p = StorageService.getPartitioner();
    if (cf == null)
    {
        Token token;
        String initialToken = DatabaseDescriptor.getInitialToken();
        if (initialToken == null)
            token = p.getRandomToken();
        else
            token = p.getTokenFactory().fromString(initialToken);

        logger.info("Saved Token not found. Using " + token);
        // seconds-since-epoch isn't a foolproof new generation
        // (where foolproof is "guaranteed to be larger than the last one seen at this ip address"),
        // but it's as close as sanely possible
        int generation = (int) (System.currentTimeMillis() / 1000);

        logger.info("Saved ClusterName not found. Using " + DatabaseDescriptor.getClusterName());

        RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, LOCATION_KEY);
        cf = ColumnFamily.create(Table.SYSTEM_TABLE, SystemTable.STATUS_CF);
        cf.addColumn(new Column(TOKEN, p.getTokenFactory().toByteArray(token)));
        cf.addColumn(new Column(GENERATION, FBUtilities.toByteArray(generation)));
        // NOTE(review): the cluster name is encoded with the platform default charset here and
        // below; left unchanged so it keeps matching what was stored previously - confirm before changing.
        cf.addColumn(new Column(CLUSTERNAME, DatabaseDescriptor.getClusterName().getBytes()));
        cf.addColumn(new Column(PARTITIONER, partitioner.getBytes("UTF-8")));
        rm.add(cf);
        rm.apply();
        flushStatusColumnFamily(table);
        metadata = new StorageMetadata(token, generation, DatabaseDescriptor.getClusterName().getBytes());
        return metadata;
    }

    if (cf.getColumnCount() < 2)
        throw new RuntimeException("Expected both token and generation columns; found " + cf);

    /* we crashed and came back up: make sure new generation is greater than old */
    IColumn tokenColumn = cf.getColumn(TOKEN);
    assert tokenColumn != null : cf;
    Token token = p.getTokenFactory().fromByteArray(tokenColumn.value());
    logger.info("Saved Token found: " + token);

    IColumn generation = cf.getColumn(GENERATION);
    assert generation != null : cf;
    // at least one more than the stored generation, and at least the current time in seconds
    int gen = Math.max(FBUtilities.byteArrayToInt(generation.value()) + 1, (int) (System.currentTimeMillis() / 1000));

    IColumn cluster = cf.getColumn(CLUSTERNAME);
    IColumn partitionerColumn = cf.getColumn(PARTITIONER);

    // from here on cf is the set of columns to write back, not the row that was read
    RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, LOCATION_KEY);
    cf = ColumnFamily.create(Table.SYSTEM_TABLE, SystemTable.STATUS_CF);
    // timestamp one past the stored column so the new generation supersedes it
    Column generation2 = new Column(GENERATION, FBUtilities.toByteArray(gen), generation.timestamp() + 1);
    cf.addColumn(generation2);

    byte[] cname;
    if (cluster != null)
    {
        logger.info("Saved ClusterName found: " + new String(cluster.value()));
        cname = cluster.value();
    }
    else
    {
        Column clustername = new Column(CLUSTERNAME, DatabaseDescriptor.getClusterName().getBytes());
        cf.addColumn(clustername);
        cname = DatabaseDescriptor.getClusterName().getBytes();
        logger.info("Saved ClusterName not found. Using " + DatabaseDescriptor.getClusterName());
    }

    if (partitionerColumn == null)
    {
        Column c = new Column(PARTITIONER, partitioner.getBytes("UTF-8"));
        cf.addColumn(c);
        logger.info("Saved partitioner not found. Using " + partitioner);
    }

    rm.add(cf);
    rm.apply();

    validateKeyspaceDefinitions();

    flushStatusColumnFamily(table);
    metadata = new StorageMetadata(token, gen, cname);
    return metadata;
}

/**
 * Flushes the status column family of the given table and waits for completion.
 * Execution failures are rethrown unchecked; on interruption the thread's interrupt
 * flag is restored before rethrowing.
 */
private static void flushStatusColumnFamily(Table table) throws IOException
{
    try
    {
        table.getColumnFamilyStore(SystemTable.STATUS_CF).forceBlockingFlush();
    }
    catch (ExecutionException e)
    {
        throw new RuntimeException(e);
    }
    catch (InterruptedException e)
    {
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
    }
}
/**
 * Reports whether the persisted bootstrap flag is set.
 *
 * @return true only when the bootstrap row exists, contains the bootstrap column, and the
 *         first byte of that column's value is 1; false when the row, the column or the
 *         value is missing
 */
public static boolean isBootstrapped()
{
    try
    {
        Table table = Table.open(Table.SYSTEM_TABLE);
        QueryFilter filter = new NamesQueryFilter(BOOTSTRAP_KEY, new QueryPath(STATUS_CF), BOOTSTRAP);
        ColumnFamily cf = table.getColumnFamilyStore(STATUS_CF).getColumnFamily(filter);
        if (cf == null)
            return false;

        // the row may exist without the flag column; treat that like "flag never written"
        IColumn flag = cf.getColumn(BOOTSTRAP);
        if (flag == null)
            return false;

        byte[] value = flag.value();
        return value != null && value.length > 0 && value[0] == 1;
    }
    catch (IOException e)
    {
        throw new FSReadError(e);
    }
}
/**
 * Persists the bootstrap flag as a single byte (1 for true, 0 for false) in the
 * bootstrap row, timestamped with the current wall-clock time.
 *
 * @param isBootstrapped the flag value to store
 */
public static void setBootstrapped(boolean isBootstrapped)
{
    byte flag;
    if (isBootstrapped)
        flag = (byte) 1;
    else
        flag = (byte) 0;

    ColumnFamily bootstrapInfo = ColumnFamily.create(Table.SYSTEM_TABLE, STATUS_CF);
    bootstrapInfo.addColumn(new Column(BOOTSTRAP, new byte[] { flag }, System.currentTimeMillis()));

    RowMutation mutation = new RowMutation(Table.SYSTEM_TABLE, BOOTSTRAP_KEY);
    mutation.add(bootstrapInfo);
    try
    {
        mutation.apply();
    }
    catch (IOException ioe)
    {
        throw new FSWriteError(ioe);
    }
}
/**
 * Compares the replication settings persisted for each non-system keyspace with the
 * current configuration, then persists the current settings.
 *
 * Each keyspace is stored as one column under the replication row whose value is
 * "strategy,replicationFactor,snitch,datacenter,rack" (simple class names). Keyspaces
 * without a stored column are not checked. The current settings are written back for
 * every non-system keyspace on every call, whether or not earlier settings existed.
 *
 * @throws ConfigurationException if a stored setting differs from the configured one and
 *         the replication check has not been disabled
 * @throws IOException if the system table cannot be read or a stored value has fewer than five fields
 */
private static void validateKeyspaceDefinitions() throws ConfigurationException, IOException
{
    /* Read the persisted replication configuration, if any */
    Table table = Table.open(Table.SYSTEM_TABLE);
    QueryFilter filter = new IdentityQueryFilter(REPLICATION_KEY, new QueryPath(STATUS_CF));
    ColumnFamily cf = table.getColumnFamilyStore(STATUS_CF).getColumnFamily(filter);

    if (cf != null)
    {
        for (String tablename : DatabaseDescriptor.getNonSystemTables())
        {
            // comparing replica strategy class name, repl factor, endpoint snitch class name and location
            IEndPointSnitch endPointSnitch = DatabaseDescriptor.getEndPointSnitch(tablename);
            IColumn column = cf.getColumn(utf8(tablename));
            if (column == null)
                continue; // nothing stored for this keyspace yet

            String value = new String(column.value(), "UTF-8");
            String[] vals = value.split(",");
            if (vals.length < 5)
                throw new IOException("Corrupted value in replica conf of "+tablename+":"+value);

            // the stored value is passed as "expected", the configured one as "actual"
            assertEquals(vals[0], DatabaseDescriptor.getReplicaPlacementStrategyClass(tablename).getSimpleName(), "ReplicaPlacementStrategy", tablename);
            assertEquals(vals[1], String.valueOf(DatabaseDescriptor.getReplicationFactor(tablename)), "ReplicationFactor", tablename);
            assertEquals(vals[2], endPointSnitch.getClass().getSimpleName(), "EndPointSnitch", tablename);
            assertEquals(vals[3]+":"+vals[4], endPointSnitch.getLocalDatacenter()+":"+endPointSnitch.getLocalRack(), "Location", tablename);
        }
    }

    // save the current configuration (this runs even when earlier settings were found)
    RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, REPLICATION_KEY);
    cf = ColumnFamily.create(Table.SYSTEM_TABLE, STATUS_CF);
    for (String tablename : DatabaseDescriptor.getNonSystemTables())
    {
        // saving replica strategy class name, repl factor, endpoint snitch class name and location
        IEndPointSnitch endPointSnitch = DatabaseDescriptor.getEndPointSnitch(tablename);
        String value = DatabaseDescriptor.getReplicaPlacementStrategyClass(tablename).getSimpleName()
                       +","+DatabaseDescriptor.getReplicationFactor(tablename)
                       +","+endPointSnitch.getClass().getSimpleName()
                       +","+endPointSnitch.getLocalDatacenter()
                       +","+endPointSnitch.getLocalRack();

        cf.addColumn(new Column(utf8(tablename), utf8(value), System.currentTimeMillis()));

        if (logger.isDebugEnabled())
            logger.debug("Replication info not found. Saving "+tablename+"="+value);
    }
    rm.add(cf);
    try
    {
        rm.apply();
    }
    catch (IOException e)
    {
        throw new FSWriteError(e);
    }
}
/**
 * Checks that a persisted setting still matches the configured one.
 * On a mismatch the outcome depends on the "cassandra.replication_check" system
 * property (treated as "true" when unset): when it is true a ConfigurationException
 * is thrown, otherwise the mismatch is only logged as a warning.
 *
 * @param expected  the value to compare against; also named in the error message as the value to restore
 * @param actual    the value compared with {@code expected}
 * @param confname  name of the setting, used in messages
 * @param tablename keyspace the setting belongs to, used in messages
 * @throws ConfigurationException on a mismatch while the check is enabled
 */
private static void assertEquals(String expected, String actual, String confname, String tablename) throws ConfigurationException
{
    if (expected.equals(actual))
        return;

    boolean checkEnabled = Boolean.parseBoolean(System.getProperty("cassandra.replication_check", "true"));
    if (!checkEnabled)
    {
        logger.warn("Incompatible change of "+confname+" for keyspace " + tablename+" from "+expected+" to "+actual+" ignored by operator request");
        return;
    }

    throw new ConfigurationException("You changed "+confname+" on the working node in storage-conf.xml for keyspace " + tablename+"." +
                                     "This is generally bad idea, because without additional steps this will lead to data loss and unavailability. " +
                                     "You can either change this value in storage-conf.xml back to "+expected+" (the safe way) or, if you really, REALLY sure, start this node with " +
                                     "cassandra.replication_check=false system property");
}
/**
 * Holder for the token, generation and cluster name of this node.
 * Only the token can be replaced after construction.
 */
public static class StorageMetadata
{
    private Token token;
    private int generation;
    private byte[] cluster;

    /** Creates the holder from the node's token, its generation and the raw cluster name bytes. */
    StorageMetadata(Token storageId, int generation, byte[] clustername)
    {
        this.token = storageId;
        this.generation = generation;
        this.cluster = clustername;
    }

    /** @return the token currently held */
    public Token getToken()
    {
        return this.token;
    }

    /** Replaces the held token. */
    public void setToken(Token storageId)
    {
        this.token = storageId;
    }

    /** @return the generation given at construction */
    public int getGeneration()
    {
        return this.generation;
    }

    /** @return the cluster name bytes given at construction (the same array, not a copy) */
    public byte[] getClusterName()
    {
        return this.cluster;
    }
}
}