/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cassandra.db; import java.io.File; import java.io.FilenameFilter; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.InetAddress; import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.SortedSet; import java.util.TreeSet; import java.util.concurrent.ExecutionException; import org.apache.cassandra.config.ConfigurationException; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.filter.IdentityQueryFilter; import org.apache.cassandra.db.filter.NamesQueryFilter; import org.apache.cassandra.db.filter.QueryFilter; import org.apache.cassandra.db.filter.QueryPath; import org.apache.cassandra.db.marshal.BytesType; import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.dht.Token; import org.apache.cassandra.locator.IEndPointSnitch; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.FBUtilities; import org.apache.log4j.Logger; public class SystemTable { private static Logger logger = Logger.getLogger(SystemTable.class); public static final String STATUS_CF = "LocationInfo"; // keep the old CF string for backwards-compatibility private static final String LOCATION_KEY = "L"; private static final String GOSSIP_KEY = "G"; // persistent gossiper private static final String REPLICATION_KEY = "R"; // persistent replication configuration private static final String BOOTSTRAP_KEY = "Bootstrap"; private static final byte[] BOOTSTRAP = utf8("B"); private static final byte[] TOKEN = utf8("Token"); private static final byte[] GENERATION = utf8("Generation"); private static final byte[] CLUSTERNAME = utf8("ClusterName"); private static final byte[] PARTITIONER = utf8("Partioner"); private static StorageMetadata metadata; private static byte[] utf8(String str) { try { return str.getBytes("UTF-8"); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } } /** * Record token being used by another node */ public static synchronized void updateToken(InetAddress ep, Token token) { IPartitioner p = StorageService.getPartitioner(); ColumnFamily cf = ColumnFamily.create(Table.SYSTEM_TABLE, STATUS_CF); cf.addColumn(new Column(ep.getAddress(), p.getTokenFactory().toByteArray(token), System.currentTimeMillis())); RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, LOCATION_KEY); rm.add(cf); try { rm.apply(); } catch (IOException e) { throw new FSWriteError(e); } } /** * Record last seen gossip information about node, to preserve this information across restarts. * * @param endpoint */ public static synchronized void updateEndpointState(InetAddress ep, byte[] endpointState, int generation, int version) { assert version >= 0; long timestamp = ((long)generation) << 30 | version; // yes, we steal 1 bit from version (this is sign bit and version is >0) assert timestamp > 0 : "Generation: "+generation+", version: "+version+" merge = "+timestamp; IPartitioner p = StorageService.getPartitioner(); ColumnFamily cf = ColumnFamily.create(Table.SYSTEM_TABLE, STATUS_CF); cf.addColumn(new Column(ep.getAddress(), endpointState, timestamp)); RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, GOSSIP_KEY); rm.add(cf); try { rm.apply(); } catch (IOException e) { throw new FSWriteError(e); } } public static synchronized void removeEndpointState(InetAddress ep) { try { IColumn col = loadEndpointState(ep); if ( col == null ) return; // already removed long timestamp = col.timestamp() + 1; assert timestamp > 0; IPartitioner p = StorageService.getPartitioner(); RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, GOSSIP_KEY); rm.delete(new QueryPath(STATUS_CF, null, ep.getAddress()), timestamp); rm.apply(); } catch (IOException e) { throw new FSWriteError(e); } } /** * removes all persisted endpoint states */ public static synchronized void removeEndpointStates() { try { long timestamp = Long.MAX_VALUE; IPartitioner p = StorageService.getPartitioner(); RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, GOSSIP_KEY); rm.delete(new QueryPath(STATUS_CF, null, null), timestamp); rm.apply(); try { ColumnFamilyStore statusCF = Table.open(Table.SYSTEM_TABLE).getColumnFamilyStore(SystemTable.STATUS_CF); statusCF.forceBlockingFlush(); statusCF.forceMajorCompaction(); } catch (ExecutionException e) { throw new RuntimeException(e); } catch (InterruptedException e) { throw new RuntimeException(e); } } catch (IOException e) { throw new FSWriteError(e); } } private static IColumn loadEndpointState(InetAddress ep) throws IOException { Table table = null; try { table = Table.open(Table.SYSTEM_TABLE); } catch (AssertionError err) { // this happens when a user switches from OPP to RP. IOException ex = new IOException("Could not read system table. Did you change partitioners?"); ex.initCause(err); throw ex; } SortedSet<byte[]> cols = new TreeSet<byte[]>(new BytesType()); cols.add(ep.getAddress()); QueryFilter filter = new NamesQueryFilter(GOSSIP_KEY, new QueryPath(STATUS_CF), cols); ColumnFamily cf = table.getColumnFamilyStore(STATUS_CF).getColumnFamily(filter); if (cf == null) return null; IColumn col = cf.getColumn(ep.getAddress()); if (col == null || !col.isLive()) return null; return col; } /** * Loads all previously persisted gossip endpoint states * * @return map with serialized endpoint states * @throws IOException */ public static Map<InetAddress,byte[]> loadEndpointStates() throws IOException { Table table = null; try { table = Table.open(Table.SYSTEM_TABLE); } catch (AssertionError err) { // this happens when a user switches from OPP to RP. IOException ex = new IOException("Could not read system table. Did you change partitioners?"); ex.initCause(err); throw ex; } QueryFilter filter = new IdentityQueryFilter(GOSSIP_KEY, new QueryPath(STATUS_CF)); ColumnFamily cf = table.getColumnFamilyStore(STATUS_CF).getColumnFamily(filter); if (cf == null) return Collections.emptyMap(); HashMap<InetAddress,byte[]> rc = new HashMap<InetAddress, byte[]>(); for (IColumn col : cf.getSortedColumns()) { if (col != null && col.isLive()) rc.put(InetAddress.getByAddress(col.name()),col.value()); } return rc; } /** * This method is used to update the System Table with the new token for this node */ public static synchronized void updateToken(Token token) { assert metadata != null; IPartitioner p = StorageService.getPartitioner(); ColumnFamily cf = ColumnFamily.create(Table.SYSTEM_TABLE, STATUS_CF); cf.addColumn(new Column(SystemTable.TOKEN, p.getTokenFactory().toByteArray(token), System.currentTimeMillis())); RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, LOCATION_KEY); rm.add(cf); try { rm.apply(); } catch (IOException e) { throw new FSWriteError(e); } metadata.setToken(token); } /** * One of three things will happen if you try to read the system table: * 1. files are present and you can read them: great * 2. no files are there: great (new node is assumed) * 3. files are present but you can't read them: bad (suspect that the partitioner was changed). * @throws IOException */ public static void checkHealth() throws IOException { Table table = null; try { table = Table.open(Table.SYSTEM_TABLE); } catch (AssertionError err) { // this happens when a user switches from OPP to RP. IOException ex = new IOException("Could not read system table. Did you change partitioners?"); ex.initCause(err); throw ex; } SortedSet<byte[]> cols = new TreeSet<byte[]>(new BytesType()); cols.add(TOKEN); cols.add(GENERATION); cols.add(PARTITIONER); QueryFilter filter = new NamesQueryFilter(LOCATION_KEY, new QueryPath(STATUS_CF), cols); ColumnFamily cf = table.getColumnFamilyStore(STATUS_CF).getColumnFamily(filter); if (cf == null) { // this is either a brand new node (there will be no files), or the partitioner was changed from RP to OPP. for (String path : DatabaseDescriptor.getAllDataFileLocationsForTable("system")) { File[] dbContents = new File(path).listFiles(new FilenameFilter() { public boolean accept(File dir, String name) { return name.endsWith(".db"); } }); if (dbContents.length > 0) throw new IOException("Found system table files, but they couldn't be loaded. Did you change the partitioner?"); } // no system files. data is either in the commit log or this is a new node. return; } // token and generation should *always* be there. If either are missing, we can assume that the partitioner has // been switched. if (cf.getColumnCount() > 0 && (cf.getColumn(GENERATION) == null || cf.getColumn(TOKEN) == null)) throw new IOException("Couldn't read system generation or token. Did you change the partitioner?"); IColumn partitionerCol = cf.getColumn(PARTITIONER); if (partitionerCol != null && !DatabaseDescriptor.getPartitioner().getClass().getName().equals(new String(partitionerCol.value(), "UTF-8"))) throw new IOException("Detected partitioner mismatch! Did you change the partitioner?"); if (partitionerCol == null) logger.info("Did not see a partitioner in system storage."); } /* * This method reads the system table and retrieves the metadata * associated with this storage instance. Currently we store the * metadata in a Column Family called LocatioInfo which has two * columns namely "Token" and "Generation". This is the token that * gets gossiped around and the generation info is used for FD. * We also store whether we're in bootstrap mode in a third column */ public static synchronized StorageMetadata initMetadata() throws IOException, ConfigurationException { if (metadata != null) // guard to protect against being called twice return metadata; /* Read the system table to retrieve the storage ID and the generation */ Table table = Table.open(Table.SYSTEM_TABLE); SortedSet<byte[]> columns = new TreeSet<byte[]>(new BytesType()); columns.add(TOKEN); columns.add(GENERATION); columns.add(CLUSTERNAME); columns.add(PARTITIONER); QueryFilter filter = new NamesQueryFilter(LOCATION_KEY, new QueryPath(STATUS_CF), columns); ColumnFamily cf = table.getColumnFamilyStore(STATUS_CF).getColumnFamily(filter); String partitioner = DatabaseDescriptor.getPartitioner().getClass().getName(); IPartitioner p = StorageService.getPartitioner(); if (cf == null) { Token token; String initialToken = DatabaseDescriptor.getInitialToken(); if (initialToken == null) token = p.getRandomToken(); else token = p.getTokenFactory().fromString(initialToken); logger.info("Saved Token not found. Using " + token); // seconds-since-epoch isn't a foolproof new generation // (where foolproof is "guaranteed to be larger than the last one seen at this ip address"), // but it's as close as sanely possible int generation = (int) (System.currentTimeMillis() / 1000); logger.info("Saved ClusterName not found. Using " + DatabaseDescriptor.getClusterName()); RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, LOCATION_KEY); cf = ColumnFamily.create(Table.SYSTEM_TABLE, SystemTable.STATUS_CF); cf.addColumn(new Column(TOKEN, p.getTokenFactory().toByteArray(token))); cf.addColumn(new Column(GENERATION, FBUtilities.toByteArray(generation))); cf.addColumn(new Column(CLUSTERNAME, DatabaseDescriptor.getClusterName().getBytes())); cf.addColumn(new Column(PARTITIONER, partitioner.getBytes("UTF-8"))); rm.add(cf); rm.apply(); try { table.getColumnFamilyStore(SystemTable.STATUS_CF).forceBlockingFlush(); } catch (ExecutionException e) { throw new RuntimeException(e); } catch (InterruptedException e) { throw new RuntimeException(e); } metadata = new StorageMetadata(token, generation, DatabaseDescriptor.getClusterName().getBytes()); return metadata; } if (cf.getColumnCount() < 2) throw new RuntimeException("Expected both token and generation columns; found " + cf); /* we crashed and came back up: make sure new generation is greater than old */ IColumn tokenColumn = cf.getColumn(TOKEN); assert tokenColumn != null : cf; Token token = p.getTokenFactory().fromByteArray(tokenColumn.value()); logger.info("Saved Token found: " + token); IColumn generation = cf.getColumn(GENERATION); assert generation != null : cf; int gen = Math.max(FBUtilities.byteArrayToInt(generation.value()) + 1, (int) (System.currentTimeMillis() / 1000)); IColumn cluster = cf.getColumn(CLUSTERNAME); IColumn partitionerColumn = cf.getColumn(PARTITIONER); RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, LOCATION_KEY); cf = ColumnFamily.create(Table.SYSTEM_TABLE, SystemTable.STATUS_CF); Column generation2 = new Column(GENERATION, FBUtilities.toByteArray(gen), generation.timestamp() + 1); cf.addColumn(generation2); byte[] cname; if (cluster != null) { logger.info("Saved ClusterName found: " + new String(cluster.value())); cname = cluster.value(); } else { Column clustername = new Column(CLUSTERNAME, DatabaseDescriptor.getClusterName().getBytes()); cf.addColumn(clustername); cname = DatabaseDescriptor.getClusterName().getBytes(); logger.info("Saved ClusterName not found. Using " + DatabaseDescriptor.getClusterName()); } if (partitionerColumn == null) { Column c = new Column(PARTITIONER, partitioner.getBytes("UTF-8")); cf.addColumn(c); logger.info("Saved partitioner not found. Using " + partitioner); } rm.add(cf); rm.apply(); validateKeyspaceDefinitions(); try { table.getColumnFamilyStore(SystemTable.STATUS_CF).forceBlockingFlush(); } catch (ExecutionException e) { throw new RuntimeException(e); } catch (InterruptedException e) { throw new RuntimeException(e); } metadata = new StorageMetadata(token, gen, cname); return metadata; } public static boolean isBootstrapped() { Table table = null; try { table = Table.open(Table.SYSTEM_TABLE); QueryFilter filter = new NamesQueryFilter(BOOTSTRAP_KEY, new QueryPath(STATUS_CF), BOOTSTRAP); ColumnFamily cf = table.getColumnFamilyStore(STATUS_CF).getColumnFamily(filter); return cf != null && cf.getColumn(BOOTSTRAP).value()[0] == 1; } catch (IOException e) { throw new FSReadError(e); } } public static void setBootstrapped(boolean isBootstrapped) { ColumnFamily cf = ColumnFamily.create(Table.SYSTEM_TABLE, STATUS_CF); cf.addColumn(new Column(BOOTSTRAP, new byte[] { (byte) (isBootstrapped ? 1 : 0) }, System.currentTimeMillis())); RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, BOOTSTRAP_KEY); rm.add(cf); try { rm.apply(); } catch (IOException e) { throw new FSWriteError(e); } } private static void validateKeyspaceDefinitions() throws ConfigurationException, IOException { /* Read the system table to retrieve the storage ID and the generation */ Table table = Table.open(Table.SYSTEM_TABLE); QueryFilter filter = new IdentityQueryFilter(REPLICATION_KEY, new QueryPath(STATUS_CF)); ColumnFamily cf = table.getColumnFamilyStore(STATUS_CF).getColumnFamily(filter); if ( cf != null ) { for (String tablename : DatabaseDescriptor.getNonSystemTables()) { // saving replica strategy class name, repl factor, endpoint snitch class name and location IEndPointSnitch endPointSnitch = DatabaseDescriptor.getEndPointSnitch(tablename); IColumn column = cf.getColumn(utf8(tablename)); if (column == null) continue; String value = new String(column.value(),"UTF-8"); if (value==null) throw new IOException("Corrupted value in replica conf of "+tablename); String[] vals = value.split(","); if (vals.length<5) throw new IOException("Corrupted value in replica conf of "+tablename+":"+value); assertEquals(vals[0],DatabaseDescriptor.getReplicaPlacementStrategyClass(tablename).getSimpleName(),"ReplicaPlacementStrategy",tablename); assertEquals(vals[1],String.valueOf(DatabaseDescriptor.getReplicationFactor(tablename)),"ReplicationFactor",tablename); assertEquals(vals[2],endPointSnitch.getClass().getSimpleName(),"EndPointSnitch",tablename); assertEquals(vals[3]+":"+vals[4],endPointSnitch.getLocalDatacenter()+":"+endPointSnitch.getLocalRack(),"Location",tablename); } } // no information is saved yet. saving RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, REPLICATION_KEY); cf = ColumnFamily.create(Table.SYSTEM_TABLE, STATUS_CF); for (String tablename : DatabaseDescriptor.getNonSystemTables()) { // saving replica strategy class name, repl factor, endpoint snitch class name and location IEndPointSnitch endPointSnitch = DatabaseDescriptor.getEndPointSnitch(tablename); String value = DatabaseDescriptor.getReplicaPlacementStrategyClass(tablename).getSimpleName() +","+DatabaseDescriptor.getReplicationFactor(tablename) +","+endPointSnitch.getClass().getSimpleName() +","+endPointSnitch.getLocalDatacenter() +","+endPointSnitch.getLocalRack(); cf.addColumn(new Column(utf8(tablename), utf8(value) , System.currentTimeMillis())); if (logger.isDebugEnabled()) logger.debug("Replication info not found. Saving "+tablename+"="+value); } rm.add(cf); try { rm.apply(); } catch (IOException e) { throw new FSWriteError(e); } } private static void assertEquals(String expected, String actual, String confname, String tablename) throws ConfigurationException { if (!expected.equals(actual)) { if ( Boolean.valueOf( System.getProperty("cassandra.replication_check", "true") ) ) { throw new ConfigurationException("You changed "+confname+" on the working node in storage-conf.xml for keyspace " + tablename+"." + "This is generally bad idea, because without additional steps this will lead to data loss and unavailability. " + "You can either change this value in storage-conf.xml back to "+expected+" (the safe way) or, if you really, REALLY sure, start this node with " + "cassandra.replication_check=false system property"); } else { logger.warn("Incompatible change of "+confname+" for keyspace " + tablename+" from "+expected+" to "+actual+" ignored by operator request"); } } } public static class StorageMetadata { private Token token; private int generation; private byte[] cluster; StorageMetadata(Token storageId, int generation, byte[] clustername) { token = storageId; this.generation = generation; cluster = clustername; } public Token getToken() { return token; } public void setToken(Token storageId) { token = storageId; } public int getGeneration() { return generation; } public byte[] getClusterName() { return cluster; } } }