/* This file is part of VoltDB. * Copyright (C) 2008-2010 VoltDB Inc. * * VoltDB is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * VoltDB is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with VoltDB. If not, see <http://www.gnu.org/licenses/>. */ package org.voltdb.sysprocs.saverestore; import java.io.BufferedInputStream; import java.io.CharArrayWriter; import java.io.EOFException; import java.io.File; import java.io.FileFilter; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; import java.io.StringWriter; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import java.util.zip.CRC32; import org.voltdb.catalog.*; import org.voltdb.utils.Pair; import org.voltdb.utils.DBBPool.BBContainer; import edu.brown.catalog.CatalogUtil; import edu.brown.hstore.PartitionExecutor.SystemProcedureExecutionContext; public class SnapshotUtil { /** * Create a digest for a snapshot containing the time of the snapshot and the list of tables included. * The first item in the comma separated list is the time in milliseconds as a string. * @param snapshotTime * @param path * @param nonce * @param tables * @throws IOException */ public static void recordSnapshotTableList( long snapshotTime, String path, String nonce, List<Table> tables) throws IOException { final File f = new File(path, constructDigestFilenameForNonce(nonce)); if (f.exists()) { if (!f.delete()) { throw new IOException("Unable to write table list file " + f); } } FileOutputStream fos = new FileOutputStream(f); StringWriter sw = new StringWriter(); sw.append(Long.toString(snapshotTime)); if (!tables.isEmpty()) { sw.append(','); } for (int ii = 0; ii < tables.size(); ii++) { sw.append(tables.get(ii).getTypeName()); if (!(ii == (tables.size() - 1))) { sw.append(','); } else { sw.append('\n'); } } final byte tableListBytes[] = sw.getBuffer().toString().getBytes("UTF-8"); final CRC32 crc = new CRC32(); crc.update(tableListBytes); ByteBuffer fileBuffer = ByteBuffer.allocate(tableListBytes.length + 4); fileBuffer.putInt((int)crc.getValue()); fileBuffer.put(tableListBytes); fileBuffer.flip(); fos.getChannel().write(fileBuffer); fos.getFD().sync(); } public static List<String> retrieveRelevantTableNames(String path, String nonce) throws Exception { return retrieveRelevantTableNamesAndTime(new File(path, constructDigestFilenameForNonce(nonce))).getSecond(); } /** * Retrieve a list of tables from a digest. Doesn't return the snapshot time * value that is stashed in the digest file. * @param f * @throws Exception */ public static Pair<Long, List<String>> retrieveRelevantTableNamesAndTime(File f) throws Exception { String tableList = CRCCheck(f); String tableNames[] = tableList.split(","); String actualTableNames[] = new String[tableNames.length - 1]; System.arraycopy( tableNames, 1, actualTableNames, 0, tableNames.length - 1); return Pair.of(Long.valueOf(tableNames[0]), java.util.Arrays.asList(actualTableNames)); } /** * Check if the CRC of the snapshot file matches the digest. * @param f The snapshot file object * @return The table list as a string * @throws IOException If CRC does not match */ public static String CRCCheck(File f) throws IOException { final FileInputStream fis = new FileInputStream(f); try { final BufferedInputStream bis = new BufferedInputStream(fis); ByteBuffer crcBuffer = ByteBuffer.allocate(4); if (4 != bis.read(crcBuffer.array())) { throw new EOFException("EOF while attempting to read CRC from snapshot digest"); } final int crc = crcBuffer.getInt(); final InputStreamReader isr = new InputStreamReader(bis, "UTF-8"); CharArrayWriter caw = new CharArrayWriter(); while (true) { int nextChar = isr.read(); if (nextChar == -1) { throw new EOFException("EOF while reading snapshot digest"); } if (nextChar == '\n') { break; } caw.write(nextChar); } String tableList = caw.toString(); byte tableListBytes[] = tableList.getBytes("UTF-8"); CRC32 tableListCRC = new CRC32(); tableListCRC.update(tableListBytes); tableListCRC.update("\n".getBytes("UTF-8")); final int calculatedValue = (int)tableListCRC.getValue(); if (crc != calculatedValue) { throw new IOException("CRC of snapshot digest did not match digest contents"); } return tableList; } finally { try { if (fis != null) fis.close(); } catch (IOException e) {} } } /** * Storage for information about files that are part of a specific snapshot */ public static class Snapshot { public final List<File> m_digests = new ArrayList<File>(); public final List<Set<String>> m_digestTables = new ArrayList<Set<String>>(); public final Map<String, TableFiles> m_tableFiles = new TreeMap<String, TableFiles>(); } /** * Description of all the files for a specific table that is part of a specific snapshot */ public static class TableFiles { public final boolean m_isReplicated; TableFiles(boolean isReplicated) { m_isReplicated = isReplicated; } public final List<File> m_files = new ArrayList<File>(); public final List<Boolean> m_completed = new ArrayList<Boolean>(); public final List<Set<Integer>> m_validPartitionIds = new ArrayList<Set<Integer>>(); public final List<Set<Integer>> m_corruptParititionIds = new ArrayList<Set<Integer>>(); public final List<Integer> m_totalPartitionCounts = new ArrayList<Integer>(); } /** * Simple filter that includes directories and files that end in .digest or .vpt */ public static class SnapshotFilter implements FileFilter { @Override public boolean accept(File pathname) { if (pathname.isDirectory()) { return true; } if (pathname.getName().endsWith(".digest") || pathname.getName().endsWith(".vpt")) { return true; } return false; } }; /** * Filter that looks for files related to a specific snapshot. */ public static class SpecificSnapshotFilter extends SnapshotFilter { private final Set<String> snapshotNames; public SpecificSnapshotFilter(Set<String> snapshotNames) { this.snapshotNames = snapshotNames; } @Override public boolean accept(File pathname) { if (!super.accept(pathname)) { return false; } if (pathname.isDirectory()) { return true; } for (String snapshotName : snapshotNames) { if (pathname.getName().startsWith(snapshotName + "-") || pathname.getName().equals(constructDigestFilenameForNonce(snapshotName))) { return true; } } return false; } } /** * Spider the provided directory applying the provided FileFilter. Optionally validate snapshot * files. Return a summary of partition counts, partition information, files, digests etc. * that can be used to determine if a valid restore plan exists. * @param directory * @param snapshots * @param filter * @param recursion * @param validate */ public static void retrieveSnapshotFiles( File directory, Map<Long, Snapshot> snapshots, FileFilter filter, int recursion, boolean validate) { if (recursion == 32) { return; } if (!directory.exists()) { System.err.println("Error: Directory " + directory.getPath() + " doesn't exist"); return; } if (!directory.canRead()) { System.err.println("Error: Directory " + directory.getPath() + " is not readable"); return; } if (!directory.canExecute()) { System.err.println("Error: Directory " + directory.getPath() + " is not executable"); return; } //System.out.println("Processing Dir :"+directory); //sleep(1); for (File f : directory.listFiles(filter)) { //System.out.println("Processing File :"+f); if (f.isDirectory()) { if (!f.canRead() || !f.canExecute()) { System.err.println("Warning: Skipping directory " + f.getPath() + " due to lack of read permission"); } else { retrieveSnapshotFiles( f, snapshots, filter, recursion++, validate); } continue; } if (!f.canRead()) { System.err.println("Warning: " + f.getPath() + " is not readable"); continue; } FileInputStream fis = null; try { fis = new FileInputStream(f); } catch (FileNotFoundException e1) { System.err.println(e1.getMessage()); continue; } try { if (f.getName().endsWith(".digest")) { Pair<Long, List<String>> result = null; try { result = retrieveRelevantTableNamesAndTime(f); } catch (Exception e) { System.err.println(e.getMessage()); System.err.println("Error: Unable to process digest " + f.getPath()); continue; } Long snapshotTime = result.getFirst(); Snapshot s = snapshots.get(snapshotTime); if (s == null) { s = new Snapshot(); snapshots.put(snapshotTime, s); } TreeSet<String> tableSet = new TreeSet<String>(); tableSet.addAll(result.getSecond()); s.m_digestTables.add(tableSet); s.m_digests.add(f); } else { HashSet<Integer> partitionIds = new HashSet<Integer>(); TableSaveFile saveFile = new TableSaveFile(fis.getChannel(), 1, null, true); try { for (Integer partitionId : saveFile.getPartitionIds()) { partitionIds.add(partitionId); } if (validate && saveFile.getCompleted()) { while (saveFile.hasMoreChunks()) { BBContainer cont = saveFile.getNextChunk(); if (cont != null) { cont.discard(); } } } partitionIds.removeAll(saveFile.getCorruptedPartitionIds()); Snapshot s = snapshots.get(saveFile.getCreateTime()); if (s == null) { s = new Snapshot(); snapshots.put(saveFile.getCreateTime(), s); } TableFiles tableFiles = s.m_tableFiles.get(saveFile.getTableName()); if (tableFiles == null) { tableFiles = new TableFiles(saveFile.isReplicated()); s.m_tableFiles.put(saveFile.getTableName(), tableFiles); } tableFiles.m_files.add(f); tableFiles.m_completed.add(saveFile.getCompleted()); tableFiles.m_validPartitionIds.add(partitionIds); tableFiles.m_corruptParititionIds.add(saveFile.getCorruptedPartitionIds()); tableFiles.m_totalPartitionCounts.add(saveFile.getTotalPartitions()); } finally { saveFile.close(); } } } catch (IOException e) { e.printStackTrace(); System.err.println(e.getMessage()); System.err.println("Error: Unable to process " + f.getPath()); } finally { try { if (fis != null) { fis.close(); } } catch (IOException e) { } } } } /** * Returns a detailed report and a boolean indicating whether the snapshot can be successfully loaded * @param snapshotTime * @param snapshot */ public static Pair<Boolean, String> generateSnapshotReport(Long snapshotTime, Snapshot snapshot) { CharArrayWriter caw = new CharArrayWriter(); PrintWriter pw = new PrintWriter(caw); boolean snapshotConsistent = true; String indentString = ""; pw.println(indentString + "Date: " + new Date(snapshotTime)); pw.println(indentString + "Digests:"); indentString = "\t"; TreeSet<String> digestTablesSeen = new TreeSet<String>(); if (snapshot.m_digests.isEmpty()) { pw.println(indentString + "No digests found."); snapshotConsistent = false; } else { boolean inconsistent = false; /* * Iterate over the digests and ensure that they all contain the same list of tables */ Map<Integer, List<Integer>> inconsistentDigests = new HashMap<Integer, List<Integer>>(); for (int ii = 0; ii < snapshot.m_digests.size(); ii++) { inconsistentDigests.put( ii, new ArrayList<Integer>()); Set<String> tables = snapshot.m_digestTables.get(ii); for (int zz = 0; zz < snapshot.m_digests.size(); zz++) { if (zz == ii) { continue; } if (!tables.equals(snapshot.m_digestTables.get(zz))) { snapshotConsistent = false; inconsistent = true; inconsistentDigests.get(ii).add(zz); } } } /* * Summarize what was inconsistent/consistent */ if (!inconsistent) { for (int ii = 0; ii < snapshot.m_digests.size(); ii++) { pw.println(indentString + snapshot.m_digests.get(ii).getPath()); } } else { pw.println(indentString + "Not all digests are consistent"); indentString = indentString + "\t"; for (Map.Entry<Integer, List<Integer>> entry : inconsistentDigests.entrySet()) { File left = snapshot.m_digests.get(entry.getKey()); pw.println(indentString + left.getPath() + " is inconsistent with:"); indentString = indentString + "\t"; for (Integer id : entry.getValue()) { File right = snapshot.m_digests.get(id); pw.println(indentString + right.getPath()); } indentString = indentString.substring(1); } } /* * Print the list of tables found in the digests */ indentString = indentString.substring(1); pw.print(indentString + "Tables: "); int ii = 0; for (int jj = 0; jj < snapshot.m_digestTables.size(); jj++) { for (String table : snapshot.m_digestTables.get(jj)) { digestTablesSeen.add(table); } } for (String table : digestTablesSeen) { if (ii != 0) { pw.print(", "); } ii++; pw.print(table); } pw.print("\n"); } /* * Check that the total partition count is the same in every table file */ Integer totalPartitionCount = null; indentString = indentString + "\t"; for (Map.Entry<String, TableFiles> entry : snapshot.m_tableFiles.entrySet()) { if (entry.getValue().m_isReplicated) { continue; } for (Integer partitionCount : entry.getValue().m_totalPartitionCounts){ if (totalPartitionCount == null) { totalPartitionCount = partitionCount; } else if (totalPartitionCount != partitionCount) { snapshotConsistent = false; pw.println(indentString + "Partition count is not consistent throughout snapshot files for " + entry.getKey() + ". Saw " + partitionCount + " and " + totalPartitionCount); } } } /* * Now check that each individual table has enough information to be restored. * It is possible for a valid partition set to be available and still have a restore * fail because the restore plan loads a save file with a corrupt partition. */ TreeSet<String> consistentTablesSeen = new TreeSet<String>(); for (Map.Entry<String, TableFiles> entry : snapshot.m_tableFiles.entrySet()) { TableFiles tableFiles = entry.getValue(); /* * Calculate the set of visible partitions not corrupted partitions */ TreeSet<Integer> partitionsAvailable = new TreeSet<Integer>(); int kk = 0; for (Set<Integer> validPartitionIds : tableFiles.m_validPartitionIds) { if (tableFiles.m_completed.get(kk++)) { partitionsAvailable.addAll(validPartitionIds); } } /* * Ensure the correct range of partition ids is present */ boolean partitionsPresent = false; if ((partitionsAvailable.size() == (tableFiles.m_isReplicated ? 1 : totalPartitionCount)) && (partitionsAvailable.first() == 0) && (partitionsAvailable.last() == (tableFiles.m_isReplicated ? 1 : totalPartitionCount) - 1)) { partitionsPresent = true; } /* * Report if any of the files have corrupt partitions */ boolean hasCorruptPartitions = false; for (Set<Integer> corruptIds : tableFiles.m_corruptParititionIds) { if (!corruptIds.isEmpty()) { hasCorruptPartitions = true; snapshotConsistent = false; } } pw.println(indentString + "Table name: " + entry.getKey()); indentString = indentString + "\t"; pw.println(indentString + "Replicated: " + entry.getValue().m_isReplicated); pw.println(indentString + "Valid partition set available: " + partitionsPresent); pw.println(indentString + "Corrupt partitions present: " + hasCorruptPartitions); /* * Print information about individual files such as the partitions present and whether * they are corrupted */ pw.println(indentString + "Files: "); indentString = indentString + "\t"; for (int ii = 0; ii < tableFiles.m_files.size(); ii++) { String corruptPartitionIdString = ""; int zz = 0; for (Integer partitionId : tableFiles.m_corruptParititionIds.get(ii)) { if (zz != 0) { corruptPartitionIdString = corruptPartitionIdString + ", "; } zz++; corruptPartitionIdString = corruptPartitionIdString + partitionId; } String validPartitionIdString = ""; zz = 0; for (Integer partitionId : tableFiles.m_validPartitionIds.get(ii)) { if (zz != 0) { validPartitionIdString = validPartitionIdString + ", "; } zz++; validPartitionIdString = validPartitionIdString + partitionId; } if (corruptPartitionIdString.isEmpty()) { consistentTablesSeen.add(entry.getKey()); pw.println(indentString + tableFiles.m_files.get(ii).getPath() + " Completed: " + tableFiles.m_completed.get(ii) + " Partitions: " + validPartitionIdString); } else { pw.println(indentString + tableFiles.m_files.get(ii).getPath() + " Completed: " + tableFiles.m_completed.get(ii) + " Valid Partitions: " + validPartitionIdString + " Corrupt Partitions: " + corruptPartitionIdString); } } indentString = indentString.substring(2); } indentString = indentString.substring(1); StringBuilder missingTables = new StringBuilder(8192); if (!consistentTablesSeen.containsAll(digestTablesSeen)) { snapshotConsistent = false; missingTables.append("Missing tables: "); Set<String> missingTablesSet = new TreeSet<String>(digestTablesSeen); missingTablesSet.removeAll(consistentTablesSeen); int hh = 0; for (String tableName : missingTablesSet) { if (hh > 0) { missingTables.append(", "); } missingTables.append(tableName); hh++; } missingTables.append('\n'); } /* * Tack on a summary at the beginning to indicate whether a restore is guaranteed to succede * with this file set. */ if (snapshotConsistent) { return Pair.of( true, "Snapshot valid\n" + caw.toString()); } else { StringBuilder sb = new StringBuilder(8192); sb.append("Snapshot corrupted\n").append(missingTables).append(caw.toCharArray()); return Pair.of( false, sb.toString()); } } /** * Generates a Filename to the snapshot file for the given table. * @param table * @param fileNonce * @param hostId */ public static final String constructFilenameForTable(Table table, String fileNonce, String hostId, String siteId, String partitionId) { StringBuilder filename_builder = new StringBuilder(fileNonce); filename_builder.append("-"); filename_builder.append(table.getTypeName()); if (!table.getIsreplicated()) { filename_builder.append("-host_"); filename_builder.append(hostId); filename_builder.append("-site_"); filename_builder.append(siteId); filename_builder.append("-partition_"); filename_builder.append(partitionId); } filename_builder.append(".vpt");//Volt partitioned table return filename_builder.toString(); } public static final File constructFileForTable(Table table, String filePath, String fileNonce, String hostId, String siteId, String partitionId) { return new File(filePath, SnapshotUtil.constructFilenameForTable( table, fileNonce, hostId, siteId, partitionId)); } /** * Generates the digest filename for the given nonce. * @param nonce */ public static final String constructDigestFilenameForNonce(String nonce) { return (nonce + ".digest"); } public static final List<Table> getTablesToSave(Database database) { ArrayList<Table> my_tables = new ArrayList<Table>(); for (Table table : database.getTables()) { if (table.getSystable() || table.getMapreduce()) continue; // // Make a list of all non-materialized, non-export only tables // if ((table.getMaterializer() != null) || // (CatalogUtil.isTableExportOnly(database, table))) // { // continue; // } my_tables.add(table); } return my_tables; } public static final int[] getPartitionsOnHost( SystemProcedureExecutionContext c, Host h) { Collection<Partition> results = CatalogUtil.getPartitionsForHost(h); final int retval[] = new int[results.size()]; int ii = 0; for (final Partition p : results) { retval[ii++] = Integer.parseInt(p.getTypeName()); } return retval; } }