/** * Copyright 2010 The Apache Software Foundation * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.regionserver.wal; import static org.apache.hadoop.hbase.util.FSUtils.recoverFileLease; import java.io.EOFException; import java.io.IOException; import java.text.ParseException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.TreeMap; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.ThreadFactory; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.RemoteExceptionHandler; import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.wal.HLog.Entry; import org.apache.hadoop.hbase.regionserver.wal.HLog.Reader; import org.apache.hadoop.hbase.regionserver.wal.HLog.Writer; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.FSUtils; import com.google.common.util.concurrent.ThreadFactoryBuilder; /** * This class is responsible for splitting up a bunch of regionserver commit log * files that are no longer being written to, into new files, one per region for * region to replay on startup. Delete the old log files when finished. */ public class HLogSplitter { private static final String LOG_SPLITTER_IMPL = "hbase.hlog.splitter.impl"; static final Log LOG = LogFactory.getLog(HLogSplitter.class); private long splitTime = 0; private long splitSize = 0; /** * Name of file that holds recovered edits written by the wal log splitting * code, one per region */ public static final String RECOVERED_EDITS = "recovered.edits"; /** * Create a new HLogSplitter using the given {@link Configuration} and the * <code>hbase.hlog.splitter.impl</code> property to derived the instance * class to use. * * @param conf * @return New HLogSplitter instance */ public static HLogSplitter createLogSplitter(Configuration conf) { @SuppressWarnings("unchecked") Class<? extends HLogSplitter> splitterClass = (Class<? extends HLogSplitter>) conf .getClass(LOG_SPLITTER_IMPL, HLogSplitter.class); try { return splitterClass.newInstance(); } catch (InstantiationException e) { throw new RuntimeException(e); } catch (IllegalAccessException e) { throw new RuntimeException(e); } } // Private immutable datastructure to hold Writer and its Path. private final static class WriterAndPath { final Path p; final Writer w; WriterAndPath(final Path p, final Writer w) { this.p = p; this.w = w; } } /** * Split up a bunch of regionserver commit log files that are no longer being * written to, into new files, one per region for region to replay on startup. * Delete the old log files when finished. * * @param rootDir * qualified root directory of the HBase instance * @param srcDir * Directory of log files to split: e.g. * <code>${ROOTDIR}/log_HOST_PORT</code> * @param oldLogDir * directory where processed (split) logs will be archived to * @param fs * FileSystem * @param conf * Configuration * @throws IOException * will throw if corrupted hlogs aren't tolerated * @return the list of splits */ public List<Path> splitLog(final Path rootDir, final Path srcDir, Path oldLogDir, final FileSystem fs, final Configuration conf) throws IOException { long startTime = System.currentTimeMillis(); List<Path> splits = null; if (!fs.exists(srcDir)) { // Nothing to do return splits; } FileStatus[] logfiles = fs.listStatus(srcDir); if (logfiles == null || logfiles.length == 0) { // Nothing to do return splits; } LOG.info("Splitting " + logfiles.length + " hlog(s) in " + srcDir.toString()); splits = splitLog(rootDir, srcDir, oldLogDir, logfiles, fs, conf); try { FileStatus[] files = fs.listStatus(srcDir); for (FileStatus file : files) { Path newPath = HLog.getHLogArchivePath(oldLogDir, file.getPath()); LOG.info("Moving " + FSUtils.getPath(file.getPath()) + " to " + FSUtils.getPath(newPath)); if (!fs.rename(file.getPath(), newPath)) { throw new IOException("Unable to rename " + file.getPath() + " to " + newPath); } } LOG.debug("Moved " + files.length + " log files to " + FSUtils.getPath(oldLogDir)); if (!fs.delete(srcDir, true)) { throw new IOException("Unable to delete " + srcDir); } } catch (IOException e) { e = RemoteExceptionHandler.checkIOException(e); IOException io = new IOException("Cannot delete: " + srcDir); io.initCause(e); throw io; } splitTime = System.currentTimeMillis() - startTime; LOG.info("hlog file splitting completed in " + splitTime + " ms for " + srcDir.toString()); return splits; } /** * @return time that this split took */ public long getTime() { return this.splitTime; } /** * @return aggregate size of hlogs that were split */ public long getSize() { return this.splitSize; } /** * Sorts the HLog edits in the given list of logfiles (that are a mix of edits * on multiple regions) by region and then splits them per region directories, * in batches of (hbase.hlog.split.batch.size) * * A batch consists of a set of log files that will be sorted in a single map * of edits indexed by region the resulting map will be concurrently written * by multiple threads to their corresponding regions * * Each batch consists of more more log files that are - recovered (files is * opened for append then closed to ensure no process is writing into it) - * parsed (each edit in the log is appended to a list of edits indexed by * region see {@link #parseHLog} for more details) - marked as either * processed or corrupt depending on parsing outcome - the resulting edits * indexed by region are concurrently written to their corresponding region * region directories - original files are then archived to a different * directory * * * * @param rootDir * hbase directory * @param srcDir * logs directory * @param oldLogDir * directory where processed logs are archived to * @param logfiles * the list of log files to split * @param fs * @param conf * @return * @throws IOException */ private List<Path> splitLog(final Path rootDir, final Path srcDir, Path oldLogDir, final FileStatus[] logfiles, final FileSystem fs, final Configuration conf) throws IOException { List<Path> processedLogs = new ArrayList<Path>(); List<Path> corruptedLogs = new ArrayList<Path>(); final Map<byte[], WriterAndPath> logWriters = Collections .synchronizedMap(new TreeMap<byte[], WriterAndPath>( Bytes.BYTES_COMPARATOR)); List<Path> splits = null; // Number of logs in a read batch // More means faster but bigger mem consumption // TODO make a note on the conf rename and update hbase-site.xml if needed int logFilesPerStep = conf.getInt("hbase.hlog.split.batch.size", 3); boolean skipErrors = conf.getBoolean("hbase.hlog.split.skip.errors", false); splitSize = 0; try { int i = -1; while (i < logfiles.length) { final Map<byte[], LinkedList<Entry>> editsByRegion = new TreeMap<byte[], LinkedList<Entry>>( Bytes.BYTES_COMPARATOR); for (int j = 0; j < logFilesPerStep; j++) { i++; if (i == logfiles.length) { break; } FileStatus log = logfiles[i]; Path logPath = log.getPath(); long logLength = log.getLen(); splitSize += logLength; LOG.debug("Splitting hlog " + (i + 1) + " of " + logfiles.length + ": " + logPath + ", length=" + logLength); try { recoverFileLease(fs, logPath, conf); parseHLog(log, editsByRegion, fs, conf); processedLogs.add(logPath); } catch (EOFException eof) { // truncated files are expected if a RS crashes (see HBASE-2643) LOG.info("EOF from hlog " + logPath + ". continuing"); processedLogs.add(logPath); } catch (IOException e) { // If the IOE resulted from bad file format, // then this problem is idempotent and retrying won't help if (e.getCause() instanceof ParseException) { LOG.warn("ParseException from hlog " + logPath + ". continuing"); processedLogs.add(logPath); } else { if (skipErrors) { LOG.info("Got while parsing hlog " + logPath + ". Marking as corrupted", e); corruptedLogs.add(logPath); } else { throw e; } } } } writeEditsBatchToRegions(editsByRegion, logWriters, rootDir, fs, conf); } if (fs.listStatus(srcDir).length > processedLogs.size() + corruptedLogs.size()) { throw new OrphanHLogAfterSplitException( "Discovered orphan hlog after split. Maybe the " + "HRegionServer was not dead when we started"); } archiveLogs(corruptedLogs, processedLogs, oldLogDir, fs, conf); } finally { splits = new ArrayList<Path>(logWriters.size()); for (WriterAndPath wap : logWriters.values()) { wap.w.close(); splits.add(wap.p); LOG.debug("Closed " + wap.p); } } return splits; } /** * Takes splitLogsMap and concurrently writes them to region directories using a thread pool * * @param splitLogsMap map that contains the log splitting result indexed by region * @param logWriters map that contains a writer per region * @param rootDir hbase root dir * @param fs * @param conf * @throws IOException */ private void writeEditsBatchToRegions( final Map<byte[], LinkedList<Entry>> splitLogsMap, final Map<byte[], WriterAndPath> logWriters, final Path rootDir, final FileSystem fs, final Configuration conf) throws IOException { // Number of threads to use when log splitting to rewrite the logs. // More means faster but bigger mem consumption. int logWriterThreads = conf.getInt( "hbase.regionserver.hlog.splitlog.writer.threads", 3); boolean skipErrors = conf.getBoolean("hbase.skip.errors", false); HashMap<byte[], Future> writeFutureResult = new HashMap<byte[], Future>(); ThreadFactoryBuilder builder = new ThreadFactoryBuilder(); builder.setNameFormat("SplitWriter-%1$d"); ThreadFactory factory = builder.build(); ThreadPoolExecutor threadPool = (ThreadPoolExecutor)Executors.newFixedThreadPool(logWriterThreads, factory); for (final byte[] region : splitLogsMap.keySet()) { Callable splitter = createNewSplitter(rootDir, logWriters, splitLogsMap, region, fs, conf); writeFutureResult.put(region, threadPool.submit(splitter)); } threadPool.shutdown(); // Wait for all threads to terminate try { for (int j = 0; !threadPool.awaitTermination(5, TimeUnit.SECONDS); j++) { String message = "Waiting for hlog writers to terminate, elapsed " + j * 5 + " seconds"; if (j < 30) { LOG.debug(message); } else { LOG.info(message); } } } catch (InterruptedException ex) { LOG.warn("Hlog writers were interrupted, possible data loss!"); if (!skipErrors) { throw new IOException("Could not finish writing log entries", ex); // TODO maybe we should fail here regardless if skipErrors is active or not } } for (Map.Entry<byte[], Future> entry : writeFutureResult.entrySet()) { try { entry.getValue().get(); } catch (ExecutionException e) { throw (new IOException(e.getCause())); } catch (InterruptedException e1) { LOG.warn("Writer for region " + Bytes.toString(entry.getKey()) + " was interrupted, however the write process should have " + "finished. Throwing up ", e1); throw (new IOException(e1.getCause())); } } } /** * Moves processed logs to a oldLogDir after successful processing Moves * corrupted logs (any log that couldn't be successfully parsed to corruptDir * (.corrupt) for later investigation * * @param corruptedLogs * @param processedLogs * @param oldLogDir * @param fs * @param conf * @throws IOException */ private static void archiveLogs(final List<Path> corruptedLogs, final List<Path> processedLogs, final Path oldLogDir, final FileSystem fs, final Configuration conf) throws IOException { final Path corruptDir = new Path(conf.get(HConstants.HBASE_DIR), conf.get( "hbase.regionserver.hlog.splitlog.corrupt.dir", ".corrupt")); if (!fs.mkdirs(corruptDir)) { LOG.info("Unable to mkdir " + corruptDir); } fs.mkdirs(oldLogDir); for (Path corrupted : corruptedLogs) { Path p = new Path(corruptDir, corrupted.getName()); if (!fs.rename(corrupted, p)) { LOG.info("Unable to move corrupted log " + corrupted + " to " + p); } else { LOG.info("Moving corrupted log " + corrupted + " to " + p); } } for (Path p : processedLogs) { Path newPath = HLog.getHLogArchivePath(oldLogDir, p); if (!fs.rename(p, newPath)) { LOG.info("Unable to move " + p + " to " + newPath); } else { LOG.info("Archived processed log " + p + " to " + newPath); } } } /** * Path to a file under RECOVERED_EDITS_DIR directory of the region found in * <code>logEntry</code> named for the sequenceid in the passed * <code>logEntry</code>: e.g. /hbase/some_table/2323432434/recovered.edits/2332. * This method also ensures existence of RECOVERED_EDITS_DIR under the region * creating it if necessary. * @param fs * @param logEntry * @param rootDir HBase root dir. * @return Path to file into which to dump split log edits. * @throws IOException */ static Path getRegionSplitEditsPath(final FileSystem fs, final Entry logEntry, final Path rootDir) throws IOException { Path tableDir = HTableDescriptor.getTableDir(rootDir, logEntry.getKey() .getTablename()); Path regiondir = HRegion.getRegionDir(tableDir, Bytes.toString(logEntry.getKey().getEncodedRegionName())); if (!fs.exists(regiondir)) { LOG.info("This region's directory doesn't exist: " + regiondir.toString() + ". It is very likely that it was" + " already split so it's safe to discard those edits."); return null; } Path dir = HLog.getRegionDirRecoveredEditsDir(regiondir); if (!fs.exists(dir)) { if (!fs.mkdirs(dir)) LOG.warn("mkdir failed on " + dir); } return new Path(dir, formatRecoveredEditsFileName(logEntry.getKey() .getLogSeqNum())); } static String formatRecoveredEditsFileName(final long seqid) { return String.format("%019d", seqid); } /* * Parse a single hlog and put the edits in @splitLogsMap * * @param logfile to split * @param splitLogsMap output parameter: a map with region names as keys and a * list of edits as values * @param fs the filesystem * @param conf the configuration * @throws IOException if hlog is corrupted, or can't be open */ private void parseHLog(final FileStatus logfile, final Map<byte[], LinkedList<Entry>> splitLogsMap, final FileSystem fs, final Configuration conf) throws IOException { // Check for possibly empty file. With appends, currently Hadoop reports a // zero length even if the file has been sync'd. Revisit if HDFS-376 or // HDFS-878 is committed. long length = logfile.getLen(); if (length <= 0) { LOG.warn("File " + logfile.getPath() + " might be still open, length is 0"); } Path path = logfile.getPath(); Reader in; int editsCount = 0; try { in = getReader(fs, path, conf); } catch (EOFException e) { if (length <= 0) { //TODO should we ignore an empty, not-last log file if skip.errors is false? //Either way, the caller should decide what to do. E.g. ignore if this is the last //log in sequence. //TODO is this scenario still possible if the log has been recovered (i.e. closed) LOG.warn("Could not open " + path + " for reading. File is empty" + e); return; } else { throw e; } } try { Entry entry; while ((entry = in.next()) != null) { byte[] region = entry.getKey().getEncodedRegionName(); LinkedList<Entry> queue = splitLogsMap.get(region); if (queue == null) { queue = new LinkedList<Entry>(); splitLogsMap.put(region, queue); } queue.addLast(entry); editsCount++; } } finally { LOG.debug("Pushed=" + editsCount + " entries from " + path); try { if (in != null) { in.close(); } } catch (IOException e) { LOG .warn("Close log reader in finally threw exception -- continuing", e); } } } private Callable<Void> createNewSplitter(final Path rootDir, final Map<byte[], WriterAndPath> logWriters, final Map<byte[], LinkedList<Entry>> logEntries, final byte[] region, final FileSystem fs, final Configuration conf) { return new Callable<Void>() { public String getName() { return "Split writer thread for region " + Bytes.toStringBinary(region); } @Override public Void call() throws IOException { LinkedList<Entry> entries = logEntries.get(region); LOG.debug(this.getName() + " got " + entries.size() + " to process"); long threadTime = System.currentTimeMillis(); try { int editsCount = 0; WriterAndPath wap = logWriters.get(region); for (Entry logEntry : entries) { if (wap == null) { Path regionedits = getRegionSplitEditsPath(fs, logEntry, rootDir); if (regionedits == null) { // we already print a message if it's null in getRegionSplitEditsPath break; } if (fs.exists(regionedits)) { LOG.warn("Found existing old edits file. It could be the " + "result of a previous failed split attempt. Deleting " + regionedits + ", length=" + fs.getFileStatus(regionedits).getLen()); if (!fs.delete(regionedits, false)) { LOG.warn("Failed delete of old " + regionedits); } } Writer w = createWriter(fs, regionedits, conf); wap = new WriterAndPath(regionedits, w); logWriters.put(region, wap); LOG.debug("Creating writer path=" + regionedits + " region=" + Bytes.toStringBinary(region)); } wap.w.append(logEntry); editsCount++; } LOG.debug(this.getName() + " Applied " + editsCount + " total edits to " + Bytes.toStringBinary(region) + " in " + (System.currentTimeMillis() - threadTime) + "ms"); } catch (IOException e) { e = RemoteExceptionHandler.checkIOException(e); LOG.fatal(this.getName() + " Got while writing log entry to log", e); throw e; } return null; } }; } /** * Create a new {@link Writer} for writing log splits. * * @param fs * @param logfile * @param conf * @return A new Writer instance * @throws IOException */ protected Writer createWriter(FileSystem fs, Path logfile, Configuration conf) throws IOException { return HLog.createWriter(fs, logfile, conf); } /** * Create a new {@link Reader} for reading logs to split. * * @param fs * @param curLogFile * @param conf * @return A new Reader instance * @throws IOException */ protected Reader getReader(FileSystem fs, Path curLogFile, Configuration conf) throws IOException { return HLog.getReader(fs, curLogFile, conf); } }