/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.raid;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.util.StringUtils;
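/**
* Contains the core functionality of the block integrity monitor.
*
* Configuration options:
* raid.blockfix.classname - the class name of the integrity monitor
* implementation to use
*
* raid.blockfix.interval - interval between checks for lost files
* (default 1 minute)
*
* raid.corruptfilecount.interval - interval for the corrupt file count
* (default 10 minutes)
*
* raid.blockfix.noraid.replication - a file whose replication is at least
* this value is assumed not to be raided (default 3)
*
* raid.blockfix.read.timeout - read timeout
*
* raid.blockfix.write.timeout - write timeout
*/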
public abstract class BlockIntegrityMonitor extends Configured {
/** Configuration key naming the integrity monitor implementation class. */
public static final String BLOCKFIX_CLASSNAME = "raid.blockfix.classname";
/** Configuration key the constructor reads into {@code blockCheckInterval}. */
public static final String BLOCKCHECK_INTERVAL = "raid.blockfix.interval";
/** Configuration key the constructor reads into {@code corruptFileCountInterval}. */
public static final String CORRUPTFILECOUNT_INTERVAL = "raid.corruptfilecount.interval";
/** Configuration key for the read timeout; only declared here, not read by this class. */
public static final String BLOCKFIX_READ_TIMEOUT =
"raid.blockfix.read.timeout";
/** Configuration key for the write timeout; only declared here, not read by this class. */
public static final String BLOCKFIX_WRITE_TIMEOUT =
"raid.blockfix.write.timeout";
// If a file has replication at least this, we can assume it is not raided.
/** Configuration key the constructor reads into {@code notRaidedReplication}. */
public static final String NOT_RAIDED_REPLICATION =
"raid.blockfix.noraid.replication";
/** Fallback for {@link #BLOCKCHECK_INTERVAL} when the key is not configured. */
public static final long DEFAULT_BLOCKFIX_INTERVAL = 60 * 1000; // 1 min
/** Fallback for {@link #CORRUPTFILECOUNT_INTERVAL} when the key is not configured. */
public static final long DEFAULT_CORRUPTFILECOUNT_INTERVAL = 600 * 1000; //10min
/** Fallback for {@link #NOT_RAIDED_REPLICATION} when the key is not configured. */
public static final short DEFAULT_NOT_RAIDED_REPLICATION = 3;
/**
 * Builds the integrity monitor implementation selected by
 * {@link #BLOCKFIX_CLASSNAME}; {@link DistBlockIntegrityMonitor} is passed as
 * the default class for the lookup.
 *
 * The selected class must be assignable to {@link BlockIntegrityMonitor} and
 * expose a public constructor taking a single {@link Configuration}.
 *
 * @param conf configuration used both to pick the class and to construct it
 * @return a new monitor instance
 * @throws ClassNotFoundException if the selected class is not a
 *         {@link BlockIntegrityMonitor}, or if it cannot be constructed
 *         reflectively (the reflective failure is attached as the cause)
 */
public static BlockIntegrityMonitor createBlockIntegrityMonitor(
    Configuration conf) throws ClassNotFoundException {
  // default to distributed integrity monitor
  final Class<?> monitorClass =
      conf.getClass(BLOCKFIX_CLASSNAME, DistBlockIntegrityMonitor.class);
  final boolean isMonitor =
      BlockIntegrityMonitor.class.isAssignableFrom(monitorClass);
  if (!isMonitor) {
    throw new ClassNotFoundException("not an implementation of " +
        "blockintegritymonitor");
  }
  try {
    final Constructor<?> ctor =
        monitorClass.getConstructor(Configuration.class);
    final Object instance = ctor.newInstance(conf);
    return (BlockIntegrityMonitor) instance;
  } catch (NoSuchMethodException e) {
    throw new ClassNotFoundException("cannot construct integritymonitor", e);
  } catch (InstantiationException e) {
    throw new ClassNotFoundException("cannot construct integritymonitor", e);
  } catch (IllegalAccessException e) {
    throw new ClassNotFoundException("cannot construct integritymonitor", e);
  } catch (InvocationTargetException e) {
    throw new ClassNotFoundException("cannot construct integritymonitor", e);
  }
}
// Running totals. Each is read and updated only through the synchronized
// get*/incr* methods of this class, which also forward increments to
// RaidNodeMetrics.
private long numFilesFixed = 0;
private long numFileFixFailures = 0;
private long numFilesCopied = 0;
private long numFileCopyFailures = 0;
private long numBlockFixSimulationFailures = 0;
private long numBlockFixSimulationSuccess = 0;
// NOTE(review): no accessor for this counter appears in this class.
private long numFilesToFixDropped = 0;
// NOTE(review): presumably cleared to ask monitor threads to stop; this class
// never reads or writes it after initialization - confirm against subclasses.
public volatile boolean running = true;
// interval between checks for lost files
protected long blockCheckInterval;
// value of CORRUPTFILECOUNT_INTERVAL, loaded by the constructor
protected long corruptFileCountInterval;
// value of NOT_RAIDED_REPLICATION, loaded by the constructor
protected short notRaidedReplication;
public BlockIntegrityMonitor(Configuration conf) {
super(conf);
blockCheckInterval =
getConf().getLong(BLOCKCHECK_INTERVAL, DEFAULT_BLOCKFIX_INTERVAL);
corruptFileCountInterval =
getConf().getLong(CORRUPTFILECOUNT_INTERVAL, DEFAULT_CORRUPTFILECOUNT_INTERVAL);
notRaidedReplication = (short) getConf().getInt(
NOT_RAIDED_REPLICATION, DEFAULT_NOT_RAIDED_REPLICATION);
}
/**
 * Returns the running total of block-fix simulation successes recorded by
 * this monitor.
 *
 * @return current value of the simulation success counter
 */
public synchronized long getNumBlockFixSimulationSuccess() {
  final long successes = this.numBlockFixSimulationSuccess;
  return successes;
}
/**
 * Adds {@code incr} to the block-fix simulation success total, both in the
 * RaidNode metrics and in this monitor's own counter.
 *
 * @param incr amount to add; must not be negative
 * @throws IllegalArgumentException if {@code incr} is negative
 */
protected synchronized void incrNumBlockFixSimulationSuccess(long incr) {
  final boolean negative = incr < 0;
  if (negative) {
    throw new IllegalArgumentException("Cannot increment by negative value " +
        incr);
  }
  // Metrics are updated before the local counter, as in the other incr* methods.
  final RaidNodeMetrics metrics =
      RaidNodeMetrics.getInstance(RaidNodeMetrics.DEFAULT_NAMESPACE_ID);
  metrics.blockFixSimulationSuccess.inc(incr);
  this.numBlockFixSimulationSuccess = this.numBlockFixSimulationSuccess + incr;
}
/**
 * Returns the running total of block-fix simulation failures recorded by
 * this monitor.
 *
 * @return current value of the simulation failure counter
 */
public synchronized long getNumBlockFixSimulationFailures() {
  final long failures = this.numBlockFixSimulationFailures;
  return failures;
}
/**
 * Adds {@code incr} to the block-fix simulation failure total, both in the
 * RaidNode metrics and in this monitor's own counter.
 *
 * @param incr amount to add; must not be negative
 * @throws IllegalArgumentException if {@code incr} is negative
 */
protected synchronized void incrNumBlockFixSimulationFailures(long incr) {
  final boolean negative = incr < 0;
  if (negative) {
    throw new IllegalArgumentException("Cannot increment by negative value " +
        incr);
  }
  final RaidNodeMetrics metrics =
      RaidNodeMetrics.getInstance(RaidNodeMetrics.DEFAULT_NAMESPACE_ID);
  metrics.blockFixSimulationFailures.inc(incr);
  this.numBlockFixSimulationFailures = this.numBlockFixSimulationFailures + incr;
}
/**
 * Returns how many corrupt-file fix attempts have been recorded as failed.
 *
 * @return current value of the file fix failure counter
 */
public synchronized long getNumFileFixFailures() {
  final long failures = this.numFileFixFailures;
  return failures;
}
/**
 * Records one more corrupt-file fix failure, in the RaidNode metrics and in
 * this monitor's own counter.
 */
protected synchronized void incrFileFixFailures() {
  final RaidNodeMetrics metrics =
      RaidNodeMetrics.getInstance(RaidNodeMetrics.DEFAULT_NAMESPACE_ID);
  metrics.fileFixFailures.inc();
  this.numFileFixFailures += 1;
}
/**
 * Adds {@code incr} to the corrupt-file fix failure total, in the RaidNode
 * metrics and in this monitor's own counter.
 *
 * @param incr amount to add; must not be negative
 * @throws IllegalArgumentException if {@code incr} is negative
 */
protected synchronized void incrFileFixFailures(long incr) {
  final boolean negative = incr < 0;
  if (negative) {
    throw new IllegalArgumentException("Cannot increment by negative value " +
        incr);
  }
  final RaidNodeMetrics metrics =
      RaidNodeMetrics.getInstance(RaidNodeMetrics.DEFAULT_NAMESPACE_ID);
  metrics.fileFixFailures.inc(incr);
  this.numFileFixFailures = this.numFileFixFailures + incr;
}
/**
 * Returns how many corrupt files this integrity monitor has recorded as
 * fixed.
 *
 * @return current value of the files fixed counter
 */
public synchronized long getNumFilesFixed() {
  final long fixed = this.numFilesFixed;
  return fixed;
}
/**
 * Records one more fixed corrupt file, in the RaidNode metrics and in this
 * monitor's own counter.
 */
protected synchronized void incrFilesFixed() {
  final RaidNodeMetrics metrics =
      RaidNodeMetrics.getInstance(RaidNodeMetrics.DEFAULT_NAMESPACE_ID);
  metrics.filesFixed.inc();
  this.numFilesFixed += 1;
}
/**
 * Adds {@code incr} to the fixed corrupt file total, in the RaidNode metrics
 * and in this monitor's own counter.
 *
 * @param incr amount to add; must not be negative
 * @throws IllegalArgumentException if {@code incr} is negative
 */
protected synchronized void incrFilesFixed(long incr) {
  final boolean negative = incr < 0;
  if (negative) {
    throw new IllegalArgumentException("Cannot increment by negative value " +
        incr);
  }
  final RaidNodeMetrics metrics =
      RaidNodeMetrics.getInstance(RaidNodeMetrics.DEFAULT_NAMESPACE_ID);
  metrics.filesFixed.inc(incr);
  this.numFilesFixed = this.numFilesFixed + incr;
}
/**
 * Returns how many decommissioning file copies have been recorded as failed.
 *
 * @return current value of the file copy failure counter
 */
public synchronized long getNumFileCopyFailures() {
  final long failures = this.numFileCopyFailures;
  return failures;
}
/**
 * Adds {@code incr} to the decommissioning file copy failure total, in the
 * RaidNode metrics and in this monitor's own counter.
 *
 * @param incr amount to add; must not be negative
 * @throws IllegalArgumentException if {@code incr} is negative
 */
protected synchronized void incrFileCopyFailures(long incr) {
  final boolean negative = incr < 0;
  if (negative) {
    throw new IllegalArgumentException("Cannot increment by negative value " +
        incr);
  }
  final RaidNodeMetrics metrics =
      RaidNodeMetrics.getInstance(RaidNodeMetrics.DEFAULT_NAMESPACE_ID);
  metrics.fileCopyFailures.inc(incr);
  this.numFileCopyFailures = this.numFileCopyFailures + incr;
}
/**
 * Returns how many decommissioning files this integrity monitor has recorded
 * as copied.
 *
 * @return current value of the files copied counter
 */
public synchronized long getNumFilesCopied() {
  final long copied = this.numFilesCopied;
  return copied;
}
/**
 * Adds {@code incr} to the copied decommissioning file total, in the RaidNode
 * metrics and in this monitor's own counter.
 *
 * @param incr amount to add; must not be negative
 * @throws IllegalArgumentException if {@code incr} is negative
 */
protected synchronized void incrFilesCopied(long incr) {
  final boolean negative = incr < 0;
  if (negative) {
    throw new IllegalArgumentException("Cannot increment by negative value " +
        incr);
  }
  final RaidNodeMetrics metrics =
      RaidNodeMetrics.getInstance(RaidNodeMetrics.DEFAULT_NAMESPACE_ID);
  metrics.filesCopied.inc(incr);
  this.numFilesCopied = this.numFilesCopied + incr;
}
/**
 * Tells whether {@code p} is a source file, i.e. does not start with the
 * parity prefix of any registered codec.
 *
 * @param p path string to classify
 * @return {@code false} if {@code p} starts with some codec's parity prefix,
 *         {@code true} otherwise
 */
static boolean isSourceFile(String p) {
  boolean hasParityPrefix = false;
  for (Codec candidate : Codec.getCodecs()) {
    hasParityPrefix = p.startsWith(candidate.getParityPrefix());
    if (hasParityPrefix) {
      break;
    }
  }
  return !hasParityPrefix;
}
/**
 * Checks whether a parity file for {@code path} could exist at all.
 *
 * For every codec, the directory that would hold the parity file is derived
 * from the parent of {@code path} (or from the grandparent for directory-raid
 * codecs). If none of those directories exists on {@code parityFs}, no parity
 * file can exist.
 *
 * A codec is skipped when the ancestor it needs is missing, e.g. when
 * {@code path} is the root. Previously only the directory-raid branch did
 * this, and the other branch dereferenced a null parent.
 *
 * @param parityFs file system on which parity directories are looked up
 * @param path path of the lost file
 * @return {@code true} if at least one candidate parity directory exists
 * @throws IOException if the existence check on {@code parityFs} fails
 */
static boolean doesParityDirExist(FileSystem parityFs, String path)
    throws IOException {
  // Check if it is impossible to have a parity file. We check if the
  // parent directory of the lost file exists under a parity path.
  // If the directory does not exist, the parity file cannot exist.
  Path fileRaidParent = new Path(path).getParent();
  Path dirRaidParent =
      (fileRaidParent != null) ? fileRaidParent.getParent() : null;
  for (Codec codec : Codec.getCodecs()) {
    // Directory-raid codecs key parity on the grandparent, others on the parent.
    Path sourceDir = codec.isDirRaid ? dirRaidParent : fileRaidParent;
    if (sourceDir == null) {
      // No such ancestor, so this codec cannot have a parity directory.
      continue;
    }
    // A depth of 0 means the parity directory is the prefix itself.
    Path parityDir = (sourceDir.depth() == 0)
        ? new Path(codec.getParityPrefix())
        : new Path(codec.getParityPrefix(),
            RaidNode.makeRelative(sourceDir));
    if (parityFs.exists(parityDir)) {
      return true;
    }
  }
  return false;
}
/**
 * Removes from the iterated collection every source file for which no parity
 * directory exists. Entries that are not source files are always kept.
 *
 * @param parityFs file system on which parity directories are looked up
 * @param it iterator over path strings; must support {@code remove()}
 * @throws IOException if a parity directory lookup fails
 */
void filterUnreconstructableSourceFiles(FileSystem parityFs,
    Iterator<String> it)
    throws IOException {
  while (it.hasNext()) {
    final String candidate = it.next();
    if (!isSourceFile(candidate)) {
      // Not a source file: nothing to check, keep it.
      continue;
    }
    if (doesParityDirExist(parityFs, candidate)) {
      continue;
    }
    it.remove();
  }
}
/**
 * Returns a {@link Status} snapshot for this monitor. What is aggregated into
 * it is defined by the concrete implementation.
 */
public abstract Status getAggregateStatus();
/**
 * Snapshot of the monitor's state: corrupt file counts per priority, the
 * jobs it reports, and the names of the high priority corrupt files.
 * The creation time is recorded so that {@link #toHtml(int)} can show the
 * age of the snapshot.
 */
public static class Status {
  final int highPriorityFiles;
  final int lowPriorityFiles;
  final int lowestPriorityFiles;
  final List<JobStatus> jobs;
  final List<JobStatus> simFailJobs;
  final List<String> highPriorityFileNames;
  // Value of RaidNode.now() when this snapshot was constructed.
  final long lastUpdateTime;

  /**
   * Creates a snapshot stamped with the current {@code RaidNode.now()}.
   * The lists are stored as given, without copying.
   *
   * @param highPriorityFiles number of high priority corrupt files
   * @param lowPriorityFiles number of low priority corrupt files
   * @param lowestPriorityFiles number of lowest priority corrupt files
   * @param jobs jobs reported as running
   * @param highPriorityFileNames names of the high priority corrupt files
   * @param simFailJobs jobs reported as simulated failures
   */
  protected Status(int highPriorityFiles, int lowPriorityFiles,
      int lowestPriorityFiles,
      List<JobStatus> jobs, List<String> highPriorityFileNames,
      List<JobStatus> simFailJobs) {
    this.highPriorityFiles = highPriorityFiles;
    this.lowPriorityFiles = lowPriorityFiles;
    this.lowestPriorityFiles = lowestPriorityFiles;
    this.jobs = jobs;
    this.simFailJobs = simFailJobs;
    this.highPriorityFileNames = highPriorityFileNames;
    this.lastUpdateTime = RaidNode.now();
  }

  /**
   * Returns a one-line summary with the three file counts and the two job
   * list sizes.
   */
  @Override
  public String toString() {
    StringBuilder result = new StringBuilder();
    result.append(BlockIntegrityMonitor.class.getSimpleName())
        .append(" Status:");
    result.append(" HighPriorityFiles:").append(highPriorityFiles);
    result.append(" LowPriorityFiles:").append(lowPriorityFiles);
    // Fixed: this used to print lowPriorityFiles under the "Lowest" label.
    result.append(" LowestPriorityFiles:").append(lowestPriorityFiles);
    result.append(" Jobs:").append(jobs.size());
    result.append(" Sim Fail Jobs: ").append(simFailJobs.size());
    return result.toString();
  }

  /**
   * Renders this snapshot as HTML.
   *
   * The output always starts with a summary table. When
   * {@code numCorruptToReport} is positive it is followed by the simulated
   * failed jobs table and the running jobs table (each only if non-empty),
   * and by a table listing up to {@code numCorruptToReport} high priority
   * corrupt file names.
   *
   * @param numCorruptToReport maximum number of high priority file names to
   *        list; a value of zero or less yields the summary table only
   * @return the HTML fragment
   */
  public String toHtml(int numCorruptToReport) {
    long now = RaidNode.now();
    StringBuilder summary = new StringBuilder();
    summary.append(tr(td("High Priority Corrupted Files") + td(":") +
        td(StringUtils.humanReadableInt(highPriorityFiles))));
    summary.append(tr(td("Low Priority Corrupted Files") + td(":") +
        td(StringUtils.humanReadableInt(lowPriorityFiles))));
    summary.append(tr(td("Lowest Priority Corrupted Files") + td(":") +
        td(StringUtils.humanReadableInt(lowestPriorityFiles))));
    summary.append(tr(td("Running Jobs") + td(":") +
        td(jobs.size() + "")));
    summary.append(tr(td("Simulated Failed Jobs") + td(":") +
        td(simFailJobs.size() + "")));
    summary.append(tr(td("Last Update") + td(":") +
        td(StringUtils.formatTime(now - lastUpdateTime) + " ago")));
    String html = JspUtils.tableSimple(summary.toString());
    if (numCorruptToReport <= 0) {
      return html;
    }
    if (simFailJobs.size() > 0) {
      html += "<br><h4>Simulated Fail Jobs:</h4><br>" + jobTable(simFailJobs);
    }
    if (jobs.size() > 0) {
      html += "<br>" + jobTable(jobs);
    }
    // Never list more names than are available.
    numCorruptToReport =
        Math.min(numCorruptToReport, highPriorityFileNames.size());
    if (numCorruptToReport > 0) {
      StringBuilder rows = new StringBuilder();
      // Only the first row carries the label; later rows leave it blank.
      rows.append(tr(td("High Priority Corrupted Files") +
          td(":") + td(highPriorityFileNames.get(0))));
      for (int i = 1; i < numCorruptToReport; ++i) {
        rows.append(tr(td("") + td(":") +
            td(highPriorityFileNames.get(i))));
      }
      html += "<br>" + JspUtils.tableSimple(rows.toString());
    }
    return html;
  }

  /** Renders a header row plus one row per job, wrapped in a table. */
  private static String jobTable(List<JobStatus> statuses) {
    StringBuilder rows = new StringBuilder();
    rows.append(tr(JobStatus.htmlRowHeader()));
    for (JobStatus job : statuses) {
      rows.append(tr(job.htmlRow()));
    }
    return JspUtils.table(rows.toString());
  }
}
/**
 * Identifier, name and URL of one job, with helpers to render it as cells of
 * an HTML table row. All three values are stored as non-null strings.
 */
public static class JobStatus {
  final String id;
  final String name;
  final String url;

  /**
   * Creates a job status; any {@code null} argument is stored as the empty
   * string.
   *
   * @param id job identifier, stored via {@code toString()}
   * @param name job name
   * @param url job URL
   */
  JobStatus(JobID id, String name, String url) {
    if (id == null) {
      this.id = "";
    } else {
      this.id = id.toString();
    }
    if (name == null) {
      this.name = "";
    } else {
      this.name = name;
    }
    if (url == null) {
      this.url = "";
    } else {
      this.url = url;
    }
  }

  /** Returns the id, name and url as space-separated {@code key:value} pairs. */
  @Override
  public String toString() {
    StringBuilder text = new StringBuilder();
    text.append("id:").append(id);
    text.append(" name:").append(name);
    text.append(" url:").append(url);
    return text.toString();
  }

  /** Returns the header cells matching the columns of {@link #htmlRow()}. */
  public static String htmlRowHeader() {
    final String idHeader = td("JobID");
    final String nameHeader = td("JobName");
    return idHeader + nameHeader;
  }

  /** Returns two cells: the id as a link to the url, then the name. */
  public String htmlRow() {
    final String linkedId = td(JspUtils.link(id, url));
    final String nameCell = td(name);
    return linkedId + nameCell;
  }
}
/** Shorthand for {@code JspUtils.td(s)}. */
private static String td(String s) {
  final String cell = JspUtils.td(s);
  return cell;
}
/** Shorthand for {@code JspUtils.tr(s)}. */
private static String tr(String s) {
  final String row = JspUtils.tr(s);
  return row;
}
/**
 * Returns the corruption monitor task supplied by the concrete implementation.
 * NOTE(review): presumably the task that detects and fixes corrupt files - confirm against subclasses.
 */
public abstract Runnable getCorruptionMonitor();
/**
 * Returns the decommissioning monitor task supplied by the concrete implementation.
 * NOTE(review): presumably the task that copies decommissioning files - confirm against subclasses.
 */
public abstract Runnable getDecommissioningMonitor();
/**
 * Returns the corrupt file counter task supplied by the concrete implementation.
 * NOTE(review): presumably scheduled using corruptFileCountInterval - confirm against subclasses.
 */
public abstract Runnable getCorruptFileCounter();
}