/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.raid;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.Map.Entry;
import java.util.NavigableSet;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.util.InjectionEvent;
import org.apache.hadoop.util.InjectionHandler;
/**
 * Tracks block-recovery ("block fix") times and failures over several
 * sliding time windows (e.g. the last 5 hours / 1 day / 1 week) and
 * summarizes them as a bucketed histogram plus percentile values for
 * the web UI ({@link BlockFixStatus}).
 *
 * Successful recoveries are bucketed by {@code recoveryTime / dividend};
 * failed recoveries are recorded with the sentinel {@link #RECOVERY_FAIL}
 * and counted once per path. Counters for the LARGEST window are
 * maintained incrementally by {@link #put} / {@link #filterStalePoints};
 * the smaller windows are recomputed on demand by {@link #collectCounters}.
 *
 * All public methods are {@code synchronized}, so the concurrent
 * collections mainly protect readers outside this lock.
 */
public class RaidHistogram {
  protected static final Log LOG = LogFactory.getLog(RaidHistogram.class);
  // Config key for the divisor applied to every successful recovery
  // time before it is bucketed ("how much we divide from the value").
  public final static String RAID_HISTOGRAM_DIVIDEND_KEY =
      "hdfs.raid.histogram.dividend";
  // Sentinel value marking a failed recovery; never divided by dividend.
  public final static long RECOVERY_FAIL = Integer.MAX_VALUE;
  // it's unnecessary to record recovery time at msec level
  // Use second as unit should be enough (1000 ms per bucket unit)
  public final static int DEFAULT_RAID_HISTOGRAM_DIVIDEND = 1000;

  /**
   * A single recovery event.
   *
   * Ordered primarily by time so that time-range views
   * (headSet/tailSet) over the point set work.
   * NOTE(review): taskId is NOT part of the ordering, so two events with
   * identical time, value and path but different taskIds compare equal
   * and only one survives in the ConcurrentSkipListSet (which keys on
   * compareTo) — confirm this collapsing is intended; a collapsed add in
   * put() would leave the incremented counters without a matching point.
   */
  public class Point implements Comparable<Point> {
    public long time;     // event timestamp, ms since epoch
    public long value;    // bucketed recovery time, or RECOVERY_FAIL
    public String path;   // path of the file that was recovered
    public String taskId; // id of the task that performed the recovery

    public Point(long newValue, String newPath, long newTime, String newTaskId) {
      value = newValue;
      path = newPath;
      time = newTime;
      taskId = newTaskId;
    }

    // Order by time, then value, then path; the identity check
    // short-circuits the tie-break. taskId is ignored (see class note).
    public int compareTo(Point otherPoint) {
      if (this.time > otherPoint.time) return 1;
      if (this.time < otherPoint.time) return -1;
      if (this == otherPoint) return 0;
      if (this.value > otherPoint.value) return 1;
      if (this.value < otherPoint.value) return -1;
      return this.path.compareTo(otherPoint.path);
    }
  }

  /**
   * Fixed-length array of atomic counters; one slot per monitored
   * window (slot i corresponds to windows.get(i)).
   */
  public class CounterArray {
    private AtomicInteger[] counters;
    private int length;

    public CounterArray(int newLength) {
      length = newLength;
      counters = new AtomicInteger[length];
      for (int i = 0; i < length; i++) {
        counters[i] = new AtomicInteger(0);
      }
    }

    /** Decrements the counter for window {@code index} and returns the new value. */
    public int decrementAndGet(int index) {
      return counters[index].decrementAndGet();
    }

    /** Increments the counter for window {@code index} and returns the new value. */
    public int incrementAndGet(int index) {
      return counters[index].incrementAndGet();
    }

    /** Returns the raw counter for window {@code index}. */
    public AtomicInteger get(int index) {
      return counters[index];
    }
  }

  /**
   * Status used by web ui
   */
  public class BlockFixStatus {
    // Used for displaying histogram graph
    public int[] counters;
    // values at X%
    public long[] percentValues;
    // array of percents shown
    public ArrayList<Float> percents;
    // number of failed recovery paths
    public int failedPaths;

    public BlockFixStatus(int[] newCounters, long[] newPercentValues,
        ArrayList<Float> newPercents, int newFailedPaths) {
      counters = newCounters;
      percentValues = newPercentValues;
      percents = newPercents;
      failedPaths = newFailedPaths;
    }
  }

  // All recovery events, ordered by Point.compareTo (time-first).
  public ConcurrentSkipListSet<Point> points;
  // Per-path count of currently-tracked failed recoveries.
  public ConcurrentHashMap<String, AtomicInteger> failedRecoveredFiles;
  // Record the total number of failed paths for every window
  public CounterArray totalFailedPaths;
  // how large we monitor recent 5hours/1day/1week (ms; sorted ascending)
  public ArrayList<Long> windows;
  // Divisor applied to successful recovery times before bucketing.
  public int dividend = DEFAULT_RAID_HISTOGRAM_DIVIDEND;
  // how many windows we monitor
  public int windowNum;
  // Bucketed value -> per-window counters for successful recoveries.
  public ConcurrentSkipListMap<Long, CounterArray> histo;
  // Record the total number of points for every window
  public CounterArray totalPoints;

  /**
   * @param newWindows window lengths (ms) to monitor; sorted in place
   * @param conf source of the dividend override
   */
  public RaidHistogram(ArrayList<Long> newWindows, Configuration conf)
      throws Exception {
    initialize(newWindows);
    dividend = conf.getInt(RAID_HISTOGRAM_DIVIDEND_KEY,
        DEFAULT_RAID_HISTOGRAM_DIVIDEND);
  }

  /** Returns the number of recovery events currently tracked. */
  public synchronized int getNumberOfPoints() {
    return points.size();
  }

  /** Resets all state; windows are sorted ascending so the last is the largest. */
  public synchronized void initialize(ArrayList<Long> newWindows) {
    windows = newWindows;
    Collections.sort(windows);
    points = new ConcurrentSkipListSet<Point>();
    windowNum = windows.size();
    totalPoints = new CounterArray(windowNum);
    totalFailedPaths = new CounterArray(windowNum);
    histo = new ConcurrentSkipListMap<Long, CounterArray>();
    failedRecoveredFiles = new ConcurrentHashMap<String, AtomicInteger>();
  }

  // Only for testing
  // Each replacement window must be no larger than the one it replaces,
  // so that already-filtered points stay outside the new windows too.
  public synchronized void setNewWindows(ArrayList<Long> newWindows)
      throws IOException {
    if (newWindows.size() != windows.size()) {
      throw new IOException(
          "Number of new windows need to be the same as that of old ones");
    }
    Collections.sort(newWindows);
    for (int i = 0; i < newWindows.size(); i++) {
      if (newWindows.get(i) > windows.get(i)) {
        throw new IOException ("New window " + newWindows.get(i) +
            " should be smaller than the old one " + windows.get(i));
      }
      windows.set(i, newWindows.get(i));
    }
  }

  /**
   * Linear scan returning all points whose (bucketed) recovery time
   * equals {@code recoveryTime}; pass RECOVERY_FAIL for failed points.
   */
  public synchronized ArrayList<Point> getPointsWithGivenRecoveryTime(
      long recoveryTime) {
    ArrayList<Point> resultPoints = new ArrayList<Point>();
    Iterator<Point> iterator = this.points.iterator();
    while (iterator.hasNext()) {
      Point p = iterator.next();
      if (p.value == recoveryTime) {
        resultPoints.add(p);
      }
    }
    return resultPoints;
  }

  /*
   * If value is RECOVERY_FAIL, we consider it as recovery failure
   */
  // Records one recovery event, updating only the counters of the
  // LARGEST window (index windowNum - 1); smaller windows are rebuilt
  // later by collectCounters(). For a failure, totalFailedPaths is
  // bumped only on the path's first outstanding failure; for a success,
  // the value is bucketed by dividend and counted in histo.
  public synchronized void put(String path, long value, String taskId) {
    Point p;
    int last = windowNum - 1;
    if (value == RECOVERY_FAIL) {
      p = new Point(value, path, System.currentTimeMillis(), taskId);
      AtomicInteger counter = failedRecoveredFiles.get(path);
      if (counter == null) {
        counter = new AtomicInteger(0);
        failedRecoveredFiles.put(path, counter);
      }
      if (counter.incrementAndGet() == 1) {
        // First outstanding failure for this path.
        totalFailedPaths.get(last).incrementAndGet();
      }
    } else {
      value /= dividend;
      p = new Point(value, path, System.currentTimeMillis(), taskId);
      CounterArray counters = histo.get(value);
      if (counters == null) {
        counters = new CounterArray(windowNum);
        histo.put(value, counters);
      }
      counters.incrementAndGet(last);
      totalPoints.incrementAndGet(last);
    }
    // NOTE(review): add() is a no-op if an equal point (same time/value/
    // path — taskId is not compared) already exists; the counters above
    // would then be incremented without a stored point. See Point.
    points.add(p);
    InjectionHandler.processEvent(InjectionEvent.RAID_SEND_RECOVERY_TIME, this, path,
        value, taskId);
  }

  /**
   * Drops every point older than (endTime - largest window), reversing
   * the largest-window counter updates made by put() for each dropped
   * point. Throws IOException if the bookkeeping is inconsistent
   * (a point with no matching counter).
   */
  public synchronized void filterStalePoints(long endTime) throws IOException {
    int last = windowNum - 1;
    long windowTime = windows.get(last);
    // Sentinel point sorts before any real point at the cutoff time
    // (value 0, empty path), so headSet() is effectively "strictly older".
    NavigableSet<Point> windowSet =
        points.headSet(new Point(0, "", endTime - windowTime, null));
    Iterator<Point> windowIterator = windowSet.iterator();
    while (windowIterator.hasNext()) {
      Point p = windowIterator.next();
      if (p.value == RECOVERY_FAIL) {
        AtomicInteger ca = failedRecoveredFiles.get(p.path);
        if (ca == null) {
          throw new IOException(p.path +
              " doesn't have counter in failedRecoveredFiles");
        }
        if (ca.decrementAndGet() == 0) {
          // Last outstanding failure for this path expired.
          totalFailedPaths.decrementAndGet(last);
          failedRecoveredFiles.remove(p.path);
        }
      } else {
        CounterArray ca = histo.get(p.value);
        if (ca == null) {
          throw new IOException(p.value + " doesn't have counter in histo");
        }
        if (ca.decrementAndGet(last) == 0) {
          // Bucket is empty for the largest window; drop it entirely.
          histo.remove(p.value);
        }
        totalPoints.decrementAndGet(last);
      }
      // Safe while iterating: the skip-list iterator is weakly consistent.
      points.remove(p);
    }
  }

  /**
   * Recomputes the counters of every window EXCEPT the largest (which
   * put()/filterStalePoints() keep current) by scanning the points that
   * fall inside each window ending at {@code endTime}.
   */
  public synchronized void collectCounters(long endTime) throws IOException {
    for (int i = 0; i < windowNum - 1; i++) {
      long windowTime = windows.get(i);
      // reset totalFailedPaths and totalPoints
      totalFailedPaths.get(i).set(0);
      totalPoints.get(i).set(0);
      // Sentinel sorts before real points at the cutoff, so tailSet()
      // covers every point within this window.
      NavigableSet<Point> windowSet =
          points.tailSet(new Point(0, "", endTime - windowTime, null));
      Iterator<Point> windowIterator = windowSet.iterator();
      // NOTE(review): this local deliberately(?) shadows the
      // failedRecoveredFiles field; it tracks distinct failed paths
      // seen within this window only.
      Set<String> failedRecoveredFiles = new HashSet<String>();
      while (windowIterator.hasNext()) {
        Point p = windowIterator.next();
        if (p.value == RECOVERY_FAIL) {
          if (failedRecoveredFiles.add(p.path)) {
            totalFailedPaths.incrementAndGet(i);
          }
        } else {
          CounterArray ca = histo.get(p.value);
          if (ca == null) {
            throw new IOException(p.value + " doesn't have counter in histo");
          }
          ca.incrementAndGet(i);
          totalPoints.incrementAndGet(i);
        }
      }
    }
  }

  /**
   * Builds, per window, a histogram of {@code histoLen} equal-width
   * buckets over the bucketed recovery times plus the recovery-time
   * value at each requested percentile.
   *
   * @param histoLen number of display buckets
   * @param percents percentiles to report (fractions, e.g. 0.5, 0.9).
   *        NOTE(review): sorted in place — the caller's list is mutated.
   * @param endTime end of all windows (typically "now", ms)
   * @return window length -> BlockFixStatus for that window
   * @throws IOException if counter bookkeeping is inconsistent
   */
  public synchronized TreeMap<Long, BlockFixStatus> getBlockFixStatus(
      int histoLen, ArrayList<Float> percents, long endTime)
      throws IOException {
    TreeMap<Long, BlockFixStatus> blockFixStatuses =
        new TreeMap<Long, BlockFixStatus>();
    filterStalePoints(endTime);
    collectCounters(endTime);
    // Pre-fill with failure counts only; overwritten below with full
    // histogram data unless there is nothing to histogram.
    for (int i = 0; i < windowNum; i++) {
      blockFixStatuses.put(windows.get(i),
          new BlockFixStatus(null, null, percents,
              totalFailedPaths.get(i).get()));
    }
    int percentsNum = percents.size();
    if (percentsNum == 0 || histo.size() == 0) {
      return blockFixStatuses;
    }
    Collections.sort(percents);
    // Per-window percentile scan state: the point-count threshold for
    // the next percentile, and which percentile we are looking for.
    long[] percentThresholds = new long[windowNum];
    int[] percentIndexes = new int[windowNum];
    int[][] counters = new int[windowNum][];
    long[][] percentValues = new long[windowNum][];
    for (int i = 0; i < windowNum; i++) {
      percentIndexes[i] = 0;
      percentThresholds[i] = (long)(percents.get(0) * totalPoints.get(i).get());
      counters[i] = new int[histoLen];
      percentValues[i] = new long[percentsNum];
      Arrays.fill(percentValues[i], -1); // -1 marks "not reached"
    }
    // Width of each display bucket, covering values [0, histo.lastKey()].
    long width = (long)Math.ceil(histo.lastKey() * 1.0 / histoLen);
    int startIdx = 0;
    // iterate the histo
    Iterator<Entry<Long, CounterArray>> it = histo.entrySet().iterator();
    // Cumulative point count per window, for percentile detection.
    int[] currentCounter = new int[windowNum];
    // Windows still awaiting percentiles; windows drop out once all
    // their percentile values are filled.
    ArrayList<Integer> counterIndexes = new ArrayList<Integer>();
    for (int i = 0; i < windowNum; i++) {
      counterIndexes.add(i);
    }
    while (it.hasNext()) {
      Entry<Long, CounterArray> pairs = (Entry<Long, CounterArray>)it.next();
      Long recoveryTime = (Long)pairs.getKey();
      CounterArray counter = (CounterArray)pairs.getValue();
      // Advance to the display bucket containing this recovery time.
      if (startIdx * width + width <= recoveryTime && startIdx + 1 < histoLen) {
        startIdx++;
      }
      Iterator<Integer> iter = counterIndexes.iterator();
      while (iter.hasNext()) {
        int idx = iter.next();
        currentCounter[idx] += counter.counters[idx].get();
        counters[idx][startIdx] += counter.counters[idx].get();
        // Crossed the threshold: this recovery time is the value at the
        // current percentile for window idx.
        if (currentCounter[idx] >= percentThresholds[idx] && currentCounter[idx] > 0) {
          percentValues[idx][percentIndexes[idx]] = recoveryTime;
          percentIndexes[idx]++;
          if (percentIndexes[idx] == percentsNum) {
            // All percentiles found; retire this window from the scan.
            percentThresholds[idx] = RECOVERY_FAIL;
            iter.remove();
          } else {
            percentThresholds[idx] =
                (long)(percents.get(percentIndexes[idx]) *
                    totalPoints.get(idx).get());
          }
        }
      }
      // reset counters except the last window
      // (collectCounters() repopulates them on the next call)
      for (int i = 0; i < windowNum - 1; i++) {
        counter.get(i).set(0);
      }
    }
    for (int i = 0; i < windowNum; i++) {
      // Fill the values if all data are scanned.
      // (Propagate the last reached percentile value into any
      // higher percentiles that were never crossed.)
      if (percentIndexes[i] > 0 && percentIndexes[i] < percentsNum) {
        for (int j = percentIndexes[i]; j < percentsNum; j++) {
          percentValues[i][j] = percentValues[i][percentIndexes[i] - 1];
        }
      }
      blockFixStatuses.put(windows.get(i),
          new BlockFixStatus(counters[i], percentValues[i], percents,
              totalFailedPaths.get(i).get()));
    }
    return blockFixStatuses;
  }
}