/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.listcrawler;
import java.io.IOException;
import java.util.Date;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.commoncrawl.util.time.Month;
import org.commoncrawl.util.time.SerialDate;
import org.commoncrawl.util.Tuples.Pair;
import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;
/**
 * Command-line utility that purges aged proxy cache and proxy history files
 * from both the configured (default) Hadoop file system and the local file
 * system.
 *
 * <p>
 * NOTE(review): the only supported command is named {@code list}, but it does
 * not merely list candidates - every candidate that is printed is also
 * deleted. The name is kept for backward compatibility.
 *
 * @author rana
 *
 */
public class ProxyPurgeUtils {

  /**
   * Scale factor relating a "processed" directory name to the timestamps of
   * the cache files it covers: directory name * scale is compared against the
   * cut-off time, and cache file timestamp / scale is used to look up the
   * directory's ranges.
   */
  private static final long TIMESTAMP_SCALE = 1000000000L;

  /** Usage text printed when the command line is missing or not recognized. */
  private static final String USAGE = "Usage: ProxyPurgeUtils list";

  /**
   * Entry point. Expects a single command as the first argument; the only
   * recognized command is {@code list} (case-insensitive), which runs
   * {@link #listCandidates(Configuration, long)} with the cut-off computed by
   * {@link #caculateCutOffMillisecond()}.
   *
   * @param args command line arguments; {@code args[0]} is the command
   * @throws IOException if a file system operation fails
   */
  public static void main(String[] args) throws IOException {
    // guard against a missing command instead of failing on args[0]
    if (args == null || args.length == 0) {
      System.err.println(USAGE);
      return;
    }

    Configuration conf = new Configuration();
    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");

    if (args[0].equalsIgnoreCase("list")) {
      listCandidates(conf, caculateCutOffMillisecond());
    } else {
      System.err.println("Unknown command: " + args[0]);
      System.err.println(USAGE);
    }
  }

  /**
   * Computes the purge cut-off time. The (misspelled) method name is retained
   * so that existing callers keep working.
   *
   * @return the value of {@code Month.getFirstMillisecond()} for the month
   *         containing the date 120 days before today - presumably the first
   *         millisecond of that month in epoch milliseconds (TODO confirm
   *         against the Month implementation)
   */
  static long caculateCutOffMillisecond() {
    // convert the current time to a serial date
    SerialDate today = SerialDate.createInstance(new Date(System.currentTimeMillis()));
    // step back 120 days (roughly four months)
    SerialDate cutOffDate = SerialDate.addDays(-120, today);
    // widen to the month containing that date
    Month cutOffMonth = new Month(cutOffDate.toDate());
    // the cut-off is the first millisecond reported for that month
    return cutOffMonth.getFirstMillisecond();
  }

  /**
   * An inclusive [start, end] pair of timestamps ({@code e0} is the start,
   * {@code e1} the end), ordered by start and then by end.
   */
  static class Range extends Pair<Long, Long> implements Comparable<Pair<Long, Long>> {

    public Range(Long e0, Long e1) {
      super(e0, e1);
    }

    /**
     * Orders by {@code e0}, using {@code e1} as a tie-breaker. The tie-breaker
     * matters because ranges are stored as values of a TreeMultimap, which
     * keeps each key's values in a sorted set: two ranges that compare as
     * equal would collapse into one and the second would be silently dropped.
     */
    @Override
    public int compareTo(Pair<Long, Long> o2) {
      int byStart = e0.compareTo(o2.e0);
      return (byStart != 0) ? byStart : e1.compareTo(o2.e1);
    }
  }

  // NOTE(review): the original source carried the bare value 1312941988792
  // here - presumably a sample cut-off / timestamp in epoch milliseconds;
  // confirm or remove.

  /**
   * Finds and DELETES purge candidates (despite the name, see class comment).
   * Each deleted path is first printed to standard out.
   *
   * <ol>
   * <li>Collects the ranges of all "processed" cache export directories whose
   * scaled name is below the cut-off.</li>
   * <li>Deletes (non-recursively) {@code cacheData-} / {@code cacheIndex-}
   * files whose timestamp falls inside one of those ranges, on the default
   * and on the local file system.</li>
   * <li>Deletes (recursively) {@code historyData-} /
   * {@code historyBloomFilter-} entries whose timestamp is below the cut-off,
   * on the default and on the local file system.</li>
   * </ol>
   *
   * Entries whose names cannot be parsed are reported on standard error and
   * left untouched.
   *
   * @param conf                  Hadoop configuration used to obtain the file systems
   * @param cutOffTimeMillisecond entries with a timestamp strictly below this value are purged
   * @throws IOException if a file system operation fails
   */
  static void listCandidates(Configuration conf, final long cutOffTimeMillisecond) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FileSystem localFS = FileSystem.getLocal(conf);

    final Multimap<Long, Range> rangeMap = collectExpiredRanges(fs, cutOffTimeMillisecond);

    // accepts cache data / index files covered by an expired processed range
    PathFilter cacheDataFilter = new PathFilter() {
      @Override
      public boolean accept(Path path) {
        String name = path.getName();
        if (!name.startsWith("cacheData-") && !name.startsWith("cacheIndex-")) {
          return false;
        }
        Long parsed = parseLongOrNull(name.substring(name.indexOf('-') + 1));
        if (parsed == null) {
          System.err.println("Skipping (unparseable timestamp):" + path);
          return false;
        }
        long timestamp = parsed.longValue();
        // ranges are keyed by the processed directory name, i.e. the scaled-down timestamp
        long timestampPrefix = timestamp / TIMESTAMP_SCALE;
        for (Range range : rangeMap.get(timestampPrefix)) {
          if (timestamp >= range.e0 && timestamp <= range.e1) {
            return true;
          }
        }
        return false;
      }
    };

    // accepts history data / bloom filter entries older than the cut-off
    PathFilter historyDataFilter = new PathFilter() {
      @Override
      public boolean accept(Path path) {
        String name = path.getName();
        if (!name.startsWith("historyData-") && !name.startsWith("historyBloomFilter-")) {
          return false;
        }
        int timestampStart = name.indexOf('-') + 1;
        int indexOfDot = name.indexOf('.');
        // the timestamp ends at the first dot, if the name has one
        String timestampText = (indexOfDot != -1)
            ? name.substring(timestampStart, indexOfDot)
            : name.substring(timestampStart);
        Long timestamp = parseLongOrNull(timestampText);
        if (timestamp == null) {
          System.err.println("Skipping (unparseable timestamp):" + path);
          return false;
        }
        return timestamp.longValue() < cutOffTimeMillisecond;
      }
    };

    // cache data: default file system, then local copy
    purge(fs, "crawl/proxy/cache/*", cacheDataFilter, false);
    purge(localFS, "/home/rana/ccprod/data/proxy_data/ccn01-Prod/*", cacheDataFilter, false);

    // history / bloom filter data: default file system, then local copy
    purge(fs, "crawl/proxy/history/*", historyDataFilter, true);
    purge(localFS, "/home/rana/ccprod/data/proxy_data/ccn01-Prod/historyData/*", historyDataFilter, true);
  }

  /**
   * Scans the processed cache export directories and gathers, keyed by the
   * numeric directory name, the ranges found under every directory whose
   * scaled name is below the cut-off. Range entries are expected to be named
   * {@code <start>-<end>}.
   */
  private static Multimap<Long, Range> collectExpiredRanges(FileSystem fs, long cutOffTimeMillisecond)
      throws IOException {
    Multimap<Long, Range> rangeMap = TreeMultimap.create();

    FileStatus[] candidateDirs = fs.globStatus(new Path("crawl/proxy/cacheExport/processed/*"));
    if (candidateDirs == null) {
      // globStatus may return null when the path does not exist
      return rangeMap;
    }

    for (FileStatus candidate : candidateDirs) {
      String dirName = candidate.getPath().getName();
      Long timestampPrefix = parseLongOrNull(dirName);
      if (timestampPrefix == null) {
        System.err.println("Skipping (non-numeric directory name):" + candidate.getPath());
        continue;
      }

      // scale the directory name up so it is comparable with the cut-off
      long timestampStart = timestampPrefix.longValue() * TIMESTAMP_SCALE;
      if (timestampStart >= cutOffTimeMillisecond) {
        continue;
      }

      FileStatus[] ranges = fs.globStatus(new Path(candidate.getPath(), "*"));
      if (ranges == null) {
        continue;
      }
      for (FileStatus range : ranges) {
        String rangeName = range.getPath().getName();
        int dash = rangeName.indexOf('-');
        Long rangeStart = (dash > 0) ? parseLongOrNull(rangeName.substring(0, dash)) : null;
        Long rangeEnd = (dash > 0) ? parseLongOrNull(rangeName.substring(dash + 1)) : null;
        if (rangeStart == null || rangeEnd == null) {
          System.err.println("Skipping (malformed range name):" + range.getPath());
          continue;
        }
        rangeMap.put(timestampPrefix, new Range(rangeStart, rangeEnd));
      }
    }
    return rangeMap;
  }

  /**
   * Prints and deletes every path on {@code fileSystem} that matches
   * {@code globPattern} and is accepted by {@code filter}.
   */
  private static void purge(FileSystem fileSystem, String globPattern, PathFilter filter, boolean recursive)
      throws IOException {
    FileStatus[] candidates = fileSystem.globStatus(new Path(globPattern), filter);
    if (candidates == null) {
      // nothing to purge: globStatus may return null when the path does not exist
      return;
    }
    for (FileStatus candidate : candidates) {
      System.out.println("Purging Candidate:" + candidate.getPath());
      fileSystem.delete(candidate.getPath(), recursive);
    }
  }

  /** Parses {@code text} as a decimal long, returning null instead of throwing when it is not one. */
  private static Long parseLongOrNull(String text) {
    try {
      return Long.valueOf(text);
    } catch (NumberFormatException e) {
      return null;
    }
  }
}