package org.archive.hadoop.mapreduce; import java.io.DataOutputStream; import java.io.IOException; import org.apache.hadoop.io.Text; public class OvercrawlZipNumRecordWriter extends ZipNumRecordWriter { private int dayLimit = -1; private int curDayCount = 0; private String lastDay = null; public OvercrawlZipNumRecordWriter(int limit, int dayLimit, DataOutputStream outMain, DataOutputStream outSummary) { super(limit, outMain, outSummary); this.dayLimit = dayLimit; lastDay = null; curDayCount = 0; } public void write(Text key, Text val) throws IOException, InterruptedException { String urlPlusDate = key.toString(); int spaceIdx = urlPlusDate.indexOf(delim); if(spaceIdx > 0) { if(spaceIdx + 8 < urlPlusDate.length()) { String tmp = urlPlusDate.substring(0,spaceIdx + 8); boolean filter = false; if(lastDay == null) { lastDay = tmp; curDayCount = 1; } else { if(lastDay.compareTo(tmp) == 0) { // same day.. how many so far? if(curDayCount > dayLimit) { filter = true; } } else { // a new day.. lastDay = tmp; curDayCount = 0; } } if(!filter) { super.write(key, val); } } } } }