package edu.umd.cloud9.collection.clue;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import com.google.common.base.Preconditions;
/**
 * Class that provides convenience methods for processing portions of the Clue
 * Web collection with Hadoop. Static methods in this class allow the user to
 * easily "select" different portions of the collection to serve as input to a
 * MapReduce job.
 *
 * <p>
 * All paths are stored relative to the root of the collection; each public
 * method prepends the caller-supplied base path and registers the result as
 * an input path on the supplied <code>JobConf</code>.
 *
 * @author Jimmy Lin
 */
public class ClueCollectionPathConstants {

  /** A single compressed WARC file, used for quick tests. */
  private static final String[] sEnglishTest = { "ClueWeb09_English_1/en0000/00.warc.gz" };

  /** The first directory of the first part of the English collection. */
  private static final String[] sEnglishTiny = { "ClueWeb09_English_1/en0000/" };

  /**
   * Directories of the English collection, grouped by part. Element
   * <code>k</code> holds the directories of <code>ClueWeb09_English_(k+1)</code>.
   * This is the single source of truth from which the "small" and "complete"
   * path lists are derived.
   */
  private static final String[][] sEnglishSections = {
      { "ClueWeb09_English_1/en0000/", "ClueWeb09_English_1/en0001/",
        "ClueWeb09_English_1/en0002/", "ClueWeb09_English_1/en0003/",
        "ClueWeb09_English_1/en0004/", "ClueWeb09_English_1/en0005/",
        "ClueWeb09_English_1/en0006/", "ClueWeb09_English_1/en0007/",
        "ClueWeb09_English_1/en0008/", "ClueWeb09_English_1/en0009/",
        "ClueWeb09_English_1/en0010/", "ClueWeb09_English_1/en0011/",
        "ClueWeb09_English_1/enwp00/", "ClueWeb09_English_1/enwp01/",
        "ClueWeb09_English_1/enwp02/", "ClueWeb09_English_1/enwp03/" },

      { "ClueWeb09_English_2/en0012", "ClueWeb09_English_2/en0013",
        "ClueWeb09_English_2/en0014", "ClueWeb09_English_2/en0015",
        "ClueWeb09_English_2/en0016", "ClueWeb09_English_2/en0017",
        "ClueWeb09_English_2/en0018", "ClueWeb09_English_2/en0019",
        "ClueWeb09_English_2/en0020", "ClueWeb09_English_2/en0021",
        "ClueWeb09_English_2/en0022", "ClueWeb09_English_2/en0023",
        "ClueWeb09_English_2/en0024", "ClueWeb09_English_2/en0025",
        "ClueWeb09_English_2/en0026" },

      { "ClueWeb09_English_3/en0027", "ClueWeb09_English_3/en0028",
        "ClueWeb09_English_3/en0029", "ClueWeb09_English_3/en0030",
        "ClueWeb09_English_3/en0031", "ClueWeb09_English_3/en0032",
        "ClueWeb09_English_3/en0033", "ClueWeb09_English_3/en0034",
        "ClueWeb09_English_3/en0035", "ClueWeb09_English_3/en0036",
        "ClueWeb09_English_3/en0037", "ClueWeb09_English_3/en0038",
        "ClueWeb09_English_3/en0039", "ClueWeb09_English_3/en0040" },

      { "ClueWeb09_English_4/en0041", "ClueWeb09_English_4/en0042",
        "ClueWeb09_English_4/en0043", "ClueWeb09_English_4/en0044",
        "ClueWeb09_English_4/en0045", "ClueWeb09_English_4/en0046",
        "ClueWeb09_English_4/en0047", "ClueWeb09_English_4/en0048",
        "ClueWeb09_English_4/en0049", "ClueWeb09_English_4/en0050",
        "ClueWeb09_English_4/en0051", "ClueWeb09_English_4/en0052",
        "ClueWeb09_English_4/en0053", "ClueWeb09_English_4/en0054" },

      { "ClueWeb09_English_5/en0055", "ClueWeb09_English_5/en0056",
        "ClueWeb09_English_5/en0057", "ClueWeb09_English_5/en0058",
        "ClueWeb09_English_5/en0059", "ClueWeb09_English_5/en0060",
        "ClueWeb09_English_5/en0061", "ClueWeb09_English_5/en0062",
        "ClueWeb09_English_5/en0063", "ClueWeb09_English_5/en0064",
        "ClueWeb09_English_5/en0065", "ClueWeb09_English_5/en0066",
        "ClueWeb09_English_5/en0067", "ClueWeb09_English_5/en0068" },

      { "ClueWeb09_English_6/en0069", "ClueWeb09_English_6/en0070",
        "ClueWeb09_English_6/en0071", "ClueWeb09_English_6/en0072",
        "ClueWeb09_English_6/en0073", "ClueWeb09_English_6/en0074",
        "ClueWeb09_English_6/en0075", "ClueWeb09_English_6/en0076",
        "ClueWeb09_English_6/en0077", "ClueWeb09_English_6/en0078",
        "ClueWeb09_English_6/en0079", "ClueWeb09_English_6/en0080",
        "ClueWeb09_English_6/en0081", "ClueWeb09_English_6/en0082" },

      { "ClueWeb09_English_7/en0083", "ClueWeb09_English_7/en0084",
        "ClueWeb09_English_7/en0085", "ClueWeb09_English_7/en0086",
        "ClueWeb09_English_7/en0087", "ClueWeb09_English_7/en0088",
        "ClueWeb09_English_7/en0089", "ClueWeb09_English_7/en0090",
        "ClueWeb09_English_7/en0091", "ClueWeb09_English_7/en0092",
        "ClueWeb09_English_7/en0093", "ClueWeb09_English_7/en0094",
        "ClueWeb09_English_7/en0095", "ClueWeb09_English_7/en0096" },

      { "ClueWeb09_English_8/en0097", "ClueWeb09_English_8/en0098",
        "ClueWeb09_English_8/en0099", "ClueWeb09_English_8/en0100",
        "ClueWeb09_English_8/en0101", "ClueWeb09_English_8/en0102",
        "ClueWeb09_English_8/en0103", "ClueWeb09_English_8/en0104",
        "ClueWeb09_English_8/en0105", "ClueWeb09_English_8/en0106",
        "ClueWeb09_English_8/en0107", "ClueWeb09_English_8/en0108",
        "ClueWeb09_English_8/en0109" },

      { "ClueWeb09_English_9/en0110", "ClueWeb09_English_9/en0111",
        "ClueWeb09_English_9/en0112", "ClueWeb09_English_9/en0113",
        "ClueWeb09_English_9/en0114", "ClueWeb09_English_9/en0115",
        "ClueWeb09_English_9/en0116", "ClueWeb09_English_9/en0117",
        "ClueWeb09_English_9/en0118", "ClueWeb09_English_9/en0119",
        "ClueWeb09_English_9/en0120", "ClueWeb09_English_9/en0121",
        "ClueWeb09_English_9/en0122", "ClueWeb09_English_9/en0123" },

      { "ClueWeb09_English_10/en0124", "ClueWeb09_English_10/en0125",
        "ClueWeb09_English_10/en0126", "ClueWeb09_English_10/en0127",
        "ClueWeb09_English_10/en0128", "ClueWeb09_English_10/en0129",
        "ClueWeb09_English_10/en0130", "ClueWeb09_English_10/en0131",
        "ClueWeb09_English_10/en0132", "ClueWeb09_English_10/en0133" } };

  /**
   * The "small" collection is exactly part 1. Declared after
   * {@link #sEnglishSections} because static initializers run in textual
   * order.
   */
  private static final String[] sEnglishSmall = sEnglishSections[0];

  /** The complete collection: every part, concatenated in part order. */
  private static final String[] sEnglishComplete = flatten(sEnglishSections);

  private ClueCollectionPathConstants() {
  }

  /**
   * Concatenates the rows of a two-dimensional array into a single array,
   * preserving row order and the order of elements within each row.
   */
  private static String[] flatten(String[][] groups) {
    int total = 0;
    for (String[] group : groups) {
      total += group.length;
    }

    String[] flat = new String[total];
    int offset = 0;
    for (String[] group : groups) {
      System.arraycopy(group, 0, flat, offset, group.length);
      offset += group.length;
    }
    return flat;
  }

  /**
   * Registers <code>base + "/" + path</code> as an input path on the job
   * configuration for each of the given relative paths, in array order.
   */
  private static void addPaths(JobConf conf, String base, String[] paths) {
    for (String s : paths) {
      FileInputFormat.addInputPath(conf, new Path(base + "/" + s));
    }
  }

  /**
   * Adds a sample compressed WARC archive to a Hadoop <code>JobConf</code>
   * object. The specific archive is
   * <code>ClueWeb09_English_1/en0000/00.warc.gz</code>, which contains
   * 35,582 Web pages.
   *
   * @param conf
   *            Hadoop <code>JobConf</code>
   * @param base
   *            base path for the Clue Web collection
   */
  public static void addEnglishTestFile(JobConf conf, String base) {
    addPaths(conf, base, sEnglishTest);
  }

  /**
   * Adds the first section of the Clue Web English collection to a Hadoop
   * <code>JobConf</code> object. Specifically, this method adds the
   * contents of <code>ClueWeb09_English_1/en0000/</code>, which contains
   * 3,382,356 pages.
   *
   * @param conf
   *            Hadoop <code>JobConf</code>
   * @param base
   *            base path for the Clue Web collection
   */
  public static void addEnglishTinyCollection(JobConf conf, String base) {
    addPaths(conf, base, sEnglishTiny);
  }

  /**
   * Adds the first part (segment) of the Clue Web English collection to a
   * Hadoop <code>JobConf</code> object. Specifically, this method adds the
   * contents of <code>ClueWeb09_English_1/</code>, which contains
   * 50,220,423 pages.
   *
   * @param conf
   *            Hadoop <code>JobConf</code>
   * @param base
   *            base path for the Clue Web collection
   */
  public static void addEnglishSmallCollection(JobConf conf, String base) {
    addPaths(conf, base, sEnglishSmall);
  }

  /**
   * Adds the complete Clue Web English collection to a Hadoop
   * <code>JobConf</code> object. Specifically, this method adds the
   * contents of <code>ClueWeb09_English_1/</code> through
   * <code>ClueWeb09_English_10/</code>, which contains 503,903,810 pages.
   *
   * @param conf
   *            Hadoop <code>JobConf</code>
   * @param base
   *            base path for the Clue Web collection
   */
  public static void addEnglishCompleteCollection(JobConf conf, String base) {
    addPaths(conf, base, sEnglishComplete);
  }

  /**
   * Adds a part (segment) of the Clue Web English collection to a Hadoop
   * <code>JobConf</code> object. Part 1 corresponds to the contents of
   * <code>ClueWeb09_English_1/</code> (i.e., the "small" collection), all
   * the way through part 10. Note that adding all ten parts is equivalent to
   * adding the complete English collection.
   *
   * @param conf
   *            Hadoop <code>JobConf</code>
   * @param base
   *            base path for the Clue Web collection
   * @param i
   *            one-based part number, from 1 to 10 inclusive
   * @throws IllegalArgumentException
   *             if <code>i</code> is outside the range of available parts
   */
  public static void addEnglishCollectionPart(JobConf conf, String base, int i) {
    Preconditions.checkArgument(i >= 1 && i <= sEnglishSections.length,
        "part number must be between 1 and %s, got %s", sEnglishSections.length, i);
    addPaths(conf, base, sEnglishSections[i - 1]);
  }
}