/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.hadoop;
import java.io.IOException;
import org.apache.commons.httpclient.URIException;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
/**
* @author brad
*
*/
public class CDXCanonicalizingMapper extends Mapper<Object, Text, Text, Text>
implements Configurable {
private static String MODE_CONFIG_NAME = "cdx.map.mode";
public static int MODE_GLOBAL = 0;
public static int MODE_FULL = 1;
private Configuration conf;
private int mode = MODE_GLOBAL;
private Text key = new Text();
private Text remainder = new Text();
private String delim = " ";
StringBuilder sb = new StringBuilder();
public void map(Object y, Text value, Context context)
throws IOException, InterruptedException {
if(mode == MODE_GLOBAL) {
mapGlobal(y,value,context);
} else {
mapFull(y,value,context);
}
}
private static int SHA1_DIGITS = 3;
AggressiveUrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer();
private StringBuilder ksb = new StringBuilder();
private StringBuilder vsb = new StringBuilder();
private int i1 = 0;
private int i2 = 0;
private int i3 = 0;
private int i4 = 0;
private void mapGlobal(Object y, Text value, Context context)
throws IOException, InterruptedException {
String s = value.toString();
String parts[] = s.split(delim);
if(parts.length == 10) {
if(!parts[9].contains("A")) {
ksb.setLength(0);
vsb.setLength(0);
try {
ksb.append(canonicalizer.urlStringToKey(parts[0])).append(" ");
ksb.append(parts[1]); // date
vsb.append(parts[0]).append(delim); // orig_url
vsb.append(parts[3]).append(delim); // MIME
vsb.append(parts[4]).append(delim); // HTTP_CODE
vsb.append(parts[5].substring(0, SHA1_DIGITS)).append(" "); // SHA1
vsb.append(parts[6]).append(delim); // redirect
vsb.append(parts[7]).append(delim); // start_offset
vsb.append(parts[8]).append(".arc.gz"); // arc_prefix
key.set(ksb.toString());
remainder.set(vsb.toString());
context.write(key, remainder);
} catch (URIException e) {
System.err.println("Failed Canonicalize:("+parts[0]+
") in ("+parts[8]+"):("+parts[7]+")");
}
}
} else {
System.err.println("Funky: Problem with line("+s+")");
}
}
private void mapFull(Object y, Text value, Context context)
throws IOException, InterruptedException {
String s = value.toString();
if(s.startsWith(" CDX ")) {
return;
}
boolean problems = true;
i1 = s.indexOf(delim);
if(i1 > 0) {
i2 = s.indexOf(delim, i1 + 1);
if(i2 > 0) {
i3 = s.indexOf(delim, i2 + 1);
if(i3 > 0) {
i4 = s.lastIndexOf(delim);
if(i4 > i3) {
try {
ksb.setLength(0);
ksb.append(canonicalizer.urlStringToKey(s.substring(i2 + 1, i3)));
ksb.append(s.substring(i1,i4));
key.set(ksb.toString());
remainder.set(s.substring(i4+1));
context.write(key, remainder);
problems = false;
} catch(URIException e) {
// just eat it.. problems will be true.
}
}
}
}
}
if(problems) {
System.err.println("CDX-Can: Problem with line("+s+")");
}
}
// private void mapOld(Object y, Text value, Context context)
// throws IOException, InterruptedException {
// String parts[] = value.toString().split(delim);
// // lets assume key is field 1-2:
// sb.setLength(0);
// sb.append(parts[0]).append(delim).append(parts[1]);
// key.set(sb.toString());
// remainder.set(join(delim,parts,2));
// context.write(key, remainder);
// }
//
// private String join(String delim, String parts[], int start) {
// sb.setLength(0);
// int count = parts.length -1;
// for(int i = start; i < count; i++) {
// sb.append(parts[i]).append(delim);
// }
// sb.append(parts[count]);
// return sb.toString();
// }
/**
* @param conf Configuration for the Job
* @param mode String mode to use, one of MODE_GLOBAL, MODE_FULL
*/
public static void setMapMode(Configuration conf, int mode) {
conf.setInt(MODE_CONFIG_NAME, mode);
}
public Configuration getConf() {
return conf;
}
public void setConf(Configuration conf) {
this.conf = conf;
mode = conf.getInt(MODE_CONFIG_NAME, MODE_FULL);
delim = conf.get(CDXSortDriver.TEXT_OUTPUT_DELIM_CONFIG,delim);
}
}