package uk.bl.wa.hadoop.outlinks;
import static org.archive.format.warc.WARCConstants.HEADER_KEY_TYPE;
import static org.archive.format.warc.WARCConstants.WARCRecordType;
import java.io.IOException;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.tika.metadata.Metadata;
import org.archive.io.ArchiveRecordHeader;
import uk.bl.wa.hadoop.WritableArchiveRecord;
import uk.bl.wa.parsers.HtmlFeatureParser;
@SuppressWarnings( "deprecation" )
public class OutlinkExtractorMapper extends MapReduceBase implements Mapper<Text, WritableArchiveRecord, Text, Text> {
Pattern pattern = Pattern.compile( "^(https?://([^/:]+)(:[0-9]+)?/).*$" );
Matcher matcher = null;
String resourceHost;
String year;
Text outputKey;
String resourceUrl;
Iterator<String> links;
ArchiveRecordHeader header;
@Override
public void map( Text key, WritableArchiveRecord value, OutputCollector<Text, Text> output, Reporter reporter ) throws IOException {
try {
header = value.getRecord().getHeader();
// If this is a non-response WARC record...
if( header.getHeaderFieldKeys().contains( HEADER_KEY_TYPE ) &&
!header.getHeaderValue( HEADER_KEY_TYPE ).equals( WARCRecordType.response.toString() ) ) {
return;
}
resourceUrl = value.getRecord().getHeader().getUrl();
// ..or if this isn't a HTTP record...
if( !resourceUrl.startsWith( "http" ) ) {
return;
}
matcher = pattern.matcher( resourceUrl );
if( matcher.matches() ) {
resourceHost = matcher.group( 2 );
year = value.getRecord().getHeader().getDate().substring( 0, 4 );
outputKey = new Text( year + "\t" + resourceHost );
Metadata metadata = HtmlFeatureParser.extractMetadata( value.getRecord(), resourceUrl );
for( String link : metadata.getValues( HtmlFeatureParser.LINK_LIST ) ) {
matcher = pattern.matcher( link );
if( matcher.matches() ) {
output.collect( outputKey, new Text( matcher.group( 2 ) ) );
}
}
}
} catch( Exception e ) {
System.err.println( e.getMessage() );
}
}
}