package uk.bl.wa.hadoop.entities; import java.io.IOException; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.httpclient.URI; import org.apache.commons.httpclient.URIException; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; import org.apache.log4j.Logger; import org.apache.tika.Tika; import org.archive.io.ArchiveRecordHeader; import uk.bl.wa.extract.LinkExtractor; import uk.bl.wa.hadoop.WritableArchiveRecord; import uk.bl.wa.indexer.WARCIndexer; import uk.bl.wa.parsers.HtmlFeatureParser; @SuppressWarnings( { "deprecation" } ) public class EntityMapper extends MapReduceBase implements Mapper<Text, WritableArchiveRecord, Text, Text> { private static Logger log = Logger.getLogger(EntityMapper.class.getName()); private Pattern pattern; Tika tika = new Tika(); public EntityMapper() {} @Override public void configure( JobConf job ) { this.pattern = Pattern.compile( job.get( EntityExtractor.REGEX_PATTERN_PARAM ) ); } @Override public void map( Text key, WritableArchiveRecord value, OutputCollector<Text, Text> output, Reporter reporter ) throws IOException { ArchiveRecordHeader header = value.getRecord().getHeader(); /* // Determine the type: try { String tikaType = tika.detect(value.getPayload()); // return without collecting anything if this is neither HTML or XHTML: if( ! tikaType.startsWith("application/xhtml+xml") && ! tikaType.startsWith("text/html") ); return; } catch( Throwable e ) { log.error( "Tika.detect failed:" + e.getMessage() ); //e.printStackTrace(); } */ // Generate the key: String newKey = "0000";//"0/unknown"; if( !header.getHeaderFields().isEmpty() ) { //newKey = header.getDate() + "/" + header.getUrl(); // Reduce this to just the year and the host: String year = WARCIndexer.extractYear(header.getDate()); //String host = extractHost(header.getUrl()); //newKey = year + "/" + host; newKey = year; } // Collect the linkages String base_url = value.getRecord().getHeader().getUrl(); String sourceSuffix = LinkExtractor.extractPublicSuffix( base_url ); if( sourceSuffix == null ) sourceSuffix = "null"; Set<String> destSuffixes = null; try { destSuffixes= LinkExtractor.extractPublicSuffixes(HtmlFeatureParser.extractMetadata(value.getRecord(), base_url)); } catch( java.nio.charset.UnsupportedCharsetException e ) { log.error("Could not parse record! "+e); return; } catch( java.nio.charset.IllegalCharsetNameException e ) { log.error("Could not parse record! "+e); return; } catch( Exception e) { log.error("Could not parse record! "+e); return; } // Pass out the mapped results as in-links by year: for( String destSuffix : destSuffixes ) { output.collect( new Text( newKey+"\t"+destSuffix ), new Text( sourceSuffix ) ); } } }