/*
 * Cloud9: A MapReduce Library for Hadoop
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package edu.umd.cloud9.webgraph;

import java.io.IOException;
import java.io.UTFDataFormatException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.Logger;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import tl.lin.data.array.ArrayListWritable;

import edu.umd.cloud9.collection.DocnoMapping;
import edu.umd.cloud9.collection.WebDocument;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.webgraph.data.AnchorText;
import edu.umd.cloud9.webgraph.data.AnchorTextConstants;
import edu.umd.cloud9.webgraph.normalizer.AnchorTextNormalizer;

/**
 * Extracts the anchor-text link graph from a TREC web collection. For each
 * input document, the mapper emits a record keyed by the document's own
 * normalized URL carrying its docno, plus one record per outgoing hyperlink
 * keyed by the normalized target URL carrying the normalized anchor text and
 * the source docno. The reducer merges all anchor text collected for the same
 * target URL.
 *
 * @author Nima Asadi
 * @author Fangyue Wang
 * @author metzler
 */
public class TrecExtractLinks extends PowerTool {
  private static final Logger LOG = Logger.getLogger(TrecExtractLinks.class);

  public static class Map extends
      Mapper<LongWritable, WebDocument, Text, ArrayListWritable<AnchorText>> {
    public static enum LinkCounter {
      INPUT_DOCS,     // number of input documents
      OUTPUT_DOCS,    // number of output documents
      INVALID_DOCNO,  // number of malformed documents
      INVALID_URL,    // number of malformed URLs
      TEXT_TOO_LONG,  // number of lines of anchor text that are abnormally long
      PARSER_FAILED   // number of times the HTML parser fails
    }

    private static String base;      // base URL for current document
    private static String baseHost;
    private static int docno;        // docno of current document

    private static final Text keyWord = new Text(); // output key for the mappers
    private static final ArrayListWritable<AnchorText> arrayList =
        new ArrayListWritable<AnchorText>(); // output value for the mappers

    private static DocnoMapping docnoMapping = null;
    private static final Parser parser = new Parser();
    private static final NodeFilter filter = new NodeClassFilter(LinkTag.class);
    private static NodeList list;
    private static boolean includeInternalLinks;
    private static AnchorTextNormalizer normalizer;

    @Override
    public void setup(
        Mapper<LongWritable, WebDocument, Text, ArrayListWritable<AnchorText>>.Context context)
        throws IOException {
      Configuration conf = context.getConfiguration();
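
      // Instantiate the DocnoMapping implementation named in the job configuration;
      // the mapping data itself is loaded below from the distributed cache (or,
      // failing that, directly from the configured path).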
      String docnoMappingClass = conf.get("Cloud9.DocnoMappingClass");
      try {
        docnoMapping = (DocnoMapping) Class.forName(docnoMappingClass).newInstance();
      } catch (Exception e) {
        throw new RuntimeException("Error initializing DocnoMapping class!");
      }

      String docnoMappingFile = conf.get("Cloud9.DocnoMappingFile", null);
      if (docnoMappingFile != null) {
        Path docnoMappingPath = null;
        try {
          Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
          if (localFiles != null) {
            docnoMappingPath = localFiles[0];
          } else {
            docnoMappingPath = new Path(conf.get("Cloud9.DocnoMappingFile"));
          }
        } catch (IOException e) {
          throw new RuntimeException("Unable to find DocnoMappingFile!");
        }

        try {
          docnoMapping.loadMapping(docnoMappingPath, FileSystem.getLocal(conf));
        } catch (Exception e) {
          e.printStackTrace();
          throw new RuntimeException("Error initializing DocnoMapping!");
        }
      }

      includeInternalLinks = conf.getBoolean("Cloud9.IncludeInternalLinks", false);

      try {
        normalizer = (AnchorTextNormalizer)
            Class.forName(conf.get("Cloud9.AnchorTextNormalizer")).newInstance();
      } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException("Error initializing AnchorTextNormalizer");
      }
    }

    @Override
    public void map(LongWritable key, WebDocument doc,
        Mapper<LongWritable, WebDocument, Text, ArrayListWritable<AnchorText>>.Context context)
        throws IOException, InterruptedException {
      context.getCounter(LinkCounter.INPUT_DOCS).increment(1);

      try {
        docno = docnoMapping.getDocno(doc.getDocid());
      } catch (NullPointerException e) {
        // Discard documents with an invalid document number
        context.getCounter(LinkCounter.INVALID_DOCNO).increment(1);
        return;
      }

      try {
        String url = doc.getURL().split("\n")[0];
        LOG.info("URI: " + url);
        base = normalizeURL(url);
      } catch (Exception e) {
        // Discard documents that have no URL associated with them
        context.getCounter(LinkCounter.INVALID_URL).increment(1);
        return;
      }

      if (base == null) {
        context.getCounter(LinkCounter.INVALID_URL).increment(1);
        return;
      }

      arrayList.clear();
      arrayList.add(new AnchorText(AnchorTextConstants.Type.DOCNO_FIELD.val,
          AnchorTextConstants.EMPTY_STRING, docno));
      keyWord.set(base);
      context.write(keyWord, arrayList);

      // keeping track of the number of documents that have actually been processed
      context.getCounter(LinkCounter.OUTPUT_DOCS).increment(1);

      try {
        baseHost = new URI(base).getHost();
      } catch (Exception e) {
        context.getCounter(LinkCounter.INVALID_URL).increment(1);
        return;
      }

      if (baseHost == null) {
        context.getCounter(LinkCounter.INVALID_URL).increment(1);
        return;
      }

      try {
        // initializing the parser with new HTML content
        parser.setInputHTML(doc.getContent());

        // Setting base URL for the current document
        NodeList nl = parser.parse(null);
        BaseHrefTag baseTag = new BaseHrefTag();
        baseTag.setBaseUrl(base);
        nl.add(baseTag);

        // re-initializing the parser with the fixed content
        parser.setInputHTML(nl.toHtml());

        // listing all LinkTag nodes
        list = parser.extractAllNodesThatMatch(filter);
      } catch (ParserException e) {
        context.getCounter(LinkCounter.PARSER_FAILED).increment(1);
        return;
      } catch (StackOverflowError e) {
        context.getCounter(LinkCounter.PARSER_FAILED).increment(1);
        return;
      }

      for (int i = 0; i < list.size(); i++) {
        LinkTag link = (LinkTag) list.elementAt(i);
        String anchor = link.getLinkText();
        String url = normalizeURL(link.extractLink());

        if (url == null) {
          continue;
        }

        if (url.equals(base)) { // discard self links
          continue;
        }

        String host = null;
        try {
          host = new URI(url).getHost();
        } catch (Exception e) {
          continue;
        }

        if (host == null) {
          continue;
        }
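
        // Normalize the anchor text, classify the link as internal or external
        // relative to the source host, and emit it keyed by the target URL.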
        if (anchor == null) {
          anchor = "";
        }

        // normalizing the anchor text
        anchor = normalizer.process(anchor);

        arrayList.clear();
        if (baseHost.equals(host)) {
          if (!includeInternalLinks) {
            continue;
          }
          arrayList.add(new AnchorText(AnchorTextConstants.Type.INTERNAL_IN_LINK.val,
              anchor, docno));
        } else {
          arrayList.add(new AnchorText(AnchorTextConstants.Type.EXTERNAL_IN_LINK.val,
              anchor, docno));
        }

        try {
          keyWord.set(url);
          context.write(keyWord, arrayList);
        } catch (UTFDataFormatException e) {
          // Anchor text too long to serialize; emit an empty anchor of the same type instead.
          context.getCounter(LinkCounter.TEXT_TOO_LONG).increment(1);

          keyWord.set(url);
          byte flag = arrayList.get(0).getType();

          arrayList.clear();
          arrayList.add(new AnchorText(flag, AnchorTextConstants.EMPTY_STRING, docno));
          context.write(keyWord, arrayList);
        }
      }
    }

    private static String normalizeURL(String url) {
      try {
        URI uri = new URI(url).normalize(); // first apply built-in normalizer
        String scheme = uri.getScheme().toLowerCase(); // schemes are not case sensitive
        String host = uri.getHost().toLowerCase(); // hosts are not case sensitive

        String path = uri.getPath();
        while (path != null && path.length() > 0
            && path.charAt(path.length() - 1) == '/') {
          // remove trailing forward slashes from path
          path = path.substring(0, path.length() - 1);
        }

        return (new URI(scheme, host, path, null)).toString();
      } catch (Exception e) {
        return null;
      }
    }
  }

  public static class Reduce extends
      Reducer<Text, ArrayListWritable<AnchorText>, Text, ArrayListWritable<AnchorText>> {
    private static final ArrayListWritable<AnchorText> arrayList =
        new ArrayListWritable<AnchorText>();
    private static boolean pushed;

    @Override
    public void reduce(Text key, Iterable<ArrayListWritable<AnchorText>> values,
        Reducer<Text, ArrayListWritable<AnchorText>, Text, ArrayListWritable<AnchorText>>.Context context)
        throws IOException, InterruptedException {
      arrayList.clear();

      // Merge entries that are identical except for their source documents;
      // otherwise keep a copy of the incoming entry.
      for (ArrayListWritable<AnchorText> packet : values) {
        for (AnchorText data : packet) {
          pushed = false;

          for (int i = 0; i < arrayList.size(); i++) {
            if (arrayList.get(i).equalsIgnoreSources(data)) {
              arrayList.get(i).addDocumentsFrom(data);
              pushed = true;
              break;
            }
          }

          if (!pushed) {
            arrayList.add(data.clone());
          }
        }
      }

      context.write(key, arrayList);
    }
  }

  public static final String[] RequiredParameters = {
      "Cloud9.InputPath", "Cloud9.OutputPath",
      "Cloud9.Mappers", "Cloud9.Reducers",
      "Cloud9.IncludeInternalLinks", "Cloud9.AnchorTextNormalizer",
      "Cloud9.DocnoMappingClass", "Cloud9.DocnoMappingFile" };

  public String[] getRequiredParameters() {
    return RequiredParameters;
  }

  public TrecExtractLinks(Configuration conf) {
    super(conf);
  }

  CollectionConfigurationManager configer;

  public TrecExtractLinks(Configuration conf, CollectionConfigurationManager confer) {
    super(conf);
    this.configer = confer;
  }

  @Override
  public int runTool() throws Exception {
    Configuration conf = getConf();
    conf.set("mapred.child.java.opts", "-Xmx3072m");
    conf.setInt("mapred.task.timeout", 60000000);

    Job job = new Job(conf);

    int numReducers = conf.getInt("Cloud9.Reducers", 200);

    String inputPath = conf.get("Cloud9.InputPath");
    String outputPath = conf.get("Cloud9.OutputPath");
    String mappingFile = conf.get("Cloud9.DocnoMappingFile");

    FileSystem fs = FileSystem.get(conf);
    if (!fs.exists(new Path(mappingFile))) {
      throw new RuntimeException("Error: Docno mapping data file " + mappingFile
          + " doesn't exist!");
    }
    DistributedCache.addCacheFile(new Path(mappingFile).toUri(), job.getConfiguration());

    job.setJobName("ExtractLinks");
    job.setNumReduceTasks(numReducers);

    job.setJarByClass(TrecExtractLinks.class);
    job.setMapperClass(TrecExtractLinks.Map.class);
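    // Reduce also serves as the combiner: merging AnchorText lists is associative
    // and commutative, so partial aggregation on the map side is safe and cuts
    // down the volume of data shuffled to the reducers.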
    job.setCombinerClass(TrecExtractLinks.Reduce.class);
    job.setReducerClass(TrecExtractLinks.Reduce.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ArrayListWritable.class);

    configer.applyJobConfig(job);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    recursivelyAddInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    LOG.info("ExtractLinks");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - mapping file: " + mappingFile);
    LOG.info(" - include internal links? "
        + conf.getBoolean("Cloud9.IncludeInternalLinks", false));

    job.waitForCompletion(true);
    return 0;
  }

  public static void recursivelyAddInputPaths(Job job, String path) throws IOException {
    FileSystem fs;
    try {
      fs = FileSystem.get(new URI(path), job.getConfiguration());
    } catch (URISyntaxException e) {
      throw new RuntimeException("Error recursively adding path -- " + path);
    }

    FileStatus[] ls = fs.listStatus(new Path(path));
    for (FileStatus status : ls) {
      // skip anything that starts with an underscore, as it often indicates
      // a log directory or another special type of Hadoop file
      if (status.getPath().getName().startsWith("_")) {
        continue;
      }

      if (status.isDir()) {
        recursivelyAddInputPaths(job, status.getPath().toString());
      } else {
        FileInputFormat.addInputPath(job, status.getPath());
      }
    }
  }
}
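
// Illustrative driver sketch (not part of the original class). It assumes
// PowerTool exposes a run() entry point that checks the RequiredParameters
// listed above before delegating to runTool(), and it leaves out the
// collection-specific construction of the CollectionConfigurationManager:
//
//   Configuration conf = new Configuration();
//   conf.set("Cloud9.InputPath", "/collection/input");
//   conf.set("Cloud9.OutputPath", "/webgraph/extracted-links");
//   conf.setInt("Cloud9.Mappers", 100);
//   conf.setInt("Cloud9.Reducers", 200);
//   conf.setBoolean("Cloud9.IncludeInternalLinks", false);
//   conf.set("Cloud9.AnchorTextNormalizer", /* a concrete AnchorTextNormalizer class name */);
//   conf.set("Cloud9.DocnoMappingClass", /* a collection-specific DocnoMapping class name */);
//   conf.set("Cloud9.DocnoMappingFile", /* path to the docno mapping data */);
//   new TrecExtractLinks(conf, configer).run();  // configer: a configured
//                                                // CollectionConfigurationManager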