/*
 * Cloud9: A MapReduce Library for Hadoop
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package edu.umd.cloud9.collection;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import edu.umd.cloud9.collection.line.TextDocument;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.webgraph.TrecExtractLinks.Map.LinkCounter;

/**
 * <p>
 * Tool for generating 'per-field' collections from HTML documents. The output of this tool is a
 * new collection in TREC format (stored as a {@code SequenceFile<LongWritable, TextDocument>})
 * that consists only of the text contained within the target tag. This is useful for various
 * document-structure and field-based retrieval tasks.
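 * </p>
 *
 * <p>
 * The tool takes four command-line arguments: an input path, the fully-qualified class name of
 * the {@link org.apache.hadoop.mapreduce.InputFormat} used to read it, an output path, and the
 * target tag (either an HTML tag name such as {@code title}, or the special value
 * {@code heading}, which matches {@code h1} through {@code h6}). A sketch of a typical
 * invocation follows; the jar name, paths, and input format class shown here are illustrative
 * placeholders, not part of this tool:
 * </p>
 *
 * <pre>
 * hadoop jar cloud9.jar edu.umd.cloud9.collection.ExtractHTMLFieldCollection \
 *   /path/to/html/collection com.example.MyHtmlInputFormat \
 *   /path/to/output heading
 * </pre>
 *
 * <p>
 * The arguments are passed to the job as the configuration parameters {@code Cloud9.InputPath},
 * {@code Cloud9.InputFormat}, {@code Cloud9.OutputPath}, and {@code Cloud9.TargetTag}.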
 * </p>
 *
 * @author fangyue
 * @author metzler
 */
public class ExtractHTMLFieldCollection extends PowerTool {
  private static final Logger LOG = Logger.getLogger(ExtractHTMLFieldCollection.class);

  public static class MyMapper extends Mapper<LongWritable, Indexable, LongWritable, TextDocument> {
    // TODO: allow this to support user-defined regular expressions, not just the "heading" one
    // pre-defined here
    public static class HeadingTagFilter implements NodeFilter {
      private static final long serialVersionUID = 3848416345122090905L;

      private final Pattern pattern = Pattern.compile("h[123456]", Pattern.CASE_INSENSITIVE);

      public boolean accept(Node node) {
        return (pattern.matcher(node.getText()).matches());
      }
    }

    private static String tag;
    private static final Parser parser = new Parser();
    private static NodeFilter filter;

    private static final LongWritable myKey = new LongWritable();
    private static final TextDocument myValue = new TextDocument();
    private static final StringBuffer strBuf = new StringBuffer();

    @Override
    public void setup(Mapper<LongWritable, Indexable, LongWritable, TextDocument>.Context context)
        throws IOException {
      Configuration conf = context.getConfiguration();

      tag = conf.get("Cloud9.TargetTag");
      if (tag.equalsIgnoreCase("heading")) {
        filter = new HeadingTagFilter();
      } else {
        filter = new TagNameFilter(tag);
      }
    }

    @Override
    public void map(LongWritable key, Indexable doc,
        Mapper<LongWritable, Indexable, LongWritable, TextDocument>.Context context)
        throws IOException, InterruptedException {
      context.getCounter(LinkCounter.INPUT_DOCS).increment(1);

      if (doc.getDocid() == null || doc.getContent() == null) {
        return;
      }

      myKey.set(key.get());

      NodeList nl;
      try {
        // initialize HTML parser
        parser.setInputHTML(doc.getContent());

        // parse the document
        nl = parser.parse(filter);
      } catch (ParserException e) {
        // on parse failure, emit a placeholder record that carries only the DOCNO
        context.getCounter(LinkCounter.PARSER_FAILED).increment(1);
        myValue.setDocid(doc.getDocid());
        myValue.setContent("<DOC>\n<DOCNO>" + doc.getDocid() + "</DOCNO>\n</DOC>");
        context.write(myKey, myValue);
        return;
      } catch (StackOverflowError e) {
        // on parse failure, emit a placeholder record that carries only the DOCNO
        context.getCounter(LinkCounter.PARSER_FAILED).increment(1);
        myValue.setDocid(doc.getDocid());
        myValue.setContent("<DOC>\n<DOCNO>" + doc.getDocid() + "</DOCNO>\n</DOC>");
        context.write(myKey, myValue);
        return;
      }

      strBuf.setLength(0);
      strBuf.append("<DOC>\n<DOCNO>");
      strBuf.append(doc.getDocid());
      strBuf.append("</DOCNO>\n");
      for (int i = 0; i < nl.size(); i++) {
        strBuf.append(nl.elementAt(i).toHtml()).append("\n");
      }
      strBuf.append("</DOC>\n");

      // create output document
      myValue.setDocid(doc.getDocid());
      myValue.setContent(strBuf.toString());

      // emit
      context.write(myKey, myValue);

      // bookkeeping
      context.getCounter(LinkCounter.OUTPUT_DOCS).increment(1);
    }
  }

  public static final String[] RequiredParameters = { "Cloud9.InputPath", "Cloud9.InputFormat",
      "Cloud9.OutputPath", "Cloud9.TargetTag" };

  public String[] getRequiredParameters() {
    return RequiredParameters;
  }

  public ExtractHTMLFieldCollection(Configuration conf) {
    super(conf);
  }

  @SuppressWarnings({ "unchecked", "rawtypes" })
  @Override
  public int runTool() throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf);

    String inputPath = conf.get("Cloud9.InputPath");
    String inputFormat = conf.get("Cloud9.InputFormat");
    String outputPath = conf.get("Cloud9.OutputPath");
    String tag = conf.get("Cloud9.TargetTag");

    job.setJobName("ExtractFieldCollection");

    job.setJarByClass(ExtractHTMLFieldCollection.class);
    job.setMapperClass(MyMapper.class);
    // identity reducer; the 200 reduce tasks simply consolidate the output into 200 part files
    job.setReducerClass(Reducer.class);
    job.setNumReduceTasks(200);
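    // The input format is supplied on the command line as a fully-qualified class name and
    // loaded reflectively, so the same tool can run over any collection whose InputFormat
    // produces Indexable values.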
    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat));
    recursivelyAddInputPaths(job, inputPath);

    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(TextDocument.class);

    LOG.info("ExtractFieldCollection - " + tag);
    LOG.info(" - Input path: " + inputPath);
    LOG.info(" - Input format: " + inputFormat);
    LOG.info(" - Output path: " + outputPath);
    LOG.info(" - Target tag: " + tag);

    job.waitForCompletion(true);
    return 0;
  }

  public static void recursivelyAddInputPaths(Job job, String path) throws IOException {
    FileSystem fs;
    try {
      fs = FileSystem.get(new URI(path), job.getConfiguration());
    } catch (URISyntaxException e) {
      throw new RuntimeException("Error recursively adding path -- " + path);
    }

    FileStatus[] ls = fs.listStatus(new Path(path));
    for (FileStatus status : ls) {
      // skip anything that starts with an underscore, as it often indicates
      // a log directory or another special type of Hadoop file
      if (status.getPath().getName().startsWith("_")) {
        continue;
      }

      if (status.isDir()) {
        recursivelyAddInputPaths(job, status.getPath().toString());
      } else {
        FileInputFormat.addInputPath(job, status.getPath());
      }
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    if (args.length != 4) {
      System.err.println(
          "Usage: ExtractFieldCollection [input-path] [input-format] [output-path] [target-tag]");
      System.exit(-1);
    }

    conf.set("Cloud9.InputPath", args[0]);
    conf.set("Cloud9.InputFormat", args[1]);
    conf.set("Cloud9.OutputPath", args[2]);
    conf.set("Cloud9.TargetTag", args[3]);

    int res = ToolRunner.run(conf, new ExtractHTMLFieldCollection(conf), args);
    System.exit(res);
  }
}