/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.cf.taste.example.email;

import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.math.VarIntWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URI;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Converts the mail archives (see {@link org.apache.mahout.text.SequenceFilesFromMailArchives}) to a preference
 * file that can be consumed by the {@link org.apache.mahout.cf.taste.hadoop.pseudo.RecommenderJob}.
 * <p/>
 * This assumes the input is a sequence file whose key is the filename/message id and whose value is a list
 * (with a separator of the user's choosing) containing the from email and any references.
 * <p/>
 * The output is a matrix whose rows are the from (or to) email addresses (represented as longs) and whose columns
 * are the message ids the user has interacted with (as a VectorWritable).  This class currently does not account
 * for thread hijacking.
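 * <p/>
 * A typical invocation might look like the following minimal sketch; the launcher command and all paths are
 * illustrative only and are not fixed by this class:
 * <pre>{@code
 * bin/mahout org.apache.mahout.cf.taste.example.email.MailToPrefsDriver \
 *   --input /path/to/mail-seq-files \
 *   --output /path/to/prefs \
 *   --overwrite --chunkSize 100 --from 0 --refs 1 --useCounts
 * }</pre>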
 * <p/>
 * It also outputs a side table mapping the row ids to their original values and the message ids to the
 * message thread id.
 */
public final class MailToPrefsDriver extends AbstractJob {

  private static final Logger log = LoggerFactory.getLogger(MailToPrefsDriver.class);

  private static final String OUTPUT_FILES_PATTERN = "part-*";
  private static final int DICTIONARY_BYTE_OVERHEAD = 4;

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new Configuration(), new MailToPrefsDriver(), args);
  }

  @Override
  public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption("chunkSize", "cs", "The size of chunks to write. Default is 100 mb", "100");
    addOption("separator", "sep", "The separator used in the input file to separate the from email and the "
        + "reference ids. Default is \\n", "\n");
    addOption("from", "f", "The position in the input text (value) where the from email is located, starting from "
        + "zero (0).", "0");
    addOption("refs", "r", "The position in the input text (value) where the reference ids are located, starting "
        + "from zero (0).", "1");
    addOption(buildOption("useCounts", "u", "If set, then use the number of times the user has interacted with a "
        + "thread as an indication of their preference. Otherwise, use boolean preferences.", false, false, "true"));
    Map<String, String> parsedArgs = parseArguments(args);

    Path input = getInputPath();
    Path output = getOutputPath();
    int chunkSize = Integer.parseInt(parsedArgs.get("--chunkSize"));
    String separator = parsedArgs.get("--separator");
    Configuration conf = getConf();
    if (conf == null) {
      setConf(new Configuration());
      conf = getConf();
    }
    boolean useCounts = hasOption("useCounts"); // option names are passed to hasOption() without the "--" prefix
    AtomicInteger currentPhase = new AtomicInteger();
    int[] msgDim = new int[1];
    //TODO: mod this to not do so many passes over the data. Dictionary creation could probably be a chain mapper
    List<Path> msgIdChunks = null;
    boolean overwrite = hasOption(DefaultOptionCreator.OVERWRITE_OPTION);
    // create the dictionary between message ids and longs
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      //TODO: there seems to be a pattern emerging for dictionary creation -- sparse vectors from seq files also has this.
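      // Each unique message id gets a dense integer key.  The resulting "msgIds-dictionary-*" chunks are
      // pushed to the DistributedCache by the recommendation-matrix pass below so its mappers/reducers can
      // translate message ids into matrix indices.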
      Path msgIdsPath = new Path(output, "msgIds");
      if (overwrite) {
        HadoopUtil.delete(conf, msgIdsPath);
      }
      log.info("Creating Msg Id Dictionary");
      Job createMsgIdDictionary = prepareJob(input,
                                             msgIdsPath,
                                             SequenceFileInputFormat.class,
                                             MsgIdToDictionaryMapper.class,
                                             Text.class,
                                             VarIntWritable.class,
                                             MailToDictionaryReducer.class,
                                             Text.class,
                                             VarIntWritable.class,
                                             SequenceFileOutputFormat.class);
      createMsgIdDictionary.waitForCompletion(true);
      //write out the dictionary at the top level
      msgIdChunks = createDictionaryChunks(msgIdsPath, output, "msgIds-dictionary-",
          createMsgIdDictionary.getConfiguration(), chunkSize, msgDim);
    }
    //create the dictionary between from email addresses and longs
    List<Path> fromChunks = null;
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Path fromIdsPath = new Path(output, "fromIds");
      if (overwrite) {
        HadoopUtil.delete(conf, fromIdsPath);
      }
      log.info("Creating From Id Dictionary");
      Job createFromIdDictionary = prepareJob(input,
                                              fromIdsPath,
                                              SequenceFileInputFormat.class,
                                              FromEmailToDictionaryMapper.class,
                                              Text.class,
                                              VarIntWritable.class,
                                              MailToDictionaryReducer.class,
                                              Text.class,
                                              VarIntWritable.class,
                                              SequenceFileOutputFormat.class);
      createFromIdDictionary.getConfiguration().set(EmailUtility.SEPARATOR, separator);
      createFromIdDictionary.waitForCompletion(true);
      //write out the dictionary at the top level
      int[] fromDim = new int[1];
      fromChunks = createDictionaryChunks(fromIdsPath, output, "fromIds-dictionary-",
          createFromIdDictionary.getConfiguration(), chunkSize, fromDim);
    }
    //OK, we have our dictionaries, let's output the real thing we need: <from_id -> <msgId, msgId, msgId, ...>>
    if (shouldRunNextPhase(parsedArgs, currentPhase) && fromChunks != null && msgIdChunks != null) {
      //Job map
      //there may be a way to load the from ids in memory, if they are small enough, so that we don't need the double loop
      log.info("Creating recommendation matrix");
      Path vecPath = new Path(output, "recInput");
      if (overwrite) {
        HadoopUtil.delete(conf, vecPath);
      }
      //conf.set(EmailUtility.FROM_DIMENSION, String.valueOf(fromDim[0]));
      conf.set(EmailUtility.MSG_ID_DIMENSION, String.valueOf(msgDim[0]));
      conf.set(EmailUtility.FROM_PREFIX, "fromIds-dictionary-");
      conf.set(EmailUtility.MSG_IDS_PREFIX, "msgIds-dictionary-");
      conf.set(EmailUtility.FROM_INDEX, parsedArgs.get("--from"));
      conf.set(EmailUtility.REFS_INDEX, parsedArgs.get("--refs"));
      conf.set(EmailUtility.SEPARATOR, separator);
      conf.set(MailToRecReducer.USE_COUNTS_PREFERENCE, String.valueOf(useCounts));
      int j = 0;
      int i = 0;
      for (Path fromChunk : fromChunks) {
        for (Path idChunk : msgIdChunks) {
          Path out = new Path(vecPath, "tmp-" + i + '-' + j);
          DistributedCache.setCacheFiles(new URI[] {fromChunk.toUri(), idChunk.toUri()}, conf);
          Job createRecMatrix = prepareJob(input,
                                           out,
                                           SequenceFileInputFormat.class,
                                           MailToRecMapper.class,
                                           Text.class,
                                           LongWritable.class,
                                           MailToRecReducer.class,
                                           Text.class,
                                           NullWritable.class,
                                           TextOutputFormat.class);
          createRecMatrix.getConfiguration().set("mapred.output.compress", "false");
          createRecMatrix.waitForCompletion(true);
          //copy the results up a level
          //HadoopUtil.copyMergeSeqFiles(out.getFileSystem(conf), out, vecPath.getFileSystem(conf), outPath, true, conf, "");
          FileStatus[] fs = HadoopUtil.getFileStatus(new Path(out, "*"), PathType.GLOB, PathFilters.partFilter(),
              null, conf);
          for (int k = 0; k < fs.length; k++) {
            FileStatus f = fs[k];
            Path outPath = new Path(vecPath, "chunk-" + i + '-' + j + '-' + k);
            FileUtil.copy(f.getPath().getFileSystem(conf), f.getPath(), outPath.getFileSystem(conf),
                outPath, true, overwrite, conf);
          }
          HadoopUtil.delete(conf, out);
          j++;
        }
        i++;
      }
      //concat the files together
      /*Path mergePath = new Path(output, "vectors.dat");
      if (overwrite) {
        HadoopUtil.delete(conf, mergePath);
      }
      log.info("Merging together output vectors to vectors.dat in {}", output);*/
      //HadoopUtil.copyMergeSeqFiles(vecPath.getFileSystem(conf), vecPath, mergePath.getFileSystem(conf), mergePath, false, conf, "\n");
    }

    return 0;
  }

  private static List<Path> createDictionaryChunks(Path inputPath,
                                                   Path dictionaryPathBase,
                                                   String name,
                                                   Configuration baseConf,
                                                   int chunkSizeInMegabytes,
                                                   int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);

    FileSystem fs = FileSystem.get(inputPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, name + chunkIndex);
    chunkPaths.add(chunkPath);

    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);

    try {
      long currentChunkSize = 0;
      Path filesPattern = new Path(inputPath, OUTPUT_FILES_PATTERN);
      int i = 1; //start at 1, since a miss in the OpenObjectIntHashMap returns a 0
      for (Pair<Writable, Writable> record
          : new SequenceFileDirIterable<Writable, Writable>(filesPattern, PathType.GLOB, null, null, true, conf)) {
        if (currentChunkSize > chunkSizeLimit) {
          Closeables.closeQuietly(dictWriter);
          chunkIndex++;

          chunkPath = new Path(dictionaryPathBase, name + chunkIndex);
          chunkPaths.add(chunkPath);

          dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
          currentChunkSize = 0;
        }

        Writable key = record.getFirst();
        int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
        currentChunkSize += fieldSize;
        dictWriter.append(key, new IntWritable(i++));
      }
      maxTermDimension[0] = i;
    } finally {
      Closeables.closeQuietly(dictWriter);
    }

    return chunkPaths;
  }
}
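
// A minimal sketch (not part of the driver above) of how a downstream consumer might read one of the
// dictionary chunk sets written by createDictionaryChunks() back into memory.  The variable names and the
// "msgIds-dictionary-*" glob are illustrative; only the chunk layout (Text key, IntWritable value) is fixed
// by the code above:
//
//   OpenObjectIntHashMap<String> msgIdDictionary = new OpenObjectIntHashMap<String>();
//   Path dictionaryGlob = new Path(outputPath, "msgIds-dictionary-*");
//   for (Pair<Text, IntWritable> record
//       : new SequenceFileDirIterable<Text, IntWritable>(dictionaryGlob, PathType.GLOB, null, null, true, conf)) {
//     msgIdDictionary.put(record.getFirst().toString(), record.getSecond().get());
//   }
//
// A key missing from such a map comes back as 0, which is why the dictionary ids written above start at 1.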