/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.cf.taste.example.email;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.math.VarIntWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URI;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
/**
* Convert the Mail archives (see {@link org.apache.mahout.text.SequenceFilesFromMailArchives}) to a preference
* file that can be consumed by the {@link org.apache.mahout.cf.taste.hadoop.pseudo.RecommenderJob}.
* <p/>
 * This assumes the input is a sequence file, that the key is the filename/message id, and that the value is a list
 * (with a separator of the user's choosing) containing the from email address and any references.
 * <p/>
 * The output is a matrix where either the from or the to addresses are the rows (represented as longs) and the
 * columns are the message ids that the user has interacted with (as a VectorWritable). This class currently does not
 * account for thread hijacking.
 * <p/>
 * It also outputs a side table mapping the row ids to their original values and the message ids to the message thread ids.
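 * <p/>
 * A sketch of one way to invoke this driver from the command line (the input/output paths are illustrative only):
 * <pre>
 * bin/mahout org.apache.mahout.cf.taste.example.email.MailToPrefsDriver \
 *   --input /mail/sequence-files --output /mail/prefs \
 *   --overwrite --chunkSize 100 --separator "\n" --from 0 --refs 1 --useCounts
 * </pre>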
*/
public final class MailToPrefsDriver extends AbstractJob {
private static final Logger log = LoggerFactory.getLogger(MailToPrefsDriver.class);
private static final String OUTPUT_FILES_PATTERN = "part-*";
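  //rough per-entry byte overhead used when estimating the size of a dictionary chunk in createDictionaryChunks()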
private static final int DICTIONARY_BYTE_OVERHEAD = 4;
public static void main(String[] args) throws Exception {
ToolRunner.run(new Configuration(), new MailToPrefsDriver(), args);
}
@Override
public int run(String[] args) throws Exception {
addInputOption();
addOutputOption();
addOption(DefaultOptionCreator.overwriteOption().create());
addOption("chunkSize", "cs", "The size of chunks to write. Default is 100 mb", "100");
addOption("separator", "sep", "The separator used in the input file to separate to, from, subject. Default is \\n", "\n");
addOption("from", "f", "The position in the input text (value) where the from email is located, starting from zero (0).", "0");
addOption("refs", "r", "The position in the input text (value) where the reference ids are located, starting from zero (0).", "1");
addOption(buildOption("useCounts", "u", "If set, then use the number of times the user has interacted with a thread as an indication of their preference. Otherwise, use boolean preferences.",
false, false, "true"));
Map<String, String> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
  return -1;
}
Path input = getInputPath();
Path output = getOutputPath();
int chunkSize = Integer.parseInt(parsedArgs.get("--chunkSize"));
String separator = parsedArgs.get("--separator");
Configuration conf = getConf();
if (conf == null) {
setConf(new Configuration());
conf = getConf();
}
boolean useCounts = hasOption("useCounts");
AtomicInteger currentPhase = new AtomicInteger();
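//single-element array used as an "out" parameter: createDictionaryChunks() stores the msg id dictionary size here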
int[] msgDim = new int[1];
//TODO: mod this to not do so many passes over the data. Dictionary creation could probably be a chain mapper
List<Path> msgIdChunks = null;
boolean overwrite = hasOption(DefaultOptionCreator.OVERWRITE_OPTION);
// create the dictionary between message ids and longs
if (shouldRunNextPhase(parsedArgs, currentPhase)) {
//TODO: there seems to be a pattern emerging for dictionary creation -- sparse vectors from seq files also has this.
Path msgIdsPath = new Path(output, "msgIds");
if (overwrite) {
HadoopUtil.delete(conf, msgIdsPath);
}
log.info("Creating Msg Id Dictionary");
Job createMsgIdDictionary = prepareJob(input,
msgIdsPath,
SequenceFileInputFormat.class,
MsgIdToDictionaryMapper.class,
Text.class,
VarIntWritable.class,
MailToDictionaryReducer.class,
Text.class,
VarIntWritable.class,
SequenceFileOutputFormat.class);
createMsgIdDictionary.waitForCompletion(true);
//write out the dictionary at the top level
msgIdChunks = createDictionaryChunks(msgIdsPath, output, "msgIds-dictionary-", createMsgIdDictionary.getConfiguration(), chunkSize, msgDim);
}
//create the dictionary between from email addresses and longs
List<Path> fromChunks = null;
if (shouldRunNextPhase(parsedArgs, currentPhase)) {
Path fromIdsPath = new Path(output, "fromIds");
if (overwrite) {
HadoopUtil.delete(conf, fromIdsPath);
}
log.info("Creating From Id Dictionary");
Job createFromIdDictionary = prepareJob(input,
fromIdsPath,
SequenceFileInputFormat.class,
FromEmailToDictionaryMapper.class,
Text.class,
VarIntWritable.class,
MailToDictionaryReducer.class,
Text.class,
VarIntWritable.class,
SequenceFileOutputFormat.class);
createFromIdDictionary.getConfiguration().set(EmailUtility.SEPARATOR, separator);
createFromIdDictionary.waitForCompletion(true);
//write out the dictionary at the top level
int[] fromDim = new int[1];
fromChunks = createDictionaryChunks(fromIdsPath, output, "fromIds-dictionary-", createFromIdDictionary.getConfiguration(), chunkSize, fromDim);
}
//OK, we have our dictionaries, let's output the real thing we need: <from_id -> <msgId, msgId, msgId, ...>>
if (shouldRunNextPhase(parsedArgs, currentPhase) && fromChunks != null && msgIdChunks != null) {
//Job map
//TODO: there may be a way to load the from ids entirely in memory, if they are small enough, so that we don't need the double loop over dictionary chunks below
log.info("Creating recommendation matrix");
Path vecPath = new Path(output, "recInput");
if (overwrite) {
HadoopUtil.delete(conf, vecPath);
}
//conf.set(EmailUtility.FROM_DIMENSION, String.valueOf(fromDim[0]));
conf.set(EmailUtility.MSG_ID_DIMENSION, String.valueOf(msgDim[0]));
conf.set(EmailUtility.FROM_PREFIX, "fromIds-dictionary-");
conf.set(EmailUtility.MSG_IDS_PREFIX, "msgIds-dictionary-");
conf.set(EmailUtility.FROM_INDEX, parsedArgs.get("--from"));
conf.set(EmailUtility.REFS_INDEX, parsedArgs.get("--refs"));
conf.set(EmailUtility.SEPARATOR, separator);
conf.set(MailToRecReducer.USE_COUNTS_PREFERENCE, String.valueOf(useCounts));
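//run one job per (from-chunk, msg-id-chunk) pair; each job ships just those two dictionary chunks to its
//mappers via the DistributedCache rather than loading the full dictionaries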
int j = 0;
int i = 0;
for (Path fromChunk : fromChunks) {
for (Path idChunk : msgIdChunks) {
Path out = new Path(vecPath, "tmp-" + i + '-' + j);
DistributedCache.setCacheFiles(new URI[]{fromChunk.toUri(), idChunk.toUri()}, conf);
Job createRecMatrix = prepareJob(input, out, SequenceFileInputFormat.class,
MailToRecMapper.class, Text.class, LongWritable.class, MailToRecReducer.class, Text.class, NullWritable.class,
TextOutputFormat.class);
createRecMatrix.getConfiguration().set("mapred.output.compress", "false");
createRecMatrix.waitForCompletion(true);
//copy the results up a level
//HadoopUtil.copyMergeSeqFiles(out.getFileSystem(conf), out, vecPath.getFileSystem(conf), outPath, true, conf, "");
FileStatus[] fs = HadoopUtil.getFileStatus(new Path(out, "*"), PathType.GLOB, PathFilters.partFilter(), null, conf);
for (int k = 0; k < fs.length; k++) {
FileStatus f = fs[k];
Path outPath = new Path(vecPath, "chunk-" + i + '-' + j + '-' + k);
FileUtil.copy(f.getPath().getFileSystem(conf), f.getPath(), outPath.getFileSystem(conf), outPath, true, overwrite, conf);
}
HadoopUtil.delete(conf, out);
j++;
}
i++;
}
//concat the files together
/*Path mergePath = new Path(output, "vectors.dat");
if (overwrite) {
HadoopUtil.delete(conf, mergePath);
}
log.info("Merging together output vectors to vectors.dat in {}", output);*/
//HadoopUtil.copyMergeSeqFiles(vecPath.getFileSystem(conf), vecPath, mergePath.getFileSystem(conf), mergePath, false, conf, "\n");
}
return 0;
}
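  /**
   * Reads the (term, count) pairs written by the dictionary jobs under {@code inputPath}, assigns each term a
   * sequential int id (starting at 1, since a miss in OpenObjectIntHashMap returns 0) and writes the (term, id) pairs
   * as {@code Text}/{@code IntWritable} sequence files under {@code dictionaryPathBase}, rolling over to a new chunk
   * whenever the current one grows past {@code chunkSizeInMegabytes}. The resulting dimension (the largest assigned id
   * plus one) is reported through {@code maxTermDimension[0]}.
   *
   * @return the paths of the dictionary chunks that were written
   */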
private static List<Path> createDictionaryChunks(Path inputPath,
Path dictionaryPathBase,
String name,
Configuration baseConf,
int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
List<Path> chunkPaths = Lists.newArrayList();
Configuration conf = new Configuration(baseConf);
FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
int chunkIndex = 0;
Path chunkPath = new Path(dictionaryPathBase, name + chunkIndex);
chunkPaths.add(chunkPath);
SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
try {
long currentChunkSize = 0;
Path filesPattern = new Path(inputPath, OUTPUT_FILES_PATTERN);
int i = 1;//start at 1, since a miss in the OpenObjectIntHashMap returns a 0
for (Pair<Writable, Writable> record
: new SequenceFileDirIterable<Writable, Writable>(filesPattern, PathType.GLOB, null, null, true, conf)) {
if (currentChunkSize > chunkSizeLimit) {
Closeables.closeQuietly(dictWriter);
chunkIndex++;
chunkPath = new Path(dictionaryPathBase, name + chunkIndex);
chunkPaths.add(chunkPath);
dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
currentChunkSize = 0;
}
Writable key = record.getFirst();
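//estimated size of this entry (used to decide when to start a new chunk):
//fixed overhead + 2 bytes per char (Java chars are UTF-16) + 4 bytes for the int id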
int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
currentChunkSize += fieldSize;
dictWriter.append(key, new IntWritable(i++));
}
maxTermDimension[0] = i;
} finally {
Closeables.closeQuietly(dictWriter);
}
return chunkPaths;
}
}