/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.cf.taste.example.email;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.math.map.OpenObjectIntHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
/**
 * Mapper that turns a parsed mail record into a {@code "fromId,msgId"} key with a count of one.
 *
 * <p>The incoming value is split on the configured separator. The column at {@code fromIdx} is
 * cleaned up and looked up in the "from" dictionary; the column at {@code refsIdx} is parsed as a
 * list of references and the first reference is looked up in the message-id dictionary. When no
 * reference is available, the portion of the input key following the last {@code '/'} is used as
 * the message id instead.
 */
public final class MailToRecMapper extends Mapper<Text, Text, Text, LongWritable> {

  private static final Logger log = LoggerFactory.getLogger(MailToRecMapper.class);

  /** Dictionary from cleaned-up sender address to its integer id; populated in {@link #setup}. */
  private final OpenObjectIntHashMap<String> fromDictionary = new OpenObjectIntHashMap<String>();

  /** Dictionary from message id to its integer id; populated in {@link #setup}. */
  private final OpenObjectIntHashMap<String> msgIdDictionary = new OpenObjectIntHashMap<String>();

  /** Column separator of the input value; this default is kept when the job does not configure one. */
  private String separator = "\n";

  /** Index of the sender column in the split value. */
  private int fromIdx;

  /** Index of the references column in the split value. */
  private int refsIdx;

  /** Counts how many records were resolved through a reference versus through the key's own id. */
  public enum Counters {
    REFERENCE, ORIGINAL
  }

  /**
   * Reads the column indexes, separator and dictionary prefixes from the job configuration and
   * loads both dictionaries.
   *
   * @param context the task context supplying the job configuration
   * @throws IOException propagated from the superclass setup or from loading the dictionaries
   * @throws InterruptedException propagated from the superclass setup
   */
  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    String fromPrefix = conf.get(EmailUtility.FROM_PREFIX);
    String msgPrefix = conf.get(EmailUtility.MSG_IDS_PREFIX);
    fromIdx = conf.getInt(EmailUtility.FROM_INDEX, 0);
    refsIdx = conf.getInt(EmailUtility.REFS_INDEX, 1);
    EmailUtility.loadDictionaries(conf, fromPrefix, fromDictionary, msgPrefix, msgIdDictionary);
    log.info("From Dictionary size: {} Msg Id Dictionary size: {}", fromDictionary.size(), msgIdDictionary.size());
    // Fall back to the field's default instead of overwriting it with null when unset.
    separator = conf.get(EmailUtility.SEPARATOR, separator);
  }

  /**
   * Emits {@code ("fromId,msgId", 1)} for a mail record when both ids could be determined.
   *
   * @param key the record key; the text after its last {@code '/'} is treated as the message id
   *            when the record carries no references
   * @param value the record body, columns joined by the configured separator
   * @param context the task context used for counters and output
   * @throws IOException propagated from writing the output
   * @throws InterruptedException propagated from writing the output
   */
  @Override
  protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    // Integer.MIN_VALUE marks "not determined" for both ids.
    // NOTE(review): OpenObjectIntHashMap.get presumably does not return Integer.MIN_VALUE for an
    // absent key, in which case unknown senders/ids are not filtered by the sentinel - confirm.
    int msgIdKey = Integer.MIN_VALUE;
    int fromKey = Integer.MIN_VALUE;

    String valStr = value.toString();
    String[] splits = StringUtils.splitByWholeSeparatorPreserveAllTokens(valStr, separator);
    if (splits != null && splits.length > 0) {
      // Guard the sender column with its own index (not the references index), so a record
      // without a references column still yields a sender and an index beyond the array is
      // never read.
      if (splits.length > fromIdx) {
        String from = EmailUtility.cleanUpEmailAddress(splits[fromIdx]);
        fromKey = fromDictionary.get(from);
      }
      // get the references
      if (splits.length > refsIdx) {
        String[] theRefs = EmailUtility.parseReferences(splits[refsIdx]);
        if (theRefs != null && theRefs.length > 0) {
          // we have a reference, the first one is the original message id, so map to that one
          msgIdKey = msgIdDictionary.get(theRefs[0]);
          context.getCounter(Counters.REFERENCE).increment(1);
        }
      }
    }

    if (msgIdKey == Integer.MIN_VALUE) {
      // we don't have any references, so use the msg id taken from the tail of the key
      String keyStr = key.toString();
      int idx = keyStr.lastIndexOf('/');
      if (idx != -1) {
        String msgId = keyStr.substring(idx + 1);
        msgIdKey = msgIdDictionary.get(msgId);
        context.getCounter(Counters.ORIGINAL).increment(1);
      }
    }

    if (msgIdKey != Integer.MIN_VALUE && fromKey != Integer.MIN_VALUE) {
      context.write(new Text(fromKey + "," + msgIdKey), new LongWritable(1));
    }
  }
}