/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.cf.taste.example.email; import com.google.common.base.Preconditions; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.filecache.DistributedCache; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Writable; import org.apache.mahout.common.Pair; import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable; import org.apache.mahout.math.map.OpenObjectIntHashMap; import java.io.IOException; import java.net.URI; import java.util.regex.Pattern; public final class EmailUtility { public static final String SEPARATOR = "separator"; public static final String MSG_IDS_PREFIX = "msgIdsPrefix"; public static final String FROM_PREFIX = "fromPrefix"; public static final String MSG_ID_DIMENSION = "msgIdDim"; public static final String FROM_INDEX = "fromIdx"; public static final String REFS_INDEX = "refsIdx"; private static final String[] EMPTY = new String[0]; private static final Pattern ADDRESS_CLEANUP = Pattern.compile("mailto:|<|>|\\[|\\]|\\=20"); private static final Pattern ANGLE_BRACES = Pattern.compile("<|>"); private static final Pattern SPACE_OR_CLOSE_ANGLE = Pattern.compile(">|\\s+"); public static final Pattern WHITESPACE = Pattern.compile("\\s*"); private EmailUtility() { } /** * Strip off some spurious characters that make it harder to dedup */ public static String cleanUpEmailAddress(CharSequence address) { //do some cleanup to normalize some things, like: Key: karthik ananth <karthik.jcecs@gmail.com>: Value: 178 //Key: karthik ananth [mailto:karthik.jcecs@gmail.com]=20: Value: 179 //TODO: is there more to clean up here? return ADDRESS_CLEANUP.matcher(address).replaceAll(""); } public static void loadDictionaries(Configuration conf, String fromPrefix, OpenObjectIntHashMap<String> fromDictionary, String msgIdPrefix, OpenObjectIntHashMap<String> msgIdDictionary) throws IOException { URI[] localFiles = DistributedCache.getCacheFiles(conf); Preconditions.checkArgument(localFiles != null, "missing paths from the DistributedCache"); for (URI localFile : localFiles) { Path dictionaryFile = new Path(localFile.getPath()); // key is word value is id OpenObjectIntHashMap<String> dictionary = null; if (dictionaryFile.getName().startsWith(fromPrefix)) { dictionary = fromDictionary; } else if (dictionaryFile.getName().startsWith(msgIdPrefix)) { dictionary = msgIdDictionary; } if (dictionary != null) { for (Pair<Writable, IntWritable> record : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) { dictionary.put(record.getFirst().toString(), record.getSecond().get()); } } } } public static String[] parseReferences(CharSequence rawRefs) { String[] splits; if (rawRefs != null && rawRefs.length() > 0) { splits = SPACE_OR_CLOSE_ANGLE.split(rawRefs); for (int i = 0; i < splits.length; i++) { splits[i] = ANGLE_BRACES.matcher(splits[i]).replaceAll(""); } } else { splits = EMPTY; } return splits; } public enum Counters { NO_MESSAGE_ID, NO_FROM_ADDRESS } }