/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.text;

import org.apache.commons.io.DirectoryWalker;
import org.apache.commons.io.comparator.CompositeFileComparator;
import org.apache.commons.io.comparator.DirectoryFileComparator;
import org.apache.commons.io.comparator.PathFileComparator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.utils.email.MailOptions;
import org.apache.mahout.utils.email.MailProcessor;
import org.apache.mahout.utils.io.ChunkedWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Converts a directory of gzipped mail archives into SequenceFiles of the specified
 * chunkSize. This class is similar to {@link SequenceFilesFromDirectory} except that
 * it uses block-compressed {@link org.apache.hadoop.io.SequenceFile}s and parses out the subject and
 * body text of each mail message into a separate key/value pair.
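 *
 * <p>A typical command-line invocation might look like the following (the paths are
 * illustrative, and the header and body flags shown are optional):
 * <pre>
 * bin/mahout org.apache.mahout.text.SequenceFilesFromMailArchives \
 *   --input /path/to/mail/archives \
 *   --output /path/to/sequence-files \
 *   --subject --body --chunkSize 64 --charset UTF-8
 * </pre>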
 */
public final class SequenceFilesFromMailArchives extends AbstractJob {

  private static final Logger log = LoggerFactory.getLogger(SequenceFilesFromMailArchives.class);

  public static final String[] CHUNK_SIZE_OPTION = {"chunkSize", "chunk"};
  public static final String[] KEY_PREFIX_OPTION = {"keyPrefix", "prefix"};
  public static final String[] CHARSET_OPTION = {"charset", "c"};
  public static final String[] SUBJECT_OPTION = {"subject", "s"};
  public static final String[] TO_OPTION = {"to", "to"};
  public static final String[] FROM_OPTION = {"from", "from"};
  public static final String[] REFERENCES_OPTION = {"references", "refs"};
  public static final String[] BODY_OPTION = {"body", "b"};
  public static final String[] STRIP_QUOTED_OPTION = {"stripQuoted", "q"};
  public static final String[] QUOTED_REGEX_OPTION = {"quotedRegex", "regex"};
  public static final String[] SEPARATOR_OPTION = {"separator", "sep"};
  public static final String[] BODY_SEPARATOR_OPTION = {"bodySeparator", "bodySep"};
  public static final String BASE_INPUT_PATH = "baseinputpath";

  private static final int MAX_JOB_SPLIT_LOCATIONS = 1000000;

  public void createSequenceFiles(MailOptions options) throws IOException {
    try (ChunkedWriter writer =
             new ChunkedWriter(getConf(), options.getChunkSize(), new Path(options.getOutputDir()))) {
      MailProcessor processor = new MailProcessor(options, options.getPrefix(), writer);
      if (options.getInput().isDirectory()) {
        PrefixAdditionDirectoryWalker walker = new PrefixAdditionDirectoryWalker(processor, writer);
        walker.walk(options.getInput());
        log.info("Parsed {} messages from {}", walker.getMessageCount(), options.getInput().getAbsolutePath());
      } else {
        long start = System.currentTimeMillis();
        long cnt = processor.parseMboxLineByLine(options.getInput());
        long finish = System.currentTimeMillis();
        log.info("Parsed {} messages from {} in {} ms", cnt, options.getInput().getAbsolutePath(), finish - start);
      }
    }
  }

  private static class PrefixAdditionDirectoryWalker extends DirectoryWalker<Object> {

    // Sort plain files ahead of directories, then by path, so each directory's own mbox files
    // are processed under the current prefix before the walk descends into subdirectories.
    @SuppressWarnings("unchecked")
    private static final Comparator<File> FILE_COMPARATOR = new CompositeFileComparator(
        DirectoryFileComparator.DIRECTORY_REVERSE, PathFileComparator.PATH_COMPARATOR);

    private final Deque<MailProcessor> processors = new ArrayDeque<>();
    private final ChunkedWriter writer;
    private final Deque<Long> messageCounts = new ArrayDeque<>();

    public PrefixAdditionDirectoryWalker(MailProcessor processor, ChunkedWriter writer) {
      processors.addFirst(processor);
      this.writer = writer;
      messageCounts.addFirst(0L);
    }

    public void walk(File startDirectory) throws IOException {
      super.walk(startDirectory, null);
    }

    public long getMessageCount() {
      return messageCounts.getFirst();
    }

    @Override
    protected void handleDirectoryStart(File current, int depth, Collection<Object> results) throws IOException {
      if (depth > 0) {
        log.info("At {}", current.getAbsolutePath());
        MailProcessor processor = processors.getFirst();
        MailProcessor subDirProcessor = new MailProcessor(processor.getOptions(),
            processor.getPrefix() + File.separator + current.getName(), writer);
        processors.push(subDirProcessor);
        messageCounts.push(0L);
      }
    }

    @Override
    protected File[] filterDirectoryContents(File directory, int depth, File[] files) throws IOException {
      Arrays.sort(files, FILE_COMPARATOR);
      return files;
    }

    @Override
    protected void handleFile(File current, int depth, Collection<Object> results) throws IOException {
      MailProcessor processor = processors.getFirst();
      long currentDirMessageCount = messageCounts.pop();
      try {
        currentDirMessageCount += processor.parseMboxLineByLine(current);
      } catch (IOException e) {
        throw new IllegalStateException("Error processing " + current, e);
      }
      messageCounts.push(currentDirMessageCount);
    }
    @Override
    protected void handleDirectoryEnd(File current, int depth, Collection<Object> results) throws IOException {
      if (depth > 0) {
        final long currentDirMessageCount = messageCounts.pop();
        log.info("Parsed {} messages from directory {}", currentDirMessageCount, current.getAbsolutePath());
        processors.pop();
        // aggregate the finished directory's count into its parent's count
        long parentDirMessageCount = messageCounts.pop();
        parentDirMessageCount += currentDirMessageCount;
        messageCounts.push(parentDirMessageCount);
      }
    }
  }

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new Configuration(), new SequenceFilesFromMailArchives(), args);
  }

  @Override
  public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(CHUNK_SIZE_OPTION[0], CHUNK_SIZE_OPTION[1], "The chunkSize in MegaBytes. Defaults to 64", "64");
    addOption(KEY_PREFIX_OPTION[0], KEY_PREFIX_OPTION[1], "The prefix to be prepended to the key", "");
    addOption(CHARSET_OPTION[0], CHARSET_OPTION[1],
        "The name of the character encoding of the input files. Defaults to UTF-8", "UTF-8");
    addFlag(SUBJECT_OPTION[0], SUBJECT_OPTION[1], "Include the Mail subject as part of the text. Default is false");
    addFlag(TO_OPTION[0], TO_OPTION[1], "Include the to field in the text. Default is false");
    addFlag(FROM_OPTION[0], FROM_OPTION[1], "Include the from field in the text. Default is false");
    addFlag(REFERENCES_OPTION[0], REFERENCES_OPTION[1], "Include the references field in the text. Default is false");
    addFlag(BODY_OPTION[0], BODY_OPTION[1], "Include the body in the output. Default is false");
    addFlag(STRIP_QUOTED_OPTION[0], STRIP_QUOTED_OPTION[1],
        "Strip (remove) quoted email text in the body. Default is false");
    addOption(QUOTED_REGEX_OPTION[0], QUOTED_REGEX_OPTION[1],
        "Specify the regex that identifies quoted text. "
            + "Default is to look for > or | at the beginning of the line.");
    addOption(SEPARATOR_OPTION[0], SEPARATOR_OPTION[1],
        "The separator to use between metadata items (to, from, etc.). Default is \\n", "\n");
    addOption(BODY_SEPARATOR_OPTION[0], BODY_SEPARATOR_OPTION[1],
        "The separator to use between lines in the body. Default is \\n. "
            + "Useful to change if you wish to have the message be on one line", "\n");
    addOption(DefaultOptionCreator.helpOption());

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    File input = getInputFile();
    String outputDir = getOutputPath().toString();

    int chunkSize = 64;
    if (hasOption(CHUNK_SIZE_OPTION[0])) {
      chunkSize = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0]));
    }

    String prefix = "";
    if (hasOption(KEY_PREFIX_OPTION[0])) {
      prefix = getOption(KEY_PREFIX_OPTION[0]);
    }

    Charset charset = Charset.forName(getOption(CHARSET_OPTION[0]));
    MailOptions options = new MailOptions();
    options.setInput(input);
    options.setOutputDir(outputDir);
    options.setPrefix(prefix);
    options.setChunkSize(chunkSize);
    options.setCharset(charset);

    List<Pattern> patterns = new ArrayList<>(5);
    // patternOrder is used downstream so consumers know the order in which the matched header
    // text appears, instead of encoding that order in the string itself, which would require
    // extra processing later to strip it out before feature selection.
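    // For example, a run with only --from and --subject ends up with patternOrder mapping
    // MailOptions.FROM to 0 and MailOptions.SUBJECT to 1, i.e. the from text precedes the
    // subject text in each record's value.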
    Map<String, Integer> patternOrder = new HashMap<>();
    int order = 0;
    if (hasOption(FROM_OPTION[0])) {
      patterns.add(MailProcessor.FROM_PREFIX);
      patternOrder.put(MailOptions.FROM, order++);
    }
    if (hasOption(TO_OPTION[0])) {
      patterns.add(MailProcessor.TO_PREFIX);
      patternOrder.put(MailOptions.TO, order++);
    }
    if (hasOption(REFERENCES_OPTION[0])) {
      patterns.add(MailProcessor.REFS_PREFIX);
      patternOrder.put(MailOptions.REFS, order++);
    }
    if (hasOption(SUBJECT_OPTION[0])) {
      patterns.add(MailProcessor.SUBJECT_PREFIX);
      // use order++ like the other fields; "order += 1" would store the incremented value
      // and leave a gap in the ordering
      patternOrder.put(MailOptions.SUBJECT, order++);
    }

    options.setStripQuotedText(hasOption(STRIP_QUOTED_OPTION[0]));
    options.setPatternsToMatch(patterns.toArray(new Pattern[patterns.size()]));
    options.setPatternOrder(patternOrder);
    options.setIncludeBody(hasOption(BODY_OPTION[0]));

    if (hasOption(SEPARATOR_OPTION[0])) {
      options.setSeparator(getOption(SEPARATOR_OPTION[0]));
    } else {
      options.setSeparator("\n");
    }
    if (hasOption(BODY_SEPARATOR_OPTION[0])) {
      options.setBodySeparator(getOption(BODY_SEPARATOR_OPTION[0]));
    }
    if (hasOption(QUOTED_REGEX_OPTION[0])) {
      options.setQuotedTextPattern(Pattern.compile(getOption(QUOTED_REGEX_OPTION[0])));
    }

    if (getOption(DefaultOptionCreator.METHOD_OPTION,
        DefaultOptionCreator.MAPREDUCE_METHOD).equals(DefaultOptionCreator.SEQUENTIAL_METHOD)) {
      runSequential(options);
    } else {
      runMapReduce(getInputPath(), getOutputPath());
    }

    return 0;
  }

  private int runSequential(MailOptions options) throws IOException {
    long start = System.currentTimeMillis();
    createSequenceFiles(options);
    long finish = System.currentTimeMillis();
    log.info("Conversion took {} ms", finish - start);
    return 0;
  }

  private int runMapReduce(Path input, Path output) throws IOException, InterruptedException, ClassNotFoundException {
    Job job = prepareJob(input, output, MultipleTextFileInputFormat.class, SequenceFilesFromMailArchivesMapper.class,
        Text.class, Text.class, SequenceFileOutputFormat.class, "SequenceFilesFromMailArchives");

    Configuration jobConfig = job.getConfiguration();
    if (hasOption(KEY_PREFIX_OPTION[0])) {
      jobConfig.set(KEY_PREFIX_OPTION[1], getOption(KEY_PREFIX_OPTION[0]));
    }

    int chunkSize = 0;
    if (hasOption(CHUNK_SIZE_OPTION[0])) {
      chunkSize = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0]));
      jobConfig.set(CHUNK_SIZE_OPTION[0], String.valueOf(chunkSize));
    }

    if (hasOption(CHARSET_OPTION[0])) {
      Charset charset = Charset.forName(getOption(CHARSET_OPTION[0]));
      jobConfig.set(CHARSET_OPTION[0], charset.displayName());
    }

    if (hasOption(FROM_OPTION[0])) {
      jobConfig.set(FROM_OPTION[1], "true");
    }
    if (hasOption(TO_OPTION[0])) {
      jobConfig.set(TO_OPTION[1], "true");
    }
    if (hasOption(REFERENCES_OPTION[0])) {
      jobConfig.set(REFERENCES_OPTION[1], "true");
    }
    if (hasOption(SUBJECT_OPTION[0])) {
      jobConfig.set(SUBJECT_OPTION[1], "true");
    }
    if (hasOption(QUOTED_REGEX_OPTION[0])) {
      jobConfig.set(QUOTED_REGEX_OPTION[1], Pattern.compile(getOption(QUOTED_REGEX_OPTION[0])).toString());
    }

    if (hasOption(SEPARATOR_OPTION[0])) {
      jobConfig.set(SEPARATOR_OPTION[1], getOption(SEPARATOR_OPTION[0]));
    } else {
      jobConfig.set(SEPARATOR_OPTION[1], "\n");
    }
    jobConfig.set(BODY_OPTION[1], hasOption(BODY_OPTION[0]) ? "true" : "false");
    if (hasOption(BODY_SEPARATOR_OPTION[0])) {
      jobConfig.set(BODY_SEPARATOR_OPTION[1], getOption(BODY_SEPARATOR_OPTION[0]));
    } else {
      jobConfig.set(BODY_SEPARATOR_OPTION[1], "\n");
    }

    FileSystem fs = FileSystem.get(jobConfig);
    FileStatus fsFileStatus = fs.getFileStatus(input);
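    // Publish the job's base input path; SequenceFilesFromMailArchivesMapper reads this property
    // back from the configuration, presumably to compute per-message keys relative to the input
    // root, mirroring the prefix handling in the sequential path.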
    jobConfig.set(BASE_INPUT_PATH, input.toString());
    String inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus);
    FileInputFormat.setInputPaths(job, inputDirList);

    // use a long multiplier to avoid int overflow; this needs to be a multiple of the block size,
    // or no split happens
    long chunkSizeInBytes = chunkSize * 1024L * 1024L;
    FileInputFormat.setMaxInputSplitSize(job, chunkSizeInBytes);

    // cap the number of split locations, otherwise the job emits noisy debug output
    jobConfig.set("mapreduce.job.max.split.locations", String.valueOf(MAX_JOB_SPLIT_LOCATIONS));

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
    return 0;
  }
}