/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.text; import com.google.common.io.Closeables; import org.apache.commons.cli2.CommandLine; import org.apache.commons.cli2.Group; import org.apache.commons.cli2.Option; import org.apache.commons.cli2.OptionException; import org.apache.commons.cli2.builder.ArgumentBuilder; import org.apache.commons.cli2.builder.DefaultOptionBuilder; import org.apache.commons.cli2.builder.GroupBuilder; import org.apache.commons.cli2.commandline.Parser; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile; import org.apache.mahout.common.CommandLineUtil; import org.apache.mahout.common.commandline.DefaultOptionCreator; import org.apache.mahout.utils.email.MailProcessor; import org.apache.mahout.utils.email.MailOptions; import org.apache.mahout.utils.io.ChunkedWriter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.FileFilter; import java.io.IOException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Pattern; /** * Converts a directory of gzipped mail archives into SequenceFiles of specified chunkSize. * This class is similar to {@link SequenceFilesFromDirectory} except it uses block-compressed * {@link SequenceFile}s and parses out the subject and body text of each mail message into * a separate key/value pair. */ public final class SequenceFilesFromMailArchives { private static final Logger log = LoggerFactory.getLogger(SequenceFilesFromMailArchives.class); public void createSequenceFiles(MailOptions options) throws IOException { ChunkedWriter writer = new ChunkedWriter(new Configuration(), options.getChunkSize(), new Path(options.getOutputDir())); MailProcessor processor = new MailProcessor(options, options.getPrefix(), writer); try { if (options.getInput().isDirectory()) { PrefixAdditionFilter filter = new PrefixAdditionFilter(processor, writer); options.getInput().listFiles(filter); log.info("Parsed {} messages from {}", filter.getMessageCount(), options.getInput().getAbsolutePath()); } else { long start = System.currentTimeMillis(); long cnt = processor.parseMboxLineByLine(options.getInput()); long finish = System.currentTimeMillis(); log.info("Parsed {} messages from {} in time: {}", new Object[] { cnt, options.getInput().getAbsolutePath(), (finish - start) }); } } finally { Closeables.closeQuietly(writer); } } public class PrefixAdditionFilter implements FileFilter { private final MailProcessor processor; private final ChunkedWriter writer; private long messageCount; public PrefixAdditionFilter(MailProcessor processor, ChunkedWriter writer) { this.processor = processor; this.writer = writer; this.messageCount = 0; } public long getMessageCount() { return messageCount; } @Override public boolean accept(File current) { if (current.isDirectory()) { log.info("At {}", current.getAbsolutePath()); PrefixAdditionFilter nested = new PrefixAdditionFilter(new MailProcessor( processor.getOptions(), processor.getPrefix() + File.separator + current.getName(), writer), writer); current.listFiles(nested); long dirCount = nested.getMessageCount(); log.info("Parsed {} messages from directory {}", dirCount, current.getAbsolutePath()); messageCount += dirCount; } else { try { messageCount += processor.parseMboxLineByLine(current); } catch (IOException e) { throw new IllegalStateException("Error processing " + current, e); } } return false; } } public static void main(String[] args) throws Exception { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option inputOpt = DefaultOptionCreator.inputOption().create(); Option outputDirOpt = DefaultOptionCreator.outputOption().create(); Option chunkSizeOpt = obuilder.withLongName("chunkSize").withArgument( abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()).withDescription( "The chunkSize in MegaBytes. Defaults to 64").withShortName("chunk").create(); Option keyPrefixOpt = obuilder.withLongName("keyPrefix").withArgument( abuilder.withName("keyPrefix").withMinimum(1).withMaximum(1).create()).withDescription( "The prefix to be prepended to the key").withShortName("prefix").create(); Option charsetOpt = obuilder.withLongName("charset").withRequired(true).withArgument( abuilder.withName("charset").withMinimum(1).withMaximum(1).create()).withDescription( "The name of the character encoding of the input files").withShortName("c").create(); Option subjectOpt = obuilder.withLongName("subject").withRequired(false). withDescription("Include the Mail subject as part of the text. Default is false").withShortName("s").create(); Option toOpt = obuilder.withLongName("to").withRequired(false). withDescription("Include the to field in the text. Default is false").withShortName("to").create(); Option fromOpt = obuilder.withLongName("from").withRequired(false). withDescription("Include the from field in the text. Default is false").withShortName("from").create(); Option refsOpt = obuilder.withLongName("references").withRequired(false). withDescription("Include the references field in the text. Default is false").withShortName("refs").create(); Option bodyOpt = obuilder.withLongName("body").withRequired(false). withDescription("Include the body in the output. Default is false").withShortName("b").create(); Option quotedOpt = obuilder.withLongName("stripQuoted").withRequired(false). withDescription("Strip (remove) quoted email text in the body. Default is false").withShortName("q").create(); Option quotedRegexOpt = obuilder.withLongName("quotedRegex").withRequired(false).withArgument(abuilder.withName("regex").withMinimum(1).withMaximum(1).create()) .withDescription("Specify the regex that identifies quoted text. Default is to look for > or | at the beginning of the line.").withShortName("q").create(); Option separatorOpt = obuilder.withLongName("separator").withRequired(false).withArgument( abuilder.withName("separator").withMinimum(1).withMaximum(1).create()). withDescription("The separator to use between metadata items (to, from, etc.). Default is \\n").withShortName("sep").create(); Option bodySeparatorOpt = obuilder.withLongName("bodySeparator").withRequired(false).withArgument( abuilder.withName("bodySeparator").withMinimum(1).withMaximum(1).create()). withDescription("The separator to use between lines in the body. Default is \\n. Useful to change if you wish to have the message be on one line").withShortName("bodySep").create(); Option helpOpt = DefaultOptionCreator.helpOption(); Group group = gbuilder.withName("Options").withOption(keyPrefixOpt).withOption(chunkSizeOpt).withOption( charsetOpt).withOption(outputDirOpt).withOption(helpOpt).withOption(inputOpt).withOption(subjectOpt).withOption(toOpt) .withOption(fromOpt).withOption(bodyOpt).withOption(quotedOpt).withOption(refsOpt).withOption(bodySeparatorOpt) .withOption(quotedRegexOpt) .withOption(separatorOpt).create(); try { Parser parser = new Parser(); parser.setGroup(group); parser.setHelpOption(helpOpt); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } File input = new File((String) cmdLine.getValue(inputOpt)); String outputDir = (String) cmdLine.getValue(outputDirOpt); int chunkSize = 64; if (cmdLine.hasOption(chunkSizeOpt)) { chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt)); } String prefix = ""; if (cmdLine.hasOption(keyPrefixOpt)) { prefix = (String) cmdLine.getValue(keyPrefixOpt); } Charset charset = Charset.forName((String) cmdLine.getValue(charsetOpt)); SequenceFilesFromMailArchives dir = new SequenceFilesFromMailArchives(); MailOptions options = new MailOptions(); options.setInput(input); options.setOutputDir(outputDir); options.setPrefix(prefix); options.setChunkSize(chunkSize); options.setCharset(charset); List<Pattern> patterns = new ArrayList<Pattern>(5); // patternOrder is used downstream so that we can know what order the text is in instead // of encoding it in the string, which // would require more processing later to remove it pre feature selection. Map<String, Integer> patternOrder = new HashMap<String, Integer>(); int order = 0; if (cmdLine.hasOption(fromOpt)) { patterns.add(MailProcessor.FROM_PREFIX); patternOrder.put(MailOptions.FROM, order++); } if (cmdLine.hasOption(toOpt)) { patterns.add(MailProcessor.TO_PREFIX); patternOrder.put(MailOptions.TO, order++); } if (cmdLine.hasOption(refsOpt)) { patterns.add(MailProcessor.REFS_PREFIX); patternOrder.put(MailOptions.REFS, order++); } if (cmdLine.hasOption(subjectOpt)) { patterns.add(MailProcessor.SUBJECT_PREFIX); patternOrder.put(MailOptions.SUBJECT, order++); } options.setStripQuotedText(cmdLine.hasOption(quotedOpt)); options.setPatternsToMatch(patterns.toArray(new Pattern[patterns.size()])); options.setPatternOrder(patternOrder); options.setIncludeBody(cmdLine.hasOption(bodyOpt)); options.setSeparator("\n"); if (cmdLine.hasOption(separatorOpt)) { options.setSeparator(cmdLine.getValue(separatorOpt).toString()); } if (cmdLine.hasOption(bodySeparatorOpt)) { options.setBodySeparator(cmdLine.getValue(bodySeparatorOpt).toString()); } if (cmdLine.hasOption(quotedRegexOpt)){ options.setQuotedTextPattern(Pattern.compile(cmdLine.getValue(quotedRegexOpt).toString())); } long start = System.currentTimeMillis(); dir.createSequenceFiles(options); long finish = System.currentTimeMillis(); log.info("Conversion took {}ms", finish - start); } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } } }