/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.mahout.text;

import java.io.File;
import java.io.FileOutputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.lang3.SystemUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

/**
 * Test case for the SequenceFilesFromMailArchives command-line application.
 *
 * <p>Each test runs the application over two gzipped mail archives (one in
 * {@code subdir}, one in {@code subdir/subsubdir}) that both contain the same
 * two messages, and then checks the four key/value pairs that were written.
 */
public final class SequenceFilesFromMailArchivesTest extends MahoutTestCase {

  private File inputDir;

  /**
   * Creates the input directory tree needed for testing the
   * SequenceFilesFromMailArchives application and fills it with two gzipped
   * mail archives. Output directories are created by the individual tests.
   */
  @Override
  @Before
  public void setUp() throws Exception {
    super.setUp();
    inputDir = getTestTempDir("mail-archives-in");

    // write test mail messages to a gzipped file in a nested directory
    File subDir = new File(inputDir, "subdir");
    createDirectory(subDir);
    writeGzippedTestMessages(new File(subDir, "mail-messages.gz"));

    // and the same messages once more, one directory level deeper
    File subDir2 = new File(subDir, "subsubdir");
    createDirectory(subDir2);
    writeGzippedTestMessages(new File(subDir2, "mail-messages-2.gz"));
  }

  /**
   * Runs the application with {@code --method sequential} and verifies the
   * four records written to the single {@code chunk-0} file.
   */
  @Test
  public void testSequential() throws Exception {

    File outputDir = this.getTestTempDir("mail-archives-out");

    String[] args = {
      "--input", inputDir.getAbsolutePath(),
      "--output", outputDir.getAbsolutePath(),
      "--charset", "UTF-8",
      "--keyPrefix", "TEST",
      "--method", "sequential",
      "--body", "--subject", "--separator", ""
    };

    // run the application's main method
    SequenceFilesFromMailArchives.main(args);

    // app should create a single SequenceFile named "chunk-0" in the output dir
    File expectedChunkFile = new File(outputDir, "chunk-0");
    String expectedChunkPath = expectedChunkFile.getAbsolutePath();
    Assert.assertTrue("Expected chunk file " + expectedChunkPath + " not found!",
        expectedChunkFile.isFile());

    Configuration conf = getConfiguration();

    // try-with-resources so the underlying reader is released even when an
    // assertion fails before the iterator has been exhausted
    try (SequenceFileIterator<Text, Text> iterator =
             new SequenceFileIterator<>(new Path(expectedChunkPath), true, conf)) {
      File parentFile = subDirArchiveKey();
      assertNextRecord("First", iterator, parentFile, testVars[0], false);
      assertNextRecord("Second", iterator, parentFile, testVars[1], false);

      File parentFileSubSubDir = subSubDirArchiveKey();
      assertNextRecord("Third", iterator, parentFileSubSubDir, testVars[0], false);
      assertNextRecord("Fourth", iterator, parentFileSubSubDir, testVars[1], false);

      Assert.assertFalse("Only four key/value pairs expected!", iterator.hasNext());
    }
  }

  /**
   * Runs the application with {@code --method mapreduce} and verifies the
   * four records written to {@code part-m-00000}.
   */
  @Test
  public void testMapReduce() throws Exception {

    Path tmpDir = getTestTempDirPath();
    Path mrOutputDir = new Path(tmpDir, "mail-archives-out-mr");
    Configuration configuration = getConfiguration();
    FileSystem fs = FileSystem.get(configuration);

    String[] args = {
      "-Dhadoop.tmp.dir=" + configuration.get("hadoop.tmp.dir"),
      "--input", inputDir.getAbsolutePath(),
      "--output", mrOutputDir.toString(),
      "--charset", "UTF-8",
      "--keyPrefix", "TEST",
      "--method", "mapreduce",
      "--body", "--subject", "--separator", ""
    };

    // run the application's main method
    SequenceFilesFromMailArchives.main(args);

    // app should create a single SequenceFile named "part-m-00000" in the output dir
    Path partFile = mrOutputDir.suffix("/part-m-00000");
    FileStatus[] fileStatuses = fs.listStatus(partFile);
    Assert.assertEquals(1, fileStatuses.length); // only one
    Assert.assertEquals("part-m-00000", fileStatuses[0].getPath().getName());

    // try-with-resources so the underlying reader is released even when an
    // assertion fails before the iterator has been exhausted
    try (SequenceFileIterator<Text, Text> iterator =
             new SequenceFileIterator<>(partFile, true, configuration)) {
      // this test expects the archive in the deeper directory first
      File parentFileSubSubDir = subSubDirArchiveKey();
      assertNextRecord("First", iterator, parentFileSubSubDir, testVars[0], true);
      assertNextRecord("Second", iterator, parentFileSubSubDir, testVars[1], true);

      // test other file
      File parentFile = subDirArchiveKey();
      assertNextRecord("Third", iterator, parentFile, testVars[0], true);
      assertNextRecord("Fourth", iterator, parentFile, testVars[1], true);

      Assert.assertFalse("Only four key/value pairs expected!", iterator.hasNext());
    }
  }

  /** Creates {@code dir}, failing the test if it does not exist afterwards. */
  private static void createDirectory(File dir) {
    Assert.assertTrue("Could not create directory " + dir.getAbsolutePath(),
        dir.mkdir() || dir.isDirectory());
  }

  /** Writes {@link #testMailMessages} to {@code gzFile} as gzip-compressed UTF-8 text. */
  private static void writeGzippedTestMessages(File gzFile) throws Exception {
    // two resources so the file stream is closed even if the gzip wrapper cannot be created
    try (FileOutputStream fileOut = new FileOutputStream(gzFile);
         GZIPOutputStream gzOut = new GZIPOutputStream(fileOut)) {
      gzOut.write(testMailMessages.getBytes("UTF-8"));
      gzOut.finish();
    }
  }

  /** Expected key prefix for messages read from {@code subdir/mail-messages.gz}. */
  private static File subDirArchiveKey() {
    return new File(new File(new File("TEST"), "subdir"), "mail-messages.gz");
  }

  /** Expected key prefix for messages read from {@code subdir/subsubdir/mail-messages-2.gz}. */
  private static File subSubDirArchiveKey() {
    return new File(new File(new File(new File("TEST"), "subdir"), "subsubdir"), "mail-messages-2.gz");
  }

  /**
   * Asserts that {@code iterator} has another record and that it matches the
   * expected message.
   *
   * @param ordinal word used in the failure message ("First", "Second", ...)
   * @param iterator iterator positioned just before the record to check
   * @param archiveKey expected key prefix of the archive the message came from
   * @param expectedMessage one row of {@link #testVars}: message id, subject, body
   * @param normalizeSeparators whether to replace {@code /} with {@code \} in the
   *        actual key on Windows before comparing
   */
  private static void assertNextRecord(String ordinal,
                                       SequenceFileIterator<Text, Text> iterator,
                                       File archiveKey,
                                       String[] expectedMessage,
                                       boolean normalizeSeparators) {
    Assert.assertTrue(ordinal + " key/value pair not found!", iterator.hasNext());
    Pair<Text, Text> record = iterator.next();

    // The iterator is opened with instance reuse enabled, so the record is
    // fully checked here, before the iterator is advanced again.
    String actualKey = record.getFirst().toString();
    if (normalizeSeparators && SystemUtils.IS_OS_WINDOWS) {
      actualKey = actualKey.replace("/", "\\");
    }
    Assert.assertEquals(new File(archiveKey, expectedMessage[0]).toString(), actualKey);
    Assert.assertEquals(expectedMessage[1] + expectedMessage[2], record.getSecond().toString());
  }

  // Messages extracted and made anonymous from the ASF mail archives.
  // Each row is { message id, subject, body }.
  private static final String[][] testVars = {
    new String[] {
      "user@example.com",
      "Ant task for JDK1.1 collections build option",
      "\nThis is just a test message\n--\nTesty McTester\n"
    },
    new String[] {
      "somebody@example.com",
      "Problem with build files in several directories",
      "\nHi all,\nThis is another test message.\nRegards,\nAnother Test\n"
    }
  };

  // Raw mbox-style text holding both messages from testVars; must be declared
  // after testVars because its initializer reads it.
  private static final String testMailMessages =
    "From user@example.com  Mon Jul 24 19:13:53 2000\n"
      + "Return-Path: <user@example.com>\n"
      + "Mailing-List: contact ant-user-help@jakarta.apache.org; run by ezmlm\n"
      + "Delivered-To: mailing list ant-user@jakarta.apache.org\n"
      + "Received: (qmail 49267 invoked from network); 24 Jul 2000 19:13:53 -0000\n"
      + "Message-ID: <" + testVars[0][0] + ">\n"
      + "From: \"Testy McTester\" <user@example.com>\n"
      + "To: <ant-user@jakarta.apache.org>\n"
      + "Subject: " + testVars[0][1] + '\n'
      + "Date: Mon, 24 Jul 2000 12:24:56 -0700\n"
      + "MIME-Version: 1.0\n"
      + "Content-Type: text/plain;\n"
      + "  charset=\"Windows-1252\"\n"
      + "Content-Transfer-Encoding: 7bit\n"
      + "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"
      + testVars[0][2] + '\n'
      + "From somebody@example.com  Wed Jul 26 11:32:16 2000\n"
      + "Return-Path: <somebody@example.com>\n"
      + "Mailing-List: contact ant-user-help@jakarta.apache.org; run by ezmlm\n"
      + "Delivered-To: mailing list ant-user@jakarta.apache.org\n"
      + "Received: (qmail 73966 invoked from network); 26 Jul 2000 11:32:16 -0000\n"
      + "User-Agent: Microsoft-Outlook-Express-Macintosh-Edition/5.02.2022\n"
      + "Date: Wed, 26 Jul 2000 13:32:08 +0200\n"
      + "Subject: " + testVars[1][1] + '\n'
      + "From: Another Test <somebody@example.com>\n"
      + "To: <ant-user@jakarta.apache.org>\n"
      + "Message-Id: <" + testVars[1][0] + ">\n"
      + "Mime-Version: 1.0\n"
      + "Content-Type: text/plain; charset=\"US-ASCII\"\n"
      + "Content-Transfer-Encoding: 7bit\n"
      + "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"
      + testVars[1][2];
}