/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.text;
import java.io.File;
import java.io.FileOutputStream;
import java.util.zip.GZIPOutputStream;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
import org.apache.mahout.utils.MahoutTestCase;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
/**
 * Test case for the SequenceFilesFromMailArchives command-line application.
 *
 * <p>The fixture writes two test mail messages into a gzipped file inside a nested
 * input directory. The test then runs the application's {@code main} method and
 * inspects the sequence file it is expected to produce.</p>
 */
public final class SequenceFilesFromMailArchivesTest extends MahoutTestCase {

  // TODO: Negative tests

  private File inputDir = null;
  private File outputDir = null;

  /**
   * Create the input and output directories needed for testing
   * the SequenceFilesFromMailArchives application, and populate the input
   * directory with {@code subdir/mail-messages.gz}.
   *
   * @throws Exception if the temp directories or the gzipped fixture cannot be created
   */
  @Override
  @Before
  public void setUp() throws Exception {
    super.setUp();
    inputDir = getTestTempDir("mail-archives-in");
    outputDir = getTestTempDir("mail-archives-out");

    // write test mail messages to a gzipped file in a nested directory
    File subDir = new File(inputDir, "subdir");
    // Fail here, with a clear message, rather than later with an unrelated
    // FileNotFoundException when the gzip file is opened.
    Assert.assertTrue("Could not create input sub-directory " + subDir,
        subDir.mkdir() || subDir.isDirectory());

    File gzFile = new File(subDir, "mail-messages.gz");
    GZIPOutputStream gzOut = null;
    try {
      gzOut = new GZIPOutputStream(new FileOutputStream(gzFile));
      gzOut.write(TEST_MAIL_MESSAGES.getBytes("UTF-8"));
      // finish() writes the remaining compressed data before the stream is closed
      gzOut.finish();
    } finally {
      Closeables.closeQuietly(gzOut);
    }
  }

  /**
   * Test the main method of the SequenceFilesFromMailArchives
   * command-line application.
   *
   * <p>Expects a single {@code chunk-0} file in the output directory holding exactly
   * two records, one per test message, in the order they appear in the archive.</p>
   *
   * @throws Exception if the application or the sequence file reader fails
   */
  @Test
  public void testMain() throws Exception {
    String[] args = {
      "--input", inputDir.getAbsolutePath(),
      "--output", outputDir.getAbsolutePath(),
      "--charset", "UTF-8",
      "--keyPrefix", "TEST",
      "--body", "--subject", "--separator", ""
    };

    // run the application's main method
    SequenceFilesFromMailArchives.main(args);

    // app should create a single SequenceFile named "chunk-0"
    // in the output dir
    File expectedChunkFile = new File(outputDir, "chunk-0");
    String expectedChunkPath = expectedChunkFile.getAbsolutePath();
    Assert.assertTrue("Expected chunk file "+expectedChunkPath+" not found!", expectedChunkFile.isFile());

    // keys are expected to look like <keyPrefix>/subdir/mail-messages.gz/<message id>
    File parentFile = new File(new File(new File("TEST"), "subdir"), "mail-messages.gz");

    Configuration conf = new Configuration();
    SequenceFileIterator<Text,Text> iterator =
        new SequenceFileIterator<Text,Text>(new Path(expectedChunkPath), true, conf);
    try {
      assertNextRecord("First key/value pair not found!", iterator, parentFile, TEST_VARS[0]);
      assertNextRecord("Second key/value pair not found!", iterator, parentFile, TEST_VARS[1]);
      Assert.assertFalse("Only two key/value pairs expected!", iterator.hasNext());
    } finally {
      // Release the underlying reader even when an assertion above fails
      // before the iterator has been exhausted.
      Closeables.closeQuietly(iterator);
    }
  }

  /**
   * Asserts that the iterator has another record and that it matches one fixture row.
   *
   * @param missingMessage failure message used when the iterator has no further record
   * @param iterator       iterator over the generated sequence file
   * @param parentFile     expected key prefix; the message id is appended as a child path
   * @param expected       fixture row laid out as { message id, subject, body }
   */
  private static void assertNextRecord(String missingMessage,
                                       SequenceFileIterator<Text,Text> iterator,
                                       File parentFile,
                                       String[] expected) {
    Assert.assertTrue(missingMessage, iterator.hasNext());
    Pair<Text,Text> record = iterator.next();
    Assert.assertEquals(new File(parentFile, expected[0]).toString(), record.getFirst().toString());
    // the test passes an empty separator, so subject and body are expected back to back
    Assert.assertEquals(expected[1]+expected[2], record.getSecond().toString());
  }

  // Messages extracted and anonymized from the ASF mail archives.
  // Each row is { message id, subject, body }.
  // Must stay declared before TEST_MAIL_MESSAGES, whose initializer reads it.
  private static final String[][] TEST_VARS = {
    new String[] {
      "user@example.com",
      "Ant task for JDK1.1 collections build option",
      "\nThis is just a test message\n--\nTesty McTester\n"
    },
    new String[] {
      "somebody@example.com",
      "Problem with build files in several directories",
      "\nHi all,\nThis is another test message.\nRegards,\nAnother Test\n"
    }
  };

  // Raw archive content written to the gzipped input file: two messages built from TEST_VARS.
  private static final String TEST_MAIL_MESSAGES =
    "From user@example.com Mon Jul 24 19:13:53 2000\n"+
    "Return-Path: <user@example.com>\n"+
    "Mailing-List: contact ant-user-help@jakarta.apache.org; run by ezmlm\n"+
    "Delivered-To: mailing list ant-user@jakarta.apache.org\n"+
    "Received: (qmail 49267 invoked from network); 24 Jul 2000 19:13:53 -0000\n"+
    "Message-ID: <"+TEST_VARS[0][0]+">\n"+
    "From: \"Testy McTester\" <user@example.com>\n"+
    "To: <ant-user@jakarta.apache.org>\n"+
    "Subject: "+TEST_VARS[0][1]+ '\n' +
    "Date: Mon, 24 Jul 2000 12:24:56 -0700\n"+
    "MIME-Version: 1.0\n"+
    "Content-Type: text/plain;\n"+
    " charset=\"Windows-1252\"\n"+
    "Content-Transfer-Encoding: 7bit\n"+
    "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"+
    TEST_VARS[0][2]+'\n' +
    "From somebody@example.com Wed Jul 26 11:32:16 2000\n"+
    "Return-Path: <somebody@example.com>\n"+
    "Mailing-List: contact ant-user-help@jakarta.apache.org; run by ezmlm\n"+
    "Delivered-To: mailing list ant-user@jakarta.apache.org\n"+
    "Received: (qmail 73966 invoked from network); 26 Jul 2000 11:32:16 -0000\n"+
    "User-Agent: Microsoft-Outlook-Express-Macintosh-Edition/5.02.2022\n"+
    "Date: Wed, 26 Jul 2000 13:32:08 +0200\n"+
    "Subject: "+TEST_VARS[1][1]+ '\n' +
    "From: Another Test <somebody@example.com>\n"+
    "To: <ant-user@jakarta.apache.org>\n"+
    "Message-Id: <"+TEST_VARS[1][0]+">\n"+
    "Mime-Version: 1.0\n"+
    "Content-Type: text/plain; charset=\"US-ASCII\"\n"+
    "Content-Transfer-Encoding: 7bit\n"+
    "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"+
    TEST_VARS[1][2];
}