/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.mbox;
import static org.apache.tika.TikaTest.assertContains;
import static org.junit.Assert.assertEquals;
import java.io.InputStream;
import java.util.Map;
import org.apache.tika.detect.TypeDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Before;
import org.junit.Test;
import org.xml.sax.ContentHandler;
/**
 * Tests for {@link MboxParser}, run against the mbox fixtures stored under
 * {@code /test-documents/} on the test classpath.
 */
public class MboxParserTest {

    /** Parse context handed to every {@code parse} call; carries the {@link AutoDetectParser}. */
    protected ParseContext recursingContext;
    private Parser autoDetectParser;
    private TypeDetector typeDetector;
    private MboxParser mboxParser;

    /**
     * Opens a classpath resource as a stream.
     *
     * @param name absolute resource name, e.g. {@code /test-documents/simple.mbox}
     * @return an open stream on the resource; never {@code null}
     * @throws AssertionError if the resource cannot be found
     */
    private static InputStream getStream(String name) {
        // Resolve against this test class. The previous MboxParserTest.class.getClass()
        // evaluates to java.lang.Class, so the lookup was delegated to the loader/module of
        // java.lang.Class instead of the one that holds the test resources.
        InputStream stream = MboxParserTest.class.getResourceAsStream(name);
        if (stream == null) {
            // Fail here with the resource name rather than passing null on to the parser.
            throw new AssertionError("Test resource not found on classpath: " + name);
        }
        return stream;
    }

    /**
     * Builds a fresh parser and parse context before each test. Tracking is switched on
     * because the tests read per-mail metadata through {@code getTrackingMetadata()}.
     */
    @Before
    public void setUp() throws Exception {
        typeDetector = new TypeDetector();
        autoDetectParser = new AutoDetectParser(typeDetector);
        recursingContext = new ParseContext();
        recursingContext.set(Parser.class, autoDetectParser);
        mboxParser = new MboxParser();
        mboxParser.setTracking(true);
    }

    /**
     * Parses {@code simple.mbox} and checks the extracted text, the container content type,
     * and the content type and {@code MboxParser-from} value of each of the two tracked mails.
     */
    @Test
    public void testSimple() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        try (InputStream stream = getStream("/test-documents/simple.mbox")) {
            mboxParser.parse(stream, handler, metadata, recursingContext);
        }

        String content = handler.toString();
        assertContains("Test content 1", content);
        assertContains("Test content 2", content);
        assertEquals("application/mbox", metadata.get(Metadata.CONTENT_TYPE));

        Map<Integer, Metadata> mailsMetadata = mboxParser.getTrackingMetadata();
        assertEquals("Nb. Of mails", 2, mailsMetadata.size());

        Metadata mail1 = mailsMetadata.get(0);
        assertEquals("message/rfc822", mail1.get(Metadata.CONTENT_TYPE));
        assertEquals("envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009", mail1.get("MboxParser-from"));

        Metadata mail2 = mailsMetadata.get(1);
        assertEquals("message/rfc822", mail2.get(Metadata.CONTENT_TYPE));
        assertEquals("envelope-sender-mailbox-name Mon Jun 01 11:00:00 2010", mail2.get("MboxParser-from"));
    }

    /**
     * Parses {@code headers.mbox} and checks the metadata recorded for its single mail:
     * creation date, creator/author, subject, content type, sender and return path.
     */
    @Test
    public void testHeaders() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        try (InputStream stream = getStream("/test-documents/headers.mbox")) {
            mboxParser.parse(stream, handler, metadata, recursingContext);
        }

        assertContains("Test content", handler.toString());
        assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());

        Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
        assertEquals("2009-06-10T03:58:45Z", mailMetadata.get(TikaCoreProperties.CREATED));
        assertEquals("<author@domain.com>", mailMetadata.get(TikaCoreProperties.CREATOR));
        assertEquals("subject", mailMetadata.get(Metadata.SUBJECT));
        assertEquals("<author@domain.com>", mailMetadata.get(Metadata.AUTHOR));
        assertEquals("message/rfc822", mailMetadata.get(Metadata.CONTENT_TYPE));
        assertEquals("author@domain.com", mailMetadata.get("Message-From"));
        assertEquals("<name@domain.com>", mailMetadata.get("MboxParser-return-path"));
    }

    /**
     * Parses {@code multiline.mbox} and checks that its single mail exposes the expected
     * one-line value under {@code MboxParser-received}.
     */
    @Test
    public void testMultilineHeader() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        // NOTE(review): presumably the fixture folds the Received header across several
        // lines; the fixture is not visible here, so confirm against the file.
        try (InputStream stream = getStream("/test-documents/multiline.mbox")) {
            mboxParser.parse(stream, handler, metadata, recursingContext);
        }

        assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());

        Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
        assertEquals("from xxx by xxx with xxx; date", mailMetadata.get("MboxParser-received"));
    }

    /**
     * Parses {@code quoted.mbox} and checks that both the plain body text and the
     * {@code "> quoted stuff"} line appear in the extracted content.
     */
    @Test
    public void testQuoted() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        try (InputStream stream = getStream("/test-documents/quoted.mbox")) {
            mboxParser.parse(stream, handler, metadata, recursingContext);
        }

        assertContains("Test content", handler.toString());
        assertContains("> quoted stuff", handler.toString());
    }

    /**
     * Parses {@code complex.mbox}, expects three tracked mails, and checks the subject/title,
     * author/creator and recipient of the first one plus a phrase from the extracted text.
     */
    @Test
    public void testComplex() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        try (InputStream stream = getStream("/test-documents/complex.mbox")) {
            mboxParser.parse(stream, handler, metadata, recursingContext);
        }

        assertEquals("Nb. Of mails", 3, mboxParser.getTrackingMetadata().size());

        Metadata firstMail = mboxParser.getTrackingMetadata().get(0);
        assertEquals("Re: question about when shuffle/sort start working", firstMail.get(Metadata.SUBJECT));
        assertEquals("Re: question about when shuffle/sort start working", firstMail.get(TikaCoreProperties.TITLE));
        assertEquals("Jothi Padmanabhan <jothipn@yahoo-inc.com>", firstMail.get(Metadata.AUTHOR));
        assertEquals("Jothi Padmanabhan <jothipn@yahoo-inc.com>", firstMail.get(TikaCoreProperties.CREATOR));
        assertEquals("core-user@hadoop.apache.org", firstMail.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
        assertContains("When a Mapper completes", handler.toString());
    }
}