package com.google.sitebricks.mail.imap; import com.google.common.base.Charsets; import com.google.common.base.Objects; import com.google.common.collect.Multimap; import com.google.common.io.ByteStreams; import com.google.common.io.CharStreams; import com.google.common.io.Resources; import org.testng.annotations.Test; import javax.mail.MessagingException; import javax.mail.internet.MimeUtility; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URL; import java.text.ParseException; import java.util.Collection; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotNull; import static org.testng.Assert.assertNull; import static org.testng.Assert.assertTrue; /** * @author dhanji@gmail.com (Dhanji R. Prasanna) */ public class MessageBodyExtractorTest { private static final Pattern MESSAGE_LOG_REGEX = Pattern.compile("^.* DEBUG c\\.g\\.s\\.mail\\.MailClientHandler: Message received \\["); static { java.util.logging.ConsoleHandler fh = new java.util.logging.ConsoleHandler(); java.util.logging.Logger.getLogger("").addHandler(fh); java.util.logging.Logger.getLogger("").setLevel(java.util.logging.Level.FINEST); } // @Test //DISABLED. Only use this test for debugging. public final void testAgainstFewerMessagesParsedThanExistError() throws IOException { List<String> data = Resources.readLines(MessageBodyExtractorTest.class.getResource( "broken_rfc822.log"), Charsets.UTF_8); // List<String> redacted = Lists.newArrayList(); // for (String line : data) { // Matcher matcher = MESSAGE_LOG_REGEX.matcher(line); // if (matcher.find()) { // line = matcher.replaceAll(""); // redacted.add(line.substring(0, line.lastIndexOf("]"))); // } // } // List<Message> extract = new MessageBodyExtractor().extract(data); for (Message message : extract) { Collection<String> messageId = message.getHeaders().get("Message-Id"); if (messageId.isEmpty()) messageId = message.getHeaders().get("Message-ID"); System.out.println(messageId + " " + message.getHeaders().get("Subject")); } System.out.println("Total: " + extract.size()); } /** * WARNING: THIS TEST IS DATA-DEPENDENT! */ @Test public final void testAwkwardGmailEmailStreamUsingTruncatorGroping() throws IOException, ParseException { testAwkwardGmailEmailStream(true); } /** * WARNING: THIS TEST IS DATA-DEPENDENT! */ @Test public final void testAwkwardGmailEmailStreamUsingLengths() throws IOException, ParseException { testAwkwardGmailEmailStream(false); } public final void testAwkwardGmailEmailStream(boolean forceTruncatorGroping) throws IOException, ParseException { final List<String> lines = Resources.readLines(MessageBodyExtractorTest.class.getResource("fetch_bodies.txt"), Charsets.UTF_8); List<Message> extract = new MessageBodyExtractor(forceTruncatorGroping, 999999999999999999L).extract(lines); assertEquals(extract.size(), 23); // ------------------------------------------------------------ // First message. // Folded headers with tabs + spaces, repeat headers, one body. Message message = extract.get(0); String expectedHeaders = CharStreams.toString(new InputStreamReader(MessageBodyExtractorTest.class.getResourceAsStream("fetch_headers_1.txt"))); assertEquals(message.getHeaders().toString(), expectedHeaders); assertEquals(1, message.getBodyParts().size()); Message.BodyPart part1 = message.getBodyParts().get(0); assertNull(part1.getBinBody()); assertTrue(part1.getHeaders().isEmpty()); // We have to compare the raw bytes because the encoded string comes in as ISO-8859-1 // And Java literals are encoded as UTF-8. assertEquals(part1.getBody().getBytes(), ByteStreams.toByteArray( MessageBodyExtractorTest.class.getResourceAsStream("fetch_body_1_raw.dat"))); assertEquals(new String(part1.getBody().getBytes()), new String(ByteStreams.toByteArray( MessageBodyExtractorTest.class.getResourceAsStream("fetch_body_1_raw.dat")))); // ------------------------------------------------------------ // Second message. // missing content-transfer-encoding and mimetype. // Should parse it as a UTF-8 text/plain message even though no mimetype is specified, // and 7bit CTE. message = extract.get(1); assertTrue(message.getHeaders().get("Content-Transfer-Encoding").isEmpty()); assertTrue(message.getHeaders().get("Content-Type").isEmpty()); assertHeaderEquals(message.getHeaders(), "Subject", "Re: Slow to Respond"); assertEquals(1, message.getBodyParts().size()); part1 = message.getBodyParts().get(0); assertTrue(part1.getHeaders().isEmpty()); assertNull(part1.getBinBody()); assertEquals(part1.getBody(), CharStreams.toString( new InputStreamReader(MessageBodyExtractorTest.class.getResourceAsStream("fetch_body_2.txt")))); // ------------------------------------------------------------ // Third message. // multipart 2 parts, 1-level deep only. message = extract.get(2); assertEquals(message.getHeaders().toString(), "{Message-ID=[<askdopaksdNq6o3M+veqCfc+x3m1PxeLn-raisdj" + "@mail.gmail.com>], Subject=[Re: Slow to Respond], Content-Type=[multipart/alternative; " + "boundary=\"_000_9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01:morg_\";], " + "X-Sitebricks-Test=[multipart-alternatives;quoted-headers]}"); assertEquals(2, message.getBodyParts().size()); part1 = message.getBodyParts().get(0); Message.BodyPart part2 = message.getBodyParts().get(1); assertEquals(1, part1.getHeaders().size()); assertTrue(Parsing.startsWithIgnoreCase(part1.getHeaders().get("Content-Type").iterator().next(), "text/plain")); assertEquals(2, part2.getHeaders().size()); assertTrue(Parsing.startsWithIgnoreCase(part2.getHeaders() .get("Content-Type") .iterator() .next(), "text/html")); assertEquals(1, part2.getHeaders().get("MIME-Version").size()); assertHeaderEquals(part2.getHeaders(), "MIME-Version", "1.0"); assertEquals(part2.getBody(), "<body>\r\n" + "I am OOO and may have sporadic access to email.\r\n" + "</body>\r\n"); // ------------------------------------------------------------ // Fourth message. // multipart 2 parts, 1-level deep only, sameline-rparen, preamble, epilogue. message = extract.get(3); assertEquals(message.getHeaders().toString(), "{Delivered-To=[dhanji@gmail.com], Date=[Thu, 8 Sep 2011 17:07:44 -0700]," + " Message-ID=[CAEEYBPaoksdpoak+veqCfc+x3m1PxeLn-raisdj@mail.gmail.com]," + " Subject=[Re: Slow to Respond], MIME-Version=[1.0], Content-Disposition=[inline]," + " Content-Type=[multipart/alternative;" + " boundary=_000_:9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_]," + " X-Sitebricks-Test=[multipart-alternatives;quoted-headers;sameline-rparen;preamble]}"); assertEquals(2, message.getBodyParts().size()); part1 = message.getBodyParts().get(0); part2 = message.getBodyParts().get(1); assertEquals(1, part1.getHeaders().size()); assertTrue(Parsing.startsWithIgnoreCase(part1.getHeaders().get("Content-Type").iterator().next(), "text/plain")); assertEquals(part1.getBody(), "I am OOO and may have sporadic access to email.\r\n\r\n" + "--\r\n\r\n"); assertEquals(3, part2.getHeaders().size()); assertTrue(Parsing.startsWithIgnoreCase(part2.getHeaders() .get("Content-Type") .iterator() .next(), "text/html")); assertEquals(0, part2.getHeaders().get("MIME-Version").size()); assertHeaderEquals(part2.getHeaders(), "Content-Disposition", "something"); assertHeaderEquals(part2.getHeaders(), "Content-Doodle", "somethingelse"); assertEquals(part2.getBody(), "<body>\r\n" + "I am OOO and may have sporadic access to email.\r\n" + "</body>\r\n"); // ------------------------------------------------------------ // Fifth message. // multipart 2 parts, 1-level deep only, tight-preamble/epilogue. message = extract.get(4); assertEquals(message.getHeaders().toString(), "{Delivered-To=[dhanji@gmail.com], Date=[Thu, 8 Sep 2011 17:07:44 -0700]," + " Message-ID=[CAEEYBPaoksdpoak+veqCfc+x3m1PxeLn-raisdj@mail.gmail.com]," + " Subject=[Re: Slow to Respond], MIME-Version=[1.0], Content-Disposition=[inline]," + " Content-Type=[multipart/alternative;" + " boundary=_000_9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_]," + " X-Sitebricks-Test=[multipart-alternatives;quoted-headers;preamble;epilogue]}"); assertEquals(2, message.getBodyParts().size()); part1 = message.getBodyParts().get(0); part2 = message.getBodyParts().get(1); assertEquals(1, part1.getHeaders().size()); assertTrue(Parsing.startsWithIgnoreCase(part1.getHeaders().get("Content-Type").iterator().next(), "text/plain")); assertEquals(part1.getBody(), "I am OOO and may have sporadic access to email.\r\n\r\n" + "> OK Success\r\n\r\n" + "--\r\n"); assertEquals(3, part2.getHeaders().size()); assertTrue(Parsing.startsWithIgnoreCase(part2.getHeaders() .get("Content-Type") .iterator() .next(), "text/html")); assertEquals(0, part2.getHeaders().get("MIME-Version").size()); assertHeaderEquals(part2.getHeaders(), "Content-Disposition", "something"); assertHeaderEquals(part2.getHeaders(), "Content-Doodle", "somethingelse"); assertEquals(part2.getBody(), "<body>\r\n" + "I am OOO and may have sporadic access to email.\r\n" + "</body>\r\n"); // ------------------------------------------------------------ // Sixth message. // multipart 2 parts each, 2-level deep, preambles/epilogues. message = extract.get(5); assertNestedMultipart2LevelDeep(message, "<CAEEYBPNq6o3M+aisjd+x3m1PxeLn-raisdj@mail.gmail.co" + "m>", "_000_9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_"); // ------------------------------------------------------------ // Seventh message. // same as sixth but with wide spacing and different ID. message = extract.get(6); assertNestedMultipart2LevelDeep(message, "<CAEEYBPNq6o3Mm1PxeLn-raisdj@mail.gmail.com>", "_000_9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_"); // ------------------------------------------------------------ // Eigth message. // same as sixth but with compact spacing and different ID. message = extract.get(7); assertNestedMultipart2LevelDeep(message, "<SPLAT_CAEEYBPNq6o3Mm1PxeLn-raisdj@mail.gmail.com>", "_000_9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_"); // ------------------------------------------------------------ // Ninth message. // same as eigth but re-using the SAME boundary for inner and outer parts. message = extract.get(8); assertNestedMultipart2LevelDeep(message, "<SPLAT_CAEEYBPNq6o3Mm1PxeLn-raisdj@mail.gmail.com>", "----NextPart_1293809.9123_LLas"); // ------------------------------------------------------------ // Tenth message. // multipart 3 parts each, 3-level deep, preambles/epilogues. message = extract.get(9); assertComplexNestedStructure(message); // ------------------------------------------------------------ // Eleventh message. // multipart 3 parts, 1-level deep, message/rfc822 nested message with quoted-printable. message = extract.get(10); assertRfc822(message, "quoted-printable"); // ------------------------------------------------------------ // multipart 3 parts, 1-level deep, message/rfc822 nested message. message = extract.get(11); assertRfc822(message, null); // ------------------------------------------------------------ // multipart 3 parts, message/rfc822 nested multipart message. message = extract.get(12); assertMultipartRfc822(message); // ------------------------------------------------------------ // multipart 3 parts, message/rfc822 nested message with multipart and attachment. message = extract.get(13); assertRfc822withAttachment(message); // ------------------------------------------------------------ // Test mixed case in Content-Type. message = extract.get(14); assertEquals(2, message.getBodyParts().size()); // ------------------------------------------------------------ // Test mixed case in Content-Type. message = extract.get(15); assertEquals(1, message.getBodyParts().size()); assertEquals(message.getBodyParts().get(0).getBody(), "Danke für die Weihnachtswünsche! Viele Grüße.\r\n"); // ------------------------------------------------------------ // This one is intentionally broken and forces terminator groping, // check that we get what we expect. message = extract.get(16); assertEquals(1, message.getBodyParts().size()); assertEquals(message.getBodyParts().get(0).getBody(), "the message body\r\n)\r\n\r\n45988 OK Success\r\n"); message = extract.get(17); assertEquals(1, message.getBodyParts().size()); if (forceTruncatorGroping) assertEquals(message.getBodyParts().get(0).getBody(), "fake ending\r\n\r\n"); else assertEquals(message.getBodyParts().get(0).getBody(), "fake ending\r\n\r\n)\r\n10 OK Success\r\n"); // ------------------------------------------------------------ // Many parts, with verified length as sent by gmail. message = extract.get(18); assertEquals(4, message.getBodyParts().size()); // ------------------------------------------------------------ // Awkward length boundary. message = extract.get(19); assertEquals(1, message.getBodyParts().size()); assertEquals(message.getBodyParts().get(0).getBody(), "the message body\r\n"); // ------------------------------------------------------------ // Invalid body length, but still expect correct parsing message = extract.get(20); assertEquals(1, message.getBodyParts().size()); assertEquals(message.getBodyParts().get(0).getBody(), "the message body\r\n"); // ------------------------------------------------------------ // Multipart with base64 encoding of plain text. message = extract.get(21); assertEquals(1, message.getBodyParts().size()); assertEquals(message.getBodyParts().get(0).getBody(), "hi\n\n"); // ------------------------------------------------------------ // Accept bad content transfer encoding message = extract.get(22); assertEquals(1, message.getBodyParts().size()); assertEquals(message.getBodyParts().get(0).getBody(), "Danke für die Weihnachtswünsche! Viele Grüße.\r\n"); } private void assertRfc822(Message message, String contentTransferEncoding) { assertEquals(3, message.getBodyParts().size()); Message.BodyPart part1; Message.BodyPart part2; Message.BodyPart part3; part1 = message.getBodyParts().get(0); part2 = message.getBodyParts().get(1); part3 = message.getBodyParts().get(2); assertNotNull(part1); assertNotNull(part2); assertNotNull(part3); assertTrue(Parsing.startsWithIgnoreCase(part1.getHeaders().get("Content-Type").iterator().next(), "text/plain")); assertTrue(Parsing.startsWithIgnoreCase(part3.getHeaders().get("Content-Type").iterator().next(), "text/plain")); // Message 2 is an encapsulated rfc822 message. assertTrue( Parsing.startsWithIgnoreCase(part2.getHeaders().get("Content-Type").iterator().next(), "message/rfc822")); if (contentTransferEncoding != null) assertTrue( Parsing.startsWithIgnoreCase(part2.getHeaders().get("Content-Transfer-Encoding").iterator().next(), contentTransferEncoding)); assertNull(part2.getBody()); assertNull(part2.getBinBody()); assertEquals(1, part2.getBodyParts().size()); // It should contain its content as a child message. Message.BodyPart rfc822 = part2.getBodyParts().get(0); assertNotNull(rfc822); assertEquals(rfc822.getHeaders().size(), 7); assertEquals(rfc822.getHeaders().get("Message-ID").iterator().next(), "<9632091.970.1320441146867.JavaMail.geo-discussion-forums@yqie15>"); assertEquals(rfc822.getHeaders().get("In-Reply-To").iterator().next(), "<AANLkTikNOzOVjj=mS8nFXoiuW=LPufKKsK_SOPEXdCby@mail.gmail.com>"); assertEquals(rfc822.getHeaders().get("X-Annoy").iterator().next(), "dhanji"); assertEquals(rfc822.getHeaders().get("From").iterator().next(), "example@example.com"); assertEquals(rfc822.getHeaders().get("To").iterator().next(), "example2@example.com"); assertEquals(rfc822.getHeaders().get("Subject").iterator().next(), "As basic as it gets"); assertEquals(rfc822.getHeaders().get("Content-Type").iterator().next(), "text/plain"); assertNull(rfc822.getBinBody()); assertNotNull(rfc822.getBody()); assertEquals("This is the plain text body of the message. Note the blank line\r\n" + "between the header information and the body of the message.\r\n\r\n", rfc822.getBody()); } private void assertMultipartRfc822(Message message) { // Assume all the stuff about the non-rfc822 matches the previous case. // skip right down the the nested message. assertEquals(3, message.getBodyParts().size()); Message.BodyPart part2; part2 = message.getBodyParts().get(1); assertNotNull(part2); // Message 2 is an encapsulated rfc822 message. assertTrue( Parsing.startsWithIgnoreCase(part2.getHeaders().get("Content-Type").iterator().next(), "message/rfc822")); assertNull(part2.getBody()); assertNull(part2.getBinBody()); assertEquals(1, part2.getBodyParts().size()); // It should contain its content as a child message. Message.BodyPart rfc822 = part2.getBodyParts().get(0); assertNotNull(rfc822); assertEquals(rfc822.getHeaders().size(), 7); assertEquals(rfc822.getHeaders().get("Message-ID").iterator().next(), "<9632091.970.1320441146867.JavaMail.geo-discussion-forums@yqie15>"); assertEquals(rfc822.getHeaders().get("In-Reply-To").iterator().next(), "<AANLkTikNOzOVjj=mS8nFXoiuW=LPufKKsK_SOPEXdCby@mail.gmail.com>"); assertEquals(rfc822.getHeaders().get("X-Annoy").iterator().next(), "dhanji"); assertEquals(rfc822.getHeaders().get("From").iterator().next(), "example@example.com"); assertEquals(rfc822.getHeaders().get("To").iterator().next(), "example2@example.com"); assertEquals(rfc822.getHeaders().get("Subject").iterator().next(), "As basic as it gets"); assertEquals(rfc822.getHeaders().get("Content-Type").iterator().next(), "multipart/mixed; boundary=e89a8ff1c384d8017504b42beb91"); assertEquals(2, rfc822.getBodyParts().size()); Message.BodyPart sub1 = rfc822.getBodyParts().get(0); Message.BodyPart sub2 = rfc822.getBodyParts().get(1); assertEquals(sub1.getHeaders().get("Content-Type").iterator().next(), "text/plain; charset=ISO-8859-1"); assertNotNull(sub1.getBody()); assertNull(sub1.getBinBody()); assertEquals(sub2.getHeaders().get("Content-Type").iterator().next(), "text/plain; charset=ISO-8859-1"); assertNotNull(sub2.getBody()); assertNull(sub2.getBinBody()); } private void assertRfc822withAttachment(Message message) { // Assume all the stuff about the non-rfc822 matches the previous case. // skip right down the the nested message. assertEquals(message.getBodyParts().size(), 3); Message.BodyPart part2; part2 = message.getBodyParts().get(1); assertNotNull(part2); // Message 2 is an encapsulated rfc822 message. assertTrue( Parsing.startsWithIgnoreCase(part2.getHeaders().get("Content-Type").iterator().next(), "message/rfc822")); assertTrue( Parsing.startsWithIgnoreCase(part2.getHeaders().get("Content-Transfer-Encoding").iterator().next(), "quoted-printable")); assertNull(part2.getBody()); assertNull(part2.getBinBody()); assertEquals(1, part2.getBodyParts().size()); // It should contain its content as a child message. Message.BodyPart rfc822 = part2.getBodyParts().get(0); assertNotNull(rfc822); assertEquals(rfc822.getHeaders().size(), 7); assertEquals(rfc822.getHeaders().get("Message-ID").iterator().next(), "<9632091.970.1320441146867.JavaMail.geo-discussion-forums@yqie15>"); assertEquals(rfc822.getHeaders().get("In-Reply-To").iterator().next(), "<AANLkTikNOzOVjj=mS8nFXoiuW=LPufKKsK_SOPEXdCby@mail.gmail.com>"); assertEquals(rfc822.getHeaders().get("X-Annoy").iterator().next(), "dhanji"); assertEquals(rfc822.getHeaders().get("From").iterator().next(), "example@example.com"); assertEquals(rfc822.getHeaders().get("To").iterator().next(), "example2@example.com"); assertEquals(rfc822.getHeaders().get("Subject").iterator().next(), "As basic as it gets"); assertEquals(rfc822.getHeaders().get("Content-Type").iterator().next(), "multipart/mixed; boundary=e89a8ff1c384d8017504b42beb91"); assertEquals(2, rfc822.getBodyParts().size()); Message.BodyPart sub1 = rfc822.getBodyParts().get(0); Message.BodyPart sub2 = rfc822.getBodyParts().get(1); assertEquals(sub1.getHeaders().get("Content-Type").iterator().next(), "text/plain; charset=ISO-8859-1"); assertNotNull(sub1.getBody()); assertNull(sub1.getBinBody()); assertEquals(sub2.getHeaders().get("Content-Type").iterator().next(), "text/csv; charset=US-ASCII; name=\"csv-demo.csv\""); assertNull(sub2.getBody()); assertNotNull(sub2.getBinBody()); } private void assertComplexNestedStructure(Message message) { Message.BodyPart part1; Message.BodyPart part2; Message.BodyPart part3; Message.BodyPart innerPart1; assertEquals(message.getHeaders().toString(), "{Delivered-To=[dhanji@gmail.com], Message-ID=[<id> id]," + " Subject=[Re: Slow to Respond], Content-Type=[multipart/alternative;" + " boundary = __BOUNDARY__]," + " X-Sitebricks-Test=[multipart-alternatives;quoted-headers;nested-parts;preamble]}"); assertEquals(3, message.getBodyParts().size()); part1 = message.getBodyParts().get(0); part2 = message.getBodyParts().get(1); part3 = message.getBodyParts().get(2); assertEquals(1, part1.getBodyParts().size()); innerPart1 = part1.getBodyParts().get(0); assertEquals(2, innerPart1.getBodyParts().size()); Message.BodyPart innerInnerPart1 = innerPart1.getBodyParts().get(0); assertEquals(3, innerInnerPart1.getBodyParts().size()); Message.BodyPart innerInnerPart2 = innerPart1.getBodyParts().get(1); assertEquals("Hi this is a body.\r\n\r\n", innerInnerPart2.getBody()); // Back to TOP level. assertEquals("This is a signature.\r\n", part2.getBody()); // Last top-level part. assertEquals(3, part3.getBodyParts().size()); assertEquals("This is a signature.\r\n" + "--__BOUNDARY-2__\r\n" + "--__BOUNDARY-2__--\r\n" + "--__BOUNDARY-1__--\r\n" + "fooled you--this is all textbody.\r\n\r\n", part3.getBodyParts().get(0).getBody()); assertEquals("Beric is dead.\r\n", part3.getBodyParts().get(1).getBody()); Message.BodyPart peric = part3.getBodyParts().get(2); assertEquals(2, peric.getBodyParts().size()); assertEquals(peric.getBodyParts().get(0).getBody(), "HI!\r\n\r\n"); assertHeaderEquals(peric.getBodyParts().get(0).getHeaders(), "Content-Type", "text/plain"); assertEquals(peric.getBodyParts().get(1).getBody(), "<body>yo</body>\r\n\r\n"); assertHeaderEquals(peric.getBodyParts().get(1).getHeaders(), "Content-Type", "text/html"); } private void assertNestedMultipart2LevelDeep(Message message, String id, String boundary) { Message.BodyPart part1; Message.BodyPart part2; assertEquals(message.getHeaders().toString(), "{Delivered-To=[dhanji@gmail.com], Message-ID=[" + id + "]," + " Subject=[Re: Slow to Respond], Content-Type=[multipart/alternative;" + " boundary=" + boundary + "]," + " X-Sitebricks-Test=[multipart-alternatives;quoted-headers;nested-parts;preamble]}"); assertEquals(2, message.getBodyParts().size()); part1 = message.getBodyParts().get(0); part2 = message.getBodyParts().get(1); // The first part should itself have two parts. assertEquals(2, part1.getBodyParts().size()); Message.BodyPart innerPart1 = part1.getBodyParts().get(0); Message.BodyPart innerPart2 = part1.getBodyParts().get(1); assertEquals(1, part1.getHeaders().size()); assertTrue( Parsing.startsWithIgnoreCase(part1.getHeaders().get("Content-Type").iterator().next(), "multipart/alternative")); // Inner parts should be as exepcted. assertTrue(Parsing.startsWithIgnoreCase(innerPart1.getHeaders().get("Content-Type").iterator().next(), "text/plain")); assertTrue(Parsing.startsWithIgnoreCase(innerPart2.getHeaders().get("Content-Type").iterator().next(), "text/html")); assertEquals(innerPart1.getBody(), "I am OOO and may have sporadic access to email.\r\n\r\n"); assertEquals(innerPart2.getBody(), "<body>\r\n" + "I am OOO and may have sporadic access to email.\r\n" + "</body>\r\n\r\n"); // The multipart body part itself has no body, instead has subparts. assertNull(part1.getBody()); assertEquals(2, part2.getHeaders().size()); assertTrue(Parsing.startsWithIgnoreCase(part2.getHeaders() .get("Content-Type") .iterator() .next(), "text/plain")); assertHeaderEquals(part2.getHeaders(), "MIME-Version", "1.0"); assertEquals(part2.getBody(), "This is a signature.\r\n\r\n"); } private static void assertHeaderEquals(Multimap<String, String> headers, String header, String value) { assertEquals(headers.get(header).iterator().next(), value); } @Test public final void testTypicalGmailEmail() throws IOException, ParseException { List<String> data = Resources.readLines(MessageBodyExtractorTest.class.getResource("fetch_body_data1.txt"), Charsets.UTF_8); List<Message> statuses = new MessageBodyExtractor().extract(data); for (int i = 0, statusesSize = statuses.size(); i < statusesSize; i++) { Message message = statuses.get(i); System.out.println(Objects.toStringHelper(message)); System.out.println("----------->"); for (Message.BodyPart bodyPart : message.getBodyParts()) { System.out.println(Objects.toStringHelper(bodyPart)); } } } @Test public final void testReadUnfoldedHeaders() throws IOException { URL assertions = MessageBodyExtractorTest.class.getResource("split_headers_assertion_1.txt"); List<String> data = Resources.readLines( MessageBodyExtractorTest.class.getResource("split_headers_fetch_data_1.txt"), Charsets.UTF_8); List<Message> messages = new MessageBodyExtractor().extract(data); assertEquals(1, messages.size()); Message message = messages.get(0); // Emit what we've just read back out in a similar format to the file. StringBuilder out = new StringBuilder(); for (Map.Entry<String, Collection<String>> entry : message.getHeaders().asMap().entrySet()) { for (String value : entry.getValue()) { out.append(entry.getKey()) .append(": ") .append(value) .append('\n'); } } // Compare the parsed headers with what we slurped in. assertEquals(out.toString().trim(), Resources.toString(assertions, Charsets.UTF_8).trim()); } @Test public final void testStartRegex() { Pattern pattern = MessageBodyExtractor.MESSAGE_START_PREFIX_REGEX; assertTrue(pattern.matcher("* 5 FETCH (UID 1001 BODY[] {2346}").find()); assertTrue(pattern.matcher("* 235 FETCH (UID 1001 BODY[]").find()); assertTrue(pattern.matcher("* 1 FETCH (UID 1001 BODY[] AOKSDOAKSD").find()); assertFalse(pattern.matcher(" * 1 FETCH (UID 1001 BODY[] AOKSDOAKSD").find()); assertFalse(pattern.matcher("X * 1 FETCH (UID 1001 BODY[] AOKSDOAKSD").find()); assertFalse(pattern.matcher(" 1 FETCH (UID1001 BODY[] AOKSDOAKSD").find()); assertFalse(pattern.matcher("* 1 FETCH(UID 1001 BODY [] AOKSDOAKSD").find()); assertFalse(pattern.matcher("* 1 FETCH(UID 1001BODY [] AOKSDOAKSD").find()); assertFalse(pattern.matcher(" 1 FETCH (UID 1001 BODY[] AOKSDOAKSD").find()); assertFalse(pattern.matcher("* 1 FETCH(UID 1001 BODY[] AOKSDOAKSD").find()); assertFalse(pattern.matcher("* 1 FETCH (UID 1001 BODY [ ] AOKSDOAKSD").find()); assertFalse(pattern.matcher("* 1 FETCH (UID 1001 BODY [] {2345}").find()); assertFalse(pattern.matcher(" * 1 FETCH (UID 1001 BODY [] {2345}").find()); assertFalse(pattern.matcher("T * 1 FETCH (UID 1001 BODY [] {2345}").find()); } @Test public final void testEosRegex() { Pattern pattern = MessageBodyExtractor.EOS_REGEX; assertTrue(pattern.matcher("4 OK SUCCESS").matches()); assertTrue(pattern.matcher("5 OK SUCCESS").matches()); assertTrue(pattern.matcher("22 ok success").matches()); assertFalse(pattern.matcher(") (").matches()); assertFalse(pattern.matcher("(").matches()); } @Test public final void testBoundaryExtractorRegex() { Matcher matcher = MessageBodyExtractor.BOUNDARY_REGEX.matcher( "multipart/alternative;\n" + " boundary=_000_:9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_"); assertTrue(matcher.find()); assertEquals("_000_:9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_", matcher.group(1)); matcher = MessageBodyExtractor.BOUNDARY_REGEX.matcher( "multipart/alternative;\n" + " boundary=\"_000_:9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_\""); assertTrue(matcher.find()); assertEquals("_000_:9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_", matcher.group(1)); matcher = MessageBodyExtractor.BOUNDARY_REGEX.matcher( "multipart/alternative;\n" + " boundary =\"_000_:9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_\""); assertTrue(matcher.find()); assertEquals("_000_:9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_", matcher.group(1)); matcher = MessageBodyExtractor.BOUNDARY_REGEX.matcher( "multipart/alternative;\n" + " boundary = \"_000_:9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_\";"); assertTrue(matcher.find()); assertEquals("_000_:9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_", matcher.group(1)); matcher = MessageBodyExtractor.BOUNDARY_REGEX.matcher( "multipart/alternative;\n" + " boundary = \"_000_:9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_"); assertTrue(matcher.find()); assertEquals("_000_:9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_", matcher.group(1)); matcher = MessageBodyExtractor.BOUNDARY_REGEX.matcher( "multipart/alternative;" + "boundary = \"_000_:9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_"); assertTrue(matcher.find()); assertEquals("_000_:9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_", matcher.group(1)); // Boundary function assertEquals("--_000_:9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_", MessageBodyExtractor.boundary("multipart/alternative;" + "boundary = _000_:9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_ ")); assertEquals("--_000_:9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_", MessageBodyExtractor.boundary("multipart/alternative;" + "BOUNDARY = _000_:9E22DB2E4EF0164D9F76BB4BC3FC689E31BCF27D87CPPXCMS01morg_ ")); // Invalid values... (spam?) assertNull(MessageBodyExtractor.boundary("multipart/alternative;" + "boundary =")); assertNull(MessageBodyExtractor.boundary("multipart/alternative;")); assertNull(MessageBodyExtractor.boundary("multipart/alternative;;;")); } @Test public final void testCharsetExtractorRegex() { // Charset function assertEquals("us-ascii", MessageBodyExtractor.charset("text/html;\n" + " charset=us-ascii")); assertEquals("us-ascii", MessageBodyExtractor.charset("text/html;\n" + " charset=us-ascii ")); assertEquals("us-ascii", MessageBodyExtractor.charset("text/html;\n" + " charset=\"us-ascii\"")); assertEquals("us-ascii", MessageBodyExtractor.charset("text/html;\n" + " charset = \"us-ascii\"")); assertEquals("us-ascii", MessageBodyExtractor.charset("text/html;\n" + " charset=\"us-ascii \"")); assertEquals("us-ascii", MessageBodyExtractor.charset("text/html;\n" + " CHARSET =\"us-ascii \"")); assertEquals("US-ASCII", MessageBodyExtractor.charset("text/html;\n" + " CHARSET =\"US-ASCII \"")); assertEquals("UTF-8", MessageBodyExtractor.charset("text/html;\n" + " charset=")); assertEquals("UTF-8", MessageBodyExtractor.charset("text/html;\n" + " CHARSET=")); assertEquals("UTF-8", MessageBodyExtractor.charset("text/html")); assertEquals("UTF-8", MessageBodyExtractor.charset("text/html;;;")); assertEquals("UTF-8", MessageBodyExtractor.charset("text/html;charset=;;")); assertEquals("UTF-8", MessageBodyExtractor.charset("")); assertEquals("UTF-8", MessageBodyExtractor.charset(null)); } @Test public final void testDecoding() throws MessagingException, IOException { String body = "Grüße"; String encoding = "8bit"; String charset = "ISO-8859-1"; final byte[] bytes = body.getBytes(charset); final InputStream decoded = MimeUtility.decode(new ByteArrayInputStream(bytes), encoding); String result = CharStreams.toString(new InputStreamReader(decoded, charset)); assertEquals(result, body); } }