/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.util;

import java.io.IOException;

import org.junit.Assert;
import org.junit.Test;

/**
 * Unit tests for GZIPUtils methods.
 *
 * <p>
 * Each public no-argument test feeds the same three fixtures (a short string,
 * a longer repetitive string and an HTML page snapshot) through one of the
 * {@code byte[]}-taking helper methods defined at the bottom of the class.
 * </p>
 */
public class TestGZIPUtils {

  /* a short, highly compressible, string */
  String SHORT_TEST_STRING = "aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbcccccccccccccccc";

  /* a longer, highly compressible, string (twelve copies of the short one) */
  String LONGER_TEST_STRING = SHORT_TEST_STRING + SHORT_TEST_STRING
      + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING
      + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING
      + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING
      + SHORT_TEST_STRING;

  /* a snapshot of the nutch webpage */
  String WEBPAGE = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n"
      + "<html>\n"
      + "<head>\n"
      + " <meta http-equiv=\"content-type\"\n"
      + " content=\"text/html; charset=ISO-8859-1\">\n"
      + " <title>Nutch</title>\n"
      + "</head>\n"
      + "<body>\n"
      + "<h1\n"
      + " style=\"font-family: helvetica,arial,sans-serif; text-align: center; color: rgb(255, 153, 0);\"><a\n"
      + " href=\"http://www.nutch.org/\"><font style=\"color: rgb(255, 153, 0);\">Nutch</font></a><br>\n"
      + "<small>an open source web-search engine</small></h1>\n"
      + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\">\n"
      + "<table\n"
      + " style=\"width: 100%; text-align: left; margin-left: auto; margin-right: auto;\"\n"
      + " border=\"0\" cellspacing=\"0\" cellpadding=\"0\">\n"
      + " <tbody>\n"
      + " <tr>\n"
      + " <td style=\"vertical-align: top; text-align: center;\"><a\n"
      + " href=\"http://sourceforge.net/project/showfiles.php?group_id=59548\">Download</a><br>\n"
      + " </td>\n"
      + " <td style=\"vertical-align: top; text-align: center;\"><a\n"
      + " href=\"tutorial.html\">Tutorial</a><br>\n"
      + " </td>\n"
      + " <td style=\"vertical-align: top; text-align: center;\"><a\n"
      + " href=\"http://cvs.sourceforge.net/cgi-bin/viewcvs.cgi/nutch/nutch/\">CVS</a><br>\n"
      + " </td>\n"
      + " <td style=\"vertical-align: top; text-align: center;\"><a\n"
      + " href=\"api/index.html\">Javadoc</a><br>\n"
      + " </td>\n"
      + " <td style=\"vertical-align: top; text-align: center;\"><a\n"
      + " href=\"http://sourceforge.net/tracker/?atid=491356&group_id=59548&func=browse\">Bugs</a><br>\n"
      + " </td>\n"
      + " <td style=\"vertical-align: top; text-align: center;\"><a\n"
      + " href=\"http://sourceforge.net/mail/?group_id=59548\">Lists</a></td>\n"
      + " <td style=\"vertical-align: top; text-align: center;\"><a\n"
      + " href=\"policies.html\">Policies</a><br>\n"
      + " </td>\n"
      + " </tr>\n"
      + " </tbody>\n"
      + "</table>\n"
      + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\">\n"
      + "<h2>Introduction</h2>\n"
      + "Nutch is a nascent effort to implement an open-source web search\n"
      + "engine. Web search is a basic requirement for internet navigation, yet\n"
      + "the number of web search engines is decreasing. Today's oligopoly could\n"
      + "soon be a monopoly, with a single company controlling nearly all web\n"
      + "search for its commercial gain.  That would not be good for the\n"
      + "users of internet.  Nutch aims to enable anyone to easily and\n"
      + "cost-effectively deploy a world-class web search engine.<br>\n"
      + "<br>\n"
      + "To succeed, the Nutch software must be able to:<br>\n"
      + "<ul>\n"
      + " <li> crawl several billion pages per month</li>\n"
      + " <li>maintain an index of these pages</li>\n"
      + " <li>search that index up to 1000 times per second</li>\n"
      + " <li>provide very high quality search results</li>\n"
      + " <li>operate at minimal cost</li>\n"
      + "</ul>\n"
      + "<h2>Status</h2>\n"
      + "Currently we're just a handful of developers working part-time to put\n"
      + "together a demo.  The demo is coded entirely in Java.  However\n"
      + "persistent data is written in well-documented formats so that modules\n"
      + "may eventually be re-written in other languages (e.g., Perl, C++) as the\n"
      + "project progresses.<br>\n"
      + "<br>\n"
      + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\"> <a\n"
      + " href=\"http://sourceforge.net\"> </a>\n"
      + "<div style=\"text-align: center;\"><a href=\"http://sourceforge.net\"><img\n"
      + " src=\"http://sourceforge.net/sflogo.php?group_id=59548&type=1\"\n"
      + " style=\"border: 0px solid ; width: 88px; height: 31px;\"\n"
      + " alt=\"SourceForge.net Logo\" title=\"\"></a></div>\n"
      + "</body>\n"
      + "</html>\n";

  /** Round-trips every fixture through zip() and unzip(). */
  @Test
  public void testZipUnzip() {
    byte[] testBytes = SHORT_TEST_STRING.getBytes();
    testZipUnzip(testBytes);
    testBytes = LONGER_TEST_STRING.getBytes();
    testZipUnzip(testBytes);
    testBytes = WEBPAGE.getBytes();
    testZipUnzip(testBytes);
  }

  /** Round-trips every fixture through zip() and unzipBestEffort(). */
  @Test
  public void testZipUnzipBestEffort() {
    byte[] testBytes = SHORT_TEST_STRING.getBytes();
    testZipUnzipBestEffort(testBytes);
    testBytes = LONGER_TEST_STRING.getBytes();
    testZipUnzipBestEffort(testBytes);
    testBytes = WEBPAGE.getBytes();
    testZipUnzipBestEffort(testBytes);
  }

  /**
   * Checks unzipBestEffort() against every truncation of each compressed
   * fixture.
   */
  // This method previously lacked @Test, so JUnit 4 silently skipped it.
  @Test
  public void testTruncation() {
    byte[] testBytes = SHORT_TEST_STRING.getBytes();
    testTruncation(testBytes);
    testBytes = LONGER_TEST_STRING.getBytes();
    testTruncation(testBytes);
    testBytes = WEBPAGE.getBytes();
    testTruncation(testBytes);
  }

  /** Checks the size-limited unzipBestEffort() variant on every fixture. */
  @Test
  public void testLimit() {
    byte[] testBytes = SHORT_TEST_STRING.getBytes();
    testLimit(testBytes);
    testBytes = LONGER_TEST_STRING.getBytes();
    testLimit(testBytes);
    testBytes = WEBPAGE.getBytes();
    testLimit(testBytes);
  }

  // helpers

  /**
   * Compresses {@code origBytes}, asserts the result is strictly smaller,
   * then asserts that GZIPUtils.unzip() restores the original bytes.
   *
   * @param origBytes
   *          uncompressed input data
   */
  public void testZipUnzip(byte[] origBytes) {
    byte[] compressedBytes = GZIPUtils.zip(origBytes);

    Assert.assertTrue("compressed array is not smaller!",
        compressedBytes.length < origBytes.length);

    byte[] uncompressedBytes = null;
    try {
      uncompressedBytes = GZIPUtils.unzip(compressedBytes);
    } catch (IOException e) {
      e.printStackTrace();
      Assert.fail("caught exception '" + e + "' during unzip()");
    }

    Assert.assertNotNull("unzip() returned null", uncompressedBytes);
    Assert.assertEquals("uncompressedBytes is wrong size", origBytes.length,
        uncompressedBytes.length);
    Assert.assertArrayEquals("uncompressedBytes does not match origBytes",
        origBytes, uncompressedBytes);
  }

  /**
   * Compresses {@code origBytes}, asserts the result is strictly smaller,
   * then asserts that GZIPUtils.unzipBestEffort() restores the original bytes
   * when given the complete compressed data.
   *
   * @param origBytes
   *          uncompressed input data
   */
  public void testZipUnzipBestEffort(byte[] origBytes) {
    byte[] compressedBytes = GZIPUtils.zip(origBytes);

    Assert.assertTrue("compressed array is not smaller!",
        compressedBytes.length < origBytes.length);

    byte[] uncompressedBytes = GZIPUtils.unzipBestEffort(compressedBytes);

    // Fail with a readable message instead of a NullPointerException.
    Assert.assertNotNull("unzipBestEffort() returned null",
        uncompressedBytes);
    Assert.assertEquals("uncompressedBytes is wrong size", origBytes.length,
        uncompressedBytes.length);
    Assert.assertArrayEquals("uncompressedBytes does not match origBytes",
        origBytes, uncompressedBytes);
  }

  /**
   * Compresses {@code origBytes} and, for every prefix length of the
   * compressed data from the full length down to zero, asserts that whatever
   * GZIPUtils.unzipBestEffort() returns is a prefix of {@code origBytes}. A
   * null result is accepted and only logged.
   *
   * @param origBytes
   *          uncompressed input data
   */
  public void testTruncation(byte[] origBytes) {
    byte[] compressedBytes = GZIPUtils.zip(origBytes);

    System.out.println("original data has len " + origBytes.length);
    System.out.println("compressed data has len " + compressedBytes.length);

    for (int i = compressedBytes.length; i >= 0; i--) {

      // Keep only the first i bytes of the compressed stream.
      byte[] truncCompressed = new byte[i];
      System.arraycopy(compressedBytes, 0, truncCompressed, 0, i);

      byte[] trunc = GZIPUtils.unzipBestEffort(truncCompressed);

      if (trunc == null) {
        System.out.println("truncated to len " + i + ", trunc is null");
      } else {
        System.out.println("truncated to len " + i + ", trunc.length=  "
            + trunc.length);

        // Guard the index loop below: more output than input is a failure,
        // not an ArrayIndexOutOfBoundsException.
        Assert.assertTrue("truncated/uncompressed array is longer ("
            + trunc.length + ") than the original (" + origBytes.length
            + ") (compressed data had been truncated to len " + i + ")",
            trunc.length <= origBytes.length);

        for (int j = 0; j < trunc.length; j++) {
          if (trunc[j] != origBytes[j]) {
            Assert.fail("truncated/uncompressed array differs at pos " + j
                + " (compressed data had been truncated to len " + i + ")");
          }
        }
      }
    }
  }

  /**
   * Compresses {@code origBytes} and, for every limit from zero up to (but
   * not including) the original length, asserts that
   * GZIPUtils.unzipBestEffort(compressed, limit) returns exactly
   * {@code limit} bytes that match the start of {@code origBytes}.
   *
   * @param origBytes
   *          uncompressed input data
   */
  public void testLimit(byte[] origBytes) {
    byte[] compressedBytes = GZIPUtils.zip(origBytes);

    Assert.assertTrue("compressed array is not smaller!",
        compressedBytes.length < origBytes.length);

    for (int i = 0; i < origBytes.length; i++) {

      byte[] uncompressedBytes = GZIPUtils.unzipBestEffort(compressedBytes, i);

      // Fail with a readable message instead of a NullPointerException.
      Assert.assertNotNull("unzipBestEffort() returned null for limit " + i,
          uncompressedBytes);
      Assert.assertEquals("uncompressedBytes is wrong size", i,
          uncompressedBytes.length);

      for (int j = 0; j < i; j++) {
        if (origBytes[j] != uncompressedBytes[j]) {
          Assert.fail("uncompressedBytes does not match origBytes");
        }
      }
    }
  }

}