/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.tools; //Junit imports import static org.junit.Assert.*; import org.junit.Test; //Commons imports import org.apache.commons.io.FileUtils; import org.apache.commons.io.filefilter.FileFilterUtils; //JDK imports import java.io.File; import java.nio.file.Files; import java.util.Collection; //Nutch imports import org.apache.nutch.tools.CommonCrawlDataDumper; import org.apache.nutch.tools.CommonCrawlConfig; /** * * Test harness for the {@link CommonCrawlDataDumper}. * */ public class TestCommonCrawlDataDumper { @Test public void testDump() throws Exception { File sampleSegmentDir = new File(System.getProperty("test.build.data", "."), "test-segments"); File tempDir = Files.createTempDirectory("temp").toFile(); String[] crawledFiles = { "c463a4381eb837f9f5d45978cfbde79e_.html", "a974b8d74f7779ab6c6f90b9b279467e_.html", "6bc6497314656a3129732efd708e9f96_.html", "6e88c40abe26cad0a726102997aed048_.html", "5cafdd88f4e9cf3f0cd4c298c6873358_apachecon-europe.html", "932dc10a76e894a2baa8ea4086ad72a8_apachecon-north-america.html", "8540187d75b9cd405b8fa97d665f9f90_.html", "e501bc976c8693b4d28a55b79c390a32_.html", "6add662f9f5758b7d75eec5cfa1f340b_.html", "d4f20df3c37033dc516067ee1f424e4e_.html", "d7b8fa9a02cdc95546030d04be4a98f3_solr.html", "3cbe876e3a8e7a397811de3bb6a945cd_.html", "5b987dde0da79d7f2e3f22b46437f514_bot.html", "3d742820d9a701a1f02e10d5bf5ae633_credits.html", "693673f3c73d04a26276effdea69b7ee_downloads.html", "4f7e3469dafabb4c3b87b00531f81aa4_index.html", "15c5330675be8a69995aab18ff9859e0_javadoc.html", "bc624e1b49e29870ef095819bb0e977a_mailing_lists.html", "a7d66b68754c3665c66e62225255e3fd_version_control.html", "32fb7fe362e1a0d8a1b15addf2a00bdc_1.9-rel", "54ab3db10fe7b26415a04e21045125a8_1zE.html", "1012a41c08092c40340598bd8ee0bfa6_PGa.html", "c830cfc5c28bed10e69d5b83e9c1bcdc_nutch_2.3", "687d915dc264a77f35c61ba841936730_oHY.html", "2bf1afb650010128b4cf4afe677db3c5_1pav9xl.html", "550cab79e14110bbee61c36c61c830b0_1pbE15n.html", "664ff07b46520cc1414494ae49da91f6_.html", "04223714e648a6a43d7c8af8b095f733_.html", "3c8ccb865cd72cca06635d74c7f2f3c4_.html", "90fe47b28716a2230c5122c83f0b8562_Becoming_A_Nutch_Developer.html", "ac0fefe70007d40644e2b8bd5da3c305_FAQ.html", "bc9bc7f11c1262e8924032ab1c7ce112_NutchPropertiesCompleteList.html", "78d04611985e7375b441e478fa36f610_.html", "64adaebadd44e487a8b58894e979dc70_CHANGES.txt", "a48e9c2659b703fdea3ad332877708d8_.html", "159d66d679dd4442d2d8ffe6a83b2912_sponsorship.html", "66f1ce6872c9195c665fc8bdde95f6dc_thanks.html", "ef7ee7e929a048c4a119af78492095b3_.html", "e4251896a982c2b2b68678b5c9c57f4d_.html", "5384764a16fab767ebcbc17d87758a24_.html", "a6ba75a218ef2a09d189cb7dffcecc0f_.html", "f2fa63bd7a3aca63841eed4cd10fb519_SolrCloud.html", "f8de0fbda874e1a140f1b07dcebab374_NUTCH-1047.html", "9c120e94f52d690e9cfd044c34134649_NUTCH-1591.html", "7dd70378379aa452279ce9200d0a5fed_NUTCH-841.html", "ddf78b1fe5c268d59fd62bc745815b92_.html", "401c9f04887dbbf8d29ad52841b8bdb3_ApacheNutch.html", "8f984e2d3c2ba68d1695288f1738deaf_Nutch.html", "c2ef09a95a956207cea073a515172be2_FrontPage.html", "90d9b76e8eabdab1cbcc29bea437c7ae_NutchRESTAPI.html" }; CommonCrawlDataDumper dumper = new CommonCrawlDataDumper( new CommonCrawlConfig()); dumper.dump(tempDir, sampleSegmentDir, null, false, null, false, "", false); Collection<File> tempFiles = FileUtils.listFiles(tempDir, FileFilterUtils.fileFileFilter(), FileFilterUtils.directoryFileFilter()); for (String expectedFileName : crawledFiles) { assertTrue("Missed file " + expectedFileName + " in dump", hasFile(expectedFileName, tempFiles)); } } private boolean hasFile(String fileName, Collection<File> files) { for (File f : files) { if (f.getName().equals(fileName)) { return true; } } return false; } }