/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.indexer;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mrunit.ReduceDriver;
import org.apache.hadoop.mrunit.types.Pair;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static org.junit.Assert.*;

/** Test {@link IndexerMapReduce} */
public class TestIndexerMapReduce {

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  public static String testUrl = "http://nutch.apache.org/";
  public static Text testUrlText = new Text(testUrl);
  public static String htmlContentType = "text/html";
  public static String testHtmlDoc = "<!DOCTYPE html>\n"
      + "<html>\n"
      + "<head>\n"
      + "<title>Test Indexing Binary Content</title>\n"
      + "<meta charset=\"utf-8\">\n"
      + "<meta name=\"keywords\" lang=\"en\" content=\"charset, encoding\" />\n"
      + "<meta name=\"keywords\" lang=\"fr\" content=\"codage des caractères\" />\n"
      + "<meta name=\"keywords\" lang=\"cs\" content=\"kódování znaků\" />\n"
      + "</head>\n"
      + "<body>\n"
      + "<p>\n"
      + "<ul>\n"
      + " <li lang=\"en\">English: character set, encoding\n"
      + " <li lang=\"fr\">Français: codage des caractères\n"
      + " <li lang=\"cs\">Čeština: kódování znaků (not covered by Latin-1)\n"
      + "</ul>\n"
      + "</body>\n"
      + "</html>";
  public static Metadata htmlMeta = new Metadata();
  static {
    htmlMeta.add("Content-Type", "text/html");
    // add segment and signature to avoid NPEs
    htmlMeta.add(Nutch.SEGMENT_NAME_KEY, "123");
    htmlMeta.add(Nutch.SIGNATURE_KEY, "123");
  }
  public static ParseText parseText = new ParseText("Test");
  public static ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
      "Test", new Outlink[] {}, htmlMeta);
  public static CrawlDatum crawlDatumDbFetched = new CrawlDatum(
      CrawlDatum.STATUS_DB_FETCHED, 60 * 60 * 24);
  public static CrawlDatum crawlDatumFetchSuccess = new CrawlDatum(
      CrawlDatum.STATUS_FETCH_SUCCESS, 60 * 60 * 24);

  private Reducer<Text, NutchWritable, Text, NutchIndexAction> reducer = new IndexerMapReduce();
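
  /*
   * MRUnit harness: the ReduceDriver below feeds a single key (the page URL)
   * together with a list of NutchWritable values into
   * IndexerMapReduce.reduce(), mimicking the grouped input the reducer would
   * receive when a segment is indexed together with the CrawlDb.
   */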
  private ReduceDriver<Text, NutchWritable, Text, NutchIndexAction> reduceDriver;
  private Configuration configuration;

  /**
   * Test indexing of base64-encoded binary content.
   */
  @Test
  public void testBinaryContentBase64() {
    configuration = NutchConfiguration.create();
    configuration.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, true);

    Charset[] testCharsets = { StandardCharsets.UTF_8,
        Charset.forName("iso-8859-1"), Charset.forName("iso-8859-2") };
    for (Charset charset : testCharsets) {
      LOG.info("Testing indexing binary content as base64 for charset {}",
          charset.name());

      String htmlDoc = testHtmlDoc;
      if (charset != StandardCharsets.UTF_8) {
        // declare the test charset in the document and strip content
        // not covered by the respective character set
        htmlDoc = htmlDoc.replaceAll("utf-8", charset.name());
        if (charset.name().equalsIgnoreCase("iso-8859-1")) {
          // Western-European character set: remove Czech content
          htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"cs\".+?\\n", "");
        } else if (charset.name().equalsIgnoreCase("iso-8859-2")) {
          // Eastern-European character set: remove French content
          htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"fr\".+?\\n", "");
        }
      }

      Content content = new Content(testUrl, testUrl,
          htmlDoc.getBytes(charset), htmlContentType, htmlMeta, configuration);

      NutchDocument doc = runIndexer(crawlDatumDbFetched,
          crawlDatumFetchSuccess, parseText, parseData, content);
      assertNotNull("No NutchDocument indexed", doc);

      String binaryContentBase64 = (String) doc.getField("binaryContent")
          .getValues().get(0);
      LOG.info("binary content (base64): {}", binaryContentBase64);

      String binaryContent = new String(
          Base64.decodeBase64(binaryContentBase64), charset);
      LOG.info("binary content (decoded): {}", binaryContent);
      assertEquals(
          "Binary content (" + charset + ") not correctly saved as base64",
          htmlDoc, binaryContent);
    }
  }

  /**
   * Run {@link IndexerMapReduce#reduce} to get an "indexed"
   * {@link NutchDocument} by passing objects from segment and CrawlDb to the
   * indexer.
   *
   * @param dbDatum
   *          crawl datum from CrawlDb
   * @param fetchDatum
   *          crawl datum (fetch status) from segment
   * @param parseText
   *          plain text from parsed document
   * @param parseData
   *          parse data
   * @param content
   *          protocol content (optional, only required when binary content is
   *          indexed)
   * @return "indexed" document
   */
  public NutchDocument runIndexer(CrawlDatum dbDatum, CrawlDatum fetchDatum,
      ParseText parseText, ParseData parseData, Content content) {
    List<NutchWritable> values = new ArrayList<NutchWritable>();
    values.add(new NutchWritable(dbDatum));
    values.add(new NutchWritable(fetchDatum));
    values.add(new NutchWritable(parseText));
    values.add(new NutchWritable(parseData));
    values.add(new NutchWritable(content));
    reduceDriver = ReduceDriver.newReduceDriver(reducer);
    reduceDriver.setConfiguration(configuration);
    reduceDriver.withInput(testUrlText, values);
    List<Pair<Text, NutchIndexAction>> reduceResult;
    NutchDocument doc = null;
    try {
      reduceResult = reduceDriver.run();
      for (Pair<Text, NutchIndexAction> p : reduceResult) {
        if (p.getSecond().action != NutchIndexAction.DELETE) {
          doc = p.getSecond().doc;
        }
      }
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
    }
    return doc;
  }
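
  /*
   * Sketch of a possible negative test, assuming (an assumption not verified
   * here) that IndexerMapReduce.reduce() skips keys for which segment data
   * (fetch datum, parse text, parse data) is missing:
   *
   *   reduceDriver = ReduceDriver.newReduceDriver(reducer);
   *   reduceDriver.setConfiguration(NutchConfiguration.create());
   *   reduceDriver.withInput(testUrlText,
   *       java.util.Collections.singletonList(
   *           new NutchWritable(crawlDatumDbFetched)));
   *   assertTrue(reduceDriver.run().isEmpty());
   */
}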