/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.indexer.replace; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.indexer.NutchDocument; import org.apache.nutch.indexer.basic.BasicIndexingFilter; import org.apache.nutch.indexer.metadata.MetadataIndexer; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolFactory; import org.apache.nutch.util.NutchConfiguration; import org.junit.Assert; import org.junit.Test; /** * JUnit tests for the <code>index-replace</code> plugin. * * In these tests, the sample file has some meta tags added to the Nutch * document by the <code>index-metadata</code> plugin. The * <code>index-replace</code> plugin is then used to either change (or not * change) the fields depending on the various values of * <code>index.replace.regexp</code> property being provided to Nutch. * * * @author Peter Ciuffetti * */ public class TestIndexReplace { private static final String INDEX_REPLACE_PROPERTY = "index.replace.regexp"; private String fileSeparator = System.getProperty("file.separator"); private String sampleDir = System.getProperty("test.data", "."); private String sampleFile = "testIndexReplace.html"; /** * Run a test file through the Nutch parser and index filters. * * @param fileName * @param conf * @return the Nutch document with the replace indexer applied */ public NutchDocument parseAndFilterFile(String fileName, Configuration conf) { NutchDocument doc = new NutchDocument(); BasicIndexingFilter basicIndexer = new BasicIndexingFilter(); basicIndexer.setConf(conf); Assert.assertNotNull(basicIndexer); MetadataIndexer metaIndexer = new MetadataIndexer(); metaIndexer.setConf(conf); Assert.assertNotNull(basicIndexer); ReplaceIndexer replaceIndexer = new ReplaceIndexer(); replaceIndexer.setConf(conf); Assert.assertNotNull(replaceIndexer); try { String urlString = "file:" + sampleDir + fileSeparator + fileName; Text text = new Text(urlString); CrawlDatum crawlDatum = new CrawlDatum(); Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); Content content = protocol.getProtocolOutput(text, crawlDatum) .getContent(); Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); crawlDatum.setFetchTime(100L); Inlinks inlinks = new Inlinks(); doc = basicIndexer.filter(doc, parse, text, crawlDatum, inlinks); doc = metaIndexer.filter(doc, parse, text, crawlDatum, inlinks); doc = replaceIndexer.filter(doc, parse, text, crawlDatum, inlinks); } catch (Exception e) { e.printStackTrace(); Assert.fail(e.toString()); } return doc; } /** * Test property parsing. * * The filter does not expose details of the parse. So all we are checking is * that the parse does not throw a runtime exception and that the value * provided is the value returned. */ @Test public void testPropertyParse() { Configuration conf = NutchConfiguration.create(); String indexReplaceProperty = " metatag.description=/this(.*)plugin/this awesome plugin/2\n" + " metatag.keywords=/\\,/\\!/\n" + " hostmatch=.*.com\n" + " metatag.keywords=/\\,/\\?/\n" + " metatag.author:dc_author=/\\s+/ David /\n" + " urlmatch=.*.html\n" + " metatag.keywords=/\\,/\\./\n" + " metatag.author=/\\s+/ D. /\n"; conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); ReplaceIndexer rp = new ReplaceIndexer(); try { rp.setConf(conf); } catch (RuntimeException ohno) { Assert.fail("Unable to parse a valid index.replace.regexp property! " + ohno.getMessage()); } Configuration parsedConf = rp.getConf(); // Does the getter equal the setter? Too easy! Assert.assertEquals(indexReplaceProperty, parsedConf.get(INDEX_REPLACE_PROPERTY)); } /** * Test metatag value replacement using global replacement settings. * * The index.replace.regexp property does not use hostmatch or urlmatch, so * all patterns are global. */ @Test public void testGlobalReplacement() { String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!"; String expectedAuthor = "Peter D. Ciuffetti"; String indexReplaceProperty = " metatag.description=/this(.*)plugin/this awesome plugin/\n" + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. /\n"; Configuration conf = NutchConfiguration.create(); conf.set( "plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); conf.set("metatags.names", "author,description,keywords"); conf.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords"); // Not necessary but helpful when debugging the filter. conf.set("http.timeout", "99999999999"); // Run the document through the parser and index filters. NutchDocument doc = parseAndFilterFile(sampleFile, conf); Assert.assertEquals(expectedDescription, doc.getFieldValue("metatag.description")); Assert .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); } /** * Test that invalid property settings are handled and ignored. * * This test provides an invalid property setting that will fail property * parsing and Pattern.compile. The expected outcome is that the patterns will * not cause failure and the targeted fields will not be modified by the * filter. */ @Test public void testInvalidPatterns() { String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!"; String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!"; String expectedAuthor = "Peter Ciuffetti"; // Contains: invalid pattern, invalid flags, incomplete property String indexReplaceProperty = " metatag.description=/this\\s+**plugin/this awesome plugin/\n" + " metatag.keywords=/\\,/\\!/what\n" + " metatag.author=#notcomplete"; Configuration conf = NutchConfiguration.create(); conf.set( "plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); conf.set("metatags.names", "author,description,keywords"); conf.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords"); // Not necessary but helpful when debugging the filter. conf.set("http.timeout", "99999999999"); // Run the document through the parser and index filters. NutchDocument doc = parseAndFilterFile(sampleFile, conf); // Assert that our metatags have not changed. Assert.assertEquals(expectedDescription, doc.getFieldValue("metatag.description")); Assert .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); } /** * Test URL pattern matching */ @Test public void testUrlMatchesPattern() { String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!"; String expectedAuthor = "Peter D. Ciuffetti"; String indexReplaceProperty = " urlmatch=.*.html\n" + " metatag.description=/this(.*)plugin/this awesome plugin/\n" + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. /\n"; Configuration conf = NutchConfiguration.create(); conf.set( "plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); conf.set("metatags.names", "author,description,keywords"); conf.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords"); // Not necessary but helpful when debugging the filter. conf.set("http.timeout", "99999999999"); // Run the document through the parser and index filters. NutchDocument doc = parseAndFilterFile(sampleFile, conf); // Assert that our metatags have changed. Assert.assertEquals(expectedDescription, doc.getFieldValue("metatag.description")); Assert .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); } /** * Test URL pattern not matching. * * Expected result is that the filter does not change the fields. */ @Test public void testUrlNotMatchesPattern() { String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!"; String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!"; String expectedAuthor = "Peter Ciuffetti"; String indexReplaceProperty = " urlmatch=.*.xml\n" + " metatag.description=/this(.*)plugin/this awesome plugin/\n" + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. /\n"; Configuration conf = NutchConfiguration.create(); conf.set( "plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); conf.set("metatags.names", "author,description,keywords"); conf.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords"); // Not necessary but helpful when debugging the filter. conf.set("http.timeout", "99999999999"); // Run the document through the parser and index filters. NutchDocument doc = parseAndFilterFile(sampleFile, conf); // Assert that our metatags have not changed. Assert.assertEquals(expectedDescription, doc.getFieldValue("metatag.description")); Assert .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); } /** * Test a global pattern match for description and URL pattern match for * keywords and author. * * All three should be triggered. It also tests replacement groups. */ @Test public void testGlobalAndUrlMatchesPattern() { String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!"; String expectedAuthor = "Peter D. Ciuffetti"; String indexReplaceProperty = " metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n" + " urlmatch=.*.html\n" + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. /\n"; Configuration conf = NutchConfiguration.create(); conf.set( "plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); conf.set("metatags.names", "author,description,keywords"); conf.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords"); // Not necessary but helpful when debugging the filter. conf.set("http.timeout", "99999999999"); // Run the document through the parser and index filters. NutchDocument doc = parseAndFilterFile(sampleFile, conf); // Assert that our metatags have changed. Assert.assertEquals(expectedDescription, doc.getFieldValue("metatag.description")); Assert .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); } /** * Test a global pattern match for description and URL pattern match for * keywords and author. * * Only the global match should be triggered. */ @Test public void testGlobalAndUrlNotMatchesPattern() { String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!"; String expectedAuthor = "Peter Ciuffetti"; String indexReplaceProperty = " metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n" + " urlmatch=.*.xml\n" + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. /\n"; Configuration conf = NutchConfiguration.create(); conf.set( "plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); conf.set("metatags.names", "author,description,keywords"); conf.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords"); // Not necessary but helpful when debugging the filter. conf.set("http.timeout", "99999999999"); // Run the document through the parser and index filters. NutchDocument doc = parseAndFilterFile(sampleFile, conf); // Assert that description has changed and the others have not changed. Assert.assertEquals(expectedDescription, doc.getFieldValue("metatag.description")); Assert .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); } /** * Test order-specific replacement settings. * * This makes multiple replacements on the same field and will produce the * expected value only if the replacements are run in the order specified. */ @Test public void testReplacementsRunInSpecifedOrder() { String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; String indexReplaceProperty = " metatag.description=/this plugin/this amazing plugin/\n" + " metatag.description=/this amazing plugin/this valuable plugin/\n" + " metatag.description=/this valuable plugin/this cool plugin/\n" + " metatag.description=/this cool plugin/this wicked plugin/\n" + " metatag.description=/this wicked plugin/this awesome plugin/\n"; Configuration conf = NutchConfiguration.create(); conf.set( "plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); conf.set("metatags.names", "author,description,keywords"); conf.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords"); // Not necessary but helpful when debugging the filter. conf.set("http.timeout", "99999999999"); // Run the document through the parser and index filters. NutchDocument doc = parseAndFilterFile(sampleFile, conf); // Check that the value produced by the last replacement has worked. Assert.assertEquals(expectedDescription, doc.getFieldValue("metatag.description")); } /** * Test a replacement pattern that uses the flags feature. * * A 2 is Pattern.CASE_INSENSITIVE. We look for upper case and expect to match * any case. */ @Test public void testReplacementsWithFlags() { String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; String indexReplaceProperty = " metatag.description=/THIS PLUGIN/this awesome plugin/2"; Configuration conf = NutchConfiguration.create(); conf.set( "plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); conf.set("metatags.names", "author,description,keywords"); conf.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords"); // Not necessary but helpful when debugging the filter. conf.set("http.timeout", "99999999999"); // Run the document through the parser and index filters. NutchDocument doc = parseAndFilterFile(sampleFile, conf); // Check that the value produced by the case-insensitive replacement has // worked. Assert.assertEquals(expectedDescription, doc.getFieldValue("metatag.description")); } /** * Test a replacement pattern that uses the target field feature. * Check that the input is not modifid and that the taret field is added. */ @Test public void testReplacementsDifferentTarget() { String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!"; String expectedTargetDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; String indexReplaceProperty = " metatag.description:new=/this plugin/this awesome plugin/"; Configuration conf = NutchConfiguration.create(); conf.set( "plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); conf.set("metatags.names", "author,description,keywords"); conf.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords"); // Not necessary but helpful when debugging the filter. conf.set("http.timeout", "99999999999"); // Run the document through the parser and index filters. NutchDocument doc = parseAndFilterFile(sampleFile, conf); // Check that the input field has not been modified Assert.assertEquals(expectedDescription, doc.getFieldValue("metatag.description")); // Check that the output field has created Assert.assertEquals(expectedTargetDescription, doc.getFieldValue("new")); } }