/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.parse.filter; import java.nio.charset.StandardCharsets; import java.util.Map; import org.apache.commons.codec.digest.DigestUtils; import org.apache.storm.shade.org.apache.commons.lang.StringUtils; import org.w3c.dom.DocumentFragment; import com.digitalpebble.stormcrawler.Metadata; import com.digitalpebble.stormcrawler.parse.ParseData; import com.digitalpebble.stormcrawler.parse.ParseFilter; import com.digitalpebble.stormcrawler.parse.ParseResult; import com.fasterxml.jackson.databind.JsonNode; /** * Computes a signature for a page, based on the binary content or text. If the * content is empty, the URL is used. * * Configuration properties: * <dl> * <dt>useText</dt> * <dd>compute signature on plain text, instead of binary content</dd> * <dt>keyName</dt> * <dd>name of the metadata field to hold the signature (default: * "signature")</dd> * <dt>keyNameCopy</dt> * <dd>name of the metadata field to hold a temporary copy of the signature used * to decide by signature comparison whether the document has changed. If not * defined or empty, the signature is not copied.</dd> * </dl> * */ public class MD5SignatureParseFilter extends ParseFilter { private String key_name = "signature"; private boolean useText = false; private String copyKeyName = null; @Override public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) { ParseData parseData = parse.get(URL); Metadata metadata = parseData.getMetadata(); if (copyKeyName != null) { String signature = metadata.getFirstValue(key_name); if (signature != null) { metadata.setValue(copyKeyName, signature); } } byte[] data = null; if (useText) { String text = parseData.getText(); if (StringUtils.isNotBlank(text)) { data = text.getBytes(StandardCharsets.UTF_8); } } else { data = content; } if (data == null) { data = URL.getBytes(StandardCharsets.UTF_8); } String hex = DigestUtils.md5Hex(data); metadata.setValue(key_name, hex); } @SuppressWarnings("rawtypes") @Override public void configure(Map stormConf, JsonNode filterParams) { JsonNode node = filterParams.get("useText"); if (node != null && node.asBoolean()) { useText = true; } node = filterParams.get("keyName"); if (node != null && node.isTextual()) { key_name = node.asText("signature"); } node = filterParams.get("keyNameCopy"); if (node != null && node.isTextual() && StringUtils.isNotBlank(node.asText(""))) { copyKeyName = node.asText("signatureOld"); } } }