/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.util; import java.util.HashSet; import java.util.Map; import java.util.Set; import org.apache.commons.lang.StringUtils; import com.digitalpebble.stormcrawler.Constants; import com.digitalpebble.stormcrawler.Metadata; /** * Implements the logic of how the metadata should be passed to the outlinks, * what should be stored back in the persistence layer etc... */ public class MetadataTransfer { /** * Class to use for transfering metadata to outlinks. Must extend the class * MetadataTransfer. */ public static final String metadataTransferClassParamName = "metadata.transfer.class"; /** * Parameter name indicating which metadata to transfer to the outlinks and * persist for a given document. Value is either a vector or a single valued * String. */ public static final String metadataTransferParamName = "metadata.transfer"; /** * Parameter name indicating which metadata to persist for a given document * but <b>not</b> transfer to outlinks. Value is either a vector or a single * valued String. */ public static final String metadataPersistParamName = "metadata.persist"; /** * Parameter name indicating whether to track the url path or not. Boolean * value, true by default. */ public static final String trackPathParamName = "metadata.track.path"; /** * Parameter name indicating whether to track the depth from seed. Boolean * value, true by default. */ public static final String trackDepthParamName = "metadata.track.depth"; /** Metadata key name for tracking the source URLs */ public static final String urlPathKeyName = "url.path"; /** Metadata key name for tracking the depth */ public static final String depthKeyName = "depth"; /** Metadata key name for tracking a non-default max depth */ public static final String maxDepthKeyName = "max.depth"; private Set<String> mdToTransfer = new HashSet<>(); private Set<String> mdToPersistOnly = new HashSet<>(); private boolean trackPath = true; private boolean trackDepth = true; public static MetadataTransfer getInstance(Map<String, Object> conf) { String className = ConfUtils.getString(conf, metadataTransferClassParamName); MetadataTransfer transferInstance; // no custom class specified if (StringUtils.isBlank(className)) { transferInstance = new MetadataTransfer(); } else { try { Class<?> transferClass = Class.forName(className); boolean interfaceOK = MetadataTransfer.class .isAssignableFrom(transferClass); if (!interfaceOK) { throw new RuntimeException("Class " + className + " must extend MetadataTransfer"); } transferInstance = (MetadataTransfer) transferClass .newInstance(); } catch (Exception e) { throw new RuntimeException("Can't instanciate " + className); } } // should not be null if (transferInstance != null) transferInstance.configure(conf); return transferInstance; } protected void configure(Map<String, Object> conf) { trackPath = ConfUtils.getBoolean(conf, trackPathParamName, true); trackDepth = ConfUtils.getBoolean(conf, trackDepthParamName, true); // keep the path but don't add anything to it if (trackPath) { mdToTransfer.add(urlPathKeyName); } // keep the depth but don't add anything to it if (trackDepth) { mdToTransfer.add(depthKeyName); mdToTransfer.add(maxDepthKeyName); } mdToTransfer.addAll(ConfUtils.loadListFromConf( metadataTransferParamName, conf)); mdToPersistOnly.addAll(ConfUtils.loadListFromConf( metadataPersistParamName, conf)); // always add the fetch error count mdToPersistOnly.add(Constants.fetchErrorCountParamName); } /** * Determine which metadata should be transfered to an outlink. Adds * additional metadata like the URL path. **/ public Metadata getMetaForOutlink(String targetURL, String sourceURL, Metadata parentMD) { Metadata md = _filter(parentMD, mdToTransfer); // keep the path? if (trackPath) { md.addValue(urlPathKeyName, sourceURL); } // track depth if (trackDepth) { String existingDepth = md.getFirstValue(depthKeyName); int depth; try { depth = Integer.parseInt(existingDepth); } catch (Exception e) { depth = 0; } md.setValue(depthKeyName, Integer.toString(++depth)); } return md; } /** * Determine which metadata should be persisted for a given document * including those which are not necessarily transferred to the outlinks **/ public Metadata filter(Metadata metadata) { Metadata filtered_md = _filter(metadata, mdToTransfer); // add the features that are only persisted but // not transfered like __redirTo_ filtered_md.putAll(_filter(metadata, mdToPersistOnly)); return filtered_md; } private Metadata _filter(Metadata metadata, Set<String> filter) { Metadata filtered_md = new Metadata(); for (String key : filter) { String[] vals = metadata.getValues(key); if (vals != null) filtered_md.setValues(key, vals); } return filtered_md; } }