/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.storm.crawler.util;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import clojure.lang.PersistentVector;
import com.digitalpebble.storm.crawler.Metadata;
/**
* Implements the logic of how the metadata should be passed to the outlinks,
* what should be stored back in the persistence layer etc...
*/
public class MetadataTransfer {

    /**
     * Class to use for transferring metadata to outlinks. Must extend the class
     * MetadataTransfer.
     */
    public static final String metadataTransferClassParamName = "metadata.transfer.class";

    /**
     * Parameter name indicating which metadata to transfer to the outlinks.
     * Value is either a collection of key names (e.g. a vector or a list) or a
     * single valued String.
     */
    public static final String metadataTransferParamName = "metadata.transfer";

    /**
     * Parameter name indicating whether to track the url path or not. Boolean
     * value, true by default.
     */
    public static final String trackPathParamName = "metadata.track.path";

    /**
     * Parameter name indicating whether to track the depth from seed. Boolean
     * value, true by default.
     */
    public static final String trackDepthParamName = "metadata.track.depth";

    /** Metadata key name for tracking the source URLs */
    public static final String urlPathKeyName = "url.path";

    /** Metadata key name for tracking the depth */
    public static final String depthKeyName = "depth";

    /** Names of the metadata keys to carry over, as read from the configuration. */
    private final List<String> mdToKeep = new ArrayList<String>();

    private boolean trackPath = true;

    private boolean trackDepth = true;

    /**
     * Builds and configures a MetadataTransfer. If the configuration names a
     * class under {@link #metadataTransferClassParamName}, that class is
     * loaded and instantiated through its no-argument constructor; otherwise
     * a plain MetadataTransfer is used. In both cases
     * {@link #configure(Map)} is called on the instance before it is
     * returned.
     *
     * @param conf
     *            the configuration to read the parameters from
     * @return a configured instance, never null
     * @throws RuntimeException
     *             if the custom class cannot be loaded, does not extend
     *             MetadataTransfer or cannot be instantiated
     */
    public static MetadataTransfer getInstance(Map<String, Object> conf) {
        String className = ConfUtils.getString(conf,
                metadataTransferClassParamName);

        MetadataTransfer transferInstance;

        // no custom class specified
        if (StringUtils.isBlank(className)) {
            transferInstance = new MetadataTransfer();
        } else {
            Class<?> transferClass;
            try {
                transferClass = Class.forName(className);
            } catch (ClassNotFoundException e) {
                throw new RuntimeException("Can't instanciate " + className, e);
            }

            // checked outside of any try/catch so that this specific message
            // reaches the caller instead of being replaced by a generic one
            if (!MetadataTransfer.class.isAssignableFrom(transferClass)) {
                throw new RuntimeException("Class " + className
                        + " must extend MetadataTransfer");
            }

            try {
                transferInstance = (MetadataTransfer) transferClass
                        .getDeclaredConstructor().newInstance();
            } catch (Exception e) {
                // keep the cause so that the origin of the failure is visible
                throw new RuntimeException("Can't instanciate " + className, e);
            }
        }

        transferInstance.configure(conf);
        return transferInstance;
    }

    /**
     * Reads the tracking flags and the list of metadata keys to transfer from
     * the configuration. The value found under
     * {@link #metadataTransferParamName} may be a collection, in which case
     * the String form of every non-null element is used as a key name, or any
     * other object, whose String form is then used as a single key name.
     *
     * @param conf
     *            the configuration to read the parameters from
     */
    protected void configure(Map<String, Object> conf) {
        trackPath = ConfUtils.getBoolean(conf, trackPathParamName, true);
        trackDepth = ConfUtils.getBoolean(conf, trackDepthParamName, true);

        Object obj = conf.get(metadataTransferParamName);
        if (obj == null) {
            return;
        }

        if (obj instanceof Collection) {
            // covers Clojure vectors as well as plain Java lists or sets;
            // elements are converted explicitly so that only Strings end up
            // in the list
            for (Object key : (Collection<?>) obj) {
                if (key != null) {
                    mdToKeep.add(key.toString());
                }
            }
        }
        // single value?
        else {
            mdToKeep.add(obj.toString());
        }
    }

    /**
     * Determine which metadata should be transferred to an outlink. Adds
     * additional metadata like the URL path and the depth, depending on the
     * configuration.
     *
     * @param targetURL
     *            URL of the outlink; not used by this implementation
     * @param sourceURL
     *            URL of the page the outlink was found on; appended to the
     *            values of {@link #urlPathKeyName} when the path is tracked
     * @param parentMD
     *            metadata of the source page; may be null, in which case
     *            nothing is inherited from it
     * @return a new Metadata instance for the outlink
     **/
    public Metadata getMetaForOutlink(String targetURL, String sourceURL,
            Metadata parentMD) {
        Metadata md = filter(parentMD);

        // keep the path?
        if (trackPath) {
            md.addValue(urlPathKeyName, sourceURL);
        }

        // track depth
        if (trackDepth) {
            String existingDepth = md.getFirstValue(depthKeyName);
            // a missing or unparsable depth is treated as 0
            int depth = 0;
            if (existingDepth != null) {
                try {
                    depth = Integer.parseInt(existingDepth);
                } catch (NumberFormatException e) {
                    depth = 0;
                }
            }
            md.setValue(depthKeyName, Integer.toString(depth + 1));
        }

        return md;
    }

    /**
     * Determine which metadata should be kept e.g. for storing into a database.
     * Copies the values of the configured keys, plus the path and depth keys
     * when they are tracked, into a new Metadata instance without modifying
     * them.
     *
     * @param metadata
     *            the metadata to filter; may be null
     * @return a new Metadata instance holding only the retained keys; empty
     *         if metadata is null
     **/
    public Metadata filter(Metadata metadata) {
        Metadata md = new Metadata();
        if (metadata == null) {
            return md;
        }

        // room for the configured keys plus the two optional tracking keys
        List<String> metadataToKeep = new ArrayList<String>(
                mdToKeep.size() + 2);
        metadataToKeep.addAll(mdToKeep);

        // keep the path but don't add anything to it
        if (trackPath) {
            metadataToKeep.add(urlPathKeyName);
        }

        // keep the depth but don't add anything to it
        if (trackDepth) {
            metadataToKeep.add(depthKeyName);
        }

        // what to keep from the incoming metadata?
        for (String key : metadataToKeep) {
            String[] vals = metadata.getValues(key);
            if (vals != null) {
                md.setValues(key, vals);
            }
        }

        return md;
    }
}