/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.indexing; import java.net.MalformedURLException; import java.net.URL; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang.StringUtils; import org.apache.storm.task.OutputCollector; import org.apache.storm.task.TopologyContext; import org.apache.storm.topology.OutputFieldsDeclarer; import org.apache.storm.topology.base.BaseRichBolt; import org.apache.storm.tuple.Fields; import org.apache.storm.tuple.Tuple; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.digitalpebble.stormcrawler.Metadata; import com.digitalpebble.stormcrawler.util.ConfUtils; import com.digitalpebble.stormcrawler.util.RobotsTags; import com.digitalpebble.stormcrawler.util.URLUtil; /** Abstract class to simplify writing IndexerBolts **/ @SuppressWarnings("serial") public abstract class AbstractIndexerBolt extends BaseRichBolt { private final Logger LOG = LoggerFactory.getLogger(getClass()); /** * Mapping between metadata keys and field names for indexing Can be a list * of values separated by a = or a single string **/ public static final String metadata2fieldParamName = "indexer.md.mapping"; /** * list of metadata key + values to be used as a filter. A document will be * indexed only if it has such a md. Can be null in which case we don't * filter at all. **/ public static final String metadataFilterParamName = "indexer.md.filter"; /** Field name to use for storing the text of a document **/ public static final String textFieldParamName = "indexer.text.fieldname"; /** Field name to use for storing the url of a document **/ public static final String urlFieldParamName = "indexer.url.fieldname"; /** Field name to use for reading the canonical property of the metadata */ public static final String canonicalMetadataParamName = "indexer.canonical.name"; private String[] filterKeyValue = null; private Map<String, String> metadata2field = new HashMap<>(); private String fieldNameForText = null; private String fieldNameForURL = null; private String canonicalMetadataName = null; @SuppressWarnings({ "rawtypes", "unchecked" }) @Override public void prepare(Map conf, TopologyContext context, OutputCollector collector) { String mdF = ConfUtils.getString(conf, metadataFilterParamName); if (StringUtils.isNotBlank(mdF)) { // split it in key value int equals = mdF.indexOf('='); if (equals != -1) { String key = mdF.substring(0, equals); String value = mdF.substring(equals + 1); filterKeyValue = new String[] { key.trim(), value.trim() }; } else { LOG.error("Can't split into key value : {}", mdF); } } fieldNameForText = ConfUtils.getString(conf, textFieldParamName); fieldNameForURL = ConfUtils.getString(conf, urlFieldParamName); canonicalMetadataName = ConfUtils.getString(conf, canonicalMetadataParamName); for (String mapping : ConfUtils.loadListFromConf( metadata2fieldParamName, conf)) { int equals = mapping.indexOf('='); if (equals != -1) { String key = mapping.substring(0, equals); String value = mapping.substring(equals + 1); metadata2field.put(key.trim(), value.trim()); } else { LOG.error("Can't split into key value : {}", mapping); } } } /** * Determine whether a document should be indexed based on the presence of a * given key/value or the RobotsTags.ROBOTS_NO_INDEX directive. * * @return true if the document should be kept. **/ protected boolean filterDocument(Metadata meta) { String noindexVal = meta.getFirstValue(RobotsTags.ROBOTS_NO_INDEX); if ("true".equalsIgnoreCase(noindexVal)) return false; if (filterKeyValue == null) return true; String[] values = meta.getValues(filterKeyValue[0]); // key not found if (values == null) return false; return ArrayUtils.contains(values, filterKeyValue[1]); } /** Returns a mapping field name / values for the metadata to index **/ protected Map<String, String[]> filterMetadata(Metadata meta) { Pattern indexValuePattern = Pattern.compile("\\[(\\d+)\\]"); Map<String, String[]> fieldVals = new HashMap<>(); Iterator<Entry<String, String>> iter = metadata2field.entrySet() .iterator(); while (iter.hasNext()) { Entry<String, String> entry = iter.next(); // check whether we want a specific value or all of them? int index = -1; String key = entry.getKey(); Matcher match = indexValuePattern.matcher(key); if (match.find()) { index = Integer.parseInt(match.group(1)); key = key.substring(0, match.start()); } String[] values = meta.getValues(key); // not found if (values == null || values.length == 0) continue; // want a value index that it outside the range given if (index >= values.length) continue; // store all values available if (index == -1) fieldVals.put(entry.getValue(), values); // or only the one we want else fieldVals.put(entry.getValue(), new String[] { values[index] }); } return fieldVals; } /** * Returns the value to be used as the URL for indexing purposes, if present * the canonical value is used instead */ protected String valueForURL(Tuple tuple) { String url = tuple.getStringByField("url"); Metadata metadata = (Metadata) tuple.getValueByField("metadata"); // functionality deactivated if (StringUtils.isBlank(canonicalMetadataParamName)) { return url; } String canonicalValue = metadata.getFirstValue(canonicalMetadataName); // no value found? if (StringUtils.isBlank(canonicalValue)) { return url; } try { URL sURL = new URL(url); URL canonical = URLUtil.resolveURL(sURL, canonicalValue); // check is the same host if (sURL.getHost().equals(canonical.getHost())) { return canonical.toExternalForm(); } else { LOG.info( "Canonical URL references a different host, ignoring in {} ", url); } } catch (MalformedURLException e) { LOG.error("Malformed canonical URL {} was found in {} ", canonicalValue, url); } return url; } /** * Returns the field name to use for the text or null if the text must not * be indexed **/ protected String fieldNameForText() { return fieldNameForText; } /** * Returns the field name to use for the URL or null if the URL must not be * indexed **/ protected String fieldNameForURL() { return fieldNameForURL; } @Override public void declareOutputFields(OutputFieldsDeclarer declarer) { declarer.declareStream( com.digitalpebble.stormcrawler.Constants.StatusStreamName, new Fields("url", "metadata", "status")); } }