/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.addthis.hydra.data.filter.value;
import java.util.Locale;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import com.addthis.bundle.value.ValueFactory;
import com.addthis.bundle.value.ValueMap;
import com.addthis.bundle.value.ValueObject;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.fasterxml.jackson.annotation.JsonProperty;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Parser;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Matches values to extracted tags from raw html.
 *
 * <p>The input value is parsed as HTML and the elements selected by {@code tagName} are
 * inspected. For each attribute named in {@code tagAttrs}, the lower-cased attribute value
 * is tested for containment of every entry of {@code values}. The result maps each entry
 * that matched at least once to the number of (element, attribute) pairs that contained it.
 */
public class ValueFilterGrepTags extends AbstractValueFilter {

    private static final Logger log = LoggerFactory.getLogger(ValueFilterGrepTags.class);

    /**
     * Set of values to match against. Attribute values are lower-cased before the substring
     * comparison while these entries are used as-is, so entries containing upper-case
     * letters will generally not match.
     */
    @JsonProperty(required = true) private String[] values;

    /** Tag name to search for; passed to {@code Document.select}. */
    @JsonProperty(required = true) private String tagName;

    /** Tag attributes whose values are searched. */
    @JsonProperty(required = true) private String[] tagAttrs;

    /** Log error once for every N instances. A value of 0 logs every failure. */
    @JsonProperty private int logEveryN = 100;

    /** Running count of failures caught while extracting tags. */
    private int parserErrors = 0;

    /**
     * Counts how often each configured match value occurs inside the configured attributes
     * of the configured tags of the given HTML.
     *
     * @param value value whose string form is parsed as HTML; may be {@code null}
     * @return a map from match value to occurrence count, or {@code null} when the input
     *         (or its string form) is {@code null} or nothing matched. If extraction fails
     *         part-way, matches counted before the failure are still returned.
     */
    @Override
    @Nullable
    public ValueObject filterValue(ValueObject value) {
        if (value == null) {
            return null;
        }
        String html = value.asString().asNative();
        if (html == null) {
            return null;
        }
        @Nonnull Multiset<String> valueCounts = HashMultiset.create();
        try {
            Parser parser = Parser.htmlParser().setTrackErrors(0);
            @Nonnull Document doc = parser.parseInput(html, "");
            @Nonnull Elements tags = doc.select(tagName);
            for (Element tag : tags) {
                for (String tagAttr : tagAttrs) {
                    // Locale.ROOT keeps the lower-casing independent of the JVM default
                    // locale (e.g. Turkish would turn 'I' into a dotless 'ı').
                    @Nonnull String attrValue = tag.attr(tagAttr).toLowerCase(Locale.ROOT);
                    for (String matchValue : values) {
                        if (attrValue.contains(matchValue)) {
                            valueCounts.add(matchValue);
                        }
                    }
                }
            }
        } catch (Exception e) {
            // A zero interval would make the modulus throw from inside this handler and
            // escape the filter, so treat it as "log every failure".
            boolean shouldLog = (logEveryN == 0) || (parserErrors % logEveryN == 0);
            parserErrors++;
            if (shouldLog) {
                // The trailing throwable makes SLF4J emit the stack trace; the message
                // alone is frequently null for runtime exceptions.
                log.error("Failed to extract tags due to : {} Total Parser Errors : {}",
                          e.getMessage(), parserErrors, e);
            }
        }
        return valueCounts.isEmpty() ? null : multisetToValueMap(valueCounts);
    }

    /**
     * Converts a multiset of matches into a value map keyed by element with its count as
     * the mapped value.
     *
     * @param matches matched values with their multiplicities
     * @return a newly created map holding one entry per distinct matched value
     */
    private static ValueMap multisetToValueMap(Multiset<String> matches) {
        ValueMap valueMap = ValueFactory.createMap();
        for (Multiset.Entry<String> match : matches.entrySet()) {
            valueMap.put(match.getElement(), ValueFactory.create(match.getCount()));
        }
        return valueMap;
    }
}