/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.addthis.hydra.data.filter.bundle; import java.net.InetAddress; import java.net.MalformedURLException; import java.net.URL; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.addthis.basis.collect.HotMap; import com.addthis.basis.net.NetUtil; import com.addthis.basis.util.LessBytes; import com.addthis.bundle.core.Bundle; import com.addthis.bundle.util.AutoField; import com.addthis.bundle.util.ValueUtil; import com.addthis.bundle.value.ValueFactory; import com.addthis.codec.annotations.FieldConfig; import com.google.common.base.Joiner; import com.google.common.net.InternetDomainName; /** * This {@link BundleFilter BundleFilter} <span class="hydra-summary"> dissects an url and updates * the bundle with the component pieces</span>. * <p/> * <p> * URLs can be hard to parse due to the variety of formats and levels of URL encoding they come in. * This filter is useful to help 'clean' the URL and to pull common components such as the domain, * host, path, and parameters out of the URL string and into individual bundle fields. * </p> * <p/> * <p>Example:</p> * <pre> * {url {field:"PAGE_URL", setHost:"PAGE_DOMAIN", clean:true}} * </pre> * * @user-reference */ public final class BundleFilterURL implements BundleFilter { private static final HotMap<String, String> iphost = new HotMap<>(new ConcurrentHashMap<String, String>()); private static final int maxhostcache = Integer.parseInt(System.getProperty("packet.cachehost.max", "4000")); private static final boolean debugMalformed = System.getProperty("path.debug.malformed", "0").equals("1"); // stolen and modified from NetUtil.resolveDottedIP private static String resolveDottedIP(String ip) { int ipl = ip.length(); if (ipl == 0 || !(Character.isDigit(ip.charAt(ipl - 1)) && Character.isDigit(ip.charAt(0)))) { return ip; } try { String newhost = InetAddress.getByName(ip).getHostName(); if (newhost != null) { return newhost; } } catch (Exception ex) { ex.printStackTrace(); } return ip; } public BundleFilterURL() { } /** * Field containing the URL used as input to this filter. */ @FieldConfig(codable = true, required = true) private AutoField field; /** * If <code>true</code> the URL will be properly url-decoded and a * trailing '/' will be added if not already present. * The corrected url will be saved back into the input specified by {@link #field}. * If {@link #clean} and {@link #fixProto} are both true then * result of both transformations are saved back into the input field. */ @FieldConfig(codable = true) private boolean clean; /** * If <code>true</code> then 'http://' will be prepended * to the URL if not already present. This transformation * will be stored back into the input field if-and-only-if * the {@link #clean} parameter is <code>true</code>. * Several other parameters such as {@link #setHost} * and {@link #setHostNormal} only work on URLs that are * prefixed with a protocol. For this reason it can * be useful to use this parameter even when {@link #clean} * is <code>false</code>. */ @FieldConfig(codable = true) private boolean fixProto; /** * If true the IP of the host identified by the URL will be resolved and set to the * returnhost value. */ @FieldConfig(codable = true) private boolean resolveIP; /** * If true the host will be resolved to its base domain. Only affects the field specified * by the {@link #setHost setHost} parameter. Cannot be used in conjunction with {@link #toTopPrivateDomain}. */ @FieldConfig(codable = true) private boolean toBaseDomain; /** * If true the host will be resolved to its * <a href="https://code.google.com/p/guava-libraries/wiki/InternetDomainNameExplained"> * top private domain</a>. * Only affects the field specified by the {@link #setHost setHost} parameter. * Cannot be used in conjunction with {@link #toBaseDomain}. */ @FieldConfig(codable = true) private boolean toTopPrivateDomain; /** * If true then the URL is a file based URL, e.g. file:///. */ @FieldConfig(codable = true) private boolean asFile; /** * Name of the field to populate with the * host defined by this URL. If the input URL * does not have a protocol prefix then * the target field is not populated. * To ensure that the input is prefixed with a protocol * set the {@link #fixProto} parameter to <code>true</code>. */ @FieldConfig(codable = true) private AutoField setHost; /** * Name of the field to populate with the * normalized host defined by this URL. * If the input URL does not have a protocol prefix then * the target field is not populated. * To ensure that the input is prefixed with a protocol * set the {@link #fixProto} parameter to <code>true</code>. */ @FieldConfig(codable = true) private AutoField setHostNormal; /** * Name of the field to populate with the top * private domain as defined by Google * Guava's * <a href="http://docs.guava-libraries.googlecode * .com/git-history/release/javadoc/com/google/common/net/InternetDomainName * .html">InternetDomainName</a> . * This field should be used in combination with * {@link #fixProto fixProto} set to true. */ @FieldConfig(codable = true) private AutoField setTopPrivateDomain; /** * Name of the field to populate with the path * defined by this URL. If null the path will not be set. */ @FieldConfig(codable = true) private AutoField setPath; /** * Name of the field to populate with the parameters * defined by this URL. If null the parameters will not be set. */ @FieldConfig(codable = true) private AutoField setParams; /** * Name of the field to populate with the anchor * defined by this URL. If null the anchor will not be set. */ @FieldConfig(codable = true) private AutoField setAnchor; /** * Value to return when input is invalid. Default is false. */ @FieldConfig(codable = true) private boolean invalidExit; private static final Pattern hostNormalPattern = Pattern.compile("^www*\\d*\\.(.*)"); public BundleFilterURL setHost(AutoField value) { this.setHost = value; return this; } public BundleFilterURL setHostNormal(AutoField value) { this.setHostNormal = value; return this; } public BundleFilterURL setField(AutoField value) { this.field = value; return this; } public BundleFilterURL setBaseDomain(boolean value) { this.toBaseDomain = value; return this; } public BundleFilterURL setTopPrivateDomain(boolean value) { this.toTopPrivateDomain = value; return this; } public BundleFilterURL setFixProto(boolean value) { this.fixProto = value; return this; } private static final Joiner DOT_JOINER = Joiner.on('.'); @Override public boolean filter(Bundle bundle) { String pv = ValueUtil.asNativeString(field.getValue(bundle)); if (!asFile) { if (pv == null) { return invalidExit; } String lpv = pv.trim().toLowerCase(); if (!(lpv.startsWith("http"))) { if (fixProto) { if (clean && lpv.indexOf("%2f") >= 0) { pv = LessBytes.urldecode(pv); } pv = "http://".concat(pv); } else { return invalidExit; } } if (clean && (lpv.startsWith("http%") || lpv.startsWith("https%"))) { pv = LessBytes.urldecode(pv); } } // up to two 'decoding' passes on the url to try and find a valid one for (int i = 0; i < 2; i++) { if (pv == null) { return invalidExit; } try { URL urec = asFile ? new URL("file://".concat(pv)) : new URL(pv); String urlhost = urec.getHost(); String returnhost = null; if (resolveIP) { synchronized (iphost) { returnhost = iphost.get(urlhost).toLowerCase(); if (returnhost == null) { returnhost = resolveDottedIP(urlhost); iphost.put(urlhost, returnhost); if (iphost.size() > maxhostcache) { iphost.removeEldest(); } } } } else { returnhost = urlhost.toLowerCase(); } // store cleaned up (url decoded) version back to packet if (clean) { if (urec != null && urec.getPath().isEmpty()) { // if the path element is null, append the slash pv = pv.concat("/"); } field.setValue(bundle, ValueFactory.create(pv)); } if (setHost != null) { if (toBaseDomain) { returnhost = NetUtil.getBaseDomain(returnhost); } else if (toTopPrivateDomain) { if (returnhost != null && InternetDomainName.isValid(returnhost)) { InternetDomainName domain = InternetDomainName.from(returnhost); if (domain.hasPublicSuffix() && domain.isUnderPublicSuffix()) { InternetDomainName topPrivateDomain = domain.topPrivateDomain(); returnhost = topPrivateDomain.toString(); } } } setHost.setValue(bundle, ValueFactory.create(returnhost)); } if (setPath != null) { setPath.setValue(bundle, ValueFactory.create(urec.getPath())); } if (setParams != null) { setParams.setValue(bundle, ValueFactory.create(urec.getQuery())); } if (setAnchor != null) { setAnchor.setValue(bundle, ValueFactory.create(urec.getRef())); } if (setHostNormal != null) { Matcher m = hostNormalPattern.matcher(returnhost); if (m.find()) { returnhost = m.group(1); } setHostNormal.setValue(bundle, ValueFactory.create(returnhost)); } if (setTopPrivateDomain != null) { String topDomain = returnhost; if (InternetDomainName.isValid(returnhost)) { InternetDomainName domainName = InternetDomainName.from(returnhost); if (domainName.isTopPrivateDomain() || domainName.isUnderPublicSuffix()) { topDomain = DOT_JOINER.join(domainName.topPrivateDomain().parts()); } } setTopPrivateDomain.setValue(bundle, ValueFactory.create(topDomain)); } } catch (MalformedURLException e) { if (pv.indexOf("%3") > 0 && pv.indexOf("%2") > 0) { pv = LessBytes.urldecode(pv); } else { if (debugMalformed) { System.err.println("malformed(" + i + ") " + pv); } return invalidExit; } } } return true; } }