/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.streams.urls;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.regex.Pattern;
/**
 * Static utility helper class to verify that strings are URLs, to
 * space out repeated calls to the same domain, and to find URLs
 * within a string.
 * It cannot be instantiated and can only be used through its
 * static accessor functions.
 */
public final class LinkResolverHelperFunctions {

    private static final String REGEX_URL =
            "(?:(?:https?|ftp)://)" + // protocol identifier
            "(?:\\S+(?::\\S*)?@)?" + // optional user:pass authentication
            "(?:" +
            "(?!(?:10|127)(?:\\.\\d{1,3}){3})" + // IP exclusion: 10.x.x.x and 127.x.x.x
            "(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" + // IP exclusion: 169.254.x.x and 192.168.x.x
            "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" + // IP exclusion: 172.16.x.x - 172.31.x.x
            "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" + // first octet: 1-223
            "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" + // second and third octets: 0-255
            "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" + // last octet: 1-254
            "|" +
            "(?:(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)" + // host name
            "(?:\\.(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)*" + // domain name
            "(?:\\.(?:[a-z\\u00a1-\\uffff]{2,}))" + // TLD identifier
            ")" +
            "(?::\\d{2,5})?" + // optional port number (2 to 5 digits)
            "(?:/[^\\s]*)?"; // optional resource path

    private static final String REGEX_URL_EXPLICIT = "^" + REGEX_URL + "$";

    // Compiled once and reused; compiling a Pattern on every call is wasted work.
    private static final Pattern URL_PATTERN = Pattern.compile(REGEX_URL);
    private static final Pattern URL_EXPLICIT_PATTERN = Pattern.compile(REGEX_URL_EXPLICIT);

    /** The amount of time, in milliseconds, we want to space between calls to the same domain. */
    public static final long RECENT_DOMAINS_BACKOFF = 1000;

    /** One tenth of {@link #RECENT_DOMAINS_BACKOFF}. Not referenced within this class. */
    public static final long DEFAULT_STAGGER = RECENT_DOMAINS_BACKOFF / 10;

    // Recently requested domains (lower-cased) mapped to the time their current back-off ends.
    private static final ConcurrentMap<String, Date> RECENT_DOMAINS = new ConcurrentHashMap<>();

    // Created on the first call to waitTimeForDomain; only touched while holding the class lock.
    private static Timer timer;

    private LinkResolverHelperFunctions() {
        // force it not to be instantiated.
    }

    /**
     * Check to see if this string is a URL or not.
     * The whole string has to match the URL pattern; the pattern is
     * case-sensitive and only accepts lower-case schemes and ASCII letters.
     *
     * @param possibleURL
     *      The possible URL that we would like to test, may be {@code null}
     * @return
     *      Whether or not it is a URL; {@code false} for {@code null}
     */
    public static boolean isURL(String possibleURL) {
        return possibleURL != null && URL_EXPLICIT_PATTERN.matcher(possibleURL).matches();
    }

    /**
     * Check to see if a URL occurs anywhere inside this string.
     *
     * @param possiblyHasURLs
     *      The text to search, may be {@code null}
     * @return
     *      Whether or not at least one URL was found; {@code false} for {@code null}
     */
    public static boolean containsURLs(String possiblyHasURLs) {
        return possiblyHasURLs != null && URL_PATTERN.matcher(possiblyHasURLs).find();
    }

    /**
     * Forget every recorded domain, so the next call to
     * {@link #waitTimeForDomain(String)} for any domain returns 0.
     */
    public static void purgeAllDomainWaitTimes() {
        RECENT_DOMAINS.clear();
    }

    /**
     * Report how long the caller should wait before contacting the given
     * domain, and record the request so later callers are spaced out.
     * Domains are compared case-insensitively.
     *
     * @param domain
     *      The domain about to be contacted
     * @return
     *      0 when the domain has no back-off pending, otherwise the number
     *      of milliseconds to wait (remaining back-off, plus one more
     *      back-off period, plus a small random stagger)
     * @throws NullPointerException
     *      if {@code domain} is {@code null}
     */
    public static long waitTimeForDomain(String domain) {
        if (domain == null) {
            throw new NullPointerException("domain must not be null");
        }
        // Locale.ROOT keeps the key independent of the JVM default locale
        // (a Turkish default would otherwise turn "I" into a dotless i).
        final String key = domain.toLowerCase(Locale.ROOT);

        synchronized (LinkResolverHelperFunctions.class) {
            purgeAnyExpiredDomains();

            // if the timer doesn't exist, then setup the timer (IE: first time called)
            if (timer == null) {
                setupTimer();
            }

            final long currentTime = System.currentTimeMillis();

            // Single lookup: purgeAllDomainWaitTimes() does not take the class lock,
            // so a containsKey()/get() pair could observe the entry disappearing.
            final Date backOffEnd = RECENT_DOMAINS.get(key);

            if (backOffEnd == null || currentTime >= backOffEnd.getTime()) {
                // unknown domain, or back-off already satisfied: no wait
                RECENT_DOMAINS.put(key, new Date(currentTime + RECENT_DOMAINS_BACKOFF));
                return 0;
            }

            // Still backing off: wait out the remainder plus one more back-off period,
            // and record that extended time for whoever asks next.
            final long wait = (backOffEnd.getTime() - currentTime) + RECENT_DOMAINS_BACKOFF;
            RECENT_DOMAINS.put(key, new Date(currentTime + wait));

            // random stagger so that simultaneous callers do not all wake at once
            final long stagger = (long) (Math.random() * (RECENT_DOMAINS_BACKOFF / 5));
            return wait + stagger + 1;
        }
    }

    /**
     * Quick function to setup the daemon to clear domains to keep our memory foot-print low.
     * The task is scheduled repeatedly (every two back-off periods) so that entries are
     * still purged while no caller is asking for wait times.
     */
    private static void setupTimer() {
        final long interval = RECENT_DOMAINS_BACKOFF * 2;
        timer = new Timer(true);
        timer.schedule(new TimerTask() {
            @Override
            public void run() {
                purgeAnyExpiredDomains();
            }
        }, interval, interval);
    }

    /**
     * Remove every domain whose back-off time has passed.
     * Called by the timer and at the start of {@link #waitTimeForDomain(String)}.
     */
    private static void purgeAnyExpiredDomains() {
        // same lock as waitTimeForDomain, so the two never interleave
        synchronized (LinkResolverHelperFunctions.class) {
            final long currentTime = System.currentTimeMillis();

            // Walk the entries and remove through the iterator: no temporary key set
            // and no second lookup that could return null after a concurrent clear().
            final Iterator<Map.Entry<String, Date>> entries = RECENT_DOMAINS.entrySet().iterator();
            while (entries.hasNext()) {
                if (currentTime >= entries.next().getValue().getTime()) {
                    entries.remove();
                }
            }
        }
    }
}