package org.jabref.logic.net;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import java.util.regex.Pattern;
/**
 * Static helpers for recognising and cleaning up URL strings.
 * The class is not instantiable.
 */
public class URLUtil {

    /** Matches strings that start with an http, https or ftp scheme followed by at least one character. */
    private static final Pattern URL_PATTERN = Pattern.compile("^(https?|ftp)://.+");

    /** Matches Google redirect URLs, i.e. {@code http(s)://[www.]google.<tld>/url...}. */
    private static final Pattern GOOGLE_SEARCH_PATTERN =
            Pattern.compile("^https?://(?:www\\.)?google\\.[\\.a-z]+?/url.*");

    /** Prefix of the query parameter that carries the destination URL. */
    private static final String URL_PARAMETER_PREFIX = "url=";

    private URLUtil() {
    }

    /**
     * Cleans URLs returned by Google search.
     *
     * <p>If you copy links from search results from Google, all links will be enriched with
     * search meta data, e.g.
     * <pre>{@code
     * https://www.google.de/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&uact=8&&url=http%3A%2F%2Fwww.inrg.csie.ntu.edu.tw%2Falgorithm2014%2Fhomework%2FWagner-74.pdf&ei=DifeVYHkDYWqU5W0j6gD&usg=AFQjCNFl638rl5KVta1jIMWLyb4CPSZidg&sig2=0hSSMw9XZXL3HJWwEcJtOg
     * }</pre>
     * The destination is taken from the first {@code url} query parameter whose decoded value
     * starts with an http, https or ftp scheme.
     *
     * <p>This method is best-effort: if the input is not a Google search URL, has no usable
     * {@code url} parameter, or cannot be parsed or decoded, the input is returned unchanged.
     *
     * @param url the Google search URL string
     * @return the cleaned Google URL or {@code url} if no search URL was detected
     * @throws NullPointerException if {@code url} is {@code null}
     */
    public static String cleanGoogleSearchURL(String url) {
        Objects.requireNonNull(url);

        if (!GOOGLE_SEARCH_PATTERN.matcher(url).matches()) {
            return url;
        }

        try {
            String query = new URL(url).getQuery();
            if (query == null) {
                // no parameters at all, so nothing to extract
                return url;
            }

            for (String pair : query.split("&")) {
                if (!pair.startsWith(URL_PARAMETER_PREFIX)) {
                    continue;
                }
                // the "clean" URL is the decoded value of the "url" parameter
                String value = pair.substring(URL_PARAMETER_PREFIX.length());
                String decoded = URLDecoder.decode(value, StandardCharsets.UTF_8.name());
                if (URL_PATTERN.matcher(decoded).matches()) {
                    return decoded;
                }
            }
            return url;
        } catch (UnsupportedEncodingException | MalformedURLException | IllegalArgumentException e) {
            // IllegalArgumentException is thrown by URLDecoder for malformed percent-escapes
            // (e.g. "%zz"); like the other failures, fall back to the unmodified input.
            return url;
        }
    }

    /**
     * Checks whether the given String is a URL.
     * Currently only checks for a protocol separator ({@code "://"}) anywhere in the String.
     *
     * @param url the String to check for a URL
     * @return true if {@code url} contains {@code "://"}
     * @throws NullPointerException if {@code url} is {@code null}
     */
    public static boolean isURL(String url) {
        return url.contains("://");
    }
}