package com.widowcrawler.parse; import com.netflix.governator.annotations.AutoBindSingleton; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.Validate; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.ws.rs.core.UriBuilder; import java.net.URI; import java.net.URISyntaxException; /** * @author Scott Mansfield */ public class LinkNormalizer { private static final Logger logger = LoggerFactory.getLogger(LinkNormalizer.class); /** * Normalizes a link extracted from a page given the page's original URI. * * @param original The URI of the page that is being parsed * @param extracted The extracted link from the page * @return The normalized URI, or null if there was an error parsing the extracted URI. */ public String normalize(String original, String extracted) { Validate.notBlank(original); Validate.notNull(extracted); URI originalUri = null; URI extractedUri = null; try { originalUri = new URI(original); extractedUri = new URI(extracted); } catch (URISyntaxException ex) { logger.warn("Extracted URI is invalid: " + extracted, ex); return null; } UriBuilder retval = UriBuilder.fromUri(extractedUri); boolean hasHost = StringUtils.isNotBlank(extractedUri.getHost()); boolean hasScheme = StringUtils.isNotBlank(extractedUri.getScheme()); if (!hasScheme) { retval.scheme(originalUri.getScheme()); } if (!hasHost) { // moved to below, checking // TODO: Seeing IllegalArgumentException: Schema specific part is opaque // Maybe move this below the scheme instead of the other way around? retval.host(originalUri.getHost()); String normalizedPath = normalizePath(originalUri.getPath(), extractedUri.getPath()); retval.replacePath(normalizedPath); } retval.fragment(""); return retval.toString(); } private String normalizePath(String originalPath, String extractedPath) { originalPath = findPathDirectory(originalPath); if (StringUtils.startsWith(extractedPath, "../")) { while (StringUtils.startsWith(extractedPath, "../") && originalPath.length() > 0) { originalPath = removePathChunkAtEnd(originalPath); extractedPath = removePathChunkAtStart(extractedPath); } } originalPath = StringUtils.stripEnd(originalPath, "/"); extractedPath = StringUtils.stripStart(extractedPath, "/"); return originalPath + "/" + extractedPath; } private String findPathDirectory(String path) { int lastSlash = StringUtils.lastIndexOf(path, "/"); if (lastSlash == path.length() - 1) { // already a directory return path; } return removePathChunkAtEnd(path); } private String removePathChunkAtEnd(String path) { path = StringUtils.stripEnd(path, "/"); int lastSlash = StringUtils.lastIndexOf(path, "/"); return StringUtils.substring(path, 0, lastSlash); } private String removePathChunkAtStart(String path) { path = StringUtils.stripStart(path, "/"); int firstSlash = StringUtils.indexOf(path, "/"); return StringUtils.substring(path, firstSlash + 1, path.length()); } }