/* * $HeadURL$ * $Id$ * * Copyright (c) 2007-2012 by Public Library of Science * http://plos.org * http://ambraproject.org * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ambraproject.service.trackback; import org.ambraproject.models.Journal; import org.ambraproject.util.UriUtil; import org.apache.commons.configuration.Configuration; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; /** * An instance of this class translates links into DOIs according to the rules for a particular DOI resolver or journal * domain. * * @author Ryan Skonnord */ public abstract class InboundLinkTranslator { protected static final String WWW = "www."; protected static final String DEFAULT_DOI_SCHEME = "info:doi/"; protected static final String LOCAL_RESOLVER_KEY = "ambra.services.crossref.plos.doiurl"; protected static final String ARTICLE_ACTION_KEY = "ambra.platform.articleAction"; protected static final String JOURNAL_HOST_FORMAT = "ambra.virtualJournals.%s.url"; protected final String hostname; protected final boolean acceptWww; protected static String getHostnameForJournal(String journalKey, Configuration configuration) { String journalUrl = configuration.getString(String.format(JOURNAL_HOST_FORMAT, journalKey)); try { return new URL(journalUrl).getHost(); } catch (MalformedURLException e) { throw new IllegalStateException(e); } } /** * Accessed by public static factory methods. * <p/> * The {@code hostname} argument must be lowercase and not have a leading "www." regardless of {@code acceptWww}. If * the hostname is null, the new object will treat any hostname in an input link as valid. A null hostname should be * passed in only by {@link #forAnyJournal}. * * @param hostname the only valid hostname * @param acceptWww if {@code true}, ignore leading "www." on input hostnames */ private InboundLinkTranslator(String hostname, boolean acceptWww) { this.hostname = hostname; this.acceptWww = acceptWww; } /** * If the link refers to a DOI-identified resource on the domain that this object represents, return the DOI. Such a * return value <em>can</em> be the DOI of an object in this system but is not necessarily. Null means that the link * either does not refer to this object's domain or does not contain a DOI in a valid syntax. * * @return the DOI to which the link points, or {@code null} if the link does not indicate a DOI validly for this * object's domain */ public final String getDoi(URL link) { if (!"http".equals(link.getProtocol())) { // Links to articles can only be in HTTP // This will need to be updated if articles are ever served on other protocols such as HTTPS return null; } if (hostname != null && !validateHostname(link)) { return null; } String decodedPath = UriUtil.decodeUtf8(link.getPath()); if (decodedPath.length() <= 1) { return null; } decodedPath = decodedPath.substring(1); // strip leading slash String doiCandidate = getDoiFromPath(decodedPath); if (doiCandidate == null) { return null; } try { URI uri = new URI(doiCandidate); if (uri.getScheme() == null) { return null; // All DOIs used as article keys are prefixed with a URI scheme } } catch (URISyntaxException e) { return null; // By definition, all valid DOIs are valid URIs } return doiCandidate; } /** * Parse a DOI from a link's path. * <p/> * Return {@code null} if a DOI can't be found or if the link is definitely invalid. This method must not return a * valid DOI if the link wouldn't take a browser to that article. However, it is safe (but less efficient) to return * an invalid DOI which will eventually fail to match an article. * * @param path a URL path with the leading slash removed * @return a DOI that can be used as an article key (meaning it should have a scheme, usually {@code info:doi/}), or * null if the path is invalid */ protected abstract String getDoiFromPath(String path); /** * Check whether the host of the link matches this object's expected host, ignoring a "www." prefix if this object * says so. Hostnames are specified to be lowercase and browsers generally treat them as case-insensitive, so the * comparison is case-insensitive. * * @param link the URL to check * @return whether the URL's host matches */ protected boolean validateHostname(URL link) { String linkHost = link.getHost(); if (linkHost == null) { return false; } if (!acceptWww) { return linkHost.equalsIgnoreCase(hostname); } boolean linkHostHasWww = WWW.regionMatches(true, 0, linkHost, 0, WWW.length()); int linkHostOffset = linkHostHasWww ? WWW.length() : 0; int length = linkHost.length() - linkHostOffset; if (length != hostname.length()) { return false; } return hostname.regionMatches(true, 0, linkHost, linkHostOffset, length); } /** * Reflects behavior of the public DOI resolver at {@code http://dx.doi.org/}. */ public static final InboundLinkTranslator GLOBAL_RESOLVER = new InboundLinkTranslator("dx.doi.org", false) { @Override protected String getDoiFromPath(String path) { return path.startsWith(DEFAULT_DOI_SCHEME) ? path : DEFAULT_DOI_SCHEME + path; } }; /** * Construct a translator to reflect behavior of this Ambra instance's DOI resolver. * * @param configuration the local configuration * @return the translator object */ public static InboundLinkTranslator forLocalResolver(Configuration configuration) { URL resolverUrl; try { resolverUrl = new URL(configuration.getString(LOCAL_RESOLVER_KEY)); } catch (MalformedURLException e) { throw new IllegalStateException(e); } final String rootPath = resolverUrl.getPath().substring(1); // strip leading slash return new InboundLinkTranslator(resolverUrl.getHost(), false) { @Override protected String getDoiFromPath(String path) { if (path.startsWith(rootPath)) { path = path.substring(rootPath.length()); } // Expect "info:doi/" *not* to be in the link, but append it for the returned key return path.startsWith(DEFAULT_DOI_SCHEME) ? null : DEFAULT_DOI_SCHEME + path; } }; } public static InboundLinkTranslator forJournal(Journal journal, Configuration configuration) { return forJournal(journal.getJournalKey(), configuration); } public static InboundLinkTranslator forJournal(String journalKey, Configuration configuration) { String journalHost = getHostnameForJournal(journalKey, configuration); return forJournalByHost(journalHost, configuration); } public static InboundLinkTranslator forAnyJournal(Configuration configuration) { return forJournalByHost(null, configuration); } /** * Construct a translator that acts on URLs for a particular journal's host. * * @param journalHost the hostname of the journal * @param configuration the local configuration * @return the translator object */ private static InboundLinkTranslator forJournalByHost(String journalHost, Configuration configuration) { final String articleAction = configuration.getString(ARTICLE_ACTION_KEY); return new InboundLinkTranslator(journalHost, true) { @Override protected String getDoiFromPath(String path) { return path.startsWith(articleAction) ? path.substring(articleAction.length()) : null; } }; } }