package uk.ac.ebi.fg.myequivalents.utils; import static uk.ac.ebi.fg.myequivalents.model.Service.UNSPECIFIED_SERVICE_NAME; import java.net.URI; import java.net.URISyntaxException; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import uk.ac.ebi.fg.myequivalents.model.Entity; import uk.ac.ebi.fg.myequivalents.model.EntityId; import uk.ac.ebi.fg.myequivalents.model.Service; /** * <p>This resolves an entity ID string into a an {@link EntityId}, that is, a pair of service + accession (+ URI). * an entity ID string follow a simple syntax, as explained in {@link #parse(String)}, which allows one to specify * either a service name + accession, or a straight URI, with or without the service reference added to it.</p> * * <p>Note that his base class is almost useless alone and it requires a backend-specific implementation, such as * DbEntityIdResolver in the DB package (see {@link #resolveUri(String, String, String)}).</p> * * @author brandizi * <dl><dt>Date:</dt><dd>28 May 2015</dd> * */ public class EntityIdResolver { /** * Used to split an entity ID string into two substrings, separated by colon. This regular expression * doesn't consider special sequences like '\:', '://', '::' as separators. */ public static final Pattern ENT_ID_SPLIT_PATTERN = Pattern.compile ( "(?<!(\\\\|:+)):(?!//|:+)" ); /** * Used to identify the '$id' string, which is considered an accession placeholder in URI patterns * (like in <a href = 'http://www.ebi.ac.uk/miriam'>MIRIAM</a>). This regular expression ignores * the sequence '\$id'. */ public static final Pattern ID_PLACEHOLDER_PATTERN = Pattern.compile ( "(?<!\\\\)\\$id" ); /** * Used in {@link #breakUri(String, String)}, to find the last separator in a URI, so that what follows can be considered * an accession. */ public static final Pattern URI_FIND_PREFIX_PATTERN = Pattern.compile ( "[\\#/\\=\\\\]+" ); /** * An entity ID may have the forms: * <ul> * <li>serviceName:accession, URI is computed from {@link Entity#getURI()}, i.e., by using * {@link Service#getUriPattern() uri pattern} and accession.</li> * <li>serviceName:<uri>, the URI is intended to refer an entity provided by the service (an error is raised * if it isn't)</li> * <li><uri>, a straight URI, in this case {@link #resolveUri(String)} tries to see if the URI corresponds to * some existing myEquivalents service to be associated to the URI (resolving this type of entityId is a bit slower * than all other syntax formats, use :<>, _:<>, or service:<>, if you can. * <li>:<uri> or _:<uri>, which means the URI is not linked to a real service, but to the special * {@link Service#UNSPECIFIED_SERVICE}.</li> * </ul> * * Note that both the initial string and its sub-parts are pre-processed to trim extra boundary spaces. */ public EntityId parse ( String entityId ) { if ( entityId == null ) return null; entityId = entityId.trim (); if ( entityId.startsWith ( "<" ) ) { // A full URI, no service mentioned if ( !entityId.endsWith ( ">" ) ) throw new RuntimeException ( "Syntax error for entity ID '" + entityId + "'" ); String uri = entityId.substring ( 1, entityId.length () - 1 ); return new EntityId ( (String) null, null, uri ); } else { // a service:acc or service:<uri> form String uri = null; // Try to extract the two chunks demarked by ':' Matcher matcher = ENT_ID_SPLIT_PATTERN.matcher ( entityId ); if ( matcher.find () ) { int idx = matcher.start (); String serviceName = StringUtils.trimToNull ( entityId.substring ( 0, idx ) ); String acc = StringUtils.trimToNull ( entityId.substring ( idx + 1 ) ); if ( acc == null ) throw new RuntimeException ( "Syntax error (null entity accession) for entity ID '" + entityId + "'" ); // Is this acc actually a URI (i.e., wrapped in <>)? if ( acc.startsWith ( "<" ) ) { if ( !acc.endsWith ( ">" ) ) throw new RuntimeException ( "Syntax error for entity ID '" + entityId + "'" ); // then we have a URI and no real acc uri = acc.substring ( 1, acc.length () - 1 ); acc = null; // if the service name was empty in this *:<*> pattern, then you mean unspecified // because if you want us to resolve the service, you should omit colon. if ( serviceName == null ) serviceName = UNSPECIFIED_SERVICE_NAME; } return new EntityId ( serviceName, acc, uri ); } // None of the above worked throw new RuntimeException ( "Syntax error (null entity accession) for entity ID '" + entityId + "'" ); } } /** * invokes {@link #parse(String)} and then, if the result is non-null, {@link #resolve(EntityId)}. */ public EntityId doall ( String entityId ) { EntityId eid = parse ( entityId ); return eid == null ? null : resolve ( eid ); } /** * if uri != null invokes some form of resolveUri() (see below), else uses {@link #resolve(String, String)}. */ public EntityId resolve ( String serviceName, String acc, String uri ) { serviceName = StringUtils.trimToNull ( serviceName ); acc = StringUtils.trimToNull ( acc ); uri = StringUtils.trimToNull ( uri ); if ( uri != null ) { EntityId result = serviceName == null ? resolveUri ( uri ) // No service, try to get it from the URI : acc == null ? resolveUri ( serviceName, uri ) // service + URI, no acc, verify it corresponds to the URI pattern. : resolveUri ( serviceName, acc, uri ); // verify that the URI rebuilt with pattern + acc matches the param return result; } else // No URI, so it's in the form serviceName:acc return resolve ( serviceName, acc ); } /** * Just a wrapper for {@link #resolve(String, String, String)}. */ public EntityId resolve ( EntityId eid ) { if ( eid == null ) throw new RuntimeException ( "Cannot resolve a null entity ID" ); return resolve ( eid.getServiceName (), eid.getAcc (), eid.getUri () ); } /** * <p>Assumes a null URI and returns a new {@link EntityId}, using the two received parameters.</p> * * <p>So, here we don't actually 'resolve' anything here, ie, we don't lookup the service into * any storage backend. We suggest that you keep this method this way even in backend-specific * implementation, since it's usually used when the invoker already knows to be dealing with * a service:acc entity identifier and usually it doesn't need the service in such a situation.</p> * * <p>See DbEntityIdResolver in the DB package for details.</p> * */ public EntityId resolve ( String serviceName, String acc ) { if ( serviceName == null || acc == null ) throw new RuntimeException ( String.format ( "Syntax error for '%s:%s': cannot resolve a service:acc pair if either of them are null", serviceName, acc )); return new EntityId ( serviceName, acc ); } /** * Invokes {@link #resolveUri(String, String, String) resolveUri ( null, null, uri )}. */ public EntityId resolveUri ( String uri ) { return resolveUri ( null, null, uri ); } /** * Invokes {@link #resolveUri(String, String, String) resolveUri ( serviceName, null, uri )}. */ public EntityId resolveUri ( String serviceName, String uri ) { return resolveUri ( serviceName, null, uri ); } /** * <p>It should work this way: * * <ul> * <li>If serviceName is specified, just verify that the pattern corresponds to the URI and the accession * (if the latter is specified;</li> * <li>if the serviceName isn't available, try to find exactly one service that has a URI pattern matching the URI, * possibly verify the result using the specified accession. If no service cannot be found, or more than one exist * that match the URI, return an exception.</li> * </ul> * </p> * * <p>This default implementation just uses {@code 'new Service (serviceName)'}.</p> * * <p>A real implementation should try to fetch the service (including {@link Service#UNSPECIFIED_SERVICE_NAME}) and, * if the acc != null, verify the URI.</p> * * <p>This method is not supposed to pre-process its parameters (eg, {@link String#trim()}), since that's * usually done by {@link #parse(String)}.</p> * */ public EntityId resolveUri ( String serviceName, String acc, String uri ) { return new EntityId ( new Service ( serviceName ), acc, uri ); } /** * Tries to find the prefix in a URI having a form like {@code prefix + <acc>}. Eg, for * http://www.somewhere.net/path/to/123 returns into http://www.somewhere.net/path/to/$id, even * when {@code acc} is null (uses the last slash as separator). * * If this prefix-guessing doesn't work, tries to split the URI's after the domain * specification (eg, http://www.somewhere.net), so that this can be used to find one or more services. * * If acc != null, it simply returns uri.replaceAll ( acc, "\\$id" ), i.e., quickly rebuilds the URI pattern * by replacing the accession it contains with the placeholder. * */ public static String breakUri ( String acc, String uri ) { if ( acc != null ) return uri.replaceAll ( acc, "\\$id" ); String ruri = StringUtils.reverse ( uri ); Matcher matcher = URI_FIND_PREFIX_PATTERN.matcher ( ruri ); if ( matcher.find () ) return uri.substring ( 0, uri.length () - matcher.start () ) + "$id"; // else, try to find the domain return getDomain ( uri ); } /** * Wraps {@link #breakUri(String, String) breakUri ( null, uri )}. */ public static String breakUri ( String uri ) { return breakUri ( null, uri ); } /** * Tries to split the URI at the point where the domain ends. e.g., for http://www.somewhere.net/path/to/123 * returns http://www.somewhere.net. This uses methods from {@link URI}. */ public static String getDomain ( String uri ) { try { URI urio = new URI ( uri ); String auth = urio.getRawAuthority (); if ( auth == null || auth.length () == 0 ) return uri; int idx = uri.indexOf ( auth ); return uri.substring ( 0, idx + auth.length () ); } catch ( URISyntaxException ex ) { return uri; } } /** * Simply builds the a URI replacing '$id' in uriPattern with acc (actually uses {@link #ID_PLACEHOLDER_PATTERN}). * Note that if uriPattern = "$id", it quickly returns acc. */ public static String buildUriFromAcc ( String acc, String uriPattern ) { if ( uriPattern == null ) return null; // Special case the URI is also the accession (ie, UNSPECIFIED_SERVICE), let's speed up things a bit if ( "$id".equals ( uriPattern ) ) return acc; // Replace $id, unless it's \\$id return ID_PLACEHOLDER_PATTERN.matcher ( uriPattern ).replaceAll ( acc ); } /** * Tries to extract the accession from a given URI, matching the {@link #ID_PLACEHOLDER_PATTERN '$id'} pattern. */ public static String extractAccession ( String uri, String uriPattern ) { if ( uriPattern == null ) return null; // First find '$id' in the pattern Matcher matcher = ID_PLACEHOLDER_PATTERN.matcher ( uriPattern ); if ( !matcher.find () ) return null; int startIdx = matcher.start (), endIdx = matcher.end (); // Now split it into prefix and postfix around the ID placeholder String uriPatPrefx = uriPattern.substring ( 0, startIdx ); // Get a pattern tail but only up to the next $id placeholder String uriPatTail = null; if ( endIdx < uriPattern.length () ) { int nextEndIdx = matcher.find () ? matcher.start () : uriPattern.length (); uriPatTail = uriPattern.substring ( endIdx, nextEndIdx ); } // Does the URI's head match the extracted head? if ( !uri.startsWith ( uriPatPrefx ) ) return null; // Does the URI tail match the extracted tail? int uriPatPrefxLen = uriPatPrefx.length (); int endAccIdx = uriPatTail != null ? uri.indexOf ( uriPatTail, uriPatPrefxLen ) : uri.length (); if ( endAccIdx == -1 ) return null; // If both head and tail match, then the accession is the thing in between return uri.substring ( uriPatPrefxLen, endAccIdx ); } }