package org.genedb.db.loading; import org.apache.log4j.Logger; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Represents an EMBL location. * * @author rh11 * */ public abstract class EmblLocation { private static final Logger logger = Logger.getLogger(EmblLocation.class); /** * A regular expression that matches an EMBL <symbol>, as defined in * Appendix II of the feature table definition. * * (Update: In the Apr 2009 version 8.1 of the feature table definition, * the BNF appendix has been removed. The permitted characters are still * listed in section 3.1.) */ private static final String symbol = "[A-Za-z0-9_\\-'*]*[A-Za-z][A-Za-z0-9_\\-'*]*"; /** * A pattern that matches an external location. Does not validate the local part. */ private static final Pattern externalPattern = Pattern.compile(String.format("(?:(%s)::)?(%s)\\.(\\d+):(.+)", symbol, symbol)); public static EmblLocation parse(String locationString) throws ParsingException { //nds: A trim here gets rid of whitespaces which you cannot see in the embl files //which can cause quite a lot of confusing error messages locationString = locationString.trim(); if (locationString.startsWith("complement")) { return Complement.parse(locationString.substring(11, locationString.length() - 1)); } else if (locationString.startsWith("join(")) { if (!locationString.endsWith(")")) { throw new SyntaxError( String.format("Failed to parse join location '%s': no closing parenthesis at end", locationString)); } return Join.parse(locationString.substring(5, locationString.length() - 1).split(",")); } else if (locationString.startsWith("order(")) { if (!locationString.endsWith(")")) { throw new SyntaxError( String.format("Failed to parse order location '%s': no closing parenthesis at end", locationString)); } return Order.parse(locationString.substring(6, locationString.length() - 1).split(",")); } else if (locationString.matches("<?\\d+\\.\\.>?\\d+|\\d+\\^\\d+|[<>]?\\d+")) { return Simple.parse(locationString); } else if (locationString.matches("gap\\((?:unk100|\\d+)\\)")) { return Gap.parse(locationString); } else if (externalPattern.matcher(locationString).matches()) { return External.parse(locationString); } else { throw new SyntaxError("Cannot parse location string '" + locationString + "'"); } } public abstract int getStrand(); public abstract int getFmin(); public abstract int getFmax(); public boolean isExternal() { return false; } public List<EmblLocation> getParts() { return Collections.singletonList(this); } static class Complement extends EmblLocation { public static Complement parse(String locationString) throws ParsingException { return new Complement(EmblLocation.parse(locationString)); } EmblLocation location; public Complement(EmblLocation location) { this.location = location; } @Override public int getStrand() { return -location.getStrand(); } @Override public int getFmin() { return location.getFmin(); } @Override public int getFmax() { return location.getFmax(); } @Override public String toString() { return String.format("complement(%s)", location.toString()); } @Override public List<EmblLocation> getParts() { // Reverse the order of the parts, and complement each one List<EmblLocation> parts = location.getParts(); EmblLocation[] ret = new EmblLocation[parts.size()]; for (int i=0; i < parts.size(); i++) { EmblLocation part = parts.get(i); EmblLocation complementedPart = part instanceof Complement ? ((Complement)part).location : new Complement(part); ret[parts.size() - i - 1] = complementedPart; } return Arrays.asList(ret); } @Override public boolean isExternal() { return location.isExternal(); } } /** * A joining location: either join(...) or order(...). * @author rh11 * */ abstract static class Joining extends EmblLocation { protected abstract String operator(); private int fmin = Integer.MAX_VALUE, fmax = Integer.MIN_VALUE; protected void add(EmblLocation location) throws DataError { if (! (location instanceof Gap) && !location.isExternal()) { int locationFmin = location.getFmin(); int locationFmax = location.getFmax(); int locationStrand = location.getStrand(); if (locationFmin < fmin) { if (locationStrand != -1 && !locations.isEmpty()) { throw new DataError("Locations are joined in the wrong order"); } fmin = locationFmin; } if (locationFmax > fmax) { if (locationStrand == -1 && !locations.isEmpty()) { throw new DataError("Locations are joined in the wrong order"); } fmax = locationFmax; } } locations.add(location); } List<EmblLocation> locations = new ArrayList<EmblLocation>(); @Override public int getStrand() { int strand = 0; for (EmblLocation location: locations) { if (strand == 0) { strand = location.getStrand(); } else if (strand != location.getStrand()) { // This could occasionally be okay, if we have trans-splicing // between opposite strands (perhaps of different chromosomes). throw new RuntimeException("This EMBL location joins features from different strands. " + "That's probably a mistake; if not, the code needs to be extended to cope."); } } return strand; } @Override public int getFmin() { if (locations.isEmpty()) { throw new RuntimeException("A join that doesn't join anything?"); } return fmin; } @Override public int getFmax() { if (locations.isEmpty()) { throw new RuntimeException("A join that doesn't join anything?"); } return fmax; } @Override public String toString() { StringBuilder sb = new StringBuilder(); for (EmblLocation location: locations) { if (sb.length() > 0) { sb.append(','); } sb.append(location.toString()); } return String.format("%s(%s)", operator(), sb); } @Override public List<EmblLocation> getParts() { List<EmblLocation> ret = new ArrayList<EmblLocation>(); for (EmblLocation location: locations) { ret.addAll(location.getParts()); } return ret; } @Override public boolean isExternal() { for (EmblLocation location: locations) { if (location.isExternal()) { return true; } } return false; } } static class Join extends Joining { @Override protected String operator() { return "join"; } public static Join parse(String[] locationStrings) throws ParsingException { Join join = new Join(); for (String locationString: locationStrings) { join.add(EmblLocation.parse(locationString)); } return join; } } static class Order extends Joining { @Override protected String operator() { return "order"; } public static Order parse(String[] locationStrings) throws ParsingException { Order order = new Order(); for (String locationString: locationStrings) { order.add(EmblLocation.parse(locationString)); } return order; } } static class External extends EmblLocation { String database; String accession; int version; Simple simple; public static External parse(String locationString) throws ParsingException { Matcher matcher = externalPattern.matcher(locationString); if (!matcher.matches()) { throw new RuntimeException(String.format("Failed to parse location string '%s'", locationString)); } External external = new External(); external.database = matcher.group(1); external.accession = matcher.group(2); external.version = Integer.parseInt(matcher.group(3)); external.simple = Simple.parse(matcher.group(4)); return external; } @Override public int getStrand() { return simple.getStrand(); } @Override public int getFmin() { return simple.getFmin(); } @Override public int getFmax() { return simple.getFmax(); } @Override public String toString() { if (database != null) { return String.format("%s::%s.%d:%s", database, accession, version, simple.toString()); } else { return String.format("%s.%d:%s", accession, version, simple.toString()); } } @Override public boolean isExternal() { return true; } } static class Simple extends EmblLocation { int fmin, fmax; boolean isFminPartial = false, isFmaxPartial = false; public Simple(int fmin, int fmax) { this.fmin = fmin; this.fmax = fmax; } public Simple(int fmin, boolean isFminPartial, int fmax, boolean isFmaxPartial) { this.fmin = fmin; this.isFminPartial = isFminPartial; this.fmax = fmax; this.isFmaxPartial = isFmaxPartial; } public int getLength() { return fmax - fmin; } /* * We lose some partiality information when parsing a simple location, * because the < and > symbols are ignored. Notice that single-base locations * of the form <23 (say) are not even representable in Chado. */ private static final Pattern rangePattern = Pattern.compile("(<)?(\\d+)\\.\\.(>)?(\\d+)"); private static final Pattern interbasePattern = Pattern.compile("(\\d+)\\^(\\d+)"); private static final Pattern singleBasePattern = Pattern.compile("([<>])?(\\d+)"); public static Simple parse(String locationString) throws ParsingException { Matcher rangeMatcher = rangePattern.matcher(locationString); Matcher interbaseMatcher = interbasePattern.matcher(locationString); Matcher singleBaseMatcher = singleBasePattern.matcher(locationString); if (rangeMatcher.matches()) { boolean isFminPartial = rangeMatcher.group(1) != null; int fmin = Integer.parseInt(rangeMatcher.group(2)); boolean isFmaxPartial = rangeMatcher.group(3) != null; int fmax = Integer.parseInt(rangeMatcher.group(4)); if (fmin > fmax) { throw new DataError("Range end is before range start. (We don't support wrap-around features on circular chromosomes.)"); } return new Simple(fmin-1, isFminPartial, fmax, isFmaxPartial); } else if (interbaseMatcher.matches()) { int before = Integer.parseInt(interbaseMatcher.group(1)); int after = Integer.parseInt(interbaseMatcher.group(2)); if (after - before != 1) { throw new SyntaxError(String.format("Failed to parse location '%s'", locationString)); } return new Simple(before, before); } else if (singleBaseMatcher.matches()) { boolean isPartial = singleBaseMatcher.group(1) != null; int base = Integer.parseInt(singleBaseMatcher.group(2)); if (isPartial) { logger.warn(String.format("Location string '%s' has a form that cannot be represented in Chado" + "(a single base of indeterminate location)", locationString)); } return new Simple(base-1, base); } else { // This is an unusual error, since under most circumstances we won't even // try to parse a string as a SimpleLocation unless it looks like one. However // there is at least one exception to this: the local part of an External location // is parsed as a throw new SyntaxError(String.format( "Failed to parse simple location '%s'", locationString)); } } @Override public int getStrand() { return 1; } @Override public int getFmin() { return fmin; } @Override public int getFmax() { return fmax; } @Override public String toString() { if (fmin == fmax) { return String.format("%d^%d", fmin, fmin + 1); } else { return String.format("%s%d..%s%d", isFminPartial ? "<" : "", fmin+1, isFmaxPartial ? ">" : "", fmax); } } } private static final Pattern gapPattern = Pattern.compile("gap\\((?:(\\d+)|unk100)\\)"); abstract static class Gap extends EmblLocation { public static Gap parse(String locationString) throws ParsingException { Matcher matcher = gapPattern.matcher(locationString); if (!matcher.matches()) { throw new SyntaxError("Failed to parse gap: "+locationString); } String gapSizeString = matcher.group(1); if (gapSizeString == null) { return new UnknownGap(); } else { return new KnownGap(Integer.parseInt(gapSizeString)); } } public abstract int getLength(); @Override public int getStrand() { return 0; } @Override public int getFmin() { throw new RuntimeException("A gap doesn't know where it is. You have to work it out from the context."); } @Override public int getFmax() { // So that a subclass need only override getFmin() return getFmin() + getLength(); } } static class KnownGap extends Gap { private int size; public KnownGap(int size) { this.size = size; } @Override public int getLength() { return size; } @Override public String toString() { return String.format("gap(%d)", size); } } static class UnknownGap extends Gap { @Override public String toString() { return "gap(unk100)"; } @Override public int getLength() { return 100; } } }