/******************************************************************************* * Copyright 2013 EMBL-EBI * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package net.sf.cram.select; import htsjdk.samtools.SAMRecord; import java.util.EnumSet; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Describes and applies selection rules to SAMRecord fields. Examples: * * <pre> * /* select all * * select all * SEQ select only column with bases * *:SEQ select all columns including tags except for bases * *:SEQ:AM select all columns except bases and tag AM * *:SEQ/*:AM select all columns except bases and tag AM * SEQ:RNAME/*:NM:AM select bases, ref name and all tags except for NM and AM * SEQ:RNAME:NM:AM select bases, ref name and tags NM and AM * </pre> * * @author vadim * */ public class SAMFieldSelector { public static final String allSymbol = "*"; public static final String fieldSeparatorSymbol = ":"; public static final String fieldTagSeparatorSymbol = "/"; public static final EnumSet<FIELD_TYPE> onlyTags = EnumSet.of(FIELD_TYPE.TAG); public static final EnumSet<FIELD_TYPE> exceptTags = EnumSet.complementOf(onlyTags); private static final Set<SAMRecordField> ALL_FIELDS = new TreeSet<SAMRecordField>(); { for (FIELD_TYPE type : exceptTags) ALL_FIELDS.add(new SAMRecordField(type)); } public static final Pattern pattern = Pattern .compile("^([*]?)([\\p{Upper}:]+)?(?:(/?)([*]?)([\\p{Upper}\\d:]+)?)?$"); protected Set<SAMRecordField> fields = new TreeSet<SAMRecordField>(); protected Set<SAMRecordField> tags = new TreeSet<SAMRecordField>(); protected boolean allButFields = false; protected boolean allButTags = false; private int power = 0; private Map<String, SAMRecordField> tagFieldCache = new TreeMap<String, SAMRecordField>(); public SAMFieldSelector(String spec) { Matcher matcher = pattern.matcher(spec); if (!matcher.matches() || matcher.groupCount() != 5) throw new IllegalArgumentException("Confusing SAMRecord field selector: " + spec); allButFields = allSymbol.equals(matcher.group(1)); if (matcher.group(3).length() == 0) allButTags = allButFields; else allButTags = allSymbol.equals(matcher.group(4)); Set<SAMRecordField> f1 = parseListOfFields(matcher.group(2)); Set<SAMRecordField> f2 = parseListOfFields(matcher.group(5)); fields.addAll(filterByType(f1, exceptTags)); tags.addAll(filterByType(f1, onlyTags)); tags.addAll(filterByType(f2, onlyTags)); } private SAMRecordField getCachedTagField(String id) { SAMRecordField f = tagFieldCache.get(id); if (f == null) { f = SAMRecordField.fromTagId(id); tagFieldCache.put(id, f); } return f; } public Map<SAMRecordField, Object> getValues(SAMRecord record, Map<SAMRecordField, Object> map) { if (map == null) map = new TreeMap<SAMRecordField, Object>(); if (allButFields) { for (SAMRecordField f : ALL_FIELDS) { if (!fields.contains(f)) map.put(f, f.getValue(record)); } } else { for (SAMRecordField f : fields) { if (fields.contains(f)) map.put(f, f.getValue(record)); } } if (allButTags) { for (SAMRecord.SAMTagAndValue tv : record.getAttributes()) { SAMRecordField f = getCachedTagField(tv.tag); if (!tags.contains(f)) map.put(f, f.getValue(record)); } } else { for (SAMRecord.SAMTagAndValue tv : record.getAttributes()) { SAMRecordField f = getCachedTagField(tv.tag); if (tags.contains(f)) map.put(f, f.getValue(record)); } } return map; } public boolean matches(SAMRecordField field) { if (field.type == FIELD_TYPE.TAG) return allButTags ^ tags.contains(field); return allButFields ^ fields.contains(field); } @Override public String toString() { StringBuilder sb = new StringBuilder(); if (allButFields && fields.isEmpty()) { if (allButTags && tags.isEmpty()) return "*"; } if (allButFields) sb.append(allSymbol); if (!fields.isEmpty()) { boolean first = true; for (SAMRecordField f : fields) { if (!first || allButFields) sb.append(":"); sb.append(f.toString()); first = false; } } if (allButTags || !tags.isEmpty()) { sb.append("/"); if (allButTags) sb.append(allSymbol); if (!tags.isEmpty()) { boolean first = true; for (SAMRecordField f : tags) { if (!first || allButTags) sb.append(":"); sb.append(f.toString()); first = false; } } } return sb.toString(); } @Override public boolean equals(Object obj) { if (!(obj instanceof SAMFieldSelector)) return false; SAMFieldSelector f = (SAMFieldSelector) obj; if (allButFields != f.allButFields) return false; if (allButTags != f.allButTags) return false; if (!fields.equals(f.fields)) return false; if (!tags.equals(f.tags)) return false; return super.equals(obj); } public static Set<SAMRecordField> filterByType(Set<SAMRecordField> set, EnumSet<FIELD_TYPE> types) { Set<SAMRecordField> result = new TreeSet<SAMRecordField>(); for (SAMRecordField f : set) if (types.contains(f.type)) result.add(f); return result; } public static Set<SAMRecordField> parseListOfFields(String spec) { Set<SAMRecordField> set = new TreeSet<SAMRecordField>(); if (spec == null) return set; for (String s : spec.split(":")) { if (s.length() == 0) continue; SAMRecordField field = SAMRecordField.parseString(s); set.add(field); } return set; } /** * Not sure if needed. Allows to specifiy details about each field, only [N] * the length of the field is supported. * * @author vadim * */ private static class FieldSpec { public SAMRecordField field; public int maxLen = Integer.MAX_VALUE; public String trimMarker = "..."; private static Pattern trimPattern = Pattern.compile("^(\\p{Upper}+)(?:\\[(\\d+)\\])?$"); public FieldSpec(String spec) { // handles only [] for now: Matcher matcher = trimPattern.matcher(spec); if (matcher.matches()) { switch (matcher.groupCount()) { case 1: field = SAMRecordField.parseString(matcher.group(1)); break; case 2: field = SAMRecordField.parseString(matcher.group(1)); if (matcher.group(2) != null) maxLen = Integer.parseInt(matcher.group(2)); break; default: throw new IllegalArgumentException("Failed to parse field specification: " + spec); } } } @Override public String toString() { if (maxLen < Integer.MAX_VALUE) return String.format("%s[%d]", field.toString(), maxLen); return field.toString(); } } public static void main(String[] args) { test("SEQ"); test("*:SEQ"); test("*:SEQ:AM"); test("*SEQ:AM"); test("*:SEQ/*:AM"); test("SEQ:RNAME/*:NM:AM"); test("SEQ:RNAME:NM:AM"); test("*/*"); test("*"); SAMFieldSelector s = new SAMFieldSelector("*:SEQ:RNAME:NM:AM"); System.out.println(s); System.out.println(s.matches(new SAMRecordField(FIELD_TYPE.RNAME))); System.out.println(s.matches(new SAMRecordField(FIELD_TYPE.SEQ))); System.out.println(s.matches(new SAMRecordField(FIELD_TYPE.POS))); System.out.println(s.matches(SAMRecordField.fromTagId("NM"))); System.out.println(s.matches(SAMRecordField.fromTagId("AM"))); System.out.println(s.matches(SAMRecordField.fromTagId("OQ"))); } private static void test(String spec) { SAMFieldSelector s = new SAMFieldSelector(spec); System.out.println(spec + "\t" + s.toString()); System.out.println(new SAMFieldSelector(s.toString()).toString()); System.out.println(); } }