package org.molgenis.gavin.job.input;

import com.google.common.collect.EnumMultiset;
import com.google.common.collect.Multiset;
import org.molgenis.gavin.job.input.model.CaddVariant;
import org.molgenis.gavin.job.input.model.LineType;
import org.molgenis.gavin.job.input.model.Variant;
import org.molgenis.gavin.job.input.model.VcfVariant;
import org.slf4j.Logger;
import org.springframework.stereotype.Component;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import static java.lang.String.format;
import static org.apache.commons.lang3.StringUtils.isEmpty;
import static org.molgenis.gavin.job.input.Files.getLines;
import static org.molgenis.gavin.job.input.model.LineType.*;
import static org.slf4j.LoggerFactory.getLogger;

/**
 * Parses input lines.
 * Two formats are supported: the output from the online CADD webtool, and a VCF of which only the first five
 * columns are read.
 */
@Component
public class Parser
{
	private static final Logger LOG = getLogger(Parser.class);

	// Column indices shared by both formats.
	private static final int CHROM_INDEX = 0;
	private static final int POS_INDEX = 1;

	// Column indices of a CADD output line.
	private static final int CADD_REF_INDEX = 2;
	private static final int CADD_ALT_INDEX = 3;
	private static final int CADD_RAW_SCORE_INDEX = 4;
	private static final int CADD_PHRED_SCORE_INDEX = 5;

	// Column indices of a VCF line.
	private static final int VCF_ID_INDEX = 2;
	private static final int VCF_REF_INDEX = 3;
	private static final int VCF_ALT_INDEX = 4;

	// A CADD line must have exactly this many columns; a VCF line at least VCF_NR_OF_COLUMNS.
	private static final int CADD_NR_OF_COLUMNS = 6;
	private static final int VCF_NR_OF_COLUMNS = 5;

	/**
	 * Once this many valid (VCF or CADD) lines have been parsed, all further lines are skipped.
	 */
	public static final int MAX_LINES = 100000;

	// Optional case-insensitive "chr" prefix followed by 1-22, X or Y; the named group excludes the prefix.
	private static final Pattern CHROM_PATTERN = Pattern
			.compile("([Cc][Hh][Rr])?(?<chrom>([1-9])|(1[0-9])|(2[0-2])|[xX]|[yY])");
	private static final Pattern REF_PATTERN = Pattern.compile("[ACTG]+");
	private static final Pattern ALT_PATTERN = Pattern.compile("[ACTG]+|\\.");

	/**
	 * Transforms gavin input file.
	 *
	 * @param inputFile the file to transform
	 * @param output    the file to output parsed variants to
	 * @param error     the file to output error lines to
	 * @return Multiset counting the {@link LineType}s of the input file's lines
	 * @throws IOException if the file interaction fails
	 */
	public Multiset<LineType> tryTransform(File inputFile, File output, File error) throws IOException
	{
		LOG.debug("Parsing {}...", inputFile.getAbsolutePath());
		try (Stream<String> lines = getLines(inputFile.toPath(), StandardCharsets.UTF_8);
				LineSink outputSink = new LineSink(output);
				LineSink errorSink = new LineSink(error))
		{
			Multiset<LineType> lineTypes = transformLines(lines, outputSink, errorSink);
			LOG.info("Parsed {}. LineTypes: {}", inputFile.getAbsolutePath(), lineTypes);
			return lineTypes;
		}
	}

	/**
	 * Transforms a stream of lines. A VCF header is written to the output sink first; after that every line is
	 * passed to {@link #transformLine(String, int, int, LineSink, LineSink)}, which writes it to the output sink
	 * or the error sink.
	 *
	 * @param lines      the Stream of lines to transform
	 * @param outputSink {@link LineSink} to write transformed lines to
	 * @param errorSink  {@link LineSink} to write unparseable lines to
	 * @return Multiset counting the {@link LineType}s found in the stream
	 */
	Multiset<LineType> transformLines(Stream<String> lines, LineSink outputSink, LineSink errorSink)
	{
		Multiset<LineType> lineTypes = EnumMultiset.create(LineType.class);
		writeVcfHeader(outputSink);
		// Each step reads the counts accumulated so far (for line numbering and the MAX_LINES cut-off) and
		// writes to the sinks, so the lines must be handled one at a time in encounter order.
		// forEachOrdered guarantees that; a stateful map(...) followed by forEach(...) does not.
		lines.forEachOrdered(line -> lineTypes.add(
				transformLine(line, lineTypes.size(), countValidLines(lineTypes), outputSink, errorSink)));
		return lineTypes;
	}

	/**
	 * Counts the lines that were successfully parsed as VCF or CADD so far.
	 *
	 * @param lineTypes the line types counted so far
	 * @return the number of VCF plus the number of CADD lines
	 */
	private int countValidLines(Multiset<LineType> lineTypes)
	{
		return lineTypes.count(VCF) + lineTypes.count(CADD);
	}

	/**
	 * Writes the VCF header lines to the output sink.
	 *
	 * @param outputSink {@link LineSink} to write the header to
	 */
	private void writeVcfHeader(LineSink outputSink)
	{
		outputSink.accept("##fileformat=VCFv4.0");
		outputSink.accept("##INFO=<ID=CADD,Number=.,Type=String,Description=\"Raw CADD score\">");
		outputSink.accept("##INFO=<ID=CADD_SCALED,Number=.,Type=String,Description=\"Scaled CADD score\">");
		outputSink.accept("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO");
	}

	/**
	 * Transforms a single line.
	 *
	 * @param line          the line to parse
	 * @param numLines      the number of lines already processed; the line is reported as line
	 *                      {@code numLines + 1} in the error sink
	 * @param numValidLines the number of valid lines already parsed
	 * @param outputSink    {@link LineSink} to write parsed variants to
	 * @param errorSink     {@link LineSink} to write lines to that we cannot parse
	 * @return LineType of the parsed line
	 */
	public LineType transformLine(String line, int numLines, int numValidLines, LineSink outputSink,
			LineSink errorSink)
	{
		if (numValidLines >= MAX_LINES)
		{
			return SKIPPED;
		}
		if (isComment(line))
		{
			return COMMENT;
		}
		Variant variant = tryParseVariant(line);
		if (variant == null)
		{
			writeErrorLine(errorSink, numLines, line);
			return ERROR;
		}
		LineType lineType = variant.getLineType();
		if (lineType == INDEL_NOCADD)
		{
			// Don't process indels without cadd annotation
			writeErrorLine(errorSink, numLines, line);
		}
		else
		{
			outputSink.accept(variant.toString());
		}
		return lineType;
	}

	/**
	 * Writes a rejected line to the error sink, prefixed with its one-based line number.
	 *
	 * @param errorSink {@link LineSink} to write to
	 * @param numLines  the number of lines processed before this one
	 * @param line      the rejected line
	 */
	private void writeErrorLine(LineSink errorSink, int numLines, String line)
	{
		errorSink.accept(format("Line %d:\t%s", numLines + 1, line));
	}

	/**
	 * Determines if a line is a comment line. Comment lines start with "#".
	 *
	 * @param line the line that may be a comment line
	 * @return true if the line is a comment line
	 */
	public boolean isComment(String line)
	{
		return line != null && line.startsWith("#");
	}

	/**
	 * Parses a line into a {@link Variant}. It may be one of two formats, CADD output or five VCF columns.
	 *
	 * @param line the line to parse
	 * @return parsed Variant, or null if the line could not be parsed
	 */
	public Variant tryParseVariant(String line)
	{
		try
		{
			return parseVariant(line);
		}
		catch (Exception ex)
		{
			// Deliberately best-effort: any failure means "this line cannot be parsed".
			LOG.debug("Error parsing line {}", line, ex);
			return null;
		}
	}

	/**
	 * Splits a line on tabs and tries to parse it as CADD output first, then as VCF.
	 *
	 * @param line the line to parse
	 * @return parsed Variant, or null if the line matches neither format
	 */
	private Variant parseVariant(String line)
	{
		String[] columns = line.split("\t");
		Variant caddVariant = parseCaddLine(columns);
		return caddVariant != null ? caddVariant : parseVcfLine(columns);
	}

	/**
	 * Determines if any of the values are null
	 *
	 * @param values the values that may be null
	 * @return true if any of the values was null
	 */
	private boolean anyNull(Object... values)
	{
		return Arrays.stream(values).anyMatch(Objects::isNull);
	}

	/**
	 * Attempts to parse a line as a CADD output record.
	 *
	 * @param columns the columns of the line
	 * @return parsed {@link CaddVariant}, or null if parsing failed
	 */
	private CaddVariant parseCaddLine(String[] columns)
	{
		if (columns.length != CADD_NR_OF_COLUMNS)
		{
			return null;
		}
		String chrom = parseChrom(columns[CHROM_INDEX].trim());
		Long pos = parsePos(columns[POS_INDEX].trim());
		if (anyNull(chrom, pos))
		{
			return null;
		}
		try
		{
			String ref = parseRef(columns[CADD_REF_INDEX].trim());
			String alt = parseAlt(columns[CADD_ALT_INDEX].trim());
			// Empty score columns yield null scores; non-numeric ones throw and reject the line as CADD.
			Double rawScore = parseDouble(columns[CADD_RAW_SCORE_INDEX].trim());
			Double phred = parseDouble(columns[CADD_PHRED_SCORE_INDEX].trim());
			if (anyNull(ref, alt))
			{
				return null;
			}
			return CaddVariant.create(chrom, pos, ref, alt, rawScore, phred);
		}
		catch (NumberFormatException e)
		{
			return null;
		}
	}

	/**
	 * Attempts to parse a line as a VCF record. Only the first five columns are read. An empty ID column is
	 * replaced with ".".
	 *
	 * @param columns the columns of the line
	 * @return parsed {@link VcfVariant}, or null if parsing failed
	 */
	private VcfVariant parseVcfLine(String[] columns)
	{
		if (columns.length < VCF_NR_OF_COLUMNS)
		{
			return null;
		}
		String chrom = parseChrom(columns[CHROM_INDEX].trim());
		Long pos = parsePos(columns[POS_INDEX].trim());
		String id = columns[VCF_ID_INDEX].trim();
		if (isEmpty(id))
		{
			id = ".";
		}
		String ref = parseRef(columns[VCF_REF_INDEX].trim());
		String alt = parseAlt(columns[VCF_ALT_INDEX].trim());
		if (anyNull(chrom, pos, ref, alt))
		{
			return null;
		}
		return VcfVariant.create(chrom, pos, id, ref, alt);
	}

	/**
	 * Parses a double value.
	 *
	 * @param doubleString the string to parse
	 * @return the parsed value, or null if the string is empty
	 * @throws NumberFormatException if the string is not empty and not a valid double
	 */
	private Double parseDouble(String doubleString) throws NumberFormatException
	{
		return isEmpty(doubleString) ? null : Double.parseDouble(doubleString);
	}

	/**
	 * Parses a chromosome name: 1-22, X or Y, optionally prefixed with "chr" (case-insensitive).
	 *
	 * @param chrom the chromosome string to parse
	 * @return the chromosome without prefix, in upper case, or null if the string is not a valid chromosome
	 */
	String parseChrom(String chrom)
	{
		Matcher m = CHROM_PATTERN.matcher(chrom);
		return !m.matches() ? null : m.group("chrom").toUpperCase();
	}

	/**
	 * Parses a position.
	 *
	 * @param pos the position string to parse
	 * @return the parsed position, or null if the string is not a valid long
	 */
	private Long parsePos(String pos)
	{
		try
		{
			return Long.parseLong(pos);
		}
		catch (NumberFormatException ex)
		{
			return null;
		}
	}

	/**
	 * Validates a reference allele: one or more of the characters A, C, T, G.
	 *
	 * @param value the reference allele string
	 * @return the value if it is valid, null otherwise
	 */
	private String parseRef(String value)
	{
		if (!REF_PATTERN.matcher(value).matches())
		{
			return null;
		}
		return value;
	}

	/**
	 * Validates an alternative allele: one or more of the characters A, C, T, G, or a single ".".
	 *
	 * @param value the alternative allele string
	 * @return the value if it is valid, null otherwise
	 */
	private String parseAlt(String value)
	{
		if (!ALT_PATTERN.matcher(value).matches())
		{
			return null;
		}
		return value;
	}
}