/* * The MIT License * * Copyright (c) 2011 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.util; import htsjdk.samtools.util.AbstractIterator; import htsjdk.samtools.util.CloseableIterator; import picard.PicardException; import java.util.Iterator; /** * Class for parsing text files where each line consists of fields separated by whitespace. * Code is abstracted into this class so that we can optimize its performance over time. * * This class assumes that every line will have the same number of whitespace-separated "words" * and that lines that start with "#" are comments and should be ignored. * * Classes that extend this parser can do so simply by implementing their own constructors and the * readNextLine(), close(), and getFileName() methods. * * @author Kathleen Tibbetts */ public abstract class AbstractInputParser extends AbstractIterator<String[]> implements Iterable<String[]>, CloseableIterator<String[]> { private boolean treatGroupedDelimitersAsOne = true; // Whether multiple delimiters in succession should be treated as one private int wordCount = 0; /* The number of delimiter-separated "words" per line of the file. We can save a little caclulation, or handle files with varying numbers of words per line, by specifying this if known in advance */ private boolean skipBlankLines = true; /** * Closes this stream and releases any system resources associated with it. */ public abstract void close(); /** * @return the next line of text from the underlying stream(s) or null if there is no next line */ protected abstract byte[] readNextLine(); /** * @return the name(s) of the file(s) being parsed, or null if no name is available */ public abstract String getFileName(); /** * @return an iterator over a set of elements of type String[] */ public Iterator<String[]> iterator() { if (isIterating()) { throw new IllegalStateException("iterator() method can only be called once, before the" + "first call to hasNext()"); } hasNext(); return this; } @Override protected String[] advance() { byte[] nextLine; do { nextLine = readNextLine(); } while (nextLine != null && ((this.skipBlankLines && isBlank(nextLine)) || isComment(nextLine))); return nextLine == null ? null : parseLine(nextLine); } /** * This method represents the most efficient way (so far) to parse a line of whitespace-delimited text * * @param line the line to parse * @return an array of all the "words" */ private String[] parseLine(final byte[] line) { if (getWordCount() == 0) { calculateWordCount(line); } final String[] parts = new String[getWordCount()]; boolean delimiter = true; int index=0; int start = 0; try { for (int i = 0; i < line.length; i++) { if (isDelimiter(line[i])) { if (!delimiter) { parts[index++] = new String(line,start,i-start); } else if(!isTreatGroupedDelimitersAsOne()) { parts[index++] = null; } delimiter=true; } else { if (delimiter) start = i; delimiter = false; } } if (!delimiter) { parts[index] = new String(line,start,line.length-start); } } catch (ArrayIndexOutOfBoundsException e) { throw new PicardException("Unexpected number of elements found when parsing file " + this.getFileName() + ": " + index + ". Expected a maximum of " + this.getWordCount() + " elements per line:" + new String(line,0,line.length), e); } return parts; } /** * Calculates the number of delimiter-separated "words" in a line and sets the value of <code>wordCount</code> * * @param line representative line from the file */ protected void calculateWordCount(final byte[] line) { int words = 0; boolean delimiter = true; for (final byte b : line) { if (isDelimiter(b)) { if (delimiter && !isTreatGroupedDelimitersAsOne()) words++; delimiter = true; } else { if (delimiter) words++; delimiter = false; } } if (delimiter && !isTreatGroupedDelimitersAsOne()) { words += 1; } setWordCount(words); } /** * Determines whether a given line is a comment * * @param line the line to evaluate * @return true if the line is a comment (and should be ignored) otherwise false */ protected boolean isComment(final byte[] line) { return line.length > 0 && line[0] == '#'; } /** * Determines whether a given line is a comment * * @param line the line to evaluate * @return true if the line is a comment (and should be ignored) otherwise false */ protected boolean isBlank(final byte[] line) { return line.length == 0; } /** * Determines whether a given character is a delimiter * * @param b the character to evaluate * @return true if <code>b</code> is a delimiter; otherwise false */ protected boolean isDelimiter(final byte b) { return b == ' ' || b == '\t'; } protected int getWordCount() { return wordCount; } protected void setWordCount(final int wordCount) { this.wordCount = wordCount; } protected boolean isTreatGroupedDelimitersAsOne() { return treatGroupedDelimitersAsOne; } protected void setTreatGroupedDelimitersAsOne(final boolean treatGroupedDelimitersAsOne) { this.treatGroupedDelimitersAsOne = treatGroupedDelimitersAsOne; } protected boolean isSkipBlankLines() { return this.skipBlankLines; } protected void setSkipBlankLines(final boolean skipBlankLines) { this.skipBlankLines = skipBlankLines; } }