/*******************************************************************************
* GenPlay, Einstein Genome Analyzer
* Copyright (C) 2009, 2014 Albert Einstein College of Medicine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* Authors: Julien Lajugie <julien.lajugie@einstein.yu.edu>
* Nicolas Fourel <nicolas.fourel@einstein.yu.edu>
* Eric Bouhassira <eric.bouhassira@einstein.yu.edu>
*
* Website: <http://genplay.einstein.yu.edu>
******************************************************************************/
package edu.yu.einstein.genplay.core.IO.utils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.TreeSet;
import edu.yu.einstein.genplay.exception.ExceptionManager;
import edu.yu.einstein.genplay.exception.exceptions.DataLineException;
import edu.yu.einstein.genplay.util.Utils;
/**
 * Static helpers shared by the data file extractors: counting and sampling data lines,
 * converting fields to numbers, recognizing header lines, splitting a line into fields
 * and retrieving the track name declared in a file header.
 * @author Julien Lajugie
 * @author Nicolas Fourel
 */
public class Extractors {

    /** Key of the attribute holding the name of the data in a track line */
    private static final String NAME_KEY = "name";


    /**
     * Counts the number of lines containing data in the specified file.
     * A line is a data line when {@link #isHeaderLine(String)} returns false for it.
     * @param file a file
     * @return the number of lines containing data in the specified file.
     * Null if the file could not be read (the error is reported to the {@link ExceptionManager})
     */
    public static final Integer countDataLines(File file) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            int count = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                if (!isHeaderLine(line)) {
                    count++;
                }
            }
            return count;
        } catch (Exception e) {
            ExceptionManager.getInstance().caughtException(e);
            return null;
        } finally {
            // always close the reader before exiting
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    ExceptionManager.getInstance().caughtException(e);
                }
            }
        }
    }


    /**
     * Creates a set of random integers that represents the numbers of the lines to extract.
     * Line numbers are 1-based and are drawn among the data lines of the file.
     * @param randomCount count of lines to extract
     * @param dataFile a file containing data
     * @return a {@link TreeSet} containing the numbers of the lines to extract.
     * The set is empty if the file doesn't contain more data lines than the number of lines
     * to extract, which means that the entire file needs to be extracted
     * @throws IOException if the data lines of the file cannot be counted
     */
    public static final TreeSet<Integer> generateRandomLineNumbers(int randomCount, File dataFile) throws IOException {
        TreeSet<Integer> randomLineNumbers = new TreeSet<Integer>();
        // we compute how many lines there is in the file
        // countDataLines returns null when the file can't be read, unboxing it directly would throw a NullPointerException
        Integer lineCount = Extractors.countDataLines(dataFile);
        if (lineCount == null) {
            throw new IOException("Unable to count the data lines of the file " + dataFile);
        }
        // if there is less line in the file than the specified number of line to extract
        // we extract the entire file (empty set)
        if (lineCount > randomCount) {
            Random randomGenerator = new Random();
            while (randomLineNumbers.size() < randomCount) {
                // the add function in a set works only if the element to add is not already present
                randomLineNumbers.add(randomGenerator.nextInt(lineCount) + 1);
            }
        }
        return randomLineNumbers;
    }


    /**
     * Converts a string into a float
     * @param s a string
     * @return a float if the string is valid
     * @throws DataLineException if the string is null or is not a valid number
     */
    public static final Float getFloat(String s) throws DataLineException {
        try {
            return Float.parseFloat(s);
        } catch (RuntimeException e) {
            // NumberFormatException for an invalid string, NullPointerException for a null string
            throw new DataLineException("The information '" + s + "' does not seem to be a valid number.", DataLineException.SKIP_PROCESS);
        }
    }


    /**
     * Converts a string into a float
     * @param s the string
     * @param alternative the value to return if the string could not be converted (can be null)
     * @return the float if the string is valid, the alternative otherwise
     */
    public static final Float getFloat(String s, Float alternative) {
        try {
            return Float.parseFloat(s);
        } catch (RuntimeException e) {
            // NumberFormatException for an invalid string, NullPointerException for a null string
            return alternative;
        }
    }


    /**
     * Converts a string to an integer
     * @param s the string
     * @return the integer if the string is valid
     * @throws DataLineException if the string is null or is not a valid integer
     */
    public static final Integer getInt (String s) throws DataLineException {
        try {
            return Integer.parseInt(s);
        } catch (RuntimeException e) {
            throw new DataLineException("The information '" + s + "' does not seem to be a valid number.", DataLineException.SKIP_PROCESS);
        }
    }


    /**
     * Converts a string to an integer
     * @param s the string
     * @param alternative the value to return if the string could not be converted (can be null)
     * @return the integer if the string is valid, the alternative otherwise
     */
    public static final Integer getInt (String s, Integer alternative) {
        try {
            return Integer.parseInt(s);
        } catch (RuntimeException e) {
            return alternative;
        }
    }


    /**
     * @param line a line from the data file (must not be null)
     * @return true if the line is a header line or if the line is empty (could be a blank line in the header).
     * Header lines are the lines starting with '#', '@', 'track' or 'browser' (case insensitive).
     * A line starting with 'chr' is always considered as a data line
     */
    public static final boolean isHeaderLine(String line) {
        // if the line starts with chr it's a data line so we skip the other tests
        if (startsWithIgnoreCase(line, "chr")) {
            return false;
        }
        // empty line
        if (line.length() == 0) {
            return true;
        }
        // comment line ('#') or sam comment line ('@')
        char firstChar = line.charAt(0);
        if ((firstChar == '#') || (firstChar == '@')) {
            return true;
        }
        // track line or browser line
        return startsWithIgnoreCase(line, "track") || startsWithIgnoreCase(line, "browser");
    }


    /**
     * @param line line from the data file (must not be null)
     * @return true if the line is a track info line (line starting with 'track', case insensitive). False otherwise
     */
    public static final boolean isTrackInfoLine(String line) {
        // same test as the one of isHeaderLine so a line reduced to "track" is handled the same way by both methods
        return startsWithIgnoreCase(line, "track");
    }


    /**
     * This methods parse a line and returns an array of strings containing
     * all the fields from the input line that are separated either by one or many
     * continuous spaces or tabs except if this tabs or spaces are from inside double quotes.
     * The double quotes are kept in the returned fields.
     * @param line input line to parse
     * @return an array of strings containing the fields of the input line.
     * Null if the line doesn't contain any field (empty line or line made of spaces and tabs only)
     */
    public static final String[] parseLineTabAndSpace(String line) {
        return parseLine(line, true);
    }


    /**
     * This methods parse a line and returns an array of strings containing
     * all the fields from the input line that are separated by one or many
     * continuous tabs except if this tabs are from inside double quotes.
     * The double quotes are kept in the returned fields.
     * @param line input line to parse
     * @return an array of strings containing the fields of the input line.
     * Null if the line doesn't contain any field (empty line or line made of tabs only)
     */
    public static final String[] parseLineTabOnly(String line) {
        return parseLine(line, false);
    }


    /**
     * @param dataFile a data file
     * @return the name of the data if it is specified in a track line of the header of the file
     * (attribute name=value or name="value"). Null otherwise or if the file could not be read
     */
    public static final String retrieveDataName(File dataFile) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(dataFile));
            String line;
            // the name can only be in the header so we stop at the first data line
            while (((line = reader.readLine()) != null) && isHeaderLine(line)) {
                if (isTrackInfoLine(line)) {
                    String dataName = extractTrackName(line);
                    if (dataName != null) {
                        return dataName;
                    }
                }
            }
        } catch (Exception e) {
            ExceptionManager.getInstance().caughtException(e);
        } finally {
            // always close the reader before exiting
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    ExceptionManager.getInstance().caughtException(e);
                }
            }
        }
        return null;
    }


    /**
     * Searches the value of the name attribute in a track line.
     * The key must be outside double quotes, preceded by a space or a tab and followed by an equals sign
     * so that keys such as filename or words from a quoted description are not mistaken for it.
     * @param trackLine a track line
     * @return the value of the name attribute. Null if the line has no name attribute or if the attribute has no value
     */
    private static String extractTrackName(String trackLine) {
        int length = trackLine.length();
        boolean isInsideQuotes = false;
        for (int i = 0; i < length; i++) {
            if (trackLine.charAt(i) == '"') {
                isInsideQuotes = !isInsideQuotes;
            } else if (!isInsideQuotes
                    && (i > 0)
                    && isSpaceOrTab(trackLine.charAt(i - 1))
                    && trackLine.regionMatches(true, i, NAME_KEY, 0, NAME_KEY.length())) {
                int equalsIndex = skipSpacesAndTabs(trackLine, i + NAME_KEY.length());
                if ((equalsIndex < length) && (trackLine.charAt(equalsIndex) == '=')) {
                    return readAttributeValue(trackLine, skipSpacesAndTabs(trackLine, equalsIndex + 1));
                }
            }
        }
        return null;
    }


    /**
     * @param c a character
     * @param isSpaceDelimiter true if the spaces are delimiters in addition to the tabs
     * @return true if the specified character separates two fields
     */
    private static boolean isDelimiter(char c, boolean isSpaceDelimiter) {
        return (c == '\t') || (isSpaceDelimiter && (c == ' '));
    }


    /**
     * @param c a character
     * @return true if the character is a space or a tab
     */
    private static boolean isSpaceOrTab(char c) {
        return (c == ' ') || (c == '\t');
    }


    /**
     * Splits a line into the fields separated by one or many continuous delimiters located outside double quotes.
     * @param line input line to parse
     * @param isSpaceDelimiter true if the spaces are delimiters in addition to the tabs
     * @return an array of strings containing the fields of the input line. Null if there is no field
     */
    private static String[] parseLine(String line, boolean isSpaceDelimiter) {
        List<String> fields = new ArrayList<String>();
        int length = line.length();
        int i = 0;
        while (i < length) {
            // skip all the delimiters
            while ((i < length) && isDelimiter(line.charAt(i), isSpaceDelimiter)) {
                i++;
            }
            if (i < length) {
                // the delimiters weren't at the end of the line, a new field starts here
                int fieldStart = i;
                boolean isInsideQuotes = false;
                // loop until we meet a new delimiter that is not between double quotes
                while ((i < length) && (isInsideQuotes || !isDelimiter(line.charAt(i), isSpaceDelimiter))) {
                    if (line.charAt(i) == '"') { // check if we enter or leave double quotes
                        isInsideQuotes = !isInsideQuotes;
                    }
                    i++;
                }
                fields.add(line.substring(fieldStart, i));
            }
        }
        if (fields.isEmpty()) {
            // the callers of the public parse methods receive null when there is no field
            return null;
        }
        return fields.toArray(new String[fields.size()]);
    }


    /**
     * Reads the value of an attribute of a track line.
     * @param line a track line
     * @param valueStart index of the first character of the value
     * @return the text between double quotes if the value starts with a double quote (up to the end of the line
     * if the closing double quote is missing), the text up to the next space or tab otherwise.
     * Null if there is nothing at the specified index
     */
    private static String readAttributeValue(String line, int valueStart) {
        int length = line.length();
        if (valueStart >= length) {
            // attribute without value at the end of the line
            return null;
        }
        if (line.charAt(valueStart) == '"') {
            int closingQuoteIndex = line.indexOf('"', valueStart + 1);
            if (closingQuoteIndex == -1) {
                return line.substring(valueStart + 1);
            }
            return line.substring(valueStart + 1, closingQuoteIndex);
        }
        int valueEnd = valueStart;
        while ((valueEnd < length) && !isSpaceOrTab(line.charAt(valueEnd))) {
            valueEnd++;
        }
        return line.substring(valueStart, valueEnd);
    }


    /**
     * @param line a line
     * @param index index where to start
     * @return the index of the first character that is not a space or a tab from the specified index.
     * The length of the line if there is no such character
     */
    private static int skipSpacesAndTabs(String line, int index) {
        int i = index;
        while ((i < line.length()) && isSpaceOrTab(line.charAt(i))) {
            i++;
        }
        return i;
    }


    /**
     * @param s a string
     * @param prefix a prefix
     * @return true if the string starts with the specified prefix, ignoring the case
     */
    private static boolean startsWithIgnoreCase(String s, String prefix) {
        // regionMatches returns false when the string is shorter than the prefix
        return s.regionMatches(true, 0, prefix, 0, prefix.length());
    }
}