/*******************************************************************************
* GenPlay, Einstein Genome Analyzer
* Copyright (C) 2009, 2014 Albert Einstein College of Medicine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* Authors: Julien Lajugie <julien.lajugie@einstein.yu.edu>
* Nicolas Fourel <nicolas.fourel@einstein.yu.edu>
* Eric Bouhassira <eric.bouhassira@einstein.yu.edu>
*
* Website: <http://genplay.einstein.yu.edu>
******************************************************************************/
package edu.yu.einstein.genplay.core.IO.fileSorter;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.security.InvalidParameterException;
import java.util.Comparator;
import java.util.List;
import edu.yu.einstein.genplay.core.IO.utils.Extractors;
import edu.yu.einstein.genplay.core.comparator.ChromosomeComparator;
import edu.yu.einstein.genplay.core.comparator.StringComparator;
import edu.yu.einstein.genplay.dataStructure.chromosomeWindow.ChromosomeWindow;
import edu.yu.einstein.genplay.dataStructure.chromosomeWindow.SimpleChromosomeWindow;
import edu.yu.einstein.genplay.util.Utils;
/**
 * Adapts the ExternalSort class for GenPlay: picks a line comparator from the
 * extension of the file to sort and writes the sorted result next to the input file.
 * @author Julien Lajugie
 */
public class ExternalSortAdapter {

    /**
     * Comparator for the lines of a genomic file. Compares the chromosome, start and stop
     * values found at the specified field indexes of the tab-delimited lines.
     * Lines reported as header lines by {@link Extractors#isHeaderLine(String)} are ordered
     * before every other line.
     * @author Julien Lajugie
     */
    private static class GenomicFileLineComparator implements Comparator<String> {
        private static final ChromosomeComparator chromosomeComparator = new ChromosomeComparator(); // comparator for chromosomes
        private final int chromoFieldIndex; // index of the chromosome field
        private final int startFieldIndex;  // index of the start field
        private final int stopFieldIndex;   // index of the stop field


        /**
         * Creates an instance of {@link GenomicFileLineComparator}
         * @param chromoFieldIndex index of the chromosome field in a tab-split line
         * @param startFieldIndex index of the start position field in a tab-split line
         * @param stopFieldIndex index of the stop position field in a tab-split line
         */
        public GenomicFileLineComparator(int chromoFieldIndex, int startFieldIndex, int stopFieldIndex) {
            this.chromoFieldIndex = chromoFieldIndex;
            this.startFieldIndex = startFieldIndex;
            this.stopFieldIndex = stopFieldIndex;
        }


        /**
         * Compares two lines: header lines first, then by chromosome, then by position.
         * @param o1 first line
         * @param o2 second line
         * @return a negative integer, zero, or a positive integer as the first line sorts
         * before, with, or after the second line
         */
        @Override
        public int compare(String o1, String o2) {
            boolean isHeader1 = Extractors.isHeaderLine(o1);
            boolean isHeader2 = Extractors.isHeaderLine(o2);
            if (isHeader1 || isHeader2) {
                if (isHeader1 && isHeader2) {
                    // two header lines are equivalent; returning -1 for both argument orders
                    // would violate the Comparator contract
                    return 0;
                }
                return isHeader1 ? -1 : 1;
            }

            String[] splitLine1 = Utils.splitWithTab(o1);
            String[] splitLine2 = Utils.splitWithTab(o2);

            int cmp;
            try {
                cmp = chromosomeComparator.compareChromosomeName(splitLine1[chromoFieldIndex], splitLine2[chromoFieldIndex]);
            } catch (Exception e) {
                // if the chromosome comparator doesn't work we use a string comparator
                cmp = new StringComparator().compare(splitLine1[chromoFieldIndex], splitLine2[chromoFieldIndex]);
            }

            // if the chromosomes are equal we compare the positions
            if (cmp == 0) {
                try {
                    ChromosomeWindow cw1 = new SimpleChromosomeWindow(splitLine1[startFieldIndex], splitLine1[stopFieldIndex]);
                    ChromosomeWindow cw2 = new SimpleChromosomeWindow(splitLine2[startFieldIndex], splitLine2[stopFieldIndex]);
                    cmp = cw1.compareTo(cw2);
                } catch (Exception e) {
                    // the windows could not be built or compared: fall back on a string
                    // comparison of the start fields (the result must be kept in cmp)
                    cmp = new StringComparator().compare(splitLine1[startFieldIndex], splitLine2[startFieldIndex]);
                }
            }
            return cmp;
        }
    }


    /**
     * Sorts a genomic file per chromosome, start and then stop position.
     * The sorted lines are written to the file returned by {@link #generateOutputFile(File)};
     * the input file itself is not passed as the merge destination.
     * @param inputFile file to sort; its extension selects the columns used for the comparison
     * @throws IOException declared because the ExternalSort calls may throw it
     * @throws InvalidParameterException if the extension of the file is missing or not supported
     */
    public static void externalSortGenomicFile(File inputFile) throws IOException {
        // fails fast on unsupported files, before anything is written
        Comparator<String> comparator = generateComparator(inputFile);
        int maxTmpFiles = ExternalSort.DEFAULTMAXTEMPFILES;
        Charset charset = Charset.defaultCharset();
        File tmpDir = new File(Utils.getTmpDirectoryPath());
        File outputFile = generateOutputFile(inputFile);
        // NOTE(review): the trailing arguments are presumably (distinct, numHeader, usegzip) for
        // sortInBatch and (distinct, append, usegzip) for mergeSortedFiles -- confirm against ExternalSort
        List<File> sortedChunks = ExternalSort.sortInBatch(inputFile, comparator, maxTmpFiles, charset, tmpDir, false, 0, false);
        ExternalSort.mergeSortedFiles(sortedChunks, outputFile, comparator, charset, false, false, false);
    }


    /**
     * Selects the line comparator matching the extension of the specified file.
     * The extension is matched case-insensitively against gff, gr, bed, bgr, pair, psl and sam.
     * @param inputFile file whose lines will be compared
     * @return a string comparator that compares the lines of the input file
     * @throws InvalidParameterException if the file has no extension or an unsupported extension
     */
    private static Comparator<String> generateComparator(File inputFile) throws InvalidParameterException {
        String fileExtension = Utils.getExtension(inputFile);
        if (fileExtension == null) {
            throw new InvalidParameterException("Cannot sort the specified file: " + inputFile.getName() + "\n" +
                    "Files without extension are not recognized by the sorting function");
        }
        // arguments are (chromosome field index, start field index, stop field index)
        if (fileExtension.equalsIgnoreCase("gff")) {
            return new GenomicFileLineComparator(0, 3, 4);
        }
        if (fileExtension.equalsIgnoreCase("gr")) {
            return new GenomicFileLineComparator(0, 1, 2);
        }
        if (fileExtension.equalsIgnoreCase("bed")) {
            return new GenomicFileLineComparator(0, 1, 2);
        }
        if (fileExtension.equalsIgnoreCase("bgr")) {
            return new GenomicFileLineComparator(0, 1, 2);
        }
        if (fileExtension.equalsIgnoreCase("pair")) {
            // NOTE(review): chromosome index 0 looks suspicious for pair files -- confirm the column layout
            return new GenomicFileLineComparator(0, 4, 4);
        }
        if (fileExtension.equalsIgnoreCase("psl")) {
            return new GenomicFileLineComparator(13, 15, 16);
        }
        if (fileExtension.equalsIgnoreCase("sam")) {
            // NOTE(review): in the SAM specification field 0 is the read name and the reference
            // name is field 2 -- confirm that index 0 is really the intended chromosome column
            return new GenomicFileLineComparator(0, 3, 3);
        }
        throw new InvalidParameterException("Cannot sort the specified file: " + inputFile.getName() + "\n" +
                "Files with " + fileExtension + " extension are not recognized by the sorting function");
    }


    /**
     * Builds the file that receives the sorted output of the specified input file.
     * The returned file is located in the same parent directory as the input file.
     * @param inputFile input file
     * @return a new File whose name is the input name with ".sorted" inserted before the extension,
     * or with ".sorted" appended when no extension is reported for the input file
     */
    public static File generateOutputFile(File inputFile) {
        String extension = Utils.getExtension(inputFile);
        String nameWithoutExtension = Utils.getFileNameWithoutExtension(inputFile);
        String newFileName = nameWithoutExtension + ".sorted";
        if (extension != null) {
            // a null extension would otherwise be concatenated as the literal text "null"
            newFileName += "." + extension;
        }
        File parentDirectory = inputFile.getParentFile();
        return new File(parentDirectory, newFileName);
    }
}