/*
* The MIT License (MIT)
*
* Copyright (c) 2007-2015 Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.broad.igv.feature.genome;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;
import java.math.BigInteger;
import java.util.Comparator;
/**
* Comparator for chromosome names. All pure string comparisons are case insensitive.
* In general, we compare strings lexicographically, but attempt to include numbers
* if they are in the same position
* 0. Mitochondria are sorted to the end (chrM, MT)
* 1. If BOTH strings contain a number starting at the same location,
* we first sort by the leading string, then by that number. Examples:
* a. "chr1" < "chr10" because 1 < 10
* b. "chrUn_12" > "chr20" because "chrUn" > "chr20"
* c. "Alpha5" < "gamma1" because Alpha < gamma (numbers in same location are ignored because
* string comparison takes precedence)
* 2. Numeric comparisons are performed recursively if numbers found are the same.
* For example, "scaffold_v2_100" < "scaffold_v2_1000". The first numbers match (2 == 2),
* but we then compare the trailing strings, and "_100" < "_1000"
*/
public class ChromosomeNameComparator implements Comparator<String> {

    /**
     * Upper bound on the number of cached comparison results. This is only a guard against
     * unbounded growth: ~50 chromosomes would yield roughly 50 choose 2 ~= 1250 entries.
     */
    private static final int MAX_CACHE_SIZE = 10000;

    /**
     * Longest run of decimal digits that is guaranteed to fit in a {@code long}
     * ({@code Long.MAX_VALUE} has 19 digits, so every value of 18 digits or fewer is safe).
     */
    private static final int MAX_LONG_SAFE_DIGITS = 18;

    /** The single shared instance, created when the class is initialized. */
    private static final ChromosomeNameComparator INSTANCE = new ChromosomeNameComparator();

    /**
     * Memoized results of {@link #compare(String, String)}, keyed by (first argument,
     * second argument). No synchronization is performed around this table.
     */
    private final Table<String, String, Integer> cache = HashBasedTable.create();

    private ChromosomeNameComparator() {
    }

    /**
     * Returns the shared comparator instance.
     *
     * @return the singleton comparator
     */
    public static ChromosomeNameComparator get() {
        return INSTANCE;
    }

    /**
     * Tells whether a name denotes the mitochondrial chromosome ("chrM" or "MT", ignoring case).
     */
    private static boolean isMito(String chr) {
        return chr.equalsIgnoreCase("chrM") || chr.equalsIgnoreCase("MT");
    }

    /**
     * Compares two chromosome names, consulting the result cache first and falling back to
     * {@link #compareNonCache(String, String)} on a miss. New results are stored only while
     * the cache holds fewer than {@code MAX_CACHE_SIZE} entries.
     *
     * @param chr0 first chromosome name
     * @param chr1 second chromosome name
     * @return a negative, zero or positive value if {@code chr0} sorts before, equal to or
     *         after {@code chr1}
     */
    @Override
    public int compare(String chr0, String chr1) {
        // Cached values are never null, so a single lookup distinguishes hit from miss.
        Integer cached = cache.get(chr0, chr1);
        if (cached != null) {
            return cached;
        }

        int comparison = compareNonCache(chr0, chr1);
        if (cache.size() < MAX_CACHE_SIZE) {
            cache.put(chr0, chr1, comparison);
        }
        return comparison;
    }

    /**
     * Discards every cached comparison result.
     */
    public void resetCache() {
        cache.clear();
    }

    /**
     * Compares two chromosome names without looking up or storing the top-level result in
     * the cache (the comparison of the trailing remainders, if needed, goes through
     * {@link #compare(String, String)}).
     * <p>
     * If either name has no digits, or the first digit runs start at different positions,
     * mitochondrial names sort last and everything else is compared case-insensitively.
     * Otherwise the text before the digits is compared case-insensitively, then the digit
     * runs numerically, then the remainders after the digit runs.
     *
     * @param chr0 first chromosome name
     * @param chr1 second chromosome name
     * @return a negative, zero or positive value if {@code chr0} sorts before, equal to or
     *         after {@code chr1}
     */
    public int compareNonCache(String chr0, String chr1) {
        int[] range0 = findDigitRange(chr0);
        int[] range1 = findDigitRange(chr1);

        boolean digitsAligned = range0 != null && range1 != null && range0[0] == range1[0];
        if (!digitsAligned) {
            // Special rule -- put the mitochondria at the end
            boolean mito0 = isMito(chr0);
            boolean mito1 = isMito(chr1);
            if (mito0 != mito1) {
                return mito0 ? +1 : -1;
            }
            if (mito0) {
                // Both names are mitochondrial.
                return 0;
            }
            return chr0.compareToIgnoreCase(chr1);
        }

        String prefix0 = chr0.substring(0, range0[0]);
        String prefix1 = chr1.substring(0, range1[0]);
        int prefixCmp = prefix0.compareToIgnoreCase(prefix1);
        if (prefixCmp != 0) {
            return prefixCmp;
        }

        String digits0 = chr0.substring(range0[0], range0[1]);
        String digits1 = chr1.substring(range1[0], range1[1]);
        int numberCmp = compareDigitRuns(digits0, digits1);
        if (numberCmp != 0) {
            return numberCmp;
        }

        // Same prefix and same number: the order is decided by whatever follows the digits.
        return compare(chr0.substring(range0[1]), chr1.substring(range1[1]));
    }

    /**
     * Numerically compares two non-empty strings consisting solely of digits. Values that may
     * not fit in a {@code long} are compared as {@link BigInteger}s so that arbitrarily long
     * digit runs are still ordered by magnitude.
     */
    private static int compareDigitRuns(String digits0, String digits1) {
        if (digits0.length() <= MAX_LONG_SAFE_DIGITS && digits1.length() <= MAX_LONG_SAFE_DIGITS) {
            return Long.compare(Long.parseLong(digits0), Long.parseLong(digits1));
        }
        return new BigInteger(digits0).compareTo(new BigInteger(digits1));
    }

    /**
     * Locates the first run of consecutive digits in a name.
     *
     * @param chr the name to scan
     * @return a two-element array holding the start (inclusive) and end (exclusive) index of
     *         the first digit run, or {@code null} if the name contains no digit
     */
    private static int[] findDigitRange(String chr) {
        int length = chr.length();

        int start = 0;
        while (start < length && !Character.isDigit(chr.charAt(start))) {
            start++;
        }
        if (start == length) {
            return null;
        }

        int end = start + 1;
        while (end < length && Character.isDigit(chr.charAt(end))) {
            end++;
        }
        return new int[]{start, end};
    }
}