/*
* MicroSatImporter.java
*
* Copyright (c) 2002-2015 Alexei Drummond, Andrew Rambaut and Marc Suchard
*
* This file is part of BEAST.
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership and licensing.
*
* BEAST is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* BEAST is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with BEAST; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301 USA
*/
package dr.evolution.io;
import dr.evolution.alignment.Patterns;
import dr.evolution.datatype.Microsatellite;
import dr.evolution.util.Taxa;
import dr.evolution.util.Taxon;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* @author Alexei Drummond
* @author Walter Xie
*/
public class MicroSatImporter implements PatternImporter {
protected final BufferedReader reader;
protected String delimiter;
protected Taxa unionSetTaxonList = new Taxa();
protected Microsatellite microsatellite;
protected boolean hasDifferentTaxon = false;
public MicroSatImporter(BufferedReader reader) {
this(reader, "\t");
}
public MicroSatImporter(BufferedReader reader, String delimiter) {
this.reader = reader;
this.delimiter = delimiter;
}
public Taxa getUnionSetTaxonList() throws IOException, Importer.ImportException {
return unionSetTaxonList;
}
public Microsatellite getMicrosatellite() {
return microsatellite;
}
public boolean isHasDifferentTaxon() {
return hasDifferentTaxon;
}
public List<Patterns> importPatterns() throws IOException, Importer.ImportException {
List<Patterns> microsatPatList = new ArrayList<Patterns>();
List<List<String>> data = new ArrayList<List<String>>(); // 1st List<String> is taxon names
String[] microsatName = new String[2]; // microsatName[0] is keyword, microsatName[1] is name
microsatName[1] = "unnamed.microsat";
String line = reader.readLine();
while (line.startsWith("#")) { // comments
if (line.toUpperCase().contains("NAME")) {
microsatName = line.trim().split("[" + delimiter + " ]+");
if (microsatName[1] == null || microsatName[1].length() < 1)
throw new Importer.ImportException("Improper microsatellite name : " + microsatName[1]);
}
line = reader.readLine();
}
// read locus (microsat pattern) names in the 1st row after comments, where 1st element is id
String[] names = line.trim().split("[" + delimiter + " ]+"); // trim trailing whitespace ?
int colLen = names.length; // for validation
if (colLen < 2) throw new Importer.ImportException("Import file must have more than 1 columns : " + colLen);
for (int i = 0; i < colLen; i++) { // init data
List<String> l = new ArrayList<String>();
data.add(l);
}
int min = Integer.MAX_VALUE;
int max = Integer.MIN_VALUE;
line = reader.readLine();
while (line != null) { // read data
String[] dataLine = line.trim().split("[" + delimiter + " ]+");
if (dataLine.length != colLen)
throw new Importer.ImportException("The number of name columns are different with values columns," +
"\nplease use only letters or numbers in the name.");
// + "\ndataLine.length = " + dataLine.length + ", colLen = " + colLen);
for (int i = 0; i < dataLine.length; i++) {
data.get(i).add(dataLine[i]);
if (i > 0) {
int v = parseInt(dataLine[i]);
if (v != Microsatellite.UNKNOWN_STATE_LENGTH) {
if (min > v) min = v;
if (max < v) max = v;
}
}
}
line = reader.readLine();
}
if (max < min) throw new Importer.ImportException("Importing invalid data: max < min !");
// if (min - 2 < 0) throw new Importer.ImportException("Importing invaild data: min-2 < 0 where min = " + min);
// The min also = 1 and max should be the longest repeat length + 2.
microsatellite = new Microsatellite(microsatName[1], 1, max + 2, 1);
Taxa taxaHaploid = new Taxa();
for (String name : data.get(0)) {
Taxon t = new Taxon(name);
taxaHaploid.addTaxon(t);
}
// unionSetTaxonList.addTaxa(taxaHaploid);
Patterns microsatPat;
for (int i = 1; i < data.size(); i++) { // create pattern
// List<Integer> pattern = new ArrayList<Integer>();
List<Integer> pattern;
Taxa taxa = new Taxa();
if ((i + 1 < data.size()) && names[i].equalsIgnoreCase(names[i + 1])) { // diploid: Locus2 Locus2
Taxa taxaDiploid = new Taxa();
for (String name : data.get(0)) {
Taxon t = new Taxon(names[i] + "_1_" + name);
taxaDiploid.addTaxon(t);
}
for (String name : data.get(0)) {
Taxon t = new Taxon(names[i] + "_2_" + name);
taxaDiploid.addTaxon(t);
}
if (unionSetTaxonList.containsAny(taxaDiploid))
throw new Importer.ImportException("Importing invalid data: duplicate taxon name in this locus : " + names[i]);
unionSetTaxonList.addTaxa(taxaDiploid);
hasDifferentTaxon = true;
pattern = new ArrayList<Integer>();
String value;
int size = data.get(i).size();
for (int v = 0; v < size; v++) {
value = data.get(i).get(v);
// if (!isUnknownChar(value)) {
Taxon t = taxaDiploid.getTaxon(v);
if (!taxa.contains(t)) {
taxa.addTaxon(t);
pattern.add(parseInt(value));//microsatellite.getState(value);
if (!unionSetTaxonList.contains(t)) {
unionSetTaxonList.addTaxon(t);
if (i > 1) hasDifferentTaxon = true;
}
}
// }
}
for (int v = 0; v < data.get(i + 1).size(); v++) {
value = data.get(i + 1).get(v);
// if (!isUnknownChar(value)) {
Taxon t = taxaDiploid.getTaxon(v + size);
if (!taxa.contains(t)) {
taxa.addTaxon(t);
pattern.add(parseInt(value));//microsatellite.getState(value);
if (!unionSetTaxonList.contains(t)) {
unionSetTaxonList.addTaxon(t);
if (i > 1) hasDifferentTaxon = true;
}
}
// }
}
i++;
} else { // haploid Locus1
pattern = new ArrayList<Integer>();
for (int v = 0; v < data.get(i).size(); v++) {
String value = data.get(i).get(v);
// if (!isUnknownChar(value)) {
Taxon t = taxaHaploid.getTaxon(v);
if (!taxa.contains(t)) {
taxa.addTaxon(t);
pattern.add(parseInt(value));//microsatellite.getState(value);
if (!unionSetTaxonList.contains(t)) {
unionSetTaxonList.addTaxon(t);
if (i > 1) hasDifferentTaxon = true;
}
}
// }
}
}
int[] p = new int[pattern.size()];
for (int v = 0; v < pattern.size(); v++) {
p[v] = pattern.get(v);
}
microsatPat = new Patterns(microsatellite, taxa);
microsatPat.addPattern(p);
microsatPat.setId(names[i]);
microsatPatList.add(microsatPat);
}
return microsatPatList;
}
private int parseInt(String s) {
if (s.charAt(0) == Microsatellite.UNKNOWN_CHARACTER) {
return Microsatellite.UNKNOWN_STATE_LENGTH; // -1
} else {
return Integer.parseInt(s);
}
}
private boolean isUnknownChar(String s) {
return parseInt(s) == -1; // -1
}
/*
id Locus1 Locus2 Locus2 Locus3 Locus4 Locus4 Locus5 Locus6
T1 5 6 ? 20 ? ? ? 11
T2 5 6 ? 12 ? ? ? 12
T3 8 6 4 16 9 9 ? 13
T4 12 ? 6 1 9 12 ? 4
T5 17 ? 9 18 7 7 ? 5
T6 19 ? 5 14 12 12 ? 6
*/
}