/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.catalog.utils;
import org.apache.commons.lang3.math.NumberUtils;
import org.opencb.opencga.catalog.models.File;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
/**
* Created by ralonso on 12/03/15.
*/
public class BioformatDetector {
// protected static final Map<File.Bioformat, Pattern> bioformatMap = new HashMap<>();
// static {
// bioformatMap.put(File.Bioformat.ALIGNMENT, Pattern.compile(".*\\.(bam|sam|cram)(\\.[\\w]+)*", Pattern.CASE_INSENSITIVE));
// bioformatMap.put(File.Bioformat.VARIANT, Pattern.compile(".*\\.(vcf)(\\.[\\w]+)*", Pattern.CASE_INSENSITIVE));
// bioformatMap.put(File.Bioformat.PEDIGREE, Pattern.compile(".*\\.(ped)(\\.[\\w]+)*", Pattern.CASE_INSENSITIVE));
// }
public static File.Bioformat detect(URI uri) {
return detect(uri, FormatDetector.detect(uri), CompressionDetector.detect(uri));
}
public static File.Bioformat detect(URI uri, File.Format format, File.Compression compression) {
String path = uri.getPath();
Path source = Paths.get(uri);
String mimeType;
try {
switch (format) {
case VCF:
case GVCF:
case BCF:
return File.Bioformat.VARIANT;
case TBI:
break;
case SAM:
case BAM:
case CRAM:
return File.Bioformat.ALIGNMENT;
case BAI:
return File.Bioformat.NONE; //TODO: Alignment?
case FASTQ:
return File.Bioformat.SEQUENCE;
case PED:
return File.Bioformat.PEDIGREE;
case TAB_SEPARATED_VALUES:
break;
case COMMA_SEPARATED_VALUES:
break;
case PROTOCOL_BUFFER:
break;
case PLAIN:
break;
case JSON:
case AVRO:
String file;
if (compression != File.Compression.NONE) {
file = com.google.common.io.Files.getNameWithoutExtension(uri.getPath()); //Remove compression extension
file = com.google.common.io.Files.getNameWithoutExtension(file); //Remove format extension
} else {
file = com.google.common.io.Files.getNameWithoutExtension(uri.getPath()); //Remove format extension
}
if (file.endsWith("variants")) {
return File.Bioformat.VARIANT;
} else if (file.endsWith("alignments")) {
return File.Bioformat.ALIGNMENT;
}
break;
case PARQUET:
break;
case IMAGE:
case BINARY:
case EXECUTABLE:
case UNKNOWN:
case XML:
return File.Bioformat.NONE;
default:
break;
}
// for (Map.Entry<File.Bioformat, Pattern> entry : bioformatMap.entrySet()) {
// if (entry.getValue().matcher(path).matches()) {
// return entry.getKey();
// }
// }
mimeType = Files.probeContentType(source);
if (path.endsWith(".nw")) {
return File.Bioformat.OTHER_NEWICK;
}
if (mimeType == null
|| !mimeType.equalsIgnoreCase("text/plain")
|| path.endsWith(".redirection")
|| path.endsWith(".Rout")
|| path.endsWith("cel_files.txt")
|| !path.endsWith(".txt")) {
return File.Bioformat.NONE;
}
FileInputStream fstream = new FileInputStream(path);
BufferedReader br = new BufferedReader(new InputStreamReader(fstream));
String strLine;
int numberOfLines = 20;
int i = 0;
boolean names = false;
while ((strLine = br.readLine()) != null) {
if (strLine.equalsIgnoreCase("")) {
continue;
}
if (i == numberOfLines) {
break;
}
if (strLine.startsWith("#")) {
if (strLine.startsWith("#NAMES")) {
names = true;
} else {
continue;
}
} else {
String[] fields = strLine.split("\t");
if (fields.length > 2) {
if (names && NumberUtils.isNumber(fields[1])) {
return File.Bioformat.DATAMATRIX_EXPRESSION;
}
} else if (fields.length == 1) {
if (fields[0].split(" ").length == 1 && !NumberUtils.isNumber(fields[0])) {
return File.Bioformat.IDLIST;
}
} else if (fields.length == 2) {
if (!fields[0].contains(" ") && NumberUtils.isNumber(fields[1])) {
return File.Bioformat.IDLIST_RANKED;
}
}
}
i++;
}
br.close();
} catch (IOException e) {
e.printStackTrace();
}
return File.Bioformat.NONE;
}
}