package org.genedb.crawl.elasticsearch.index;
import java.io.BufferedReader;
import java.io.File;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthStatus;
import org.genedb.crawl.elasticsearch.index.json.ReferenceIndexBuilder;
import org.genedb.crawl.elasticsearch.mappers.ElasticSearchRegionsMapper;
import org.genedb.crawl.model.Alignments;
import org.genedb.crawl.model.Feature;
import org.genedb.crawl.model.LocatedFeature;
import org.genedb.crawl.model.Reference;
import org.genedb.crawl.model.Sequence;
import junit.framework.TestCase;
public class ReferenceTest extends TestCase {

    static Logger logger = Logger.getLogger(ReferenceTest.class);

    /** Matches a GFF3 ID attribute ("ID=..." up to, not including, the next ';'). */
    private static final Pattern ID_PATTERN = Pattern.compile("ID=[^;]+");

    String propFile = "resource-elasticsearch-local.properties";
    String jsonFile = "src/test/resources/alignments-vrtrack.json";

    /**
     * End-to-end check: runs the {@link ReferenceIndexBuilder} over the
     * alignments JSON, waits for the ElasticSearch cluster to become readable,
     * then for each reference compares the set of CDS IDs parsed from its GFF
     * file against the exon features retrievable from the regions index.
     * <p>
     * NOTE(review): the original assertions were commented out; the comparison
     * results are only logged. Re-enable assertEquals once the GFF/ES counts
     * are expected to agree.
     *
     * @throws Exception on any builder, I/O or mapper failure
     */
    public void test1() throws Exception {
        String[] args = new String[] {
            "-pe", propFile,
            "-r", jsonFile
        };

        ReferenceIndexBuilder builder = new ReferenceIndexBuilder();
        builder.prerun(args);

        ElasticSearchRegionsMapper regionsMapper = builder.regionsMapper;
        // Wait until the cluster can serve reads before querying it.
        regionsMapper.waitForStatus(EnumSet.of(ClusterHealthStatus.GREEN, ClusterHealthStatus.YELLOW));

        Alignments store = builder.jsonIzer.fromStringOrFile(jsonFile, Alignments.class);

        List<String> includes = new ArrayList<String>();
        includes.add("exon");

        try {
            for (Reference r : store.references) {
                logger.info("verifying " + r.organism.common_name);

                String file = r.file;
                Set<String> ids = new HashSet<String>();
                Map<String, String> idLines = new HashMap<String, String>();
                int featureCDSLines = scanGFFForCDSIds(builder, file, ids, idLines);

                // Count the features ES returns for every region of this organism.
                int featureCount = 0;
                Set<String> locatedIDs = new HashSet<String>();
                List<Feature> regions = regionsMapper.inorganism(r.organism.ID, null, null, null);
                for (Feature region : regions) {
                    Sequence sequence = regionsMapper.sequence(region.uniqueName);
                    List<LocatedFeature> locatedFeatures = regionsMapper.locations(
                        region.uniqueName, 1, (int) sequence.length, false, includes);
                    for (LocatedFeature feature : locatedFeatures) {
                        locatedIDs.add(feature.uniqueName);
                        featureCount++;
                    }
                }

                logger.info(ids.size() + " == " + locatedIDs.size());
                logger.info(String.format("%s GFF lines %d == %d features in ES %d", file, featureCDSLines, featureCount, r.organism.ID));
                // TODO(review): re-enable once counts are expected to match:
                // assertEquals(featureCDSLines, featureCount);
            }
        } finally {
            // Always release the index, even if verification of a reference fails.
            builder.closeIndex();
        }
    }

    /**
     * Scans a GFF file and records the ID of every distinct CDS feature line.
     *
     * @param builder supplies the (possibly decompressing) reader for the file
     * @param file    path to the GFF file, as given in the reference entry
     * @param ids     out-param: receives each distinct CDS feature ID
     * @param idLines out-param: maps each recorded ID to the line it came from
     * @return the number of distinct CDS lines seen
     * @throws Exception if the file cannot be opened or read
     */
    private int scanGFFForCDSIds(ReferenceIndexBuilder builder, String file,
            Set<String> ids, Map<String, String> idLines) throws Exception {
        BufferedReader buf = builder.getReader(new File(file));
        int featureCDSLines = 0;
        try {
            boolean fasta = false;
            String line;
            while ((line = buf.readLine()) != null) {
                if (line.startsWith("##sequence-region")) {
                    fasta = false;
                    continue;
                }
                if (line.startsWith(">")) {
                    // Everything after a FASTA header is sequence, not feature lines.
                    fasta = true;
                    continue;
                }
                if (fasta || line.startsWith("#")) {
                    continue;
                }
                logger.info(line);
                Matcher m = ID_PATTERN.matcher(line);
                if (!m.find()) {
                    // Original called m.group() unchecked, which throws
                    // IllegalStateException on lines without an ID attribute.
                    logger.warn("no ID attribute on line: " + line);
                    continue;
                }
                String id = m.group().replaceFirst("ID=", "");
                // Strip surrounding quotes, if present.
                if (id.startsWith("\"") && id.endsWith("\"")) {
                    id = id.substring(1, id.length() - 1);
                }
                logger.info("Found id " + id);
                if (ids.contains(id)) {
                    logger.warn("already seen " + id + " here " + idLines.get(id));
                    logger.warn("now seen " + id + " here " + line);
                } else if (line.contains("\tCDS\t")) {
                    ids.add(id);
                    idLines.put(id, line);
                    featureCDSLines++;
                }
            }
        } finally {
            // Original leaked the reader — one open stream per reference.
            buf.close();
        }
        return featureCDSLines;
    }
}