package org.genedb.crawl.elasticsearch.mappers;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;
import org.elasticsearch.action.count.CountResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.action.search.SearchRequestBuilder;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.FieldQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.RangeQueryBuilder;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.elasticsearch.search.sort.SortBuilders;
import org.genedb.crawl.elasticsearch.index.gff.GFFSequenceExtractor;
import org.genedb.crawl.mappers.RegionsMapper;
import org.genedb.crawl.model.Coordinates;
import org.genedb.crawl.model.Cvterm;
import org.genedb.crawl.model.Feature;
import org.genedb.crawl.model.Property;
import org.genedb.crawl.model.LocatedFeature;
import org.genedb.crawl.model.LocationBoundaries;
import org.genedb.crawl.model.Sequence;
import org.springframework.stereotype.Component;

import com.hazelcast.core.Hazelcast;

/**
 * Elasticsearch-backed implementation of {@link RegionsMapper}.
 *
 * Feature documents are read from the feature type and region documents from
 * the region type of the index exposed by the inherited {@code connection}.
 * Region sequences are read through a {@link GFFSequenceExtractor} and cached
 * in the Hazelcast map named {@code "regions"}.
 */
@Component
public class ElasticSearchRegionsMapper extends ElasticSearchBaseMapper implements RegionsMapper {

    private static final Logger logger = Logger.getLogger(ElasticSearchRegionsMapper.class);

    /** Name of the Hazelcast map used to cache region sequences. */
    private static final String REGION_CACHE = "regions";

    /** Field names copied onto the features returned by {@link #parseLocations}. */
    private static final String[] LOCATION_FIELD_NAMES = new String[] {
        "uniqueName", "fmin", "fmax", "isObsolete", "parent", "phase", "type", "strand", "region", "properties"
    };

    private final GFFSequenceExtractor extractor = new GFFSequenceExtractor();

    /**
     * Narrows a document count to an int, saturating at Integer.MAX_VALUE
     * instead of silently wrapping around on overflow.
     */
    private static int toIntCount(long count) {
        return (int) Math.min(count, (long) Integer.MAX_VALUE);
    }

    /**
     * Counts the feature documents whose "region" field matches the supplied region.
     * The result is used as the page size of region-wide searches so that no hit
     * is cut off by the default search size.
     *
     * @param region the region name to count features on
     * @return the number of matching feature documents
     */
    private int getTotalInRegion(String region) {

        FieldQueryBuilder regionQuery = QueryBuilders.fieldQuery("region", region);

        logger.debug(String.format("Index %s, Type %s", connection.getIndex(), connection.getFeatureType()));

        CountResponse cr = connection.getClient()
            .prepareCount()
            .setIndices(connection.getIndex())
            .setTypes(connection.getFeatureType())
            .setQuery(regionQuery)
            .execute()
            .actionGet();

        long count = cr.count();

        logger.debug(String.format("Count in %s : %s", region, count));

        return toIntCount(count);
    }

    /**
     * Counts the region documents whose "organism_id" field matches the supplied id.
     *
     * @param organism_id the organism to count regions for
     * @return the number of matching region documents
     */
    private int getTotalRegionsInOrganism(int organism_id) {

        FieldQueryBuilder regionQuery = QueryBuilders.fieldQuery("organism_id", organism_id);

        CountResponse cr = connection.getClient()
            .prepareCount(connection.getIndex())
            .setTypes(connection.getRegionType())
            .setQuery(regionQuery)
            .execute()
            .actionGet();

        long count = cr.count();

        logger.debug(String.format("Count in organism %s : %s", organism_id, count));

        return toIntCount(count);
    }

    /**
     * Builds the query matching features on a region that overlap [start, end],
     * optionally restricted by feature type.
     *
     * @param region  the region the features must be on
     * @param start   start of the requested range
     * @param end     end of the requested range
     * @param exclude if true the supplied types are excluded, otherwise a feature
     *                must match one of them
     * @param types   feature type names to filter on; no type filter is added when null
     * @return the combined overlap / region / type query
     */
    private BoolQueryBuilder isOverlap(String region, int start, int end, boolean exclude, List<String> types) {

        RangeQueryBuilder startLowerThanRequested = QueryBuilders.rangeQuery("fmin").lt(start);
        RangeQueryBuilder endHigherThanRequested = QueryBuilders.rangeQuery("fmax").gt(end);

        // (fmin < start) && (end < fmax) : the feature strictly spans the whole range
        BoolQueryBuilder spansBothSides = QueryBuilders.boolQuery()
            .must(startLowerThanRequested)
            .must(endHigherThanRequested);

        RangeQueryBuilder startInRange = QueryBuilders.rangeQuery("fmin").from(start).to(end);
        RangeQueryBuilder endInRange = QueryBuilders.rangeQuery("fmax").from(start).to(end);

        // (start <= fmin <= end) || (start <= fmax <= end)
        BoolQueryBuilder isInsideRange = QueryBuilders.boolQuery()
            .should(startInRange)
            .should(endInRange);

        BoolQueryBuilder isOverlap = QueryBuilders.boolQuery()
            .should(spansBothSides)
            .should(isInsideRange);

        FieldQueryBuilder regionQuery = QueryBuilders.fieldQuery("region", region);

        BoolQueryBuilder isOverlapOnRegion = QueryBuilders.boolQuery()
            .must(isOverlap)
            .must(regionQuery);

        if (types != null) {

            BoolQueryBuilder typesQuery = QueryBuilders.boolQuery();

            for (String type : types) {
                FieldQueryBuilder typeQuery = QueryBuilders.fieldQuery("type.name", type);
                if (exclude) {
                    typesQuery.mustNot(typeQuery);
                } else {
                    typesQuery.should(typeQuery);
                }
            }

            isOverlapOnRegion.must(typesQuery);
        }

        return isOverlapOnRegion;
    }

    /**
     * Widens the requested [start, end] range so that it fully contains every
     * feature on the region that overlaps it.
     *
     * @return boundaries initialised to start/end and extended to the lowest fmin
     *         and highest fmax found among the overlapping features
     */
    @Override
    public LocationBoundaries locationsMinAndMaxBoundaries(String region, int start, int end, boolean exclude,
            List<String> types) {

        BoolQueryBuilder isOverlap = isOverlap(region, start, end, exclude, types);

        SearchRequestBuilder builder = connection
            .getClient()
            .prepareSearch(connection.getIndex())
            .setTypes(connection.getFeatureType());

        SearchResponse response = builder
            .setQuery(isOverlap)
            .setExplain(true)
            .setSize(getTotalInRegion(region))
            .execute()
            .actionGet();

        LocationBoundaries lb = new LocationBoundaries();
        lb.start = start;
        lb.end = end;

        for (SearchHit hit : response.getHits()) {

            LocatedFeature feature = this.getFeatureFromJson(hit.sourceAsString());

            if (feature == null) {
                continue;
            }

            if (feature.fmax != null && feature.fmin != null && feature.region != null) {

                if (feature.region.equals(region)) {
                    if (feature.fmin < lb.start) {
                        lb.start = feature.fmin;
                    }
                    if (feature.fmax > lb.end) {
                        lb.end = feature.fmax;
                    }
                }

            } else if (feature.coordinates != null) {

                // only the first coordinate set on the requested region is considered
                for (Coordinates co : feature.coordinates) {
                    // region.equals(co.region) rather than the reverse: tolerates a missing co.region
                    if (region != null && region.equals(co.region)) {
                        if (co.fmin < lb.start) {
                            lb.start = co.fmin;
                        }
                        if (co.fmax > lb.end) {
                            lb.end = co.fmax;
                        }
                        break;
                    }
                }
            }
        }

        logger.debug(String.format("Actual start: %s. Actual end %s", lb.start, lb.end));

        return lb;
    }

    /**
     * Returns the features on a region overlapping [start, end], sorted by fmin
     * then fmax, optionally filtered by type (see {@link #isOverlap}).
     */
    @Override
    public List<LocatedFeature> locations(String region, int start, int end, boolean exclude, List<String> types) {

        BoolQueryBuilder isOverlap = isOverlap(region, start, end, exclude, types);

        SearchRequestBuilder builder = connection.getClient()
            .prepareSearch(connection.getIndex())
            .setTypes(connection.getFeatureType())
            .addSort(SortBuilders.fieldSort("fmin"))
            .addSort(SortBuilders.fieldSort("fmax"));

        SearchResponse response = builder
            .setQuery(isOverlap)
            .setExplain(true)
            .setSize(getTotalInRegion(region))
            .execute()
            .actionGet();

        jsonIzer.setPretty(true);
        logger.info(toString(builder.internalBuilder()));

        return parseLocations(region, response);
    }

    /**
     * Converts search hits into trimmed-down feature copies for the hits that are
     * located on the requested region. Only "colour" properties are retained.
     *
     * @param region   the region the returned features must be located on
     * @param response the search response to read hits from
     * @return copies of the matching features, restricted to {@link #LOCATION_FIELD_NAMES}
     */
    private List<LocatedFeature> parseLocations(String region, SearchResponse response) {

        List<LocatedFeature> features = new ArrayList<LocatedFeature>();

        for (SearchHit hit : response.getHits()) {

            LocatedFeature feature = this.getFeatureFromJson(hit.sourceAsString());

            // must be checked before any field is touched
            if (feature == null) {
                continue;
            }

            // we only want to return colour
            feature.properties = colourProperties(feature.properties);

            if (!isLocatedOn(feature, region)) {
                continue;
            }

            try {
                features.add(copy(feature, LOCATION_FIELD_NAMES, LocatedFeature.class));
            } catch (InstantiationException e) {
                logger.error("Could not copy feature " + feature.uniqueName, e);
            } catch (IllegalAccessException e) {
                logger.error("Could not copy feature " + feature.uniqueName, e);
            }
        }

        return features;
    }

    /** Returns a new list holding only the properties named "colour"; empty when none or null. */
    private List<Property> colourProperties(List<Property> properties) {
        List<Property> colours = new ArrayList<Property>();
        if (properties != null) {
            for (Property prop : properties) {
                if ("colour".equals(prop.name)) {
                    colours.add(prop);
                }
            }
        }
        return colours;
    }

    /**
     * Tells whether a feature is located on the region, using its own
     * fmin/fmax/region when all three are set and its coordinates list otherwise.
     */
    private boolean isLocatedOn(LocatedFeature feature, String region) {
        if (feature.fmax != null && feature.fmin != null && feature.region != null) {
            return feature.region.equals(region);
        }
        if (feature.coordinates != null) {
            for (Coordinates co : feature.coordinates) {
                if (region != null && region.equals(co.region)) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Looks up the first region document matching the supplied criteria.
     *
     * @param uniqueName  the unique name to match (always part of the criteria)
     * @param name        optional name to match, ignored when null
     * @param organism_id optional organism to match, ignored when null
     * @return whatever the inherited first-match lookup yields for these criteria
     */
    public Feature getInfo(String uniqueName, String name, Integer organism_id) {

        Map<String, String> map = new HashMap<String, String>();
        map.put("uniqueName", uniqueName);

        if (name != null) {
            map.put("name", name);
        }

        if (organism_id != null) {
            map.put("organism_id", String.valueOf(organism_id));
        }

        return this.getFirstMatch(connection.getIndex(), connection.getRegionType(), map, LocatedFeature.class);
    }

    /**
     * We clone the sequence object here because we don't want to alter the one
     * kept in the Hazelcast cache (sequenceTrimmed, for instance, which calls
     * sequence(), trims the dna string). The bean itself is not designed to be
     * immutable.
     *
     * @param sequenceCached the cached instance to copy
     * @return a new Sequence carrying the cached organism id, dna, length and region
     */
    private Sequence clone(Sequence sequenceCached) {
        Sequence clone = new Sequence();
        clone.organism_id = sequenceCached.organism_id;
        clone.dna = sequenceCached.dna;
        // NOTE(review): uniqueName is populated from name, not uniqueName - confirm this is intended
        clone.uniqueName = sequenceCached.name;
        clone.length = sequenceCached.length;
        clone.region = sequenceCached.region;
        return clone;
    }

    /**
     * Returns the sequence of a region, from the cache when present. Otherwise the
     * region document is looked up, its "file" property is used as the path to read
     * the dna from, and the result is cached. When the region or its file cannot be
     * resolved the sequence has an empty dna string (and organism id -1 when the
     * region document is missing).
     *
     * @param region the region name
     * @return a copy of the cached sequence, safe for the caller to modify
     */
    @Override
    public Sequence sequence(String region) {

        Sequence sequenceCached = (Sequence) Hazelcast.getMap(REGION_CACHE).get(region);

        if (sequenceCached != null) {
            return clone(sequenceCached);
        }

        Sequence sequence = new Sequence();
        sequence.dna = "";
        sequence.organism_id = -1;

        Feature regionFeature = this.getInfo(region, null, null);

        if (regionFeature != null) {

            sequence.organism_id = regionFeature.organism_id;

            String regionFilePath = null;

            if (regionFeature.properties != null) {
                for (Property prop : regionFeature.properties) {
                    if ("file".equals(prop.name)) {
                        regionFilePath = prop.value;
                    }
                }
            }

            if (regionFilePath != null) {
                try {
                    String dna = extractor.read(regionFilePath, region);
                    // keep the empty default rather than a null dna, length is derived from it below
                    if (dna != null) {
                        sequence.dna = dna;
                    }
                } catch (IOException e) {
                    logger.warn("Could not read sequence file for " + region, e);
                }
            }
        }

        sequence.length = sequence.dna.length();

        Hazelcast.getMap(REGION_CACHE).put(region, sequence);

        return clone(sequence);
    }

    /**
     * Returns the region's sequence record with the dna string blanked out, so
     * that only the metadata (such as the length) is carried.
     */
    @Override
    public Sequence sequenceLength(String region) {
        Sequence sequence = sequence(region);
        sequence.dna = "";
        return sequence;
    }

    /**
     * Returns the region's sequence trimmed to the requested range.
     *
     * When both bounds are null the whole sequence is returned with start 1 and
     * end equal to its length. Otherwise a missing or non-positive start is
     * treated as 1 and a missing end as the sequence length; the dna is cut with
     * {@code substring(start, end)} after clamping both indices into the sequence.
     *
     * @param region the region name
     * @param start  requested start, may be null
     * @param end    requested end, may be null
     * @return a sequence whose dna is the requested slice
     */
    @Override
    public Sequence sequenceTrimmed(String region, Integer start, Integer end) {

        logger.info(String.format("%s:%s-%s", region, start, end));

        Sequence sequence = sequence(region);

        // if it's a simple case of no start or end position, just return what we've got
        if (start == null && end == null) {
            sequence.start = 1;
            sequence.end = sequence.length;
            sequence.region = region;
            return sequence;
        }

        int max = sequence.dna.length();

        // only one bound may be missing here; default it instead of failing on unboxing
        int requestedStart = (start == null || start <= 0) ? 1 : start;
        int requestedEnd = (end == null) ? max : end;

        // region request in interbase coordinates to reflect chado system
        int actualStart = requestedStart;
        // clamp into [0, max] so that a negative end yields an empty slice rather than an exception
        int actualEnd = Math.max(0, Math.min(requestedEnd, max));

        if (actualStart > actualEnd) {
            actualStart = actualEnd;
        }

        logger.info(String.format("max: %s, actualStart: %s, actualEnd %s", max, actualStart, actualEnd));

        sequence.dna = sequence.dna.substring(actualStart, actualEnd);

        sequence.start = requestedStart;
        sequence.end = requestedEnd;

        return sequence;
    }

    /**
     * Lists the regions of an organism, optionally restricted to one type and paged.
     *
     * @param organismid the organism whose regions are listed
     * @param limit      maximum number of regions; all regions of the organism when null
     * @param offset     index of the first hit to return, ignored when null
     * @param type_name  type name the regions must have, ignored when null
     * @return features carrying only uniqueName (the hit id), type name and organism id
     */
    @Override
    public List<Feature> inorganism(int organismid, Integer limit, Integer offset, String type_name) {

        logger.debug(String.format("%s %s %s", "sequences", "organism_id", String.valueOf(organismid)));

        BoolQueryBuilder regionInOrganismQuery = QueryBuilders.boolQuery();

        FieldQueryBuilder organismQuery = QueryBuilders.fieldQuery("organism_id", organismid);
        regionInOrganismQuery.must(organismQuery);

        if (type_name != null) {
            FieldQueryBuilder typeQuery = QueryBuilders.fieldQuery("type.name", type_name);
            regionInOrganismQuery.must(typeQuery);
        }

        SearchRequestBuilder srb = connection.getClient()
            .prepareSearch(connection.getIndex())
            .setTypes(connection.getRegionType())
            .setQuery(regionInOrganismQuery)
            .addFields(new String[] {"organism_id", "type.name"});

        if (limit == null) {
            limit = getTotalRegionsInOrganism(organismid);
        }

        srb.setSize(limit);

        if (offset != null) {
            srb.setFrom(offset);
        }

        SearchResponse response = srb.execute().actionGet();

        SearchHits hits = response.getHits();

        List<Feature> regions = new ArrayList<Feature>();

        for (SearchHit hit : hits) {

            Feature region = new Feature();
            region.uniqueName = hit.getId();
            region.type = new Cvterm();

            // a requested field may be absent from a hit; leave the value unset in that case
            if (hit.field("type.name") != null) {
                region.type.name = (String) hit.field("type.name").getValue();
            }
            if (hit.field("organism_id") != null) {
                region.organism_id = (Integer) hit.field("organism_id").getValue();
            }

            regions.add(region);
        }

        return regions;
    }

    /**
     * Lists the distinct types of the regions of an organism.
     *
     * The search is scoped to the region type of the configured index and sized to
     * the organism's region count, so that every region contributes its type
     * rather than only the first default-sized page of hits.
     *
     * @param organismid the organism whose region types are listed
     * @return the distinct types, in no particular order
     */
    @Override
    public List<Cvterm> typesInOrganism(int organismid) {

        FieldQueryBuilder organismQuery = QueryBuilders.fieldQuery("organism_id", organismid);

        SearchResponse response = connection.getClient()
            .prepareSearch(connection.getIndex())
            .setTypes(connection.getRegionType())
            .setQuery(organismQuery)
            .setSize(getTotalRegionsInOrganism(organismid))
            .execute()
            .actionGet();

        List<Feature> regions = this.getAllMatches(response, Feature.class);

        // NOTE(review): de-duplication relies on Cvterm equals/hashCode - confirm they are value based
        Set<Cvterm> terms = new HashSet<Cvterm>();

        for (Feature region : regions) {
            if (region.type != null) {
                terms.add(region.type);
            }
        }

        return new ArrayList<Cvterm>(terms);
    }

    /**
     * Stores the feature as a region document, using its unique name as the id.
     *
     * @param feature the region feature to create or update
     */
    public void createOrUpdate(Feature feature) {
        this.createOrUpdate(connection.getIndex(), connection.getRegionType(), feature.uniqueName, feature);
    }

}