package org.genedb.crawl.elasticsearch.index.sql; import java.util.ArrayList; import java.util.Date; import java.util.HashSet; import java.util.List; import java.util.Set; import org.apache.log4j.Logger; import org.genedb.crawl.elasticsearch.mappers.ElasticSearchFeatureMapper; import org.genedb.crawl.elasticsearch.mappers.ElasticSearchOrganismsMapper; import org.genedb.crawl.elasticsearch.mappers.ElasticSearchRegionsMapper; import org.genedb.crawl.mappers.AuditMapper; import org.genedb.crawl.mappers.FeatureMapper; import org.genedb.crawl.mappers.FeaturesMapper; import org.genedb.crawl.mappers.OrganismsMapper; import org.genedb.crawl.mappers.RegionsMapper; import org.genedb.crawl.mappers.TermsMapper; import org.genedb.crawl.model.Cvterm; import org.genedb.crawl.model.Feature; import org.genedb.crawl.model.HierarchyRelation; import org.genedb.crawl.model.LocatedFeature; import org.genedb.crawl.model.Organism; import org.genedb.crawl.model.Property; import org.genedb.crawl.model.Sequence; import org.genedb.crawl.modelling.LocatedFeatureUtil; public class SQLIndexer { private static Logger logger = Logger.getLogger(SQLIndexer.class); public ElasticSearchOrganismsMapper esOrganismMapper; public ElasticSearchFeatureMapper esFeatureMapper; public ElasticSearchRegionsMapper esRegionsMapper; public OrganismsMapper organismMapper; public FeaturesMapper featuresMapper; public FeatureMapper featureMapper; public RegionsMapper regionsMapper; public TermsMapper termsMapper; private Set<Integer> organism_ids; private Set<String> regions; private Set<String> features; public List<Cvterm> relationships = new ArrayList<Cvterm>(); public AuditMapper auditMapper; public boolean exclude = false; public List<String> types; public SQLIndexer() { reset(); } public void reset() { organism_ids = new HashSet<Integer>(); regions = new HashSet<String>(); features = new HashSet<String>(); } /** * * @param date * @param organism if null, will query across organisms */ public void indexFeaturesSince(Date date, Organism organism) { if (auditMapper.exists()) { List<Feature> deleted = auditMapper.deleted(date); for (Feature toDeleteFromIndex : deleted) { esFeatureMapper.delete(toDeleteFromIndex); } } else { logger.warn("Audit schema does not exist in this database. Cannot delete features from index."); } List<Feature> modifiedFeatures = null; if (organism != null) { modifiedFeatures = featuresMapper.timelastmodified(date, organism.ID, types, exclude); } else { modifiedFeatures = featuresMapper.timelastmodified(date, null, types, exclude); } for (Feature f : modifiedFeatures) { f.coordinates = featureMapper.coordinates(f); LocatedFeature lf = LocatedFeatureUtil.fromFeature(f); // features if (lf.region != null) { indexRegion(lf.region); } indexOrganism(lf.organism_id); indexLocatedFeature(lf); } } public void indexRegionContents(String region) { logger.info("indexing region : " + region); indexRegion(region); int start = 0; int end = regionsMapper.sequence(region).length; List<LocatedFeature> features = regionsMapper.locations(region, start, end, exclude, types); for (LocatedFeature f : features) { // the regionsMapper.locations does not return the region name f.region = region; indexOrganism(f.organism_id); indexLocatedFeature(f); } } public void indexOrganismContents(Organism o) { indexOrganism(o); for (Feature region : regionsMapper.inorganism(o.ID, null, null, null)) { indexRegionContents(region.uniqueName); } } public void indexOrganisms() { for (Organism o : organismMapper.list()) { indexOrganism(o); } } public void indexOrganism(int id) { Organism o = organismMapper.getByID(id); indexOrganism(o); } public void indexOrganism(Organism o) { if (organism_ids.contains(o.ID)) { return; } organism_ids.add(o.ID); Property taxon = organismMapper.getOrganismProp(o, "genedb_misc", "taxonId"); Property translation_table = organismMapper.getOrganismProp(o, "genedb_misc", "translationTable"); logger.debug("Setting organism " + o.common_name); if (taxon != null) { o.taxonID = Integer.parseInt(taxon.value); } if (translation_table != null) { logger.debug("Setting translation table " + translation_table.value); o.translation_table = Integer.parseInt(translation_table.value); } esOrganismMapper.createOrUpdate(o); } private void indexLocatedFeature(LocatedFeature feature) { if (features.contains(feature.uniqueName)) { logger.warn("Already indexed this feature in this run :" + feature.uniqueName); } features.add(feature.uniqueName); feature.terms = featureMapper.terms(feature); feature.properties = featureMapper.properties(feature); List<HierarchyRelation> relations = featuresMapper.getRelationshipsParents(feature.uniqueName, relationships); if (relations.size() > 0) { feature.parent = relations.get(0).uniqueName; logger.info("parent : " + feature.parent); } esFeatureMapper.createOrUpdate(feature); } public void indexRegion(String region) { if (regions.contains(region)) { return; } regions.add(region); Feature f = featureMapper.get(region, null, null, null); if (f != null) { logger.info("Generating region : " + f.uniqueName); Sequence s = regionsMapper.sequence(region); f.residues = s.dna; esRegionsMapper.createOrUpdate(f); } else { throw new RuntimeException("Could not find region " + region); } } }