package org.genedb.crawl.elasticsearch.index;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.ParseException;
import java.util.zip.GZIPInputStream;

import org.apache.log4j.Logger;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.map.JsonMappingException;
import org.genedb.crawl.elasticsearch.index.gff.GFFAnnotatationExtractor;
import org.genedb.crawl.elasticsearch.index.gff.GFFFileFilter;
import org.genedb.crawl.elasticsearch.mappers.ElasticSearchFeatureMapper;
import org.genedb.crawl.elasticsearch.mappers.ElasticSearchOrganismsMapper;
import org.genedb.crawl.elasticsearch.mappers.ElasticSearchRegionsMapper;
import org.genedb.crawl.elasticsearch.mappers.ElasticSearchTermsMapper;
import org.genedb.crawl.model.Organism;

/**
 * Base class for index builders that populate the ElasticSearch index from
 * non-database sources (GFF files or directories of GFF files on disk).
 *
 * Subclasses call {@link #init()} once to set up the index connection and
 * mappers, then feed paths through {@link #convertPath(String, Organism)}.
 */
public abstract class NonDatabaseDataSourceIndexBuilder extends IndexBuilder {

    private static Logger logger = Logger.getLogger(NonDatabaseDataSourceIndexBuilder.class);

    // Mappers are created lazily in init(); they are null until then.
    protected ElasticSearchFeatureMapper featureMapper;
    protected ElasticSearchOrganismsMapper organismsMapper;
    protected ElasticSearchRegionsMapper regionsMapper;
    protected ElasticSearchTermsMapper termsMapper;

    public NonDatabaseDataSourceIndexBuilder() {
        super();
    }

    /**
     * Sets up the index (via the superclass) and wires each mapper to the
     * shared connection. Must be called before any convert* method.
     *
     * @throws IOException if the index setup fails
     */
    public void init() throws IOException {
        setupIndex();

        featureMapper = new ElasticSearchFeatureMapper();
        featureMapper.setConnection(connection);

        organismsMapper = new ElasticSearchOrganismsMapper();
        organismsMapper.setConnection(connection);

        regionsMapper = new ElasticSearchRegionsMapper();
        regionsMapper.setConnection(connection);

        termsMapper = new ElasticSearchTermsMapper();
        termsMapper.setConnection(connection);
    }

    /**
     * Converts a single GFF file, or every GFF file in a directory, for the
     * given organism.
     *
     * @param path     a GFF file or a directory containing GFF files
     * @param organism the organism the features belong to
     * @throws IOException    if the path does not exist, is not readable, or
     *                        the directory cannot be listed
     * @throws ParseException if a GFF file cannot be parsed
     */
    protected void convertPath(String path, Organism organism) throws ParseException, IOException {

        File gffFile = new File(path);

        GFFFileFilter filter = new GFFFileFilter();
        filter.filter_set = GFFFileFilter.GFFFileExtensionSet.ALL;

        if (gffFile.isDirectory()) {
            // listFiles() returns null on I/O error or if the path is not a
            // readable directory -- the original code would NPE here.
            File[] files = gffFile.listFiles(filter);
            if (files == null) {
                throw new IOException("Could not list directory " + path);
            }
            for (File f : files) {
                convertFile(f, organism);
            }
        } else {
            if (!gffFile.isFile()) {
                throw new IOException("File " + path + " does not exist");
            }
            convertFile(gffFile, organism);
        }
    }

    /**
     * Parses one GFF file and stores its features and regions. The extractor
     * does its work as a side effect of construction.
     *
     * @param gffFile  the GFF file to parse
     * @param organism the organism the features belong to
     * @throws IOException    if the file cannot be read
     * @throws ParseException if the file cannot be parsed
     */
    protected void convertFile(File gffFile, Organism organism) throws ParseException, IOException {
        BufferedReader reader = getReader(gffFile);
        try {
            // NOTE(review): assumes GFFAnnotatationExtractor fully consumes
            // the reader in its constructor (the original code discarded the
            // instance immediately) -- confirm before changing.
            new GFFAnnotatationExtractor(reader, gffFile.getAbsolutePath(), organism, featureMapper, regionsMapper);
        } finally {
            // The original leaked this reader (and its file handle).
            reader.close();
        }
    }

    /**
     * Returns the correct reader for gzipped or non-gzipped files. Files whose
     * name ends in "gz" are treated as gzip-compressed.
     *
     * @param file the file to open
     * @return a buffered reader over the (possibly decompressed) contents
     * @throws IOException if the file cannot be opened or is not valid gzip
     */
    public static BufferedReader getReader(File file) throws IOException {
        FileInputStream fileStream = new FileInputStream(file);
        try {
            InputStream in = fileStream;
            if (file.getName().endsWith("gz")) {
                logger.info("unzipping");
                // May throw on a corrupt gzip header; handled below so the
                // underlying stream is not leaked (the original leaked it).
                in = new GZIPInputStream(fileStream);
            }
            // NOTE(review): uses the platform default charset, as the
            // original did; consider StandardCharsets.UTF_8 explicitly.
            return new BufferedReader(new InputStreamReader(in));
        } catch (IOException e) {
            fileStream.close();
            throw e;
        }
    }

    /**
     * Resolves the user-supplied organism (a JSON string or a path to a JSON
     * file) against the repository. If a matching organism exists (by ID or
     * common_name), any fields the user supplied override the stored values;
     * otherwise the supplied organism is used as-is and must be complete.
     * The result is stored (created or updated) before being returned.
     *
     * @param organismString JSON text, or a path to a JSON file, describing
     *                       the organism
     * @return the stored organism
     * @throws RuntimeException if the organism is new and any field is missing
     */
    protected Organism getAndPossiblyStoreOrganism(String organismString) throws JsonParseException,
            JsonMappingException, IOException, SecurityException, NoSuchFieldException,
            IllegalArgumentException, IllegalAccessException {

        Organism userSuppliedOrganism = (Organism) jsonIzer.fromStringOrFile(organismString, Organism.class);

        Organism organism = null;

        if (userSuppliedOrganism.ID != null) {
            logger.info("Getting by ID " + userSuppliedOrganism.ID);
            try {
                organism = organismsMapper.getByID(userSuppliedOrganism.ID);
                logger.info("found!");
            } catch (Exception e) {
                // Log the throwable so the stack trace is not lost.
                logger.warn("Could not find an organism with this ID", e);
            }
        }

        // A successful common_name lookup takes precedence over the ID lookup.
        if (userSuppliedOrganism.common_name != null) {
            logger.info("Getting by common_name: " + userSuppliedOrganism.common_name);
            try {
                organism = organismsMapper.getByCommonName(userSuppliedOrganism.common_name);
                logger.info("found!");
            } catch (Exception e) {
                logger.warn("Could not find an organism with this common_name.", e);
            }
        }

        if (organism == null) {
            // Not in the repository: the user-supplied organism must be
            // complete, since there is nothing to merge it with.
            organism = userSuppliedOrganism;
            logger.warn("Could not find existing organism matching the one you supplied.");

            if (userSuppliedOrganism.common_name == null
                    || userSuppliedOrganism.ID == null
                    || userSuppliedOrganism.genus == null
                    || userSuppliedOrganism.species == null
                    || userSuppliedOrganism.translation_table == null
                    || userSuppliedOrganism.taxonID == null) {

                logger.error(String.format(
                        "Missing common_name? %s, ID %s, genus %s, species %s, translation_table %s, taxonID %s ",
                        userSuppliedOrganism.common_name == null,
                        userSuppliedOrganism.ID == null,
                        userSuppliedOrganism.genus == null,
                        userSuppliedOrganism.species == null,
                        userSuppliedOrganism.translation_table == null,
                        userSuppliedOrganism.taxonID == null));

                throw new RuntimeException(
                        "The supplied organism must have all fields declared as it's not present in the repository.");
            }
        } else {
            // Found in the repository: user-supplied fields override stored ones.
            if (userSuppliedOrganism.common_name != null) {
                organism.common_name = userSuppliedOrganism.common_name;
            }
            if (userSuppliedOrganism.ID != null) {
                organism.ID = userSuppliedOrganism.ID;
            }
            if (userSuppliedOrganism.genus != null) {
                organism.genus = userSuppliedOrganism.genus;
            }
            if (userSuppliedOrganism.species != null) {
                organism.species = userSuppliedOrganism.species;
            }
            if (userSuppliedOrganism.taxonID != null) {
                organism.taxonID = userSuppliedOrganism.taxonID;
            }
            if (userSuppliedOrganism.translation_table != null) {
                organism.translation_table = userSuppliedOrganism.translation_table;
            }
        }

        logger.info(String.format("Organism to be stored as : %s (%s %s) %s %s %s",
                organism.common_name, organism.genus, organism.species,
                organism.ID, organism.translation_table, organism.taxonID));

        organismsMapper.createOrUpdate(organism);

        return organism;
    }
}