package org.genedb.crawl.elasticsearch.index.gff; import java.io.BufferedReader; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.log4j.Logger; import org.genedb.crawl.elasticsearch.mappers.ElasticSearchFeatureMapper; import org.genedb.crawl.elasticsearch.mappers.ElasticSearchRegionsMapper; import org.genedb.crawl.model.Feature; import org.genedb.crawl.model.LocatedFeature; import org.genedb.crawl.model.Organism; import org.genedb.crawl.modelling.RegionFeatureBuilder; public class GFFAnnotatationExtractor { private static Logger logger = Logger.getLogger(GFFAnnotatationExtractor.class); public GFFAnnotatationExtractor(BufferedReader buf, String filePath, Organism organism, ElasticSearchFeatureMapper featureMapper, ElasticSearchRegionsMapper regionsMapper) throws IOException { List<RegionFeatureBuilder> sequences = new ArrayList<RegionFeatureBuilder>(); try { boolean parsingAnnotations = true; LocatedFeature lastFeature = null; RegionFeatureBuilder sequence = null; String line = ""; while ((line = buf.readLine()) != null) { logger.debug(line); if (line.startsWith("##sequence-region")) { parsingAnnotations = true; } if (line.contains("##FASTA")) { parsingAnnotations = false; } if (line.startsWith("#")) { continue; } if (parsingAnnotations) { LocatedFeature feature = new FeatureBeanFactory(organism, line).getFeature(); if (feature.type.name.equals("CDS")) { logger.debug("changing type from CDS to exon"); feature.type.name = "exon"; } /* * If the last feature has the same uniqueName, then add the extra * coordinates to the last one, else store this feature. */ if (lastFeature != null && lastFeature.uniqueName != null && lastFeature.uniqueName.equals(feature.uniqueName)) { if (feature.fmin != lastFeature.fmin || feature.fmax != lastFeature.fmax) { logger.info(String.format("adding extra coordinates to %s : %s-%s", lastFeature.uniqueName, feature.coordinates.get(0).fmin, feature.coordinates.get(0).fmax)); lastFeature.coordinates.add(feature.coordinates.get(0)); createOrUpdate(lastFeature, featureMapper); } } else { createOrUpdate(feature, featureMapper); lastFeature = feature; } } else { if (line.startsWith(">")) { String sequenceName = line.substring(1); /* we ignore everything after a space */ int spacePos = sequenceName.indexOf(" "); if (spacePos != -1) { sequenceName = sequenceName.substring(0, spacePos); } sequence = new RegionFeatureBuilder(sequenceName, organism.ID); sequence.setSequenceFile(filePath); sequences.add(sequence); } } } for (RegionFeatureBuilder regionBuilder : sequences) { Feature region = regionBuilder.getRegion(); logger.info("Storing region : " + region.uniqueName); regionsMapper.createOrUpdate(region); } } finally { buf.close(); } } private void createOrUpdate(LocatedFeature feature, ElasticSearchFeatureMapper featureMapper) { logger.info(info(feature)); featureMapper.createOrUpdate(feature); } private String info(LocatedFeature f) { return (f.uniqueName + " " + f.fmin + "-" + f.fmax + " " + f.type.name); } }