package org.genedb.db.domain.luceneImpls;

import org.genedb.db.domain.objects.BasicGene;
import org.genedb.db.domain.objects.Chromosome;
import org.genedb.db.domain.objects.Exon;
import org.genedb.db.domain.objects.Gap;
import org.genedb.db.domain.objects.Transcript;
import org.genedb.db.domain.objects.TranscriptComponent;
import org.genedb.db.domain.objects.UTR;
import org.genedb.db.domain.services.BasicGeneService;
import org.genedb.querying.core.LuceneIndex;

import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;

/**
 * Implementation of {@link BasicGeneService} that answers every request by
 * querying a {@link LuceneIndex} and converting the matching documents into
 * domain objects.
 */
public class BasicGeneServiceImpl implements BasicGeneService {

    private static final Logger logger = Logger.getLogger(BasicGeneServiceImpl.class);

    /** Index field holding the class name of the indexed feature. */
    private static final String HIBERNATE_CLASS_FIELD = "_hibernate_class";
    private static final String GENE_CLASS = "org.gmod.schema.feature.Gene";
    private static final String PSEUDOGENE_CLASS = "org.gmod.schema.feature.Pseudogene";
    private static final String GAP_CLASS = "org.gmod.schema.feature.Gap";

    private final LuceneIndex luceneIndex;

    /**
     * @param luceneIndex the index against which all queries are run
     */
    public BasicGeneServiceImpl(LuceneIndex luceneIndex) {
        this.luceneIndex = luceneIndex;
    }

    /**
     * Defines a conversion from a Lucene Document to some other
     * class of object. A DocumentConverter can also function as a filter if
     * desired, by returning <code>null</code> from the convert method to
     * indicate that a particular document should be ignored.
     *
     * @author rh11
     *
     * @param <T> The return type of the conversion
     */
    private interface DocumentConverter<T> {
        /**
         * Convert the document to the desired form.
         *
         * @param doc the document to convert
         * @return the result of the conversion, or null if this document should be ignored
         */
        T convert(Document doc);
    }

    /** Shared, unmodifiable empty component set used when a transcript's locations cannot be parsed. */
    private static final SortedSet<TranscriptComponent> NO_EXONS
        = Collections.unmodifiableSortedSet(new TreeSet<TranscriptComponent>());

    /**
     * Converts the document to a Transcript object. The document's
     * <code>_hibernate_class</code> is logged at debug level but is not checked,
     * so this converter never filters a document out.
     *
     * If the <code>locs</code> field is missing or cannot be parsed, the problem
     * is logged and the transcript is given an empty set of components rather
     * than failing the whole conversion.
     */
    private final DocumentConverter<Transcript> convertToTranscript = new DocumentConverter<Transcript>() {
        public Transcript convert(Document doc) {
            logger.debug(String.format("Transcript has class '%s'", doc.get(HIBERNATE_CLASS_FIELD)));

            Transcript transcript = new Transcript();
            String uniqueName = doc.get("uniqueName");

            // The colour may be absent, or indexed as the literal string "null".
            String colourString = doc.get("colour");
            if (colourString == null || "null".equals(colourString)) {
                transcript.setColourId(null);
            } else {
                transcript.setColourId(Integer.parseInt(colourString));
            }

            transcript.setUniqueName(uniqueName);
            transcript.setFmin(Integer.parseInt(doc.get("start")));
            transcript.setFmax(Integer.parseInt(doc.get("stop")));

            String locs = doc.get("locs");
            if (locs == null) {
                logger.error(String.format("Failed to parse locs for transcript '%s' due to null", uniqueName));
                transcript.setComponents(NO_EXONS);
            } else {
                try {
                    transcript.setComponents(parseLocs(locs));
                } catch (Exception e) {
                    // Deliberately best-effort: a malformed location string must not
                    // prevent the rest of the transcript from being returned.
                    logger.error(String.format("Failed to parse locs for transcript '%s'", uniqueName), e);
                    transcript.setComponents(NO_EXONS);
                }
            }

            String productsTabSeparated = doc.get("product");
            if (productsTabSeparated != null) {
                transcript.setProducts(Arrays.asList(productsTabSeparated.split("\t")));
            }

            return transcript;
        }
    };

    /**
     * This DocumentConverter populates a BasicGene object using the Lucene
     * Document. Currently it makes a new Lucene query for every gene to pull
     * back the associated transcripts. Should this prove unacceptable, the
     * associated transcripts could instead all be loaded at once.
     *
     * If the <code>_hibernate_class</code> is neither equal to <code>org.gmod.schema.feature.Gene</code>
     * nor to <code>org.gmod.schema.feature.Pseudogene</code> (including when it is
     * missing altogether), returns null.
     */
    private final DocumentConverter<BasicGene> convertToGene = new DocumentConverter<BasicGene>() {
        public BasicGene convert(Document doc) {
            String hibernateClass = doc.get(HIBERNATE_CLASS_FIELD);
            // Constant-first comparison so that a missing class filters the
            // document out instead of throwing a NullPointerException.
            if (!GENE_CLASS.equals(hibernateClass) && !PSEUDOGENE_CLASS.equals(hibernateClass)) {
                logger.debug(String.format("It's not a Gene, it's a '%s'", hibernateClass));
                return null;
            }

            BasicGene ret = new BasicGene();
            String geneUniqueName = doc.get("uniqueName");

            ret.setOrganism(doc.get("organism.commonName"));
            ret.setFeatureId(Integer.parseInt(doc.get("featureId")));
            ret.setUniqueName(geneUniqueName);
            ret.setName(doc.get("name"));
            ret.setChromosome(new Chromosome(doc.get("chr"),
                Integer.parseInt(doc.get("chrId")), Integer.parseInt(doc.get("chrlen"))));
            ret.setFmin(Integer.parseInt(doc.get("start")));
            ret.setFmax(Integer.parseInt(doc.get("stop")));

            String synonyms = doc.get("synonym");
            if (synonyms != null) {
                ret.setSynonyms(Arrays.asList(synonyms.split("\t")));
            }

            // Transcripts are the documents whose "gene" field names this gene.
            BooleanQuery transcriptQuery = new BooleanQuery();
            must(transcriptQuery, termQuery("gene", geneUniqueName));

            List<Transcript> transcripts = findWithQuery(transcriptQuery, convertToTranscript);
            if (transcripts.isEmpty()) {
                logger.warn(String.format("No mRNA transcripts found for gene '%s'", geneUniqueName));
            }
            ret.setTranscripts(transcripts);

            return ret;
        }
    };

    /**
     * This DocumentConverter populates a Gap object using the Lucene Document.
     */
    private final DocumentConverter<Gap> convertToGap = new DocumentConverter<Gap>() {
        public Gap convert(Document doc) {
            return new Gap(doc.get("uniqueName"),
                Integer.parseInt(doc.get("start")), Integer.parseInt(doc.get("stop")));
        }
    };

    /**
     * Parses a comma-separated list of transcript component locations.
     *
     * Each entry has the form <code>start..stop</code>, optionally wrapped in
     * parentheses, and optionally prefixed with <code>type:</code>. An entry
     * with no prefix becomes an {@link Exon}; an entry whose type ends with
     * <code>UTR</code> becomes a {@link UTR}; an entry with any other type is
     * logged and skipped.
     *
     * @param locs the location string; must not be null
     * @return the parsed components, in no particular order
     * @throws IllegalArgumentException if an entry is empty, has unbalanced
     *          parentheses, lacks the <code>..</code> separator, or has a
     *          non-numeric coordinate
     */
    private static Set<TranscriptComponent> parseLocs(String locs) {
        Set<TranscriptComponent> components = new HashSet<TranscriptComponent>();

        for (String loc : locs.split(",")) {
            int colonIndex = loc.indexOf(':');
            boolean isExon = (colonIndex == -1);
            if (!isExon) {
                String type = loc.substring(0, colonIndex);
                if (!type.endsWith("UTR")) {
                    logger.warn(String.format("Unknown transcript component type '%s'", type));
                    continue;
                }
            }

            int numberStart = colonIndex + 1;
            int numberEnd = loc.length();
            if (numberStart >= numberEnd) {
                // Nothing after the optional prefix; fail with a descriptive
                // message rather than an index-out-of-bounds from charAt below.
                throw new IllegalArgumentException(
                    String.format("Failed to parse exon location '%s' from string '%s'", loc, locs));
            }

            if (loc.charAt(numberStart) == '(') {
                if (!loc.endsWith(")")) {
                    throw new IllegalArgumentException(String.
                        format("Exon location '%s' starts with '(' but does not end with ')'; from string '%s'",
                            loc, locs));
                }
                numberStart++;
                numberEnd--;
            }

            int dots = loc.indexOf("..");
            if (dots < 1) {
                throw new IllegalArgumentException(
                    String.format("Failed to parse exon location '%s' from string '%s'", loc, locs));
            }

            int componentStart = Integer.parseInt(loc.substring(numberStart, dots));
            int componentStop = Integer.parseInt(loc.substring(dots + 2, numberEnd));
            if (isExon) {
                components.add(new Exon(componentStart, componentStop));
            } else {
                components.add(new UTR(componentStart, componentStop));
            }
        }

        return components;
    }

    /** Builds a query matching documents whose <code>field</code> contains exactly <code>value</code>. */
    private static Query termQuery(String field, String value) {
        return new TermQuery(new Term(field, value));
    }

    /** Adds <code>clause</code> to <code>query</code> as a mandatory clause. */
    private static void must(BooleanQuery query, Query clause) {
        query.add(clause, BooleanClause.Occur.MUST);
    }

    /**
     * Formats a coordinate as a nine-digit zero-padded string for use as a range bound.
     * NOTE(review): presumably this matches how start/stop are written by the
     * indexer, so that term order agrees with numeric order; confirm.
     */
    private static String formatCoordinate(long coordinate) {
        return String.format("%09d", coordinate);
    }

    /**
     * Finds all documents matching the query, and makes a list of matches. Each
     * document is converted to an object of type T using the converter.
     *
     * @param <T> The return type
     * @param query The query object
     * @param converter Result converter
     * @return the converted documents, omitting any for which the converter returned null
     */
    private <T> List<T> findWithQuery(Query query, DocumentConverter<T> converter) {
        return findWithQuery(query, null, converter);
    }

    /**
     * Finds all documents matching the query, sorted using the specified sort order,
     * and makes a list of matches. Each document is converted to an object of type T
     * using the converter.
     *
     * @param <T> The return type
     * @param query The query object
     * @param sort How to order the results; passed to the index as-is, and null
     *          when called through the two-argument overload
     * @param converter Result converter
     * @return the converted documents, omitting any for which the converter returned null
     * @throws RuntimeException if a matching document cannot be fetched from the index
     */
    private <T> List<T> findWithQuery(Query query, Sort sort, DocumentConverter<T> converter) {
        List<T> ret = new ArrayList<T>();

        logger.debug("Running Lucene query: "+query);
        TopDocs topDocs = luceneIndex.search(query, sort);
        logger.debug(String.format("Query returned %d results", topDocs.totalHits));

        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document doc;
            try {
                doc = fetchDocument(scoreDoc.doc);
            } catch (CorruptIndexException e) {
                throw new RuntimeException("Lucene index is corrupted!", e);
            } catch (IOException e) {
                throw new RuntimeException("IOException while fetching results of Lucene query", e);
            }

            T convertedDocument = converter.convert(doc);
            if (convertedDocument != null) {
                ret.add(convertedDocument);
            }
        }

        return ret;
    }

    /**
     * Fetches a single document from the index by its Lucene document id.
     *
     * @param docId the Lucene document id
     * @return the stored document
     * @throws CorruptIndexException if the index reports corruption
     * @throws IOException if the index cannot be read
     */
    protected Document fetchDocument(int docId) throws CorruptIndexException, IOException {
        return luceneIndex.getDocument(docId);
    }

    /**
     * Runs the query and returns the first converted result.
     *
     * @param <T> The return type
     * @param query The query object
     * @param converter Result converter
     * @return the first result, or null if there are none; if there is more
     *          than one, an error is logged and the first is still returned
     */
    private <T> T findUniqueWithQuery(Query query, DocumentConverter<T> converter) {
        List<T> results = findWithQuery(query, converter);
        int numberOfResults = results.size();

        if (numberOfResults == 0) {
            logger.info(String.format("Failed to find gene matching Lucene query '%s'", query));
            return null;
        }
        if (numberOfResults > 1) {
            logger.error(String.format("Found %d genes matching query '%s'; expected only one!",
                numberOfResults, query));
        }
        return results.get(0);
    }

    /**
     * Looks up a gene (or pseudogene) by the exact value of its
     * <code>uniqueName</code> field.
     *
     * @param uniqueName the unique name to match
     * @return the gene, or null if no matching gene document is found
     */
    public BasicGene findGeneByUniqueName(String uniqueName) {
        return findUniqueWithQuery(termQuery("uniqueName", uniqueName), convertToGene);
    }

    /**
     * Warning: this method is liable to be very slow, because it results
     * in a Lucene query of the form *foo*: the problem is the initial wildcard.
     *
     * The search string is inserted into the wildcard pattern unescaped.
     *
     * @param search the fragment to look for within the <code>uniqueName</code> field
     * @return the <code>uniqueName</code> of every matching document
     */
    public List<String> findGeneNamesByPartialName(String search) {
        Query query = new WildcardQuery(new Term("uniqueName", String.format("*%s*", search)));
        return findWithQuery(query, new DocumentConverter<String>() {
            public String convert(Document doc) {
                return doc.get("uniqueName");
            }
        });
    }

    /** Matches documents whose class is either Gene or Pseudogene; shared by all gene queries and never modified after initialisation. */
    private static final BooleanQuery geneOrPseudogeneQuery = new BooleanQuery();
    static {
        geneOrPseudogeneQuery.add(termQuery(HIBERNATE_CLASS_FIELD, GENE_CLASS), BooleanClause.Occur.SHOULD);
        geneOrPseudogeneQuery.add(termQuery(HIBERNATE_CLASS_FIELD, PSEUDOGENE_CLASS), BooleanClause.Occur.SHOULD);
    }

    /**
     * Finds the genes and pseudogenes on the given strand of the given
     * chromosome whose <code>start</code> is strictly below <code>locMax</code>
     * and whose <code>stop</code> is at or above <code>locMin</code>.
     *
     * @param organismCommonName value required in the <code>organism.commonName</code> field
     * @param chromosomeUniqueName value required in the <code>chr</code> field
     * @param strand value required in the <code>strand</code> field
     * @param locMin lower bound applied to <code>stop</code> (inclusive)
     * @param locMax upper bound applied to <code>start</code> (exclusive)
     * @return the matching genes
     */
    public Collection<BasicGene> findGenesOverlappingRange(String organismCommonName,
            String chromosomeUniqueName, int strand, long locMin, long locMax) {
        BooleanQuery query = new BooleanQuery();
        must(query, geneOrPseudogeneQuery);
        must(query, termQuery("organism.commonName", organismCommonName));
        must(query, termQuery("chr", chromosomeUniqueName));
        must(query, new ConstantScoreRangeQuery("start", null, formatCoordinate(locMax), false, false));
        must(query, new ConstantScoreRangeQuery("stop", formatCoordinate(locMin), null, true, false));
        must(query, termQuery("strand", String.valueOf(strand)));

        return findWithQuery(query, convertToGene);
    }

    private static final SortField START_ASC = new SortField("start");
    private static final SortField STOP_DESC = new SortField("stop", true);
    /** Orders results by ascending start, then by descending stop. */
    private static final Sort SORT_BY_LOCATION = new Sort(new SortField[] {START_ASC, STOP_DESC});

    /**
     * Finds the gaps on the given chromosome whose <code>start</code> is
     * strictly below <code>locMax</code> and whose <code>stop</code> is at or
     * above <code>locMin</code>, ordered by location.
     *
     * @param organismCommonName value required in the <code>organism.commonName</code> field
     * @param chromosomeUniqueName value required in the <code>chr</code> field
     * @param locMin lower bound applied to <code>stop</code> (inclusive)
     * @param locMax upper bound applied to <code>start</code> (exclusive)
     * @return the matching gaps
     */
    public Collection<Gap> findGapsOverlappingRange(String organismCommonName,
            String chromosomeUniqueName, long locMin, long locMax) {
        BooleanQuery query = new BooleanQuery();
        must(query, termQuery(HIBERNATE_CLASS_FIELD, GAP_CLASS));
        must(query, termQuery("organism.commonName", organismCommonName));
        must(query, termQuery("chr", chromosomeUniqueName));
        must(query, new ConstantScoreRangeQuery("start", null, formatCoordinate(locMax), false, false));
        must(query, new ConstantScoreRangeQuery("stop", formatCoordinate(locMin), null, true, false));

        return findWithQuery(query, SORT_BY_LOCATION, convertToGap);
    }

    /**
     * Finds every gap on the given chromosome, ordered by location.
     *
     * @param organismCommonName value required in the <code>organism.commonName</code> field
     * @param chromosomeUniqueName value required in the <code>chr</code> field
     * @return the matching gaps
     */
    public Collection<Gap> findGapsOnChromosome(String organismCommonName, String chromosomeUniqueName) {
        BooleanQuery query = new BooleanQuery();
        must(query, termQuery(HIBERNATE_CLASS_FIELD, GAP_CLASS));
        must(query, termQuery("organism.commonName", organismCommonName));
        must(query, termQuery("chr", chromosomeUniqueName));

        return findWithQuery(query, SORT_BY_LOCATION, convertToGap);
    }

    /**
     * Finds the genes and pseudogenes on the given strand of the given
     * chromosome whose <code>stop</code> lies at or above <code>locMin</code>
     * and strictly below <code>locMax</code>. No constraint is placed on
     * <code>start</code>.
     *
     * @param organismCommonName value required in the <code>organism.commonName</code> field
     * @param chromosomeUniqueName value required in the <code>chr</code> field
     * @param strand value required in the <code>strand</code> field
     * @param locMin lower bound applied to <code>stop</code> (inclusive)
     * @param locMax upper bound applied to <code>stop</code> (exclusive)
     * @return the matching genes
     */
    public Collection<BasicGene> findGenesExtendingIntoRange(String organismCommonName,
            String chromosomeUniqueName, int strand, long locMin, long locMax) {
        BooleanQuery query = new BooleanQuery();
        must(query, geneOrPseudogeneQuery);
        must(query, termQuery("organism.commonName", organismCommonName));
        must(query, termQuery("chr", chromosomeUniqueName));
        must(query, new ConstantScoreRangeQuery("stop",
            formatCoordinate(locMin), formatCoordinate(locMax), true, false));
        must(query, termQuery("strand", String.valueOf(strand)));

        return findWithQuery(query, convertToGene);
    }

    /**
     * Finds every gene and pseudogene on the given strand of the given chromosome.
     *
     * @param organismCommonName value required in the <code>organism.commonName</code> field
     * @param chromosomeUniqueName value required in the <code>chr</code> field
     * @param strand value required in the <code>strand</code> field
     * @return the matching genes
     */
    public Collection<BasicGene> findGenesOnStrand(String organismCommonName,
            String chromosomeUniqueName, int strand) {
        BooleanQuery query = new BooleanQuery();
        must(query, geneOrPseudogeneQuery);
        must(query, termQuery("organism.commonName", organismCommonName));
        must(query, termQuery("chr", chromosomeUniqueName));
        must(query, termQuery("strand", String.valueOf(strand)));

        return findWithQuery(query, convertToGene);
    }
}