package org.nextprot.api.tasks.solr.indexer;
import org.apache.solr.common.SolrInputDocument;
import org.nextprot.api.core.dao.EntityName;
import org.nextprot.api.core.domain.*;
import org.nextprot.api.core.domain.DbXref.DbXrefProperty;
import org.nextprot.api.core.domain.annotation.Annotation;
import org.nextprot.api.core.domain.annotation.AnnotationProperty;
import org.nextprot.api.core.service.DbXrefService;
import org.nextprot.api.core.service.TerminologyService;
import java.util.*;
// !!! This is an old approach for the entry solr index, will be deleted soon
public class AnnotationSolrIndexer extends SolrIndexer<Entry> {
private TerminologyService terminologyservice;
private DbXrefService dbxrefservice;
public AnnotationSolrIndexer(String url) {
super(url);
}
@Override
public SolrInputDocument convertToSolrDocument(Entry entry) {
SolrInputDocument doc = new SolrInputDocument();
Set <String> cv_acs = new HashSet<String>();
Set <String> cv_ancestors_acs = new HashSet<String>();
Set <String> cv_synonyms = new HashSet<String>();
Set <String> cv_tissues = new HashSet<String>();
String id = entry.getUniqueName();
doc.addField("id", id);
doc.addField("idsp0", id);
doc.addField("recommended_ac", id.substring(3));
Overview ovv = entry.getOverview();
doc.addField("protein_existence", ovv.getProteinExistence());
doc.addField("pe_level", ovv.getProteinExistenceLevel());
//doc.addField("isoform_num", entry.getIsoforms().size());
String precname = ovv.getMainProteinName();
System.err.println(id + " " + precname);
doc.addField("recommended_name", precname);
doc.addField("recommended_name_s", precname);
// Filters and entry properties
EntryProperties props = entry.getProperties();
doc.addField("isoform_num", props.getIsoformCount());
int cnt;
cnt = props.getPtmCount();
if(cnt > 0) doc.addField("ptm_num", cnt);
cnt = props.getVarCount();
if(cnt > 0) doc.addField("var_num", cnt);
String filters = "";
if(props.getFilterstructure()) filters += "filterstructure ";
if(props.getFilterdisease()) filters += "filterdisease ";
if(props.getFilterexpressionprofile()) filters += "filterexpressionprofile ";
if(props.getFiltermutagenesis()) filters += "filtermutagenesis ";
if(props.getFilterproteomics()) filters += "filterproteomics ";
if(filters.length() > 0) doc.addField("filters", filters);
doc.addField("aa_length", props.getMaxSeqLen()); // max length among all isoforms
List <EntityName> altnames = null;
altnames = ovv.getProteinNames();
if(altnames != null )
for (EntityName altname : altnames) {
List <EntityName> paltnames = altname.getSynonyms();
if(paltnames != null )
for (EntityName paltfullname : paltnames) {
doc.addField("alternative_names", paltfullname.getName());
List <EntityName> paltshortnames = paltfullname.getSynonyms();
if(paltshortnames != null )
for (EntityName paltshortname : paltshortnames) {
doc.addField("alternative_names", paltshortname.getName());
}
}
}
altnames = ovv.getAdditionalNames(); // special names (INN, allergens)
if(altnames != null )
for (EntityName altname : altnames) {
doc.addField("alternative_names", altname.getName());
}
altnames = ovv.getFunctionalRegionNames(); // The enzymatic activities of a multifunctional enzyme (maybe redundent with getEnzymes)
if(altnames != null )
for (EntityName altname : altnames) {
doc.addField("region_name", altname.getName());
//System.err.println(id + " fromincludes: " + altname.getName());
List <EntityName> paltnames = altname.getSynonyms();
if(paltnames != null )
for (EntityName ecname : paltnames) {
//doc.addField("ec_name", ecname.getName());
//System.err.println(id + " fromincludes: " + ecname.getName());
}
}
// Gene names, synonyms and orf names
List <EntityName> genenames = ovv.getGeneNames();
if(genenames != null ) {
String maingenename = ovv.getMainGeneName(); // TODO: check for multigene entries
doc.addField("recommended_gene_names", maingenename);
doc.addField("recommended_gene_names_s", maingenename);
for (EntityName currname : genenames) {
List <EntityName> genesynonames = currname.getSynonyms();
if(genesynonames != null)
for (EntityName genesynoname : genesynonames) {
doc.addField("alternative_gene_names", genesynoname.getName());
//System.err.println("syn: " + genesynoname.getName());
}
}
}
//else System.err.println("no gene names for: " + id);
List<Family> families = ovv.getFamilies();
String allfamilies = null;
for (Family family : families) { // alternatively use a multivalue solr field
if(allfamilies == null) allfamilies = family.getName();
else allfamilies += " | " + family.getName();
cv_acs.add(family.getAccession());
doc.addField("cv_acs", family.getAccession());
}
if(allfamilies == null) {doc.addField("family_names", allfamilies); doc.addField("family_names_s", allfamilies);}
List <ChromosomalLocation> chrlocs = entry.getChromosomalLocations();
String chrloc = null;
//String tststring = null;
for (ChromosomalLocation currloc : chrlocs) {
if(chrloc == null) chrloc = currloc.getChromosome();
String band = currloc.getBand();
if(band != null) {
chrloc += band;
doc.addField("gene_band", band);
}
}
//System.err.println("adding chr_loc: " + chrloc);
doc.addField("chr_loc", chrloc);
doc.addField("chr_loc_s", this.sortChr(chrloc));
/*DbXrefDaoImpl dao = new DbXrefDaoImpl(); // Not working
Set<DbXref> intactdbrefs = dao.findEntryInteractionInteractantsXrefs(id);
if(intactdbrefs != null)
for (DbXref intactdbref : intactdbrefs) {
System.err.println(intactdbref.getPropertiesMap());
List<DbXrefProperty> xrefprops = intactdbref.getPropertiesMap();
for (DbXrefProperty xrefprop : xrefprops) {
if(xrefprop.getName().equals("gene designation")) {
System.err.println("gene: " + xrefprop.getValue());
}
}
} */
List<Annotation> annots = entry.getAnnotations();
int cvac_cnt = 0;
for (Annotation currannot : annots) {
String category = currannot.getCategory();
//System.err.println(category);
if(category.contains("Binary")) {
//System.err.println(category + " : " + currannot.getUniqueName());
Collection<AnnotationProperty> annotprops = currannot.getProperties();
for (AnnotationProperty annotprop : annotprops) {
if(annotprop.getName().equals("interactant")) {
//int dbrefid = Integer.parseInt(annotprop.getValue());
//DbXref xref =
if(annotprop.getValueType().equals("resource-internal-ref")) {
long dbrefid = Integer.parseInt(annotprop.getValue());
//DbXref xref =
//System.err.println("dbrefid: " + dbrefid);
}
}
}
}
if(category.equals("function")) doc.addField("function_desc", currannot.getDescription());
else if(category.equals("tissue specificity")) {
cv_tissues.add(currannot.getCvTermAccessionCode()); // No duplicates: this is a Set
cv_tissues.add(currannot.getCvTermName()); // No duplicates: this is a Set
//doc.addField("expression", currannot.getCvTermName());
}
else if(category.equals("subcellular location") || category.equals("go cellular component") ||
category.equals("domain") || category.equals("repeat") || category.equals("zinc finger region") ||
category.equals("go molecular function") || category.equals("go biological process") ||
category.equals("pathway") || category.equals("amino acid modification") ||
category.equals("uniprot keyword") || category.equals("disease")) {
String cvac = currannot.getCvTermAccessionCode();
if(cvac != null) {
doc.addField("cv_acs", cvac);
cvac_cnt++;
cv_acs.add(cvac); // No duplicates: this is a Set
doc.addField("cv_names", currannot.getCvTermName());
}
}
else if(category.equals("region of interest") || category.equals("interacting region")) { // compositionally biased region ?
//doc.addField("?", currannot.getDescription());
}
else {
doc.addField("annotations", currannot.getDescription());
//System.err.println(category + " : " + currannot.getDescription());
} // or maybe not 'else'
}
// Identifiers
List <Identifier> identifiers = entry.getIdentifiers();
for (Identifier currident : identifiers) {
String idtype = currident.getType();
//if(currident.getDatabase() == null)
//System.err.println("type: " + idtype + " " + currident.getName());
if(idtype.equals("Secondary AC")) doc.addField("alternative_acs", currident.getName());
else if (idtype.equals("IMAGE") || idtype.equals("FLJ") || idtype.equals("MGC") || idtype.equals("DKFZ") || idtype.equals("Others")) doc.addField("clone_name", currident.getName());
else if (idtype.equals("Illumina") || idtype.equals("Affymetrix")) doc.addField("microarray_probe", currident.getName());
else if (idtype.equals("Entry name")) doc.addField("uniprot_name", currident.getName());
//else System.err.println("type: " + idtype);
}
List<CvTerm> enzymes = entry.getEnzymes();
for (CvTerm currenzyme : enzymes) {
//System.err.println(id + " fromenz: " + currenzyme.getAccession());
doc.addField("cv_acs", currenzyme.getAccession());
cvac_cnt++;
cv_acs.add(currenzyme.getAccession());
doc.addField("cv_names", currenzyme.getName());
List <String> synonyms = currenzyme.getSynonyms();
if(synonyms != null)
for (String synonym : synonyms) { System.err.println("enzyme synonym: " + synonym); // never shows up
doc.addField("cv_synonyms", synonym);
}
}
List<Interaction> interactions = entry.getInteractions();
//System.err.println(interactions.size() + " interactions");
for (Interaction currinteraction : interactions) {
//System.err.println(currinteraction.getEvidenceXrefAC()); // EBI-372273,EBI-603319
doc.addField("interactions", currinteraction.getEvidenceXrefAC());
List<Interactant> interactants = currinteraction.getInteractants();
//System.err.println(interactants.size() + " interactants");
for (Interactant currinteractant : interactants) {
//currinteractant.
//System.err.println(currinteractant.getNextprotAccession() + " " + currinteractant.getUrl());
List<Long> ll = Arrays.asList(currinteractant.getXrefId()); // findDbXRefByIds exists but not findDbXRefById
DbXref xref1 = this.dbxrefservice.findDbXRefByIds(ll).get(0);
List<DbXrefProperty> xrefprops = xref1.getProperties();
if(xrefprops != null)
for (DbXrefProperty xrefprop : xrefprops) {
//System.err.println("propname: " + xrefprop.getName());
} //else System.err.println("no properties for: " + xref1.getAccession());
//System.err.println("propval: " + xref1.getAccession());
//System.err.println("propval: " + xref1.getPropertyValue("gene designation"));
}
//doc.addField("interactions", interaction.getAccession());
}
// Xrefs
List<DbXref> xrefs = entry.getXrefs();
for (DbXref xref : xrefs) {
String acc = xref.getAccession();
String db = xref.getDatabaseName();
//System.err.println(db+":"+acc);
//if(db.equals("IntAct")) System.err.println("id " + xref.getDbXrefId() + ": " + xref.getPropertyValue("gene designation"));
/*if(db.equals("neXtProt")) {
if(acc.equals(id)) continue; // Internal stuff like NX_VG_10_51732257_248
String gen = xref.getPropertyValue("gene designation");
System.err.println("nonxeno: " + gen);
}
if(db.equals("UniProt") && !id.contains(acc)) {
String gen = xref.getPropertyValue("gene designation");
if(gen != null) { gen = gen.toUpperCase(); System.err.println("xeno: " + gen); }
else System.err.println("no gene for: " + acc );
} */
if(db.equals("HPA") && !acc.contains("ENSG")) doc.addField("antibody", acc);
else if(db.equals("PeptideAtlas") || db.equals("SRMAtlas")) doc.addField("peptide", acc + ", " + db + ":" + acc);
else if(db.equals("Ensembl")) doc.addField("ensembl", acc);
else doc.addField("xrefs", acc + ", " + db + ":" + acc);
}
// Publications
List<Publication> publications = entry.getPublications();
int publi_computed_count = 0;
int publi_curated_count = 0;
int publi_large_scale_count = 0;
for (Publication currpubli : publications) {
if(currpubli.getIsComputed() == true) publi_computed_count++;
if(currpubli.getIsCurated() == true) publi_curated_count++;
if(currpubli.getIsLargeScale() == true) publi_large_scale_count++;
String title = currpubli.getTitle();
if(title.length() > 0) doc.addField("publications",title);
SortedSet<PublicationAuthor> authors = currpubli.getAuthors();
for (PublicationAuthor currauthor : authors) {
doc.addField("publications",currauthor.getLastName() + " " + currauthor.getForeName() + " " + currauthor.getInitials());
}
Set<DbXref> pubxrefs = currpubli.getDbXrefs();
for (DbXref pubxref : pubxrefs) {
String acc = pubxref.getAccession();
String db = pubxref.getDatabaseName();
doc.addField("xrefs", acc + ", " + db + ":" + acc);
}
}
if(publi_computed_count > 0) doc.addField("publi_computed_count", publi_computed_count);
if(publi_curated_count > 0) doc.addField("publi_curated_count", publi_curated_count);
if(publi_large_scale_count > 0) doc.addField("publi_large_scale_count", publi_large_scale_count);
//doc.addField("orf_names", entry.()); -> together with gene synonyms
//doc.addField("ec_name", entry.getChromosomalLocations().get(0));
//doc.addField("expression", entry.());
//doc.addField("informational_score", entry.());
//doc.addField("interactions", entry.());
//doc.addField("region_name", entry.()); corresponds to 'includes' activity
/*
List<Terminology.TermProperty> properties = terminology.getPropertiesMap();
if (properties != null) {
doc.addField("properties",TerminologyUtils.convertPropertiesToString(properties));
} */
// Final CV acs, ancestors and synonyms
for (String cvac : cv_acs) {
CvTerm term = this.terminologyservice.findCvTermByAccession(cvac);
String category = term.getOntology();
//System.out.println(cvac + ": " + category);
//if(term == null) System.err.println("problem with " + cvac);
//else { System.err.println(cvac);
List<String> ancestors = term.getAncestorAccession();
if(ancestors != null)
for (String ancestor : ancestors)
cv_ancestors_acs.add(ancestor); // No duplicate: this is a Set
List<String> synonyms = term.getSynonyms();
if(synonyms != null) { //if (term.getOntology().startsWith("Go")) System.err.println("adding: " + synonyms.get(0));
for (String synonym : synonyms)
cv_synonyms.add(synonym.trim()); // No duplicate: this is a Set
}
}
// Index generated sets
for (String ancestorac : cv_ancestors_acs) {
doc.addField("cv_ancestors_acs", ancestorac);
doc.addField("cv_ancestors", this.terminologyservice.findCvTermByAccession(ancestorac).getName());
}
for (String synonym : cv_synonyms) {
doc.addField("cv_synonyms", synonym);
}
// Expression
SortedSet <String> cv_tissues_final = new TreeSet<String>();
for (String cv : cv_tissues) {
cv_tissues_final.add(cv); // No duplicate: this is a Set
if(cv.startsWith("TS-")) {
CvTerm term = this.terminologyservice.findCvTermByAccession(cv);
List<String> ancestors = term.getAncestorAccession();
if(ancestors != null)
for (String ancestorac : ancestors) {
cv_tissues_final.add(ancestorac); // No duplicate: this is a Set
cv_tissues_final.add(this.terminologyservice.findCvTermByAccession(ancestorac).getName()); // No duplicate: this is a Set
}
List<String> synonyms = term.getSynonyms();
if(synonyms != null) for (String synonym : synonyms) cv_tissues_final.add(synonym);
}
}
for (String cv : cv_tissues_final) doc.addField("expression", cv.trim());
return doc;
}
public TerminologyService getTerminologysAnnotationSolrIndexerervice() {
return terminologyservice;
}
public void setTerminologyservice(TerminologyService terminologyservice) {
this.terminologyservice = terminologyservice;
}
public DbXrefService getDbxrefSolrIndexerervice() {
return dbxrefservice;
}
public void setDbxrefservice(DbXrefService dbxrefservice) {
this.dbxrefservice = dbxrefservice;
}
public static long sortChr(String chr) {
// Allows to sort results based on chromosomal location
chr=chr.trim();
String[] chr_loc=chr.split("([pq]|cen)"); // split on p or q
long f_chr0=1000000;
long f_q=50000;
long f_chr1=1000;
int max_chr=50; // max chr localtion after pq
long chr0, chr1;
// push unknown chromosome at the end
if (chr.indexOf("unknown")>-1 || chr.equals("")) { return f_chr0*30; }
if(chr_loc[0].equalsIgnoreCase("x")){ chr0=23*f_chr0;}
else if(chr_loc[0].equalsIgnoreCase("y")) { chr0=24*f_chr0; }
else if(chr_loc[0].equalsIgnoreCase("mt")) { chr0=25*f_chr0;}
else { chr0=Integer.parseInt(chr_loc[0])*f_chr0; }
// sort(cen) = 10E5*XX + 10E4-1
if (chr.indexOf("cen")>-1) return chr0+f_q-1;
// sort(chr) = 10E5*XX
if (chr_loc.length==1) return (chr0);
// extract double value from digits after p or q
chr1=(long)( Double.parseDouble(chr_loc[1].split("[-,]")[0]) * f_chr1);
// sort(q) = 10E5*XX + 10E4 + 100*YY
if(chr.indexOf('q')>-1) return chr0+chr1+f_q;
// sort(p) = 10E6*XX + 1000*(45-YY) //descending order
return chr0 + f_chr1 * max_chr - chr1;
}
}