package org.nextprot.api.tasks.solr.indexer.entry.impl;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import org.nextprot.api.commons.constants.AnnotationCategory;
import org.nextprot.api.commons.utils.StringUtils;
import org.nextprot.api.core.domain.Entry;
import org.nextprot.api.core.domain.EntryUtils;
import org.nextprot.api.core.domain.Family;
import org.nextprot.api.core.domain.annotation.Annotation;
import org.nextprot.api.core.domain.annotation.AnnotationEvidence;
import org.nextprot.api.core.domain.annotation.AnnotationIsoformSpecificity;
import org.nextprot.api.core.domain.annotation.AnnotationProperty;
import org.nextprot.api.core.utils.TerminologyUtils;
import org.nextprot.api.solr.index.EntryIndex.Fields;
import org.nextprot.api.tasks.solr.indexer.entry.EntryFieldBuilder;
import org.nextprot.api.tasks.solr.indexer.entry.FieldBuilder;
/**
 * Fills the {@code ANNOTATIONS} and {@code FUNCTION_DESC} index fields of an entry from its
 * function information, its annotations and the families listed in its overview.
 */
@EntryFieldBuilder
public class AnnotationFieldBuilder extends FieldBuilder {

    /**
     * Collects every indexable annotation token of the given entry.
     *
     * @param entry the entry whose function info, annotations and families are indexed
     */
    @Override
    protected void init(Entry entry) {
        // Function with canonical first
        List<String> functionCanonical = EntryUtils.getFunctionInfoWithCanonicalFirst(entry);
        for (String finfo : functionCanonical) {
            addField(Fields.FUNCTION_DESC, finfo);
            addField(Fields.ANNOTATIONS, finfo);
        }

        for (Annotation currannot : entry.getAnnotations()) {
            indexAnnotation(currannot);
        }

        // Families (why not part of Annotations ?), always GOLD
        for (Family family : entry.getOverview().getFamilies()) {
            indexFamily(family);
        }
    }

    /**
     * Indexes a single annotation: its description, its CV term (with synonyms and ancestors),
     * and the category-specific identifiers (chain ids, variant xrefs).
     */
    private void indexAnnotation(Annotation currannot) {
        String category = currannot.getCategory();
        AnnotationCategory apiCategory = currannot.getAPICategory();
        String quality = currannot.getQualityQualifier();

        if (apiCategory.equals(AnnotationCategory.FUNCTION_INFO)
                || apiCategory.equals(AnnotationCategory.EXPRESSION_PROFILE)) {
            // Function info was just processed via the dedicated EntryUtils method, and tissue
            // specificity values are indexed under other fields
            return;
        }

        String desc = currannot.getDescription();
        if (apiCategory.equals(AnnotationCategory.GLYCOSYLATION_SITE)) {
            String xref = currannot.getSynonym();
            if (xref != null) {
                // It is actually not a synonym but the carbohydrate id from glycosuitedb !
                addField(Fields.ANNOTATIONS, xref);
            }
        } else if (apiCategory.equals(AnnotationCategory.DNA_BINDING_REGION)) {
            addField(Fields.ANNOTATIONS, category);
        } else if (apiCategory.equals(AnnotationCategory.VARIANT)) {
            // We need to index them somehow for the GOLD/SILVER tests, or do we ? It creates a lot
            // of useless 'Variant null' tokens when the description is null (kept on purpose so
            // the indexed content stays unchanged).
            desc = "Variant " + desc;
        }

        if (desc != null) {
            if (apiCategory.equals(AnnotationCategory.SEQUENCE_CAUTION)) {
                desc = indexSequenceCautionReason(desc);
            }
            // go will be indexed via cvac, not description
            if (!category.startsWith("go") && desc.length() > 1 && passesQualityFilter(quality)) {
                if (apiCategory.equals(AnnotationCategory.PHENOTYPIC_VARIATION)) {
                    // Get BED data (also get the notes ? )
                    Map<String, AnnotationIsoformSpecificity> annotSpecs = currannot.getTargetingIsoformsMap();
                    for (AnnotationIsoformSpecificity specificity : annotSpecs.values()) {
                        // prefix the description with the subject for each target isoform
                        addField(Fields.ANNOTATIONS, specificity.getName() + " " + desc);
                    }
                } else {
                    addField(Fields.ANNOTATIONS, desc);
                }
            }
            // in pathway and disease new annotations may appear due to transformation of specific
            // xrefs (orphanet...) into annotations in the api
        }

        String cvac = currannot.getCvTermAccessionCode();
        if (cvac != null && !cvac.isEmpty()) {
            if (cvac.startsWith("GO:") && hasOnlyNegativeEvidences(currannot)) {
                // We don't index negative annotations; nothing else is indexed for this one
                return;
            }
            if (passesQualityFilter(quality)) {
                indexCvTerm(currannot, cvac);
            }
        }

        if (apiCategory.equals(AnnotationCategory.MATURE_PROTEIN)
                || apiCategory.equals(AnnotationCategory.MATURATION_PEPTIDE)) {
            indexChainIdentifiers(currannot, desc);
        }

        // variant xrefs and identifiers
        if (apiCategory.equals(AnnotationCategory.VARIANT)) {
            indexVariantIdentifiers(currannot, desc, quality);
        }
    }

    /**
     * Tells whether a value with the given quality may be indexed: always when {@code isGold()}
     * is false, otherwise only when the quality is exactly "GOLD". A null quality never counts
     * as GOLD (it used to raise a NullPointerException).
     */
    private boolean passesQualityFilter(String quality) {
        return !this.isGold() || "GOLD".equals(quality);
    }

    /**
     * Returns true unless at least one evidence of the annotation is not negative. An empty or
     * null evidence list therefore yields true, as an empty list did before.
     */
    private boolean hasOnlyNegativeEvidences(Annotation annotation) {
        List<AnnotationEvidence> evidences = annotation.getEvidences();
        if (evidences != null) {
            for (AnnotationEvidence ev : evidences) {
                if (!ev.isNegativeEvidence()) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Indexes the main reason of a sequence caution and, when present, the text following it.
     * Expected input looks like
     * "The sequence AAH70170 differs from that shown. Reason: miscellaneous discrepancy".
     *
     * @param desc the raw sequence caution description, not null
     * @return the text the caller should go on indexing as the description: the part after the
     *         main reason when the description could be parsed, or {@code desc} unchanged when
     *         it has no usable ": reason" section
     */
    private String indexSequenceCautionReason(String desc) {
        String[] colonParts = desc.split(":");
        if (colonParts.length < 2 || colonParts[1].isEmpty()) {
            // Unexpected format: let the caller index the description as it is
            return desc;
        }
        String reason = colonParts[1].substring(1); // drop the blank following the colon
        String[] desclevels = reason.split("\\.");
        if (desclevels.length == 0) {
            // The reason consists of dots only: split() yields no token at all
            return reason;
        }

        String mainreason = desclevels[0];
        int stringpos = mainreason.indexOf(" at position");
        if (stringpos != -1) {
            // truncate the position
            mainreason = mainreason.substring(0, stringpos);
        }
        addField(Fields.ANNOTATIONS, mainreason);

        if (desclevels.length > 1) {
            if (stringpos > 0) { // mainreason truncated
                reason = reason.substring(reason.indexOf(".") + 2);
            } else {
                stringpos = reason.indexOf(mainreason) + mainreason.length();
                reason = reason.substring(stringpos + 2);
            }
            addField(Fields.ANNOTATIONS, reason);
        }
        return reason;
    }

    /**
     * Indexes a CV term accession together with its name, its synonyms and all its ancestors
     * (accessions and names).
     */
    private void indexCvTerm(Annotation currannot, String cvac) {
        addField(Fields.ANNOTATIONS, cvac);
        addField(Fields.ANNOTATIONS, currannot.getCvTermName());

        List<String> synonyms = this.terminologyservice.findCvTermByAccession(cvac).getSynonyms();
        if (synonyms != null) {
            StringBuilder allsynonyms = new StringBuilder();
            for (String synonym : synonyms) {
                if (allsynonyms.length() > 0) {
                    allsynonyms.append(" | ");
                }
                allsynonyms.append(synonym.trim());
            }
            addField(Fields.ANNOTATIONS, StringUtils.getSortedValueFromPipeSeparatedField(allsynonyms.toString()));
        }

        List<String> ancestors = TerminologyUtils.getAllAncestorsAccession(cvac, terminologyservice);
        StringBuilder ancestorTokens = new StringBuilder();
        for (String ancestor : ancestors) {
            if (ancestorTokens.length() > 0) {
                ancestorTokens.append(" | ");
            }
            ancestorTokens.append(ancestor).append(" | "); // adding Ac
            String ancestorname = this.terminologyservice.findCvTermByAccession(ancestor).getName();
            ancestorTokens.append(ancestorname);
        }
        String allancestors = ancestorTokens.toString();

        // don't index generic top level ancestors
        if (allancestors.endsWith(" domain")) {
            allancestors = "domain";
        } else if (allancestors.endsWith("zinc finger region")) {
            allancestors = "zinc finger region";
        } else if (allancestors.endsWith("repeat")) {
            allancestors = "repeat";
        }
        if (allancestors.length() > 1) {
            addField(Fields.ANNOTATIONS, StringUtils.getSortedValueFromPipeSeparatedField(allancestors));
        }
    }

    /**
     * Indexes the chain identifier(s) of a mature protein / maturation peptide annotation.
     *
     * @param desc the annotation description, combined with the chain id when the annotation
     *             has exactly one synonym
     */
    private void indexChainIdentifiers(Annotation currannot, String desc) {
        String chainid = currannot.getSynonym();
        if (chainid == null) {
            // chainid is null for the main chain, this is wrong
            return;
        }
        if (chainid.contains("-")) {
            // Uniprot FT id, like PRO_0000019235, shouldn't be called a synonym
            addField(Fields.ANNOTATIONS, chainid);
            return;
        }
        List<String> chainsynonyms = currannot.getSynonyms();
        if (chainsynonyms.size() == 1) {
            addField(Fields.ANNOTATIONS, StringUtils.getSortedValueFromPipeSeparatedField(desc + " | " + chainid));
        } else {
            StringBuilder allchainids = new StringBuilder();
            for (String syno : chainsynonyms) {
                allchainids.append(syno).append(" | ");
            }
            addField(Fields.ANNOTATIONS, StringUtils.getSortedValueFromPipeSeparatedField(allchainids.toString()));
        }
    }

    /**
     * Indexes the xref accessions found in the evidences of a variant annotation and, subject
     * to the quality filter, its "mutation AA" property values.
     *
     * @param desc the (already prefixed) variant description, only used in the diagnostic message
     */
    private void indexVariantIdentifiers(Annotation currannot, String desc, String quality) {
        StringBuilder evidxrefaccs = new StringBuilder();
        List<AnnotationEvidence> evidences = currannot.getEvidences();
        if (evidences != null) {
            for (AnnotationEvidence ev : evidences) {
                if (!ev.isResourceAXref()) {
                    continue;
                }
                String db = ev.getResourceDb();
                if (db == null) {
                    System.err.println("db is null for evidence in variant annot: " + desc);
                    continue;
                }
                if (evidxrefaccs.length() > 0) {
                    evidxrefaccs.append(" | ");
                }
                if (db.equals("Cosmic")) {
                    evidxrefaccs.append(db.toLowerCase()).append(":").append(ev.getResourceAccession());
                } else if (db.equals("dbSNP")) {
                    // Just to allow comparison with incoherent current solr implementation
                    evidxrefaccs.append(ev.getResourceAccession());
                } else {
                    evidxrefaccs.append(currannot.getSynonym()); // Uniprot FT id, like VAR_056577
                }
            }
        }

        if (passesQualityFilter(quality)) {
            if (evidxrefaccs.length() > 0) {
                addField(Fields.ANNOTATIONS, StringUtils.getSortedValueFromPipeSeparatedField(evidxrefaccs.toString()));
            }
            Collection<AnnotationProperty> props = currannot.getProperties();
            for (AnnotationProperty prop : props) {
                if ("mutation AA".equals(prop.getName())) {
                    // eg: p.D1685E, it is unclear why this property exists only in cosmic variants
                    addField(Fields.ANNOTATIONS, prop.getValue());
                }
            }
        }
    }

    /**
     * Indexes a family: its accession, its full description, the description stripped of the
     * leading "Belongs to the " and of the final dot, each sub-level when there are several,
     * and the synonyms of the family term.
     */
    private void indexFamily(Family family) {
        String ac = family.getAccession();
        addField(Fields.ANNOTATIONS, ac);

        String famdesc = family.getDescription();
        // There is no get_synonyms() method for families -> can't access PERVR for FA-04785
        if (famdesc != null) {
            addField(Fields.ANNOTATIONS, famdesc);

            // Skip the 'Belongs to the ' and what may come before (eg: NX_P19021). The offset 14
            // covers "elongs to " plus "the "; it is only applied when the marker is present
            // and is clamped so that a short description cannot overflow.
            int marker = famdesc.indexOf("elongs to ");
            if (marker != -1) {
                famdesc = famdesc.substring(Math.min(marker + 14, famdesc.length()));
            }
            if (famdesc.endsWith(".")) {
                famdesc = famdesc.substring(0, famdesc.length() - 1); // remove final dot
            }
            addField(Fields.ANNOTATIONS, famdesc);

            String[] families = famdesc.split("\\. "); // are there subfamilies ?
            if (families.length > 1) { // Always GOLD
                for (String level : families) {
                    addField(Fields.ANNOTATIONS, level);
                    int openParen = level.indexOf("(");
                    if (level.contains(") superfamily") && openParen >= 0) {
                        // index one more time without parenthesis
                        addField(Fields.ANNOTATIONS, level.substring(0, openParen) + "superfamily");
                    }
                }
            }
        }

        // Sometimes these synonyms are wrong eg: NX_Q6NUT3 -> Major facilitator (TC 2.A.1) superfamily
        List<String> famsynonyms = this.terminologyservice.findCvTermByAccession(ac).getSynonyms();
        if (famsynonyms != null) {
            for (String famsynonym : famsynonyms) {
                addField(Fields.ANNOTATIONS, famsynonym.trim());
            }
        }
    }

    /**
     * @return the index fields this builder populates: ANNOTATIONS and FUNCTION_DESC
     */
    @Override
    public Collection<Fields> getSupportedFields() {
        return Arrays.asList(Fields.ANNOTATIONS, Fields.FUNCTION_DESC);
    }
}