/*
* Copyright (C) 2010-2013 "Bio4j"
*
* This file is part of Bio4j
*
* Bio4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package com.bio4j.neo4jdb.programs;
import com.bio4j.neo4jdb.model.nodes.*;
import com.bio4j.neo4jdb.model.nodes.citation.*;
import com.bio4j.neo4jdb.model.nodes.reactome.ReactomeTermNode;
import com.bio4j.neo4jdb.model.nodes.refseq.GenomeElementNode;
import com.bio4j.neo4jdb.model.relationships.InstituteCountryRel;
import com.bio4j.neo4jdb.model.relationships.IsoformEventGeneratorRel;
import com.bio4j.neo4jdb.model.relationships.SubcellularLocationParentRel;
import com.bio4j.neo4jdb.model.relationships.TaxonParentRel;
import com.bio4j.neo4jdb.model.relationships.aproducts.AlternativeProductInitiationRel;
import com.bio4j.neo4jdb.model.relationships.aproducts.AlternativeProductPromoterRel;
import com.bio4j.neo4jdb.model.relationships.aproducts.AlternativeProductRibosomalFrameshiftingRel;
import com.bio4j.neo4jdb.model.relationships.aproducts.AlternativeProductSplicingRel;
import com.bio4j.neo4jdb.model.relationships.citation.article.ArticleAuthorRel;
import com.bio4j.neo4jdb.model.relationships.citation.article.ArticleJournalRel;
import com.bio4j.neo4jdb.model.relationships.citation.article.ArticleProteinCitationRel;
import com.bio4j.neo4jdb.model.relationships.citation.book.*;
import com.bio4j.neo4jdb.model.relationships.citation.onarticle.OnlineArticleAuthorRel;
import com.bio4j.neo4jdb.model.relationships.citation.onarticle.OnlineArticleJournalRel;
import com.bio4j.neo4jdb.model.relationships.citation.onarticle.OnlineArticleProteinCitationRel;
import com.bio4j.neo4jdb.model.relationships.citation.patent.PatentAuthorRel;
import com.bio4j.neo4jdb.model.relationships.citation.patent.PatentProteinCitationRel;
import com.bio4j.neo4jdb.model.relationships.citation.submission.SubmissionAuthorRel;
import com.bio4j.neo4jdb.model.relationships.citation.submission.SubmissionDbRel;
import com.bio4j.neo4jdb.model.relationships.citation.submission.SubmissionProteinCitationRel;
import com.bio4j.neo4jdb.model.relationships.citation.thesis.ThesisAuthorRel;
import com.bio4j.neo4jdb.model.relationships.citation.thesis.ThesisInstituteRel;
import com.bio4j.neo4jdb.model.relationships.citation.thesis.ThesisProteinCitationRel;
import com.bio4j.neo4jdb.model.relationships.citation.uo.UnpublishedObservationAuthorRel;
import com.bio4j.neo4jdb.model.relationships.citation.uo.UnpublishedObservationProteinCitationRel;
import com.bio4j.neo4jdb.model.relationships.comment.*;
import com.bio4j.neo4jdb.model.relationships.features.*;
import com.bio4j.neo4jdb.model.relationships.protein.*;
import com.bio4j.neo4jdb.model.util.Bio4jManager;
import com.bio4j.neo4jdb.model.util.UniprotStuff;
import com.ohnosequences.util.Executable;
import com.ohnosequences.xml.model.bio4j.UniprotDataXML;
import com.ohnosequences.xml.api.model.XMLElement;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.FileHandler;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.logging.SimpleFormatter;
import org.jdom2.Element;
import org.neo4j.graphdb.index.IndexHits;
import org.neo4j.helpers.collection.MapUtil;
import org.neo4j.index.lucene.unsafe.batchinsert.LuceneBatchInserterIndexProvider;
import org.neo4j.unsafe.batchinsert.*;
/**
* This class handles the main part of the Bio4j import process.
* The Gene Ontology import (ImportGeneOntology) must have been performed prior to this step.
* Features, comments, Gene Ontology annotations and all information directly
* related to entries are imported in this step (except protein interactions
* and isoform sequences).
*
* @author Pablo Pareja Tobes <ppareja@era7.com>
*/
public class ImportUniprot implements Executable {
// Logger for this program; main attaches the file handler below to it and sets its level to ALL.
private static final Logger logger = Logger.getLogger("ImportUniprot");
// File handler created in main; it writes to "ImportUniprot" + <first argument up to its first '.'> + ".log".
private static FileHandler fh;
//------------------nodes properties maps-----------------------------------
// One shared, mutable property map per node type. In main, most of these maps are
// seeded with their node type entry (NODE_TYPE_PROPERTY -> NODE_TYPE) before the
// input file is read; proteinProperties is additionally overwritten with the values
// parsed from every entry.
// NOTE(review): reactomeTermProperties and dbProperties do not appear in the
// "initializing node type properties" section of main - confirm they are seeded elsewhere.
public static Map<String, Object> organismProperties = new HashMap<>();
public static Map<String, Object> proteinProperties = new HashMap<>();
public static Map<String, Object> keywordProperties = new HashMap<>();
public static Map<String, Object> subcellularLocationProperties = new HashMap<>();
public static Map<String, Object> interproProperties = new HashMap<>();
public static Map<String, Object> pfamProperties = new HashMap<>();
public static Map<String, Object> taxonProperties = new HashMap<>();
public static Map<String, Object> datasetProperties = new HashMap<>();
public static Map<String, Object> personProperties = new HashMap<>();
public static Map<String, Object> consortiumProperties = new HashMap<>();
public static Map<String, Object> instituteProperties = new HashMap<>();
public static Map<String, Object> thesisProperties = new HashMap<>();
public static Map<String, Object> bookProperties = new HashMap<>();
public static Map<String, Object> patentProperties = new HashMap<>();
public static Map<String, Object> articleProperties = new HashMap<>();
public static Map<String, Object> submissionProperties = new HashMap<>();
public static Map<String, Object> onlineArticleProperties = new HashMap<>();
public static Map<String, Object> unpublishedObservationProperties = new HashMap<>();
public static Map<String, Object> publisherProperties = new HashMap<>();
public static Map<String, Object> cityProperties = new HashMap<>();
public static Map<String, Object> journalProperties = new HashMap<>();
public static Map<String, Object> onlineJournalProperties = new HashMap<>();
public static Map<String, Object> countryProperties = new HashMap<>();
public static Map<String, Object> isoformProperties = new HashMap<>();
public static Map<String, Object> commentTypeProperties = new HashMap<>();
public static Map<String, Object> featureTypeProperties = new HashMap<>();
public static Map<String, Object> reactomeTermProperties = new HashMap<>();
public static Map<String, Object> dbProperties = new HashMap<>();
//---------------------------------------------------------------------
//-------------------relationships properties maps--------------------------
// Shared, mutable property maps for relationships that carry properties.
// NOTE(review): presumably each map is filled right before the matching relationship
// is created and then reused for the next one - the code that populates them is not
// next to these declarations; confirm against the rest of the file.
public static Map<String, Object> proteinGoProperties = new HashMap<>();
public static Map<String, Object> proteinSubcellularLocationProperties = new HashMap<>();
public static Map<String, Object> bookProteinCitationProperties = new HashMap<>();
public static Map<String, Object> articleJournalProperties = new HashMap<>();
public static Map<String, Object> onlineArticleJournalProperties = new HashMap<>();
public static Map<String, Object> commentProperties = new HashMap<>();
public static Map<String, Object> onlineInformationCommentProperties = new HashMap<>();
public static Map<String, Object> biophysicochemicalCommentProperties = new HashMap<>();
public static Map<String, Object> rnaEditingCommentProperties = new HashMap<>();
public static Map<String, Object> massSpectrometryCommentProperties = new HashMap<>();
public static Map<String, Object> featureProperties = new HashMap<>();
public static Map<String, Object> sequenceCautionProperties = new HashMap<>();
//----------------------------------------------------------------------------
//--------------------------------relationships------------------------------------------
// One shared instance per relationship class, each constructed with a null argument.
// NOTE(review): presumably these serve only as relationship type descriptors for the
// batch inserter and never wrap a real relationship - confirm against their usage.
public static ProteinGoRel proteinGoRel = new ProteinGoRel(null);
public static ProteinOrganismRel proteinOrganismRel = new ProteinOrganismRel(null);
public static TaxonParentRel taxonParentRel = new TaxonParentRel(null);
public static ProteinKeywordRel proteinKeywordRel = new ProteinKeywordRel(null);
public static ProteinDatasetRel proteinDatasetRel = new ProteinDatasetRel(null);
public static ProteinInterproRel proteinInterproRel = new ProteinInterproRel(null);
public static ProteinPfamRel proteinPfamRel = new ProteinPfamRel(null);
public static ProteinSubcellularLocationRel proteinSubcellularLocationRel = new ProteinSubcellularLocationRel(null);
public static SubcellularLocationParentRel subcellularLocationParentRel = new SubcellularLocationParentRel(null);
public static ThesisAuthorRel thesisAuthorRel = new ThesisAuthorRel(null);
public static ThesisInstituteRel thesisInstituteRel = new ThesisInstituteRel(null);
public static ThesisProteinCitationRel thesisProteinCitationRel = new ThesisProteinCitationRel(null);
public static PatentAuthorRel patentAuthorRel = new PatentAuthorRel(null);
public static PatentProteinCitationRel patentProteinCitationRel = new PatentProteinCitationRel(null);
public static SubmissionAuthorRel submissionAuthorRel = new SubmissionAuthorRel(null);
public static SubmissionProteinCitationRel submissionProteinCitationRel = new SubmissionProteinCitationRel(null);
public static SubmissionDbRel submissionDbRel = new SubmissionDbRel(null);
public static BookAuthorRel bookAuthorRel = new BookAuthorRel(null);
public static BookProteinCitationRel bookProteinCitationRel = new BookProteinCitationRel(null);
public static BookEditorRel bookEditorRel = new BookEditorRel(null);
public static BookCityRel bookCityRel = new BookCityRel(null);
public static BookPublisherRel bookPublisherRel = new BookPublisherRel(null);
public static ArticleAuthorRel articleAuthorRel = new ArticleAuthorRel(null);
public static ArticleJournalRel articleJournalRel = new ArticleJournalRel(null);
public static ArticleProteinCitationRel articleProteinCitationRel = new ArticleProteinCitationRel(null);
public static OnlineArticleAuthorRel onlineArticleAuthorRel = new OnlineArticleAuthorRel(null);
public static OnlineArticleJournalRel onlineArticleJournalRel = new OnlineArticleJournalRel(null);
public static OnlineArticleProteinCitationRel onlineArticleProteinCitationRel = new OnlineArticleProteinCitationRel(null);
public static UnpublishedObservationAuthorRel unpublishedObservationAuthorRel = new UnpublishedObservationAuthorRel(null);
public static UnpublishedObservationProteinCitationRel unpublishedObservationProteinCitationRel = new UnpublishedObservationProteinCitationRel(null);
public static InstituteCountryRel instituteCountryRel = new InstituteCountryRel(null);
public static IsoformEventGeneratorRel isoformEventGeneratorRel = new IsoformEventGeneratorRel(null);
public static ProteinIsoformRel proteinIsoformRel = new ProteinIsoformRel(null);
public static ProteinErroneousGeneModelPredictionRel proteinErroneousGeneModelPredictionRel = new ProteinErroneousGeneModelPredictionRel(null);
public static ProteinErroneousInitiationRel proteinErroneousInitiationRel = new ProteinErroneousInitiationRel(null);
public static ProteinErroneousTerminationRel proteinErroneousTerminationRel = new ProteinErroneousTerminationRel(null);
public static ProteinErroneousTranslationRel proteinErroneousTranslationRel = new ProteinErroneousTranslationRel(null);
public static ProteinFrameshiftRel proteinFrameshiftRel = new ProteinFrameshiftRel(null);
public static ProteinMiscellaneousDiscrepancyRel proteinMiscellaneousDiscrepancyRel = new ProteinMiscellaneousDiscrepancyRel(null);
public static ProteinGenomeElementRel proteinGenomeElementRel = new ProteinGenomeElementRel(null);
public static ProteinReactomeRel proteinReactomeRel = new ProteinReactomeRel(null);
public static ProteinEnzymaticActivityRel proteinEnzymaticActivityRel = new ProteinEnzymaticActivityRel(null);
//----comment relationships-----
// One shared instance per comment relationship class, each constructed with a null argument.
// NOTE(review): presumably one per UniProt comment type, used as the relationship type
// when a protein is linked to a comment - confirm against their usage.
public static AllergenCommentRel allergenCommentRel = new AllergenCommentRel(null);
public static BioPhysicoChemicalPropertiesCommentRel bioPhysicoChemicalPropertiesCommentRel = new BioPhysicoChemicalPropertiesCommentRel(null);
public static BiotechnologyCommentRel biotechnologyCommentRel = new BiotechnologyCommentRel(null);
public static CatalyticActivityCommentRel catalyticActivityCommentRel = new CatalyticActivityCommentRel(null);
public static CautionCommentRel cautionCommentRel = new CautionCommentRel(null);
public static CofactorCommentRel cofactorCommentRel = new CofactorCommentRel(null);
public static DevelopmentalStageCommentRel developmentalStageCommentRel = new DevelopmentalStageCommentRel(null);
public static DiseaseCommentRel diseaseCommentRel = new DiseaseCommentRel(null);
public static DisruptionPhenotypeCommentRel disruptionPhenotypeCommentRel = new DisruptionPhenotypeCommentRel(null);
public static DomainCommentRel domainCommentRel = new DomainCommentRel(null);
public static EnzymeRegulationCommentRel enzymeRegulationCommentRel = new EnzymeRegulationCommentRel(null);
public static FunctionCommentRel functionCommentRel = new FunctionCommentRel(null);
public static InductionCommentRel inductionCommentRel = new InductionCommentRel(null);
public static MassSpectrometryCommentRel massSpectrometryCommentRel = new MassSpectrometryCommentRel(null);
public static MiscellaneousCommentRel miscellaneousCommentRel = new MiscellaneousCommentRel(null);
public static OnlineInformationCommentRel onlineInformationCommentRel = new OnlineInformationCommentRel(null);
public static PathwayCommentRel pathwayCommentRel = new PathwayCommentRel(null);
public static PharmaceuticalCommentRel pharmaceuticalCommentRel = new PharmaceuticalCommentRel(null);
public static PolymorphismCommentRel polymorphismCommentRel = new PolymorphismCommentRel(null);
public static PostTranslationalModificationCommentRel postTranslationalModificationCommentRel = new PostTranslationalModificationCommentRel(null);
public static RnaEditingCommentRel rnaEditingCommentRel = new RnaEditingCommentRel(null);
public static SimilarityCommentRel similarityCommentRel = new SimilarityCommentRel(null);
public static SubunitCommentRel subunitCommentRel = new SubunitCommentRel(null);
public static TissueSpecificityCommentRel tissueSpecificityCommentRel = new TissueSpecificityCommentRel(null);
public static ToxicDoseCommentRel toxicDoseCommentRel = new ToxicDoseCommentRel(null);
//features relationships------------------------------------------
// One shared instance per feature relationship class, each constructed with a null argument.
// NOTE(review): presumably one per UniProt feature type, used as the relationship type
// when a protein is linked to a feature - confirm against their usage.
public static ActiveSiteFeatureRel activeSiteFeatureRel = new ActiveSiteFeatureRel(null);
public static BindingSiteFeatureRel bindingSiteFeatureRel = new BindingSiteFeatureRel(null);
public static CalciumBindingRegionFeatureRel calciumBindingRegionFeatureRel = new CalciumBindingRegionFeatureRel(null);
public static ChainFeatureRel chainFeatureRel = new ChainFeatureRel(null);
public static CoiledCoilRegionFeatureRel coiledCoilRegionFeatureRel = new CoiledCoilRegionFeatureRel(null);
public static CompositionallyBiasedRegionFeatureRel compositionallyBiasedRegionFeatureRel = new CompositionallyBiasedRegionFeatureRel(null);
public static CrossLinkFeatureRel crossLinkFeatureRel = new CrossLinkFeatureRel(null);
public static DisulfideBondFeatureRel disulfideBondFeatureRel = new DisulfideBondFeatureRel(null);
public static DnaBindingRegionFeatureRel dnaBindingRegionFeatureRel = new DnaBindingRegionFeatureRel(null);
public static DomainFeatureRel domainFeatureRel = new DomainFeatureRel(null);
public static GlycosylationSiteFeatureRel glycosylationSiteFeatureRel = new GlycosylationSiteFeatureRel(null);
public static HelixFeatureRel helixFeatureRel = new HelixFeatureRel(null);
public static InitiatorMethionineFeatureRel initiatorMethionineFeatureRel = new InitiatorMethionineFeatureRel(null);
public static IntramembraneRegionFeatureRel intramembraneRegionFeatureRel = new IntramembraneRegionFeatureRel(null);
public static LipidMoietyBindingRegionFeatureRel lipidMoietyBindingRegionFeatureRel = new LipidMoietyBindingRegionFeatureRel(null);
public static MetalIonBindingSiteFeatureRel metalIonBindingSiteFeatureRel = new MetalIonBindingSiteFeatureRel(null);
public static ModifiedResidueFeatureRel modifiedResidueFeatureRel = new ModifiedResidueFeatureRel(null);
public static MutagenesisSiteFeatureRel mutagenesisSiteFeatureRel = new MutagenesisSiteFeatureRel(null);
public static NonConsecutiveResiduesFeatureRel nonConsecutiveResiduesFeatureRel = new NonConsecutiveResiduesFeatureRel(null);
public static NonStandardAminoAcidFeatureRel nonStandardAminoAcidFeatureRel = new NonStandardAminoAcidFeatureRel(null);
public static NonTerminalResidueFeatureRel nonTerminalResidueFeatureRel = new NonTerminalResidueFeatureRel(null);
public static NucleotidePhosphateBindingRegionFeatureRel nucleotidePhosphateBindingRegionFeatureRel = new NucleotidePhosphateBindingRegionFeatureRel(null);
public static PeptideFeatureRel peptideFeatureRel = new PeptideFeatureRel(null);
public static PropeptideFeatureRel propeptideFeatureRel = new PropeptideFeatureRel(null);
public static RegionOfInterestFeatureRel regionOfInterestFeatureRel = new RegionOfInterestFeatureRel(null);
public static RepeatFeatureRel repeatFeatureRel = new RepeatFeatureRel(null);
public static SequenceConflictFeatureRel sequenceConflictFeatureRel = new SequenceConflictFeatureRel(null);
public static SequenceVariantFeatureRel sequenceVariantFeatureRel = new SequenceVariantFeatureRel(null);
public static ShortSequenceMotifFeatureRel shortSequenceMotifFeatureRel = new ShortSequenceMotifFeatureRel(null);
public static SignalPeptideFeatureRel signalPeptideFeatureRel = new SignalPeptideFeatureRel(null);
public static SiteFeatureRel siteFeatureRel = new SiteFeatureRel(null);
public static SpliceVariantFeatureRel spliceVariantFeatureRel = new SpliceVariantFeatureRel(null);
public static StrandFeatureRel strandFeatureRel = new StrandFeatureRel(null);
public static TopologicalDomainFeatureRel topologicalDomainFeatureRel = new TopologicalDomainFeatureRel(null);
public static TransitPeptideFeatureRel transitPeptideFeatureRel = new TransitPeptideFeatureRel(null);
public static TransmembraneRegionFeatureRel transmembraneRegionFeatureRel = new TransmembraneRegionFeatureRel(null);
public static TurnFeatureRel turnFeatureRel = new TurnFeatureRel(null);
public static UnsureResidueFeatureRel unsureResidueFeatureRel = new UnsureResidueFeatureRel(null);
public static ZincFingerRegionFeatureRel zincFingerRegionFeatureRel = new ZincFingerRegionFeatureRel(null);
//------------------------------------------------------------------------------------------------
//------------------------------------------------------------------------------------------------
//-----other things....------
// Node ids of the "main" nodes for the alternative product types and the sequence
// caution types. main looks each of them up (it does not create them) in the main
// nodes index, before any entry is processed.
public static long alternativeProductInitiationId;
public static long alternativeProductPromoterId;
public static long alternativeProductSplicingId;
public static long alternativeProductRibosomalFrameshiftingId;
public static long seqCautionErroneousInitiationId;
public static long seqCautionErroneousTranslationId;
public static long seqCautionFrameshiftId;
public static long seqCautionErroneousTerminationId;
public static long seqCautionMiscellaneousDiscrepancyId;
public static long seqCautionErroneousGeneModelPredictionId;
//---------------------------------
//--------indexing API constants-----
// Keys (PROVIDER_ST, TYPE_ST) and values (LUCENE_ST, EXACT_ST, FULL_TEXT_ST) of the
// index configuration maps that main builds with MapUtil.stringMap(...) and passes to
// indexProvider.nodeIndex(...). Declared final: they are constants and must never be reassigned.
private static final String PROVIDER_ST = "provider";
private static final String EXACT_ST = "exact";
private static final String FULL_TEXT_ST = "fulltext";
private static final String LUCENE_ST = "lucene";
private static final String TYPE_ST = "type";
//-----------------------------------
/**
 * {@link Executable} entry point: converts the argument list into a
 * String array, preserving order, and delegates to {@link #main(String[])}.
 *
 * @param array program arguments, in the order main expects them
 */
@Override
public void execute(ArrayList<String> array) {
    // toArray copies the elements in list order, exactly like an index-by-index copy.
    String[] programArgs = array.toArray(new String[0]);
    main(programArgs);
}
public static void main(String[] args) {
if (args.length != 4) {
System.out.println("This program expects the following parameters: \n"
+ "1. Uniprot xml filename \n"
+ "2. Bio4j DB folder \n"
+ "3. batch inserter .properties file \n"
+ "4. Config XML file");
} else {
long initTime = System.nanoTime();
File inFile = new File(args[0]);
File configFile = new File(args[3]);
String currentAccessionId = "";
BatchInserter inserter = null;
BatchInserterIndexProvider indexProvider = null;
BufferedWriter enzymeIdsNotFoundBuff = null;
BufferedWriter statsBuff = null;
int proteinCounter = 0;
int limitForPrintingOut = 10000;
try {
// This block configures the logger with handler and formatter
fh = new FileHandler("ImportUniprot" + args[0].split("\\.")[0] + ".log", false);
SimpleFormatter formatter = new SimpleFormatter();
fh.setFormatter(formatter);
logger.addHandler(fh);
logger.setLevel(Level.ALL);
System.out.println("Reading conf file...");
BufferedReader reader = new BufferedReader(new FileReader(configFile));
String line;
StringBuilder stBuilder = new StringBuilder();
while ((line = reader.readLine()) != null) {
stBuilder.append(line);
}
reader.close();
UniprotDataXML uniprotDataXML = new UniprotDataXML(stBuilder.toString());
//---creating writer for enzymes not found file-----
enzymeIdsNotFoundBuff = new BufferedWriter(new FileWriter(new File("EnzymeIdsNotFound.log")));
//---creating writer for stats file-----
statsBuff = new BufferedWriter(new FileWriter(new File("ImportUniprotStats_" + inFile.getName().split("\\.")[0] + ".txt")));
// create the batch inserter
inserter = BatchInserters.inserter(args[1], MapUtil.load(new File(args[2])));
// create the batch index service
indexProvider = new LuceneBatchInserterIndexProvider(inserter);
//-----------------create batch indexes----------------------------------
//----------------------------------------------------------------------
BatchInserterIndex proteinAccessionIndex = indexProvider.nodeIndex(ProteinNode.PROTEIN_ACCESSION_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex proteinFullNameFullTextIndex = indexProvider.nodeIndex(ProteinNode.PROTEIN_FULL_NAME_FULL_TEXT_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, FULL_TEXT_ST));
BatchInserterIndex proteinGeneNamesFullTextIndex = indexProvider.nodeIndex(ProteinNode.PROTEIN_GENE_NAMES_FULL_TEXT_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, FULL_TEXT_ST));
BatchInserterIndex proteinEnsemblPlantsIndex = indexProvider.nodeIndex(ProteinNode.PROTEIN_ENSEMBL_PLANTS_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex datasetNameIndex = indexProvider.nodeIndex(DatasetNode.DATASET_NAME_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex keywordIdIndex = indexProvider.nodeIndex(KeywordNode.KEYWORD_ID_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex keywordNameIndex = indexProvider.nodeIndex(KeywordNode.KEYWORD_NAME_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex interproIdIndex = indexProvider.nodeIndex(InterproNode.INTERPRO_ID_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex pfamIdIndex = indexProvider.nodeIndex(PfamNode.PFAM_ID_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex goTermIdIndex = indexProvider.nodeIndex(GoTermNode.GO_TERM_ID_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex organismScientificNameIndex = indexProvider.nodeIndex(OrganismNode.ORGANISM_SCIENTIFIC_NAME_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex organismNcbiTaxonomyIdIndex = indexProvider.nodeIndex(OrganismNode.ORGANISM_NCBI_TAXONOMY_ID_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex taxonNameIndex = indexProvider.nodeIndex(TaxonNode.TAXON_NAME_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex genomeElementVersionIndex = indexProvider.nodeIndex(GenomeElementNode.GENOME_ELEMENT_VERSION_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex reactomeTermIdIndex = indexProvider.nodeIndex(ReactomeTermNode.REACTOME_TERM_ID_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex enzymeIdIndex = indexProvider.nodeIndex(EnzymeNode.ENZYME_ID_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex nodeTypeIndex = indexProvider.nodeIndex(Bio4jManager.NODE_TYPE_INDEX_NAME,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex mainNodesIndex = indexProvider.nodeIndex(Bio4jManager.MAIN_NODES_INDEX_NAME,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
//----------------------------------------------------------------------
//----------------------------------------------------------------------
reader = new BufferedReader(new FileReader(inFile));
StringBuilder entryStBuilder = new StringBuilder();
//----------------------------------------------------------------------
//------------------------looking up for main nodes---------------------
alternativeProductInitiationId = mainNodesIndex.get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.ALTERNATIVE_PRODUCT_INITIATION).getSingle();
alternativeProductPromoterId = mainNodesIndex.get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.ALTERNATIVE_PRODUCT_PROMOTER).getSingle();
alternativeProductSplicingId = mainNodesIndex.get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.ALTERNATIVE_PRODUCT_SPLICING).getSingle();
alternativeProductRibosomalFrameshiftingId = mainNodesIndex.get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.ALTERNATIVE_PRODUCT_RIBOSOMAL_FRAMESHIFTING).getSingle();
seqCautionErroneousInitiationId = mainNodesIndex.get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.SEQUENCE_CAUTION_ERRONEOUS_INITIATION).getSingle();
seqCautionErroneousTranslationId = mainNodesIndex.get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.SEQUENCE_CAUTION_ERRONEOUS_TRANSLATION).getSingle();
seqCautionFrameshiftId = mainNodesIndex.get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.SEQUENCE_CAUTION_FRAMESHIFT).getSingle();
seqCautionErroneousTerminationId = mainNodesIndex.get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.SEQUENCE_CAUTION_ERRONEOUS_TERMINATION).getSingle();
seqCautionMiscellaneousDiscrepancyId = mainNodesIndex.get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.SEQUENCE_CAUTION_MISCELLANEOUS_DISCREPANCY).getSingle();
seqCautionErroneousGeneModelPredictionId = mainNodesIndex.get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.SEQUENCE_CAUTION_ERRONEOUS_GENE_MODEL_PREDICTION).getSingle();
//----------------------------------------------------------------------
//----------------------------------------------------------------------------------
//---------------------initializing node type properties----------------------------
organismProperties.put(OrganismNode.NODE_TYPE_PROPERTY, OrganismNode.NODE_TYPE);
proteinProperties.put(ProteinNode.NODE_TYPE_PROPERTY, ProteinNode.NODE_TYPE);
keywordProperties.put(KeywordNode.NODE_TYPE_PROPERTY, KeywordNode.NODE_TYPE);
subcellularLocationProperties.put(SubcellularLocationNode.NODE_TYPE_PROPERTY, SubcellularLocationNode.NODE_TYPE);
interproProperties.put(InterproNode.NODE_TYPE_PROPERTY, InterproNode.NODE_TYPE);
pfamProperties.put(PfamNode.NODE_TYPE_PROPERTY, PfamNode.NODE_TYPE);
taxonProperties.put(TaxonNode.NODE_TYPE_PROPERTY, TaxonNode.NODE_TYPE);
datasetProperties.put(DatasetNode.NODE_TYPE_PROPERTY, DatasetNode.NODE_TYPE);
personProperties.put(PersonNode.NODE_TYPE_PROPERTY, PersonNode.NODE_TYPE);
consortiumProperties.put(ConsortiumNode.NODE_TYPE_PROPERTY, ConsortiumNode.NODE_TYPE);
instituteProperties.put(InstituteNode.NODE_TYPE_PROPERTY, InstituteNode.NODE_TYPE);
thesisProperties.put(ThesisNode.NODE_TYPE_PROPERTY, ThesisNode.NODE_TYPE);
bookProperties.put(BookNode.NODE_TYPE_PROPERTY, BookNode.NODE_TYPE);
patentProperties.put(PatentNode.NODE_TYPE_PROPERTY, PatentNode.NODE_TYPE);
articleProperties.put(ArticleNode.NODE_TYPE_PROPERTY, ArticleNode.NODE_TYPE);
submissionProperties.put(SubmissionNode.NODE_TYPE_PROPERTY, SubmissionNode.NODE_TYPE);
onlineArticleProperties.put(OnlineArticleNode.NODE_TYPE_PROPERTY, OnlineArticleNode.NODE_TYPE);
unpublishedObservationProperties.put(UnpublishedObservationNode.NODE_TYPE_PROPERTY, UnpublishedObservationNode.NODE_TYPE);
publisherProperties.put(PublisherNode.NODE_TYPE_PROPERTY, PublisherNode.NODE_TYPE);
cityProperties.put(CityNode.NODE_TYPE_PROPERTY, CityNode.NODE_TYPE);
journalProperties.put(JournalNode.NODE_TYPE_PROPERTY, JournalNode.NODE_TYPE);
onlineJournalProperties.put(OnlineJournalNode.NODE_TYPE_PROPERTY, OnlineJournalNode.NODE_TYPE);
countryProperties.put(CountryNode.NODE_TYPE_PROPERTY, CountryNode.NODE_TYPE);
isoformProperties.put(IsoformNode.NODE_TYPE_PROPERTY, IsoformNode.NODE_TYPE);
commentTypeProperties.put(CommentTypeNode.NODE_TYPE_PROPERTY, CommentTypeNode.NODE_TYPE);
featureTypeProperties.put(FeatureTypeNode.NODE_TYPE_PROPERTY, FeatureTypeNode.NODE_TYPE);
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
while ((line = reader.readLine()) != null) {
if (line.trim().startsWith("<" + UniprotStuff.ENTRY_TAG_NAME)) {
while (!line.trim().startsWith("</" + UniprotStuff.ENTRY_TAG_NAME + ">")) {
entryStBuilder.append(line);
line = reader.readLine();
}
//append the closing tag line of the entry
entryStBuilder.append(line);
//System.out.println("organismStBuilder.toString() = " + organismStBuilder.toString());
XMLElement entryXMLElem = new XMLElement(entryStBuilder.toString());
entryStBuilder.delete(0, entryStBuilder.length());
String modifiedDateSt = entryXMLElem.asJDomElement().getAttributeValue(UniprotStuff.ENTRY_MODIFIED_DATE_ATTRIBUTE);
String accessionSt = entryXMLElem.asJDomElement().getChildText(UniprotStuff.ENTRY_ACCESSION_TAG_NAME);
String nameSt = entryXMLElem.asJDomElement().getChildText(UniprotStuff.ENTRY_NAME_TAG_NAME);
String fullNameSt = getProteinFullName(entryXMLElem.asJDomElement().getChild(UniprotStuff.PROTEIN_TAG_NAME));
String shortNameSt = getProteinShortName(entryXMLElem.asJDomElement().getChild(UniprotStuff.PROTEIN_TAG_NAME));
if (shortNameSt == null) {
shortNameSt = "";
}
if (fullNameSt == null) {
fullNameSt = "";
}
currentAccessionId = accessionSt;
//-----------alternative accessions-------------
ArrayList<String> alternativeAccessions = new ArrayList<>();
List<Element> altAccessionsList = entryXMLElem.asJDomElement().getChildren(UniprotStuff.ENTRY_ACCESSION_TAG_NAME);
for (int i = 1; i < altAccessionsList.size(); i++) {
alternativeAccessions.add(altAccessionsList.get(i).getText());
}
proteinProperties.put(ProteinNode.ALTERNATIVE_ACCESSIONS_PROPERTY, convertToStringArray(alternativeAccessions));
//-----db references-------------
String pirIdSt = "";
String keggIdSt = "";
String ensemblIdSt = "";
String uniGeneIdSt = "";
String arrayExpressIdSt = "";
List<Element> dbReferenceList = entryXMLElem.asJDomElement().getChildren(UniprotStuff.DB_REFERENCE_TAG_NAME);
ArrayList<String> emblCrossReferences = new ArrayList<>();
ArrayList<String> refseqReferences = new ArrayList<>();
ArrayList<String> enzymeDBReferences = new ArrayList<>();
ArrayList<String> ensemblPlantsReferences = new ArrayList<>();
HashMap<String, String> reactomeReferences = new HashMap<>();
for (Element dbReferenceElem : dbReferenceList) {
String refId = dbReferenceElem.getAttributeValue("id");
switch (dbReferenceElem.getAttributeValue(UniprotStuff.DB_REFERENCE_TYPE_ATTRIBUTE)) {
case "Ensembl":
ensemblIdSt = refId;
break;
case "PIR":
pirIdSt = refId;
break;
case "UniGene":
uniGeneIdSt = refId;
break;
case "KEGG":
keggIdSt = refId;
break;
case "EMBL":
emblCrossReferences.add(refId);
break;
case "EC":
enzymeDBReferences.add(refId);
break;
case "ArrayExpress":
arrayExpressIdSt = refId;
break;
case "RefSeq":
//refseqReferences.add(refId);
List<Element> children = dbReferenceElem.getChildren("property");
for (Element propertyElem : children) {
if (propertyElem.getAttributeValue("type").equals("nucleotide sequence ID")) {
refseqReferences.add(propertyElem.getAttributeValue("value"));
}
}
break;
case "Reactome":
Element propertyElem = dbReferenceElem.getChild("property");
String pathwayName = "";
if (propertyElem.getAttributeValue("type").equals("pathway name")) {
pathwayName = propertyElem.getAttributeValue("value");
}
reactomeReferences.put(refId, pathwayName);
break;
case "EnsemblPlants":
ensemblPlantsReferences.add(refId);
break;
}
}
Element sequenceElem = entryXMLElem.asJDomElement().getChild(UniprotStuff.ENTRY_SEQUENCE_TAG_NAME);
String sequenceSt = sequenceElem.getText();
int seqLength = Integer.parseInt(sequenceElem.getAttributeValue(UniprotStuff.SEQUENCE_LENGTH_ATTRIBUTE));
float seqMass = Float.parseFloat(sequenceElem.getAttributeValue(UniprotStuff.SEQUENCE_MASS_ATTRIBUTE));
//System.out.println("lalala " + seqMass);
proteinProperties.put(ProteinNode.MODIFIED_DATE_PROPERTY, modifiedDateSt);
proteinProperties.put(ProteinNode.ACCESSION_PROPERTY, accessionSt);
proteinProperties.put(ProteinNode.NAME_PROPERTY, nameSt);
proteinProperties.put(ProteinNode.FULL_NAME_PROPERTY, fullNameSt);
proteinProperties.put(ProteinNode.SHORT_NAME_PROPERTY, shortNameSt);
proteinProperties.put(ProteinNode.SEQUENCE_PROPERTY, sequenceSt);
proteinProperties.put(ProteinNode.LENGTH_PROPERTY, seqLength);
proteinProperties.put(ProteinNode.MASS_PROPERTY, seqMass);
proteinProperties.put(ProteinNode.ARRAY_EXPRESS_ID_PROPERTY, arrayExpressIdSt);
proteinProperties.put(ProteinNode.PIR_ID_PROPERTY, pirIdSt);
proteinProperties.put(ProteinNode.KEGG_ID_PROPERTY, keggIdSt);
proteinProperties.put(ProteinNode.EMBL_REFERENCES_PROPERTY, convertToStringArray(emblCrossReferences));
proteinProperties.put(ProteinNode.ENSEMBL_PLANTS_REFERENCES_PROPERTY, convertToStringArray(ensemblPlantsReferences));
proteinProperties.put(ProteinNode.ENSEMBL_ID_PROPERTY, ensemblIdSt);
proteinProperties.put(ProteinNode.UNIGENE_ID_PROPERTY, uniGeneIdSt);
//---------------gene-names-------------------
Element geneElement = entryXMLElem.asJDomElement().getChild(UniprotStuff.GENE_TAG_NAME);
ArrayList<String> geneNames = new ArrayList<>();
if (geneElement != null) {
List<Element> genesList = geneElement.getChildren(UniprotStuff.GENE_NAME_TAG_NAME);
for (Element geneNameElem : genesList) {
geneNames.add(geneNameElem.getText());
}
}
proteinProperties.put(ProteinNode.GENE_NAMES_PROPERTY, convertToStringArray(geneNames));
//-----------------------------------------
long currentProteinId = inserter.createNode(proteinProperties);
proteinAccessionIndex.add(currentProteinId, MapUtil.map(ProteinNode.PROTEIN_ACCESSION_INDEX, accessionSt));
//indexing protein by alternative accessions
for (String altAccessionSt : alternativeAccessions) {
proteinAccessionIndex.add(currentProteinId, MapUtil.map(ProteinNode.PROTEIN_ACCESSION_INDEX, altAccessionSt));
}
//---flushing protein accession index----
proteinAccessionIndex.flush();
//---adding protein node to node_type index----
nodeTypeIndex.add(currentProteinId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, ProteinNode.NODE_TYPE));
//indexing protein by full name
if (!fullNameSt.isEmpty()) {
proteinFullNameFullTextIndex.add(currentProteinId, MapUtil.map(ProteinNode.PROTEIN_FULL_NAME_FULL_TEXT_INDEX, fullNameSt));
//System.out.println(fullNameSt.toUpperCase() + " , " + currentProteinId);
}
//indexing protein by gene names
String geneNamesStToBeIndexed = "";
for (String geneNameSt : geneNames) {
geneNamesStToBeIndexed += geneNameSt + " ";
}
proteinGeneNamesFullTextIndex.add(currentProteinId, MapUtil.map(ProteinNode.PROTEIN_GENE_NAMES_FULL_TEXT_INDEX, geneNamesStToBeIndexed));
//indexing protein by Ensembl plants references
for (String ensemblPlantRef : ensemblPlantsReferences) {
proteinEnsemblPlantsIndex.add(currentProteinId, MapUtil.map(ProteinNode.PROTEIN_ENSEMBL_PLANTS_INDEX, ensemblPlantRef));
}
//--------------refseq associations----------------
if (uniprotDataXML.getRefseq()) {
for (String refseqReferenceSt : refseqReferences) {
//System.out.println("refseqReferenceSt = " + refseqReferenceSt);
IndexHits<Long> hits = genomeElementVersionIndex.get(GenomeElementNode.GENOME_ELEMENT_VERSION_INDEX, refseqReferenceSt);
if (hits.hasNext()) {
inserter.createRelationship(currentProteinId, hits.getSingle(), proteinGenomeElementRel, null);
} else {
logger.log(Level.INFO, ("GenomeElem not found for: " + currentAccessionId + " , " + refseqReferenceSt));
}
}
}
//--------------reactome associations----------------
if (uniprotDataXML.getReactome()) {
for (String reactomeId : reactomeReferences.keySet()) {
long reactomeTermNodeId = -1;
IndexHits<Long> reactomeTermIdIndexHits = reactomeTermIdIndex.get(ReactomeTermNode.REACTOME_TERM_ID_INDEX, reactomeId);
if (reactomeTermIdIndexHits.hasNext()) {
reactomeTermNodeId = reactomeTermIdIndexHits.getSingle();
}
if (reactomeTermNodeId < 0) {
reactomeTermProperties.put(ReactomeTermNode.ID_PROPERTY, reactomeId);
reactomeTermProperties.put(ReactomeTermNode.PATHWAY_NAME_PROPERTY, reactomeReferences.get(reactomeId));
reactomeTermNodeId = inserter.createNode(reactomeTermProperties);
reactomeTermIdIndex.add(reactomeTermNodeId, MapUtil.map(ReactomeTermNode.REACTOME_TERM_ID_INDEX, reactomeId));
//----flushing reactome index---
reactomeTermIdIndex.flush();
//---adding reactome term node to node_type index----
nodeTypeIndex.add(reactomeTermNodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, ReactomeTermNode.NODE_TYPE));
}
inserter.createRelationship(currentProteinId, reactomeTermNodeId, proteinReactomeRel, null);
}
}
//-------------------------------------------------------
//---------------enzyme db associations----------------------
if (uniprotDataXML.getEnzymeDb()) {
for (String enzymeDBRef : enzymeDBReferences) {
long enzymeNodeId;
IndexHits<Long> enzymeIdIndexHits = enzymeIdIndex.get(EnzymeNode.ENZYME_ID_INDEX, enzymeDBRef);
if (enzymeIdIndexHits.hasNext()) {
enzymeNodeId = enzymeIdIndexHits.next();
inserter.createRelationship(currentProteinId, enzymeNodeId, proteinEnzymaticActivityRel, null);
} else {
enzymeIdsNotFoundBuff.write("Enzyme term: " + enzymeDBRef + " not found.\t" + currentAccessionId);
}
}
}
//------------------------------------------------------------
//-----comments import---
if (uniprotDataXML.getComments()) {
importProteinComments(entryXMLElem, inserter, indexProvider, currentProteinId, sequenceSt, uniprotDataXML);
}
//-----features import----
if (uniprotDataXML.getFeatures()) {
importProteinFeatures(entryXMLElem, inserter, indexProvider, currentProteinId);
}
//--------------------------------datasets--------------------------------------------------
String proteinDataSetSt = entryXMLElem.asJDomElement().getAttributeValue(UniprotStuff.ENTRY_DATASET_ATTRIBUTE);
//long datasetId = indexService.getSingleNode(DatasetNode.DATASET_NAME_INDEX, proteinDataSetSt);
long datasetId = -1;
IndexHits<Long> datasetNameIndexHits = datasetNameIndex.get(DatasetNode.DATASET_NAME_INDEX, proteinDataSetSt);
if (datasetNameIndexHits.hasNext()) {
datasetId = datasetNameIndexHits.getSingle();
}
if (datasetId < 0) {
datasetProperties.put(DatasetNode.NAME_PROPERTY, proteinDataSetSt);
datasetId = inserter.createNode(datasetProperties);
datasetNameIndex.add(datasetId, MapUtil.map(DatasetNode.DATASET_NAME_INDEX, proteinDataSetSt));
//----flushing dataset name index---
datasetNameIndex.flush();
//---adding dataset node to node_type index----
nodeTypeIndex.add(datasetId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, DatasetNode.NODE_TYPE));
}
inserter.createRelationship(currentProteinId, datasetId, proteinDatasetRel, null);
//---------------------------------------------------------------------------------------------
if (uniprotDataXML.getCitations()) {
importProteinCitations(entryXMLElem,
inserter,
indexProvider,
currentProteinId,
uniprotDataXML);
}
//-------------------------------keywords------------------------------------------------------
if (uniprotDataXML.getKeywords()) {
List<Element> keywordsList = entryXMLElem.asJDomElement().getChildren(UniprotStuff.KEYWORD_TAG_NAME);
for (Element keywordElem : keywordsList) {
String keywordId = keywordElem.getAttributeValue(UniprotStuff.KEYWORD_ID_ATTRIBUTE);
String keywordName = keywordElem.getText();
long keywordNodeId = -1;
IndexHits<Long> keyworIdIndexHits = keywordIdIndex.get(KeywordNode.KEYWORD_ID_INDEX, keywordId);
if (keyworIdIndexHits.hasNext()) {
keywordNodeId = keyworIdIndexHits.getSingle();
}
if (keywordNodeId < 0) {
keywordProperties.put(KeywordNode.ID_PROPERTY, keywordId);
keywordProperties.put(KeywordNode.NAME_PROPERTY, keywordName);
keywordNodeId = inserter.createNode(keywordProperties);
keywordIdIndex.add(keywordNodeId, MapUtil.map(KeywordNode.KEYWORD_ID_INDEX, keywordId));
keywordNameIndex.add(keywordNodeId, MapUtil.map(KeywordNode.KEYWORD_NAME_INDEX, keywordName));
//---flushing keyword id index----
keywordIdIndex.flush();
//---adding keyword node to node_type index----
nodeTypeIndex.add(keywordNodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, KeywordNode.NODE_TYPE));
}
inserter.createRelationship(currentProteinId, keywordNodeId, proteinKeywordRel, null);
}
}
//---------------------------------------------------------------------------------------
for (Element dbReferenceElem : dbReferenceList) {
//-------------------------------INTERPRO------------------------------------------------------
if (dbReferenceElem.getAttributeValue(UniprotStuff.DB_REFERENCE_TYPE_ATTRIBUTE).equals(UniprotStuff.INTERPRO_DB_REFERENCE_TYPE)) {
if (uniprotDataXML.getInterpro()) {
String interproId = dbReferenceElem.getAttributeValue(UniprotStuff.DB_REFERENCE_ID_ATTRIBUTE);
//long interproNodeId = indexService.getSingleNode(InterproNode.INTERPRO_ID_INDEX, interproId);
long interproNodeId = -1;
IndexHits<Long> interproIdIndexHits = interproIdIndex.get(InterproNode.INTERPRO_ID_INDEX, interproId);
if (interproIdIndexHits.hasNext()) {
interproNodeId = interproIdIndexHits.getSingle();
}
if (interproNodeId < 0) {
String interproEntryNameSt = "";
List<Element> properties = dbReferenceElem.getChildren(UniprotStuff.DB_REFERENCE_PROPERTY_TAG_NAME);
for (Element prop : properties) {
if (prop.getAttributeValue(UniprotStuff.DB_REFERENCE_TYPE_ATTRIBUTE).equals(UniprotStuff.INTERPRO_ENTRY_NAME)) {
interproEntryNameSt = prop.getAttributeValue(UniprotStuff.DB_REFERENCE_VALUE_ATTRIBUTE);
break;
}
}
interproProperties.put(InterproNode.ID_PROPERTY, interproId);
interproProperties.put(InterproNode.NAME_PROPERTY, interproEntryNameSt);
interproNodeId = inserter.createNode(interproProperties);
interproIdIndex.add(interproNodeId, MapUtil.map(InterproNode.INTERPRO_ID_INDEX, interproId));
//flushing interpro id index
interproIdIndex.flush();
//---adding interpro node to node_type index----
nodeTypeIndex.add(interproNodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, InterproNode.NODE_TYPE));
}
inserter.createRelationship(currentProteinId, interproNodeId, proteinInterproRel, null);
}
} //-------------------------------PFAM------------------------------------------------------
else if (dbReferenceElem.getAttributeValue(UniprotStuff.DB_REFERENCE_TYPE_ATTRIBUTE).equals("Pfam")) {
if (uniprotDataXML.getPfam()) {
String pfamId = dbReferenceElem.getAttributeValue(UniprotStuff.DB_REFERENCE_ID_ATTRIBUTE);
long pfamNodeId = -1;
IndexHits<Long> pfamIdIndexHits = pfamIdIndex.get(PfamNode.PFAM_ID_INDEX, pfamId);
if (pfamIdIndexHits.hasNext()) {
pfamNodeId = pfamIdIndexHits.getSingle();
}
if (pfamNodeId < 0) {
String pfamEntryNameSt = "";
List<Element> properties = dbReferenceElem.getChildren(UniprotStuff.DB_REFERENCE_PROPERTY_TAG_NAME);
for (Element prop : properties) {
if (prop.getAttributeValue(UniprotStuff.DB_REFERENCE_TYPE_ATTRIBUTE).equals("entry name")) {
pfamEntryNameSt = prop.getAttributeValue(UniprotStuff.DB_REFERENCE_VALUE_ATTRIBUTE);
break;
}
}
pfamProperties.put(PfamNode.ID_PROPERTY, pfamId);
pfamProperties.put(PfamNode.NAME_PROPERTY, pfamEntryNameSt);
pfamNodeId = inserter.createNode(pfamProperties);
pfamIdIndex.add(pfamNodeId, MapUtil.map(PfamNode.PFAM_ID_INDEX, pfamId));
//flushing pfam id index
pfamIdIndex.flush();
//---adding pfam node to node_type index----
nodeTypeIndex.add(pfamNodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, PfamNode.NODE_TYPE));
}
inserter.createRelationship(currentProteinId, pfamNodeId, proteinPfamRel, null);
}
} //-------------------GO -----------------------------
else if (dbReferenceElem.getAttributeValue(UniprotStuff.DB_REFERENCE_TYPE_ATTRIBUTE).toUpperCase().equals(UniprotStuff.GO_DB_REFERENCE_TYPE)) {
if (uniprotDataXML.getGeneOntology()) {
String goId = dbReferenceElem.getAttributeValue(UniprotStuff.DB_REFERENCE_ID_ATTRIBUTE);
String evidenceSt = "";
List<Element> props = dbReferenceElem.getChildren(UniprotStuff.DB_REFERENCE_PROPERTY_TAG_NAME);
for (Element element : props) {
if (element.getAttributeValue(UniprotStuff.DB_REFERENCE_TYPE_ATTRIBUTE).equals(UniprotStuff.EVIDENCE_TYPE_ATTRIBUTE)) {
evidenceSt = element.getAttributeValue("value");
if (evidenceSt == null) {
evidenceSt = "";
}
break;
}
}
long goTermNodeId = goTermIdIndex.get(GoTermNode.GO_TERM_ID_INDEX, goId).getSingle();
proteinGoProperties.put(ProteinGoRel.EVIDENCE_PROPERTY, evidenceSt);
inserter.createRelationship(currentProteinId, goTermNodeId, proteinGoRel, proteinGoProperties);
}
}
}
//---------------------------------------------------------------------------------------
//---------------------------------------------------------------------------------------
//--------------------------------organism-----------------------------------------------
String scName, commName, synName;
scName = "";
commName = "";
synName = "";
Element organismElem = entryXMLElem.asJDomElement().getChild(UniprotStuff.ORGANISM_TAG_NAME);
List<Element> organismNames = organismElem.getChildren(UniprotStuff.ORGANISM_NAME_TAG_NAME);
for (Element element : organismNames) {
String type = element.getAttributeValue(UniprotStuff.ORGANISM_NAME_TYPE_ATTRIBUTE);
switch (type) {
case UniprotStuff.ORGANISM_SCIENTIFIC_NAME_TYPE:
scName = element.getText();
break;
case UniprotStuff.ORGANISM_COMMON_NAME_TYPE:
commName = element.getText();
break;
case UniprotStuff.ORGANISM_SYNONYM_NAME_TYPE:
synName = element.getText();
break;
}
}
//long organismNodeId = indexService.getSingleNode(OrganismNode.ORGANISM_SCIENTIFIC_NAME_INDEX, scName);
long organismNodeId = -1;
IndexHits<Long> organismScientifiNameIndexHits = organismScientificNameIndex.get(OrganismNode.ORGANISM_SCIENTIFIC_NAME_INDEX, scName);
if (organismScientifiNameIndexHits.hasNext()) {
organismNodeId = organismScientifiNameIndexHits.getSingle();
}
if (organismNodeId < 0) {
organismProperties.put(OrganismNode.COMMON_NAME_PROPERTY, commName);
organismProperties.put(OrganismNode.SCIENTIFIC_NAME_PROPERTY, scName);
organismProperties.put(OrganismNode.SYNONYM_NAME_PROPERTY, synName);
List<Element> organismDbRefElems = organismElem.getChildren(UniprotStuff.DB_REFERENCE_TAG_NAME);
boolean ncbiIdFound = false;
if (organismDbRefElems != null) {
for (Element dbRefElem : organismDbRefElems) {
String t = dbRefElem.getAttributeValue("type");
if (t.equals("NCBI Taxonomy")) {
organismProperties.put(OrganismNode.NCBI_TAXONOMY_ID_PROPERTY, dbRefElem.getAttributeValue("id"));
ncbiIdFound = true;
break;
}
}
}
if (!ncbiIdFound) {
organismProperties.put(OrganismNode.NCBI_TAXONOMY_ID_PROPERTY, "");
}
organismNodeId = inserter.createNode(organismProperties);
organismScientificNameIndex.add(organismNodeId, MapUtil.map(OrganismNode.ORGANISM_SCIENTIFIC_NAME_INDEX, scName));
organismNcbiTaxonomyIdIndex.add(organismNodeId, MapUtil.map(OrganismNode.NCBI_TAXONOMY_ID_PROPERTY, organismProperties.get(OrganismNode.NCBI_TAXONOMY_ID_PROPERTY)));
//flushing organism scientific name index
organismScientificNameIndex.flush();
//---adding organism node to node_type index----
nodeTypeIndex.add(organismNodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, OrganismNode.NODE_TYPE));
Element lineage = entryXMLElem.asJDomElement().getChild("organism").getChild("lineage");
List<Element> taxons = lineage.getChildren("taxon");
Element firstTaxonElem = taxons.get(0);
//long firstTaxonId = indexService.getSingleNode(TaxonNode.TAXON_NAME_INDEX, firstTaxonElem.getText());
long firstTaxonId = -1;
IndexHits<Long> firstTaxonIndexHits = taxonNameIndex.get(TaxonNode.TAXON_NAME_INDEX, firstTaxonElem.getText());
if (firstTaxonIndexHits.hasNext()) {
firstTaxonId = firstTaxonIndexHits.getSingle();
}
if (firstTaxonId < 0) {
String firstTaxonName = firstTaxonElem.getText();
taxonProperties.put(TaxonNode.NAME_PROPERTY, firstTaxonName);
firstTaxonId = createTaxonNode(taxonProperties, inserter, taxonNameIndex, nodeTypeIndex);
//flushing taxon name index--
taxonNameIndex.flush();
}
long lastTaxonId = firstTaxonId;
for (int i = 1; i < taxons.size(); i++) {
String taxonName = taxons.get(i).getText();
long currentTaxonId = -1;
IndexHits<Long> currentTaxonIndexHits = taxonNameIndex.get(TaxonNode.TAXON_NAME_INDEX, taxonName);
if (currentTaxonIndexHits.hasNext()) {
currentTaxonId = currentTaxonIndexHits.getSingle();
}
if (currentTaxonId < 0) {
taxonProperties.put(TaxonNode.NAME_PROPERTY, taxonName);
currentTaxonId = createTaxonNode(taxonProperties, inserter, taxonNameIndex, nodeTypeIndex);
//flushing taxon name index--
taxonNameIndex.flush();
inserter.createRelationship(lastTaxonId, currentTaxonId, taxonParentRel, null);
}
lastTaxonId = currentTaxonId;
}
inserter.createRelationship(lastTaxonId, organismNodeId, taxonParentRel, null);
}
//---------------------------------------------------------------------------------------
//---------------------------------------------------------------------------------------
inserter.createRelationship(currentProteinId, organismNodeId, proteinOrganismRel, null);
proteinCounter++;
if ((proteinCounter % limitForPrintingOut) == 0) {
String countProteinsSt = proteinCounter + " proteins inserted!!";
logger.log(Level.INFO, countProteinsSt);
}
}
}
} catch (Exception e) {
logger.log(Level.SEVERE, ("Exception retrieving protein " + currentAccessionId));
logger.log(Level.SEVERE, e.getMessage());
StackTraceElement[] trace = e.getStackTrace();
for (StackTraceElement stackTraceElement : trace) {
logger.log(Level.SEVERE, stackTraceElement.toString());
}
} finally {
try {
//------closing writers-------
enzymeIdsNotFoundBuff.close();
// shutdown, makes sure all changes are written to disk
indexProvider.shutdown();
inserter.shutdown();
// closing logger file handler
fh.close();
//-----------------writing stats file---------------------
long elapsedTime = System.nanoTime() - initTime;
long elapsedSeconds = Math.round((elapsedTime / 1000000000.0));
long hours = elapsedSeconds / 3600;
long minutes = (elapsedSeconds % 3600) / 60;
long seconds = (elapsedSeconds % 3600) % 60;
statsBuff.write("Statistics for program ImportUniprot:\nInput file: " + inFile.getName()
+ "\nThere were " + proteinCounter + " proteins inserted.\n"
+ "The elapsed time was: " + hours + "h " + minutes + "m " + seconds + "s\n");
//---closing stats writer---
statsBuff.close();
} catch (IOException ex) {
Logger.getLogger(ImportUniprot.class.getName()).log(Level.SEVERE, null, ex);
}
}
}
}
/**
 * Imports the {@code feature} elements of one Uniprot entry.
 *
 * For every feature child of the entry this method:
 * 1. looks the feature type up by name in the feature-type index and, when no
 *    node is indexed under that name, creates the node and indexes it;
 * 2. reads the feature attributes and location, replacing every missing
 *    value with the empty string;
 * 3. stores those values in the shared {@code featureProperties} map and
 *    creates a relationship from the protein node to the feature-type node,
 *    using the relationship type that matches the feature type.
 *
 * A feature whose type matches none of the known relationship types still
 * gets its feature-type node, but no relationship is created for it.
 *
 * @param entryXMLElem     Uniprot entry whose features are imported
 * @param inserter         batch inserter used to create nodes and relationships
 * @param indexProvider    provider of the batch indexes used here
 * @param currentProteinId id of the protein node the features belong to
 */
private static void importProteinFeatures(XMLElement entryXMLElem,
        BatchInserter inserter,
        BatchInserterIndexProvider indexProvider,
        long currentProteinId) {

    //---------------------------batch indexes-------------------------------
    BatchInserterIndex typeByNameIndex = indexProvider.nodeIndex(FeatureTypeNode.FEATURE_TYPE_NAME_INDEX,
            MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
    BatchInserterIndex nodeKindIndex = indexProvider.nodeIndex(Bio4jManager.NODE_TYPE_INDEX_NAME,
            MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));

    //------------------------------features---------------------------------
    List<Element> features = entryXMLElem.asJDomElement().getChildren(UniprotStuff.FEATURE_TAG_NAME);
    for (Element feature : features) {

        String featureType = feature.getAttributeValue(UniprotStuff.FEATURE_TYPE_ATTRIBUTE);

        //----feature type node: reuse the indexed one or create it----
        IndexHits<Long> typeHits = typeByNameIndex.get(FeatureTypeNode.FEATURE_TYPE_NAME_INDEX, featureType);
        long typeNodeId = typeHits.hasNext() ? typeHits.getSingle() : -1;
        typeHits.close();

        if (typeNodeId < 0) {
            featureTypeProperties.put(FeatureTypeNode.NAME_PROPERTY, featureType);
            typeNodeId = inserter.createNode(featureTypeProperties);
            typeByNameIndex.add(typeNodeId, MapUtil.map(FeatureTypeNode.FEATURE_TYPE_NAME_INDEX, featureType));
            // flushed right away so that the lookup above can find the new node
            typeByNameIndex.flush();
            nodeKindIndex.add(typeNodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, FeatureTypeNode.NODE_TYPE));
        }

        //----plain attributes (missing ones become "")----
        String description = featureTextOrEmpty(feature.getAttributeValue(UniprotStuff.FEATURE_DESCRIPTION_ATTRIBUTE));
        String featureId = featureTextOrEmpty(feature.getAttributeValue(UniprotStuff.FEATURE_ID_ATTRIBUTE));
        String status = featureTextOrEmpty(feature.getAttributeValue(UniprotStuff.STATUS_ATTRIBUTE));
        String evidence = featureTextOrEmpty(feature.getAttributeValue(UniprotStuff.EVIDENCE_ATTRIBUTE));

        //----location: either a begin/end pair or one single position----
        Element location = feature.getChild(UniprotStuff.FEATURE_LOCATION_TAG_NAME);
        Element singlePosition = location.getChild(UniprotStuff.FEATURE_POSITION_TAG_NAME);
        String begin;
        String end;
        if (singlePosition == null) {
            begin = featureTextOrEmpty(location.getChild(UniprotStuff.FEATURE_LOCATION_BEGIN_TAG_NAME)
                    .getAttributeValue(UniprotStuff.FEATURE_LOCATION_POSITION_ATTRIBUTE));
            end = featureTextOrEmpty(location.getChild(UniprotStuff.FEATURE_LOCATION_END_TAG_NAME)
                    .getAttributeValue(UniprotStuff.FEATURE_LOCATION_POSITION_ATTRIBUTE));
        } else {
            // a single position is used as both begin and end
            begin = featureTextOrEmpty(singlePosition.getAttributeValue(UniprotStuff.FEATURE_POSITION_POSITION_ATTRIBUTE));
            end = begin;
        }

        String original = featureTextOrEmpty(feature.getChildText(UniprotStuff.FEATURE_ORIGINAL_TAG_NAME));
        String variation = featureTextOrEmpty(feature.getChildText(UniprotStuff.FEATURE_VARIATION_TAG_NAME));
        String ref = featureTextOrEmpty(feature.getAttributeValue(UniprotStuff.FEATURE_REF_ATTRIBUTE));

        //----relationship properties (the map is shared and overwritten for each feature)----
        featureProperties.put(BasicFeatureRel.DESCRIPTION_PROPERTY, description);
        featureProperties.put(BasicFeatureRel.ID_PROPERTY, featureId);
        featureProperties.put(BasicFeatureRel.EVIDENCE_PROPERTY, evidence);
        featureProperties.put(BasicFeatureRel.STATUS_PROPERTY, status);
        featureProperties.put(BasicFeatureRel.BEGIN_PROPERTY, begin);
        featureProperties.put(BasicFeatureRel.END_PROPERTY, end);
        featureProperties.put(BasicFeatureRel.ORIGINAL_PROPERTY, original);
        featureProperties.put(BasicFeatureRel.VARIATION_PROPERTY, variation);
        featureProperties.put(BasicFeatureRel.REF_PROPERTY, ref);

        //----protein --> feature type relationship, chosen by feature type----
        switch (featureType) {
            case ActiveSiteFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, activeSiteFeatureRel, featureProperties);
                break;
            case BindingSiteFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, bindingSiteFeatureRel, featureProperties);
                break;
            case CrossLinkFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, crossLinkFeatureRel, featureProperties);
                break;
            case GlycosylationSiteFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, glycosylationSiteFeatureRel, featureProperties);
                break;
            case InitiatorMethionineFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, initiatorMethionineFeatureRel, featureProperties);
                break;
            case LipidMoietyBindingRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, lipidMoietyBindingRegionFeatureRel, featureProperties);
                break;
            case MetalIonBindingSiteFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, metalIonBindingSiteFeatureRel, featureProperties);
                break;
            case ModifiedResidueFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, modifiedResidueFeatureRel, featureProperties);
                break;
            case NonStandardAminoAcidFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, nonStandardAminoAcidFeatureRel, featureProperties);
                break;
            case NonTerminalResidueFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, nonTerminalResidueFeatureRel, featureProperties);
                break;
            case PeptideFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, peptideFeatureRel, featureProperties);
                break;
            case UnsureResidueFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, unsureResidueFeatureRel, featureProperties);
                break;
            case MutagenesisSiteFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, mutagenesisSiteFeatureRel, featureProperties);
                break;
            case SequenceVariantFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, sequenceVariantFeatureRel, featureProperties);
                break;
            case CalciumBindingRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, calciumBindingRegionFeatureRel, featureProperties);
                break;
            case ChainFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, chainFeatureRel, featureProperties);
                break;
            case CoiledCoilRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, coiledCoilRegionFeatureRel, featureProperties);
                break;
            case CompositionallyBiasedRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, compositionallyBiasedRegionFeatureRel, featureProperties);
                break;
            case DisulfideBondFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, disulfideBondFeatureRel, featureProperties);
                break;
            case DnaBindingRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, dnaBindingRegionFeatureRel, featureProperties);
                break;
            case DomainFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, domainFeatureRel, featureProperties);
                break;
            case HelixFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, helixFeatureRel, featureProperties);
                break;
            case IntramembraneRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, intramembraneRegionFeatureRel, featureProperties);
                break;
            case NonConsecutiveResiduesFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, nonConsecutiveResiduesFeatureRel, featureProperties);
                break;
            case NucleotidePhosphateBindingRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, nucleotidePhosphateBindingRegionFeatureRel, featureProperties);
                break;
            case PropeptideFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, propeptideFeatureRel, featureProperties);
                break;
            case RegionOfInterestFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, regionOfInterestFeatureRel, featureProperties);
                break;
            case RepeatFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, repeatFeatureRel, featureProperties);
                break;
            case ShortSequenceMotifFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, shortSequenceMotifFeatureRel, featureProperties);
                break;
            case SignalPeptideFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, signalPeptideFeatureRel, featureProperties);
                break;
            case SpliceVariantFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, spliceVariantFeatureRel, featureProperties);
                break;
            case StrandFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, strandFeatureRel, featureProperties);
                break;
            case TopologicalDomainFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, topologicalDomainFeatureRel, featureProperties);
                break;
            case TransitPeptideFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, transitPeptideFeatureRel, featureProperties);
                break;
            case TransmembraneRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, transmembraneRegionFeatureRel, featureProperties);
                break;
            case ZincFingerRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, zincFingerRegionFeatureRel, featureProperties);
                break;
            case SiteFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, siteFeatureRel, featureProperties);
                break;
            case TurnFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, turnFeatureRel, featureProperties);
                break;
            case SequenceConflictFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, typeNodeId, sequenceConflictFeatureRel, featureProperties);
                break;
            default:
                // feature type without a matching relationship type: nothing is linked
                break;
        }
    }
}

/**
 * Returns {@code value} unchanged, or the empty string when {@code value} is {@code null}.
 */
private static String featureTextOrEmpty(String value) {
    return (value == null) ? "" : value;
}
/**
 * Imports every comment element of a UniProt entry as relationships starting at the
 * current protein node.
 *
 * <p>For each comment the matching comment-type node is looked up in the comment type
 * name index (and created, indexed and flushed when missing). Then, depending on the
 * comment type, either a plain comment relationship carrying text/status/evidence is
 * created, or one of the specialised branches runs: online information,
 * bio-physicochemical properties, subcellular location, alternative products (isoforms),
 * sequence caution, RNA editing and mass spectrometry. Comment types not listed in the
 * switch produce no relationship.
 *
 * <p>Missing optional XML attributes/children are stored as empty strings.
 *
 * @param entryXMLElem     the entry element whose comment children are imported
 * @param inserter         batch inserter used to create nodes and relationships
 * @param indexProvider    provider of the batch indexes used for node lookup
 * @param currentProteinId id of the protein node the comments belong to
 * @param proteinSequence  sequence stored on an isoform whose sequence type is "displayed"
 * @param uniprotDataXML   import configuration; gates subcellular locations and isoforms
 */
private static void importProteinComments(XMLElement entryXMLElem,
        BatchInserter inserter,
        BatchInserterIndexProvider indexProvider,
        long currentProteinId,
        String proteinSequence,
        UniprotDataXML uniprotDataXML) {
    //---------------indexes declaration---------------------------
    BatchInserterIndex commentTypeNameIndex = indexProvider.nodeIndex(CommentTypeNode.COMMENT_TYPE_NAME_INDEX,
            MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
    BatchInserterIndex subcellularLocationNameIndex = indexProvider.nodeIndex(SubcellularLocationNode.SUBCELLULAR_LOCATION_NAME_INDEX,
            MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
    BatchInserterIndex isoformIdIndex = indexProvider.nodeIndex(IsoformNode.ISOFORM_ID_INDEX,
            MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
    BatchInserterIndex nodeTypeIndex = indexProvider.nodeIndex(Bio4jManager.NODE_TYPE_INDEX_NAME,
            MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
    //-----------------------------------------------------------
    List<Element> comments = entryXMLElem.asJDomElement().getChildren(UniprotStuff.COMMENT_TAG_NAME);
    for (Element commentElem : comments) {
        String commentTypeSt = commentElem.getAttributeValue(UniprotStuff.COMMENT_TYPE_ATTRIBUTE);
        Element textElem = commentElem.getChild("text");
        String commentTextSt = "";
        String commentStatusSt = "";
        String commentEvidenceSt = "";
        if (textElem != null) {
            commentTextSt = textElem.getText();
            commentStatusSt = textElem.getAttributeValue("status");
            if (commentStatusSt == null) {
                commentStatusSt = "";
            }
            commentEvidenceSt = textElem.getAttributeValue("evidence");
            if (commentEvidenceSt == null) {
                commentEvidenceSt = "";
            }
        }
        commentProperties.put(BasicCommentRel.TEXT_PROPERTY, commentTextSt);
        commentProperties.put(BasicCommentRel.STATUS_PROPERTY, commentStatusSt);
        commentProperties.put(BasicCommentRel.EVIDENCE_PROPERTY, commentEvidenceSt);
        //-----------------COMMENT TYPE NODE RETRIEVING/CREATION----------------------
        IndexHits<Long> commentTypeNameIndexHits = commentTypeNameIndex.get(CommentTypeNode.COMMENT_TYPE_NAME_INDEX, commentTypeSt);
        long commentTypeId = -1;
        if (commentTypeNameIndexHits.hasNext()) {
            commentTypeId = commentTypeNameIndexHits.getSingle();
        }
        commentTypeNameIndexHits.close();
        if (commentTypeId < 0) {
            commentTypeProperties.put(CommentTypeNode.NAME_PROPERTY, commentTypeSt);
            commentTypeId = inserter.createNode(commentTypeProperties);
            commentTypeNameIndex.add(commentTypeId, MapUtil.map(CommentTypeNode.COMMENT_TYPE_NAME_INDEX, commentTypeSt));
            //----flushing the indexation so the next lookup can see the new node----
            commentTypeNameIndex.flush();
            //---adding comment type node to node_type index----
            nodeTypeIndex.add(commentTypeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, CommentTypeNode.NODE_TYPE));
        }
        switch (commentTypeSt) {
            case ToxicDoseCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, toxicDoseCommentRel, commentProperties);
                break;
            case CautionCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, cautionCommentRel, commentProperties);
                break;
            case CofactorCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, cofactorCommentRel, commentProperties);
                break;
            case DiseaseCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, diseaseCommentRel, commentProperties);
                break;
            case OnlineInformationCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                onlineInformationCommentProperties.put(OnlineInformationCommentRel.STATUS_PROPERTY, commentStatusSt);
                onlineInformationCommentProperties.put(OnlineInformationCommentRel.EVIDENCE_PROPERTY, commentEvidenceSt);
                onlineInformationCommentProperties.put(OnlineInformationCommentRel.TEXT_PROPERTY, commentTextSt);
                String nameSt = commentElem.getAttributeValue("name");
                if (nameSt == null) {
                    nameSt = "";
                }
                String linkSt = "";
                Element linkElem = commentElem.getChild("link");
                if (linkElem != null) {
                    String uriSt = linkElem.getAttributeValue("uri");
                    if (uriSt != null) {
                        linkSt = uriSt;
                    }
                }
                onlineInformationCommentProperties.put(OnlineInformationCommentRel.NAME_PROPERTY, nameSt);
                onlineInformationCommentProperties.put(OnlineInformationCommentRel.LINK_PROPERTY, linkSt);
                inserter.createRelationship(currentProteinId, commentTypeId, onlineInformationCommentRel, onlineInformationCommentProperties);
                break;
            case TissueSpecificityCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, tissueSpecificityCommentRel, commentProperties);
                break;
            case FunctionCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, functionCommentRel, commentProperties);
                break;
            case BiotechnologyCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, biotechnologyCommentRel, commentProperties);
                break;
            case SubunitCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, subunitCommentRel, commentProperties);
                break;
            case PolymorphismCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, polymorphismCommentRel, commentProperties);
                break;
            case DomainCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, domainCommentRel, commentProperties);
                break;
            case PostTranslationalModificationCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, postTranslationalModificationCommentRel, commentProperties);
                break;
            case CatalyticActivityCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, catalyticActivityCommentRel, commentProperties);
                break;
            case DisruptionPhenotypeCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, disruptionPhenotypeCommentRel, commentProperties);
                break;
            case BioPhysicoChemicalPropertiesCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                biophysicochemicalCommentProperties.put(BioPhysicoChemicalPropertiesCommentRel.STATUS_PROPERTY, commentStatusSt);
                biophysicochemicalCommentProperties.put(BioPhysicoChemicalPropertiesCommentRel.EVIDENCE_PROPERTY, commentEvidenceSt);
                biophysicochemicalCommentProperties.put(BioPhysicoChemicalPropertiesCommentRel.TEXT_PROPERTY, commentTextSt);
                String phDependenceSt = commentElem.getChildText("phDependence");
                String temperatureDependenceSt = commentElem.getChildText("temperatureDependence");
                if (phDependenceSt == null) {
                    phDependenceSt = "";
                }
                if (temperatureDependenceSt == null) {
                    temperatureDependenceSt = "";
                }
                String absorptionMaxSt = "";
                String absorptionTextSt = "";
                Element absorptionElem = commentElem.getChild("absorption");
                if (absorptionElem != null) {
                    absorptionMaxSt = absorptionElem.getChildText("max");
                    absorptionTextSt = absorptionElem.getChildText("text");
                    if (absorptionMaxSt == null) {
                        absorptionMaxSt = "";
                    }
                    if (absorptionTextSt == null) {
                        absorptionTextSt = "";
                    }
                }
                // The kinetics subtree is stored verbatim as serialized XML.
                String kineticsSt = "";
                Element kineticsElem = commentElem.getChild("kinetics");
                if (kineticsElem != null) {
                    kineticsSt = new XMLElement(kineticsElem).toString();
                }
                String redoxPotentialSt = "";
                String redoxPotentialEvidenceSt = "";
                Element redoxPotentialElem = commentElem.getChild("redoxPotential");
                if (redoxPotentialElem != null) {
                    redoxPotentialSt = redoxPotentialElem.getText();
                    redoxPotentialEvidenceSt = redoxPotentialElem.getAttributeValue("evidence");
                    if (redoxPotentialSt == null) {
                        redoxPotentialSt = "";
                    }
                    if (redoxPotentialEvidenceSt == null) {
                        redoxPotentialEvidenceSt = "";
                    }
                }
                biophysicochemicalCommentProperties.put(BioPhysicoChemicalPropertiesCommentRel.TEMPERATURE_DEPENDENCE_PROPERTY, temperatureDependenceSt);
                biophysicochemicalCommentProperties.put(BioPhysicoChemicalPropertiesCommentRel.PH_DEPENDENCE_PROPERTY, phDependenceSt);
                biophysicochemicalCommentProperties.put(BioPhysicoChemicalPropertiesCommentRel.KINETICS_XML_PROPERTY, kineticsSt);
                biophysicochemicalCommentProperties.put(BioPhysicoChemicalPropertiesCommentRel.ABSORPTION_MAX_PROPERTY, absorptionMaxSt);
                biophysicochemicalCommentProperties.put(BioPhysicoChemicalPropertiesCommentRel.ABSORPTION_TEXT_PROPERTY, absorptionTextSt);
                biophysicochemicalCommentProperties.put(BioPhysicoChemicalPropertiesCommentRel.REDOX_POTENTIAL_EVIDENCE_PROPERTY, redoxPotentialEvidenceSt);
                biophysicochemicalCommentProperties.put(BioPhysicoChemicalPropertiesCommentRel.REDOX_POTENTIAL_PROPERTY, redoxPotentialSt);
                inserter.createRelationship(currentProteinId, commentTypeId, bioPhysicoChemicalPropertiesCommentRel, biophysicochemicalCommentProperties);
                break;
            case AllergenCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, allergenCommentRel, commentProperties);
                break;
            case PathwayCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, pathwayCommentRel, commentProperties);
                break;
            case InductionCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, inductionCommentRel, commentProperties);
                break;
            case ProteinSubcellularLocationRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                if (uniprotDataXML.getSubcellularLocations()) {
                    List<Element> subcLocations = commentElem.getChildren(UniprotStuff.SUBCELLULAR_LOCATION_TAG_NAME);
                    for (Element subcLocation : subcLocations) {
                        List<Element> locations = subcLocation.getChildren(UniprotStuff.LOCATION_TAG_NAME);
                        if (locations.isEmpty()) {
                            // No location to link the protein to; skip instead of failing on get(0).
                            continue;
                        }
                        Element firstLocation = locations.get(0);
                        long firstLocationId = -1;
                        IndexHits<Long> firstLocationIndexHits = subcellularLocationNameIndex.get(SubcellularLocationNode.SUBCELLULAR_LOCATION_NAME_INDEX, firstLocation.getTextTrim());
                        if (firstLocationIndexHits.hasNext()) {
                            firstLocationId = firstLocationIndexHits.getSingle();
                        }
                        firstLocationIndexHits.close();
                        long lastLocationId = firstLocationId;
                        if (firstLocationId < 0) {
                            subcellularLocationProperties.put(SubcellularLocationNode.NAME_PROPERTY, firstLocation.getTextTrim());
                            lastLocationId = createSubcellularLocationNode(subcellularLocationProperties, inserter, subcellularLocationNameIndex, nodeTypeIndex);
                            //---flushing subcellular location name index---
                            subcellularLocationNameIndex.flush();
                        }
                        // Each following location is linked to the previous one through a parent relationship.
                        for (int i = 1; i < locations.size(); i++) {
                            String tempLocationName = locations.get(i).getTextTrim();
                            long tempLocationId = -1;
                            IndexHits<Long> tempLocationIndexHits = subcellularLocationNameIndex.get(SubcellularLocationNode.SUBCELLULAR_LOCATION_NAME_INDEX, tempLocationName);
                            if (tempLocationIndexHits.hasNext()) {
                                tempLocationId = tempLocationIndexHits.getSingle();
                            }
                            // Close the hits on both paths (previously leaked when no node was found).
                            tempLocationIndexHits.close();
                            if (tempLocationId < 0) {
                                subcellularLocationProperties.put(SubcellularLocationNode.NAME_PROPERTY, tempLocationName);
                                tempLocationId = createSubcellularLocationNode(subcellularLocationProperties, inserter, subcellularLocationNameIndex, nodeTypeIndex);
                                subcellularLocationNameIndex.flush();
                            }
                            inserter.createRelationship(tempLocationId, lastLocationId, subcellularLocationParentRel, null);
                            lastLocationId = tempLocationId;
                        }
                        Element lastLocation = locations.get(locations.size() - 1);
                        String evidenceSt = lastLocation.getAttributeValue(UniprotStuff.EVIDENCE_ATTRIBUTE);
                        String statusSt = lastLocation.getAttributeValue(UniprotStuff.STATUS_ATTRIBUTE);
                        String topologyStatusSt = "";
                        String topologySt = "";
                        Element topologyElem = subcLocation.getChild("topology");
                        if (topologyElem != null) {
                            topologySt = topologyElem.getText();
                            topologyStatusSt = topologyElem.getAttributeValue("status");
                        }
                        if (topologyStatusSt == null) {
                            topologyStatusSt = "";
                        }
                        if (topologySt == null) {
                            topologySt = "";
                        }
                        if (evidenceSt == null) {
                            evidenceSt = "";
                        }
                        if (statusSt == null) {
                            statusSt = "";
                        }
                        proteinSubcellularLocationProperties.put(ProteinSubcellularLocationRel.EVIDENCE_PROPERTY, evidenceSt);
                        proteinSubcellularLocationProperties.put(ProteinSubcellularLocationRel.STATUS_PROPERTY, statusSt);
                        proteinSubcellularLocationProperties.put(ProteinSubcellularLocationRel.TOPOLOGY_PROPERTY, topologySt);
                        proteinSubcellularLocationProperties.put(ProteinSubcellularLocationRel.TOPOLOGY_STATUS_PROPERTY, topologyStatusSt);
                        inserter.createRelationship(currentProteinId, lastLocationId, proteinSubcellularLocationRel, proteinSubcellularLocationProperties);
                    }
                }
                break;
            case UniprotStuff.COMMENT_ALTERNATIVE_PRODUCTS_TYPE:
                if (uniprotDataXML.getIsoforms()) {
                    List<Element> eventList = commentElem.getChildren("event");
                    List<Element> isoformList = commentElem.getChildren("isoform");
                    for (Element isoformElem : isoformList) {
                        String isoformIdSt = isoformElem.getChildText("id");
                        String isoformNoteSt = isoformElem.getChildText("note");
                        String isoformNameSt = isoformElem.getChildText("name");
                        String isoformSeqSt = "";
                        Element isoSeqElem = isoformElem.getChild("sequence");
                        if (isoSeqElem != null) {
                            String isoSeqTypeSt = isoSeqElem.getAttributeValue("type");
                            // Constant-first comparison: a missing type attribute no longer throws.
                            if ("displayed".equals(isoSeqTypeSt)) {
                                isoformSeqSt = proteinSequence;
                            }
                        }
                        if (isoformNoteSt == null) {
                            isoformNoteSt = "";
                        }
                        if (isoformNameSt == null) {
                            isoformNameSt = "";
                        }
                        isoformProperties.put(IsoformNode.ID_PROPERTY, isoformIdSt);
                        isoformProperties.put(IsoformNode.NOTE_PROPERTY, isoformNoteSt);
                        isoformProperties.put(IsoformNode.NAME_PROPERTY, isoformNameSt);
                        isoformProperties.put(IsoformNode.SEQUENCE_PROPERTY, isoformSeqSt);
                        //--------------------------------------------------------
                        long isoformId = -1;
                        IndexHits<Long> isoformIdIndexHits = isoformIdIndex.get(IsoformNode.ISOFORM_ID_INDEX, isoformIdSt);
                        if (isoformIdIndexHits.hasNext()) {
                            isoformId = isoformIdIndexHits.getSingle();
                        }
                        isoformIdIndexHits.close();
                        if (isoformId < 0) {
                            isoformId = createIsoformNode(isoformProperties, inserter, isoformIdIndex, nodeTypeIndex);
                        }
                        // Every event of the comment is attached to every isoform of the comment.
                        for (Element eventElem : eventList) {
                            String eventTypeSt = eventElem.getAttributeValue("type");
                            switch (eventTypeSt) {
                                case AlternativeProductInitiationRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                                    inserter.createRelationship(isoformId, alternativeProductInitiationId, isoformEventGeneratorRel, null);
                                    break;
                                case AlternativeProductPromoterRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                                    inserter.createRelationship(isoformId, alternativeProductPromoterId, isoformEventGeneratorRel, null);
                                    break;
                                case AlternativeProductRibosomalFrameshiftingRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                                    inserter.createRelationship(isoformId, alternativeProductRibosomalFrameshiftingId, isoformEventGeneratorRel, null);
                                    break;
                                case AlternativeProductSplicingRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                                    inserter.createRelationship(isoformId, alternativeProductSplicingId, isoformEventGeneratorRel, null);
                                    break;
                            }
                        }
                        //protein isoform relationship
                        inserter.createRelationship(currentProteinId, isoformId, proteinIsoformRel, null);
                    }
                }
                break;
            case UniprotStuff.COMMENT_SEQUENCE_CAUTION_TYPE:
                sequenceCautionProperties.put(BasicProteinSequenceCautionRel.EVIDENCE_PROPERTY, commentEvidenceSt);
                sequenceCautionProperties.put(BasicProteinSequenceCautionRel.STATUS_PROPERTY, commentStatusSt);
                sequenceCautionProperties.put(BasicProteinSequenceCautionRel.TEXT_PROPERTY, commentTextSt);
                Element conflictElem = commentElem.getChild("conflict");
                if (conflictElem != null) {
                    String conflictTypeSt = conflictElem.getAttributeValue("type");
                    String resourceSt = "";
                    String idSt = "";
                    String versionSt = "";
                    Element sequenceElem = conflictElem.getChild("sequence");
                    if (sequenceElem != null) {
                        resourceSt = sequenceElem.getAttributeValue("resource");
                        if (resourceSt == null) {
                            resourceSt = "";
                        }
                        idSt = sequenceElem.getAttributeValue("id");
                        if (idSt == null) {
                            idSt = "";
                        }
                        versionSt = sequenceElem.getAttributeValue("version");
                        if (versionSt == null) {
                            versionSt = "";
                        }
                    }
                    // At most one position is read from the comment; it defaults to the empty string.
                    String conflictPositionSt = "";
                    Element cautionLocationElem = commentElem.getChild("location");
                    if (cautionLocationElem != null) {
                        Element positionElem = cautionLocationElem.getChild("position");
                        if (positionElem != null) {
                            String tempPos = positionElem.getAttributeValue("position");
                            if (tempPos != null) {
                                conflictPositionSt = tempPos;
                            }
                        }
                    }
                    sequenceCautionProperties.put(BasicProteinSequenceCautionRel.RESOURCE_PROPERTY, resourceSt);
                    sequenceCautionProperties.put(BasicProteinSequenceCautionRel.ID_PROPERTY, idSt);
                    sequenceCautionProperties.put(BasicProteinSequenceCautionRel.VERSION_PROPERTY, versionSt);
                    switch (conflictTypeSt) {
                        case ProteinErroneousGeneModelPredictionRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                            sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, conflictPositionSt);
                            inserter.createRelationship(currentProteinId, seqCautionErroneousGeneModelPredictionId, proteinErroneousGeneModelPredictionRel, sequenceCautionProperties);
                            break;
                        case ProteinErroneousInitiationRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                            sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, conflictPositionSt);
                            inserter.createRelationship(currentProteinId, seqCautionErroneousInitiationId, proteinErroneousInitiationRel, sequenceCautionProperties);
                            break;
                        case ProteinErroneousTranslationRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                            sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, conflictPositionSt);
                            inserter.createRelationship(currentProteinId, seqCautionErroneousTranslationId, proteinErroneousTranslationRel, sequenceCautionProperties);
                            break;
                        case ProteinErroneousTerminationRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                            sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, conflictPositionSt);
                            inserter.createRelationship(currentProteinId, seqCautionErroneousTerminationId, proteinErroneousTerminationRel, sequenceCautionProperties);
                            break;
                        case ProteinFrameshiftRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                            sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, conflictPositionSt);
                            inserter.createRelationship(currentProteinId, seqCautionFrameshiftId, proteinFrameshiftRel, sequenceCautionProperties);
                            break;
                        case ProteinMiscellaneousDiscrepancyRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                            sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, conflictPositionSt);
                            inserter.createRelationship(currentProteinId, seqCautionMiscellaneousDiscrepancyId, proteinMiscellaneousDiscrepancyRel, sequenceCautionProperties);
                            break;
                    }
                }
                break;
            case DevelopmentalStageCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, developmentalStageCommentRel, commentProperties);
                break;
            case MiscellaneousCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, miscellaneousCommentRel, commentProperties);
                break;
            case SimilarityCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, similarityCommentRel, commentProperties);
                break;
            case RnaEditingCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                rnaEditingCommentProperties.put(RnaEditingCommentRel.STATUS_PROPERTY, commentStatusSt);
                rnaEditingCommentProperties.put(RnaEditingCommentRel.EVIDENCE_PROPERTY, commentEvidenceSt);
                rnaEditingCommentProperties.put(RnaEditingCommentRel.TEXT_PROPERTY, commentTextSt);
                List<Element> locationsList = commentElem.getChildren("location");
                // One relationship per location element.
                for (Element tempLoc : locationsList) {
                    Element rnaPositionElem = tempLoc.getChild("position");
                    String positionSt = (rnaPositionElem == null) ? null : rnaPositionElem.getAttributeValue("position");
                    if (positionSt == null) {
                        // Missing position child/attribute: store an empty string instead of failing.
                        positionSt = "";
                    }
                    rnaEditingCommentProperties.put(RnaEditingCommentRel.POSITION_PROPERTY, positionSt);
                    inserter.createRelationship(currentProteinId, commentTypeId, rnaEditingCommentRel, rnaEditingCommentProperties);
                }
                break;
            case PharmaceuticalCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, pharmaceuticalCommentRel, commentProperties);
                break;
            case EnzymeRegulationCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                inserter.createRelationship(currentProteinId, commentTypeId, enzymeRegulationCommentRel, commentProperties);
                break;
            case MassSpectrometryCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE:
                String methodSt = commentElem.getAttributeValue("method");
                String massSt = commentElem.getAttributeValue("mass");
                if (methodSt == null) {
                    methodSt = "";
                }
                if (massSt == null) {
                    massSt = "";
                }
                String beginSt = "";
                String endSt = "";
                Element massLocationElem = commentElem.getChild("location");
                if (massLocationElem != null) {
                    // begin/end are read from the location element (they were previously
                    // looked up on the comment element itself, bypassing the location).
                    Element beginElem = massLocationElem.getChild("begin");
                    Element endElem = massLocationElem.getChild("end");
                    if (beginElem != null) {
                        beginSt = beginElem.getAttributeValue("position");
                    }
                    if (endElem != null) {
                        endSt = endElem.getAttributeValue("position");
                    }
                }
                if (beginSt == null) {
                    beginSt = "";
                }
                if (endSt == null) {
                    endSt = "";
                }
                massSpectrometryCommentProperties.put(MassSpectrometryCommentRel.STATUS_PROPERTY, commentStatusSt);
                massSpectrometryCommentProperties.put(MassSpectrometryCommentRel.EVIDENCE_PROPERTY, commentEvidenceSt);
                massSpectrometryCommentProperties.put(MassSpectrometryCommentRel.TEXT_PROPERTY, commentTextSt);
                massSpectrometryCommentProperties.put(MassSpectrometryCommentRel.METHOD_PROPERTY, methodSt);
                massSpectrometryCommentProperties.put(MassSpectrometryCommentRel.MASS_PROPERTY, massSt);
                massSpectrometryCommentProperties.put(MassSpectrometryCommentRel.BEGIN_PROPERTY, beginSt);
                massSpectrometryCommentProperties.put(MassSpectrometryCommentRel.END_PROPERTY, endSt);
                inserter.createRelationship(currentProteinId, commentTypeId, massSpectrometryCommentRel, massSpectrometryCommentProperties);
                break;
        }
    }
}
/**
 * Reads the full name found under the recommended-name child of a protein element.
 *
 * @param proteinElement the protein element, may be {@code null}
 * @return the empty string when {@code proteinElement} or its recommended-name child is
 *         absent; otherwise whatever {@code getChildText} yields for the full-name tag
 *         (JDOM documents {@code null} for a missing child, so callers should null-check)
 */
private static String getProteinFullName(Element proteinElement) {
    if (proteinElement == null) {
        return "";
    }
    Element recommendedName = proteinElement.getChild(UniprotStuff.PROTEIN_RECOMMENDED_NAME_TAG_NAME);
    return (recommendedName == null)
            ? ""
            : recommendedName.getChildText(UniprotStuff.PROTEIN_FULL_NAME_TAG_NAME);
}
/**
 * Reads the short name found under the recommended-name child of a protein element.
 *
 * @param proteinElement the protein element, may be {@code null}
 * @return the empty string when {@code proteinElement} or its recommended-name child is
 *         absent; otherwise whatever {@code getChildText} yields for the short-name tag
 *         (JDOM documents {@code null} for a missing child, so callers should null-check)
 */
private static String getProteinShortName(Element proteinElement) {
    if (proteinElement == null) {
        return "";
    }
    Element recommendedName = proteinElement.getChild(UniprotStuff.PROTEIN_RECOMMENDED_NAME_TAG_NAME);
    return (recommendedName == null)
            ? ""
            : recommendedName.getChildText(UniprotStuff.PROTEIN_SHORT_NAME_TAG_NAME);
}
/**
 * Creates an isoform node and registers it in the isoform id index and the node type index.
 * The indexes are not flushed here.
 *
 * @param isoformProperties properties of the new node; the id property is used as index value
 * @param inserter          batch inserter creating the node
 * @param isoformIdIndex    index receiving the isoform id entry
 * @param nodeTypeIndex     index receiving the node type entry
 * @return the id of the created node
 */
private static long createIsoformNode(Map<String, Object> isoformProperties,
        BatchInserter inserter,
        BatchInserterIndex isoformIdIndex,
        BatchInserterIndex nodeTypeIndex) {
    final long nodeId = inserter.createNode(isoformProperties);
    final Object indexedId = isoformProperties.get(IsoformNode.ID_PROPERTY);
    isoformIdIndex.add(nodeId, MapUtil.map(IsoformNode.ISOFORM_ID_INDEX, indexedId));
    // register the node under its type as well
    nodeTypeIndex.add(nodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, IsoformNode.NODE_TYPE));
    return nodeId;
}
/**
 * Creates a taxon node and registers it in the taxon name index and the node type index.
 * The indexes are not flushed here.
 *
 * @param taxonProperties properties of the new node; the name property is used as index value
 * @param inserter        batch inserter creating the node
 * @param taxonNameIndex  index receiving the taxon name entry
 * @param nodeTypeIndex   index receiving the node type entry
 * @return the id of the created node
 */
private static long createTaxonNode(Map<String, Object> taxonProperties,
        BatchInserter inserter,
        BatchInserterIndex taxonNameIndex,
        BatchInserterIndex nodeTypeIndex) {
    final long nodeId = inserter.createNode(taxonProperties);
    final Object indexedName = taxonProperties.get(TaxonNode.NAME_PROPERTY);
    taxonNameIndex.add(nodeId, MapUtil.map(TaxonNode.TAXON_NAME_INDEX, indexedName));
    // register the node under its type as well
    nodeTypeIndex.add(nodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, TaxonNode.NODE_TYPE));
    return nodeId;
}
/**
 * Creates a person node and registers it in the given person name index and the node type index.
 * The indexes are not flushed here.
 *
 * @param personProperties properties of the new node; the name property is used as index value
 * @param inserter         batch inserter creating the node
 * @param index            index receiving the person name entry
 * @param nodeTypeIndex    index receiving the node type entry
 * @return the id of the created node
 */
private static long createPersonNode(Map<String, Object> personProperties,
        BatchInserter inserter,
        BatchInserterIndex index,
        BatchInserterIndex nodeTypeIndex) {
    final long nodeId = inserter.createNode(personProperties);
    final Object indexedName = personProperties.get(PersonNode.NAME_PROPERTY);
    index.add(nodeId, MapUtil.map(PersonNode.PERSON_NAME_FULL_TEXT_INDEX, indexedName));
    // register the node under its type as well
    nodeTypeIndex.add(nodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, PersonNode.NODE_TYPE));
    return nodeId;
}
/**
 * Creates a consortium node and registers it in the given consortium name index and the
 * node type index. The indexes are not flushed here.
 *
 * @param consortiumProperties properties of the new node; the name property is used as index value
 * @param inserter             batch inserter creating the node
 * @param index                index receiving the consortium name entry
 * @param nodeTypeIndex        index receiving the node type entry
 * @return the id of the created node
 */
private static long createConsortiumNode(Map<String, Object> consortiumProperties,
        BatchInserter inserter,
        BatchInserterIndex index,
        BatchInserterIndex nodeTypeIndex) {
    final long nodeId = inserter.createNode(consortiumProperties);
    final Object indexedName = consortiumProperties.get(ConsortiumNode.NAME_PROPERTY);
    index.add(nodeId, MapUtil.map(ConsortiumNode.CONSORTIUM_NAME_INDEX, indexedName));
    // register the node under its type as well
    nodeTypeIndex.add(nodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, ConsortiumNode.NODE_TYPE));
    return nodeId;
}
/**
 * Creates an institute node and registers it in the given institute name index and the
 * node type index. The indexes are not flushed here.
 *
 * @param instituteProperties properties of the new node; the name property is used as index value
 * @param inserter            batch inserter creating the node
 * @param index               index receiving the institute name entry
 * @param nodeTypeIndex       index receiving the node type entry
 * @return the id of the created node
 */
private static long createInstituteNode(Map<String, Object> instituteProperties,
        BatchInserter inserter,
        BatchInserterIndex index,
        BatchInserterIndex nodeTypeIndex) {
    final long nodeId = inserter.createNode(instituteProperties);
    final Object indexedName = instituteProperties.get(InstituteNode.NAME_PROPERTY);
    index.add(nodeId, MapUtil.map(InstituteNode.INSTITUTE_NAME_INDEX, indexedName));
    // register the node under its type as well
    nodeTypeIndex.add(nodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, InstituteNode.NODE_TYPE));
    return nodeId;
}
/**
 * Creates a country node and registers it in the given country name index and the
 * node type index. The indexes are not flushed here.
 *
 * @param countryProperties properties of the new node; the name property is used as index value
 * @param inserter          batch inserter creating the node
 * @param index             index receiving the country name entry
 * @param nodeTypeIndex     index receiving the node type entry
 * @return the id of the created node
 */
private static long createCountryNode(Map<String, Object> countryProperties,
        BatchInserter inserter,
        BatchInserterIndex index,
        BatchInserterIndex nodeTypeIndex) {
    final long nodeId = inserter.createNode(countryProperties);
    final Object indexedName = countryProperties.get(CountryNode.NAME_PROPERTY);
    index.add(nodeId, MapUtil.map(CountryNode.COUNTRY_NAME_INDEX, indexedName));
    // register the node under its type as well
    nodeTypeIndex.add(nodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, CountryNode.NODE_TYPE));
    return nodeId;
}
/**
 * Creates a city node and registers it in the given city name index and the
 * node type index. The indexes are not flushed here.
 *
 * @param cityProperties properties of the new node; the name property is used as index value
 * @param inserter       batch inserter creating the node
 * @param index          index receiving the city name entry
 * @param nodeTypeIndex  index receiving the node type entry
 * @return the id of the created node
 */
private static long createCityNode(Map<String, Object> cityProperties,
        BatchInserter inserter,
        BatchInserterIndex index,
        BatchInserterIndex nodeTypeIndex) {
    final long nodeId = inserter.createNode(cityProperties);
    final Object indexedName = cityProperties.get(CityNode.NAME_PROPERTY);
    index.add(nodeId, MapUtil.map(CityNode.CITY_NAME_INDEX, indexedName));
    // register the node under its type as well
    nodeTypeIndex.add(nodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, CityNode.NODE_TYPE));
    return nodeId;
}
/**
 * Creates a DB node and registers it in the given DB name index and the
 * node type index. The indexes are not flushed here.
 *
 * @param dbProperties  properties of the new node; the name property is used as index value
 * @param inserter      batch inserter creating the node
 * @param index         index receiving the DB name entry
 * @param nodeTypeIndex index receiving the node type entry
 * @return the id of the created node
 */
private static long createDbNode(Map<String, Object> dbProperties,
        BatchInserter inserter,
        BatchInserterIndex index,
        BatchInserterIndex nodeTypeIndex) {
    final long nodeId = inserter.createNode(dbProperties);
    final Object indexedName = dbProperties.get(DBNode.NAME_PROPERTY);
    index.add(nodeId, MapUtil.map(DBNode.DB_NAME_INDEX, indexedName));
    // register the node under its type as well
    nodeTypeIndex.add(nodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, DBNode.NODE_TYPE));
    return nodeId;
}
/**
 * Creates a subcellular location node and registers it in the given subcellular location
 * name index and the node type index. The indexes are not flushed here.
 *
 * @param subcellularLocationProperties properties of the new node; the name property is used as index value
 * @param inserter                      batch inserter creating the node
 * @param index                         index receiving the subcellular location name entry
 * @param nodeTypeIndex                 index receiving the node type entry
 * @return the id of the created node
 */
private static long createSubcellularLocationNode(Map<String, Object> subcellularLocationProperties,
        BatchInserter inserter,
        BatchInserterIndex index,
        BatchInserterIndex nodeTypeIndex) {
    final long nodeId = inserter.createNode(subcellularLocationProperties);
    final Object indexedName = subcellularLocationProperties.get(SubcellularLocationNode.NAME_PROPERTY);
    index.add(nodeId, MapUtil.map(SubcellularLocationNode.SUBCELLULAR_LOCATION_NAME_INDEX, indexedName));
    // register the node under its type as well
    nodeTypeIndex.add(nodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, SubcellularLocationNode.NODE_TYPE));
    return nodeId;
}
private static void importProteinCitations(XMLElement entryXMLElem,
BatchInserter inserter,
BatchInserterIndexProvider indexProvider,
long currentProteinId,
UniprotDataXML uniprotDataXML) {
//-----------------create batch indexes----------------------------------
//----------------------------------------------------------------------
BatchInserterIndex personNameIndex = indexProvider.nodeIndex(PersonNode.PERSON_NAME_FULL_TEXT_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, FULL_TEXT_ST));
BatchInserterIndex consortiumNameIndex = indexProvider.nodeIndex(ConsortiumNode.CONSORTIUM_NAME_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex thesisTitleIndex = indexProvider.nodeIndex(ThesisNode.THESIS_TITLE_FULL_TEXT_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, FULL_TEXT_ST));
BatchInserterIndex instituteNameIndex = indexProvider.nodeIndex(InstituteNode.INSTITUTE_NAME_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex countryNameIndex = indexProvider.nodeIndex(CountryNode.COUNTRY_NAME_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex cityNameIndex = indexProvider.nodeIndex(CityNode.CITY_NAME_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex patentNumberIndex = indexProvider.nodeIndex(PatentNode.PATENT_NUMBER_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex bookNameIndex = indexProvider.nodeIndex(BookNode.BOOK_NAME_FULL_TEXT_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, FULL_TEXT_ST));
BatchInserterIndex publisherNameIndex = indexProvider.nodeIndex(PublisherNode.PUBLISHER_NAME_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex onlineArticleTitleIndex = indexProvider.nodeIndex(OnlineArticleNode.ONLINE_ARTICLE_TITLE_FULL_TEXT_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, FULL_TEXT_ST));
BatchInserterIndex onlineJournalNameIndex = indexProvider.nodeIndex(OnlineJournalNode.ONLINE_JOURNAL_NAME_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex submissionTitleIndex = indexProvider.nodeIndex(SubmissionNode.SUBMISSION_TITLE_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, FULL_TEXT_ST));
BatchInserterIndex articleTitleIndex = indexProvider.nodeIndex(ArticleNode.ARTICLE_TITLE_FULL_TEXT_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, FULL_TEXT_ST));
BatchInserterIndex articleDoiIdIndex = indexProvider.nodeIndex(ArticleNode.ARTICLE_DOI_ID_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex articlePubmedIdIndex = indexProvider.nodeIndex(ArticleNode.ARTICLE_PUBMED_ID_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex articleMedlineIdIndex = indexProvider.nodeIndex(ArticleNode.ARTICLE_MEDLINE_ID_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex journalNameIndex = indexProvider.nodeIndex(JournalNode.JOURNAL_NAME_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex nodeTypeIndex = indexProvider.nodeIndex(Bio4jManager.NODE_TYPE_INDEX_NAME,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex dbNameIndex = indexProvider.nodeIndex(DBNode.DB_NAME_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
//----------------------------------------------------------------------
//----------------------------------------------------------------------
List<Element> referenceList = entryXMLElem.asJDomElement().getChildren(UniprotStuff.REFERENCE_TAG_NAME);
for (Element reference : referenceList) {
List<Element> citationsList = reference.getChildren(UniprotStuff.CITATION_TAG_NAME);
for (Element citation : citationsList) {
String citationType = citation.getAttributeValue(UniprotStuff.DB_REFERENCE_TYPE_ATTRIBUTE);
List<Long> authorsPersonNodesIds = new ArrayList<>();
List<Long> authorsConsortiumNodesIds = new ArrayList<>();
List<Element> authorPersonElems = citation.getChild("authorList").getChildren("person");
List<Element> authorConsortiumElems = citation.getChild("authorList").getChildren("consortium");
for (Element person : authorPersonElems) {
//long personId = indexService.getSingleNode(PersonNode.PERSON_NAME_INDEX, person.getAttributeValue("name"));
long personId = -1;
IndexHits<Long> personNameIndexHits = personNameIndex.get(PersonNode.PERSON_NAME_FULL_TEXT_INDEX, person.getAttributeValue("name"));
if (personNameIndexHits.hasNext()) {
personId = personNameIndexHits.getSingle();
}
personNameIndexHits.close();
if (personId < 0) {
personProperties.put(PersonNode.NAME_PROPERTY, person.getAttributeValue("name"));
personId = createPersonNode(personProperties, inserter, personNameIndex, nodeTypeIndex);
//flushing person name index
personNameIndex.flush();
}
authorsPersonNodesIds.add(personId);
}
for (Element consortium : authorConsortiumElems) {
long consortiumId = -1;
IndexHits<Long> consortiumIdIndexHits = consortiumNameIndex.get(ConsortiumNode.CONSORTIUM_NAME_INDEX, consortium.getAttributeValue("name"));
if (consortiumIdIndexHits.hasNext()) {
consortiumId = consortiumIdIndexHits.getSingle();
}
consortiumIdIndexHits.close();
if (consortiumId < 0) {
consortiumProperties.put(ConsortiumNode.NAME_PROPERTY, consortium.getAttributeValue("name"));
consortiumId = createConsortiumNode(consortiumProperties, inserter, consortiumNameIndex, nodeTypeIndex);
//---flushing consortium name index--
consortiumNameIndex.flush();
}
authorsConsortiumNodesIds.add(consortiumId);
}
//----------------------------------------------------------------------------
//-----------------------------THESIS-----------------------------------------
switch (citationType) {
case ThesisNode.UNIPROT_ATTRIBUTE_TYPE_VALUE:
if (uniprotDataXML.getThesis()) {
String dateSt = citation.getAttributeValue("date");
String titleSt = citation.getChildText("title");
if (dateSt == null) {
dateSt = "";
}
if (titleSt == null) {
titleSt = "";
}
long thesisId = -1;
IndexHits<Long> thesisTitleIndexHits = thesisTitleIndex.get(ThesisNode.THESIS_TITLE_FULL_TEXT_INDEX, titleSt);
if (thesisTitleIndexHits.hasNext()) {
thesisId = thesisTitleIndexHits.getSingle();
}
thesisTitleIndexHits.close();
if (thesisId < 0) {
thesisProperties.put(ThesisNode.DATE_PROPERTY, dateSt);
thesisProperties.put(ThesisNode.TITLE_PROPERTY, titleSt);
//---thesis node creation and indexing
thesisId = inserter.createNode(thesisProperties);
nodeTypeIndex.add(thesisId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, ThesisNode.NODE_TYPE));
thesisTitleIndex.add(thesisId, MapUtil.map(ThesisNode.THESIS_TITLE_FULL_TEXT_INDEX, titleSt));
//flushing thesis title index
thesisTitleIndex.flush();
//---authors association-----
for (long personId : authorsPersonNodesIds) {
inserter.createRelationship(thesisId, personId, thesisAuthorRel, null);
}
//-----------institute-----------------------------
String instituteSt = citation.getAttributeValue("institute");
String countrySt = citation.getAttributeValue("country");
if (instituteSt != null) {
long instituteId = -1;
IndexHits<Long> instituteNameIndexHits = instituteNameIndex.get(InstituteNode.INSTITUTE_NAME_INDEX, instituteSt);
if (instituteNameIndexHits.hasNext()) {
instituteId = instituteNameIndexHits.getSingle();
}
instituteNameIndexHits.close();
if (instituteId < 0) {
instituteProperties.put(InstituteNode.NAME_PROPERTY, instituteSt);
instituteId = createInstituteNode(instituteProperties, inserter, instituteNameIndex, nodeTypeIndex);
//flushing institute name index
instituteNameIndex.flush();
}
if (countrySt != null) {
//long countryId = indexService.getSingleNode(CountryNode.COUNTRY_NAME_INDEX, countrySt);
long countryId = -1;
IndexHits<Long> countryNameIndexHits = countryNameIndex.get(CountryNode.COUNTRY_NAME_INDEX, countrySt);
if (countryNameIndexHits.hasNext()) {
countryId = countryNameIndexHits.getSingle();
}
countryNameIndexHits.close();
if (countryId < 0) {
countryProperties.put(CountryNode.NAME_PROPERTY, countrySt);
countryId = createCountryNode(countryProperties, inserter, countryNameIndex, nodeTypeIndex);
//flushing country name index
countryNameIndex.flush();
}
inserter.createRelationship(instituteId, countryId, instituteCountryRel, null);
}
inserter.createRelationship(thesisId, instituteId, thesisInstituteRel, null);
}
}
//--protein citation relationship
inserter.createRelationship(thesisId, currentProteinId, thesisProteinCitationRel, null);
}
//----------------------------------------------------------------------------
//-----------------------------PATENT-----------------------------------------
break;
case PatentNode.UNIPROT_ATTRIBUTE_TYPE_VALUE:
if (uniprotDataXML.getPatents()) {
String numberSt = citation.getAttributeValue("number");
String dateSt = citation.getAttributeValue("date");
String titleSt = citation.getChildText("title");
if (dateSt == null) {
dateSt = "";
}
if (titleSt == null) {
titleSt = "";
}
if (numberSt == null) {
numberSt = "";
}
if (!numberSt.equals("")) {
long patentId = -1;
IndexHits<Long> patentNumberIndexHits = patentNumberIndex.get(PatentNode.PATENT_NUMBER_INDEX, numberSt);
if (patentNumberIndexHits.hasNext()) {
patentId = patentNumberIndexHits.getSingle();
}
patentNumberIndexHits.close();
if (patentId < 0) {
patentProperties.put(PatentNode.NUMBER_PROPERTY, numberSt);
patentProperties.put(PatentNode.DATE_PROPERTY, dateSt);
patentProperties.put(PatentNode.TITLE_PROPERTY, titleSt);
//---patent node creation and indexing
patentId = inserter.createNode(patentProperties);
patentNumberIndex.add(patentId, MapUtil.map(PatentNode.PATENT_NUMBER_INDEX, numberSt));
nodeTypeIndex.add(patentId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, PatentNode.NODE_TYPE));
//---flushing patent number index---
patentNumberIndex.flush();
//---authors association-----
for (long personId : authorsPersonNodesIds) {
inserter.createRelationship(patentId, personId, patentAuthorRel, null);
}
}
//--protein citation relationship
inserter.createRelationship(patentId, currentProteinId, patentProteinCitationRel, null);
}
}
//----------------------------------------------------------------------------
//-----------------------------SUBMISSION-----------------------------------------
break;
case SubmissionNode.UNIPROT_ATTRIBUTE_TYPE_VALUE:
if (uniprotDataXML.getSubmissions()) {
String dateSt = citation.getAttributeValue("date");
String titleSt = citation.getChildText("title");
String dbSt = citation.getAttributeValue("db");
if (dateSt == null) {
dateSt = "";
}
if (titleSt == null) {
titleSt = "";
}
submissionProperties.put(SubmissionNode.DATE_PROPERTY, dateSt);
submissionProperties.put(SubmissionNode.TITLE_PROPERTY, titleSt);
long submissionId;
IndexHits<Long> submissionTitleIndexHits = submissionTitleIndex.get(SubmissionNode.SUBMISSION_TITLE_INDEX, titleSt);
if (submissionTitleIndexHits.hasNext()) {
submissionId = submissionTitleIndexHits.getSingle();
submissionTitleIndexHits.close();
} else {
//---submission node creation and indexing
submissionId = inserter.createNode(submissionProperties);
//--indexing node by type---
nodeTypeIndex.add(submissionId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, SubmissionNode.NODE_TYPE));
if (!titleSt.isEmpty()) {
//--indexing node by title---
submissionTitleIndex.add(submissionId, MapUtil.map(SubmissionNode.SUBMISSION_TITLE_INDEX, titleSt));
submissionTitleIndex.flush();
}
}
//---authors association-----
for (long personId : authorsPersonNodesIds) {
inserter.createRelationship(submissionId, personId, submissionAuthorRel, null);
}
//---authors consortium association-----
for (long consortiumId : authorsConsortiumNodesIds) {
inserter.createRelationship(submissionId, consortiumId, submissionAuthorRel, null);
}
if (dbSt != null) {
long dbId = -1;
IndexHits<Long> dbNameIndexHits = dbNameIndex.get(DBNode.DB_NAME_INDEX, dbSt);
if (dbNameIndexHits.hasNext()) {
dbId = dbNameIndexHits.getSingle();
}
dbNameIndexHits.close();
if (dbId < 0) {
dbProperties.put(DBNode.NODE_TYPE_PROPERTY, DBNode.NODE_TYPE);
dbProperties.put(DBNode.NAME_PROPERTY, dbSt);
dbId = createDbNode(dbProperties, inserter, dbNameIndex, nodeTypeIndex);
dbNameIndex.flush();
}
//-----submission db relationship-----
inserter.createRelationship(submissionId, dbId, submissionDbRel, null);
}
//--protein citation relationship
inserter.createRelationship(submissionId, currentProteinId, submissionProteinCitationRel, null);
}
//----------------------------------------------------------------------------
//-----------------------------BOOK-----------------------------------------
break;
case BookNode.UNIPROT_ATTRIBUTE_TYPE_VALUE:
if (uniprotDataXML.getBooks()) {
String nameSt = citation.getAttributeValue("name");
String dateSt = citation.getAttributeValue("date");
String titleSt = citation.getChildText("title");
String publisherSt = citation.getAttributeValue("publisher");
String firstSt = citation.getAttributeValue("first");
String lastSt = citation.getAttributeValue("last");
String citySt = citation.getAttributeValue("city");
String volumeSt = citation.getAttributeValue("volume");
if (nameSt == null) {
nameSt = "";
}
if (dateSt == null) {
dateSt = "";
}
if (titleSt == null) {
titleSt = "";
}
if (publisherSt == null) {
publisherSt = "";
}
if (firstSt == null) {
firstSt = "";
}
if (lastSt == null) {
lastSt = "";
}
if (citySt == null) {
citySt = "";
}
if (volumeSt == null) {
volumeSt = "";
}
long bookId = -1;
IndexHits<Long> bookNameIndexHits = bookNameIndex.get(BookNode.BOOK_NAME_FULL_TEXT_INDEX, nameSt);
if (bookNameIndexHits.hasNext()) {
bookId = bookNameIndexHits.getSingle();
}
bookNameIndexHits.close();
if (bookId < 0) {
bookProperties.put(BookNode.NAME_PROPERTY, nameSt);
bookProperties.put(BookNode.DATE_PROPERTY, dateSt);
//---book node creation and indexing
bookId = inserter.createNode(bookProperties);
bookNameIndex.add(bookId, MapUtil.map(BookNode.BOOK_NAME_FULL_TEXT_INDEX, nameSt));
//--indexing node by type---
nodeTypeIndex.add(bookId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, BookNode.NODE_TYPE));
//--flushing book name index---
bookNameIndex.flush();
//---authors association-----
for (long personId : authorsPersonNodesIds) {
inserter.createRelationship(bookId, personId, bookAuthorRel, null);
}
//---editor association-----
Element editorListElem = citation.getChild("editorList");
if (editorListElem != null) {
List<Element> editorsElems = editorListElem.getChildren("person");
for (Element person : editorsElems) {
//long editorId = indexService.getSingleNode(PersonNode.PERSON_NAME_INDEX, person.getAttributeValue("name"));
long editorId = -1;
IndexHits<Long> personNameIndexHits = personNameIndex.get(PersonNode.PERSON_NAME_FULL_TEXT_INDEX, person.getAttributeValue("name"));
if (personNameIndexHits.hasNext()) {
editorId = personNameIndexHits.getSingle();
}
personNameIndexHits.close();
if (editorId < 0) {
personProperties.put(PersonNode.NAME_PROPERTY, person.getAttributeValue("name"));
editorId = createPersonNode(personProperties, inserter, personNameIndex, nodeTypeIndex);
}
//---flushing person name index---
personNameIndex.flush();
//editor association
inserter.createRelationship(bookId, editorId, bookEditorRel, null);
}
}
//----publisher--
if (!publisherSt.equals("")) {
//long publisherId = indexService.getSingleNode(PublisherNode.PUBLISHER_NAME_INDEX, publisherSt);
long publisherId = -1;
IndexHits<Long> publisherNameIndexHits = publisherNameIndex.get(PublisherNode.PUBLISHER_NAME_INDEX, publisherSt);
if (publisherNameIndexHits.hasNext()) {
publisherId = publisherNameIndexHits.getSingle();
}
publisherNameIndexHits.close();
if (publisherId < 0) {
publisherProperties.put(PublisherNode.NAME_PROPERTY, publisherSt);
publisherId = inserter.createNode(publisherProperties);
//--indexing node by type---
nodeTypeIndex.add(publisherId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, PublisherNode.NODE_TYPE));
publisherNameIndex.add(publisherId, MapUtil.map(PublisherNode.PUBLISHER_NAME_INDEX, publisherSt));
//--flushing publisher name index--
publisherNameIndex.flush();
}
inserter.createRelationship(bookId, publisherId, bookPublisherRel, null);
}
//-----city-----
if (!citySt.equals("")) {
//long cityId = indexService.getSingleNode(CityNode.CITY_NAME_INDEX, citySt);
long cityId = -1;
IndexHits<Long> cityNameIndexHits = cityNameIndex.get(CityNode.CITY_NAME_INDEX, citySt);
if (cityNameIndexHits.hasNext()) {
cityId = cityNameIndexHits.getSingle();
}
cityNameIndexHits.close();
if (cityId < 0) {
cityProperties.put(CityNode.NAME_PROPERTY, citySt);
cityId = createCityNode(cityProperties, inserter, cityNameIndex, nodeTypeIndex);
//-----flushing city name index---
cityNameIndex.flush();
}
inserter.createRelationship(bookId, cityId, bookCityRel, null);
}
}
bookProteinCitationProperties.put(BookProteinCitationRel.FIRST_PROPERTY, firstSt);
bookProteinCitationProperties.put(BookProteinCitationRel.LAST_PROPERTY, lastSt);
bookProteinCitationProperties.put(BookProteinCitationRel.VOLUME_PROPERTY, volumeSt);
bookProteinCitationProperties.put(BookProteinCitationRel.TITLE_PROPERTY, titleSt);
//--protein citation relationship
inserter.createRelationship(bookId, currentProteinId, bookProteinCitationRel, bookProteinCitationProperties);
}
//----------------------------------------------------------------------------
//-----------------------------ONLINE ARTICLE-----------------------------------------
break;
case OnlineArticleNode.UNIPROT_ATTRIBUTE_TYPE_VALUE:
if (uniprotDataXML.getOnlineArticles()) {
String locatorSt = citation.getChildText("locator");
String nameSt = citation.getAttributeValue("name");
String titleSt = citation.getChildText("title");
if (titleSt == null) {
titleSt = "";
}
if (nameSt == null) {
nameSt = "";
}
if (locatorSt == null) {
locatorSt = "";
}
long onlineArticleId = -1;
IndexHits<Long> onlineArticleTitleIndexHits = onlineArticleTitleIndex.get(OnlineArticleNode.ONLINE_ARTICLE_TITLE_FULL_TEXT_INDEX, titleSt);
if (onlineArticleTitleIndexHits.hasNext()) {
onlineArticleId = onlineArticleTitleIndexHits.getSingle();
}
onlineArticleTitleIndexHits.close();
if (onlineArticleId < 0) {
onlineArticleProperties.put(OnlineArticleNode.TITLE_PROPERTY, titleSt);
onlineArticleId = inserter.createNode(onlineArticleProperties);
//--indexing node by type---
nodeTypeIndex.add(onlineArticleId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, OnlineArticleNode.NODE_TYPE));
if (!titleSt.equals("")) {
onlineArticleTitleIndex.add(onlineArticleId, MapUtil.map(OnlineArticleNode.ONLINE_ARTICLE_TITLE_FULL_TEXT_INDEX, titleSt));
//-----flushing online article title index---
onlineArticleTitleIndex.flush();
}
//---authors person association-----
for (long personId : authorsPersonNodesIds) {
inserter.createRelationship(onlineArticleId, personId, onlineArticleAuthorRel, null);
}
//---authors consortium association-----
for (long consortiumId : authorsConsortiumNodesIds) {
inserter.createRelationship(onlineArticleId, consortiumId, onlineArticleAuthorRel, null);
}
//------online journal-----------
if (!nameSt.equals("")) {
long onlineJournalId = -1;
IndexHits<Long> onlineJournalNameIndexHits = onlineJournalNameIndex.get(OnlineJournalNode.ONLINE_JOURNAL_NAME_INDEX, nameSt);
if (onlineJournalNameIndexHits.hasNext()) {
onlineJournalId = onlineJournalNameIndexHits.getSingle();
}
onlineJournalNameIndexHits.close();
if (onlineJournalId < 0) {
onlineJournalProperties.put(OnlineJournalNode.NAME_PROPERTY, nameSt);
onlineJournalId = inserter.createNode(onlineJournalProperties);
//--indexing node by type---
nodeTypeIndex.add(onlineJournalId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, OnlineJournalNode.NODE_TYPE));
onlineJournalNameIndex.add(onlineJournalId, MapUtil.map(OnlineJournalNode.ONLINE_JOURNAL_NAME_INDEX, nameSt));
//---flushing online journal name index---
onlineJournalNameIndex.flush();
}
onlineArticleJournalProperties.put(OnlineArticleJournalRel.LOCATOR_PROPERTY, locatorSt);
inserter.createRelationship(onlineArticleId, onlineJournalId, onlineArticleJournalRel, onlineArticleJournalProperties);
}
//----------------------------
}
//protein citation
inserter.createRelationship(onlineArticleId, currentProteinId, onlineArticleProteinCitationRel, null);
}
//----------------------------------------------------------------------------
//-----------------------------ARTICLE-----------------------------------------
break;
case ArticleNode.UNIPROT_ATTRIBUTE_TYPE_VALUE:
if (uniprotDataXML.getArticles()) {
String journalNameSt = citation.getAttributeValue("name");
String dateSt = citation.getAttributeValue("date");
String titleSt = citation.getChildText("title");
String firstSt = citation.getAttributeValue("first");
String lastSt = citation.getAttributeValue("last");
String volumeSt = citation.getAttributeValue("volume");
String doiSt = "";
String medlineSt = "";
String pubmedSt = "";
if (journalNameSt == null) {
journalNameSt = "";
}
if (dateSt == null) {
dateSt = "";
}
if (firstSt == null) {
firstSt = "";
}
if (lastSt == null) {
lastSt = "";
}
if (volumeSt == null) {
volumeSt = "";
}
if (titleSt == null) {
titleSt = "";
}
List<Element> dbReferences = citation.getChildren("dbReference");
for (Element tempDbRef : dbReferences) {
switch (tempDbRef.getAttributeValue("type")) {
case "DOI":
doiSt = tempDbRef.getAttributeValue("id");
break;
case "MEDLINE":
medlineSt = tempDbRef.getAttributeValue("id");
break;
case "PubMed":
pubmedSt = tempDbRef.getAttributeValue("id");
break;
}
}
//long articleId = indexService.getSingleNode(ArticleNode.ARTICLE_TITLE_FULL_TEXT_INDEX, titleSt);
long articleId = -1;
IndexHits<Long> articleTitleIndexHits = articleTitleIndex.get(ArticleNode.ARTICLE_TITLE_FULL_TEXT_INDEX, titleSt);
if (articleTitleIndexHits.hasNext()) {
articleId = articleTitleIndexHits.getSingle();
}
articleTitleIndexHits.close();
if (articleId < 0) {
articleProperties.put(ArticleNode.TITLE_PROPERTY, titleSt);
articleProperties.put(ArticleNode.DOI_ID_PROPERTY, doiSt);
articleProperties.put(ArticleNode.MEDLINE_ID_PROPERTY, medlineSt);
articleProperties.put(ArticleNode.PUBMED_ID_PROPERTY, pubmedSt);
articleId = inserter.createNode(articleProperties);
//--indexing node by type---
nodeTypeIndex.add(articleId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, ArticleNode.NODE_TYPE));
if (!titleSt.equals("")) {
articleTitleIndex.add(articleId, MapUtil.map(ArticleNode.ARTICLE_TITLE_FULL_TEXT_INDEX, titleSt));
//--flushing article title index---
articleTitleIndex.flush();
}
//---indexing by medline, doi and pubmed--
if (!doiSt.isEmpty()) {
articleDoiIdIndex.add(articleId, MapUtil.map(ArticleNode.ARTICLE_DOI_ID_INDEX, doiSt));
}
if (!medlineSt.isEmpty()) {
articleMedlineIdIndex.add(articleId, MapUtil.map(ArticleNode.ARTICLE_MEDLINE_ID_INDEX, medlineSt));
}
if (!pubmedSt.isEmpty()) {
articlePubmedIdIndex.add(articleId, MapUtil.map(ArticleNode.ARTICLE_PUBMED_ID_INDEX, pubmedSt));
}
//---authors person association-----
for (long personId : authorsPersonNodesIds) {
inserter.createRelationship(articleId, personId, articleAuthorRel, null);
}
//---authors consortium association-----
for (long consortiumId : authorsConsortiumNodesIds) {
inserter.createRelationship(articleId, consortiumId, articleAuthorRel, null);
}
//------journal-----------
if (!journalNameSt.equals("")) {
//long journalId = indexService.getSingleNode(JournalNode.JOURNAL_NAME_INDEX, journalNameSt);
long journalId = -1;
IndexHits<Long> journalNameIndexHits = journalNameIndex.get(JournalNode.JOURNAL_NAME_INDEX, journalNameSt);
if (journalNameIndexHits.hasNext()) {
journalId = journalNameIndexHits.getSingle();
}
journalNameIndexHits.close();
if (journalId < 0) {
journalProperties.put(JournalNode.NAME_PROPERTY, journalNameSt);
journalId = inserter.createNode(journalProperties);
//--indexing node by type---
nodeTypeIndex.add(journalId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, JournalNode.NODE_TYPE));
journalNameIndex.add(journalId, MapUtil.map(JournalNode.JOURNAL_NAME_INDEX, journalNameSt));
//----flushing journal name index----
journalNameIndex.flush();
}
articleJournalProperties.put(ArticleJournalRel.DATE_PROPERTY, dateSt);
articleJournalProperties.put(ArticleJournalRel.FIRST_PROPERTY, firstSt);
articleJournalProperties.put(ArticleJournalRel.LAST_PROPERTY, lastSt);
articleJournalProperties.put(ArticleJournalRel.VOLUME_PROPERTY, volumeSt);
inserter.createRelationship(articleId, journalId, articleJournalRel, articleJournalProperties);
}
//----------------------------
}
//protein citation
inserter.createRelationship(articleId, currentProteinId, articleProteinCitationRel, null);
}
//----------------------------------------------------------------------------
//----------------------UNPUBLISHED OBSERVATIONS-----------------------------------------
break;
case UnpublishedObservationNode.UNIPROT_ATTRIBUTE_TYPE_VALUE:
if (uniprotDataXML.getUnpublishedObservations()) {
String dateSt = citation.getAttributeValue("date");
if (dateSt == null) {
dateSt = "";
}
unpublishedObservationProperties.put(UnpublishedObservationNode.DATE_PROPERTY, dateSt);
long unpublishedObservationId = inserter.createNode(unpublishedObservationProperties);
//--indexing node by type---
nodeTypeIndex.add(unpublishedObservationId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, UnpublishedObservationNode.NODE_TYPE));
//---authors person association-----
for (long personId : authorsPersonNodesIds) {
inserter.createRelationship(unpublishedObservationId, personId, unpublishedObservationAuthorRel, null);
}
inserter.createRelationship(unpublishedObservationId, currentProteinId, unpublishedObservationProteinCitationRel, null);
}
break;
}
}
}
}
/**
 * Copies the elements of a string list into a newly allocated array,
 * preserving iteration order.
 *
 * @param list the strings to copy; must not be {@code null}
 * @return a new array holding the same elements in the same order;
 *         empty when {@code list} is empty
 * @throws NullPointerException if {@code list} is {@code null}
 */
private static String[] convertToStringArray(List<String> list) {
    // Delegate to the collection's own bulk copy rather than calling
    // get(i) per element, which is linear-time per call on linked lists.
    return list.toArray(new String[0]);
}
}