/** * Copyright 2014 IHTSDO * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ihtsdo.otf.snomed.loader; import static org.ihtsdo.otf.snomed.loader.RF2ImportHelper.characteristicsMap; import static org.ihtsdo.otf.snomed.loader.RF2ImportHelper.descMap; import static org.ihtsdo.otf.snomed.loader.RF2ImportHelper.relTypeMap; import static org.ihtsdo.otf.snomed.loader.RF2ImportHelper.vMap; import static org.ihtsdo.otf.snomed.loader.RF2ImportHelper.getVertex; import static org.ihtsdo.otf.snomed.loader.RF2ImportHelper.processCaseSinificance; import static org.ihtsdo.otf.snomed.loader.RF2ImportHelper.processDefinitionStatus; import static org.ihtsdo.otf.snomed.loader.RF2ImportHelper.processModifier; import static org.ihtsdo.otf.snomed.loader.RF2ImportHelper.processModule; import static org.ihtsdo.otf.snomed.loader.RF2ImportHelper.processType; import java.io.FileInputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.HashMap; import java.util.zip.GZIPInputStream; import org.apache.commons.lang3.StringUtils; import org.ihtsdo.otf.snomed.domain.DescriptionType; import org.ihtsdo.otf.snomed.domain.Properties; import org.ihtsdo.otf.snomed.domain.Relationship; import org.ihtsdo.otf.snomed.domain.Types; import org.joda.time.DateTime; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.supercsv.cellprocessor.ift.CellProcessor; import org.supercsv.io.CsvBeanReader; import org.supercsv.io.ICsvBeanReader; import 
org.supercsv.prefs.CsvPreference;

import com.thinkaurelius.titan.core.TitanException;
import com.thinkaurelius.titan.core.TitanGraph;
import com.tinkerpop.blueprints.Edge;
import com.tinkerpop.blueprints.Vertex;

/**
 * Loads RF2 snapshot files (concepts, descriptions and relationships) into a
 * {@link TitanGraph}.
 *
 * <p>The kind of file is detected from the number of columns in its header row:
 * 5 columns are treated as a concept file, 9 as a description file and 10 as a
 * relationship file. Rows are written through the supplied graph and committed
 * every {@code bufferSize} rows.
 */
public class Rf2SnapshotLoader {

    private static final Logger LOGGER = LoggerFactory.getLogger(Rf2SnapshotLoader.class);

    /** Value stored in the {@code createdBy} property of everything this loader creates. */
    private static final String SNAPSHOT_USER = "system";

    /** Header widths used to recognise the three supported file kinds. */
    private static final int CONCEPT_COLUMNS = 5;
    private static final int DESCRIPTION_COLUMNS = 9;
    private static final int RELATIONSHIP_COLUMNS = 10;

    /** RF2 release files are UTF-8 encoded, so never fall back to the platform charset. */
    private static final String RF2_CHARSET = "UTF-8";

    private final TitanGraph g;
    private int bufferSize = 1000;
    boolean isReload = false;

    /**
     * Creates a loader that writes into the given graph.
     *
     * @param g graph to load into, must not be {@code null}
     * @throws IllegalArgumentException if {@code g} is {@code null}
     */
    public Rf2SnapshotLoader(TitanGraph g) {
        LOGGER.info("Initializing graph {}" , g);
        if (g == null) {
            throw new IllegalArgumentException("Graph instance is required for snapshot loading. Can not continue");
        }
        this.g = g;
    }

    /**
     * Reads one tab separated RF2 file and stores every row in the graph.
     *
     * <p>Files whose name ends with {@code .gz} are decompressed on the fly. When
     * reload mode is on, rows whose id already resolves to a vertex are skipped.
     * On any failure the running transaction is rolled back.
     *
     * @param file path of the RF2 file to load
     * @throws RuntimeException wrapping the underlying cause if the path is blank,
     *         the file cannot be read, its header does not have 5, 9 or 10 columns,
     *         or a row cannot be stored
     */
    @SuppressWarnings("unchecked")
    public void load(String file) {

        LOGGER.debug("Starting to load file {}", file);
        long start = System.currentTimeMillis();
        long totalRow = 0;

        ICsvBeanReader beanReader = null;
        InputStreamReader reader = null;
        try {

            if (StringUtils.isBlank(file)) {
                throw new IllegalArgumentException("Please check file supplied.");
            }
            reader = openReader(file);

            // NOTE(review): '"' is configured as quote character; presumably RF2 terms
            // may themselves contain double quotes - confirm this does not mis-parse them.
            final CsvPreference RF2_PREF = new CsvPreference.Builder('"', '\t', "\r\n").build();
            beanReader = new CsvBeanReader(reader, RF2_PREF);

            final String[] header = beanReader.getHeader(true);
            final int noOfColumns = header == null ? 0 : header.length;
            LOGGER.debug("noOfColumns {} in the file", noOfColumns);

            CellProcessor[] processors;
            @SuppressWarnings("rawtypes")
            Class rf2Base;
            switch (noOfColumns) {
                case DESCRIPTION_COLUMNS:
                    LOGGER.debug("Processing description file");
                    processors = RF2CellProcessor.getDescriptionCellProcessor();
                    rf2Base = Rf2Description.class;
                    break;
                case RELATIONSHIP_COLUMNS:
                    LOGGER.debug("Processing relationship file");
                    processors = RF2CellProcessor.getRelationshipCellProcessor();
                    rf2Base = Rf2Relationship.class;
                    break;
                case CONCEPT_COLUMNS:
                    LOGGER.debug("Processing concept file");
                    processors = RF2CellProcessor.getConceptCellProcessor();
                    rf2Base = Rf2Concept.class;
                    break;
                default:
                    LOGGER.debug("Nothing to load");
                    // Fail with a meaningful cause before a transaction is started,
                    // instead of handing a null bean class to the reader.
                    throw new IllegalArgumentException("Unsupported RF2 file " + file
                            + ": expected 5, 9 or 10 columns but found " + noOfColumns);
            }

            Rf2Base bean;
            beginTx();
            while ((bean = (Rf2Base) beanReader.read(rf2Base, header, processors)) != null) {

                if (isReload && getVertex(g, bean.getId()) != null) {
                    LOGGER.debug("Not Processing lineNo={}, rowNo={} as record already loaded",
                            beanReader.getLineNumber(), beanReader.getRowNumber());
                    continue;
                }

                LOGGER.debug("Processing lineNo={}, rowNo={} ", beanReader.getLineNumber(), beanReader.getRowNumber());
                processRow(noOfColumns, bean);
                commit(beanReader.getRowNumber());
            }

            LOGGER.info("Commiting remaining data");
            g.commit(); // rows written since the last buffered commit
            totalRow = beanReader.getRowNumber();

        } catch (Exception e) {
            // I/O failures can also happen mid-file, so every failure path rolls back.
            LOGGER.error("Transaction rolledback", e);
            rollbackQuietly();
            throw new RuntimeException("Can not process file", e);
        } finally {
            closeResources(beanReader, reader);
        }

        LOGGER.info("Total row {} processed in {} minute ", totalRow, ((System.currentTimeMillis() - start)/60000));
    }

    /**
     * Opens the file as a UTF-8 character stream, transparently unzipping
     * {@code .gz} files. The file stream is released again if wrapping it fails.
     */
    private static InputStreamReader openReader(String file) throws IOException {
        FileInputStream raw = new FileInputStream(file);
        boolean opened = false;
        try {
            InputStreamReader result = file.endsWith(".gz")
                    ? new InputStreamReader(new GZIPInputStream(raw), RF2_CHARSET)
                    : new InputStreamReader(raw, RF2_CHARSET);
            opened = true;
            return result;
        } finally {
            if (!opened) {
                try {
                    raw.close();
                } catch (IOException closeFailure) {
                    LOGGER.error("Error closing file stream", closeFailure);
                }
            }
        }
    }

    /** Routes one parsed row to the processor matching the detected file kind. */
    private void processRow(int noOfColumns, Rf2Base bean) {
        switch (noOfColumns) {
            case DESCRIPTION_COLUMNS:
                processDescription((Rf2Description) bean);
                break;
            case RELATIONSHIP_COLUMNS:
                processRelationship((Rf2Relationship) bean);
                break;
            case CONCEPT_COLUMNS:
                processConcept((Rf2Concept) bean);
                break;
            default:
                break;
        }
    }

    /** Rolls the graph back without letting a rollback failure hide the original error. */
    private void rollbackQuietly() {
        try {
            g.rollback();
        } catch (RuntimeException rollbackFailure) {
            LOGGER.error("Error rolling back transaction", rollbackFailure);
        }
    }

    /** Closes both readers independently so a failure on one does not leak the other. */
    private static void closeResources(ICsvBeanReader beanReader, InputStreamReader reader) {
        if (beanReader == null && reader == null) {
            return;
        }
        LOGGER.info("Closing IO resources");
        if (beanReader != null) {
            try {
                beanReader.close();
            } catch (IOException e) {
                LOGGER.error("Error closing bean reader", e);
            }
        }
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                LOGGER.error("Error closing file reader", e);
            }
        }
    }

    /**
     * Commits the running transaction whenever {@code rowNumber} is a multiple of
     * the buffer size and then starts the next batch. A failed attempt is retried
     * once; the second failure is rethrown.
     *
     * @param rowNumber row number reported by the reader for the row just processed
     */
    private void commit(int rowNumber) {
        if (rowNumber % bufferSize != 0) {
            return;
        }

        LOGGER.info("Committing running transaction");
        try {
            commitAndRestart(rowNumber);
        } catch (TitanException firstFailure) {
            LOGGER.warn("Commit failed at row {}, retrying once", rowNumber, firstFailure);
            try {
                commitAndRestart(rowNumber);
            } catch (TitanException secondFailure) {
                LOGGER.error("Error commiting transaction {}", rowNumber, secondFailure);
                throw secondFailure;
            }
        }
    }

    /** Single commit attempt followed by the start of the next batch. */
    private void commitAndRestart(int rowNumber) {
        g.commit();
        beginTx();
        LOGGER.info("Total concept processed {}", rowNumber);
    }

    /**
     * Prepares the next batch: pauses for two seconds, touches the graph's
     * transaction API and replaces the shared vertex cache.
     */
    private void beginTx() {
        LOGGER.info("Starting a new transaction");
        try {
            Thread.sleep(2000);
        } catch (InterruptedException e) {
            // Keep the interrupt visible to callers instead of silently dropping it.
            Thread.currentThread().interrupt();
            LOGGER.warn("Interrupted while pausing before a new transaction", e);
        }
        // NOTE(review): the builder result and the new transaction are both discarded
        // here while every write in this class goes through 'g' directly. Presumably
        // batch loading is therefore not applied to those writes and the discarded
        // transaction is never closed - confirm against the Titan API before changing.
        g.buildTransaction().enableBatchLoading();
        g.newTransaction();
        vMap = new HashMap<String, Vertex>();//refresh map as vertex are stale after commit
    }

    /**
     * Creates a concept vertex with its scalar properties and links it to its
     * module and definition status vertices.
     *
     * @param bean concept row to store
     */
    private void processConcept(Rf2Concept bean) {

        long start = System.currentTimeMillis();
        LOGGER.debug("Processing concept {}", bean.getId());

        Vertex vC = g.addVertexWithLabel(Types.concept.toString());
        // NOTE(review): the other processors pass Properties.sctid.toString() directly;
        // presumably this schema lookup resolves to the same key name - confirm.
        vC.setProperty(g.getRelationType(Properties.sctid.toString()).toString(), bean.getId());
        vC.setProperty(Properties.effectiveTime.toString(), bean.getEffectiveTime().getMillis());
        vC.setProperty(Properties.status.toString(), bean.getActive());
        vC.setProperty(Properties.created.toString(), new DateTime().getMillis());
        vC.setProperty(Properties.createdBy.toString(), SNAPSHOT_USER);

        // module
        Vertex vM = processModule(g, bean.getModuleId());
        vC.addEdge(Relationship.hasModule.toString(), vM);

        // definition status
        Vertex vDs = processDefinitionStatus(g, bean.getDefinitionStatusId());
        vC.addEdge(Relationship.ds.toString(), vDs);

        LOGGER.trace("processConcept total time {} sec ", (System.currentTimeMillis() - start)/1000);
    }

    /**
     * Creates a description vertex, links it to its module, case significance and
     * type vertices and, when the owning concept can be found, connects the concept
     * to the description with an edge labelled after the description type. A fully
     * specified name is additionally copied to the concept's title.
     *
     * @param desc description row to store
     * @throws IllegalStateException if the concept exists but the description's
     *         type id has no entry in the description type map
     */
    private void processDescription(Rf2Description desc) {

        long start = System.currentTimeMillis();
        LOGGER.debug("Processing description {}", desc.getId());

        Vertex vD = g.addVertexWithLabel(Types.description.toString());
        vD.setProperty(Properties.sctid.toString(), desc.getId());
        vD.setProperty(Properties.effectiveTime.toString(), desc.getEffectiveTime().getMillis());
        vD.setProperty(Properties.status.toString(), desc.getActive());
        vD.setProperty(Properties.created.toString(), new DateTime().getMillis());
        vD.setProperty(Properties.createdBy.toString(), SNAPSHOT_USER);
        vD.setProperty(Properties.languageCode.toString(), desc.getLanguageCode());
        vD.setProperty(Properties.title.toString(), desc.getTerm());

        // module
        Vertex vM = processModule(g, desc.getModuleId());
        vD.addEdge(Relationship.hasModule.toString(), vM);

        // case significance
        Vertex vCs = processCaseSinificance(g, desc.getCaseSignificanceId());
        vD.addEdge(Relationship.hasCaseSignificance.toString(), vCs);

        // type
        Vertex vT = processType(g, desc.getTypeId());
        vD.addEdge(Relationship.hasType.toString(), vT);

        // owning concept
        Vertex vC = getVertex(g, desc.getConceptId());
        LOGGER.trace("Concept vertex {}", vC);

        if (vC == null) {
            LOGGER.error("Could not find concept for id {}", desc.getConceptId());
        } else {
            final Object descriptionType = descMap.get(desc.getTypeId());
            if (descriptionType == null) {
                // Abort with a diagnosable message rather than a bare NullPointerException.
                throw new IllegalStateException("No description type mapping for typeId "
                        + desc.getTypeId() + " (description " + desc.getId() + ")");
            }

            if (DescriptionType.fsn.equals(descriptionType)) {
                LOGGER.debug("Adding FSN as concept title {}", desc.getTerm());
                vC.setProperty(Properties.title.toString(), desc.getTerm());
            }

            String name = descriptionType.toString();
            LOGGER.trace("Concept vertex {}, typeId {} and resulted edge label name {}", vC, desc.getTypeId(), name);

            Edge e = vC.addEdge(name, vD);
            e.setProperty(Properties.title.toString(), desc.getTerm());
        }

        LOGGER.trace("processDescription total time {} sec ", (System.currentTimeMillis() - start)/1000);
    }

    /**
     * Stores a relationship twice: as a helper vertex linked to its module, type
     * and modifier vertices, and - when both source and destination concepts can be
     * found - as a direct edge between those concepts carrying the relationship's
     * properties. The edge label comes from the relationship type map and falls
     * back to the generic relationship label for unmapped types.
     *
     * @param rel relationship row to store
     */
    private void processRelationship(Rf2Relationship rel) {

        long start = System.currentTimeMillis();
        LOGGER.debug("Processing relationship {}", rel.getId());

        //this is special vertex only required to get to have special relationships
        Vertex vR = g.addVertexWithLabel(Types.relationship.toString());
        vR.setProperty(Properties.sctid.toString(), rel.getId());
        vR.setProperty(Properties.characteristicId.toString(), rel.getCharacteristicTypeId());

        // module
        Vertex vM = processModule(g, rel.getModuleId());
        g.addEdge(rel.getModuleId(), vR, vM, Relationship.hasModule.toString());

        // type
        Vertex vT = processType(g, rel.getTypeId());
        g.addEdge(rel.getTypeId(), vR, vT, Relationship.hasType.toString());

        // modifier
        Vertex vMo = processModifier(g, rel.getModifierId());
        g.addEdge(rel.getModifierId(), vR, vMo, Relationship.hasModifier.toString());

        // source and destination concepts
        Vertex vSource = getVertex(g, rel.getSourceId());
        Vertex vDest = getVertex(g, rel.getDestinationId());
        LOGGER.trace("Source concept {} - vertex {}", rel.getSourceId(), vSource);
        LOGGER.trace("Destination concept {} - vertex {}", rel.getDestinationId(), vDest);

        if (vSource != null && vDest != null) {

            Properties relName = characteristicsMap.get(rel.getCharacteristicTypeId());
            String nature = relName != null ? relName.toString() : null;

            Relationship relType = relTypeMap.get(rel.getTypeId());
            String type = relType != null ? relType.toString() : Relationship.generic.toString();
            LOGGER.trace("Relationship edge label {} ", type);

            Edge eR = g.addEdge(rel.getId(), vSource, vDest, type);
            eR.setProperty(Properties.sctid.toString(), rel.getId());
            eR.setProperty(Properties.effectiveTime.toString(), rel.getEffectiveTime().getMillis());
            eR.setProperty(Properties.status.toString(), rel.getActive());
            eR.setProperty(Properties.created.toString(), new DateTime().getMillis());
            eR.setProperty(Properties.createdBy.toString(), SNAPSHOT_USER);
            eR.setProperty(Properties.group.toString(), rel.getRelationshipGroup());
            if (nature != null) {
                // Unmapped characteristic types leave the property unset instead of
                // passing a null value to the edge.
                eR.setProperty(Properties.characteristic.toString(), nature);
            }

            //these are special relationship edge properties in a sense that they should represent as v --> v "relationship" if relationship is a vertex.
            eR.setProperty(Properties.typeId.toString(), rel.getTypeId());
            eR.setProperty(Properties.moduleId.toString(), rel.getModuleId());
            eR.setProperty(Properties.modifierId.toString(), rel.getModifierId());
        }

        LOGGER.trace("processRelationship total time {} sec ", (System.currentTimeMillis() - start)/1000);
    }

    /**
     * Sets how many rows are processed between commits.
     *
     * @param bufferSize the bufferSize to set; must not be zero because the commit
     *        check computes {@code rowNumber % bufferSize}
     */
    public void setBufferSize(int bufferSize) {
        this.bufferSize = bufferSize;
    }

    /**
     * Switches reload mode, in which rows whose id already resolves to a vertex
     * are skipped during loading.
     *
     * @param isReload the isReload to set
     */
    public void setReload(boolean isReload) {
        this.isReload = isReload;
    }
}