package com.linkedin.databus.bootstrap.utils;
/*
 *
 * Copyright 2013 LinkedIn Corp. All rights reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 */

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.log4j.Logger;

import com.linkedin.databus.core.UnsupportedKeyException;
import com.linkedin.databus.core.util.ConfigBuilder;
import com.linkedin.databus.core.util.InvalidConfigException;
import com.linkedin.databus.core.util.RateMonitor;
import com.linkedin.databus2.core.DatabusException;
import com.linkedin.databus2.producers.EventCreationException;
import com.linkedin.databus2.producers.db.EventReaderSummary;
import com.linkedin.databus2.producers.db.OracleTriggerMonitoredSourceInfo;
import com.linkedin.databus2.producers.db.ReadEventCycleSummary;
import com.linkedin.databus2.producers.db.SourceDBEventReader;

public class BootstrapAvroFileEventReader extends DbusSeederBaseThread implements SourceDBEventReader
{
  public static final Logger LOG = Logger.getLogger(BootstrapAvroFileEventReader.class.getName());

  private static final long MILLISEC_TO_MIN = (1000 * 60);

  private StaticConfig _config;
  private BootstrapEventBuffer _bootstrapEventBuffer;
  private List<OracleTriggerMonitoredSourceInfo> _sources;
  private final Map<String, Long> _lastRows;

  public BootstrapAvroFileEventReader(StaticConfig config,
                                      List<OracleTriggerMonitoredSourceInfo> sources,
                                      Map<String, Long> lastRows,
                                      BootstrapEventBuffer bootstrapEventBuffer)
  {
    super("BootstrapAvroFileEventReader");
    _config = config;
    _sources = sources;
    _lastRows = new HashMap<String, Long>(lastRows);
    _bootstrapEventBuffer = bootstrapEventBuffer;
  }

  @Override
  public void run()
  {
    try
    {
      readEventsFromAllSources(0);
    }
    catch (Exception ex)
    {
      LOG.error("Got Error when executing readEventsFromAllSources !!", ex);
    }
    LOG.info(Thread.currentThread().getName() + " done seeding ||");
  }

  @Override
  public ReadEventCycleSummary readEventsFromAllSources(long sinceSCN)
      throws DatabusException, EventCreationException, UnsupportedKeyException
  {
    List<EventReaderSummary> summaries = new ArrayList<EventReaderSummary>();
    boolean error = false;
    long startTS = System.currentTimeMillis();
    long endScn = -1;
    long minScn = Long.MAX_VALUE;
    try
    {
      for (OracleTriggerMonitoredSourceInfo sourceInfo : _sources)
      {
        endScn = _config.getSeedWindowSCNMap().get(sourceInfo.getEventView());
        minScn = Math.min(endScn, minScn);
        LOG.info("Bootstrapping " + sourceInfo.getEventView());
        _bootstrapEventBuffer.start(endScn);

        String dir = _config.getAvroSeedInputDirMap().get(sourceInfo.getEventView());
        File d = new File(dir);
        EventReaderSummary summary = readEventsFromHadoopFiles(sourceInfo, d, endScn);
        // Script assumes seeding is done for one schema at a time
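        // Signal end-of-source so the buffer's consumer can flush this
        // source's rows before the next source in _sources is seeded.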
        _bootstrapEventBuffer.endEvents(BootstrapEventBuffer.END_OF_SOURCE, endScn, null);
        summaries.add(summary);
      }
    }
    catch (Exception ex)
    {
      error = true;
      throw new DatabusException(ex);
    }
    finally
    {
      // Notify writer that I am done
      if (error)
      {
        _bootstrapEventBuffer.endEvents(BootstrapEventBuffer.ERROR_CODE, endScn, null);
        LOG.error("Seeder stopping unexpectedly !!");
      }
      else
      {
        _bootstrapEventBuffer.endEvents(BootstrapEventBuffer.END_OF_FILE, endScn, null);
        LOG.info("Completed Seeding !!");
      }
    }

    LOG.info("Start SCN :" + minScn);
    long endTS = System.currentTimeMillis();
    ReadEventCycleSummary cycleSummary = new ReadEventCycleSummary("seeder",
                                                                   summaries,
                                                                   minScn,
                                                                   (endTS - startTS));
    return cycleSummary;
  }

  private EventReaderSummary readEventsFromHadoopFiles(OracleTriggerMonitoredSourceInfo sourceInfo,
                                                       File avroSeedDir,
                                                       Long windowSCN)
  {
    DataFileReader<GenericRecord> reader = null;

    File[] files = avroSeedDir.listFiles();
    // Arrays.asList() returns a view backed by "files", so sorting the list
    // also sorts the underlying array that the loop below iterates over.
    List<File> fileList = Arrays.asList(files);
    Collections.sort(fileList);

    long numRead = 0;
    long prevNumRead = 0;
    long numBytes = 0;
    long timestamp = System.currentTimeMillis();
    long timeStart = timestamp;
    long lastTime = timestamp;
    long commitInterval = _config.getCommitInterval();
    long totLatency = 0;
    GenericRecord record = null;

    RateMonitor seedingRate = new RateMonitor("Seeding Rate");
    seedingRate.start();
    seedingRate.suspend();

    long startRowId = _lastRows.get(sourceInfo.getEventView());
    LOG.info("Last Known Row Id is :" + startRowId);

    boolean resumeSeedingRate = true;

    for (File avroSeedFile : files)
    {
      if (!avroSeedFile.isFile())
        continue;

      LOG.info("Seeding from File : " + avroSeedFile);

      try
      {
        reader = new DataFileReader<GenericRecord>(avroSeedFile,
                                                   new GenericDatumReader<GenericRecord>());
      }
      catch (IOException e)
      {
        LOG.fatal("Failed to bootstrap from file " + avroSeedFile.getAbsolutePath(), e);
        throw new RuntimeException("Failed to bootstrap from file "
                                   + avroSeedFile.getAbsolutePath(), e);
      }

      try
      {
        boolean committed = false;
        for (GenericRecord hdfsRecord : reader)
        {
          record = hdfsRecord;
          committed = false;
          numRead++;

          // Skip rows that were already seeded in a previous run.
          if (numRead < startRowId)
            continue;

          if (resumeSeedingRate)
          {
            seedingRate.resume();
            resumeSeedingRate = false;
          }

          seedingRate.tick();

          //LOG.info("Read record :" + record);
          long start = System.nanoTime();
          long eventSize = sourceInfo.getFactory().createAndAppendEvent(windowSCN,
                                                                        timestamp,
                                                                        hdfsRecord,
                                                                        _bootstrapEventBuffer,
                                                                        false,
                                                                        null);
          numBytes += eventSize;
          long latency = System.nanoTime() - start;
          totLatency += latency;

          if (numRead % commitInterval == 0)
          {
            _bootstrapEventBuffer.endEvents(numRead, timestamp, null);
            _bootstrapEventBuffer.startEvents();

            long procTime = totLatency / 1000000000; // nanoseconds -> seconds
            long currTime = System.currentTimeMillis();
            long diff = (currTime - lastTime) / 1000;
            long timeSinceStart = (currTime - timeStart) / 1000;
            LOG.info("Processed " + commitInterval + " rows in " + diff
                     + " seconds, Avro Processing Time (seconds) so far :" + (procTime)
                     + ", Seconds elapsed since start :" + (timeSinceStart)
                     + ", Overall Row Rate:" + seedingRate.getRate()
                     + ", NumRows Fetched so far:" + numRead
                     + ". TotalEventSize :" + numBytes);
            lastTime = currTime;
            seedingRate.resume();
            committed = true;
          }
        }
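        // If the file ended between commit boundaries, flush the remaining
        // rows of this file before moving on to the next one.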
        if (!committed)
        {
          _bootstrapEventBuffer.endEvents(numRead, timestamp, null);
          _bootstrapEventBuffer.startEvents();

          long procTime = totLatency / 1000000000; // nanoseconds -> seconds
          long currTime = System.currentTimeMillis();
          long diff = (currTime - lastTime) / 1000;
          long timeSinceStart = (currTime - timeStart) / 1000;
          LOG.info("Completed Seeding from : " + avroSeedFile
                   + ", Processed " + commitInterval + " rows in " + diff
                   + " seconds, Avro Processing Time (seconds) so far :" + (procTime)
                   + ", Seconds elapsed since start :" + (timeSinceStart)
                   + ", Overall Row Rate:" + seedingRate.getRate()
                   + ", NumRows Fetched so far:" + numRead
                   + ". TotalEventSize :" + numBytes);
          lastTime = currTime;
          seedingRate.resume();
        }
      }
      catch (Exception e)
      {
        LOG.fatal("NumRead :" + numRead + ", Got exception while processing generic record :" + record, e);
        throw new RuntimeException(e);
      }

      LOG.info("Processed " + (numRead - prevNumRead) + " rows of Source: "
               + sourceInfo.getSourceName() + " from file " + avroSeedFile);
      prevNumRead = numRead;
    }

    long timeEnd = System.currentTimeMillis();
    long elapsedMin = (timeEnd - timeStart) / (MILLISEC_TO_MIN);
    LOG.info("Processed " + numRead + " rows of Source: " + sourceInfo.getSourceName()
             + " in " + elapsedMin + " minutes");

    // Guard against division by zero when no rows were read.
    long timePerEvent = (numRead == 0) ? 0 : (timeEnd - timeStart) / numRead;
    return new EventReaderSummary(sourceInfo.getSourceId(), sourceInfo.getSourceName(), -1,
                                  (int) numRead, numBytes, (timeEnd - timeStart),
                                  timePerEvent, 0, 0, 0);
  }

  @Override
  public List<OracleTriggerMonitoredSourceInfo> getSources()
  {
    return _sources;
  }

  public Map<String, String> getPKeyNameMap()
  {
    return _config.getPKeyNameMap();
  }

  public static class StaticConfig
  {
    private final Map<String, String> avroSeedInputDirMap;
    private final Map<String, Long> seedWindowSCNMap;
    private final Map<String, String> pKeyNameMap;
    private final int commitInterval;

    public StaticConfig(Map<String, String> avroSeedInputDirMap,
                        Map<String, Long> seedWindowSCNMap,
                        Map<String, String> pKeyNameMap,
                        int commitInterval)
    {
      super();
      this.avroSeedInputDirMap = avroSeedInputDirMap;
      this.seedWindowSCNMap = seedWindowSCNMap;
      this.pKeyNameMap = pKeyNameMap;
      this.commitInterval = commitInterval;
    }

    public Map<String, String> getAvroSeedInputDirMap()
    {
      return avroSeedInputDirMap;
    }

    public Map<String, Long> getSeedWindowSCNMap()
    {
      return seedWindowSCNMap;
    }

    public Map<String, String> getPKeyNameMap()
    {
      return pKeyNameMap;
    }

    public int getCommitInterval()
    {
      return commitInterval;
    }
  }

  public static class Config implements ConfigBuilder<StaticConfig>
  {
    private static final int DEFAULT_COMMIT_INTERVAL = 10000;
    private static final String DEFAULT_AVRO_SEED_INPUT_FILE = "DEFAULT_FILE_NAME";
    private static final Long DEFAULT_WINDOW_SCN = -1L;
    private static final String DEFAULT_PKEY_NAME = "key";

    private HashMap<String, String> avroSeedInputDirMap;
    private int commitInterval;
    private HashMap<String, Long> seedWindowSCNMap;
    private Map<String, String> pKeyNameMap;

    public Config()
    {
      avroSeedInputDirMap = new HashMap<String, String>();
      seedWindowSCNMap = new HashMap<String, Long>();
      pKeyNameMap = new HashMap<String, String>();
      commitInterval = DEFAULT_COMMIT_INTERVAL;
    }

    public Long getSeedWindowSCN(String sourceName)
    {
      Long scn = seedWindowSCNMap.get(sourceName);
      if (null == scn)
      {
        seedWindowSCNMap.put(sourceName, DEFAULT_WINDOW_SCN);
        return DEFAULT_WINDOW_SCN;
      }
      return scn;
    }

    public String getAvroSeedInputDir(String sourceName)
    {
      String file = avroSeedInputDirMap.get(sourceName);
      if (null == file)
      {
        avroSeedInputDirMap.put(sourceName, DEFAULT_AVRO_SEED_INPUT_FILE);
        return DEFAULT_AVRO_SEED_INPUT_FILE;
      }
      return file;
    }
    public void setSeedWindowSCN(String sourceName, Long scn)
    {
      seedWindowSCNMap.put(sourceName, scn);
    }

    public void setAvroSeedInputDir(String sourceName, String file)
    {
      avroSeedInputDirMap.put(sourceName, file);
    }

    public int getCommitInterval()
    {
      return commitInterval;
    }

    public void setCommitInterval(int commitInterval)
    {
      this.commitInterval = commitInterval;
    }

    public String getPKeyName(String srcName)
    {
      String key = pKeyNameMap.get(srcName);
      if (null == key)
      {
        pKeyNameMap.put(srcName, DEFAULT_PKEY_NAME);
        return DEFAULT_PKEY_NAME;
      }
      return key;
    }

    public void setPKeyName(String srcName, String key)
    {
      pKeyNameMap.put(srcName, key);
    }

    @Override
    public StaticConfig build() throws InvalidConfigException
    {
      LOG.info("BootstrapAvroFileEventReader starting with config :" + this.toString());

      // Every configured seed input must be a readable directory.
      for (String file : avroSeedInputDirMap.values())
      {
        File f = new File(file);
        if (!(f.isDirectory()) || (!f.canRead()))
        {
          LOG.error("Directory (" + f + ") does not exist or cannot be read !!");
          throw new InvalidConfigException("Directory (" + f + ") does not exist or cannot be read !!");
        }
      }

      return new StaticConfig(avroSeedInputDirMap, seedWindowSCNMap, pKeyNameMap, commitInterval);
    }

    @Override
    public String toString()
    {
      return "Config [avroSeedInputDirMap=" + avroSeedInputDirMap
          + ", commitInterval=" + commitInterval
          + ", seedWindowSCNMap=" + seedWindowSCNMap
          + ", pKeyNameMap=" + pKeyNameMap
          + "]";
    }
  }
}
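// Illustrative usage sketch. The source name "member2", the directory path, and
// the SCN value are hypothetical examples; constructing the
// OracleTriggerMonitoredSourceInfo list, last-row map, and BootstrapEventBuffer
// is environment-specific and omitted. Calling start() assumes
// DbusSeederBaseThread extends Thread, as the named-thread super() call above
// suggests.
//
//   BootstrapAvroFileEventReader.Config configBuilder = new BootstrapAvroFileEventReader.Config();
//   configBuilder.setAvroSeedInputDir("member2", "/data/seed/member2");  // must be a readable directory
//   configBuilder.setSeedWindowSCN("member2", 1234567L);
//   configBuilder.setCommitInterval(10000);
//   BootstrapAvroFileEventReader.StaticConfig staticConfig = configBuilder.build();
//
//   BootstrapAvroFileEventReader seeder =
//       new BootstrapAvroFileEventReader(staticConfig, sources, lastRows, eventBuffer);
//   seeder.start();  // run() seeds every source, then marks the buffer END_OF_FILE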