package at.ac.ait.ubicity.fileloader;

/**
    Copyright (C) 2013 AIT / Austrian Institute of Technology
    http://www.ait.ac.at

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program. If not, see http://www.gnu.org/licenses/agpl-3.0.html
*/

import at.ac.ait.ubicity.fileloader.aggregation.AggregationJob;
import at.ac.ait.ubicity.fileloader.aggregation.Aggregator;
import at.ac.ait.ubicity.fileloader.cassandra.AstyanaxInitializer;
import at.ac.ait.ubicity.fileloader.util.Delay;
import at.ac.ait.ubicity.fileloader.util.FileCache;
import at.ac.ait.ubicity.fileloader.util.FileCache.FileInformation;
import at.ac.ait.ubicity.fileloader.util.LogFileCache;
import at.ac.ait.ubicity.fileloader.util.LogFileNameFilter;
import at.ac.ait.ubicity.fileloader.util.StatsTableActualizer;

import com.lmax.disruptor.EventHandler;
import com.lmax.disruptor.RingBuffer;
import com.lmax.disruptor.dsl.Disruptor;

import com.netflix.astyanax.Keyspace;
import com.netflix.astyanax.MutationBatch;
import com.netflix.astyanax.model.ColumnFamily;

import java.io.File;
import java.io.FileNotFoundException;
import java.net.URI;
import java.util.Iterator;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;

/**
 *
 * @author Jan van Oort
 */
public final class FileLoader {

    public final static double TWO = 2.0;

    final static Logger logger = Logger.getLogger( "FileLoader" );

    static Keyspace keySpace;

    public static boolean cassandraInitialized = false;

    static boolean useCache = true;

    //default delay, in milliseconds, between checks of our invigilance directory or file for new updates
    static final long INVIGILANCE_WAITING_DELAY = 5000;

    static {
        logger.setLevel( Level.ALL );
    }
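    /*
     * A minimal sketch of programmatic use, for orientation only. The file name,
     * keyspace name and host below are hypothetical, and a reachable Cassandra
     * node is assumed; a FileInformation with line count 0 means "nothing
     * ingested yet":
     *
     *   FileInformation info = new FileInformation( new File( "crawl.log" ).toURI(),
     *                                               System.currentTimeMillis(), 0, 0 );
     *   FileLoader.load( info, "mykeyspace", "localhost", 10000 );
     */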
    /**
     *
     * @param _fileInfo A FileInformation object representing usage information on the file we are supposed to load: line count already ingested, last usage time...
     * @param _keySpace Cassandra key space into which to ingest
     * @param _host Cassandra host / server
     * @param _batchSize MutationBatch size
     * @throws Exception Shouldn't happen, although the Disruptor may throw an Exception under duress
     */
    @SuppressWarnings("unchecked")
    public final static void load( final FileInformation _fileInfo, final String _keySpace, final String _host, final int _batchSize ) throws Exception {

        if( ! cassandraInitialized ) {
            keySpace = AstyanaxInitializer.doInit( "Test Cluster", _host, _keySpace );
            cassandraInitialized = true;
        }

        LongTimeStampSorter tsSorter = new LongTimeStampSorter();
        Thread tTSSorter = new Thread( tsSorter );
        tTSSorter.setPriority( Thread.MAX_PRIORITY - 1 );
        tTSSorter.setName( "long timestamp sorter" );
        tTSSorter.start();

        //get the log id from the file's URI
        final String log_id = _fileInfo.getURI().toString();

        final MutationBatch batch = keySpace.prepareMutationBatch();
        logger.info( "got keyspace " + keySpace.getKeyspaceName() + " from Astyanax initializer" );

        final LineIterator onLines = FileUtils.lineIterator( new File( _fileInfo.getURI() ) );

        final ExecutorService exec = Executors.newFixedThreadPool( Runtime.getRuntime().availableProcessors() * 2 );

        ColumnFamily crawl_stats = null;

        AggregationJob aggregationJob = new AggregationJob( keySpace, crawl_stats );
        Thread tAggJob = new Thread( aggregationJob );
        tAggJob.setName( "Monitrix loader / aggregation job" );
        tAggJob.setPriority( Thread.MIN_PRIORITY + 1 );
        tAggJob.start();
        logger.info( "[FILELOADER] started aggregation job, ring buffer running" );

        final Disruptor< SingleLogLineAsString > disruptor = new Disruptor( SingleLogLineAsString.EVENT_FACTORY, ( int ) Math.pow( TWO, 17 ), exec );

        SingleLogLineAsStringEventHandler.batch = batch;
        SingleLogLineAsStringEventHandler.keySpace = keySpace;
        SingleLogLineAsStringEventHandler.batchSize = _batchSize;
        SingleLogLineAsStringEventHandler.LOG_ID = log_id;
        SingleLogLineAsStringEventHandler.tsSorter = tsSorter;
        SingleLogLineAsStringEventHandler.aggregationJob = aggregationJob;

        //the EventHandler contains the actual logic for ingesting
        final EventHandler< SingleLogLineAsString > handler = new SingleLogLineAsStringEventHandler();
        disruptor.handleEventsWith( handler );

        //we are almost ready to start
        final RingBuffer< SingleLogLineAsString > rb = disruptor.start();

        int _lineCount = 0;
        long _start, _lapse;
        _start = System.nanoTime();

        int _linesAlreadyProcessed = _fileInfo.getLineCount();

        //skip over the lines already processed in an earlier run
        while( _lineCount < _linesAlreadyProcessed ) {
            onLines.nextLine();
            _lineCount++;
        }

        //now get down to the work we actually must do, and fill the ring buffer
        logger.info( "begin processing of file " + _fileInfo.getURI() + " @line #" + _lineCount );
        while( onLines.hasNext() ) {
            final long _seq = rb.next();
            final SingleLogLineAsString event = rb.get( _seq );
            event.setValue( onLines.nextLine() );
            rb.publish( _seq );
            _lineCount++;
        }
        _lapse = System.nanoTime() - _start;
        logger.info( "ended processing of file " + _fileInfo.getURI() + " @line #" + _lineCount );

        //stop, waiting for the last threads still busy to finish their work
        disruptor.shutdown();

        //update the file info; this will land in the cache
        _fileInfo.setLineCount( _lineCount );
        _fileInfo.setLastAccess( System.currentTimeMillis() );
        //note: _usageCount + 1, not the previous _usageCount++, which stored the old value
        _fileInfo.setUsageCount( _fileInfo.getUsageCount() + 1 );

        //make sure we release resources
        onLines.close();

        logger.info( "handled " + ( _lineCount - _linesAlreadyProcessed ) + " log lines in " + _lapse + " nanoseconds" );

        //now go to the aggregation step
        SortedSet< Long > timeStamps = new TreeSet<>( tsSorter.timeStamps );

        long _minTs = timeStamps.first();
        long _maxTs = timeStamps.last();
        logger.info( "**** min TimeStamp = " + _minTs );
        logger.info( "**** max TimeStamp = " + _maxTs );

        StatsTableActualizer.update( _fileInfo.getURI().toString(), _minTs, _maxTs, _lineCount );

//        AggregationJob aggJob = new AggregationJob( keySpace, _host, _batchSize );
//        Thread tAgg = new Thread( aggJob );
//        tAgg.setName( "aggregation job" );
//        tAgg.setPriority( Thread.MAX_PRIORITY - 1 );
//        tAgg.start();
    }
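    /*
     * For reference: the claim / write / publish idiom used in load() above, in
     * isolation. This is only a sketch built from the Disruptor calls already
     * present in this class; wrapping publish() in a finally block, so that a
     * claimed slot is never left unpublished, is the usual hardening of the pattern:
     *
     *   final long seq = rb.next();              // claim the next slot
     *   try {
     *       rb.get( seq ).setValue( line );      // overwrite the pre-allocated event
     *   }
     *   finally {
     *       rb.publish( seq );                   // hand the slot to the EventHandler
     *   }
     */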
    /**
     *
     * @param _uri the URI we must "patrol"
     * @param keySpace the Cassandra keyspace to use
     * @param host the Cassandra host / node
     * @param batchSize MutationBatch size for ingests
     * @param millisToWait the number of milliseconds we are supposed to wait before visiting the URI again
     * @throws FileNotFoundException if there is a problem with the given URI
     * @throws Exception if actually loading ( ingesting ) from some file under the URI leads to a problem
     */
    public final static void invigilate( URI _uri, String keySpace, String host, int batchSize, long millisToWait ) throws FileNotFoundException, Exception {
        logger.info( "[FILELOADER] invigilating URI: " + _uri );
        if( _uri.getScheme().equals( "file" ) ) {
            //we don't know yet if the URI is a directory or a file
            File _startingPoint = new File( _uri );
            File[] _files = getLogFilesFor( _startingPoint );
            FileCache cache = useCache ? LogFileCache.get().loadCache() : null;
            for( File file : _files ) {
                logger.info( "[FILELOADER] found file under " + _uri.toString() + " : " + file.getName() );
                doLoad( file, cache, keySpace, host, batchSize );
            }
            return;
        }
        logger.info( "[FILELOADER] URI " + _uri.toString() + " is not something FileLoader can currently handle" );
    }

    /**
     * Perform a load, and either write to cache or not, according to settings.
     *
     * @param _f the file we must ingest
     * @param _cache the cache we are to use for keeping file usage information up to date
     * @param _keySpace Cassandra key space into which to ingest
     * @param _host Cassandra host / server
     * @param _batchSize MutationBatch size
     * @throws Exception if actual loading of the file causes a problem
     */
    private final static void doLoad( File _f, FileCache _cache, String _keySpace, String _host, int _batchSize ) throws Exception {
        if( _cache != null ) {
            FileInformation _fileInfo = _cache.getFileInformationFor( _f.toURI() );
            if( _fileInfo == null ) {
                _fileInfo = new FileInformation( _f.toURI(), System.currentTimeMillis(), 1, 0 );
                _cache.updateCacheFor( _f.toURI(), _fileInfo );
            }
            logger.info( "[FILELOADER] " + _fileInfo.toString() );
            load( _fileInfo, _keySpace, _host, _batchSize );
            _cache.saveCache();
        }
        else {
            load( new FileInformation( _f.toURI(), System.currentTimeMillis(), 0, 0 ), _keySpace, _host, _batchSize );
        }
    }

    /**
     *
     * @param _file a log file, or a directory presumed to contain log files
     * @return the log files present under the argument, if it is a directory, or otherwise the argument itself ( if it is a log file )
     * @throws FileNotFoundException if the argument is neither a directory nor a log file
     */
    public static File[] getLogFilesFor( File _file ) throws FileNotFoundException {
        File[] _returned = { _file };
        if( _file.isDirectory() ) return _file.listFiles( new LogFileNameFilter() );
        else if( _file.getName().endsWith( LogFileNameFilter.FILE_NAME_SUFFIX ) ) return _returned;
        throw new FileNotFoundException( "[AGGREGATOR] no log file(s) at " + _file.getName() );
    }
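    /*
     * Expected behaviour of getLogFilesFor(), with hypothetical paths; the exact
     * suffix is whatever LogFileNameFilter.FILE_NAME_SUFFIX declares, presumably ".log":
     *
     *   getLogFilesFor( new File( "/data/bl" ) );            // all matching log files in the directory
     *   getLogFilesFor( new File( "/data/bl/crawl.log" ) );  // the file itself, as a one-element array
     *   getLogFilesFor( new File( "/data/bl/notes.txt" ) );  // throws FileNotFoundException
     */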
    /**
     *
     * This method is here for demo purposes only. It is not part of the required functionality of this class.
     *
     * @param args arg #0 = file or directory, arg #1 = keyspace, arg #2 = server host name, arg #3 = batch size, arg #4 = number of time units to wait, arg #5 = time unit ( minute, second, hour, ... ).
     * ( For now, we tacitly assume we are on the default Cassandra port 9160. Clustering is not yet supported. )
     */
    public final static void main( String[] args ) throws Exception {

        if( args.length != 6 ) {
            usage();
            System.exit( 1 );
        }

        try {
            final File _f = new File( args[ 0 ] );
            URI uri = _f.toURI();
            String keySpaceName = args[ 1 ];
            final String host = args[ 2 ];
            final int batchSize = Integer.parseInt( args[ 3 ] );
            final int timeUnitCount = Integer.parseInt( args[ 4 ] );
            Delay timeUnit = timeUnitsFromCmdLine( args[ 5 ].toUpperCase() );
            //fall back to seconds if the unit given on the command line is unknown
            if( timeUnit == null ) timeUnit = Delay.SECOND;
            long millisToWait = timeUnitCount * timeUnit.getMilliSeconds();
            useCache = true;
            while( true ) {
                try {
                    invigilate( uri, keySpaceName, host, batchSize, millisToWait );
                    Thread.sleep( millisToWait );
                }
                catch( InterruptedException | Error any ) {
                    //clear the interrupt flag and keep invigilating
                    Thread.interrupted();
                }
            }
        }
        catch( Exception e ) {
            logger.log( Level.SEVERE, e.toString() );
        }
    }

    /**
     * Helper method for converting command line, human-readable invigilance delays
     *
     * @param _arg a time unit readable to a human ( minute, second, hour... )
     * @return a Delay known to the system ( Minute, Hour, ... ), or null if the argument matches none
     */
    private static Delay timeUnitsFromCmdLine( String _arg ) {
        Iterator< Delay > onKnownDelayOptions = Delay.knownOptions.iterator();
        while( onKnownDelayOptions.hasNext() ) {
            Delay _d = onKnownDelayOptions.next();
            if( _d.name().equals( _arg ) ) {
                return _d;
            }
        }
        return null;
    }

    private static void usage() {
        System.out.println( "usage: FileLoader <file or directory> <keyspace> <server> <batch_size> <delay count> <delay unit ( second, minute, hour, ... )>" );
        System.out.println( "example: FileLoader /data/bl/ mykeyspace localhost 10000 10 minutes" );
    }
}
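/*
 * Example command line, with a hypothetical directory and keyspace, assuming a
 * local Cassandra node on the default port 9160 and assuming "MINUTES" is one of
 * the unit names the Delay enum knows about:
 *
 *   java at.ac.ait.ubicity.fileloader.FileLoader /data/bl/ mykeyspace localhost 10000 10 minutes
 *
 * FileLoader then re-visits /data/bl/ every 10 minutes and, via the file cache,
 * ingests only the lines appended to each log file since the previous visit.
 */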