package at.ac.ait.ubicity.fileloader;

/**
    Copyright (C) 2013 AIT / Austrian Institute of Technology
    http://www.ait.ac.at

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program. If not, see http://www.gnu.org/licenses/agpl-3.0.html
*/

import at.ac.ait.ubicity.fileloader.aggregation.AggregationJob;
import at.ac.ait.ubicity.fileloader.aggregation.Aggregator;
import at.ac.ait.ubicity.fileloader.cassandra.AstyanaxInitializer;
import at.ac.ait.ubicity.fileloader.util.Delay;
import at.ac.ait.ubicity.fileloader.util.FileCache;
import at.ac.ait.ubicity.fileloader.util.FileCache.FileInformation;
import at.ac.ait.ubicity.fileloader.util.LogFileCache;
import at.ac.ait.ubicity.fileloader.util.LogFileNameFilter;
import at.ac.ait.ubicity.fileloader.util.StatsTableActualizer;

import com.lmax.disruptor.EventHandler;
import com.lmax.disruptor.RingBuffer;
import com.lmax.disruptor.dsl.Disruptor;

import com.netflix.astyanax.Keyspace;
import com.netflix.astyanax.MutationBatch;
import com.netflix.astyanax.model.ColumnFamily;

import java.io.File;
import java.io.FileNotFoundException;
import java.net.URI;
import java.util.Iterator;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;

/**
 *
 * @author Jan van Oort
 */
public final class FileLoader {

    public final static double TWO = 2.0;

    final static Logger logger = Logger.getLogger( "FileLoader" );

    static Keyspace keySpace;

    public static boolean cassandraInitialized = false;

    static boolean useCache = true;

    //default delay, in milliseconds, between checks of our invigilance directory or file for new updates
    static final long INVIGILANCE_WAITING_DELAY = 5000;

    static {
        logger.setLevel( Level.ALL );
    }
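    /*
     * A minimal sketch of programmatic use, for orientation only. The file name,
     * keyspace name and host below are hypothetical, and a reachable Cassandra
     * node is assumed; a FileInformation with line count 0 means "nothing
     * ingested yet":
     *
     *   FileInformation info = new FileInformation( new File( "crawl.log" ).toURI(),
     *                                               System.currentTimeMillis(), 0, 0 );
     *   FileLoader.load( info, "mykeyspace", "localhost", 10000 );
     */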
    /**
     *
     * @param _fileInfo A FileInformation object representing usage information on the file we are supposed to load: line count already ingested, last usage time...
     * @param _keySpace Cassandra key space into which to ingest
     * @param _host Cassandra host / server
     * @param _batchSize MutationBatch size
     * @throws Exception Shouldn't happen, although the Disruptor may throw an Exception under duress
     */
    @SuppressWarnings("unchecked")
    public final static void load( final FileInformation _fileInfo, final String _keySpace, final String _host, final int _batchSize ) throws Exception {

        if( ! cassandraInitialized ) {
            keySpace = AstyanaxInitializer.doInit( "Test Cluster", _host, _keySpace );
            cassandraInitialized = true;
        }

        LongTimeStampSorter tsSorter = new LongTimeStampSorter();
        Thread tTSSorter = new Thread( tsSorter );
        tTSSorter.setPriority( Thread.MAX_PRIORITY - 1 );
        tTSSorter.setName( "long timestamp sorter" );
        tTSSorter.start();

        //get the log id from the file's URI
        final String log_id = _fileInfo.getURI().toString();

        final MutationBatch batch = keySpace.prepareMutationBatch();
        logger.info( "got keyspace " + keySpace.getKeyspaceName() + " from Astyanax initializer" );

        final LineIterator onLines = FileUtils.lineIterator( new File( _fileInfo.getURI() ) );

        final ExecutorService exec = Executors.newFixedThreadPool( Runtime.getRuntime().availableProcessors() * 2 );

        ColumnFamily crawl_stats = null;

        AggregationJob aggregationJob = new AggregationJob( keySpace, crawl_stats );
        Thread tAggJob = new Thread( aggregationJob );
        tAggJob.setName( "Monitrix loader / aggregation job" );
        tAggJob.setPriority( Thread.MIN_PRIORITY + 1 );
        tAggJob.start();
        logger.info( "[FILELOADER] started aggregation job, ring buffer running" );

        final Disruptor< SingleLogLineAsString > disruptor = new Disruptor( SingleLogLineAsString.EVENT_FACTORY, ( int ) Math.pow( TWO, 17 ), exec );

        SingleLogLineAsStringEventHandler.batch = batch;
        SingleLogLineAsStringEventHandler.keySpace = keySpace;
        SingleLogLineAsStringEventHandler.batchSize = _batchSize;
        SingleLogLineAsStringEventHandler.LOG_ID = log_id;
        SingleLogLineAsStringEventHandler.tsSorter = tsSorter;
        SingleLogLineAsStringEventHandler.aggregationJob = aggregationJob;

        //the EventHandler contains the actual logic for ingesting
        final EventHandler< SingleLogLineAsString > handler = new SingleLogLineAsStringEventHandler();
        disruptor.handleEventsWith( handler );

        //we are almost ready to start
        final RingBuffer< SingleLogLineAsString > rb = disruptor.start();

        int _lineCount = 0;
        long _start, _lapse;
        _start = System.nanoTime();

        int _linesAlreadyProcessed = _fileInfo.getLineCount();

        //skip over the lines already processed in an earlier run
        while( _lineCount < _linesAlreadyProcessed ) {
            onLines.nextLine();
            _lineCount++;
        }

        //now get down to the work we actually must do, and fill the ring buffer
        logger.info( "begin processing of file " + _fileInfo.getURI() + " @line #" + _lineCount );
        while( onLines.hasNext() ) {
            final long _seq = rb.next();
            final SingleLogLineAsString event = rb.get( _seq );
            event.setValue( onLines.nextLine() );
            rb.publish( _seq );
            _lineCount++;
        }
        _lapse = System.nanoTime() - _start;
        logger.info( "ended processing of file " + _fileInfo.getURI() + " @line #" + _lineCount );

        //stop, waiting for the last threads still busy to finish their work
        disruptor.shutdown();

        //update the file info; this will land in the cache
        _fileInfo.setLineCount( _lineCount );
        _fileInfo.setLastAccess( System.currentTimeMillis() );
        //note: _usageCount + 1, not the previous _usageCount++, which stored the old value
        _fileInfo.setUsageCount( _fileInfo.getUsageCount() + 1 );

        //make sure we release resources
        onLines.close();

        logger.info( "handled " + ( _lineCount - _linesAlreadyProcessed ) + " log lines in " + _lapse + " nanoseconds" );

        //now go to the aggregation step
        SortedSet< Long > timeStamps = new TreeSet<>( tsSorter.timeStamps );

        long _minTs = timeStamps.first();
        long _maxTs = timeStamps.last();
        logger.info( "**** min TimeStamp = " + _minTs );
        logger.info( "**** max TimeStamp = " + _maxTs );

        StatsTableActualizer.update( _fileInfo.getURI().toString(), _minTs, _maxTs, _lineCount );

//        AggregationJob aggJob = new AggregationJob( keySpace, _host, _batchSize );
//        Thread tAgg = new Thread( aggJob );
//        tAgg.setName( "aggregation job" );
//        tAgg.setPriority( Thread.MAX_PRIORITY - 1 );
//        tAgg.start();
    }
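    /*
     * For reference: the claim / write / publish idiom used in load() above, in
     * isolation. This is only a sketch built from the Disruptor calls already
     * present in this class; wrapping publish() in a finally block, so that a
     * claimed slot is never left unpublished, is the usual hardening of the pattern:
     *
     *   final long seq = rb.next();              // claim the next slot
     *   try {
     *       rb.get( seq ).setValue( line );      // overwrite the pre-allocated event
     *   }
     *   finally {
     *       rb.publish( seq );                   // hand the slot to the EventHandler
     *   }
     */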
    /**
     *
     * @param _uri the URI we must "patrol"
     * @param keySpace the Cassandra keyspace to use
     * @param host the Cassandra host / node
     * @param batchSize MutationBatch size for ingests
     * @param millisToWait the number of milliseconds we are supposed to wait before visiting the URI again
     * @throws FileNotFoundException if there is a problem with the given URI
     * @throws Exception if actually loading ( ingesting ) from some file under the URI leads to a problem
     */
    public final static void invigilate( URI _uri, String keySpace, String host, int batchSize, long millisToWait ) throws FileNotFoundException, Exception {
        logger.info( "[FILELOADER] invigilating URI: " + _uri );
        if( _uri.getScheme().equals( "file" ) ) {
            //we don't know yet if the URI is a directory or a file
            File _startingPoint = new File( _uri );
            File[] _files = getLogFilesFor( _startingPoint );
            FileCache cache = useCache ? LogFileCache.get().loadCache() : null;
            for( File file : _files ) {
                logger.info( "[FILELOADER] found file under " + _uri.toString() + " : " + file.getName() );
                doLoad( file, cache, keySpace, host, batchSize );
            }
            return;
        }
        logger.info( "[FILELOADER] URI " + _uri.toString() + " is not something FileLoader can currently handle" );
    }

    /**
     * Perform a load, and either write to cache or not, according to settings.
     *
     * @param _f the file we must ingest
     * @param _cache the cache we are to use for keeping file usage information up to date
     * @param _keySpace Cassandra key space into which to ingest
     * @param _host Cassandra host / server
     * @param _batchSize MutationBatch size
     * @throws Exception if actual loading of the file causes a problem
     */
    private final static void doLoad( File _f, FileCache _cache, String _keySpace, String _host, int _batchSize ) throws Exception {
        if( _cache != null ) {
            FileInformation _fileInfo = _cache.getFileInformationFor( _f.toURI() );
            if( _fileInfo == null ) {
                _fileInfo = new FileInformation( _f.toURI(), System.currentTimeMillis(), 1, 0 );
                _cache.updateCacheFor( _f.toURI(), _fileInfo );
            }
            logger.info( "[FILELOADER] " + _fileInfo.toString() );
            load( _fileInfo, _keySpace, _host, _batchSize );
            _cache.saveCache();
        }
        else {
            load( new FileInformation( _f.toURI(), System.currentTimeMillis(), 0, 0 ), _keySpace, _host, _batchSize );
        }
    }

    /**
     *
     * @param _file a log file, or a directory presumed to contain log files
     * @return the log files present under the argument, if it is a directory, or otherwise the argument itself ( if it is a log file )
     * @throws FileNotFoundException if the argument is neither a directory nor a log file
     */
    public static File[] getLogFilesFor( File _file ) throws FileNotFoundException {
        File[] _returned = { _file };
        if( _file.isDirectory() ) return _file.listFiles( new LogFileNameFilter() );
        else if( _file.getName().endsWith( LogFileNameFilter.FILE_NAME_SUFFIX ) ) return _returned;
        throw new FileNotFoundException( "[AGGREGATOR] no log file(s) at " + _file.getName() );
    }
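    /*
     * Expected behaviour of getLogFilesFor(), with hypothetical paths; the exact
     * suffix is whatever LogFileNameFilter.FILE_NAME_SUFFIX declares, presumably ".log":
     *
     *   getLogFilesFor( new File( "/data/bl" ) );            // all matching log files in the directory
     *   getLogFilesFor( new File( "/data/bl/crawl.log" ) );  // the file itself, as a one-element array
     *   getLogFilesFor( new File( "/data/bl/notes.txt" ) );  // throws FileNotFoundException
     */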
    /**
     *
     * This method is here for demo purposes only. It is not part of the required functionality of this class.
     *
     * @param args arg #0 = file or directory, arg #1 = keyspace, arg #2 = server host name, arg #3 = batch size, arg #4 = number of time units to wait, arg #5 = time unit ( minute, second, hour, ... ).
     * ( For now, we tacitly assume we are on the default Cassandra port 9160. Clustering is not yet supported. )
     */
    public final static void main( String[] args ) throws Exception {

        if( args.length != 6 ) {
            usage();
            System.exit( 1 );
        }

        try {
            final File _f = new File( args[ 0 ] );
            URI uri = _f.toURI();
            String keySpaceName = args[ 1 ];
            final String host = args[ 2 ];
            final int batchSize = Integer.parseInt( args[ 3 ] );
            final int timeUnitCount = Integer.parseInt( args[ 4 ] );
            Delay timeUnit = timeUnitsFromCmdLine( args[ 5 ].toUpperCase() );
            //fall back to seconds if the unit given on the command line is unknown
            if( timeUnit == null ) timeUnit = Delay.SECOND;
            long millisToWait = timeUnitCount * timeUnit.getMilliSeconds();
            useCache = true;
            while( true ) {
                try {
                    invigilate( uri, keySpaceName, host, batchSize, millisToWait );
                    Thread.sleep( millisToWait );
                }
                catch( InterruptedException | Error any ) {
                    //clear the interrupt flag and keep invigilating
                    Thread.interrupted();
                }
            }
        }
        catch( Exception e ) {
            logger.log( Level.SEVERE, e.toString() );
        }
    }

    /**
     * Helper method for converting command line, human-readable invigilance delays
     *
     * @param _arg a time unit readable to a human ( minute, second, hour... )
     * @return a Delay known to the system ( Minute, Hour, ... ), or null if the argument matches none
     */
    private static Delay timeUnitsFromCmdLine( String _arg ) {
        Iterator< Delay > onKnownDelayOptions = Delay.knownOptions.iterator();
        while( onKnownDelayOptions.hasNext() ) {
            Delay _d = onKnownDelayOptions.next();
            if( _d.name().equals( _arg ) ) {
                return _d;
            }
        }
        return null;
    }

    private static void usage() {
        System.out.println( "usage: FileLoader <file or directory> <keyspace> <server> <batch_size> <delay count> <delay unit ( second, minute, hour, ... )>" );
        System.out.println( "example: FileLoader /data/bl/ mykeyspace localhost 10000 10 minutes" );
    }
}
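/*
 * Example command line, with a hypothetical directory and keyspace, assuming a
 * local Cassandra node on the default port 9160 and assuming "MINUTES" is one of
 * the unit names the Delay enum knows about:
 *
 *   java at.ac.ait.ubicity.fileloader.FileLoader /data/bl/ mykeyspace localhost 10000 10 minutes
 *
 * FileLoader then re-visits /data/bl/ every 10 minutes and, via the file cache,
 * ingests only the lines appended to each log file since the previous visit.
 */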