/**
 * Licensed to Cloudera, Inc. under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Cloudera, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.flume.handlers.hive;

import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.cli.CliDriver;
import org.apache.hadoop.hive.service.HiveClient;
import org.apache.hadoop.hive.service.HiveServerException;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.log4j.Logger;
import org.apache.thrift.TException;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.apache.thrift.protocol.TProtocol;
import org.apache.thrift.transport.TSocket;
import org.apache.thrift.transport.TTransport;
import org.apache.thrift.transport.TTransportException;

import com.cloudera.flume.conf.FlumeConfiguration;
import com.google.common.io.CharStreams;

/**
 * Persists Hive load queries and ElasticSearch request bodies that could not
 * be executed immediately as "marker" files on HDFS, and replays pending
 * markers on request: Hive markers are re-run through the Hive Thrift client,
 * and ElasticSearch markers are re-posted over HTTP. Markers that replay
 * successfully are deleted. Also provides helpers for merging collected files
 * and for checking whether a partition path exists.
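 *
 * A minimal usage sketch (illustrative only; the table name, folders, query
 * and file paths below are placeholders, not values defined by this class):
 *
 * <pre>{@code
 * MarkerStore store = new MarkerStore("my_table", null, false);
 * // record a load so it can be replayed later if the immediate attempt fails
 * store.writeHiveMarker(
 *     "LOAD DATA INPATH '/flume/out/part-0000' INTO TABLE my_table",
 *     "/flume/out/part-0000",
 *     "/user/flume/hivemarkers",
 *     "/user/flume/hivemarkers/" + System.currentTimeMillis() + ".marker");
 * }</pre>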
 *
 * TODO (jon) refactor this to be sane. Not happening now.
 */
public class MarkerStore {

  private static String hiveHost;
  private static int hivePort;
  private static TTransport transport;
  private static TProtocol protocol;
  HiveClient client;
  FlumeConfiguration conf;
  Path dstPath;
  String hiveTableName, hiveMarkerFolder, elasticsearchMarkerFolder, elasticsearchUrl;
  boolean runMarkerQueries;

  final static Logger LOG = Logger.getLogger(MarkerStore.class.getName());

  public MarkerStore(String hiveTableName, String elasticsearchUrl, boolean runMarkerQueries) {
    this.conf = FlumeConfiguration.get();
    this.runMarkerQueries = runMarkerQueries;
    hiveHost = conf.getHiveHost();
    hivePort = conf.getHivePort();
    transport = new TSocket(hiveHost, hivePort);
    protocol = new TBinaryProtocol(transport);
    client = new HiveClient(protocol);

    if (StringUtils.isNotEmpty(elasticsearchUrl)) {
      this.elasticsearchUrl = elasticsearchUrl;
      this.elasticsearchMarkerFolder = conf.getElasticSearchMarkerFolder();
      if (runMarkerQueries) {
        LOG.info("running pending ElasticSearch marker queries");
        runElasticSearchMarkerQueries();
      }
    }

    this.hiveTableName = hiveTableName;
    hiveMarkerFolder = conf.getHiveDefaultMarkerFolder();

    try {
      if (!transport.isOpen()) {
        LOG.error("hive transport is closed, re-opening");
        transport = new TSocket(hiveHost, hivePort);
        protocol = new TBinaryProtocol(transport);
        client = new HiveClient(protocol);
        transport.open();
        if (runMarkerQueries) {
          LOG.info("running pending Hive marker queries");
          runHiveMarkerQueries();
        }
      }
    } catch (TTransportException e) {
      LOG.error("error opening transport layer to hive: " + e.getMessage());
    }
  }

  /**
   * Replays every ElasticSearch marker file found in the marker folder: reads
   * the stored request body, deletes the marker, and re-posts the body to the
   * configured ElasticSearch URL.
   */
  private boolean runElasticSearchMarkerQueries() {
    boolean success = true;
    FileSystem hdfs;
    FSDataInputStream in;
    dstPath = new Path(elasticsearchMarkerFolder);
    LOG.info("DSTPATH: " + dstPath);
    try {
      hdfs = dstPath.getFileSystem(conf);
      if (hdfs.exists(dstPath)) {
        FileStatus[] fileListing = hdfs.listStatus(dstPath);
        for (FileStatus fs : fileListing) {
          if (!fs.isDir()) {
            LOG.info("File marker path: " + fs.getPath());
            in = hdfs.open(fs.getPath());
            byte[] fileData = new byte[(int) fs.getLen()];
            in.readFully(fileData);
            in.close();
            LOG.info("cleaning marker file @: " + fs.getPath().toString());
            cleanMarkerFile(fs.getPath().toString());
            sendESQuery(elasticsearchUrl, new String(fileData));
          }
        }
      }
    } catch (Exception e) {
      LOG.error("error replaying ElasticSearch marker files: " + e.getMessage(), e);
      success = false;
    }
    return success;
  }

  // ElasticSearch data model: an index is roughly a distributed database, and
  // a type is roughly a table within that database.
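  // Illustrative sketch only: this class does not define the payload format,
  // but if elasticsearchUrl points at ElasticSearch's bulk endpoint, the body
  // handed to sendESQuery() would be newline-delimited action/document pairs
  // along these lines (the index, type, and field names here are hypothetical):
  //   {"index":{"_index":"flume-logs","_type":"event"}}
  //   {"timestamp":"2012-01-01T00:00:00Z","body":"sample log line"}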
  /**
   * POSTs the given request body to ElasticSearch. On failure the body is
   * written to the ElasticSearch marker folder so it can be replayed later.
   */
  public boolean sendESQuery(String elasticSearchUrl, String sb) {
    boolean success = true;
    LOG.info("sending batched string entities");
    LOG.info("elasticSearchUrl: " + elasticSearchUrl);
    try {
      HttpClient httpClient = new DefaultHttpClient();
      HttpPost httpPost = new HttpPost(elasticSearchUrl);
      StringEntity se = new StringEntity(sb);
      httpPost.setEntity(se);
      HttpResponse hr = httpClient.execute(httpPost);
      LOG.info("HTTP Response: " + hr.getStatusLine());
      LOG.info("closing httpConnection");
      httpClient.getConnectionManager().shutdown();
      LOG.info("request body: " + CharStreams.toString(new InputStreamReader(se.getContent())));
    } catch (IOException e) {
      LOG.error("error posting to ElasticSearch: " + e.getMessage(), e);
      success = false;
    } finally {
      if (!success) {
        LOG.info("ESQuery wasn't successful, writing to marker folder");
        writeElasticSearchToMarkerFolder(new StringBuilder(sb));
      }
    }
    if (success) {
      LOG.info("ESQuery was successful");
    }
    return success;
  }

  /**
   * Writes a failed ElasticSearch request body to a timestamped marker file in
   * the default ElasticSearch marker folder.
   */
  private boolean writeElasticSearchToMarkerFolder(StringBuilder httpQuery) {
    FileSystem hdfs;
    try {
      String markerFolder = conf.getElasticSearchDefaultMarkerFolder();
      dstPath = new Path(markerFolder);
      hdfs = dstPath.getFileSystem(conf);
      if (!hdfs.exists(dstPath)) {
        hdfs.mkdirs(dstPath);
      }
      dstPath = new Path(markerFolder + "/es-" + System.currentTimeMillis() + ".marker");
      LOG.info("creating file at: " + dstPath.toString());
      FSDataOutputStream writerMarker = hdfs.create(dstPath);
      writerMarker.writeBytes(httpQuery + "\n");
      writerMarker.close();
      dstPath = null;
      writerMarker = null;
    } catch (IOException e) {
      LOG.error("error writing ElasticSearch marker file: " + e.getMessage(), e);
      return false;
    }
    return true;
  }
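  // Hive marker files (written by writeHiveMarker) contain a single
  // tab-separated line: "<HDFS data file path>\t<HQL query>". For example,
  // with a hypothetical path and query:
  //   /flume/out/part-0000<TAB>LOAD DATA INPATH '/flume/out/part-0000' INTO TABLE my_table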
  /**
   * Replays every Hive marker file found in the marker folder. A marker whose
   * data file still exists has its query re-run; markers pointing at missing
   * files, and markers that replay successfully, are deleted.
   */
  private boolean runHiveMarkerQueries() {
    boolean queryStatus = true;
    FileSystem hdfs;
    FSDataInputStream in;
    dstPath = new Path(hiveMarkerFolder);
    LOG.info("DSTPATH: " + dstPath);
    try {
      hdfs = dstPath.getFileSystem(conf);
      if (hdfs.exists(dstPath)) {
        FileStatus[] fileListing = hdfs.listStatus(dstPath);
        for (FileStatus fs : fileListing) {
          if (!fs.isDir()) {
            LOG.info("File marker path: " + fs.getPath());
            in = hdfs.open(fs.getPath());
            byte[] fileData = new byte[(int) fs.getLen()];
            in.readFully(fileData);
            String[] splitTab = new String(fileData).split("\t");
            if (splitTab.length == 2) {
              dstPath = new Path(splitTab[0]);
              FileSystem hiveFile = dstPath.getFileSystem(conf);
              if (hiveFile.exists(dstPath)) {
                LOG.info("marker file data: " + splitTab[1]);
                if (runHiveQuery(splitTab[1])) {
                  LOG.info("marker query is successful");
                  in.close();
                  cleanMarkerFile(fs.getPath().toString());
                } else {
                  LOG.info("error running marker query, marker not deleted");
                  queryStatus = false;
                }
              } else {
                LOG.info("marker points to an invalid hive file location, deleting the marker");
                in.close();
                cleanMarkerFile(fs.getPath().toString());
              }
            }
            // in.close();
          }
        }
      }
      hdfs.close();
    } catch (IOException e) {
      LOG.error("error running runHiveMarkerQueries: " + e.getMessage());
    }
    return queryStatus;
  }

  /** Deletes a marker file. Returns true if the file was deleted. */
  public boolean cleanMarkerFile(String hiveMarkerPath) {
    LOG.debug("cleaning up hiveMarker: " + hiveMarkerPath);
    FileSystem localHdfs;
    Path deletePath = new Path(hiveMarkerPath);
    try {
      localHdfs = deletePath.getFileSystem(conf);
      if (localHdfs.delete(deletePath, false)) {
        LOG.debug("hiveMarker deleted successfully: " + hiveMarkerPath);
        return true;
      } else {
        LOG.error("error deleting hive marker: " + hiveMarkerPath);
      }
    } catch (IOException e) {
      LOG.error("error deleting hiveMarker: " + e.getMessage());
    }
    return false;
  }

  /** Executes an HQL statement through the Hive Thrift client. */
  public boolean runHiveQuery(String query) {
    // CliDriver clidriver = new CliDriver();
    // LOG.error("Query: " + query);
    // int cliStatus = clidriver.processLine(query);
    // LOG.error("cliStatus: " + cliStatus);
    try {
      if (!transport.isOpen()) {
        LOG.error("hive transport is closed, re-opening");
        transport = new TSocket(hiveHost, hivePort);
        protocol = new TBinaryProtocol(transport);
        client = new HiveClient(protocol);
        transport.open();
      }
      client.execute(query);
      transport.close();
      return true;
    } catch (TTransportException e) {
      LOG.error("error setting up transport with hive: " + e.getMessage(), e);
    } catch (HiveServerException e) {
      LOG.error("HiveServerException: " + e.getMessage());
    } catch (TException e) {
      LOG.error("TException: " + e.getMessage(), e);
    }
    return false;
  }

  /**
   * Records a pending Hive load as a marker file containing the data file path
   * and the HQL query, separated by a tab.
   */
  public boolean writeHiveMarker(String hqlQuery, String filePath,
      String hiveMarkerFolder, String hiveMarkerPath) {
    LOG.debug("writing to hiveMarker: " + hiveMarkerFolder);
    FileSystem hdfs;
    dstPath = new Path(hiveMarkerFolder);
    try {
      hdfs = dstPath.getFileSystem(conf);
      if (!hdfs.exists(dstPath)) {
        hdfs.mkdirs(dstPath);
      }
      dstPath = new Path(hiveMarkerPath);
      FSDataOutputStream writer = hdfs.create(dstPath);
      writer.writeBytes(filePath + "\t" + hqlQuery + "\n");
      writer.close();
      dstPath = null;
      writer = null;
    } catch (IOException e) {
      LOG.error("error writing hive marker: " + e.getMessage(), e);
      return false;
    }
    /*
     * dstPath = new Path(hiveMarkerPath);
     * hdfs = dstPath.getFileSystem(conf);
     * writer = hdfs.create(dstPath);
     * writer.writeUTF(hqlQuery);
     * writer.close();
     * writer = null;
     */
    return true;
  }

  /**
   * Concatenates the given seed file and every file in the folder into a
   * single output file, then deletes the merged source files.
   */
  public boolean mergeFiles(String folder, Path file, String hiveOutputLocation) {
    FileSystem hdfs;
    FSDataInputStream in;
    FSDataOutputStream out;
    List<Path> fileCollection = new ArrayList<Path>();
    dstPath = new Path(folder);
    LOG.info("mergeFiles DSTPATH: " + dstPath);
    try {
      hdfs = dstPath.getFileSystem(conf);
      if (hdfs.exists(dstPath)) {
        FileStatus[] fileListing = hdfs.listStatus(dstPath);
        LOG.info("creating file @: " + hiveOutputLocation);
        out = hdfs.create(new Path(hiveOutputLocation));
        // copy the seed file first, then append every file in the folder
        in = hdfs.open(file);
        byte[] fileData = new byte[(int) hdfs.getFileStatus(file).getLen()];
        in.readFully(fileData);
        out.write(fileData);
        for (FileStatus fs : fileListing) {
          if (!fs.isDir()) {
            LOG.info("mergeFiles file marker path: " + fs.getPath());
            fileCollection.add(fs.getPath());
            in = hdfs.open(fs.getPath());
            fileData = new byte[(int) fs.getLen()];
            in.readFully(fileData);
            out.write(fileData);
          }
        }
        out.close();
      }
      hdfs.close();
      LOG.info("written file: " + hiveOutputLocation);
      // purge: delete all source files except the merged output
      hdfs = dstPath.getFileSystem(conf);
      for (Path p : fileCollection) {
        if (hdfs.delete(p, false)) {
          LOG.info("successfully deleted: " + p);
        } else {
          LOG.error("error deleting file: " + p);
        }
      }
    } catch (IOException e) {
      LOG.error("error running mergeFiles: " + e.getMessage());
      return false;
    }
    LOG.info("mergeFiles done merging files");
    return true;
  }

  /** Returns true if the given path exists on the filesystem. */
  public boolean checkIfPartitionExists(String filePath) {
    dstPath = new Path(filePath);
    FileSystem hdfs;
    try {
      hdfs = dstPath.getFileSystem(conf);
      return hdfs.exists(dstPath);
    } catch (IOException e) {
      LOG.error("error checking partition path: " + e.getMessage(), e);
    }
    return false;
  }
}