/**
 * Licensed to Cloudera, Inc. under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Cloudera, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.flume.handlers.hive;

import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.cli.CliDriver;
import org.apache.hadoop.hive.service.HiveClient;
import org.apache.hadoop.hive.service.HiveServerException;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.log4j.Logger;
import org.apache.thrift.TException;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.apache.thrift.protocol.TProtocol;
import org.apache.thrift.transport.TSocket;
import org.apache.thrift.transport.TTransport;
import org.apache.thrift.transport.TTransportException;

import com.cloudera.flume.conf.FlumeConfiguration;
import com.google.common.io.CharStreams;

/**
 * Persists Hive load queries and ElasticSearch request bodies that could not
 * be executed immediately as "marker" files on HDFS, and replays pending
 * markers on request: Hive markers are re-run through the Hive Thrift client,
 * and ElasticSearch markers are re-posted over HTTP. Markers that replay
 * successfully are deleted. Also provides helpers for merging collected files
 * and for checking whether a partition path exists.
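 *
 * A minimal usage sketch (illustrative only; the table name, folders, query
 * and file paths below are placeholders, not values defined by this class):
 *
 * <pre>{@code
 * MarkerStore store = new MarkerStore("my_table", null, false);
 * // record a load so it can be replayed later if the immediate attempt fails
 * store.writeHiveMarker(
 *     "LOAD DATA INPATH '/flume/out/part-0000' INTO TABLE my_table",
 *     "/flume/out/part-0000",
 *     "/user/flume/hivemarkers",
 *     "/user/flume/hivemarkers/" + System.currentTimeMillis() + ".marker");
 * }</pre>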
 *
 * TODO (jon) refactor this to be sane. Not happening now.
 */
public class MarkerStore {

  private static String hiveHost;
  private static int hivePort;
  private static TTransport transport;
  private static TProtocol protocol;
  HiveClient client;
  FlumeConfiguration conf;
  Path dstPath;
  String hiveTableName, hiveMarkerFolder, elasticsearchMarkerFolder, elasticsearchUrl;
  boolean runMarkerQueries;

  final static Logger LOG = Logger.getLogger(MarkerStore.class.getName());

  public MarkerStore(String hiveTableName, String elasticsearchUrl, boolean runMarkerQueries) {
    this.conf = FlumeConfiguration.get();
    this.runMarkerQueries = runMarkerQueries;
    hiveHost = conf.getHiveHost();
    hivePort = conf.getHivePort();
    transport = new TSocket(hiveHost, hivePort);
    protocol = new TBinaryProtocol(transport);
    client = new HiveClient(protocol);

    if (StringUtils.isNotEmpty(elasticsearchUrl)) {
      this.elasticsearchUrl = elasticsearchUrl;
      this.elasticsearchMarkerFolder = conf.getElasticSearchMarkerFolder();
      if (runMarkerQueries) {
        LOG.info("running pending ElasticSearch marker queries");
        runElasticSearchMarkerQueries();
      }
    }

    this.hiveTableName = hiveTableName;
    hiveMarkerFolder = conf.getHiveDefaultMarkerFolder();

    try {
      if (!transport.isOpen()) {
        LOG.error("hive transport is closed, re-opening");
        transport = new TSocket(hiveHost, hivePort);
        protocol = new TBinaryProtocol(transport);
        client = new HiveClient(protocol);
        transport.open();
        if (runMarkerQueries) {
          LOG.info("running pending Hive marker queries");
          runHiveMarkerQueries();
        }
      }
    } catch (TTransportException e) {
      LOG.error("error opening transport layer to hive: " + e.getMessage());
    }
  }

  /**
   * Replays every ElasticSearch marker file found in the marker folder: reads
   * the stored request body, deletes the marker, and re-posts the body to the
   * configured ElasticSearch URL.
   */
  private boolean runElasticSearchMarkerQueries() {
    boolean success = true;
    FileSystem hdfs;
    FSDataInputStream in;
    dstPath = new Path(elasticsearchMarkerFolder);
    LOG.info("DSTPATH: " + dstPath);
    try {
      hdfs = dstPath.getFileSystem(conf);
      if (hdfs.exists(dstPath)) {
        FileStatus[] fileListing = hdfs.listStatus(dstPath);
        for (FileStatus fs : fileListing) {
          if (!fs.isDir()) {
            LOG.info("File marker path: " + fs.getPath());
            in = hdfs.open(fs.getPath());
            byte[] fileData = new byte[(int) fs.getLen()];
            in.readFully(fileData);
            in.close();
            LOG.info("cleaning marker file @: " + fs.getPath().toString());
            cleanMarkerFile(fs.getPath().toString());
            sendESQuery(elasticsearchUrl, new String(fileData));
          }
        }
      }
    } catch (Exception e) {
      LOG.error("error replaying ElasticSearch marker files: " + e.getMessage(), e);
      success = false;
    }
    return success;
  }

  // ElasticSearch data model: an index is roughly a distributed database, and
  // a type is roughly a table within that database.
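  // Illustrative sketch only: this class does not define the payload format,
  // but if elasticsearchUrl points at ElasticSearch's bulk endpoint, the body
  // handed to sendESQuery() would be newline-delimited action/document pairs
  // along these lines (the index, type, and field names here are hypothetical):
  //   {"index":{"_index":"flume-logs","_type":"event"}}
  //   {"timestamp":"2012-01-01T00:00:00Z","body":"sample log line"}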
  /**
   * POSTs the given request body to ElasticSearch. On failure the body is
   * written to the ElasticSearch marker folder so it can be replayed later.
   */
  public boolean sendESQuery(String elasticSearchUrl, String sb) {
    boolean success = true;
    LOG.info("sending batched string entities");
    LOG.info("elasticSearchUrl: " + elasticSearchUrl);
    try {
      HttpClient httpClient = new DefaultHttpClient();
      HttpPost httpPost = new HttpPost(elasticSearchUrl);
      StringEntity se = new StringEntity(sb);
      httpPost.setEntity(se);
      HttpResponse hr = httpClient.execute(httpPost);
      LOG.info("HTTP Response: " + hr.getStatusLine());
      LOG.info("closing httpConnection");
      httpClient.getConnectionManager().shutdown();
      LOG.info("request body: " + CharStreams.toString(new InputStreamReader(se.getContent())));
    } catch (IOException e) {
      LOG.error("error posting to ElasticSearch: " + e.getMessage(), e);
      success = false;
    } finally {
      if (!success) {
        LOG.info("ESQuery wasn't successful, writing to marker folder");
        writeElasticSearchToMarkerFolder(new StringBuilder(sb));
      }
    }
    if (success) {
      LOG.info("ESQuery was successful");
    }
    return success;
  }

  /**
   * Writes a failed ElasticSearch request body to a timestamped marker file in
   * the default ElasticSearch marker folder.
   */
  private boolean writeElasticSearchToMarkerFolder(StringBuilder httpQuery) {
    FileSystem hdfs;
    try {
      String markerFolder = conf.getElasticSearchDefaultMarkerFolder();
      dstPath = new Path(markerFolder);
      hdfs = dstPath.getFileSystem(conf);
      if (!hdfs.exists(dstPath)) {
        hdfs.mkdirs(dstPath);
      }
      dstPath = new Path(markerFolder + "/es-" + System.currentTimeMillis() + ".marker");
      LOG.info("creating file at: " + dstPath.toString());
      FSDataOutputStream writerMarker = hdfs.create(dstPath);
      writerMarker.writeBytes(httpQuery + "\n");
      writerMarker.close();
      dstPath = null;
      writerMarker = null;
    } catch (IOException e) {
      LOG.error("error writing ElasticSearch marker file: " + e.getMessage(), e);
      return false;
    }
    return true;
  }
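  // Hive marker files (written by writeHiveMarker) contain a single
  // tab-separated line: "<HDFS data file path>\t<HQL query>". For example,
  // with a hypothetical path and query:
  //   /flume/out/part-0000<TAB>LOAD DATA INPATH '/flume/out/part-0000' INTO TABLE my_table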
  /**
   * Replays every Hive marker file found in the marker folder. A marker whose
   * data file still exists has its query re-run; markers pointing at missing
   * files, and markers that replay successfully, are deleted.
   */
  private boolean runHiveMarkerQueries() {
    boolean queryStatus = true;
    FileSystem hdfs;
    FSDataInputStream in;
    dstPath = new Path(hiveMarkerFolder);
    LOG.info("DSTPATH: " + dstPath);
    try {
      hdfs = dstPath.getFileSystem(conf);
      if (hdfs.exists(dstPath)) {
        FileStatus[] fileListing = hdfs.listStatus(dstPath);
        for (FileStatus fs : fileListing) {
          if (!fs.isDir()) {
            LOG.info("File marker path: " + fs.getPath());
            in = hdfs.open(fs.getPath());
            byte[] fileData = new byte[(int) fs.getLen()];
            in.readFully(fileData);
            String[] splitTab = new String(fileData).split("\t");
            if (splitTab.length == 2) {
              dstPath = new Path(splitTab[0]);
              FileSystem hiveFile = dstPath.getFileSystem(conf);
              if (hiveFile.exists(dstPath)) {
                LOG.info("marker file data: " + splitTab[1]);
                if (runHiveQuery(splitTab[1])) {
                  LOG.info("marker query is successful");
                  in.close();
                  cleanMarkerFile(fs.getPath().toString());
                } else {
                  LOG.info("error running marker query, marker not deleted");
                  queryStatus = false;
                }
              } else {
                LOG.info("marker points to an invalid hive file location, deleting the marker");
                in.close();
                cleanMarkerFile(fs.getPath().toString());
              }
            }
            // in.close();
          }
        }
      }
      hdfs.close();
    } catch (IOException e) {
      LOG.error("error running runHiveMarkerQueries: " + e.getMessage());
    }
    return queryStatus;
  }

  /** Deletes a marker file. Returns true if the file was deleted. */
  public boolean cleanMarkerFile(String hiveMarkerPath) {
    LOG.debug("cleaning up hiveMarker: " + hiveMarkerPath);
    FileSystem localHdfs;
    Path deletePath = new Path(hiveMarkerPath);
    try {
      localHdfs = deletePath.getFileSystem(conf);
      if (localHdfs.delete(deletePath, false)) {
        LOG.debug("hiveMarker deleted successfully: " + hiveMarkerPath);
        return true;
      } else {
        LOG.error("error deleting hive marker: " + hiveMarkerPath);
      }
    } catch (IOException e) {
      LOG.error("error deleting hiveMarker: " + e.getMessage());
    }
    return false;
  }

  /** Executes an HQL statement through the Hive Thrift client. */
  public boolean runHiveQuery(String query) {
    // CliDriver clidriver = new CliDriver();
    // LOG.error("Query: " + query);
    // int cliStatus = clidriver.processLine(query);
    // LOG.error("cliStatus: " + cliStatus);
    try {
      if (!transport.isOpen()) {
        LOG.error("hive transport is closed, re-opening");
        transport = new TSocket(hiveHost, hivePort);
        protocol = new TBinaryProtocol(transport);
        client = new HiveClient(protocol);
        transport.open();
      }
      client.execute(query);
      transport.close();
      return true;
    } catch (TTransportException e) {
      LOG.error("error setting up transport with hive: " + e.getMessage(), e);
    } catch (HiveServerException e) {
      LOG.error("HiveServerException: " + e.getMessage());
    } catch (TException e) {
      LOG.error("TException: " + e.getMessage(), e);
    }
    return false;
  }

  /**
   * Records a pending Hive load as a marker file containing the data file path
   * and the HQL query, separated by a tab.
   */
  public boolean writeHiveMarker(String hqlQuery, String filePath,
      String hiveMarkerFolder, String hiveMarkerPath) {
    LOG.debug("writing to hiveMarker: " + hiveMarkerFolder);
    FileSystem hdfs;
    dstPath = new Path(hiveMarkerFolder);
    try {
      hdfs = dstPath.getFileSystem(conf);
      if (!hdfs.exists(dstPath)) {
        hdfs.mkdirs(dstPath);
      }
      dstPath = new Path(hiveMarkerPath);
      FSDataOutputStream writer = hdfs.create(dstPath);
      writer.writeBytes(filePath + "\t" + hqlQuery + "\n");
      writer.close();
      dstPath = null;
      writer = null;
    } catch (IOException e) {
      LOG.error("error writing hive marker: " + e.getMessage(), e);
      return false;
    }
    /*
     * dstPath = new Path(hiveMarkerPath);
     * hdfs = dstPath.getFileSystem(conf);
     * writer = hdfs.create(dstPath);
     * writer.writeUTF(hqlQuery);
     * writer.close();
     * writer = null;
     */
    return true;
  }

  /**
   * Concatenates the given seed file and every file in the folder into a
   * single output file, then deletes the merged source files.
   */
  public boolean mergeFiles(String folder, Path file, String hiveOutputLocation) {
    FileSystem hdfs;
    FSDataInputStream in;
    FSDataOutputStream out;
    List<Path> fileCollection = new ArrayList<Path>();
    dstPath = new Path(folder);
    LOG.info("mergeFiles DSTPATH: " + dstPath);
    try {
      hdfs = dstPath.getFileSystem(conf);
      if (hdfs.exists(dstPath)) {
        FileStatus[] fileListing = hdfs.listStatus(dstPath);
        LOG.info("creating file @: " + hiveOutputLocation);
        out = hdfs.create(new Path(hiveOutputLocation));
        // copy the seed file first, then append every file in the folder
        in = hdfs.open(file);
        byte[] fileData = new byte[(int) hdfs.getFileStatus(file).getLen()];
        in.readFully(fileData);
        out.write(fileData);
        for (FileStatus fs : fileListing) {
          if (!fs.isDir()) {
            LOG.info("mergeFiles file marker path: " + fs.getPath());
            fileCollection.add(fs.getPath());
            in = hdfs.open(fs.getPath());
            fileData = new byte[(int) fs.getLen()];
            in.readFully(fileData);
            out.write(fileData);
          }
        }
        out.close();
      }
      hdfs.close();
      LOG.info("written file: " + hiveOutputLocation);
      // purge: delete all source files except the merged output
      hdfs = dstPath.getFileSystem(conf);
      for (Path p : fileCollection) {
        if (hdfs.delete(p, false)) {
          LOG.info("successfully deleted: " + p);
        } else {
          LOG.error("error deleting file: " + p);
        }
      }
    } catch (IOException e) {
      LOG.error("error running mergeFiles: " + e.getMessage());
      return false;
    }
    LOG.info("mergeFiles done merging files");
    return true;
  }

  /** Returns true if the given path exists on the filesystem. */
  public boolean checkIfPartitionExists(String filePath) {
    dstPath = new Path(filePath);
    FileSystem hdfs;
    try {
      hdfs = dstPath.getFileSystem(conf);
      return hdfs.exists(dstPath);
    } catch (IOException e) {
      LOG.error("error checking partition path: " + e.getMessage(), e);
    }
    return false;
  }
}