package com.thinkbiganalytics.nifi.v2.elasticsearch;

/*-
 * #%L
 * thinkbig-nifi-elasticsearch-processors
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.reflect.TypeToken;
import com.thinkbiganalytics.nifi.processor.AbstractNiFiProcessor;

import org.apache.commons.io.IOUtils;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.io.InputStreamCallback;
import org.apache.nifi.processor.io.OutputStreamCallback;
import org.apache.nifi.processor.util.StandardValidators;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONObject;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * This processor aggregates JSON metadata about a Hive table so that a table and its columns are in one JSON document.
 */
@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
@Tags({"hive", "metadata", "thinkbig", "elasticsearch"})
@CapabilityDescription("Aggregate JSON across multiple documents into one document representing a Hive table (V2)")
public class MergeHiveTableMetadata extends AbstractNiFiProcessor {
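    /*
     * Expected input, illustrative only (the field names below are assumptions; the
     * actual names are supplied through the processor properties): a JSON array with
     * one object per table column, the table-level fields repeated on every object.
     *
     *   [{"databaseName":"web","databaseOwner":"etl","tableCreateTime":"1487789000",
     *     "tableName":"clicks","tableType":"MANAGED_TABLE","columnName":"id","columnType":"bigint"},
     *    {"databaseName":"web","databaseOwner":"etl","tableCreateTime":"1487789000",
     *     "tableName":"clicks","tableType":"MANAGED_TABLE","columnName":"ts","columnType":"string"}]
     */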
    /**
     * Success relationship for when JSON objects are successfully merged
     */
    public static final Relationship REL_SUCCESS = new Relationship.Builder()
        .name("success")
        .description("JSON objects that are successfully merged are transferred to this relationship")
        .build();

    /**
     * Failure relationship for when the merge of the JSON metadata does not succeed
     */
    public static final Relationship REL_FAILURE = new Relationship.Builder()
        .name("failure")
        .description("JSON objects that are unsuccessfully merged are transferred to this relationship")
        .build();

    /**
     * A property for the name of the Hive database field
     */
    public static final PropertyDescriptor DATABASE_NAME = new PropertyDescriptor.Builder()
        .name("Database Name Field")
        .description("The name of the Hive database field")
        .required(true)
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .expressionLanguageSupported(true)
        .build();

    /**
     * A property for the database owner field name
     */
    public static final PropertyDescriptor DATABASE_OWNER = new PropertyDescriptor.Builder()
        .name("Database Owner Field")
        .description("The name of the database owner field")
        .required(true)
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .expressionLanguageSupported(true)
        .build();

    /**
     * A property for the table create time field
     */
    public static final PropertyDescriptor TABLE_CREATE_TIME = new PropertyDescriptor.Builder()
        .name("Table Create Time Field")
        .description("Field representing the table create time")
        .required(true)
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .expressionLanguageSupported(true)
        .build();

    /**
     * A property for the table name field
     */
    public static final PropertyDescriptor TABLE_NAME = new PropertyDescriptor.Builder()
        .name("Table Name Field")
        .description("Field holding the table name")
        .required(true)
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .expressionLanguageSupported(true)
        .build();

    /**
     * A property for the table type field
     */
    public static final PropertyDescriptor TABLE_TYPE = new PropertyDescriptor.Builder()
        .name("Table Type Field")
        .description("Field representing what type of Hive table it is")
        .required(true)
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .expressionLanguageSupported(true)
        .build();

    /**
     * A property for the column name field
     */
    public static final PropertyDescriptor COLUMN_NAME = new PropertyDescriptor.Builder()
        .name("Column Name Field")
        .description("Field representing the column name")
        .required(true)
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .expressionLanguageSupported(true)
        .build();

    /**
     * A property for the column type field
     */
    public static final PropertyDescriptor COLUMN_TYPE = new PropertyDescriptor.Builder()
        .name("Column Type Field")
        .description("Field representing what the column type is")
        .required(true)
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .expressionLanguageSupported(true)
        .build();

    private final Set<Relationship> relationships;
    private final List<PropertyDescriptor> propDescriptors;

    /**
     * Default constructor; builds the relationship and property collections.
     */
    public MergeHiveTableMetadata() {
        final Set<Relationship> r = new HashSet<>();
        r.add(REL_SUCCESS);
        r.add(REL_FAILURE);
        relationships = Collections.unmodifiableSet(r);

        final List<PropertyDescriptor> pds = new ArrayList<>();
        pds.add(DATABASE_NAME);
        pds.add(DATABASE_OWNER);
        pds.add(TABLE_CREATE_TIME);
        pds.add(TABLE_NAME);
        pds.add(TABLE_TYPE);
        pds.add(COLUMN_NAME);
        pds.add(COLUMN_TYPE);
        propDescriptors = Collections.unmodifiableList(pds);
    }

    @Override
    public Set<Relationship> getRelationships() {
        return relationships;
    }

    @Override
    protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return propDescriptors;
    }
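    /*
     * A minimal usage sketch with NiFi's mock framework (nifi-mock). The property
     * values and the "json" payload are assumptions for illustration; they should
     * match the field names in the incoming documents:
     *
     *   TestRunner runner = TestRunners.newTestRunner(MergeHiveTableMetadata.class);
     *   runner.setProperty(MergeHiveTableMetadata.DATABASE_NAME, "databaseName");
     *   runner.setProperty(MergeHiveTableMetadata.DATABASE_OWNER, "databaseOwner");
     *   runner.setProperty(MergeHiveTableMetadata.TABLE_CREATE_TIME, "tableCreateTime");
     *   runner.setProperty(MergeHiveTableMetadata.TABLE_NAME, "tableName");
     *   runner.setProperty(MergeHiveTableMetadata.TABLE_TYPE, "tableType");
     *   runner.setProperty(MergeHiveTableMetadata.COLUMN_NAME, "columnName");
     *   runner.setProperty(MergeHiveTableMetadata.COLUMN_TYPE, "columnType");
     *   runner.enqueue(json.getBytes(StandardCharsets.UTF_8)); // json: an array as in the example above
     *   runner.run();
     *   runner.assertAllFlowFilesTransferred(MergeHiveTableMetadata.REL_SUCCESS, 1);
     */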
    @Override
    public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
        final ComponentLog logger = getLog();
        FlowFile flowFile = session.get();
        if (flowFile == null) {
            return;
        }

        try {
            // Field names to read from each incoming JSON object, resolved against the flow file's attributes
            final String databaseNameField = context.getProperty(DATABASE_NAME).evaluateAttributeExpressions(flowFile).getValue();
            final String databaseOwnerField = context.getProperty(DATABASE_OWNER).evaluateAttributeExpressions(flowFile).getValue();
            final String tableCreateTimeField = context.getProperty(TABLE_CREATE_TIME).evaluateAttributeExpressions(flowFile).getValue();
            final String tableNameField = context.getProperty(TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue();
            final String tableTypeField = context.getProperty(TABLE_TYPE).evaluateAttributeExpressions(flowFile).getValue();
            final String columnNameField = context.getProperty(COLUMN_NAME).evaluateAttributeExpressions(flowFile).getValue();
            final String columnTypeField = context.getProperty(COLUMN_TYPE).evaluateAttributeExpressions(flowFile).getValue();

            // Read the entire flow file content into memory
            final StringBuilder sb = new StringBuilder();
            session.read(flowFile, new InputStreamCallback() {
                @Override
                public void process(InputStream in) throws IOException {
                    sb.append(IOUtils.toString(in, StandardCharsets.UTF_8));
                }
            });
            logger.debug("The JSON that was received is: {}", new Object[]{sb});

            flowFile = session.write(flowFile, new OutputStreamCallback() {
                @Override
                public void process(final OutputStream out) throws IOException {
                    try {
                        final JSONArray array = new JSONArray(sb.toString());
                        final Map<String, Metadata> tables = new HashMap<>();

                        // Group the per-column documents by (database, table)
                        for (int i = 0; i < array.length(); i++) {
                            final JSONObject jsonObj = array.getJSONObject(i);
                            final String databaseName = jsonObj.getString(databaseNameField);
                            final String databaseOwner = jsonObj.getString(databaseOwnerField);
                            final String tableName = jsonObj.getString(tableNameField);
                            final String tableCreateTime = jsonObj.getString(tableCreateTimeField);
                            final String tableType = jsonObj.getString(tableTypeField);
                            final String columnName = jsonObj.getString(columnNameField);
                            final String columnType = jsonObj.getString(columnTypeField);

                            // Delimit the key so that, e.g., ("ab", "c") and ("a", "bc") do not collide
                            final String key = databaseName + "." + tableName;

                            final HiveColumn column = new HiveColumn();
                            column.setColumnName(columnName);
                            column.setColumnType(columnType);

                            Metadata meta = tables.get(key);
                            if (meta == null) {
                                meta = new Metadata();
                                meta.setDatabaseName(databaseName);
                                meta.setDatabaseOwner(databaseOwner);
                                meta.setTableCreateTime(tableCreateTime);
                                meta.setTableName(tableName);
                                meta.setTableType(tableType);
                                tables.put(key, meta);
                            }
                            meta.getHiveColumns().add(column);
                        }

                        // Serialize the merged table documents back out as a JSON array
                        final List<Metadata> tablesAsList = new ArrayList<>(tables.values());
                        final Gson gson = new Gson();
                        final JsonElement element = gson.toJsonTree(tablesAsList, new TypeToken<List<Metadata>>() { }.getType());
                        final JsonArray jsonArray = element.getAsJsonArray();
                        out.write(jsonArray.toString().getBytes(StandardCharsets.UTF_8));
                    } catch (final Exception e) {
                        throw new ProcessException(e);
                    }
                }
            });

            logger.info("Completed merging Hive table metadata for {}", new Object[]{flowFile});
            session.transfer(flowFile, REL_SUCCESS);
        } catch (final Exception e) {
            logger.error("Unable to merge Hive table metadata for {}", new Object[]{flowFile}, e);
            session.transfer(flowFile, REL_FAILURE);
        }
    }
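    /*
     * The beans below serialize with Gson's default field naming, so each merged
     * table document has this shape (values illustrative, matching the input sketch
     * at the top of the class):
     *
     *   {"databaseName":"web","databaseOwner":"etl","tableCreateTime":"1487789000",
     *    "tableName":"clicks","tableType":"MANAGED_TABLE",
     *    "hiveColumns":[{"columnName":"id","columnType":"bigint"},
     *                   {"columnName":"ts","columnType":"string"}]}
     */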
    private static class Metadata {

        private String databaseName;
        private String databaseOwner;
        private String tableCreateTime;
        private String tableName;
        private String tableType;
        private List<HiveColumn> hiveColumns = new ArrayList<>();

        public List<HiveColumn> getHiveColumns() {
            return hiveColumns;
        }

        public void setHiveColumns(List<HiveColumn> hiveColumns) {
            this.hiveColumns = hiveColumns;
        }

        public String getDatabaseName() {
            return databaseName;
        }

        public void setDatabaseName(String databaseName) {
            this.databaseName = databaseName;
        }

        public String getDatabaseOwner() {
            return databaseOwner;
        }

        public void setDatabaseOwner(String databaseOwner) {
            this.databaseOwner = databaseOwner;
        }

        public String getTableCreateTime() {
            return tableCreateTime;
        }

        public void setTableCreateTime(String tableCreateTime) {
            this.tableCreateTime = tableCreateTime;
        }

        public String getTableName() {
            return tableName;
        }

        public void setTableName(String tableName) {
            this.tableName = tableName;
        }

        public String getTableType() {
            return tableType;
        }

        public void setTableType(String tableType) {
            this.tableType = tableType;
        }
    }

    private static class HiveColumn {

        private String columnName;
        private String columnType;

        public String getColumnType() {
            return columnType;
        }

        public void setColumnType(String columnType) {
            this.columnType = columnType;
        }

        public String getColumnName() {
            return columnName;
        }

        public void setColumnName(String columnName) {
            this.columnName = columnName;
        }
    }
}