/* * Seldon -- open source prediction engine * ======================================= * Copyright 2011-2015 Seldon Technologies Ltd and Rummble Ltd (http://www.seldon.io/) * ********************************************************************************************** * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ********************************************************************************************** */ package io.seldon.spark.actions; import java.text.ParseException; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.PriorityQueue; import java.util.Queue; import org.apache.commons.lang3.builder.ReflectionToStringBuilder; import org.apache.commons.lang3.builder.ToStringStyle; import org.apache.hadoop.conf.Configuration; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.broadcast.Broadcast; import org.apache.spark.storage.StorageLevel; import scala.Tuple2; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; public class GroupActionsJob { public static class ClientDetail { public final String client; public final long itemCount; public ClientDetail(String client, long itemCount) { this.client = client; this.itemCount = itemCount; } } public static class CmdLineArgs { @Parameter(names = "--input-path-pattern", required = true) private String input_path_pattern; @Parameter(names = "--input-date-string", required = true) private String input_date_string; @Parameter(names = "--output-path-dir", required = true) private String output_path_dir; @Parameter(names = "--debug-use-local-master") private Boolean debug_use_local_master = false; @Parameter(names = "--aws-access-key-id", required = false) private String aws_access_key_id; @Parameter(names = "--aws-secret-access-key", required = false) private String aws_secret_access_key; @Parameter(names = "--gzip-output", required = false) private boolean gzip_output = false; @Parameter(names = "--single-client", required = false) private String single_client; @Override public String toString() { return ReflectionToStringBuilder.toString(this, ToStringStyle.SHORT_PREFIX_STYLE); } } public static void main(String[] args) { CmdLineArgs cmdLineArgs = new CmdLineArgs(); new JCommander(cmdLineArgs, args); run(cmdLineArgs); } public static void run(CmdLineArgs cmdLineArgs) { long unixDays = 0; try { unixDays = JobUtils.dateToUnixDays(cmdLineArgs.input_date_string); } catch (ParseException e) { unixDays = 0; } System.out.println(String.format("--- started GroupActionsJob date[%s] unixDays[%s] ---", cmdLineArgs.input_date_string, unixDays)); System.out.println("Env: " + System.getenv()); System.out.println("Properties: " + System.getProperties()); SparkConf sparkConf = new SparkConf().setAppName("GroupActionsJob"); if (cmdLineArgs.debug_use_local_master) { System.out.println("Using 'local' master"); sparkConf.setMaster("local"); } Tuple2<String, String>[] sparkConfPairs = sparkConf.getAll(); System.out.println("--- sparkConf ---"); for (int i = 0; i < sparkConfPairs.length; i++) { Tuple2<String, String> kvPair = sparkConfPairs[i]; System.out.println(String.format("%s:%s", kvPair._1, kvPair._2)); } System.out.println("-----------------"); JavaSparkContext jsc = new JavaSparkContext(sparkConf); { // setup aws access Configuration hadoopConf = jsc.hadoopConfiguration(); hadoopConf.set("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem"); if (cmdLineArgs.aws_access_key_id != null && !"".equals(cmdLineArgs.aws_access_key_id)) { hadoopConf.set("fs.s3n.awsAccessKeyId", cmdLineArgs.aws_access_key_id); hadoopConf.set("fs.s3n.awsSecretAccessKey", cmdLineArgs.aws_secret_access_key); } } // String output_path_dir = "./out/" + input_date_string + "-" + UUID.randomUUID(); JavaRDD<String> dataSet = jsc.textFile(JobUtils.getSourceDirFromDate(cmdLineArgs.input_path_pattern, cmdLineArgs.input_date_string)).repartition(4); final ObjectMapper objectMapper = new ObjectMapper(); objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); final String single_client = cmdLineArgs.single_client; if (single_client != null) { Function<String, Boolean> clientFilter = new Function<String, Boolean>() { @Override public Boolean call(String t) throws Exception { ActionData actionData = JobUtils.getActionDataFromActionLogLine(objectMapper, t); return ((actionData.client != null) && (actionData.client.equals(single_client))); } }; dataSet = dataSet.filter(clientFilter); } JavaPairRDD<String, ActionData> pairs = dataSet.mapToPair(new PairFunction<String, String, ActionData>() { @Override public Tuple2<String, ActionData> call(String t) throws Exception { ActionData actionData = JobUtils.getActionDataFromActionLogLine(objectMapper, t); // String key = (actionData.userid == 0) ? "__no_userid__" : actionData.client; String key = actionData.client; return new Tuple2<String, ActionData>(key, actionData); } }).persist(StorageLevel.MEMORY_AND_DISK()); List<String> clientList = pairs.keys().distinct().collect(); Queue<ClientDetail> clientDetailQueue = new PriorityQueue<ClientDetail>(30, new Comparator<ClientDetail>() { @Override public int compare(ClientDetail o1, ClientDetail o2) { if (o1.itemCount > o2.itemCount) { return -1; } else if (o1.itemCount < o2.itemCount) { return 1; } return 0; } }); Queue<ClientDetail> clientDetailZeroQueue = new PriorityQueue<ClientDetail>(30, new Comparator<ClientDetail>() { @Override public int compare(ClientDetail o1, ClientDetail o2) { if (o1.itemCount > o2.itemCount) { return -1; } else if (o1.itemCount < o2.itemCount) { return 1; } return 0; } }); System.out.println("Client list "+clientList.toString()); for (String client : clientList) { if (client != null) { System.out.println("looking at client "+client); final String currentClient = client; JavaPairRDD<String, ActionData> filtered_by_client = pairs.filter(new Function<Tuple2<String, ActionData>, Boolean>() { @Override public Boolean call(Tuple2<String, ActionData> v1) throws Exception { if (currentClient.equalsIgnoreCase(v1._1)) { return Boolean.TRUE; } else { return Boolean.FALSE; } } }); JavaPairRDD<String, ActionData> nonZeroUserIds = filtered_by_client.filter(new Function<Tuple2<String, ActionData>, Boolean>() { @Override public Boolean call(Tuple2<String, ActionData> v1) throws Exception { if (v1._2.userid == 0) { return Boolean.FALSE; } else { return Boolean.TRUE; } } }); JavaPairRDD<String, Integer> userIdLookupRDD = nonZeroUserIds.mapToPair(new PairFunction<Tuple2<String, ActionData>, String, Integer>() { @Override public Tuple2<String, Integer> call(Tuple2<String, ActionData> t) throws Exception { String key = currentClient + "_" + t._2.client_userid; return new Tuple2<String, Integer>(key, t._2.userid); } }); Map<String, Integer> userIdLookupMap = userIdLookupRDD.collectAsMap(); Map<String, Integer> userIdLookupMap_wrapped = new HashMap<String, Integer>(userIdLookupMap); final Broadcast<Map<String, Integer>> broadcastVar = jsc.broadcast(userIdLookupMap_wrapped); JavaRDD<String> json_only_with_zeros = filtered_by_client.map(new Function<Tuple2<String, ActionData>, String>() { @Override public String call(Tuple2<String, ActionData> v1) throws Exception { Map<String, Integer> m = broadcastVar.getValue(); ActionData actionData = v1._2; if (actionData.userid == 0) { String key = currentClient + "_" + actionData.client_userid; if (m.containsKey(key)) { actionData.userid = m.get(key); } else { return ""; } } String json = JobUtils.getJsonFromActionData(actionData); return json; } }); JavaRDD<String> json_only = json_only_with_zeros.filter(new Function<String, Boolean>() { @Override public Boolean call(String v1) throws Exception { return (v1.length() == 0) ? Boolean.FALSE : Boolean.TRUE; } }); String outputPath = getOutputPath(cmdLineArgs.output_path_dir, unixDays, client); if (cmdLineArgs.gzip_output) { json_only.saveAsTextFile(outputPath, org.apache.hadoop.io.compress.GzipCodec.class); } else { json_only.saveAsTextFile(outputPath); } long json_only_count = json_only.count(); clientDetailZeroQueue.add(new ClientDetail(currentClient, json_only_with_zeros.count() - json_only_count)); clientDetailQueue.add(new ClientDetail(currentClient, json_only_count)); } else System.out.println("Found null client!"); } System.out.println("- Client Action (Zero Userid) Count -"); while (clientDetailZeroQueue.size() != 0) { GroupActionsJob.ClientDetail clientDetail = clientDetailZeroQueue.remove(); System.out.println(String.format("%s: %d", clientDetail.client, clientDetail.itemCount)); } System.out.println("- Client Action Count -"); while (clientDetailQueue.size() != 0) { GroupActionsJob.ClientDetail clientDetail = clientDetailQueue.remove(); System.out.println(String.format("%s: %d", clientDetail.client, clientDetail.itemCount)); } jsc.stop(); System.out.println(String.format("--- finished GroupActionsJob date[%s] unixDays[%s] ---", cmdLineArgs.input_date_string, unixDays)); } public static String getOutputPath(String output_path_dir, long unixDays, String client) { return output_path_dir + "/" + client + "/actions/" + unixDays; } }