/*
* Seldon -- open source prediction engine
* =======================================
* Copyright 2011-2015 Seldon Technologies Ltd and Rummble Ltd (http://www.seldon.io/)
*
**********************************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************************************
*/
package io.seldon.stream.itemsim;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.Properties;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.atomic.AtomicLong;
import net.sourceforge.argparse4j.ArgumentParsers;
import net.sourceforge.argparse4j.inf.ArgumentParser;
import net.sourceforge.argparse4j.inf.ArgumentParserException;
import net.sourceforge.argparse4j.inf.Namespace;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.Serde;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.connect.json.JsonDeserializer;
import org.apache.kafka.connect.json.JsonSerializer;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.kstream.ForeachAction;
import org.apache.kafka.streams.kstream.KStream;
import org.apache.kafka.streams.kstream.KStreamBuilder;
import org.apache.kafka.streams.kstream.Predicate;
import org.apache.kafka.streams.processor.WallclockTimestampExtractor;
import org.joda.time.DateTime;
import com.fasterxml.jackson.databind.JsonNode;
public class ItemSimilarityProcessor {
final StreamingJaccardSimilarity streamJaccard;
Timer outputTimer;
long lastTime = 0;
AtomicLong outputSimilaritiesTime = new AtomicLong(0);
int windowSecs;
int windowProcessed;
String outputTopic;
int count = 0;
final String kafkaServers;
public ItemSimilarityProcessor(final Namespace ns)
{
this.windowSecs = ns.getInt("window_secs");
this.windowProcessed = ns.getInt("window_processed");
this.outputTopic = ns.getString("output_topic");
this.kafkaServers = ns.getString("kafka");
System.out.println(ns);
this.streamJaccard = new StreamingJaccardSimilarity(windowSecs, ns.getInt("hashes"), ns.getInt("min_activity"));
//createOutputSimilaritiesTimer(ns);
}
public void createOutputSimilaritiesTimer(Namespace ns)
{
int windowSecs = ns.getInt("output_poll_secs");
int timer_ms = windowSecs * 1000;
System.out.println("Scheduling at "+timer_ms);
outputTimer = new Timer(true);
outputTimer.scheduleAtFixedRate(new TimerTask() {
public void run()
{
long time = ItemSimilarityProcessor.this.outputSimilaritiesTime.get();
if (time > 0)
{
SimpleDateFormat sdf = new SimpleDateFormat("MMMM d, yyyy 'at' h:mm a");
String date = sdf.format(time*1000);
System.out.println("getting similarities at "+date);
List<JaccardSimilarity> res = streamJaccard.getSimilarity(time);
System.out.println("Results size "+res.size()+". Sending Messages...");
sendMessages(res, time);
System.out.println("Messages sent");
ItemSimilarityProcessor.this.outputSimilaritiesTime.set(0);
}
else
{
System.out.println("Timer: not outputing similarities");
}
}
}, timer_ms, timer_ms);
}
@SuppressWarnings("unchecked")
public void process(final Namespace ns) throws InterruptedException
{
Properties props = new Properties();
final String app_id = "stream-item-similarity-" +ns.getString("client");
props.put(StreamsConfig.APPLICATION_ID_CONFIG, app_id);
props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, ns.getString("kafka"));
props.put(StreamsConfig.ZOOKEEPER_CONNECT_CONFIG, ns.getString("zookeeper"));
props.put(StreamsConfig.KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
props.put(StreamsConfig.VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
props.put(StreamsConfig.TIMESTAMP_EXTRACTOR_CLASS_CONFIG, WallclockTimestampExtractor.class);
props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest");
KStreamBuilder builder = new KStreamBuilder();
JsonDeserializer jsonDeserializer = new JsonDeserializer();
final Serde<JsonNode> jsonSerde = Serdes.serdeFrom(new JsonSerializer(),jsonDeserializer);
//final Serde<String> stringSerde = Serdes.String();
final String topic = ns.getString("topic");
System.out.println("topic:"+topic);
final String parseDateMethod = ns.getString("parse_date_method");
KStream<byte[], JsonNode> source = builder.stream(Serdes.ByteArray(),jsonSerde,topic);
source.filter(new Predicate<byte[], JsonNode>() {
@Override
public boolean test(byte[] key, JsonNode value) {
//System.out.println(value);
String client = value.get("client").asText();
if (client.equals(ns.getString("client")))
return true;
else
return false;
}
})
.foreach(new ForeachAction<byte[], JsonNode>() {
@Override
public void apply(byte[] key, JsonNode value) {
Long user = value.get("userid").asLong();
Long item = value.get("itemid").asLong();
Long time;
if (parseDateMethod.equals("json-utc"))
{
//expected 2016-07-18T08:49:45Z
DateTime dtime = new DateTime(value.get("timestamp_utc").asText());
time = dtime.getMillis()/1000;
}
else if (parseDateMethod.equals("json-time"))
time = value.get("time").asLong();
else
time = System.currentTimeMillis()/1000;
//System.out.println("User:"+user+"item:"+item+"time:"+time);
ItemSimilarityProcessor.this.streamJaccard.add(item, user, time);
//debugging only
if (ItemSimilarityProcessor.this.lastTime == 0)
ItemSimilarityProcessor.this.lastTime = time;
long diff = time - ItemSimilarityProcessor.this.lastTime;
if ((windowSecs > -1 && diff >= windowSecs) || (windowProcessed > -1 && ItemSimilarityProcessor.this.count % windowProcessed == 0))
{
//ItemSimilarityProcessor.this.outputSimilaritiesTime.compareAndSet(0, time);
//ItemSimilarityProcessor.this.lastTime = time;
SimpleDateFormat sdf = new SimpleDateFormat("MMMM d, yyyy 'at' h:mm a");
String date = sdf.format(time*1000);
System.out.println("getting similarities at "+date);
List<JaccardSimilarity> res = streamJaccard.getSimilarity(time);
if (res.size() > 0)
{
System.out.println("Results size "+res.size()+" Sending messages..");
sendMessages(res, time);
System.out.println("Sent messages");
}
else
System.out.println("Results size "+res.size()+" Not sending messages");
ItemSimilarityProcessor.this.lastTime = time;
}
ItemSimilarityProcessor.this.count++;
if (ItemSimilarityProcessor.this.count % 1000 == 0)
{
System.out.println("Processed "+count+" time diff is "+diff+" window is "+windowSecs);
}
}
});
KafkaStreams streams = new KafkaStreams(builder, props);
streams.start();
}
public void sendMessages(List<JaccardSimilarity> sims,long timestamp)
{
Properties producerConfig = new Properties();
producerConfig.put("bootstrap.servers", this.kafkaServers);
producerConfig.put("key.serializer",
"org.apache.kafka.common" +
".serialization.ByteArraySerializer");
producerConfig.put("value.serializer",
"org.apache.kafka.common" +
".serialization.StringSerializer");
KafkaProducer producer = new KafkaProducer<byte[], String>(producerConfig);
StringBuffer buf = new StringBuffer();
producer.send(new ProducerRecord<byte[], String>(this.outputTopic, "START".getBytes(),"0,,,"));
for(JaccardSimilarity s : sims)
{
buf.append(timestamp).append(",").append(s.item1).append(",").append(s.item2).append(",").append(s.similarity);
producer.send(new ProducerRecord<byte[], String>(this.outputTopic, "A".getBytes(), buf.toString()));
buf.delete(0, buf.length());
}
producer.send(new ProducerRecord<byte[], String>(this.outputTopic, "END".getBytes(),"0,,,"));
}
public static void main(String[] args) throws Exception {
ArgumentParser parser = ArgumentParsers.newArgumentParser("ImpressionsToInfluxDb")
.defaultHelp(true)
.description("Read Seldon impressions and send stats to influx db");
parser.addArgument("-t", "--topic").setDefault("actions").help("Kafka topic to read from");
parser.addArgument("-c", "--client").required(true).help("Client to run item similarity");
parser.addArgument("-o", "--output-topic").required(true).help("Output topic");
parser.addArgument("-k", "--kafka").setDefault("localhost:9092").help("Kafka server and port");
parser.addArgument("-z", "--zookeeper").setDefault("localhost:2181").help("Zookeeper server and port");
parser.addArgument("-w", "--window-secs").type(Integer.class).setDefault(3600*5).help("streaming window size in secs, -1 means ignore");
parser.addArgument("-u", "--window-processed").type(Integer.class).setDefault(-1).help("streaming window size in processed count, -1 means ignore");
parser.addArgument("--output-poll-secs").type(Integer.class).setDefault(60).help("output timer polling period in secs");
parser.addArgument("--hashes").type(Integer.class).setDefault(100).help("number of hashes");
parser.addArgument("-m", "--min-activity").type(Integer.class).setDefault(200).help("min activity");
parser.addArgument("-p", "--parse-date-method").choices("json-time","json-utc","system").setDefault("json-time").help("min activity");
Namespace ns = null;
try {
ns = parser.parseArgs(args);
ItemSimilarityProcessor processor = new ItemSimilarityProcessor(ns);
processor.process(ns);
} catch (ArgumentParserException e) {
parser.handleError(e);
System.exit(1);
}
}
}