package com.alimama.quanjingmonitor.kmeans;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import com.alimama.mdrill.ui.service.AdhocOfflineService;
/**
*
*
*
*
*
http://quanjing.alimama.com:9999/downloadabtest.jsp?strmatch=rep0&uuid=46ce2a8e-1b74-4fee-bb3a-a9c111e35655
http://quanjing.alimama.com:9999/downloadabtest.jsp?strmatch=rep1&uuid=46ce2a8e-1b74-4fee-bb3a-a9c111e35655
http://adhoc.etao.com:9999/downloadoffline.jsp?uuid=46ce2a8e-1b74-4fee-bb3a-a9c111e35655
http://quanjing.alimama.com:9999/abtestInfo.jsp?uuid=46ce2a8e-1b74-4fee-bb3a-a9c111e35655
http://42.156.210.133:9999/visualization/kmeans.jsp
colls ["prov","seller_star_name"]
colls_important ["main_cat_name"]
jobname abtest
mailto yannian.mu@alibaba-inc.com
memo beiyong
number_important ["(case when (alipay_direct_num+alipay_indirect_num)=0 then 0.00001 else ((alipay_direct_amt+alipay_indirect_amt)/(alipay_direct_num+alipay_indirect_num)) end)"]
numbers ["alipay_indirect_amt","alipay_direct_num","e_gmv_indirect_amt"]
params {"project":"rpt_p4padhoc_cust","callback":"null","start":"0","rows":"20","q":"[{\"thedate\":{\"operate\":5,\"value\":[\"20140521\",\"20140520\",\"20140519\",\"20140518\",\"20140517\",\"20140516\"]}}]","dist":"null","username":"yannian.mu","fl":"prov,seller_star_name,sum(alipay_direct_amt),sum(alipay_indirect_amt),sum(alipay_indirect_num),sum(alipay_direct_num),sum(e_gmv_direct_cnt),average(e_gmv_direct_cnt),sum(e_gmv_direct_amt),sum(e_gmv_indirect_amt),sum(e_gmv_indirect_cnt)","groupby":"prov,seller_star_name","sort":"null","order":"null","leftjoin":"null","dimvalue":"省, 卖家星级, 求和(直通车直接成交金额), 求和(直通车间接成交金额), 求和(直通车间接成交笔数), 求和(直通车直接成交笔数), 求和(GMV直接成交笔数), 平均值(GMV直接成交笔数), 求和(GMV直接成交金额), 求和(GMV间接成交金额), 求和(GMV间接成交笔数)","jobparam":"过滤条件:"}
project rpt_p4padhoc_cust
q [{"thedate":{"operate":5,"value":["20140521","20140520","20140519","20140518","20140517","20140516"]}}]
rnd 0.13910470720947243
username yannian.mu
* hadoop fs -cat /group/tbdp-etao-adhoc/p4padhoc/abtest_cust/2001/cluster_abtest/*|sed 's/\x01/\x09/g'|more
hadoop jar ./kkk.jar com.alimama.quanjingmonitor.kmeans.KMeansDriver 0 /group/tbdp-etao-adhoc/p4padhoc/abtestforbp /group/tbdp-etao-adhoc/p4padhoc/abtest_out/131 3 200 0.0001 1000 2 10 0,1@2@3,4 1048576
hadoop jar ./kkk.jar com.alimama.quanjingmonitor.kmeans.KMeansDriver 0 /group/tbdp-etao-adhoc/p4padhoc/abtest_out/131/cluster_1 xxx 0 10 0.0001 1000 2 10 0,1@2@3,4 0000 |more
hadoop jar ./kkk.jar com.alimama.quanjingmonitor.kmeans.KMeansDriver 0 /group/tbdp-etao-adhoc/p4padhoc/abtestforbp /group/tbdp-etao-adhoc/p4padhoc/abtest_out/500 4 1000 0.0001 1000 2 100 0;1;2;3,4 548576
hadoop jar ./kkk.jar com.alimama.quanjingmonitor.kmeans.KMeansDriver 0 /group/tbdp-etao-adhoc/p4padhoc/adhoc_cust_seed /group/tbdp-etao-adhoc/p4padhoc/abtest_cust/001 2 999 0.0001 1500 2 100 "1;2;3,4;5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29" 548576
hadoop jar ./kkk.jar com.alimama.quanjingmonitor.kmeans.KMeansDriver 0 /group/tbdp-etao-adhoc/p4padhoc/abtest_cust/001/part-InitCenter xxx 0 10 0.0001 1000 2 10 "1;2;3,4;5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29" 0000 |more
hadoop fs -cat /group/tbdp-etao-adhoc/p4padhoc/abtest_cust/kkk1413/cluster_abtest/*|sed 's/\x01/\x09/g'
|grep rep0>abtest_1500_rep0.txt
nohup hadoop jar ./kkk_ok.jar com.alimama.quanjingmonitor.kmeans.KMeansDriver 0 /group/tbdp-etao-adhoc/p4padhoc/adhoc_cust_seed /group/tbdp-etao-adhoc/p4padhoc/abtest_cust/kkk_ok 15 999 0.0001 1500 2 100 "1,3,4;2;3,4;5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29" 548576 >kkk_ok_1500.log &
* @author yannian.mu
*
*
*
*/
/**
 * MapReduce driver for the iterative K-Means clustering used by the abtest
 * pipeline (usage examples are pasted in the file header comment).
 *
 * Pipeline stages executed by {@link #KMeans}:
 *   1. InitCenter       - MR job seeding the initial cluster centers.
 *   2. buildClustersMR  - repeats KMeansMapper/Combiner/Reducer iterations
 *                         until the centers converge or maxIterations is hit.
 *   3. makeFinalCluster - assigns each cluster a selection quota ("numselect")
 *                         proportional to its share of all points.
 *   4. clusterDataMR    - final pass labeling every input record.
 */
public class KMeansDriver extends Configured implements Tool {

    private static final Logger LOG = Logger.getLogger(KMeansDriver.class);

    /* Configuration keys shared with the mapper/combiner/reducer classes.
     * Intentionally left non-final: they are public fields other classes read. */
    public static String CLUSTER_CONVERGENCE_ABTEST_REP = "org.apache.mahout.clustering.kmeans.abtest.rep";
    public static String CLUSTER_CONVERGENCE_KEY = "org.apache.mahout.clustering.kmeans.convergence";
    public static String CLUSTER_PATH_KEY = "org.apache.mahout.clustering.kmeans.path";

    // Optional progress reporter; null when launched from the command line.
    MysqlCallbackKmeans callback = null;

    // Parses one text record into a Vector; configured via setup() in KMeans().
    private ParseVector parse = new ParseVector();

    // Number of reduce tasks for the iteration / labeling jobs (arg 8).
    private int reduce = 2;

    // Monotonically increasing stage counter reported through the callback.
    private int percentStage = 18;

    public KMeansDriver(MysqlCallbackKmeans callback) {
        this.callback = callback;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new Configuration(), new KMeansDriver(null), args);
    }

    /**
     * Tool entry point.
     * args layout: [0] unused, [1] input path, [2] output path,
     * [3] maxIterations (0 = just dump the clusters stored at the input path),
     * [4] k, [5] convergence delta, [6] number of points to select,
     * [7] abtest rep count, [8] reduce tasks, [9] column config,
     * [10] max split size (bytes).
     */
    @Override
    public int run(String[] args) throws Exception {
        LOG.info("KMeansDriver start :" + Arrays.toString(args));
        Path input = new Path(args[1]);
        Path output = new Path(args[2]);
        int maxIterations = Integer.parseInt(args[3]);
        int k = Integer.parseInt(args[4]);
        String delta = args[5];
        int count = Integer.parseInt(args[6]);
        int rep = Integer.parseInt(args[7]);
        int reduce = Integer.parseInt(args[8]);
        String config = args[9];
        String splitsize = args[10];
        this.KMeans(input, output, maxIterations, k, delta, count, rep, reduce, config, splitsize);
        return 0;
    }

    /**
     * Runs the whole clustering pipeline.  With {@code maxIterations == 0}
     * the method only prints the clusters stored at {@code input} and returns
     * (debug/inspection mode used by the command lines in the header comment).
     *
     * @throws IOException            on HDFS failures
     * @throws InterruptedException   when an MR job fails
     * @throws ClassNotFoundException from Job submission
     */
    public void KMeans(Path input, Path output, int maxIterations, int k, String delta,
            int count, int rep, int reduce, String config, String splitsize)
            throws IOException, InterruptedException, ClassNotFoundException {
        this.reduce = reduce;
        Configuration conf = this.getConf();
        conf.set("mapred.max.split.size", splitsize);
        conf.set("abtest.kmeans.config", config);
        FileSystem fs = FileSystem.get(conf);
        this.percentStage = 20;
        if (maxIterations == 0) {
            // Inspection mode: dump the clusters found at the input path.
            ArrayList<Cluster> clusters = getClusters(input, conf, fs);
            for (Cluster c : clusters) {
                System.out.println(c.asFormatString());
            }
            return;
        }
        parse.setup(conf);
        fs.mkdirs(output);
        reportProgress();
        Path clusters_random = this.InitCenter(conf, input, output, k);
        Path finalClusters = this.buildClustersMR(conf, input, clusters_random,
                output, maxIterations, delta);
        Path clustersOut = this.makeFinalCluster(conf, finalClusters, output, delta, count, rep);
        reportProgress();
        this.clusterDataMR(conf, input, clustersOut, output, delta, rep);
    }

    // Reports "Stage-N map = 100.0%, reduce = 100.0%" through the callback,
    // if one is attached (deduplicates the snippet formerly inlined 3x).
    private void reportProgress() {
        if (callback != null) {
            callback.setPercent("Stage-" + (this.percentStage++) + " map = 100.0%, reduce = 100.0%");
            callback.maybeSync();
        }
    }

    /**
     * Assigns each final cluster a selection quota proportional to its share
     * of all points, writes the annotated clusters to
     * {@code <output>/cluster_final} and returns that path.
     *
     * Clusters with fewer than 20 points get a quota of 0.  Because each
     * surviving cluster's quota is rounded up by one, the quotas can sum to
     * more than {@code count}; the surplus is trimmed one unit at a time,
     * biggest clusters first.
     */
    private Path makeFinalCluster(Configuration conf, Path finalClusters, Path output,
            String delta, int count, int rep) throws IOException {
        if (count <= 0) {
            // Previously surfaced as an ArithmeticException from total / 0.
            throw new IllegalArgumentException("count must be positive: " + count);
        }
        FileSystem fs = FileSystem.get(finalClusters.toUri(), conf);
        ArrayList<Cluster> clusters = getClusters(finalClusters, conf, fs);
        long total = 0;
        for (Cluster c : clusters) {
            total += c.getCenter().getNumPoints();
        }
        // BUGFIX: the original computed "total / count" with integer division
        // before assigning to float, truncating the rate (and yielding 0 - and
        // later an int overflow - whenever total < count).
        float rate = (float) total / count;
        HashMap<Integer, Integer> hashm = new HashMap<Integer, Integer>();
        long total2 = 0;
        for (Cluster c : clusters) {
            long points = c.getCenter().getNumPoints();
            if (points >= 20 && rate > 0) {
                int allow = (int) (points / rate) + 1;
                total2 += allow;
                hashm.put(c.getId(), allow);
            } else {
                // Tiny clusters contribute no selections.
                hashm.put(c.getId(), 0);
            }
        }
        // Biggest clusters first, so surplus trimming hits them first.
        Collections.sort(clusters, new Comparator<Cluster>() {
            @Override
            public int compare(Cluster o1, Cluster o2) {
                long t1 = o1.getCenter().getNumPoints();
                long t2 = o2.getCenter().getNumPoints();
                return t1 == t2 ? 0 : t1 < t2 ? 1 : -1;
            }
        });
        long left = total2 - count;
        while (left > 0) {
            boolean trimmed = false;
            for (Cluster c : clusters) {
                if (left <= 0) {
                    break;
                }
                int id = c.getId();
                int num = hashm.get(id);
                if (num > 0) {
                    hashm.put(id, num - 1);
                    left--;
                    trimmed = true;
                }
            }
            // BUGFIX: guard against an endless loop when every quota is
            // already 0 but a surplus remains.
            if (!trimmed) {
                break;
            }
        }
        for (Cluster c : clusters) {
            c.setNumselect(hashm.get(c.getId()));
        }
        Path clustersOut = new Path(output, "cluster_final");
        long suma = 0;  // total points (long: getNumPoints() is long, int overflowed)
        long sumb = 0;  // total selected
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf,
                clustersOut, Text.class, Cluster.class);
        try {
            for (int i = 0; i < clusters.size(); i++) {
                Cluster c = clusters.get(i);
                int id = c.getId();
                suma += c.getCenter().getNumPoints();
                sumb += c.getNumselect();
                if (i < 10) {
                    // Sample of the 10 biggest clusters, for the job log.
                    System.out.println(id + "@" + c.getCenter().getNumPoints() + "@" + c.getNumselect() + "," + c.asFormatString());
                }
                writer.append(new Text(String.valueOf(id)), c);
            }
        } finally {
            writer.close();
        }
        System.out.println(suma + "===" + sumb);
        return clustersOut;
    }

    /**
     * Final MR pass: labels every input record with its cluster and writes
     * text output to {@code <output>/cluster_abtest}.
     */
    private void clusterDataMR(Configuration conf, Path input, Path clustersIn,
            Path output, String convergenceDelta, int rep) throws IOException,
            InterruptedException, ClassNotFoundException {
        conf.set(CLUSTER_PATH_KEY, clustersIn.toString());
        conf.set(CLUSTER_CONVERGENCE_KEY, convergenceDelta);
        conf.setInt(CLUSTER_CONVERGENCE_ABTEST_REP, rep);
        Job job = new Job(conf,
                "KMeans Driver running clusterData over input: " + input);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(KMeansClusterMapper.class);
        job.setCombinerClass(KMeansClusterCombiner.class);
        job.setReducerClass(KMeansClusterReduce.class);
        job.setNumReduceTasks(this.reduce);
        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, new Path(output, "cluster_abtest"));
        job.setJarByClass(KMeansDriver.class);
        if (!job.waitForCompletion(true)) {
            throw new InterruptedException(
                    "K-Means Clustering failed processing " + clustersIn);
        }
    }

    /**
     * Runs K-Means iterations until every cluster reports convergence or
     * {@code maxIterations} is reached; returns the path of the last
     * iteration's clusters ({@code <output>/cluster_<i>}).
     */
    private Path buildClustersMR(Configuration conf, Path input,
            Path clustersIn, Path output, int maxIterations, String delta)
            throws IOException, InterruptedException, ClassNotFoundException {
        boolean converged = false;
        int iteration = 1;
        while (!converged && iteration <= maxIterations) {
            reportProgress();
            Path clustersOut = new Path(output, "cluster_" + iteration);
            converged = runIteration(conf, input, clustersIn, clustersOut, delta);
            clustersIn = clustersOut;
            iteration++;
        }
        return clustersIn;
    }

    /**
     * Runs one K-Means MR iteration writing new centers to {@code clustersOut};
     * returns true when all resulting clusters report convergence.
     */
    private boolean runIteration(Configuration conf, Path input,
            Path clustersIn, Path clustersOut, String convergenceDelta)
            throws IOException, InterruptedException, ClassNotFoundException {
        conf.set(CLUSTER_PATH_KEY, clustersIn.toString());
        conf.set(CLUSTER_CONVERGENCE_KEY, convergenceDelta);
        FileSystem fs = FileSystem.get(conf);
        Job job = new Job(conf);
        job.setJobName("KMeans Driver running runIteration over clustersIn: "
                + clustersIn);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Vector.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Cluster.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setMapperClass(KMeansMapper.class);
        job.setCombinerClass(KMeansCombiner.class);
        job.setReducerClass(KMeansReducer.class);
        FileInputFormat.addInputPath(job, input);
        SequenceFileOutputFormat.setOutputPath(job, clustersOut);
        job.setNumReduceTasks(this.reduce);
        job.setJarByClass(KMeansDriver.class);
        if (!job.waitForCompletion(true)) {
            throw new InterruptedException(
                    "K-Means Iteration failed processing " + clustersIn);
        }
        return isConverged(clustersOut, conf, fs);
    }

    // Reads the clusters stored at filePath; throws if none are found.
    private static ArrayList<Cluster> getClusters(Path filePath,
            Configuration conf, FileSystem fs) throws IOException {
        ArrayList<Cluster> clusters = new ArrayList<Cluster>();
        KmeansPublic.configureWithClusterInfo(conf, filePath, clusters);
        if (clusters.isEmpty()) {
            throw new IllegalStateException(
                    "No clusters found. Check your -c path.");
        }
        return clusters;
    }

    // True when every cluster written at filePath reports convergence.
    private static boolean isConverged(Path filePath, Configuration conf,
            FileSystem fs) throws IOException {
        try {
            Collection<Cluster> clusters = getClusters(filePath, conf, fs);
            for (Cluster c : clusters) {
                if (!c.isConverged()) {
                    return false;
                }
            }
        } catch (Throwable e) {
            throw new IllegalStateException(e);
        }
        return true;
    }

    /**
     * MR job computing the initial cluster centers; writes them to
     * {@code <output>/part-InitCenter} and returns that path.
     */
    private Path InitCenter(Configuration conf, Path input, Path output, int k)
            throws IOException, InterruptedException, ClassNotFoundException {
        Path outFile = new Path(output, "part-InitCenter");
        Job job = new Job(conf);
        job.setJobName("KMeans Driver: " + outFile);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Cluster.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Cluster.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setMapperClass(KMeansGroupMapper.class);
        job.setCombinerClass(KMeansGroupCombine.class);
        job.setReducerClass(KMeansGroupReducer.class);
        FileInputFormat.addInputPath(job, input);
        SequenceFileOutputFormat.setOutputPath(job, outFile);
        job.setNumReduceTasks(32);
        job.setJarByClass(KMeansDriver.class);
        if (!job.waitForCompletion(true)) {
            throw new InterruptedException(
                    "K-Means Iteration failed processing " + outFile);
        }
        return outFile;
    }

    /**
     * Sequentially samples up to k seed clusters from the input (merging the
     * surplus points into randomly chosen seeds) and writes them to
     * {@code <output>/part-randomSeed/random}.  Local seeding alternative to
     * the InitCenter MR job.  NOTE(review): relies on {@code parse.setup(conf)}
     * having been called - confirm callers do so before using this directly.
     */
    public Path buildRandom(Configuration conf, Path input, Path output, int k)
            throws IOException {
        FileSystem fs = FileSystem.get(output.toUri(), conf);
        Path outFile = new Path(output, "part-randomSeed");
        fs.mkdirs(outFile);
        Path inputPathPattern;
        System.out.println(input);
        if (fs.getFileStatus(input).isDir()) {
            inputPathPattern = new Path(input, "*");
        } else {
            inputPathPattern = input;
        }
        System.out.println("######" + k);
        FileStatus[] inputFiles = fs.globStatus(inputPathPattern,
                KmeansPublic.FILTER);
        // Per-file read budget: roughly 50000/k lines split across the files,
        // never below 10 lines per file.
        int fileMaxReadCount = 50000;
        if (fileMaxReadCount <= k) {
            fileMaxReadCount = k;
        }
        if (inputFiles.length > 0) {
            fileMaxReadCount = fileMaxReadCount / inputFiles.length;
        }
        if (fileMaxReadCount <= 10) {
            fileMaxReadCount = 10;
        }
        System.out.println("#####" + k + "@" + fileMaxReadCount);
        List<Cluster> chosenClusters = new ArrayList<Cluster>(k);
        int nextClusterId = 0;
        for (FileStatus fileStatus : inputFiles) {
            if (fileStatus.isDir()) {
                continue;
            }
            int filehasread = 0;
            FSDataInputStream in = fs.open(fileStatus.getPath());
            BufferedReader bf = new BufferedReader(new InputStreamReader(in));
            try {
                String line;
                while ((line = bf.readLine()) != null) {
                    Vector vec = parse.parseVector(line);
                    if (vec == null) {
                        continue;  // unparsable record, skip
                    }
                    filehasread++;
                    int currentSize = chosenClusters.size();
                    if (currentSize < k) {
                        chosenClusters.add(new Cluster(vec, nextClusterId++));
                    } else {
                        // Fold surplus points into a randomly chosen seed.
                        int randIndex = (int) (Math.random() * currentSize);
                        chosenClusters.get(randIndex).getCenter().merger(vec);
                    }
                    if (filehasread > fileMaxReadCount) {
                        break;
                    }
                }
            } finally {
                // BUGFIX: streams were leaked when parsing/merging threw.
                bf.close();
                in.close();
            }
        }
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf,
                new Path(outFile, "random"), Text.class, Cluster.class);
        try {
            // BUGFIX: iterate over what was actually collected - the original
            // indexed 0..k and threw IndexOutOfBounds when input had < k rows.
            for (int i = 0; i < chosenClusters.size(); i++) {
                Cluster cluster = chosenClusters.get(i);
                cluster.setId(i);
                writer.append(new Text(String.valueOf(i)), cluster);
            }
        } finally {
            writer.close();
        }
        return outFile;
    }
}