package com.alimama.quanjingmonitor.kmeans;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

/**
 * Driver for the abtest K-Means pipeline: seeds initial centers, iterates
 * K-Means as MapReduce jobs, trims the resulting clusters down to a fixed
 * selection quota, and finally assigns the input data to abtest groups.
 *
 * Related pages:
 *   http://quanjing.alimama.com:9999/downloadabtest.jsp?strmatch=rep0&uuid=46ce2a8e-1b74-4fee-bb3a-a9c111e35655
 *   http://quanjing.alimama.com:9999/downloadabtest.jsp?strmatch=rep1&uuid=46ce2a8e-1b74-4fee-bb3a-a9c111e35655
 *   http://adhoc.etao.com:9999/downloadoffline.jsp?uuid=46ce2a8e-1b74-4fee-bb3a-a9c111e35655
 *   http://quanjing.alimama.com:9999/abtestInfo.jsp?uuid=46ce2a8e-1b74-4fee-bb3a-a9c111e35655
 *   http://42.156.210.133:9999/visualization/kmeans.jsp
 *
 * Sample job configuration:
 *   colls             ["prov","seller_star_name"]
 *   colls_important   ["main_cat_name"]
 *   jobname           abtest
 *   mailto            yannian.mu@alibaba-inc.com
 *   memo              beiyong
 *   number_important  ["(case when (alipay_direct_num+alipay_indirect_num)=0 then 0.00001 else ((alipay_direct_amt+alipay_indirect_amt)/(alipay_direct_num+alipay_indirect_num)) end)"]
 *   numbers           ["alipay_indirect_amt","alipay_direct_num","e_gmv_indirect_amt"]
 *   params            {"project":"rpt_p4padhoc_cust","callback":"null","start":"0","rows":"20","q":"[{\"thedate\":{\"operate\":5,\"value\":[\"20140521\",\"20140520\",\"20140519\",\"20140518\",\"20140517\",\"20140516\"]}}]","dist":"null","username":"yannian.mu","fl":"prov,seller_star_name,sum(alipay_direct_amt),sum(alipay_indirect_amt),sum(alipay_indirect_num),sum(alipay_direct_num),sum(e_gmv_direct_cnt),average(e_gmv_direct_cnt),sum(e_gmv_direct_amt),sum(e_gmv_indirect_amt),sum(e_gmv_indirect_cnt)","groupby":"prov,seller_star_name","sort":"null","order":"null","leftjoin":"null","dimvalue":"省, 卖家星级, 求和(直通车直接成交金额), 求和(直通车间接成交金额), 求和(直通车间接成交笔数), 求和(直通车直接成交笔数), 求和(GMV直接成交笔数), 平均值(GMV直接成交笔数), 求和(GMV直接成交金额), 求和(GMV间接成交金额), 求和(GMV间接成交笔数)","jobparam":"过滤条件:"}
 *   project           rpt_p4padhoc_cust
 *   q                 [{"thedate":{"operate":5,"value":["20140521","20140520","20140519","20140518","20140517","20140516"]}}]
 *   rnd               0.13910470720947243
 *   username          yannian.mu
 *
 * Inspect clustered output:
 *   hadoop fs -cat /group/tbdp-etao-adhoc/p4padhoc/abtest_cust/2001/cluster_abtest/* | sed 's/\x01/\x09/g' | more
 *
 * Run the full pipeline (args after the class name: unused input output maxIterations k delta count rep reduce config splitsize):
 *   hadoop jar ./kkk.jar com.alimama.quanjingmonitor.kmeans.KMeansDriver 0 /group/tbdp-etao-adhoc/p4padhoc/abtestforbp /group/tbdp-etao-adhoc/p4padhoc/abtest_out/131 3 200 0.0001 1000 2 10 0,1@2@3,4 1048576
 *
 * Print the clusters of an existing iteration (maxIterations = 0):
 *   hadoop jar ./kkk.jar com.alimama.quanjingmonitor.kmeans.KMeansDriver 0 /group/tbdp-etao-adhoc/p4padhoc/abtest_out/131/cluster_1 xxx 0 10 0.0001 1000 2 10 0,1@2@3,4 0000 | more
 *
 * Further sample runs:
 *   hadoop jar ./kkk.jar com.alimama.quanjingmonitor.kmeans.KMeansDriver 0 /group/tbdp-etao-adhoc/p4padhoc/abtestforbp /group/tbdp-etao-adhoc/p4padhoc/abtest_out/500 4 1000 0.0001 1000 2 100 0;1;2;3,4 548576
 *   hadoop jar ./kkk.jar com.alimama.quanjingmonitor.kmeans.KMeansDriver 0 /group/tbdp-etao-adhoc/p4padhoc/adhoc_cust_seed /group/tbdp-etao-adhoc/p4padhoc/abtest_cust/001 2 999 0.0001 1500 2 100 "1;2;3,4;5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29" 548576
 *   hadoop jar ./kkk.jar com.alimama.quanjingmonitor.kmeans.KMeansDriver 0 /group/tbdp-etao-adhoc/p4padhoc/abtest_cust/001/part-InitCenter xxx 0 10 0.0001 1000 2 10 "1;2;3,4;5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29" 0000 | more
 *   hadoop fs -cat /group/tbdp-etao-adhoc/p4padhoc/abtest_cust/kkk1413/cluster_abtest/* | sed 's/\x01/\x09/g' | grep rep0 > abtest_1500_rep0.txt
 *   nohup hadoop jar ./kkk_ok.jar com.alimama.quanjingmonitor.kmeans.KMeansDriver 0 /group/tbdp-etao-adhoc/p4padhoc/adhoc_cust_seed /group/tbdp-etao-adhoc/p4padhoc/abtest_cust/kkk_ok 15 999 0.0001 1500 2 100 "1,3,4;2;3,4;5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29" 548576 > kkk_ok_1500.log &
 *
 * @author yannian.mu
 */
public class KMeansDriver extends Configured implements Tool {

    private static Logger LOG = Logger.getLogger(KMeansDriver.class);

    MysqlCallbackKmeans callback = null;

    public KMeansDriver(MysqlCallbackKmeans callback) {
        this.callback = callback;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new Configuration(), new KMeansDriver(null), args);
    }

    @Override
    public int run(String[] args) throws Exception {
        LOG.info("KMeansDriver start :" + Arrays.toString(args));
        Path input = new Path(args[1]);
        Path output = new Path(args[2]);
        int maxIterations = Integer.parseInt(args[3]);
        int k = Integer.parseInt(args[4]);
        String delta = args[5];
        int count = Integer.parseInt(args[6]);
        int rep = Integer.parseInt(args[7]);
        int reduce = Integer.parseInt(args[8]);
        String config = args[9];
        String splitsize = args[10];
        this.KMeans(input, output, maxIterations, k, delta, count, rep, reduce, config, splitsize);
        return 0;
    }
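    /*
     * For reference, the positional arguments parsed by run() above, matched to
     * the sample commands in the class comment. The labels are descriptive
     * names added here for illustration; they are not defined in this code.
     *
     *   args[0]                 unused ("0" in every sample invocation)
     *   args[1]  input          HDFS path of the input vectors
     *   args[2]  output         HDFS working/output directory ("xxx" when maxIterations is 0)
     *   args[3]  maxIterations  0 = only print the clusters found at the input path
     *   args[4]  k              number of clusters to seed
     *   args[5]  delta          convergence threshold, e.g. 0.0001
     *   args[6]  count          total selection quota distributed in makeFinalCluster()
     *   args[7]  rep            abtest replica count (2 in the samples)
     *   args[8]  reduce         number of reduce tasks
     *   args[9]  config         column-group spec, e.g. "0,1@2@3,4" or "1;2;3,4;..."
     *   args[10] splitsize      value for mapred.max.split.size, e.g. 1048576
     */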
    private ParseVector parse = new ParseVector();
    private int reduce = 2;
    private int percentStage = 18;

    public void KMeans(Path input, Path output, int maxIterations, int k, String delta, int count, int rep,
            int reduce, String config, String splitsize)
            throws IOException, InterruptedException, ClassNotFoundException {
        this.reduce = reduce;
        Configuration conf = this.getConf();
        conf.set("mapred.max.split.size", splitsize);
        conf.set("abtest.kmeans.config", config);
        FileSystem fs = FileSystem.get(conf);
        this.percentStage = 20;
        // maxIterations == 0 is an inspection mode: print the clusters stored at
        // the input path and exit without running any jobs.
        if (maxIterations == 0) {
            ArrayList<Cluster> clusters = getClusters(input, conf, fs);
            for (Cluster c : clusters) {
                System.out.println(c.asFormatString());
            }
            return;
        }
        parse.setup(conf);
        // HadoopUtil.delete(conf, output);
        fs.mkdirs(output);
        if (callback != null) {
            callback.setPercent("Stage-" + (this.percentStage++) + " map = 100.0%, reduce = 100.0%");
            callback.maybeSync();
        }
        Path clusters_random = this.InitCenter(conf, input, output, k);
        Path finalClusters = this.buildClustersMR(conf, input, clusters_random, output, maxIterations, delta);
        // Debug shortcut kept from the original source:
        // Path finalClusters = new Path("/group/tbdp-etao-adhoc/p4padhoc/abtest_cust/kkk_ok_1709/cluster_20");
        // /group/tbdp-etao-adhoc/p4padhoc/abtest_out/130/cluster_41
        Path clustersOut = this.makeFinalCluster(conf, finalClusters, output, delta, count, rep);
        if (callback != null) {
            callback.setPercent("Stage-" + (this.percentStage++) + " map = 100.0%, reduce = 100.0%");
            callback.maybeSync();
        }
        this.clusterDataMR(conf, input, clustersOut, output, delta, rep);
    }

    private Path makeFinalCluster(Configuration conf, Path finalClusters, Path output, String delta, int count,
            int rep) throws IOException {
        FileSystem fs = FileSystem.get(finalClusters.toUri(), conf);
        ArrayList<Cluster> clusters = getClusters(finalClusters, conf, fs);
        long total = 0;
        for (Cluster c : clusters) {
            total += c.getCenter().getNumPoints();
        }
        // How many input points one selected point stands for. The cast matters:
        // the original "total / count" was integer division.
        float rate = (float) total / count;
        HashMap<Integer, Integer> hashm = new HashMap<Integer, Integer>();
        long total2 = 0;
        for (Cluster c : clusters) {
            long points = c.getCenter().getNumPoints();
            if (points >= 20) {
                int allow = (int) (points / rate) + 1;
                total2 += allow;
                hashm.put(c.getId(), allow);
            } else {
                // Clusters with fewer than 20 points get no selection quota.
                hashm.put(c.getId(), 0);
            }
        }
        // Sort largest clusters first, so the overshoot is trimmed from them first.
        Collections.sort(clusters, new Comparator<Cluster>() {
            @Override
            public int compare(Cluster o1, Cluster o2) {
                long t1 = o1.getCenter().getNumPoints();
                long t2 = o2.getCenter().getNumPoints();
                return t1 == t2 ? 0 : t1 < t2 ? 1 : -1;
            }
        });
        // The "+1" per cluster above makes the quotas overshoot count; take the
        // surplus back one unit at a time, round-robin over the sorted clusters.
        long left = total2 - count;
        while (left > 0) {
            for (Cluster c : clusters) {
                if (left <= 0) {
                    break;
                }
                int id = c.getId();
                int num = hashm.get(id);
                if (num > 0) {
                    hashm.put(id, num - 1);
                    left--;
                }
            }
        }
        for (int i = 0; i < clusters.size(); i++) {
            Cluster c = clusters.get(i);
            c.setNumselect(hashm.get(c.getId()));
        }
        Path clustersOut = new Path(output, "cluster_final");
        long suma = 0;
        long sumb = 0;
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, clustersOut, Text.class, Cluster.class);
        for (int i = 0; i < clusters.size(); i++) {
            Cluster c = clusters.get(i);
            int id = c.getId();
            suma += c.getCenter().getNumPoints();
            sumb += c.getNumselect();
            if (i < 10) {
                System.out.println(id + "@" + c.getCenter().getNumPoints() + "@" + c.getNumselect() + ","
                        + c.asFormatString());
            }
            writer.append(new Text(String.valueOf(id)), c);
        }
        writer.close();
        System.out.println(suma + "===" + sumb);
        return clustersOut;
    }
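    /*
     * Worked example of the quota allocation above, with invented numbers: if
     * total = 1000 points and count = 100, then rate = 10.0. A cluster with
     * 240 points gets (int) (240 / 10.0) + 1 = 25 selections; a cluster with
     * 15 points (< 20) gets 0. Because of the "+1" per qualifying cluster the
     * quotas sum to slightly more than count, and the round-robin loop then
     * removes the surplus starting from the largest clusters, so the final
     * quotas sum to exactly count (assuming the surplus can be absorbed by
     * the nonzero quotas).
     */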
    private void clusterDataMR(Configuration conf, Path input, Path clustersIn, Path output,
            String convergenceDelta, int rep) throws IOException, InterruptedException, ClassNotFoundException {
        conf.set(CLUSTER_PATH_KEY, clustersIn.toString());
        conf.set(CLUSTER_CONVERGENCE_KEY, convergenceDelta);
        conf.setInt(CLUSTER_CONVERGENCE_ABTEST_REP, rep);
        Job job = new Job(conf, "KMeans Driver running clusterData over input: " + input);
        // job.setInputFormatClass(FileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(KMeansClusterMapper.class);
        job.setCombinerClass(KMeansClusterCombiner.class);
        job.setReducerClass(KMeansClusterReduce.class);
        job.setNumReduceTasks(this.reduce);
        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, new Path(output, "cluster_abtest"));
        job.setJarByClass(KMeansDriver.class);
        if (!job.waitForCompletion(true)) {
            throw new InterruptedException("K-Means Clustering failed processing " + clustersIn);
        }
    }

    private Path buildClustersMR(Configuration conf, Path input, Path clustersIn, Path output, int maxIterations,
            String delta) throws IOException, InterruptedException, ClassNotFoundException {
        boolean converged = false;
        int iteration = 1;
        while (!converged && iteration <= maxIterations) {
            if (callback != null) {
                callback.setPercent("Stage-" + (this.percentStage++) + " map = 100.0%, reduce = 100.0%");
                callback.maybeSync();
            }
            Path clustersOut = new Path(output, "cluster_" + iteration);
            converged = runIteration(conf, input, clustersIn, clustersOut, delta);
            clustersIn = clustersOut;
            iteration++;
        }
        return clustersIn;
    }

    public static final String CLUSTER_CONVERGENCE_ABTEST_REP = "org.apache.mahout.clustering.kmeans.abtest.rep";
    public static final String CLUSTER_CONVERGENCE_KEY = "org.apache.mahout.clustering.kmeans.convergence";
    public static final String CLUSTER_PATH_KEY = "org.apache.mahout.clustering.kmeans.path";

    private boolean runIteration(Configuration conf, Path input, Path clustersIn, Path clustersOut,
            String convergenceDelta) throws IOException, InterruptedException, ClassNotFoundException {
        conf.set(CLUSTER_PATH_KEY, clustersIn.toString());
        conf.set(CLUSTER_CONVERGENCE_KEY, convergenceDelta);
        FileSystem fs = FileSystem.get(conf);
        Job job = new Job(conf);
        job.setJobName("KMeans Driver running runIteration over clustersIn: " + clustersIn);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Vector.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Cluster.class);
        // job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setMapperClass(KMeansMapper.class);
        job.setCombinerClass(KMeansCombiner.class);
        job.setReducerClass(KMeansReducer.class);
        FileInputFormat.addInputPath(job, input);
        SequenceFileOutputFormat.setOutputPath(job, clustersOut);
        job.setNumReduceTasks(this.reduce);
        job.setJarByClass(KMeansDriver.class);
        // HadoopUtil.delete(conf, clustersOut);
        if (!job.waitForCompletion(true)) {
            throw new InterruptedException("K-Means Iteration failed processing " + clustersIn);
        }
        return isConverged(clustersOut, conf, fs);
    }
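    /*
     * Per-iteration dataflow, as implemented above: runIteration() publishes the
     * current centers to the mappers via CLUSTER_PATH_KEY and the threshold via
     * CLUSTER_CONVERGENCE_KEY; KMeansMapper/KMeansCombiner/KMeansReducer then
     * recompute the centers into output/cluster_<N> as a SequenceFile; finally
     * isConverged() reads every cluster back and reports convergence only when
     * all of them are converged. buildClustersMR() repeats this until either
     * convergence or maxIterations, and returns the path of the last iteration.
     */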
    private static ArrayList<Cluster> getClusters(Path filePath, Configuration conf, FileSystem fs)
            throws IOException {
        ArrayList<Cluster> clusters = new ArrayList<Cluster>();
        KmeansPublic.configureWithClusterInfo(conf, filePath, clusters);
        if (clusters.isEmpty()) {
            throw new IllegalStateException("No clusters found. Check your -c path.");
        }
        return clusters;
    }

    private static boolean isConverged(Path filePath, Configuration conf, FileSystem fs) throws IOException {
        try {
            Collection<Cluster> clusters = getClusters(filePath, conf, fs);
            for (Cluster c : clusters) {
                if (!c.isConverged()) {
                    return false;
                }
            }
        } catch (Throwable e) {
            throw new IllegalStateException(e);
        }
        return true;
    }

    private Path InitCenter(Configuration conf, Path input, Path output, int k)
            throws IOException, InterruptedException, ClassNotFoundException {
        Path outFile = new Path(output, "part-InitCenter");
        Job job = new Job(conf);
        job.setJobName("KMeans Driver: " + outFile);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Cluster.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Cluster.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setMapperClass(KMeansGroupMapper.class);
        job.setCombinerClass(KMeansGroupCombine.class);
        job.setReducerClass(KMeansGroupReducer.class);
        FileInputFormat.addInputPath(job, input);
        SequenceFileOutputFormat.setOutputPath(job, outFile);
        job.setNumReduceTasks(32);
        job.setJarByClass(KMeansDriver.class);
        // HadoopUtil.delete(conf, clustersOut);
        if (!job.waitForCompletion(true)) {
            throw new InterruptedException("K-Means Iteration failed processing " + outFile);
        }
        return outFile;
    }

    public Path buildRandom(Configuration conf, Path input, Path output, int k) throws IOException {
        FileSystem fs = FileSystem.get(output.toUri(), conf);
        Path outFile = new Path(output, "part-randomSeed");
        fs.mkdirs(outFile);
        Path inputPathPattern;
        System.out.println(input);
        if (fs.getFileStatus(input).isDir()) {
            inputPathPattern = new Path(input, "*");
        } else {
            inputPathPattern = input;
        }
        System.out.println("######" + k);
        FileStatus[] inputFiles = fs.globStatus(inputPathPattern, KmeansPublic.FILTER);
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, new Path(outFile, "random"),
                Text.class, Cluster.class);
        List<Cluster> chosenClusters = new ArrayList<Cluster>(k);
        int nextClusterId = 0;
        // Cap how many lines are read per input file so seeding stays cheap,
        // while still reading at least k lines overall and at least 10 per file.
        int fileMaxReadCount = 50000;
        if (fileMaxReadCount <= k) {
            fileMaxReadCount = k;
        }
        if (inputFiles.length > 0) {
            fileMaxReadCount = fileMaxReadCount / inputFiles.length;
        }
        if (fileMaxReadCount <= 10) {
            fileMaxReadCount = 10;
        }
        System.out.println("#####" + k + "@" + fileMaxReadCount);
        int number = 0;
        for (FileStatus fileStatus : inputFiles) {
            if (fileStatus.isDir()) {
                continue;
            }
            int filehasread = 0;
            FSDataInputStream in = fs.open(fileStatus.getPath());
            BufferedReader bf = new BufferedReader(new InputStreamReader(in));
            String line;
            while ((line = bf.readLine()) != null) {
                Vector vec = parse.parseVector(line);
                if (vec == null) {
                    continue;
                }
                // System.out.println(filehasread + "@" + fileMaxReadCount + "," + vec.toString());
                number++;
                filehasread++;
                int currentSize = chosenClusters.size();
                if (currentSize < k) {
                    // The first k parsed vectors each become their own seed cluster.
                    chosenClusters.add(new Cluster(vec, nextClusterId++));
                } else {
                    // Later vectors are merged into a randomly chosen seed center.
                    int randIndex = (int) (Math.random() * currentSize);
                    chosenClusters.get(randIndex).getCenter().merger(vec);
                }
                if (filehasread > fileMaxReadCount) {
                    break;
                }
            }
            bf.close();
            in.close();
        }
        // Guard against fewer than k usable input lines, which would otherwise
        // throw IndexOutOfBoundsException here.
        int limit = Math.min(k, chosenClusters.size());
        for (int i = 0; i < limit; i++) {
            Cluster cluster = chosenClusters.get(i);
            cluster.setId(i);
            writer.append(new Text(String.valueOf(i)), cluster);
        }
        writer.close();
        return outFile;
    }
}
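/*
 * Sketch, not part of the original class: how the SequenceFiles written by
 * InitCenter/buildRandom/makeFinalCluster can be inspected from Java with the
 * same old-style Hadoop API this class uses. The path is a placeholder, and it
 * assumes Cluster has the no-arg constructor Writable deserialization needs.
 *
 *   Configuration conf = new Configuration();
 *   Path clustersPath = new Path("/path/to/output/cluster_final"); // placeholder
 *   FileSystem fs = FileSystem.get(clustersPath.toUri(), conf);
 *   SequenceFile.Reader reader = new SequenceFile.Reader(fs, clustersPath, conf);
 *   Text key = new Text();
 *   Cluster value = new Cluster();
 *   while (reader.next(key, value)) {
 *       System.out.println(key + " -> " + value.asFormatString());
 *   }
 *   reader.close();
 */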