package org.shanbo.feluca.significanttesting;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Random;

import org.apache.commons.lang3.StringUtils;
import org.shanbo.feluca.data2.DataStatistic;
import org.shanbo.feluca.data2.Vector;
import org.shanbo.feluca.paddle.common.Utilities;
import org.shanbo.feluca.paddle.common.collection.IntArray;

/**
 * Self-loop approach for Gionis et al.'s binary swap randomization of a
 * bipartite (row x feature) 0/1 matrix.
 * See "Assessing data mining results via swap randomization" (Gionis,
 * Mannila, Mielikainen, Tsaparas; KDD 2006 / TKDD 2007).
 *
 * Each 1-entry ("edge") of the matrix is stored in three parallel views:
 * a membership set {@link #graph} for O(1) duplicate checks, and two
 * index-aligned arrays {@link #jref} (row side) / {@link #iref} (feature
 * side) for O(1) uniform edge sampling. A swap exchanges the feature ends
 * of two random edges, which preserves every row sum and column sum.
 *
 * TODO tests
 * @author lgn
 */
public class BigraphRandomizeSwapper extends RandomSwapper {

	final static String outputSuffix = ".dat";

	/**
	 * Edge set; an edge (row, feature) is packed as (row << 32) | feature.
	 * Assumes both ids are non-negative ints, otherwise sign extension of the
	 * low word would corrupt the packed key.
	 */
	HashSet<Long> graph = new HashSet<Long>();
	/** jref.get(e) = row id of edge e; non-decreasing in load order, never mutated after load. */
	IntArray jref;
	/** iref.get(e) = feature id of edge e; this is the side mutated by swaps. */
	IntArray iref;
	Random r = new Random();
	int n = 0; // total # of edges (1-entries of the matrix)

	public BigraphRandomizeSwapper(String inputData, String outputPrefix,
			int itersPerLoop, int loops) throws IOException {
		super(inputData, outputPrefix, itersPerLoop, loops);
	}

	/**
	 * Attempts one random swap: picks two edges (aj, ai) and (bj, bi) uniformly
	 * at random and exchanges their feature ends, yielding (aj, bi) and
	 * (bj, ai). The swap is applied only when both crossed edges are absent
	 * from the graph, which keeps the matrix binary and preserves all margins.
	 *
	 * @return 1 if a swap was applied, 0 if it was rejected
	 */
	public int swap() {
		int a = r.nextInt(n);
		int b = r.nextInt(n);
		int aj = jref.get(a); // row of edge a
		int ai = iref.get(a); // feature of edge a
		int bj = jref.get(b); // row of edge b
		int bi = iref.get(b); // feature of edge b
		// Reject if either crossed edge already exists. This also rejects
		// a == b and the degenerate cases aj == bj / ai == bi, because then a
		// "new" edge coincides with an edge already in the graph.
		if (!graph.contains(((long) aj << 32) | (long) (bi))
				&& !graph.contains(((long) bj << 32) | (long) (ai))) {
			graph.remove(((long) aj << 32) | (long) (ai));
			graph.remove(((long) bj << 32) | (long) (bi));
			graph.add(((long) aj << 32) | (long) (bi));
			graph.add(((long) bj << 32) | (long) (ai));
			iref.set(a, bi);
			iref.set(b, ai);
			return 1;
		}
		return 0;
	}

	/**
	 * Loads the input data, then performs {@code itersPerLoop * loops} swap
	 * attempts. Every {@code itersPerLoop} iterations a snapshot of the matrix
	 * is written to {@code outputPrefix.<k>.dat}, one row per line with its
	 * feature ids sorted ascending and space-separated; the first snapshot
	 * (k = 0) is the unmodified input. Progress statistics are printed to
	 * stdout after each snapshot.
	 *
	 * @throws Exception on I/O failure or errors from the input reader
	 */
	@Override
	public void runSwap() throws Exception {
		int attributes = 1024; // initial capacity hint for the IntArrays
		long t = 0;
		attributes = Math.max(attributes, Utilities.getIntFromProperties(
				input.getDataStatistic(), DataStatistic.MAX_FEATURE_ID));
		jref = new IntArray(attributes);
		iref = new IntArray(attributes);
		int rowc = 0;
		int maxfeatureSize = 0;
		System.out.println("loading data~");
		input.reOpen();
		Vector sample = input.getNextVector();
		for (; sample != null; sample = input.getNextVector()) {
			for (int i = 0; i < sample.getSize(); i++) {
				jref.add(rowc);
				iref.add(sample.getFId(i));
				graph.add(((long) rowc << 32) | ((long) sample.getFId(i)));
				n += 1;
			}
			rowc += 1;
			maxfeatureSize = Math.max(maxfeatureSize, sample.getSize());
		}
		System.out.println("loading data finished! ");
		// Integer[] (not int[]) so the ranged Arrays.sort(array, from, to) and
		// the ranged StringUtils.join(Object[], String, int, int) overload apply.
		Integer[] newFidArray = new Integer[maxfeatureSize];
		int swaps = 0;
		int total = itersPerLoop * loops;
		// starting swap & output
		for (int i = 0; i < total; i++) {
			if (i % itersPerLoop == 0) {
				int k = 0; // row id currently being accumulated
				BufferedWriter writer = new BufferedWriter(new FileWriter(
						String.format("%s.%d%s", outputPrefix, (i / itersPerLoop), outputSuffix)));
				int idx = 0;
				// l == n is a sentinel pass that flushes the final row
				for (int l = 0; l <= n; l++) {
					if (l < n && (k == jref.get(l))) {
						newFidArray[idx] = iref.get(l); // feature id
						idx++;
					} else {
						Arrays.sort(newFidArray, 0, idx);
						writer.write(StringUtils.join(newFidArray, " ", 0, idx));
						// BUGFIX: terminate each vector with a newline. The
						// original wrote rows back-to-back with no separator,
						// so adjacent rows' feature ids even merged into a
						// single token, producing an unparseable file.
						writer.newLine();
						if (l < n) {
							idx = 0;
							newFidArray[idx++] = iref.get(l);
							k = jref.get(l);
						}
					}
				}
				writer.close();
				if (i > 0) {
					long t2 = System.currentTimeMillis();
					System.out.println(String.format("%.1f\t%d\t%d\t%.4f\t%.5f",
							(t2 - t) / 1000.0, i, swaps, (swaps + 0.0) / i, (swaps + 0.0) / n));
					t = t2;
				} else { // first time
					System.out.println("time(s)\titerlen\tswapped\t%/loop\t%/all");
					System.out.println("0.0\t0\t0\t0.0\t0.0");
					t = System.currentTimeMillis();
				}
			}
			swaps += swap();
		}
	}
}