package ldbc.snb.datagen.generator;
import ldbc.snb.datagen.generator.tools.GraphUtils;
import ldbc.snb.datagen.generator.tools.PersonGraph;
import ldbc.snb.datagen.objects.Knows;
import ldbc.snb.datagen.objects.Person;
import org.apache.hadoop.conf.Configuration;
import java.util.*;
/**
* Created by aprat on 11/15/14.
*/
public class ClusteringKnowsGenerator implements KnowsGenerator {
// Random source shared by all sampling in this class; created in the constructor and
// re-seeded at the start of generateKnows().
Random rand;
// Percentages list received by generateKnows() and forwarded to Knows.target_edges().
private ArrayList<Float> percentages = null;
// Step index received by generateKnows() and forwarded to Knows.target_edges().
private int stepIndex = 0;
// Target clustering coefficient; initialize() stores half of the configured value here.
private float targetCC = 0.0f;
// Statistics counters: incremented while edges are created, zeroed by resetStatistics(),
// printed by printStatistics().
private int numMisses = 0;
private int numCoreCoreEdges = 0;
private int numCorePeripheryEdges = 0;
private int numCoreExternalEdges = 0;
// Lower bound that worsenCC() never pushes a community's p_ below.
private float min_community_prob_ = 0.0f;
// Per-person record used while building communities (instances are created in findSolution()).
private class PersonInfo {
// Position of the person inside the persons list.
public int index_;
// Degree budget for the current step; filled from Knows.target_edges().
public long degree_;
// Filled from Person.maxNumKnows(); used as the normalising degree in the clustering estimates.
public long original_degree_;
}
// A community split into core members and periphery members.
private class Community {
// Set by generateCommunities() to the community's position in the communities list.
public long id_;
// Members selected as core by findSolution().
public ArrayList<PersonInfo> core_;
// Members that did not qualify as core; attached to core members only.
public ArrayList<PersonInfo> periphery_;
// Probability with which each core-core pair is linked (see createEdgesCommunityCore()).
public float p_ = 1.0f;
}
/**
 * Orders PersonInfo records by descending degree_, breaking ties by ascending index_.
 *
 * The comparison is done with explicit relational operators instead of subtraction:
 * casting a long difference to int can truncate to the wrong sign (or to zero), and an
 * int difference can overflow, either of which breaks the Comparator contract.
 */
private class PersonInfoComparator implements Comparator<PersonInfo>{
    /**
     * @param a first record
     * @param b second record
     * @return a negative value when a sorts before b, a positive value when it sorts
     *         after, and 0 when both degree_ and index_ are equal
     */
    public int compare(PersonInfo a, PersonInfo b) {
        if (a.degree_ != b.degree_) {
            // Higher degree first.
            return a.degree_ > b.degree_ ? -1 : 1;
        }
        if (a.index_ != b.index_) {
            // Lower index first among equal degrees.
            return a.index_ < b.index_ ? -1 : 1;
        }
        return 0;
    }
}
/**
 * Working tables for the clustering-coefficient estimation. The per-person lists are
 * indexed by position in the persons list; the per-community lists are indexed by the
 * community's position in the communities list.
 */
private class ClusteringInfo {
    // ---- per-person tables ----
    public ArrayList<Boolean> is_core_ = new ArrayList<Boolean>();
    public ArrayList<Double> core_node_expected_core_degree_ = new ArrayList<Double>();
    public ArrayList<Double> core_node_excedence_degree_ = new ArrayList<Double>();
    public ArrayList<Double> core_node_expected_periphery_degree_ = new ArrayList<Double>();
    public ArrayList<Double> core_node_expected_external_degree_ = new ArrayList<Double>();
    public ArrayList<Double> clustering_coefficient_ = new ArrayList<Double>();
    // ---- per-community tables ----
    public ArrayList<Long> community_core_stubs_ = new ArrayList<Long>();
    public ArrayList<Float> community_core_probs_ = new ArrayList<Float>();
    // Person indices of every core member, in community order.
    public ArrayList<Integer> core_nodes_ = new ArrayList<Integer>();
    // Per-person: position of the owning community.
    public ArrayList<Integer> community_id_ = new ArrayList<Integer>();
    public float sumProbs = 0.0f;
    public int numCommunities = 0;

    /**
     * @param size        number of persons; every per-person table gets this many zeroed entries
     * @param communities communities whose membership is recorded in is_core_, core_nodes_
     *                    and community_id_
     */
    ClusteringInfo( int size, ArrayList<Community> communities ) {
        // Zero-fill the per-person tables.
        for (int person = 0; person < size; person++) {
            is_core_.add(Boolean.FALSE);
            community_id_.add(0);
            clustering_coefficient_.add(0.0);
            core_node_expected_core_degree_.add(0.0);
            core_node_excedence_degree_.add(0.0);
            core_node_expected_periphery_degree_.add(0.0);
            core_node_expected_external_degree_.add(0.0);
        }

        final int communityCount = communities.size();
        for (int position = 0; position < communityCount; position++) {
            // One zeroed slot per community.
            community_core_stubs_.add(0L);
            community_core_probs_.add(0.0f);

            Community current = communities.get(position);
            for (PersonInfo member : current.core_) {
                core_nodes_.add(member.index_);
                is_core_.set(member.index_, true);
                community_id_.set(member.index_, position);
            }
            for (PersonInfo member : current.periphery_) {
                is_core_.set(member.index_, false);
                community_id_.set(member.index_, position);
            }
        }

        numCommunities = communityCount;
        sumProbs = communityCount;
    }
}
/** Creates the generator with a fresh Random; generateKnows() re-seeds it before use. */
public ClusteringKnowsGenerator() {
    this.rand = new Random();
}
/**
 * Tries to build one community out of persons[begin..last] (both inclusive).
 *
 * Candidates are sorted with PersonInfoComparator and then scanned once: a candidate
 * joins the core when its degree is at least the current core size, otherwise it goes
 * to the periphery. The split is then validated by checkBudget().
 *
 * @return the community, or null when checkBudget() rejects the split
 */
private Community findSolution( ArrayList<Person> persons, int begin, int last) {
    ArrayList<PersonInfo> candidates = new ArrayList<PersonInfo>();
    final int stop = last + 1;
    for (int position = begin; position < stop; position++) {
        Person person = persons.get(position);
        PersonInfo info = new PersonInfo();
        info.index_ = position;
        info.degree_ = Knows.target_edges(person, percentages, stepIndex);
        info.original_degree_ = (long) (person.maxNumKnows());
        candidates.add(info);
    }
    Collections.sort(candidates, new PersonInfoComparator());

    ArrayList<PersonInfo> core = new ArrayList<PersonInfo>();
    ArrayList<PersonInfo> periphery = new ArrayList<PersonInfo>();
    for (PersonInfo info : candidates) {
        // The core only grows while the next candidate can still reach every current core member.
        ArrayList<PersonInfo> destination = info.degree_ >= core.size() ? core : periphery;
        destination.add(info);
    }
    return checkBudget(persons, core, periphery);
}
/** Budget of each core member assuming a core-core link probability of 1. */
private ArrayList<Long> createInitialBudget( ArrayList<PersonInfo> core) {
    final float fullyConnected = 1.0f;
    return createInitialBudget(core, fullyConnected);
}
/**
 * Computes, for every core member, its degree minus the truncated value of
 * (core size - 1) * p.
 *
 * @param core core members, in the order the budgets are returned
 * @param p    core-core link probability
 * @return one budget per core member (may be negative)
 */
private ArrayList<Long> createInitialBudget( ArrayList<PersonInfo> core, float p ) {
    // Identical for every member, so compute it once.
    final long spentInsideCore = (long) ((core.size() - 1) * p);
    ArrayList<Long> remaining = new ArrayList<Long>(core.size());
    for (int position = 0; position < core.size(); position++) {
        remaining.add(core.get(position).degree_ - spentInsideCore);
    }
    return remaining;
}
/**
 * Checks that every periphery member can place all of its degree on core members that
 * still have spare budget, at most one link per core member.
 *
 * The periphery list is sorted in place with PersonInfoComparator. The persons argument
 * is not used.
 *
 * @return a Community wrapping core and periphery, or null as soon as one periphery
 *         member cannot be fully placed
 */
private Community checkBudget(ArrayList<Person> persons, ArrayList<PersonInfo> core, ArrayList<PersonInfo> periphery) {
    ArrayList<Long> spare = createInitialBudget(core);
    Collections.sort(periphery, new PersonInfoComparator());

    for (PersonInfo member : periphery) {
        long unplaced = member.degree_;
        for (int slot = 0; slot < spare.size() && unplaced > 0; ++slot) {
            long available = spare.get(slot);
            if (available > 0) {
                spare.set(slot, available - 1);
                unplaced--;
            }
        }
        if (unplaced > 0) {
            return null;
        }
    }

    Community result = new Community();
    result.core_ = core;
    result.periphery_ = periphery;
    return result;
}
/**
 * Sanity check: prints an error line for every core member whose degree is smaller
 * than the number of other core members.
 */
private void testCommunity(Community c) {
    for (PersonInfo member : c.core_) {
        int otherCoreMembers = c.core_.size() - 1;
        if (member.degree_ < otherCoreMembers) {
            System.out.println("Error in building communities\n");
        }
    }
}
/**
 * Splits the persons list into consecutive communities.
 *
 * Starting at the first unassigned person, the window is widened one person at a time
 * and findSolution() is attempted on it. The widest window that produced a community is
 * kept; widening stops after more than 5 consecutive failures or at the end of the list.
 * The next community starts right after the kept window.
 *
 * @return the communities in list order; each id_ equals its position in the result
 */
private ArrayList<Community> generateCommunities( ArrayList<Person> persons) {
    final int maxMisses = 5;
    final int end = persons.size();
    ArrayList<Community> result = new ArrayList<Community>();

    int begin = 0;
    while (begin < end) {
        int bestLast = begin;
        Community kept = null;
        int misses = 0;
        int cursor = begin;
        while (misses <= maxMisses && cursor < end) {
            misses++;
            Community attempt = findSolution(persons, begin, cursor);
            if (attempt != null) {
                kept = attempt;
                bestLast = cursor;
                misses = 0;
            }
            cursor++;
        }
        kept.id_ = result.size();
        result.add(kept);
        testCommunity(kept);
        begin = bestLast + 1;
    }
    return result;
}
/**
 * Fills the expected-degree tables of cInfo for the core members of one community.
 *
 * For each core member the degree is split into three parts: expected core degree
 * ((core size - 1) * prob), expected periphery degree (greedy assignment against the
 * periphery members' degree budgets, bounded by the member's excess degree) and the
 * remainder, stored as expected external degree. The sum of the remainders is stored
 * as the community's stub count, and c.p_ as its probability.
 *
 * The periphery list of c is sorted in place with PersonInfoComparator.
 */
private void computeCommunityInfo(ClusteringInfo cInfo, Community c, float prob) {
    Collections.sort(c.periphery_, new PersonInfoComparator());
    long[] openSlots = new long[c.periphery_.size()];
    for (int slot = 0; slot < openSlots.length; slot++) {
        openSlots[slot] = c.periphery_.get(slot).degree_;
    }

    // First pass: core degree and excess degree, periphery degree reset to zero.
    final double expectedCoreDegree = (c.core_.size() - 1) * (double) prob;
    for (PersonInfo coreNode : c.core_) {
        int person = coreNode.index_;
        cInfo.core_node_expected_core_degree_.set(person, expectedCoreDegree);
        cInfo.core_node_excedence_degree_.set(person, coreNode.degree_ - expectedCoreDegree);
        cInfo.core_node_expected_periphery_degree_.set(person, 0.0);
    }

    // Second pass: hand out periphery slots, whatever is left becomes external degree.
    long communityStubs = 0;
    for (PersonInfo coreNode : c.core_) {
        int person = coreNode.index_;
        double limit = cInfo.core_node_excedence_degree_.get(person);
        double attached = 0;
        for (int slot = 0; slot < openSlots.length; slot++) {
            if (openSlots[slot] == 0 || !(attached < limit)) {
                continue;
            }
            attached++;
            openSlots[slot]--;
        }
        cInfo.core_node_expected_periphery_degree_.set(person, attached);
        double external = coreNode.degree_ - cInfo.core_node_expected_core_degree_.get(person) - attached;
        cInfo.core_node_expected_external_degree_.set(person, external);
        communityStubs += external;
    }

    int communitySlot = (int) c.id_;
    cInfo.community_core_stubs_.set(communitySlot, communityStubs);
    cInfo.community_core_probs_.set(communitySlot, c.p_);
}
/**
 * Re-estimates the clustering coefficient of every member of community c for the given
 * core-core link probability and stores the results in cInfo.clustering_coefficient_.
 *
 * Steps:
 *  1. computeCommunityInfo() refreshes the expected degrees of c's core members.
 *  2. From the stub counts of all OTHER communities, two probabilities are derived:
 *     probSameCommunity and probTriangleSameCommunity (the latter weighted by each
 *     community's stored probability).
 *  3. probTwoConnected accumulates, over every ordered pair of core nodes that belong to
 *     different communities and each have an expected external degree of at least 1,
 *     degree1 * degree2 / (2 * sumStubs^2).
 *  4. Periphery and core members of c get their coefficient from the expected number of
 *     triangles divided by original_degree_ * (original_degree_ - 1).
 *
 * Fixes relative to the previous version: the pair loop compared a node's community with
 * itself (and did so by boxed-Integer reference), so probTwoConnected was always 0; and
 * when no other community has stubs the divisions by sumStubs^2 are now skipped, so the
 * probabilities stay 0 instead of becoming NaN/Infinity.
 *
 * @param cInfo shared estimation tables, updated in place
 * @param c     community to re-estimate
 * @param prob  core-core link probability to assume for c
 */
private void estimateCCCommunity( ClusteringInfo cInfo, Community c, float prob ) {
    computeCommunityInfo(cInfo, c, prob);

    float probSameCommunity = 0.0f;
    float probTriangleSameCommunity = 0.0f;
    long sumStubs = 0;
    int index = 0;
    for (Long l : cInfo.community_core_stubs_) {
        if (index != c.id_) {
            float p = l * l;
            probSameCommunity += p;
            probTriangleSameCommunity += p * cInfo.community_core_probs_.get(index);
            sumStubs += l;
        }
        index++;
    }

    float probTwoConnected = 0.0f;
    // Without external stubs there is nothing to normalise by; leave all three at 0.
    if (sumStubs != 0) {
        probSameCommunity /= (sumStubs * sumStubs);
        probTriangleSameCommunity /= (sumStubs * sumStubs);
        for (Integer i : cInfo.core_nodes_) {
            double degree1 = cInfo.core_node_expected_external_degree_.get(i);
            if (degree1 >= 1) {
                // Unbox so the comparison below is by value, not by reference.
                int community1 = cInfo.community_id_.get(i);
                for (Integer ii : cInfo.core_nodes_) {
                    int community2 = cInfo.community_id_.get(ii);
                    if (community1 != community2) {
                        double degree2 = cInfo.core_node_expected_external_degree_.get(ii);
                        if (degree2 >= 1)
                            probTwoConnected += degree1 * degree2 / (float) (2 * sumStubs * sumStubs);
                    }
                }
            }
        }
    }

    // Computing clustering coefficient of periphery nodes
    for (PersonInfo pI : c.periphery_) {
        if (pI.degree_ > 1) {
            cInfo.clustering_coefficient_.set(pI.index_, (double) pI.degree_ * (pI.degree_ - 1) * prob / (pI.original_degree_ * (pI.original_degree_ - 1)));
        }
    }

    long[] peripheryBudget = new long[c.periphery_.size()];
    index = 0;
    for (PersonInfo pI : c.periphery_) {
        peripheryBudget[index] = pI.degree_;
        index++;
    }

    // Computing clustering coefficient of core nodes
    for (PersonInfo pI : c.core_) {
        if (pI.degree_ > 1) {
            // core core triangles
            double internalTriangles = 0.0;
            double internalDegree = cInfo.core_node_expected_core_degree_.get(pI.index_);
            if (internalDegree >= 2.0) {
                internalTriangles = (internalDegree * (internalDegree - 1) * prob);
            }

            // core periphery triangles: consume periphery budget slots in order until
            // this node's degree is used up.
            double peripheryTriangles = 0;
            long remainingDegree = pI.degree_;
            for (index = 0; index < peripheryBudget.length; ++index) {
                if (peripheryBudget[index] > 0) {
                    peripheryBudget[index]--;
                    remainingDegree--;
                    if (c.periphery_.get(index).degree_ > 1) {
                        peripheryTriangles += 2 * (c.periphery_.get(index).degree_ - 1) * prob;
                    }
                }
                if (remainingDegree == 0) break;
            }

            // triangles through edges that leave the community
            double external_triangles = 0.0;
            double externalDegree = cInfo.core_node_expected_external_degree_.get(pI.index_);
            if (externalDegree >= 2.0) {
                external_triangles += externalDegree * (externalDegree - 1) * probTriangleSameCommunity;
                external_triangles += externalDegree * (externalDegree - 1) * (1 - probSameCommunity) * probTwoConnected;
            }

            // Normalise by the person's original degree, not by the step degree.
            double degree = pI.original_degree_;
            if (degree >= 2.0) {
                cInfo.clustering_coefficient_.set(pI.index_, (internalTriangles + peripheryTriangles + external_triangles) / (degree * (degree - 1)));
            }
        }
    }
}
/** Average estimated clustering coefficient, dividing by the number of entries in the coefficient table. */
float clusteringCoefficient(ArrayList<Community> communities, ClusteringInfo cInfo ) {
    return clusteringCoefficient(communities, cInfo, Boolean.TRUE);
}
/**
 * Averages the estimated clustering coefficients of all community members whose degree
 * is positive.
 *
 * @param countZeros when true the sum is divided by the size of the coefficient table;
 *                   when false by the number of members that contributed
 * @return the average (NaN when the divisor is zero)
 */
float clusteringCoefficient( ArrayList<Community> communities, ClusteringInfo cInfo, Boolean countZeros ) {
    float total = 0.0f;
    int contributors = 0;
    for (Community community : communities) {
        // Core members first, then periphery, community by community.
        for (PersonInfo member : community.core_) {
            if (member.degree_ <= 0) {
                continue;
            }
            total += cInfo.clustering_coefficient_.get(member.index_);
            contributors++;
        }
        for (PersonInfo member : community.periphery_) {
            if (member.degree_ <= 0) {
                continue;
            }
            total += cInfo.clustering_coefficient_.get(member.index_);
            contributors++;
        }
    }
    float divisor = countZeros
            ? (float) cInfo.clustering_coefficient_.size()
            : (float) contributors;
    return total / divisor;
}
/**
 * Nudges community probabilities until the estimated clustering coefficient is within
 * 0.001 of targetCC, or until more than 5 consecutive attempts changed nothing.
 *
 * Each attempt calls improveCC() when the estimate is below the target and worsenCC()
 * when it is above; a successful attempt refreshes the estimate and resets the counter.
 */
void refineCommunities( ClusteringInfo cInfo, ArrayList<Community> communities, float targetCC ) {
    final int lookAhead = 5;
    float currentCC = clusteringCoefficient(communities, cInfo);
    int idleAttempts = 0;
    while (idleAttempts <= lookAhead && Math.abs(currentCC - targetCC) > 0.001) {
        idleAttempts += 1;
        boolean adjusted;
        if (currentCC < targetCC) {
            adjusted = improveCC(cInfo, communities);
        } else if (currentCC > targetCC) {
            adjusted = worsenCC(cInfo, communities);
        } else {
            adjusted = false;
        }
        if (!adjusted) {
            continue;
        }
        currentCC = clusteringCoefficient(communities, cInfo);
        idleAttempts = 0;
    }
    System.out.println("Clustering Coefficient after refinement: " + currentCC);
}
/**
 * Probability increment used by improveCC()/worsenCC().
 *
 * @param n divisor (callers pass the community's core size)
 * @return 3 / n as a float
 */
float step(int n) {
    final float numerator = 3.0f;
    return numerator / n;
}
/**
 * Raises the probability of one randomly chosen community whose p_ is below 1 by
 * step(core size), capped at 1, and re-estimates that community.
 *
 * @return false when no community has p_ below 1, true otherwise
 */
boolean improveCC(ClusteringInfo cInfo, ArrayList<Community> communities) {
    ArrayList<Community> candidates = new ArrayList<Community>();
    for (Community community : communities) {
        if (community.p_ < 1.0f) {
            candidates.add(community);
        }
    }
    if (candidates.size() == 0) {
        return false;
    }
    Community chosen = candidates.get(rand.nextInt(candidates.size()));
    float raised = chosen.p_ + step(chosen.core_.size());
    if (raised > 1.0f) {
        raised = 1.0f;
    }
    chosen.p_ = raised;
    cInfo.sumProbs += 0.01;
    estimateCCCommunity(cInfo, chosen, chosen.p_);
    return true;
}
/**
 * Lowers the probability of one randomly chosen community whose p_ is above
 * min_community_prob_ by step(core size), floored at min_community_prob_, and
 * re-estimates that community.
 *
 * @return false when no community has p_ above min_community_prob_, true otherwise
 */
boolean worsenCC(ClusteringInfo cInfo, ArrayList<Community> communities) {
    ArrayList<Community> candidates = new ArrayList<Community>();
    for (Community community : communities) {
        if (community.p_ > min_community_prob_) {
            candidates.add(community);
        }
    }
    if (candidates.size() == 0) {
        return false;
    }
    Community chosen = candidates.get(rand.nextInt(candidates.size()));
    float lowered = chosen.p_ - step(chosen.core_.size());
    if (lowered < min_community_prob_) {
        lowered = min_community_prob_;
    }
    chosen.p_ = lowered;
    cInfo.sumProbs -= 0.01;
    estimateCCCommunity(cInfo, chosen, chosen.p_);
    return true;
}
/**
 * Creates the core-core edges of a community. Every unordered pair of core members
 * (visited once, from the member with the smaller index_) draws a random float and is
 * linked through Knows.createKnow() when the draw is at most c.p_. Successful creations
 * count as core-core edges, failed ones as misses.
 */
void createEdgesCommunityCore(ArrayList<Person> persons, Community c) {
    for (PersonInfo source : c.core_) {
        for (PersonInfo target : c.core_) {
            if (source.index_ >= target.index_) {
                continue;
            }
            float draw = rand.nextFloat();
            if (!(draw <= c.p_)) {
                continue;
            }
            // create edge
            boolean created = Knows.createKnow(rand, persons.get(source.index_), persons.get(target.index_));
            if (created) {
                numCoreCoreEdges++;
            } else {
                numMisses++;
            }
        }
    }
}
/**
 * Creates the core-periphery edges of a community. Each core member walks the periphery
 * in list order and links (via Knows.createKnow()) to members that still have degree
 * budget, until it has used its expected periphery degree from cInfo. Afterwards an
 * "ERROR" line is printed for every periphery member that ended up with more knows
 * than its degree.
 */
void createEdgesCommunityPeriphery(ClusteringInfo cInfo, ArrayList<Person> persons, Community c) {
    long[] openSlots = new long[c.periphery_.size()];
    int slot = 0;
    for (PersonInfo member : c.periphery_) {
        openSlots[slot++] = member.degree_;
    }

    for (PersonInfo coreNode : c.core_) {
        double quota = cInfo.core_node_expected_periphery_degree_.get(coreNode.index_);
        double attached = 0;
        for (slot = 0; slot < openSlots.length; ++slot) {
            if (openSlots[slot] == 0 || !(attached < quota)) {
                continue;
            }
            attached++;
            openSlots[slot]--;
            Person corePerson = persons.get(coreNode.index_);
            Person peripheryPerson = persons.get(c.periphery_.get(slot).index_);
            if (Knows.createKnow(rand, corePerson, peripheryPerson)) {
                numCorePeripheryEdges++;
            } else {
                numMisses++;
            }
        }
    }

    // Periphery members must never exceed their degree budget.
    for (PersonInfo member : c.periphery_) {
        boolean overBudget = persons.get(member.index_).knows().size() > member.degree_;
        if (overBudget) {
            System.out.println("ERROR");
        }
    }
}
/**
 * Pairs up the unused degree ("stubs") of core members across all communities.
 *
 * One stub is added per missing edge of every core member. Stubs and a parallel list of
 * positions are both shuffled with rand; positions are then consumed two at a time and
 * the two referenced persons are linked through Knows.createKnow(). A pair that refers
 * to the same person, or a failed creation, counts as a miss; a final unpaired stub is
 * dropped. The cInfo argument is not used.
 */
void fillGraphWithRemainingEdges(ClusteringInfo cInfo, ArrayList<Community> communities, ArrayList<Person> persons) {
    ArrayList<PersonInfo> stubs = new ArrayList<PersonInfo>();
    LinkedList<Integer> order = new LinkedList<Integer>();
    for (Community community : communities) {
        for (PersonInfo coreNode : community.core_) {
            long missing = coreNode.degree_ - persons.get(coreNode.index_).knows().size();
            for (long left = missing; left > 0; left--) {
                stubs.add(coreNode);
                // Position of the stub that was just appended.
                order.add(stubs.size() - 1);
            }
        }
    }

    Collections.shuffle(stubs, rand);
    Collections.shuffle(order, rand);

    while (!order.isEmpty()) {
        PersonInfo first = stubs.get(order.pop());
        if (order.isEmpty()) {
            // Odd number of stubs: the last one has no partner.
            break;
        }
        PersonInfo second = stubs.get(order.pop());

        Person left = persons.get(first.index_);
        Person right = persons.get(second.index_);
        // create edge
        if (left == right) {
            numMisses++;
            continue;
        }
        if (Knows.createKnow(rand, left, right)) {
            numCoreExternalEdges++;
        } else {
            numMisses++;
        }
    }
}
/**
 * Generates the knows edges for the given persons so that the measured clustering
 * coefficient approaches targetCC.
 *
 * Outline:
 *  1. Seed rand, build the communities and the ClusteringInfo tables.
 *  2. Estimate every community with p_ = 1 (printed as maxCC), then start from p_ = 0.5.
 *  3. Up to 20 times: refine the community probabilities towards a working target,
 *     create core-core, core-periphery and remaining edges, and measure the resulting
 *     coefficient (each person's value is rescaled by degree*(degree-1) over
 *     originalDegree*(originalDegree-1), then averaged over all persons). If the result
 *     is further than 0.001 from targetCC and another iteration is allowed, all edges
 *     and statistics are discarded, the working target is adjusted, and the loop repeats.
 *  4. Print degree and edge statistics.
 *
 * Fix relative to the previous version: edges used to be discarded after EVERY
 * non-converged iteration, including the last permitted one, so a run that did not
 * converge in 20 iterations returned with no edges at all. The graph of the last
 * iteration is now kept in that case.
 *
 * NOTE(review): GraphUtils.ClusteringCoefficientList(graph) is assumed to return one
 * value per person in the same order as the persons list; confirm against GraphUtils.
 *
 * @param persons     persons to connect; their knows() collections are modified
 * @param seed        seed for the internal random generator
 * @param percentages forwarded to Knows.target_edges()
 * @param step_index  forwarded to Knows.target_edges()
 */
public void generateKnows( ArrayList<Person> persons, int seed, ArrayList<Float> percentages, int step_index ) {
    final int maxIterations = 20;
    final double tolerance = 0.001;
    long start, end;
    rand.setSeed(seed);
    this.percentages = percentages;
    this.stepIndex = step_index;

    start = System.currentTimeMillis();
    ArrayList<Community> communities = generateCommunities(persons);
    end = System.currentTimeMillis();
    System.out.println("Time to configure communities: "+(end-start));

    ClusteringInfo cInfo = new ClusteringInfo( persons.size(), communities );
    System.out.println("Number of generated communities: "+communities.size());

    start = System.currentTimeMillis();
    for( Community c : communities ) {
        c.p_ = 1.0f;
        computeCommunityInfo(cInfo, c, 1.0f);
    }
    end = System.currentTimeMillis();
    System.out.println("Time to compute initial community information: "+(end-start));

    // Upper bound of the estimate: every core fully connected.
    start = System.currentTimeMillis();
    for( Community c : communities ) {
        c.p_ = 1.0f;
        estimateCCCommunity(cInfo, c, c.p_);
    }
    float maxCC = clusteringCoefficient(communities, cInfo);
    end = System.currentTimeMillis();
    System.out.println("maxCC: "+maxCC);
    System.out.println("Time to compute maximum CC: "+(end-start));

    // Initial solution: every community at probability 0.5.
    start = System.currentTimeMillis();
    for( Community c : communities ) {
        c.p_ = 0.5f;
        estimateCCCommunity(cInfo, c, c.p_ );
    }
    end = System.currentTimeMillis();
    System.out.println("Time to compute the initial solution: "+(end-start));

    boolean iterate;
    // Working target handed to the refinement; adjusted between iterations.
    float fakeTargetCC = targetCC;
    int numIterations = 0;
    do {
        System.out.println("****** STARTING REFINEMENT ITERATION ******");
        iterate = false;
        start = System.currentTimeMillis();
        refineCommunities(cInfo, communities, fakeTargetCC);
        end = System.currentTimeMillis();
        System.out.println("Time to refine communities: "+(end-start));

        System.out.println("Creating graph");
        start = System.currentTimeMillis();
        for(Community c : communities ) {
            createEdgesCommunityCore(persons, c);
            createEdgesCommunityPeriphery(cInfo, persons, c);
        }
        fillGraphWithRemainingEdges(cInfo, communities, persons);
        end = System.currentTimeMillis();
        System.out.println("Time to generate graph: "+(end-start));

        PersonGraph graph = new PersonGraph(persons);
        System.out.println("Computing clustering coefficient");
        double finalCC = 0;
        ArrayList<Double> clusteringCoefficient = GraphUtils.ClusteringCoefficientList(graph);
        int i = 0;
        for( Person p : persons) {
            long degree = graph.neighbors(p.accountId()).size();
            long originalDegree = p.maxNumKnows();
            if(originalDegree > 1)
                finalCC += clusteringCoefficient.get(i) * degree*(degree - 1) / (originalDegree*(originalDegree-1));
            i++;
        }
        finalCC /= persons.size();
        System.out.println("Clustering coefficient of the generated graph: "+finalCC);

        numIterations++;
        double delta = targetCC - finalCC;
        // Only throw the graph away when another iteration will actually rebuild it.
        if( Math.abs( delta ) > tolerance && numIterations < maxIterations ) {
            resetStatistics();
            for(Person person: persons) {
                person.knows().clear();
            }
            if(delta > 0)
                fakeTargetCC += Math.abs(delta)*0.5f;
            else
                fakeTargetCC /= 2;
            System.out.println("New Fake targetCC: "+fakeTargetCC );
            iterate = true;
        }
        System.out.println("****** FINISHED REFINEMENT ITERATION ******");
    } while( iterate );

    // Compare achieved degrees of core members against their targets.
    int countMore = 0;
    int countLess = 0;
    long sumMore = 0;
    long sumLess = 0;
    int index = 0;
    int countDegreeZero = 0;
    for( Person p : persons ) {
        if(cInfo.is_core_.get(index)) {
            long target = Knows.target_edges(p, percentages, step_index);
            if (p.knows().size() > target) {
                sumMore += -target + p.knows().size();
                countMore++;
            } else if (p.knows().size() < target) {
                sumLess += target - p.knows().size();
                countLess++;
            }
        }
        if(p.knows().size() == 0) countDegreeZero++;
        ++index;
    }
    System.out.println("Number of iterations to converge: "+numIterations);
    System.out.println("Number of persons with more degree than expected: "+countMore);
    System.out.println("Sum of excess degree: "+sumMore);
    System.out.println("Number of persons with less degree than expected: "+countLess);
    System.out.println("Sum of degree missed: "+sumLess);
    System.out.println("Number of persons with degree zero: "+countDegreeZero);
    printStatistics();
}
/**
 * Reads the clustering coefficient from the configuration (default 0.1), prints it, and
 * stores half of it as the internal target.
 */
public void initialize( Configuration conf ) {
    final String key = "ldbc.snb.datagen.generator.ClusteringKnowsGenerator.clusteringCoefficient";
    float configured = conf.getFloat(key, 0.1f);
    System.out.println("Initialized clustering coefficient to "+configured);
    targetCC = configured / 2.0f;
}
/** Zeroes the four edge counters. */
public void resetStatistics() {
    numMisses = 0;
    numCoreExternalEdges = 0;
    numCorePeripheryEdges = 0;
    numCoreCoreEdges = 0;
}
/** Prints the four edge counters to standard output, one per line. */
public void printStatistics() {
    final java.io.PrintStream out = System.out;
    out.println("Number core-core edges: " + numCoreCoreEdges);
    out.println("Number core-periphery edges: " + numCorePeripheryEdges);
    out.println("Number core-external edges: " + numCoreExternalEdges);
    out.println("Number edges missed: " + numMisses);
}
}