/*
* (C) Copyright IBM Corp. 2008
*
* LICENSE: Eclipse Public License v1.0
* http://www.eclipse.org/legal/epl-v10.html
*/
package com.ibm.gaiandb;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import org.apache.derby.iapi.types.DataValueDescriptor;
import org.apache.derby.iapi.types.SQLInteger;
/**
 * Holds a matrix of entity links and repeatedly joins it with itself to find groups of
 * directly and indirectly linked entities that contain no more than a maximum number of links.
 *
 * Each entity id is an index into the matrix. The first cell of an entity's row describes its state:
 * <ul>
 * <li>0: the entity is not linked to anything (yet)</li>
 * <li>1 to MAX_LINKS: the entity is a group head; the following cells hold that many linked entity ids</li>
 * <li>-1: the entity is a sub-element; cell 1 holds a back reference to its group head</li>
 * <li>greater than MAX_LINKS: the entity has overflowed and is not considered anymore</li>
 * </ul>
 *
 * Instances keep mutable counters and are not written for concurrent use.
 */
public class EntityMatrixJoiner {

    // Use PROPRIETARY notice if class contains a main() method, otherwise use COPYRIGHT notice.
    public static final String COPYRIGHT_NOTICE = "(c) Copyright IBM Corp. 2008";

    private static final Logger logger = new Logger( "EntityMatrixJoiner", 30 );

    /** Number of input records between two progress messages while loading. */
    private static final int LOAD_PROGRESS_INTERVAL = 10000000;

    /** Maximum number of links a group head may hold; one less than the maximum group size. */
    private final int MAX_LINKS;

    /** One row per entity: cell 0 is the state/link count (see class comment), the rest are linked entity ids. */
    private int[][] matrix;

    /** linkCounts[n-1] is the number of remaining groups having exactly n links; set by processJoins(). */
    private int[] linkCounts;

    /** Heads of the groups that were still candidates at the start of the last join pass. */
    private int[] remainingGroups;

    /** Group heads exposed through getNumGroups()/getGroupHead(); all of them, or those of one requested size. */
    private int[] restrictedSizeGroupsRemaining;

    private int maxCascadedOverflowDepth = 0;
    private int cascadedOverflowDepth = 0;

    /** Running count of group heads holding between 1 and MAX_LINKS links. */
    private int numGroups = 0;

    /**
     * Loads a links file (first argument, or a default test file) and runs 6 join passes over it.
     *
     * @param args optional: path of the file holding the comma separated entity pairs
     * @throws IOException if the file cannot be read
     */
    public static void main(String[] args) throws IOException {

        Logger.setPrintStream( System.out );
        Logger.setLogLevel( Logger.LOG_MORE );

        String filename = args.length > 0 ? args[0] : "C:\\temp\\rmatmed.txt"; // a tenth of the file

        EntityMatrixJoiner emj = new EntityMatrixJoiner( filename, 100000000, 5 );
        emj.processJoins( 6 );
    }

    /**
     * Clears every row of the matrix and drops all references to it so that it can be garbage collected.
     * Rows are zeroed before being dropped, so rows previously obtained through getGroupRow() are cleared too.
     * Does nothing if the matrix was already released.
     */
    public void releaseMatrixFromMemory() {

        if ( null == matrix ) return;

        for ( int i=0; i<matrix.length; i++ ) {
            int[] row = matrix[i];
            if ( null != row ) Arrays.fill( row, 0 );
            matrix[i] = null;
        }

        matrix = null;
    }

    /**
     * Builds the matrix of 1st degree links from a file.
     *
     * Expects that the given file has lines with pairs of 'linked' integers separated by commas, e.g: 12 linked to 45 would be: 12,45
     * Integers must be valid matrix indexes, i.e. in the range 0 - numEntities (the matrix holds numEntities+1 rows).
     * Blank lines are skipped and white space around the integers is ignored.
     *
     * Entities linked to more than maxGroupSize-1 other entities are overflowed and not considered anymore.
     * Call processJoins() afterwards to join the result with itself and look for indirect links.
     *
     * @param file path of the file holding the entity pairs
     * @param numEntities number of entities; ids range from 0 to numEntities
     * @param maxGroupSize maximum number of entities in a group, i.e. a head plus maxGroupSize-1 links
     * @throws NumberFormatException if a non blank line is not of the form 'integer,integer'
     * @throws IOException if the file cannot be read
     */
    public EntityMatrixJoiner( String file, int numEntities, int maxGroupSize ) throws NumberFormatException, IOException {

        // Add 1 to numEntities so that there will always be one entity that is not connected to anyone
        // This ensures that a ResultSet full of overflows does not result in a completely empty Result which would be ambiguous
        // as it could also be interpreted as a failure to send the matrix altogether.
        numEntities++;

        MAX_LINKS = maxGroupSize - 1;
        matrix = new int[ numEntities ][ maxGroupSize ]; // 1st value is numlinks

        logger.logInfo("Getting groups of 1st degree links with maxGroupSize " + maxGroupSize + " from file: " + file );

        long start = System.currentTimeMillis();

        // Closing the BufferedReader also closes the FileReader it wraps.
        BufferedReader br = new BufferedReader( new FileReader( file ) );
        try {
            String line;
            int numlines = 0;

            while ( null != ( line = br.readLine() ) ) {

                if ( 0 == numlines++ % LOAD_PROGRESS_INTERVAL )
                    logger.logInfo("Loading next 10M lines from " + numlines);

                line = line.trim();
                if ( 0 == line.length() ) continue; // tolerate blank lines

                int cidx = line.indexOf(',');
                if ( 0 > cidx )
                    throw new NumberFormatException(
                            "Expected 'n1,n2' on line " + numlines + " of " + file + " but found: " + line );

                int n1 = Integer.parseInt( line.substring(0, cidx).trim() );
                int n2 = Integer.parseInt( line.substring(cidx+1).trim() );

                loadNewAssociationIntoMatrix( n1, n2 );
            }
        } finally {
            br.close(); // release the file handle even when a line fails to parse
        }

        long end = System.currentTimeMillis();
        logger.logInfo("Done in " + (end - start) + "ms - num distinct groups of " + MAX_LINKS + " links or less: " + numGroups);
        logger.logInfo("Max recursive overflow depth: " + maxCascadedOverflowDepth);
    }

    /**
     * Builds the matrix from rows previously produced for another matrix (see mergeGaianChildRows()).
     * Unlike the file based constructor, no extra row is added to the matrix: it holds exactly numEntities rows.
     *
     * @param rows source of the rows to merge
     * @param numEntities number of rows of the matrix; head ids must be lower than this
     * @param maxGroupSize maximum number of entities in a group, i.e. a head plus maxGroupSize-1 links
     * @throws Exception if fetching or decoding a row fails
     */
    public EntityMatrixJoiner( GaianChildVTI rows, int numEntities, int maxGroupSize ) throws Exception {

        MAX_LINKS = maxGroupSize - 1;
        matrix = new int[ numEntities ][ maxGroupSize ]; // 1st value is numlinks

        mergeGaianChildRows( rows );
    }

    /**
     * Merges rows of the form (numlinks, head, link 1, ..., link MAX_LINKS) into the matrix.
     *
     * Heads are expected in ascending order: every entity whose id lies between two consecutive heads
     * (and after the last head) was absent from the rows, and is therefore overflowed along with its sub groups.
     * Only forward links are loaded; rows holding a numlinks of -1 (back references) contribute no links
     * because loading the forward links re-generates the back references when appropriate.
     *
     * @param rows source of the rows to merge
     * @return the number of rows that were fetched
     * @throws Exception if fetching or decoding a row fails
     */
    public int mergeGaianChildRows( GaianChildVTI rows ) throws Exception {

        logger.logInfo("Merging GaianChildRows into matrix...");

        long start = System.currentTimeMillis();
        int numlines = 0;
        int headGroupAfterPreviousOne = 0;

        // Create column wrappers for the 'numlinks' column and for (max entities = MAX_LINKS+1) columns
        DataValueDescriptor[] dvdr = new DataValueDescriptor[ MAX_LINKS+2 ];
        for ( int i=0; i<dvdr.length; i++ ) dvdr[i] = new SQLInteger();

        while ( rows.fetchNextRow( dvdr ) ) {

            if ( 0 == numlines++ % LOAD_PROGRESS_INTERVAL )
                logger.logInfo("Loading next 10M DVDRs from " + numlines);

            int numlinks = ((SQLInteger) dvdr[0]).getInt();
            int headGroup = ((SQLInteger) dvdr[1]).getInt();

            // NOTE: We don't need to do anything about backward links (i.e. when numlinks == -1)
            // because the ingestion algorithm will re-generate them when appropriate from the forward links.

            // Overflow all groups whose heads lie between the previous row's head and this one.
            overflowHeadsInRange( headGroupAfterPreviousOne, headGroup );
            headGroupAfterPreviousOne = headGroup + 1;

            // Insert all forward links in this group
            for ( int i=2; i<numlinks+2; i++ )
                loadNewAssociationIntoMatrix( headGroup, ((SQLInteger) dvdr[i]).getInt() );
        }

        // Overflow all groups whose heads lie between the last row's head and the end of the matrix.
        // Only do this if we processed at least one row - this should always be the case as there is 1 more entity in the range
        // than were defined for the matrix originally - so that one should be disconnected from every other entity.
        if ( 0 < numlines )
            overflowHeadsInRange( headGroupAfterPreviousOne, matrix.length );

        long end = System.currentTimeMillis();
        logger.logInfo("Done in " + (end - start) + "ms - num distinct groups of " + MAX_LINKS + " links or less: " + numGroups);
        logger.logInfo("Max recursive overflow depth: " + maxCascadedOverflowDepth);

        return numlines;
    }

    /** Overflows every entity with an id in [from, to) that has not overflowed yet, along with its sub groups. */
    private void overflowHeadsInRange( int from, int to ) {

        for ( int i=from; i<to; i++ ) {
            int[] entityList = matrix[i];
            if ( MAX_LINKS < entityList[0] ) continue; // already overflowed
            overflowSubGroups( entityList );
        }
    }

    /**
     * Records a 1st degree link between 2 entities.
     * The link is stored in the row of n1 (or of n2 if the entities get swapped, see below); the other entity
     * becomes a back referenced sub-element unless it already is a group head or a sub-element.
     * If the link makes a head exceed MAX_LINKS, or involves an overflowed entity, both sides are overflowed.
     *
     * @param n1 id of the first entity; must be a valid matrix index
     * @param n2 id of the second entity; must be a valid matrix index
     */
    private void loadNewAssociationIntoMatrix( int n1, int n2 ) {

        int[] e1links = matrix[n1];
        int numlinks = e1links[0];

        // Whilst building these 1st degree links, we need to make sure that element heads are always back referenced.
        // So - if a sub-element links to a new element (or element head) we must make the sub-element also a sub-element to the other one
        // to avoid breaking the back reference link.
        // If both elements to be linked are sub-elements, then one of the back-references can be broken as the join can still be done from
        // the other direction. Note that when a join occurs later, all joined elements will be forced to be back referenced to the joiner.
        if ( -1 == numlinks ) {
            if ( -1 == matrix[n2][0] )
                numlinks = 0; // this element will no longer be a sub-element and become a cluster master.
            else {
                // swap the 2 entities so that the sub-element remains a sub-element.
                int n3 = n1;
                n1 = n2;
                n2 = n3;
                e1links = matrix[n1];
                numlinks = e1links[0];
            }
        }

        if ( 0 == numlinks ) {
            numGroups++;
        } else {
            // For overflowed groups, overflow any new entities they link to
            if ( MAX_LINKS < numlinks ) {
                int[] e2links = matrix[n2];
                if ( MAX_LINKS < e2links[0] ) return; // Both are overflowed - nothing to do
                overflowSubGroups( e2links );
                return;
            }

            // check for duplicate association entry
            int i;
            for ( i=1; i<=numlinks; i++ )
                if ( e1links[i] == n2 ) break;

            if ( i<=numlinks ) return; // duplicate was found

            // Check that this head element is not about to overflow
            if ( MAX_LINKS == numlinks ) {
                // overflow e1 itself and all subgroups - reducing numGroups in doing so
                overflowSubGroups( e1links );

                // Now ensure associated entity's links are overflowed as well
                int[] e2links = matrix[n2];
                if ( MAX_LINKS < e2links[0] ) return; // Both are overflowed - nothing to do
                overflowSubGroups( e2links );
                return;
            }
        }

        e1links[0] = ++numlinks; // start at index 1 (index 0 holds the numlinks)
        e1links[numlinks] = n2;

        // only make the 2nd entity a sub-element to this group
        // if it wasn't already a cluster master or sub-element to another group
        int[] e2links = matrix[n2];
        if ( 0 == e2links[0] ) {
            e2links[0] = -1; // denotes a back ref link
            e2links[1] = n1; // ...to n1
        }
    }

    /**
     * Runs as many join passes as there are entities in the biggest allowed group (MAX_LINKS+1).
     */
    public void processJoins() {
        processJoins( MAX_LINKS+1 );
    }

    /**
     * Joins the matrix with itself numJoins times, overflowing the groups that grow beyond MAX_LINKS links.
     * Afterwards, the number of groups per link count is computed and logged, and the groups exposed through
     * getNumGroups()/getGroupHead() are reset to the heads that were candidates in the last pass.
     *
     * @param numJoins number of join passes to run; must be at least 1
     * @throws IllegalArgumentException if numJoins is lower than 1
     */
    public void processJoins( int numJoins ) {

        if ( 1 > numJoins )
            throw new IllegalArgumentException( "numJoins must be at least 1 but was: " + numJoins );

        remainingGroups = null;
        int[] previousRemaining = null;

        for ( int i=0; i<numJoins; i++ ) {

            if ( i < 2 ) showMatrix(10);

            maxCascadedOverflowDepth = 0;
            cascadedOverflowDepth = 0;

            logger.logInfo("Joining matrix with itself and removing overflowed groups, join iteration #" + (i+1));
            long start = System.currentTimeMillis();

            // numGroups is an upper bound for the heads that can be visited in this pass. Fewer slots get used when
            // groups are merged or overflowed before being visited, so drop the unused trailing slots: they hold 0
            // and would otherwise be mistaken for entity 0 by the next pass and by the counts below.
            int[] candidates = new int[ numGroups ];
            int numCandidates = joinMatrix( candidates, previousRemaining );
            remainingGroups = trimToSize( candidates, numCandidates );
            previousRemaining = remainingGroups;

            long end = System.currentTimeMillis();
            logger.logInfo("Done in " + (end - start) + "ms - num distinct groups of " + MAX_LINKS + " or less: " + numGroups);
            logger.logInfo("Max recursive overflow depth: " + maxCascadedOverflowDepth);
        }

        showMatrix(10);

        // Heads recorded at the start of the last pass may have been merged or overflowed during it - skip those.
        linkCounts = new int[MAX_LINKS];
        for ( int i=0; i<remainingGroups.length; i++ ) {
            int count = matrix[ remainingGroups[i] ][0];
            if ( 0 < count && count <= MAX_LINKS ) linkCounts[ count-1 ]++;
        }

        logger.logInfo("Number of groups for each links size category [1-" + MAX_LINKS + "]: " + Arrays.toString( linkCounts ) );

        logger.logInfo("Sample Results (2 of each group):");
        for ( int links=1; links<=MAX_LINKS; links++ ) showMatrixResults( links, 2 );

        // Initialise the set of restricted sized groups to the whole set of groups
        restrictedSizeGroupsRemaining = remainingGroups;
    }

    /** Returns values itself if it holds exactly size elements, otherwise a copy of its first size elements. */
    private static int[] trimToSize( int[] values, int size ) {

        if ( size == values.length ) return values;

        int[] trimmed = new int[ size ];
        System.arraycopy( values, 0, trimmed, 0, size );
        return trimmed;
    }

    /**
     * Marks the given entity row as overflowed, then recursively overflows every entity it references
     * (its links for a group head, its head for a back referenced sub-element) that has not overflowed yet.
     * numGroups is decremented for each group head that gets overflowed.
     *
     * The row must not be overflowed already: callers check this before calling.
     *
     * @param entityList matrix row of the entity to overflow
     */
    private void overflowSubGroups( int[] entityList ) {

        cascadedOverflowDepth++;

        int numlinks = entityList[0];
        if ( 0 < numlinks ) numGroups--;
        entityList[0] = MAX_LINKS+1; // mark as overflowed before recursing so that cycles terminate

        // When dealing with backwards refs, there is only one link to overflow.
        if ( -1 == numlinks ) numlinks = 1;

        for ( int i=1; i<=numlinks; i++ ) {

            int[] sublist = matrix[ entityList[i] ];

            if ( MAX_LINKS >= sublist[0] ) {
                overflowSubGroups( sublist );
                if ( cascadedOverflowDepth > maxCascadedOverflowDepth ) maxCascadedOverflowDepth = cascadedOverflowDepth;
            }
        }

        cascadedOverflowDepth--;
    }

    /**
     * Applies one join pass: for every candidate group head, the links of each of its linked entities are added
     * to the head's own links. A head whose links would exceed MAX_LINKS, or which links to an overflowed entity,
     * is overflowed along with its sub groups. Otherwise the joined entity becomes a back referenced sub-element
     * of the head.
     *
     * Future optimization: only re-join against the newly added entities in each remaining group
     *
     * @param remaining receives the ids of the group heads visited in this pass, starting at index 0
     * @param previousRemaining heads visited by the previous pass, or null on the first pass to scan the whole matrix
     * @return the number of ids written to remaining
     */
    private int joinMatrix( int[] remaining, int[] previousRemaining ) {

        int idx = 0;
        boolean isFirstPass = null == previousRemaining;
        int max = isFirstPass ? matrix.length : previousRemaining.length;

        // Apply the join and figure out which entities were previously remaining
        for ( int e1 = 0; e1<max; e1++ ) {

            int newEntityIndex = isFirstPass ? e1 : previousRemaining[e1];
            int[] e1links = matrix[ newEntityIndex ];

            if ( null == e1links ) continue; // No entities are linked to this entity

            int numlinks = e1links[0];
            if ( 0 >= numlinks || MAX_LINKS < numlinks ) continue; // not a group head (anymore)

            remaining[ idx++ ] = newEntityIndex;

            // Only the links present at the start of the pass are visited; links added below are left for the next pass.
            for ( int j=1; j<=numlinks; j++ ) {

                int e2 = e1links[j];
                int[] e2links = matrix[ e2 ];

                int numlinks2 = e2links[0];
                boolean isOverflow = false;

                if ( 0 == numlinks2 ) {
                    logger.logInfo("Error: unexpected 0 count for referenced entity - should be -1 (back ref) or positive");
                    continue;
                }

                // If we have hit an element which is a back referenced sub-element, then add the head element.
                // future extn: switch e2 to it to process all its links ? (this wd require more checks on overflows)
                if ( -1 == numlinks2 ) {
                    int e3 = e2links[1]; // the back reference element in e2's list to join to the list of e1's
                    if ( e3 != newEntityIndex ) isOverflow = !addEntitySkipDuplicates( e1links, e3 );
                } else {
                    if ( MAX_LINKS < numlinks2 ) {
                        isOverflow = true;
                    } else for ( int k=1; k<=numlinks2; k++ ) { // Add all elements from e2links into e1links - stop if overflow occurs

                        int e3 = e2links[k]; // the next element in e2's list to join to the list of e1's

                        // Ensure the element from e2links to be added to e1links is not equal to its head-element!
                        if ( e3 != newEntityIndex ) {
                            isOverflow = !addEntitySkipDuplicates( e1links, e3 );
                            if ( isOverflow ) break; // stop processing these overflowed links
                        }
                    }
                }

                if ( isOverflow ) {
                    // Overflow e1links and all its sub-elements' links recursively
                    overflowSubGroups( e1links );
                    break; // stop processing these overflowed links
                }

                // We didn't overflow - make the group for this e2links element empty now as it has been joined to e1links.
                // Also, back reference it to the e1links head so that other joins to this element will be redirected to it.
                if ( -1 != numlinks2 ) {
                    numGroups--;
                    e2links[0] = -1;
                }
                e2links[1] = newEntityIndex;
            }
        }

        return idx;
    }

    /**
     * Adds entity to the links of the entities row, unless it is already in it.
     *
     * @param entities matrix row of a group head (cell 0 holds its number of links)
     * @param entity id of the entity to add
     * @return true if the entity was added or was already present;
     *         false if it could not be added because the row already holds MAX_LINKS links (overflow)
     */
    private boolean addEntitySkipDuplicates( int[] entities, int entity ) {

        int numlinks = entities[0];

        for ( int i=1; i<=numlinks; i++ )
            if ( entities[i] == entity ) return true;

        if ( MAX_LINKS == numlinks ) return false;

        entities[0] = ++numlinks;
        entities[numlinks] = entity;
        return true;
    }

    /**
     * Writes one line per entity that has not overflowed, of the form: numlinks,entityId,link1,...,linkN
     * Every line is padded with empty fields so that it always holds MAX_LINKS link fields.
     * Back referenced sub-elements are written with a numlinks of -1 and no links.
     *
     * @param fileName path of the file to write
     * @throws IOException if the file cannot be written
     */
    public void writeNonOverflowedRowsToFile( String fileName ) throws IOException {

        BufferedWriter bw = new BufferedWriter( new FileWriter(fileName) );
        try {
            StringBuilder sb = new StringBuilder();

            for ( int i=0; i<matrix.length; i++ ) {

                int[] entryList = matrix[i];
                if ( null == entryList ) continue;

                int numlinks = entryList[0];
                if ( MAX_LINKS < numlinks ) continue; // overflowed

                sb.setLength(0);
                sb.append( numlinks ).append(',').append( i );

                int j=1;
                for ( ; j<=numlinks; j++ ) sb.append(',').append( entryList[j] );
                for ( ; j<=MAX_LINKS; j++ ) sb.append(','); // pad the unused link fields

                bw.write( sb.toString() );
                bw.newLine();
            }
        } finally {
            bw.close(); // flush and release the file handle even if a write failed
        }
    }

    /**
     * Restricts the groups exposed through getNumGroups()/getGroupHead() to those holding exactly
     * requestedGroupSize entities, i.e. requestedGroupSize-1 links.
     * Example: a group size of 2 selects the groups having a single link.
     * No group is selected if the requested size is lower than 2 or greater than the maximum group size.
     *
     * @param requestedGroupSize number of entities (head included) of the groups to select
     * @throws IllegalStateException if processJoins() was not called beforehand
     */
    public void setGroupSizeRestriction( int requestedGroupSize ) {

        if ( null == remainingGroups )
            throw new IllegalStateException( "processJoins() must be called before setGroupSizeRestriction()" );

        int requestedLinks = requestedGroupSize-1;

        // Groups without links are not kept and groups exceeding MAX_LINKS have overflowed.
        if ( 1 > requestedLinks || MAX_LINKS < requestedLinks ) {
            restrictedSizeGroupsRemaining = new int[0];
            return;
        }

        // Count first so that the result is sized from the current state of the matrix.
        int numMatches = 0;
        for ( int i=0; i<remainingGroups.length; i++ )
            if ( requestedLinks == matrix[ remainingGroups[i] ][0] ) numMatches++;

        int[] restricted = new int[ numMatches ];
        int restrictedGroupsIdx = 0;

        for ( int i=0; i<remainingGroups.length; i++ ) {
            int groupIdx = remainingGroups[i];
            if ( requestedLinks == matrix[ groupIdx ][0] )
                restricted[restrictedGroupsIdx++] = groupIdx;
        }

        restrictedSizeGroupsRemaining = restricted;
    }

    /**
     * @return the number of group heads currently selected, or 0 if processJoins() was not called yet
     */
    public int getNumGroups() {
        return null == restrictedSizeGroupsRemaining ? 0 : restrictedSizeGroupsRemaining.length;
    }

    /**
     * @param idx position in the list of selected group heads, from 0 to getNumGroups()-1
     * @return the entity id of the group head at that position
     */
    public int getGroupHead( int idx ) {
        return restrictedSizeGroupsRemaining[idx];
    }

    /**
     * @param headIdx entity id of a group head
     * @return the matrix row of that entity (not a copy): cell 0 holds the number of links, followed by the linked entity ids
     */
    public int[] getGroupRow( int headIdx ) {
        return matrix[ headIdx ];
    }

    /**
     * Logs how many entities are group heads, back references, overflowed or not linked,
     * followed by the rows of the first max group heads.
     *
     * @param max maximum number of group head rows to log
     */
    private void showMatrix( int max ) {

        int trueGroupCount = 0, brc = 0, ovfc = 0, nlc = 0;

        for ( int i=0; i<matrix.length; i++ ) {

            if ( null == matrix[i] ) { nlc++; continue; }

            int numlinks = matrix[i][0];
            if ( numlinks > 0 && numlinks <= MAX_LINKS )
                trueGroupCount++;
            if ( 0 == numlinks ) nlc++;
            if ( -1 == numlinks ) brc++;
            if ( MAX_LINKS < numlinks ) ovfc++;
        }

        logger.logInfo("Real Distinct Groups Count: " + trueGroupCount);
        logger.logInfo("Back References Count: " + brc);
        logger.logInfo("Overflowed Entities Count: " + ovfc);
        logger.logInfo("Non-Linked Entities Count: " + nlc);

        for ( int i=0; i<matrix.length && 0 < max; i++ ) {

            int[] entryList = matrix[i];
            if ( null == entryList ) continue;
            int numlinks = entryList[0];
            if ( numlinks > 0 && numlinks <= MAX_LINKS ) {
                max--;
                logger.logInfo( "EntityLinks(" + i + ") = " + Arrays.toString( entryList ) );
            }
        }
    }

    /**
     * Logs the rows of the first max entities whose first cell equals groupSize (i.e. having that number of links).
     *
     * @param groupSize number of links of the rows to log; nothing is logged for 0 as empty groups are not kept
     * @param max maximum number of rows to log
     */
    private void showMatrixResults( int groupSize, int max ) {

        if ( 0 == groupSize ) return; // Empty groups are not kept

        for ( int i=0; i<matrix.length && 0 < max; i++ ) {

            int[] entryList = matrix[i];
            if ( null == entryList ) continue;
            if ( groupSize == entryList[0] ) {
                max--;
                logger.logInfo( "EntityLinks(" + i + ") = " + Arrays.toString( entryList ) );
            }
        }
    }
}