package org.shanbo.feluca.data2.convert;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.msgpack.MessagePack;
import org.msgpack.packer.Packer;
import org.shanbo.feluca.data2.HashPartitioner;
import org.shanbo.feluca.data2.SeqVectorReader;
import org.shanbo.feluca.data2.Vector;
import org.shanbo.feluca.data2.VectorReader;
import com.google.common.io.PatternFilenameFilter;
/**
 * Re-partitions the vectors of a data directory into a fixed number of shard
 * files named {@code <dirName>.v.<shard>.dat<suffix>} inside that directory.
 *
 * <p>Each shard file is a MessagePack stream in which every vector is preceded
 * by the boolean {@code true}, and the stream is terminated by a single
 * boolean {@code false}.
 *
 * @author lgn
 */
public class VectorPartitioner {

    /** Size in bytes of the write buffer placed in front of every shard file. */
    private static final int OUTPUT_BUFFER_BYTES = 1024 * 1024 * 2;

    /** Number of input vectors between two flushes of all shard outputs. */
    private static final int FLUSH_INTERVAL = 2000;

    /**
     * Tells whether {@code number} is a positive power of two (1, 2, 4, 8, ...).
     *
     * @param number the value to test
     * @return {@code true} if exactly one bit of {@code number} is set and the
     *         value is positive; {@code false} for zero, negative values and
     *         everything that is not a power of two
     */
    public boolean isPowerOfTwo(int number){
        // A power of two has a single bit set, so clearing the lowest set bit
        // (number & (number - 1)) must leave nothing behind.
        return number > 0 && (number & (number - 1)) == 0;
    }

    /**
     * Reads every vector from {@code reader}, splits it with a
     * {@link HashPartitioner} built for {@code shards} and appends piece
     * {@code i} of each vector to shard file {@code i}.
     *
     * <p>Existing files in the data directory whose names match
     * {@code .*\.v\.\d+\.dat} are deleted first (best effort). All shard
     * outputs are closed before this method returns, whether it completes
     * normally or throws.
     *
     * @param reader source of the vectors; its data directory receives the shard files
     * @param shards number of shard files to create; checked to be a power of
     *               two only when assertions are enabled
     * @param suffix text appended after {@code .dat} in every shard file name
     * @throws IOException if a shard file cannot be opened, written or closed
     */
    void doPartition(VectorReader reader, int shards, String suffix) throws IOException{
        assert isPowerOfTwo(shards) : "shards must be a power of two: " + shards;

        File dataDir = reader.getDataDir();
        File[] dats = dataDir.listFiles(new PatternFilenameFilter(".*\\.v\\.\\d+\\.dat"));
        // listFiles returns null when the directory is missing or unreadable.
        if (dats != null){
            for (File dat : dats){
                System.out.print(dat.getName() + ";");
                if (!dat.delete()){
                    // Best effort: a stale shard that is not overwritten below stays on disk.
                    System.err.println("could not delete old shard file: " + dat.getAbsolutePath());
                }
            }
        }

        String dataName = dataDir.getName();
        HashPartitioner partitioner = new HashPartitioner(shards);
        List<Packer> packers = new ArrayList<Packer>(shards);
        MessagePack messagePack = new MessagePack();

        boolean finished = false;
        try {
            for (int i = 0; i < shards; i++){ //output
                // Plain concatenation instead of String.format: a '%' in the
                // directory path or in the suffix must not be read as a format
                // specifier, and the shard index must not be localized.
                File block = new File(dataDir.getAbsolutePath(), dataName + ".v." + i + ".dat" + suffix);
                packers.add(messagePack.createPacker(
                        new BufferedOutputStream(new FileOutputStream(block), OUTPUT_BUFFER_BYTES)));
            }

            int count = 0;
            for (Vector v = reader.getNextVector(); v != null; v = reader.getNextVector()){
                List<Vector> divided = v.divideByFeature(partitioner);
                // NOTE(review): assumes divideByFeature never returns more
                // pieces than there are shards - confirm against Vector.
                for (int i = 0; i < divided.size(); i++){
                    Packer packer = packers.get(i);
                    packer.write(true); // marker: another vector follows
                    divided.get(i).pack(packer);
                }
                count++;
                if (count % FLUSH_INTERVAL == 0){
                    for (Packer packer : packers){
                        packer.flush();
                    }
                    System.out.print("*");
                }
            }

            for (Packer packer : packers){
                packer.write(false); // marker: end of stream
            }
            finished = true;
        } finally {
            // On success a close failure is reported to the caller; on failure
            // the original exception must not be masked by a secondary one.
            closeAll(packers, finished);
        }
    }

    /**
     * Closes every packer even if some of them fail to close.
     *
     * @param packers   the packers to close
     * @param propagate whether the first close failure is rethrown
     * @throws IOException the first close failure, only if {@code propagate} is set
     */
    private static void closeAll(List<Packer> packers, boolean propagate) throws IOException{
        IOException first = null;
        for (Packer packer : packers){
            try {
                packer.close();
            } catch (IOException e){
                if (first == null){
                    first = e;
                }
            }
        }
        if (propagate && first != null){
            throw first;
        }
    }

    /**
     * Partitions the data directory {@code dirName} into {@code shards} shard
     * files, reading it through a {@link SeqVectorReader} and using no file
     * name suffix.
     *
     * @param dirName path of the data directory to partition
     * @param shards  number of shard files to create
     * @throws IOException if reading the input or writing a shard fails
     */
    public void doPartition(String dirName, int shards) throws IOException{
        SeqVectorReader vr = new SeqVectorReader(dirName);
        doPartition(vr, shards, "");
    }

    // Unfinished draft, kept for reference:
    // /**
    // * repartition data ; example : [0,1] -> [0,1,2,3]
    // * @param dirName
    // * @throws IOException
    // */
    // public void divideToDouble(String dirName) throws IOException{
    // MultiVectorReader multiVectorReader = new MultiVectorReader(dirName, null);
    // doPartition(multiVectorReader, multiVectorReader.getBlocks() * 2, ".new");
    // for(int i = 0 ; i < )
    // }
}