package com.cloudera.knittingboar.io;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import com.cloudera.iterativereduce.io.TextRecordParser;
import junit.framework.TestCase;
public class TestSplitReset extends TestCase {
private static JobConf defaultConf = new JobConf();
private static FileSystem localFs = null;
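  // Force the local filesystem (file:///) so the test can create, read, and
  // split plain files on disk without a running HDFS.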
static {
try {
defaultConf.set("fs.defaultFS", "file:///");
localFs = FileSystem.getLocal(defaultConf);
} catch (IOException e) {
throw new RuntimeException("init failure", e);
}
}
private static Path workDir = new Path(System.getProperty("test.build.data", "/tmp/TestSplitReset/"));
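  /**
   * Builds a Configuration carrying the debug settings these tests use:
   * feature vector size, category count, number of passes, and the record
   * factory class for the 20newsgroups data format.
   */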
public Configuration generateDebugConfigurationObject() {
Configuration c = new Configuration();
    // feature vector size, number of output categories, and passes over the data
    c.setInt("com.cloudera.knittingboar.setup.FeatureVectorSize", 10000);
    c.setInt("com.cloudera.knittingboar.setup.numCategories", 20);
    c.setInt("com.cloudera.knittingboar.setup.NumberPasses", 2);
    // record factory that parses the 20newsgroups input format
    c.set("com.cloudera.knittingboar.setup.RecordFactoryClassname",
        "com.cloudera.knittingboar.records.TwentyNewsgroupsRecordFactory");
return c;
}
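  /**
   * Points the job at the given input path and asks the legacy
   * org.apache.hadoop.mapred TextInputFormat to generate InputSplits for it.
   */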
  public InputSplit[] generateDebugSplits(Path input_path, JobConf job)
      throws IOException {
    long block_size = localFs.getDefaultBlockSize();
    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");
    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, input_path);
    // let the input format pick the actual split boundaries; numSplits is only a hint
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    int numSplits = 1;
    return format.getSplits(job, numSplits);
}
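  /**
   * Writes a synthetic CSV file, reads every record of the first split with a
   * TextRecordParser, resets the parser, and verifies the same number of
   * records can be read a second time.
   */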
public void testReset() throws IOException {
JobConf job = new JobConf(defaultConf);
Path file = new Path(workDir, "testGetSplits.txt");
int tmp_file_size = 200000;
long block_size = localFs.getDefaultBlockSize();
System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");
Writer writer = new OutputStreamWriter(localFs.create(file));
try {
for (int i = 0; i < tmp_file_size; i++) {
writer.write("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
writer.write("\n");
}
} finally {
writer.close();
}
System.out.println( "file write complete" );
    // generate splits for just the file written above
    InputSplit[] splits = generateDebugSplits(file, job);
System.out.println( "split count: " + splits.length );
TextRecordParser txt_reader = new TextRecordParser();
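    // FileSplit.toString() looks like "file:/path/to/file:start+length", so the
    // ":"-split's index [1] is the local path and index [2] is "start+length"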
    long len = Long.parseLong(splits[0].toString().split(":")[2].split("\\+")[1]);
    txt_reader.setFile(splits[0].toString().split(":")[1], 0, len);
Text csv_line = new Text();
int x = 0;
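    // first pass: drain every record in the split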
while (txt_reader.hasMoreRecords()) {
txt_reader.next(csv_line);
x++;
}
System.out.println( "read recs: " + x );
txt_reader.reset();
//txt_reader.setFile(splits[0].toString().split(":")[1], 0, len);
x = 0;
while (txt_reader.hasMoreRecords()) {
txt_reader.next(csv_line);
x++;
}
System.out.println( "[after reset] read recs: " + x );
}
}