package hip.mahout;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.VectorWritable;
import java.io.File;
import java.io.IOException;
/**
 * Prepares synthetic 2-D clustering input for Mahout: converts a local text
 * file of whitespace-separated (x, y) pairs into a block-compressed Hadoop
 * SequenceFile of {@link NullWritable} keys and {@link VectorWritable} values.
 */
public class Synthetic2DClusteringPrep {

  /**
   * Command-line entry point.
   *
   * @param args {@code args[0]} is the local input text file,
   *             {@code args[1]} is the output SequenceFile path
   * @throws IOException if reading the input or writing the output fails
   */
  public static void main(String... args) throws IOException {
    write(new File(args[0]), new Path(args[1]));
  }

  /**
   * Reads each non-blank line of {@code inputFile} as two whitespace-separated
   * numeric tokens and appends them as a dense 2-D vector to a block-compressed
   * SequenceFile at {@code outputPath} (filesystem resolved from the default
   * Hadoop configuration).
   *
   * @param inputFile  local text file; each non-blank line must contain at
   *                   least two whitespace-separated numeric tokens
   * @param outputPath destination path for the SequenceFile
   * @throws IOException           on read/write failure, or if a non-blank
   *                               line has fewer than two tokens
   * @throws NumberFormatException if a token is not a parseable double
   */
  public static void write(File inputFile, Path outputPath)
      throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Writer writer =
        SequenceFile.createWriter(fs, conf, outputPath, NullWritable.class,
            VectorWritable.class,
            SequenceFile.CompressionType.BLOCK,
            new DefaultCodec());
    try {
      // Explicit charset: the no-argument readLines overload is deprecated
      // and silently uses the platform default encoding.
      for (String line : FileUtils.readLines(inputFile, "UTF-8")) {
        // Tolerate blank lines rather than failing with an
        // ArrayIndexOutOfBoundsException below.
        if (line.trim().isEmpty()) {
          continue;
        }
        String[] parts = StringUtils.split(line);
        if (parts.length < 2) {
          throw new IOException(
              "Expected two numeric tokens per line, got: \"" + line + "\"");
        }
        writer.append(NullWritable.get(),
            new VectorWritable(new DenseVector(
                new double[]{
                    // parseDouble avoids a pointless box/unbox cycle.
                    Double.parseDouble(parts[0]),
                    Double.parseDouble(parts[1])
                }
            )));
      }
    } finally {
      writer.close();
    }
  }
}