package org.shanbo.feluca.data2.convert;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import org.msgpack.MessagePack;
import org.msgpack.packer.Packer;
import org.shanbo.feluca.data2.DataSetInfo;
import org.shanbo.feluca.data2.DataStatistic;
import org.shanbo.feluca.data2.Vector;
import org.shanbo.feluca.data2.Vector.VectorType;
import org.shanbo.feluca.data2.util.TextReader;
import org.shanbo.feluca.vectors.LabelVector;
import com.google.common.io.CharSource;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
/**
 * Converts raw, line-oriented vector text into a MessagePack-serialized data file
 * plus a companion statistics file.
 *
 * <p>The input is either a single text file or a directory whose regular files are
 * concatenated into one character stream. The output directory receives
 * {@code <inputName>.ser} (serialized vectors) and {@code <inputName>.sta}
 * (statistics gathered by {@link DataSetInfo}).
 */
public class VectorConverter {

    /** Buffer size, in bytes, of the stream the serialized vectors are written to. */
    static int OUTPUT_BUFFER_SIZE = 8 * 1024 * 1024;

    BufferedWriter globalStatWriter;
    BufferedReader rawDataReader;
    TextReader textReader;
    /** Last path component of the input; used as the base name of the output files. */
    String fileName;

    /**
     * Opens the raw input for reading.
     *
     * @param inFile path of a text file, or of a directory whose regular files are
     *               read one after another as a single stream
     * @throws FileNotFoundException if {@code inFile} is neither a file nor a directory
     * @throws IOException if the input cannot be listed or opened
     */
    public VectorConverter(String inFile) throws IOException{
        File input = new File(inFile);
        fileName = input.getName();
        if (input.isFile()){
            rawDataReader = new BufferedReader(new FileReader(inFile));
        }else if (input.isDirectory()){
            File[] files = input.listFiles();
            if (files == null){
                // listFiles() returns null when the directory cannot be read.
                throw new IOException("cannot list directory : " + inFile);
            }
            ArrayList<CharSource> fileList = new ArrayList<CharSource>();
            for(File f : files){
                // Sub-directories cannot be opened as character sources; skip them.
                if (f.isFile()){
                    fileList.add(Files.asCharSource(f, Charset.defaultCharset()));
                }
            }
            CharSource concat = CharSource.concat(fileList.iterator());
            rawDataReader = concat.openBufferedStream();
        }else{
            throw new FileNotFoundException("path : " + inFile + " not found");
        }
    }

    /**
     * Parses every input line with {@code vector}, serializes each successfully parsed
     * vector and finally writes the collected statistics.
     *
     * <p>Layout of the {@code .ser} file: each vector is preceded by the boolean
     * {@code true}; a single boolean {@code false} terminates the stream. Lines for
     * which {@code parseLine} returns {@code false} are skipped.
     *
     * <p>All streams are closed when this method returns, whether it completes
     * normally or throws.
     *
     * @param outDir directory receiving the output files; a regular file at this path
     *               is deleted and the directory is created if missing
     * @param vector reusable vector instance that parses one line at a time
     * @throws IOException if the output directory cannot be created or any read/write fails
     */
    private void generalConverting(String outDir, Vector vector) throws FileNotFoundException, IOException{
        File dir = new File(outDir);
        if (dir.isFile()){
            dir.delete();
        }
        // The second isDirectory() check tolerates another process creating the directory concurrently.
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()){
            throw new IOException("cannot create output directory : " + dir.getAbsolutePath());
        }
        String blockPath = dir.getAbsolutePath() + "/" + fileName + ".ser";
        String statPath = dir.getAbsolutePath() + "/" + fileName + ".sta";

        Packer packer = null;
        // Stays true until everything has been written, so that close failures are
        // swallowed only when a more relevant exception is already propagating.
        boolean threw = true;
        try {
            globalStatWriter = new BufferedWriter(new FileWriter(statPath));
            MessagePack mPack = new MessagePack();
            packer = mPack.createPacker(
                    new BufferedOutputStream(new FileOutputStream(blockPath), OUTPUT_BUFFER_SIZE));
            DataSetInfo dataSetInfo = new DataSetInfo();
            for(String line = textReader.readLine(); line != null; line = textReader.readLine()){
                if (!vector.parseLine(line)){
                    continue;
                }
                packer.write(true);   // "another vector follows" marker
                vector.pack(packer);
                dataSetInfo.doStat(vector);
            }
            packer.write(false);      // end-of-stream marker
            globalStatWriter.write(DataSetInfo.prop2String(dataSetInfo.getStatInfo()));
            threw = false;
        } finally {
            try {
                // Closing an output flushes it; a failure here means lost data and must surface.
                Closeables.close(packer, threw);
            } finally {
                try {
                    Closeables.close(globalStatWriter, threw);
                } finally {
                    // Input side: a close failure is not meaningful.
                    Closeables.close(textReader, true);
                }
            }
        }
    }

    /**
     * Converts the input, parsing each line as a {@link LabelVector}.
     *
     * @param outFile output directory (despite the name, a directory path)
     * @throws IOException if reading the input or writing the output fails
     */
    public void convertLW2LW(String outFile) throws IOException{
        textReader = new TextReader(rawDataReader);
        generalConverting(outFile, new LabelVector());
    }
}