package org.shanbo.feluca.data2.convert; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.nio.charset.Charset; import java.util.ArrayList; import org.msgpack.MessagePack; import org.msgpack.packer.Packer; import org.shanbo.feluca.data2.Vector; import org.shanbo.feluca.data2.Tuple.AlignColumn; import org.shanbo.feluca.data2.Tuple.TupleType; import org.shanbo.feluca.data2.Vector.VectorType; import org.shanbo.feluca.data2.DataStatistic; import org.shanbo.feluca.data2.util.TextReader; import com.google.common.io.CharSource; import com.google.common.io.Closeables; import com.google.common.io.Files; public class VectorSerializer { final static int DATA_SIZE_PER_BLOCK = 64 * 1024 * 1024; //6 final static int OUTPUT_BUFFER_SIZE = 8 * 1024 * 1024; BufferedWriter globalStatWriter; BufferedReader rawDataReader; TextReader textReader; public VectorSerializer(String inFile) throws IOException{ File input = new File(inFile); if (input.isFile()){ rawDataReader = new BufferedReader(new FileReader(inFile)); }else if (input.isDirectory()){ File[] files = input.listFiles(); ArrayList<CharSource> fileList= new ArrayList<CharSource>(); for(File f: files){ fileList.add(Files.asCharSource(f, Charset.defaultCharset())); } CharSource concat = CharSource.concat(fileList.iterator()); rawDataReader = concat.openBufferedStream(); }else{ throw new FileNotFoundException("path : " + inFile + " not found"); } } private void generalConverting(String outDir, VectorType inputType, VectorType outputType, DataStatistic globalStat) throws FileNotFoundException, IOException{ File dir = new File(outDir); if (dir.isFile()){ dir.delete(); } if (!dir.exists()){ dir.mkdir(); } String dataName = dir.getName(); String blockPathTemplate = dir.getAbsolutePath() + "/" + dataName + ".%d.dat"; int blockId = 1; int blockSize = 0; globalStatWriter = new BufferedWriter(new FileWriter(dir.getAbsolutePath() + "/" + dataName + ".sta")); MessagePack mPack = new MessagePack(); Packer packer = mPack.createPacker( new BufferedOutputStream(new FileOutputStream(String.format(blockPathTemplate, blockId)),OUTPUT_BUFFER_SIZE )); int count = 0; Vector vector = Vector.create(inputType); vector.setOutputType(outputType); for(String line = textReader.readLine(); line != null ; line = textReader.readLine()){ boolean parse = vector.parseLine(line); if (parse == false){ continue; } packer.write(true); vector.pack(packer); globalStat.stat(vector); count ++; if (count % 10000 == 0){ // System.out.println("!"); } blockSize += vector.getSpaceCost() ; if (blockSize > DATA_SIZE_PER_BLOCK){ packer.write(false).close(); blockId += 1; blockSize = 0; packer = mPack.createPacker( new BufferedOutputStream(new FileOutputStream(String.format(blockPathTemplate, blockId)),OUTPUT_BUFFER_SIZE)); } } packer.write(false).close(); globalStatWriter.write(globalStat.toString()); Closeables.close(textReader, true); Closeables.close(globalStatWriter, true); } public void convertLW2LW(String outFile) throws IOException{ textReader = new TextReader(rawDataReader); generalConverting(outFile, VectorType.LABEL_FID_WEIGHT, VectorType.LABEL_FID_WEIGHT, DataStatistic.createLWstat()); } public void convertVW2VW(String outFile) throws IOException{ textReader = new TextReader(rawDataReader); generalConverting(outFile, VectorType.LABEL_FID_WEIGHT, VectorType.LABEL_FID_WEIGHT, DataStatistic.createVWstat()); } public void convertTuple2VID(String outFile) throws IOException{ textReader = new TextReader(rawDataReader, TupleType.WEIGHT_TYPE, AlignColumn.FIRST); generalConverting(outFile, VectorType.VID_FID_WEIGHT, VectorType.VID_FID_WEIGHT, DataStatistic.createVWstat()); } }