package focusedCrawler.tools;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.dataformat.cbor.CBORFactory;

import focusedCrawler.target.model.TargetModelCbor;

/**
 * Command-line tool that reads every file of an input directory as a
 * CBOR-encoded {@link TargetModelCbor} and re-writes the objects, in order of
 * last modification time, into a sequence of archive files created through
 * {@link GzipCborFileWriter}.
 *
 * <p>Usage: {@code CborToGzipCompressor <inputDir> <outputDir> <objectsPerFile>}
 */
public class CborToGzipCompressor {

    static final ObjectMapper cborMapper = new ObjectMapper(new CBORFactory());
    static final ObjectMapper jsonMapper = new ObjectMapper();

    /**
     * Entry point.
     *
     * @param args {@code args[0]}: directory containing the CBOR input files;
     *             {@code args[1]}: directory where the archives are created;
     *             {@code args[2]}: number of objects written to an archive
     *             before a new one is started
     * @throws IOException if the input location cannot be listed, or if
     *             reading an input file or writing an archive fails
     * @throws IllegalArgumentException if fewer than three arguments are
     *             given, or if {@code objectsPerFile} is zero or not a number
     */
    public static void main(String[] args) throws IOException {
        if (args == null || args.length < 3) {
            throw new IllegalArgumentException(
                    "Usage: CborToGzipCompressor <inputDir> <outputDir> <objectsPerFile>");
        }

        String inputLocation = args[0];
        String outputLocation = args[1];
        long objectsPerFile = Long.parseLong(args[2]);
        // Zero would make the rotation test below divide by zero. Negative
        // values are left alone: the remainder test treats them like their
        // absolute value, so they behave as before.
        if (objectsPerFile == 0) {
            throw new IllegalArgumentException("objectsPerFile must not be zero");
        }

        // Output encoding switch; currently fixed to CBOR.
        final boolean useJson = false;

        File inputDir = new File(inputLocation);
        File[] files = inputDir.listFiles();
        // listFiles() returns null (not an empty array) when the path is not
        // a directory or cannot be read.
        if (files == null) {
            throw new IOException(
                    "Input location is not a readable directory: " + inputLocation);
        }

        // Process the oldest files first.
        Arrays.sort(files, new Comparator<File>() {
            @Override
            public int compare(File o1, File o2) {
                return Long.compare(o1.lastModified(), o2.lastModified());
            }
        });

        String archivePrefix = inputDir.getName();
        GzipCborFileWriter gzipCborFileWriter = null;
        long lastArchiveStamp = Long.MIN_VALUE;
        long objectsWritten = 0;

        try {
            for (File f : files) {
                TargetModelCbor targetModel = cborMapper.readValue(f, TargetModelCbor.class);

                // Open the first archive, and start a new one every
                // objectsPerFile objects.
                if (gzipCborFileWriter == null || objectsWritten % objectsPerFile == 0) {
                    if (gzipCborFileWriter != null) {
                        GzipCborFileWriter finished = gzipCborFileWriter;
                        // Clear the reference first so the finally block does
                        // not close the same writer a second time.
                        gzipCborFileWriter = null;
                        finished.close();
                    }
                    // Keep the stamp strictly increasing so that two archives
                    // started within the same millisecond get distinct names.
                    lastArchiveStamp = Math.max(System.currentTimeMillis(), lastArchiveStamp + 1);
                    gzipCborFileWriter =
                            openWriter(outputLocation, archivePrefix, lastArchiveStamp, useJson);
                }

                // fix key for objects stored with wrong key
                targetModel.key = targetModel.computeReverseKey(targetModel.url);

                System.out.println("Writing object: "+f.getName());
                gzipCborFileWriter.writeTargetModel(targetModel);
                objectsWritten++;
            }
        } finally {
            // Close the archive in progress even when a read or write failed.
            if (gzipCborFileWriter != null) {
                gzipCborFileWriter.close();
            }
        }
    }

    /**
     * Creates the writer for the archive
     * {@code <outputLocation>/<archivePrefix>_<stamp>_(json|cbor).tar.gz}.
     */
    private static GzipCborFileWriter openWriter(String outputLocation, String archivePrefix,
            long stamp, boolean useJson) throws IOException {
        String suffix = useJson ? "_json.tar.gz" : "_cbor.tar.gz";
        String fullArchivePath =
                outputLocation + File.separator + archivePrefix + "_" + stamp + suffix;
        return new GzipCborFileWriter(fullArchivePath, useJson ? jsonMapper : cborMapper);
    }
}