package focusedCrawler.tools;
import focusedCrawler.target.model.Page;
import focusedCrawler.target.model.TargetModelCbor;
import focusedCrawler.target.model.TargetModelJson;
import focusedCrawler.target.repository.FileSystemTargetRepository;
import focusedCrawler.target.repository.FileSystemTargetRepository.DataFormat;
import focusedCrawler.target.repository.FileSystemTargetRepository.FileContentIterator;
import focusedCrawler.target.repository.FilesTargetRepository;
import focusedCrawler.util.CliTool;
import io.airlift.airline.Command;
import io.airlift.airline.Option;
@Command(name="MigrateToFilesTargetRepository", description="Migrate a FS repository to a FILES repository")
public class MigrateToFilesTargetRepository extends CliTool {
@Option(name = "--input-path", required = true, description = "Path to old input data_target folder")
private String inputPath;
@Option(name = "--output-path", required = true, description = "Path to new output data_target folder")
private String outputPath;
@Option(name = "--hash-file-name", required = false, description = "If the repository uses hashed file names")
private boolean hashFilename = false;
@Option(name = "--compressed-data", required = false, description = "If the repository uses compressed files")
private boolean compressData = false;
@Option(name = "--data-format", required = false, description = "The data format used by the old repository")
private DataFormat dataFormat = DataFormat.JSON;
public static void main(String[] args) throws Exception {
CliTool.run(args, new MigrateToFilesTargetRepository());
}
@Override
public void execute() throws Exception {
System.out.println("Reading URLs from file: " + inputPath);
System.out.println("Writing output file at: " + outputPath);
System.out.println();
int processedPages = 0;
FileSystemTargetRepository oldRep = new FileSystemTargetRepository(inputPath, dataFormat, hashFilename, compressData);
FilesTargetRepository newRep = new FilesTargetRepository(outputPath);
try (FileContentIterator<?> oldIt = oldRep.iterator()) {
while (oldIt.hasNext()) {
try {
TargetModelJson target = null;
if (dataFormat == DataFormat.CBOR) {
target = new TargetModelJson(new Page((TargetModelCbor) oldIt.next()));
} else if (dataFormat == DataFormat.JSON) {
target = (TargetModelJson) oldIt.next();
}
newRep.insert(target);
} catch(Exception e) {
System.out.println("Ignoring file due to failure.");
e.printStackTrace(System.out);
continue;
}
processedPages++;
if (processedPages % 1000 == 0) {
System.out.printf("Migrated %s pages...\n", processedPages);
}
}
}
System.out.printf("Finished processing %d pages.\n", processedPages);
}
}