package org.archive.server;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.archive.util.DateUtils;
import com.google.common.io.ByteStreams;
public class GZRangeClientTool implements Tool {
public final static String TOOL_NAME = "gzrange-client";
public static final String TOOL_DESCRIPTION =
"Command line tool for repackages records from remote ARC/WARC files into new ARC/WARC files.";
private static final Charset UTF8 = Charset.forName("UTF-8");
private final static String USAGE_HEADER = "Repackages a series of W/ARC records into new W/ARC files.\n\n"
+ "Reads lines from MANIFEST, which are of the format:\n\n"
+ "\tOFFSET URL1 URL2 ... URLN\n\n"
+ "where:\n\n"
+ "\tOFFSET is the start offset of a W/ARC record\n"
+ "\tURLX are HTTP URLs pointing to the W/ARCs\n\n"
+ "A new series of W/ARC files are written in TGT_DIR, where each is prefixed with PREFIX\n"
+ "\n";
private Configuration conf;
public void setConf(Configuration conf) {
this.conf = conf;
}
public Configuration getConf() {
return conf;
}
public static int USAGE(Options opts) {
HelpFormatter formatter = new HelpFormatter();
System.err.println();
PrintWriter pw = new PrintWriter(System.err);
formatter.printHelp(pw,80,TOOL_NAME + " [OPTIONS] TGT_DIR PREFIX MANIFEST",USAGE_HEADER,opts,4,5,"");
pw.flush();
return 1;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new GZRangeClientTool(), args);
System.exit(res);
}
private static Options buildOptions() {
Options options = new Options();
Option arcSize = new Option("as","arc-size",true,
"stop writing records to ARCs after they grow beyond SIZE bytes");
arcSize.setArgName("SIZE");
Option warcSize = new Option("ws","warc-size",true,
"stop writing records to WARCs after they grow beyond SIZE bytes");
warcSize.setArgName("SIZE");
Option warcHeaderFields = new Option("wf","warc-header-fields",true,
"Read default WARC header fields from file PATH");
warcHeaderFields.setArgName("PATH");
Option timestamp = new Option("t","timestamp",true,
"Use TIMESTAMP14 as the timestamp for W/ARC names, and for W/ARC header records.");
timestamp.setArgName("TIMESTAMP14");
Option errOnExit = new Option("e", "exit-on-error", false,
"if declared, a failure to get a single record causes a failure in the tool");
options.addOption(arcSize);
options.addOption(warcSize);
options.addOption(warcHeaderFields);
options.addOption(timestamp);
options.addOption(errOnExit);
return options;
}
public int run(String[] args) throws Exception {
Options options = buildOptions();
CommandLineParser parser = new PosixParser();
CommandLine line = null;
try {
line = parser.parse( options, args );
} catch (ParseException e) {
System.err.format("Problem parsing options (%s)\n", e.getMessage());
return USAGE(options);
}
String[] extra = line.getArgs();
if(extra.length != 3) {
return USAGE(options);
}
File targetDir = new File(extra[0]);
String prefix = extra[1];
File manifest = new File(extra[2]);
if(!targetDir.isDirectory()) {
System.err.println("Target directory(" + extra[0] + ") is not a directory");
return 1;
}
if(!targetDir.canWrite()) {
System.err.println("Target directory(" + extra[0] + ") is not writable");
return 1;
}
if(!manifest.isFile()) {
System.err.println("Manifest file(" + extra[2] + ") is not a file");
return 1;
}
if(!manifest.canRead()) {
System.err.println("Manifest file(" + extra[2] + ") is not readable");
return 1;
}
String timestamp14 = DateUtils.get14DigitDate(System.currentTimeMillis());
if(line.hasOption("timestamp")) {
timestamp14 = DateUtils.get14DigitDate(DateUtils.parse14DigitDate(line.getOptionValue("timestamp")));
}
GZRangeClient cli = new GZRangeClient(targetDir, prefix, timestamp14);
if(line.hasOption("arc-size")) {
cli.setMaxArcSize(Long.parseLong(line.getOptionValue("arc-size")));
}
if(line.hasOption("warc-size")) {
cli.setMaxWarcSize(Long.parseLong(line.getOptionValue("warc-size")));
}
if(line.hasOption("warc-header-fields")) {
String path = line.getOptionValue("warc-header-fields");
File f = new File(path);
FileInputStream fis = new FileInputStream(f);
int len = (int) f.length();
byte[] whf = new byte[len];
ByteStreams.readFully(fis, whf);
cli.setWarcHeaderContents(whf);
}
if(line.hasOption("e")) {
System.err.println("Exit on error mode");
cli.setExitOnError(true);
}
FileInputStream manIS = new FileInputStream(manifest);
InputStreamReader manR = new InputStreamReader(manIS,UTF8);
BufferedReader manBufR = new BufferedReader(manR);
while(true) {
String manLine = manBufR.readLine();
if(manLine == null) {
break;
}
String[] parts = manLine.split("\\s");
if(parts.length < 2) {
System.err.format("Line(%s) has < 2 fields\n",manLine);
return 1;
}
long offset = 0;
try {
offset = Long.parseLong(parts[0]);
} catch(NumberFormatException e) {
System.err.format("Line(%s) has non numeric column 1\n",manLine);
return 1;
}
ArrayList<String> urls = new ArrayList<String>();
for(int i = 1; i < parts.length; i++) {
if(!parts[i].startsWith("http://")) {
System.err.format("URL in Line(%s) does not start with http://\n",manLine);
System.exit(1);
}
urls.add(parts[i]);
}
cli.append(offset, urls);
}
cli.finish();
return 0;
}
}