package org.archive.extract;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.List;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.archive.format.gzip.GZIPFormatException;
import org.archive.format.json.JSONUtils;
import org.archive.format.json.SimpleJSONPathSpec;
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
import org.archive.util.IAUtils;
import org.archive.util.StreamCopy;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import com.google.common.io.ByteStreams;
import com.google.common.io.CountingOutputStream;
public class WARCMetadataRecordExtractorOutput implements ExtractorOutput {
private static final Logger LOG =
Logger.getLogger(WARCMetadataRecordExtractorOutput.class.getName());
private PrintWriter out;
SimpleJSONPathSpec formatSpec = new SimpleJSONPathSpec("Envelope.Format");
SimpleJSONPathSpec warcURL = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Target-URI");
SimpleJSONPathSpec warcDate = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Date");
SimpleJSONPathSpec warcType = new SimpleJSONPathSpec("Envelope.WARC-Header-Metadata.WARC-Type");
SimpleJSONPathSpec warcMetadataRecord = new SimpleJSONPathSpec("Envelope.Payload-Metadata.WARC-Metadata-Metadata.Metadata-Records");
private String outputType = "outlinks";
public WARCMetadataRecordExtractorOutput(PrintWriter out, String outputType) {
this.out = out;
this.outputType = outputType;
}
public WARCMetadataRecordExtractorOutput(PrintWriter out) {
this(out,"outlinks");
}
public void output(Resource resource) throws IOException {
OutputStream nullo = ByteStreams.nullOutputStream();
CountingOutputStream co = new CountingOutputStream(nullo);
try {
StreamCopy.copy(resource.getInputStream(), co);
} catch(GZIPFormatException e) {
e.printStackTrace();
return;
}
long bytes = co.getCount();
if(bytes > 0) {
LOG.info(bytes + " unconsumed bytes in Resource InputStream.");
}
try {
MetaData m = resource.getMetaData().getTopMetaData();
// URL DATE OURL MIME HTTP-CODE SHA1 META REDIR OFFSET LENGTH FILE
String format = getEnvelopeFormat(m);
String origUrl = "TBD";
String date = "TBD";
String canUrl = "TBD";
if(format.startsWith("WARC")) {
origUrl = getWARCURL(m);
date = getWARCDate(m);
String type = getWARCType(m);
if(type.equals("metadata")) {
String warcMetadataRecord = getWARCMetadataRecord(m);
JSONArray array = new JSONArray(warcMetadataRecord);
String viaUrl = "-";
String viaPath = "-";
String sourceTag = "-";
for(int i=0;i<array.length();i++) {
JSONObject obj = array.getJSONObject(i);
if(outputType.equals("outlinks")) {
if(obj.get("Name").toString().equals("outlink")) {
String outLinkValue = obj.get("Value").toString();
String[] linkParts = outLinkValue.split(" ");
if(linkParts.length > 2)
//'outlinks': 'origUrl date origOutlinkUrl linktype linktext'
out.format("%s\t%s\t%s\t%s\t\n",origUrl,date,linkParts[0],linkParts[2]);
}
} else if(outputType.equals("hopinfo")) {
String key = obj.get("Name").toString();
String value = obj.get("Value").toString();
if(key.equals("via")) {
viaUrl = value;
} else if (key.equals("hopsFromSeed")) {
viaPath = value;
} else if (key.equals("sourceTag")) {
sourceTag = value;
}
}
}
if(outputType.equals("hopinfo")) {
//'hopinfo': 'origCrawledUrl date origViaUrl hopPathFromVia sourceTag'
out.format("%s\t%s\t%s\t%s\t%s\n",origUrl,date,viaUrl,viaPath,sourceTag);
}
}
}
}
catch (Exception e) {
throw new IOException(e);
}
out.flush();
}
private String getEnvelopeFormat(MetaData m) {
return unwrapFirst(formatSpec.extract(m),"-");
}
private String getWARCURL(MetaData m) {
return unwrapFirst(warcURL.extract(m),"-");
}
private String getWARCDate(MetaData m) {
return unwrapFirst(warcDate.extract(m),"-");
}
private String getWARCType(MetaData m) {
return unwrapFirst(warcType.extract(m),"-");
}
private String getWARCMetadataRecord(MetaData m) {
return unwrapFirst(warcMetadataRecord.extract(m),"-");
}
private String unwrapFirst(List<List<String>> l, String defaultValue) {
if(l != null) {
if(l.size() > 0) {
if(l.get(0) != null) {
if(l.get(0).size() > 0) {
String v = l.get(0).get(0);
if(v != null) {
if(v.length() > 0) {
return v;
}
}
}
}
}
}
return defaultValue;
}
}