package org.archive.modules.postprocessor; import java.util.Map; import java.util.Map.Entry; import org.apache.commons.lang.StringUtils; import org.archive.modules.CoreAttributeConstants; import org.archive.modules.CrawlURI; import org.archive.modules.net.CrawlHost; import org.archive.modules.net.ServerCache; import org.archive.util.ArchiveUtils; import org.archive.util.MimetypeUtils; import org.json.JSONObject; public class CrawlLogJsonBuilder { protected static Object checkForNull(Object o) { return o != null ? o : JSONObject.NULL; } public static JSONObject buildJson(CrawlURI curi, Map<String,String> extraFields, ServerCache serverCache) { JSONObject jo = new JSONObject(); jo.put("timestamp", ArchiveUtils.getLog17Date(System.currentTimeMillis())); for (Entry<String, String> entry: extraFields.entrySet()) { jo.put(entry.getKey(), entry.getValue()); } jo.put("content_length", curi.isHttpTransaction() && curi.getContentLength() >= 0 ? curi.getContentLength() : JSONObject.NULL); jo.put("size", curi.getContentSize() > 0 ? curi.getContentSize() : JSONObject.NULL); jo.put("status_code", checkForNull(curi.getFetchStatus())); jo.put("url", checkForNull(curi.getUURI().toString())); jo.put("hop_path", checkForNull(curi.getPathFromSeed())); jo.put("via", checkForNull(curi.flattenVia())); jo.put("mimetype", checkForNull(MimetypeUtils.truncate(curi.getContentType()))); jo.put("thread", checkForNull(curi.getThreadNumber())); if (curi.containsDataKey(CoreAttributeConstants.A_FETCH_COMPLETED_TIME)) { long beganTime = curi.getFetchBeginTime(); String fetchBeginDuration = ArchiveUtils.get17DigitDate(beganTime) + "+" + (curi.getFetchCompletedTime() - beganTime); jo.put("start_time_plus_duration", fetchBeginDuration); } else { jo.put("start_time_plus_duration", JSONObject.NULL); } jo.put("content_digest", checkForNull(curi.getContentDigestSchemeString())); jo.put("seed", checkForNull(curi.getSourceTag())); CrawlHost host = serverCache.getHostFor(curi.getUURI()); if (host != null) { jo.put("host", host.fixUpName()); } else { jo.put("host", JSONObject.NULL); } jo.put("annotations", checkForNull(StringUtils.join(curi.getAnnotations(), ","))); JSONObject ei = curi.getExtraInfo(); if (ei == null) { ei = new JSONObject(); } // copy so we can remove unrolled fields ei = new JSONObject(curi.getExtraInfo().toString()); ei.remove("contentSize"); // we get this value above jo.put("warc_filename", checkForNull(ei.remove("warcFilename"))); jo.put("warc_offset", checkForNull(ei.remove("warcFileOffset"))); jo.put("extra_info", ei); return jo; } }