package uk.bl.monitrix.database.mongodb.model;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import com.mongodb.DBObject;
import uk.bl.monitrix.database.mongodb.MongoProperties;
import uk.bl.monitrix.model.CrawlLogEntry;
/**
 * A MongoDB-backed implementation of {@link CrawlLogEntry}.
 *
 * <p>Some properties are read directly from dedicated fields of the backing {@link DBObject}.
 * The remaining ones are extracted on demand from the raw log line stored under
 * {@link MongoProperties#FIELD_CRAWL_LOG_LINE}: the line is split into space-separated
 * columns the first time one of those properties is requested, and the result is cached.
 *
 * <p>The setters write straight through to the backing {@link DBObject}.
 *
 * @author Rainer Simon <rainer.simon@ait.ac.at>
 */
public class MongoCrawlLogEntry extends CrawlLogEntry {

    /** Number of space-separated columns that precede the (free-text) annotations column. */
    private static final int LEADING_COLUMN_COUNT = 11;

    /** Zero-based indices of the log line columns that are read by this class. */
    private static final int COLUMN_DOWNLOAD_SIZE = 2;
    private static final int COLUMN_BREADCRUMB_CODES = 4;
    private static final int COLUMN_REFERRER = 5;
    private static final int COLUMN_CONTENT_TYPE = 6;
    private static final int COLUMN_FETCH_TIMESTAMP = 8;
    private static final int COLUMN_SHA1_HASH = 9;

    /** Placeholder that appears in a column when no value was recorded. */
    private static final String ABSENT_VALUE = "-";

    /**
     * Pattern of the fetch timestamp column (digits only, millisecond precision).
     * Kept as a pattern string rather than a shared {@link DateFormat} instance, because
     * {@link SimpleDateFormat} is not safe for concurrent use.
     */
    private static final String RFC2550_PATTERN = "yyyyMMddHHmmssSSS";

    private final DBObject dbo;

    /** Lazily populated columns of the raw log line; {@code null} until first needed. */
    private List<String> fields = null;

    /**
     * Creates an entry that reads from and writes to the given MongoDB entity.
     *
     * @param dbo the backing DBObject
     */
    public MongoCrawlLogEntry(DBObject dbo) {
        this.dbo = dbo;
    }

    /**
     * Splits the raw log line (as returned by {@link #toString()}) into columns and caches them.
     *
     * <p>The first {@value #LEADING_COLUMN_COUNT} non-empty space-separated tokens become one
     * column each. Everything after them is re-joined with single spaces into one final
     * column, since annotations may themselves contain spaces.
     *
     * <p>The cache is only assigned once parsing has completed, so a failure (e.g. a missing
     * log line) does not leave a half-populated column list behind.
     */
    private void parseEntry() {
        String[] split = this.toString().split(" ");
        List<String> parsed = new ArrayList<String>(LEADING_COLUMN_COUNT + 1);

        // Columns 1 - 11: skip empty tokens produced by runs of consecutive spaces
        int ctr = 0;
        while (parsed.size() < LEADING_COLUMN_COUNT && ctr < split.length) {
            if (!split[ctr].isEmpty()) {
                parsed.add(split[ctr].trim());
            }
            ctr++;
        }

        // Column 12 (annotations) - may contain white spaces, so the remainder is re-joined
        StringBuilder sb = new StringBuilder();
        for (int i = ctr; i < split.length; i++) {
            sb.append(split[i]).append(' ');
        }
        parsed.add(sb.toString().trim());

        fields = parsed;
    }

    /**
     * Returns one column of the raw log line, parsing the line first if necessary.
     *
     * @param index zero-based column index
     * @return the column value
     * @throws IndexOutOfBoundsException if the log line has fewer columns than requested
     */
    private String getField(int index) {
        if (fields == null) {
            parseEntry();
        }
        return fields.get(index);
    }

    /**
     * Reads a numeric property from the backing DBObject.
     *
     * <p>The value is cast to {@link Number} rather than to one specific boxed type, so that
     * the getters keep working if the value comes back with a different numeric width
     * than the one the setter wrote.
     *
     * @param key the DBObject field name
     * @return the stored number, or {@code null} if the field is not set
     */
    private Number getNumber(String key) {
        return (Number) dbo.get(key);
    }

    /**
     * Returns the MongoDB entity that's backing this object.
     * @return the DBObject
     */
    public DBObject getBackingDBO() {
        return dbo;
    }

    /**
     * Returns the value stored under {@link MongoProperties#FIELD_CRAWL_LOG_LOG_ID}.
     * @return the log ID
     */
    @Override
    public String getLogId() {
        return (String) dbo.get(MongoProperties.FIELD_CRAWL_LOG_LOG_ID);
    }

    /**
     * Stores the log ID in the backing DBObject.
     * @param logId the log ID
     */
    public void setLogId(String logId) {
        dbo.put(MongoProperties.FIELD_CRAWL_LOG_LOG_ID, logId);
    }

    /**
     * Returns the log timestamp, built from the epoch-millisecond value stored under
     * {@link MongoProperties#FIELD_CRAWL_LOG_TIMESTAMP}.
     * @return the log timestamp
     * @throws NullPointerException if no timestamp has been stored
     */
    @Override
    public Date getLogTimestamp() {
        return new Date(getNumber(MongoProperties.FIELD_CRAWL_LOG_TIMESTAMP).longValue());
    }

    /**
     * Stores the log timestamp in the backing DBObject.
     * @param timestamp the timestamp, as passed to {@link Date#Date(long)} on read
     */
    public void setTimestamp(long timestamp) {
        dbo.put(MongoProperties.FIELD_CRAWL_LOG_TIMESTAMP, timestamp);
    }

    /**
     * Returns the value stored under {@link MongoProperties#FIELD_CRAWL_LOG_HTTP_CODE}.
     * @return the HTTP code
     * @throws NullPointerException if no HTTP code has been stored
     */
    @Override
    public int getHTTPCode() {
        return getNumber(MongoProperties.FIELD_CRAWL_LOG_HTTP_CODE).intValue();
    }

    /**
     * Stores the HTTP code in the backing DBObject.
     * @param httpCode the HTTP code
     */
    public void setHTTPCode(int httpCode) {
        dbo.put(MongoProperties.FIELD_CRAWL_LOG_HTTP_CODE, httpCode);
    }

    /**
     * Returns the download size, parsed from the third column of the raw log line.
     * @return the download size, or 0 if the column holds the "-" placeholder
     * @throws NumberFormatException if the column is neither "-" nor a valid long
     */
    @Override
    public long getDownloadSize() {
        String size = getField(COLUMN_DOWNLOAD_SIZE);
        if (ABSENT_VALUE.equals(size)) {
            return 0;
        }
        return Long.parseLong(size);
    }

    /**
     * Returns the value stored under {@link MongoProperties#FIELD_CRAWL_LOG_URL}.
     * @return the URL
     */
    @Override
    public String getURL() {
        return (String) dbo.get(MongoProperties.FIELD_CRAWL_LOG_URL);
    }

    /**
     * Stores the URL in the backing DBObject.
     * @param url the URL
     */
    public void setURL(String url) {
        dbo.put(MongoProperties.FIELD_CRAWL_LOG_URL, url);
    }

    /**
     * Returns the value stored under {@link MongoProperties#FIELD_CRAWL_LOG_HOST}.
     * @return the host name
     */
    @Override
    public String getHost() {
        return (String) dbo.get(MongoProperties.FIELD_CRAWL_LOG_HOST);
    }

    /**
     * Stores the host name in the backing DBObject.
     * @param hostname the host name
     */
    public void setHost(String hostname) {
        dbo.put(MongoProperties.FIELD_CRAWL_LOG_HOST, hostname);
    }

    /**
     * Returns the value stored under {@link MongoProperties#FIELD_CRAWL_LOG_DOMAIN}.
     * @return the domain
     */
    @Override
    public String getDomain() {
        return (String) dbo.get(MongoProperties.FIELD_CRAWL_LOG_DOMAIN);
    }

    /**
     * Stores the domain in the backing DBObject.
     * @param domain the domain
     */
    public void setDomain(String domain) {
        dbo.put(MongoProperties.FIELD_CRAWL_LOG_DOMAIN, domain);
    }

    /**
     * Returns the value stored under {@link MongoProperties#FIELD_CRAWL_LOG_SUBDOMAIN}.
     * @return the subdomain
     */
    @Override
    public String getSubdomain() {
        return (String) dbo.get(MongoProperties.FIELD_CRAWL_LOG_SUBDOMAIN);
    }

    /**
     * Stores the subdomain in the backing DBObject.
     * @param subdomain the subdomain
     */
    public void setSubdomain(String subdomain) {
        dbo.put(MongoProperties.FIELD_CRAWL_LOG_SUBDOMAIN, subdomain);
    }

    /**
     * Returns the fifth column of the raw log line.
     * @return the breadcrumb codes
     */
    @Override
    public String getBreadcrumbCodes() {
        return getField(COLUMN_BREADCRUMB_CODES);
    }

    /**
     * Returns the sixth column of the raw log line.
     * @return the referrer
     */
    @Override
    public String getReferrer() {
        return getField(COLUMN_REFERRER);
    }

    /**
     * Returns the seventh column of the raw log line.
     * @return the content type
     */
    @Override
    public String getContentType() {
        return getField(COLUMN_CONTENT_TYPE);
    }

    /**
     * Returns the value stored under {@link MongoProperties#FIELD_CRAWL_LOG_CRAWLER_ID}
     * (the same field that {@link #setCrawlerID(String)} writes).
     * @return the worker thread / crawler ID
     */
    @Override
    public String getWorkerThread() {
        return (String) dbo.get(MongoProperties.FIELD_CRAWL_LOG_CRAWLER_ID);
    }

    /**
     * Returns the fetch timestamp, parsed from the ninth column of the raw log line.
     *
     * <p>The column has the form {@code <timestamp>[+<duration>]}; only the part before
     * the '+' is parsed here, using the pattern {@value #RFC2550_PATTERN}.
     *
     * <p>NOTE(review): the timestamp is interpreted in the JVM's default time zone, exactly
     * as before; if the log records UTC, an explicit time zone should be set - confirm.
     *
     * @return the fetch timestamp, or {@code null} if the column holds the "-" placeholder
     *         (i.e. no fetch time was recorded)
     * @throws RuntimeException if the column holds any other unparseable value
     */
    @Override
    public Date getFetchTimestamp() {
        String timestamp = getField(COLUMN_FETCH_TIMESTAMP);
        int plus = timestamp.indexOf('+');
        if (plus > -1) {
            timestamp = timestamp.substring(0, plus);
        }

        // Consistent with getDownloadSize()/getFetchDuration(): "-" means "no value"
        if (ABSENT_VALUE.equals(timestamp)) {
            return null;
        }

        try {
            // A fresh formatter per call: SimpleDateFormat must not be shared between threads
            DateFormat format = new SimpleDateFormat(RFC2550_PATTERN);
            return format.parse(timestamp);
        } catch (ParseException e) {
            throw new RuntimeException("Unparseable fetch timestamp: " + timestamp, e);
        }
    }

    /**
     * Returns the fetch duration, parsed from the part after the '+' in the ninth column
     * of the raw log line.
     * @return the fetch duration, or 0 if the column contains no '+'
     * @throws NumberFormatException if the part after the '+' is not a valid int
     */
    @Override
    public int getFetchDuration() {
        String duration = getField(COLUMN_FETCH_TIMESTAMP);
        int plus = duration.indexOf('+');
        if (plus > -1) {
            return Integer.parseInt(duration.substring(plus + 1));
        }
        return 0;
    }

    /**
     * Stores the crawler ID in the backing DBObject.
     * @param crawlerId the crawler ID
     */
    public void setCrawlerID(String crawlerId) {
        dbo.put(MongoProperties.FIELD_CRAWL_LOG_CRAWLER_ID, crawlerId);
    }

    /**
     * Returns the tenth column of the raw log line.
     * @return the SHA1 hash
     */
    @Override
    public String getSHA1Hash() {
        return getField(COLUMN_SHA1_HASH);
    }

    /**
     * Returns the value stored under {@link MongoProperties#FIELD_CRAWL_LOG_ANNOTATIONS}.
     * @return the annotations
     */
    @Override
    public String getAnnotations() {
        return (String) dbo.get(MongoProperties.FIELD_CRAWL_LOG_ANNOTATIONS);
    }

    /**
     * Stores the annotations in the backing DBObject, both verbatim and as a list of the
     * comma-separated tokens.
     * @param annotations the annotations string; must not be {@code null}
     * @throws NullPointerException if {@code annotations} is {@code null}
     */
    public void setAnnotations(String annotations) {
        // Tokenize first, so that a null argument fails before the DBObject is modified
        List<String> tokens = Arrays.asList(annotations.split(","));
        dbo.put(MongoProperties.FIELD_CRAWL_LOG_ANNOTATIONS, annotations);
        dbo.put(MongoProperties.FIELD_CRAWL_LOG_ANNOTATIONS_TOKENIZED, tokens);
    }

    /**
     * Returns the value stored under {@link MongoProperties#FIELD_CRAWL_LOG_RETRIES}.
     * @return the number of retries
     * @throws NullPointerException if no retry count has been stored
     */
    @Override
    public int getRetries() {
        return getNumber(MongoProperties.FIELD_CRAWL_LOG_RETRIES).intValue();
    }

    /**
     * Stores the number of retries in the backing DBObject.
     * @param retries the number of retries
     */
    public void setRetries(int retries) {
        dbo.put(MongoProperties.FIELD_CRAWL_LOG_RETRIES, retries);
    }

    /**
     * Returns the value stored under {@link MongoProperties#FIELD_CRAWL_LOG_COMPRESSABILITY}.
     * @return the compressability
     * @throws NullPointerException if no compressability has been stored
     */
    @Override
    public double getCompressability() {
        return getNumber(MongoProperties.FIELD_CRAWL_LOG_COMPRESSABILITY).doubleValue();
    }

    /**
     * Stores the compressability in the backing DBObject.
     * @param compressability the compressability
     */
    public void setCompressability(double compressability) {
        dbo.put(MongoProperties.FIELD_CRAWL_LOG_COMPRESSABILITY, compressability);
    }

    /**
     * Stores the raw log line in the backing DBObject. This is the line that the
     * column-based getters parse.
     *
     * <p>Columns already parsed from a previous line stay cached; this method does not
     * reset them.
     *
     * @param line the raw log line
     */
    public void setLogLine(String line) {
        dbo.put(MongoProperties.FIELD_CRAWL_LOG_LINE, line);
    }

    /**
     * Returns the raw log line stored under {@link MongoProperties#FIELD_CRAWL_LOG_LINE}.
     * @return the raw log line, or {@code null} if none has been stored
     */
    @Override
    public String toString() {
        return (String) dbo.get(MongoProperties.FIELD_CRAWL_LOG_LINE);
    }

}