package focusedCrawler.memex.cdr; import java.io.Serializable; import java.util.ArrayList; import java.util.Date; import java.util.List; import org.apache.commons.codec.digest.DigestUtils; import org.apache.tika.mime.MediaType; import com.fasterxml.jackson.annotation.JsonFormat; import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonInclude.Include; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; /** * Represents a web page according the CDRv3 schema. */ @SuppressWarnings("serial") @JsonInclude(Include.NON_NULL) public class CDR3Document implements Serializable { public static class CDR3MediaObject { @JsonProperty("obj_original_url") String objOriginalUrl; @JsonProperty("obj_stored_url") String objStoredUrl; @JsonProperty("content_type") private String contentType; @JsonProperty("timestamp_crawl") @JsonFormat(shape=JsonFormat.Shape.STRING, pattern="yyyy-MM-dd'T'HH:mm:ss'Z'") private Date timestampCrawl; } private String _id; @JsonProperty("content_type") private String contentType; @JsonProperty("crawler") private String crawler; @JsonProperty("objects") private List<CDR3MediaObject> objects; @JsonProperty("raw_content") private String rawContent; @JsonProperty("team") private String team; @JsonProperty("timestamp_crawl") @JsonFormat(shape=JsonFormat.Shape.STRING, pattern="yyyy-MM-dd'T'HH:mm:ss'Z'") private Date timestampCrawl; @JsonProperty("timestamp_index") @JsonFormat(shape=JsonFormat.Shape.STRING, pattern="yyyy-MM-dd'T'HH:mm:ss'Z'") private Date timestampIndex; @JsonProperty("url") private String url; @JsonProperty("version") private final float version = 3.0f; public CDR3Document() { // required from JSON deserialization } public CDR3Document(Builder builder) { this._id = builder._id; this.contentType = builder.contentType; this.crawler = builder.crawler; this.objects = builder.objects; this.rawContent = builder.rawContent; this.team = builder.team; this.timestampCrawl = builder.timestampCrawl; this.timestampIndex = builder.timestampIndex; this.url = builder.url; } public String getUrl() { return url; } public Date getTimestampCrawl() { return timestampCrawl; } public Date getTimestampIndex() { return timestampIndex; } public String getTeam() { return team; } public String getCrawler() { return crawler; } public String getRawContent() { return rawContent; } public String getContentType() { return contentType; } @JsonIgnore public String getId() { return this._id; } public float getVersion() { return version; } public static class Builder { private static final TikaExtractor extractor = new TikaExtractor(); private static final ObjectMapper jsonMapper = new ObjectMapper(); private String _id; private String contentType; private String crawler; private List<CDR3MediaObject> objects; private String rawContent; private String team; private Date timestampCrawl; private Date timestampIndex; private String url; public CDR3Document build() { if (this.url == null) throw new IllegalArgumentException("Field 'url' is mandatory"); if (this.rawContent == null) throw new IllegalArgumentException("Field 'raw_content' is mandatory"); if (this.crawler == null) throw new IllegalArgumentException("Field 'crawler' is mandatory"); if (this.team == null) throw new IllegalArgumentException("Field 'team' is mandatory"); if (this.timestampIndex == null) throw new IllegalArgumentException("Field 'timestampIndex' is mandatory"); if(this.contentType == null) { MediaType mediaType = extractor.detect(this.rawContent, this.url, this.contentType); this.contentType = mediaType.getBaseType().toString(); } if(this.objects == null) { this.objects = new ArrayList<>(); } if (this._id == null) { // auto-generate _id field this._id = computeId(); } return new CDR3Document(this); } public String buildAsJson() throws JsonProcessingException { return jsonMapper.writeValueAsString(this.build()); } private String computeId() { StringBuilder textForId = new StringBuilder(); textForId.append(this.url); textForId.append("-"); textForId.append(this.timestampCrawl); return DigestUtils.sha256Hex(textForId.toString()).toUpperCase(); } public Builder setId(String id) { this._id = id; return this; } public Builder setUrl(String url) { this.url = url; return this; } public Builder setTimestampCrawl(Date timestampCrawl) { this.timestampCrawl = timestampCrawl; return this; } public Builder setTimestampIndex(Date timestampIndex) { this.timestampIndex = timestampIndex; return this; } public Builder setTeam(String team) { this.team = team; return this; } public Builder setCrawler(String crawler) { this.crawler = crawler; return this; } public Builder setRawContent(String rawContent) { this.rawContent = rawContent; return this; } public Builder setContentType(String contentType) { this.contentType = contentType; return this; } public Builder setObjects(List<CDR3MediaObject> mediaObjects) { this.objects = mediaObjects; return this; } } }