package focusedCrawler.memex.cdr; import java.io.Serializable; import java.util.Map; import org.apache.commons.codec.digest.DigestUtils; import org.apache.tika.mime.MediaType; import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonInclude.Include; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import focusedCrawler.memex.cdr.TikaExtractor.ParsedData; import com.fasterxml.jackson.annotation.JsonProperty; /** { 'url': <full URL of the web page>, 'timestamp': <timestamp for data when scraped, in epoch milliseconds>, 'team': <name of crawling team>, 'crawler': <name of crawler; each type of crawler should have a distinct name or reference>, 'raw_content': <full data of raw crawled page; source page that can be reproduced for system purposes (e.g. provenance, law enforcement evidence)>, 'content_type': <mime-type of data in stored in raw_content>, 'crawl_data': <source page from crawler that extracts full text but not full layout; full-text>, 'extracted_metadata': { // Metadata extracted by Tika/other extractors }, ‘extracted_text’: { // Text extracted from the document if applicable for that mime type } } */ @SuppressWarnings("serial") @JsonInclude(Include.NON_NULL) public class CDR2Document implements Serializable { private String _id; private String url; private long timestamp; private String team; private String crawler; @JsonProperty("raw_content") private String rawContent; @JsonProperty("content_type") private String contentType; @JsonProperty("crawl_data") private Object crawlData; @JsonProperty("extracted_metadata") private Map<String, String> extractedMetadata; @JsonProperty("extracted_text") private String extractedText; private String version; public CDR2Document() { } public CDR2Document(Builder builder) { this._id = builder._id; this.url = builder.url; this.timestamp = builder.timestamp; this.team = builder.team; this.crawler = builder.crawler; this.rawContent = builder.rawContent; this.contentType = builder.contentType; this.crawlData = builder.crawlData; this.extractedMetadata = builder.extractedMetadata; this.extractedText = builder.extractedText; this.version = builder.version; } public String getUrl() { return url; } public long getTimestamp() { return timestamp; } public String getTeam() { return team; } public String getCrawler() { return crawler; } public String getRawContent() { return rawContent; } public String getContentType() { return contentType; } public Object getCrawlData() { return crawlData; } public Map<String, String> getExtractedMetadata() { return extractedMetadata; } public String getExtractedText() { return extractedText; } @JsonIgnore public String getId() { return this._id; } public String getVersion() { return version; } public static class Builder { private static final TikaExtractor extractor = new TikaExtractor(); private static final ObjectMapper jsonMapper = new ObjectMapper(); private String _id; private String url; private long timestamp; private String team; private String crawler; private String rawContent; private String contentType; private Object crawlData; private Map<String, String> extractedMetadata; private String extractedText; private String version; public CDR2Document build() { if (this.url == null) throw new IllegalArgumentException("Field 'url' is mandatory"); if (this.rawContent == null) throw new IllegalArgumentException("Field 'raw_content' is mandatory"); if (this.crawler == null) throw new IllegalArgumentException("Field 'crawler' is mandatory"); if (this.team == null) throw new IllegalArgumentException("Field 'team' is mandatory"); if (this.version == null) throw new IllegalArgumentException("Field 'version' is mandatory"); if (this.timestamp == 0) throw new IllegalArgumentException("Field 'timestamp' is mandatory"); if(this.contentType == null) { MediaType mediaType = extractor.detect(this.rawContent, this.url, this.contentType); this.contentType = mediaType.getBaseType().toString(); } if (this.extractedMetadata == null || this.extractedText == null) { MediaType mediaType = MediaType.parse(this.contentType); if(mediaType.getBaseType().equals(MediaType.TEXT_HTML)) { // auto-generate extracted_metadata field using Tika ParsedData parsedData = extractor.parse(this.rawContent, this.url, this.contentType); if (this.extractedMetadata == null && parsedData != null) { this.extractedMetadata = parsedData.getMetadata(); } // auto-generate extracted_text field using Tika if (this.extractedText == null && parsedData != null) { this.extractedText = parsedData.getPlainText(); } } } if (this._id == null) { // auto-generate _id field this._id = computeId(); } return new CDR2Document(this); } public String buildAsJson() throws JsonProcessingException { return jsonMapper.writeValueAsString(this.build()); } private String computeId() { StringBuilder textForId = new StringBuilder(); textForId.append(this.url); textForId.append("-"); textForId.append(this.timestamp); return DigestUtils.sha256Hex(textForId.toString()).toUpperCase(); } public Builder setId(String id) { this._id = id; return this; } public Builder setUrl(String url) { this.url = url; return this; } public Builder setTimestamp(long timestamp) { this.timestamp = timestamp; return this; } public Builder setTeam(String team) { this.team = team; return this; } public Builder setCrawler(String crawler) { this.crawler = crawler; return this; } public Builder setRawContent(String rawContent) { this.rawContent = rawContent; return this; } public Builder setContentType(String contentType) { this.contentType = contentType; return this; } public Builder setCrawlData(Object crawlData) { this.crawlData = crawlData; return this; } public Builder setExtractedMetadata(Map<String, String> extractedMetadata) { this.extractedMetadata = extractedMetadata; return this; } public Builder setExtractedText(String extractedText) { this.extractedText = extractedText; return this; } public Builder setVersion(String version) { this.version = version; return this; } } }