package focusedCrawler.tools;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonInclude.Include;
import focusedCrawler.target.model.TargetModelCbor;
import focusedCrawler.util.parser.PaginaURL;
/**
 * Plain data holder describing one crawled page: its URL, a timestamp, the
 * producing team and crawler, the raw page content with its content type, and
 * the list of images found in the page.
 *
 * <p>The class is annotated with {@code @JsonInclude(Include.NON_NULL)}, so
 * fields that are {@code null} are left out when Jackson serializes an instance.
 * Field names use snake_case and are public; they are kept as-is because they
 * define the serialized property names.
 */
@JsonInclude(Include.NON_NULL)
public class MemexCrawlSchema {

    /** Address of the crawled page. */
    public String url;

    /** Time associated with the page; the unit is decided by whoever builds the instance. */
    public long timestamp;

    /** Name of the team that produced the record. */
    public String team;

    /** Name of the crawler that produced the record. */
    public String crawler;

    /** Page content exactly as it was stored. */
    public String raw_content;

    /** Content type of {@link #raw_content}. */
    public String content_type;

    /** Images collected from the page. */
    public List<String> images;

    // public List<String> videos;
    // public CrawlData crawl_data = new CrawlData();

    /**
     * Creates a record from explicit values. Every argument is stored as given,
     * without validation or copying.
     *
     * @param url          address of the crawled page
     * @param timestamp    time associated with the page
     * @param team         name of the team that produced the record
     * @param crawler      name of the crawler that produced the record
     * @param raw_content  page content
     * @param content_type content type of {@code raw_content}
     * @param images       images found in the page; the same list instance is kept
     */
    public MemexCrawlSchema(String url, long timestamp, String team, String crawler,
                            String raw_content, String content_type, List<String> images) {
        this.url = url;
        this.timestamp = timestamp;
        this.team = team;
        this.crawler = crawler;
        this.raw_content = raw_content;
        this.content_type = content_type;
        this.images = images;
    }

    /**
     * Creates a record from a stored {@link TargetModelCbor} page.
     *
     * <p>The team is fixed to {@code "NYU"}, the crawler to {@code "ACHE"} and the
     * content type to {@code "text/html"}. The raw content is the string form of
     * the {@code "body"} entry of the model's response, and the image list is
     * extracted by parsing that content with {@link PaginaURL}.
     *
     * @param model the stored page to convert
     * @throws IllegalArgumentException if {@code model.url} is not a well-formed
     *         URL; the underlying {@link MalformedURLException} is attached as
     *         the cause
     */
    public MemexCrawlSchema(TargetModelCbor model) {
        this.url = model.url;
        // NOTE(review): presumably converts seconds to milliseconds - confirm
        // against the unit TargetModelCbor uses for its timestamp.
        this.timestamp = model.timestamp * 1000;
        this.team = "NYU";
        this.crawler = "ACHE";
        this.content_type = "text/html";
        this.raw_content = model.response.get("body").toString();

        // Named differently from the 'url' field to avoid shadowing it.
        URL parsedUrl;
        try {
            parsedUrl = new URL(model.url);
        } catch (MalformedURLException e) {
            // Keep the original exception as the cause so the reason the URL
            // was rejected is not lost from the stack trace.
            throw new IllegalArgumentException("page has an invalid URL: " + model.url, e);
        }

        PaginaURL pageParser = new PaginaURL(parsedUrl, this.raw_content);
        // Copy the parser's result so this record owns its own list.
        this.images = new ArrayList<String>(pageParser.getImages());
        // this.crawl_data.html_title = pageParser.titulo();
    }

    /**
     * Extra crawl metadata. Within this class it is only referenced from
     * commented-out code.
     */
    static class CrawlData {
        /** Title of the HTML page. */
        public String html_title;
    }
}