package focusedCrawler.target.model;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.tika.metadata.Metadata;
import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.target.classifier.TargetRelevance;
@SuppressWarnings("serial")
public class Page implements Serializable {
private URL url;
private URL redirectedURL;
private byte[] content;
private String contentType;
private Map<String, List<String>> responseHeaders;
private long fetchTime;
private LinkRelevance linkRelevance;
private ParsedData parsedData;
private TargetRelevance targetRelevance;
private boolean auth = false;
public Page() {
// required for JSON serialization
}
public Page(URL url, String content) {
this(url, content.getBytes(), null, null);
}
public Page(URL url, String content, Map<String, List<String>> responseHeaders) {
this(url, content.getBytes(), responseHeaders, null);
}
public Page(URL url, byte[] content, Map<String, List<String>> responseHeaders, URL redirectedURL) {
this.url = url;
this.content = content;
this.redirectedURL = redirectedURL;
if (responseHeaders != null) {
this.responseHeaders = responseHeaders;
this.contentType = extractContentType(responseHeaders);
}
}
public Page(TargetModelCbor target) throws MalformedURLException {
this.url = new URL(target.url);
this.content = ((String) target.response.get("body")).getBytes();
this.fetchTime = target.timestamp * 1000;
}
public Page(TargetModelJson target) throws MalformedURLException {
this.url = new URL(target.getUrl());
this.redirectedURL = new URL(target.getRedirectedUrl());
this.content = target.getContent();
this.responseHeaders = target.getResponseHeaders();
this.fetchTime = target.getFetchTime();
this.contentType = target.getContentType();
}
public Page(FetchedResult fetchedResult) throws MalformedURLException {
this.url = new URL(fetchedResult.getBaseUrl());
this.content = fetchedResult.getContent();
this.fetchTime = fetchedResult.getFetchTime();
if (fetchedResult.getNumRedirects() > 0) {
this.redirectedURL = new URL(fetchedResult.getFetchedUrl());
}
parseResponseHeaders(fetchedResult.getHeaders());
}
public static String extractContentType(Map<String, List<String>> responseHeaders) {
for (Entry<String, List<String>> header : responseHeaders.entrySet()) {
if ("content-type".compareToIgnoreCase(header.getKey()) == 0) {
List<String> values = header.getValue();
if (!values.isEmpty()) {
return values.get(0);
}
}
}
return null;
}
private void parseResponseHeaders(Metadata headerAsMetadata) {
Map<String, List<String>> responseHeaders = new HashMap<>();
String[] names = headerAsMetadata.names();
if(names != null && names.length > 0) {
for(String name : names) {
List<String> values = Arrays.asList(headerAsMetadata.getValues(name));
if(values.isEmpty()) {
continue;
}
responseHeaders.put(name, values);
if("content-type".compareToIgnoreCase(name) == 0) {
this.contentType = values.get(0);
}
}
}
this.responseHeaders = responseHeaders;
}
public String getDomainName() {
String domain = url.getHost();
return domain.startsWith("www.") ? domain.substring(4) : domain;
}
public boolean isHub() {
if (linkRelevance != null) {
double relevance = linkRelevance.getRelevance();
return relevance > LinkRelevance.DEFAULT_HUB_RELEVANCE &&
relevance < LinkRelevance.DEFAULT_AUTH_RELEVANCE;
}
return false;
}
public boolean isAuth() {
return auth;
}
public void setAuth(boolean auth) {
this.auth = auth;
}
public URL getURL() {
return url;
}
public byte[] getContent() {
return content;
}
/*
* Warning: using this method for non-textual mime-types might cause data corruption.
*/
public String getContentAsString() {
return new String(content);
}
public void setContent(byte[] content) {
this.content = content;
}
public String getContentType() {
return contentType;
}
public void setContentType(String contentType) {
this.contentType = contentType;
}
public URL getRedirectedURL() {
return redirectedURL;
}
public Map<String, List<String>> getResponseHeaders() {
return responseHeaders;
}
public long getFetchTime() {
return fetchTime;
}
public void setFetchTime(long fetchTime) {
this.fetchTime = fetchTime;
}
public LinkRelevance getLinkRelevance() {
return linkRelevance;
}
public void setLinkRelevance(LinkRelevance linkRelevance) {
this.linkRelevance = linkRelevance;
}
public ParsedData getParsedData() {
return parsedData;
}
public void setParsedData(ParsedData parsedData) {
this.parsedData = parsedData;
}
public TargetRelevance getTargetRelevance() {
return targetRelevance;
}
public void setTargetRelevance(TargetRelevance targetRelevance) {
this.targetRelevance = targetRelevance;
}
}