package focusedCrawler.target.model;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import com.google.common.net.InternetDomainName;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.extractors.DefaultExtractor;
import focusedCrawler.util.parser.PaginaURL;
/**
 * Plain data holder describing a crawled page in the shape that is indexed
 * into Elasticsearch. It can be populated from a live {@link Page} or from a
 * {@link TargetModelCbor} record, and exposes bean-style getters/setters so
 * that it can be (de)serialized by reflection-based mappers.
 */
public class TargetModelElasticSearch {

    private String domain;
    private String url;
    private String title;
    private String text;
    private Date retrieved;
    private String[] words;
    private String[] wordsMeta;
    private String topPrivateDomain;
    private String html;
    /** Either {@code "relevant"} or {@code "irrelevant"} when built from a {@link Page}. */
    private String isRelevant;
    private double relevance;

    /**
     * No-argument constructor; mandatory for object deserialization.
     */
    public TargetModelElasticSearch() {
        // mandatory for object deserialization
    }

    /**
     * Builds the model from a crawled page. The retrieval time is set to the
     * moment this constructor runs.
     *
     * @param page the page to copy URL, content, parsed data and relevance from
     */
    public TargetModelElasticSearch(Page page) {
        // Read the content once and reuse it for both the raw HTML and the text extraction.
        String content = page.getContentAsString();

        this.url = page.getURL().toString();
        this.retrieved = new Date();
        this.domain = page.getDomainName();
        this.html = content;
        this.words = page.getParsedData().getWords();
        this.wordsMeta = page.getParsedData().getWordsMeta();
        this.title = page.getParsedData().getTitle();
        this.isRelevant = page.getTargetRelevance().isRelevant() ? "relevant" : "irrelevant";
        this.relevance = page.getTargetRelevance().getRelevance();
        try {
            this.text = DefaultExtractor.getInstance().getText(content);
        } catch (BoilerpipeProcessingException e) {
            // Best effort: index the page with empty text when extraction fails.
            this.text = "";
        }
        this.topPrivateDomain = resolveTopPrivateDomain(page.getDomainName());
    }

    /**
     * Builds the model from a CBOR record by re-parsing its response body.
     * The {@code isRelevant} and {@code relevance} fields are not populated
     * by this constructor.
     *
     * @param model the CBOR record to convert
     * @throws IllegalArgumentException if {@code model.url} is not a valid URL
     */
    public TargetModelElasticSearch(TargetModelCbor model) {
        URL url;
        try {
            url = new URL(model.url);
        } catch (MalformedURLException e) {
            // Keep the original exception as the cause so the parse failure is not lost.
            throw new IllegalArgumentException("page has an invalid URL: " + model.url, e);
        }
        String rawContent = (String) model.response.get("body");
        Page page = new Page(url, rawContent);
        page.setParsedData(new ParsedData(new PaginaURL(url, rawContent)));
        this.html = rawContent;
        this.url = model.url;
        // The timestamp is scaled by 1000 (presumably seconds -> milliseconds; confirm
        // against TargetModelCbor). The long literal prevents int overflow.
        this.retrieved = new Date(model.timestamp * 1000L);
        this.words = page.getParsedData().getWords();
        this.wordsMeta = page.getParsedData().getWordsMeta();
        this.title = page.getParsedData().getTitle();
        this.domain = url.getHost();
        try {
            this.text = DefaultExtractor.getInstance().getText(page.getContentAsString());
        } catch (Exception e) {
            // Best effort: any extraction failure results in empty text.
            this.text = "";
        }
        this.topPrivateDomain = resolveTopPrivateDomain(page.getDomainName());
    }

    /**
     * Computes the top private domain for a host name.
     *
     * @param host the host name to analyse; may be {@code null}
     * @return the top private domain when the host is under a public suffix;
     *         otherwise the normalized domain name; or {@code host} itself
     *         when it is {@code null} or is rejected by
     *         {@link InternetDomainName#from(String)}
     */
    private static String resolveTopPrivateDomain(String host) {
        if (host == null) {
            return null;
        }
        InternetDomainName domainName;
        try {
            domainName = InternetDomainName.from(host);
        } catch (IllegalArgumentException e) {
            // Hosts that are not valid domain names (e.g. IP literals) are rejected
            // by Guava; fall back to the raw host instead of failing construction.
            return host;
        }
        if (domainName.isUnderPublicSuffix()) {
            return domainName.topPrivateDomain().toString();
        }
        return domainName.toString();
    }

    /** @return the domain (host) name of the page */
    public String getDomain() {
        return domain;
    }

    /** @param domain the domain (host) name of the page */
    public void setDomain(String domain) {
        this.domain = domain;
    }

    /** @return the page URL as a string */
    public String getUrl() {
        return url;
    }

    /** @param url the page URL as a string */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the text extracted from the page content, or an empty string if extraction failed */
    public String getText() {
        return text;
    }

    /** @param text the text extracted from the page content */
    public void setText(String text) {
        this.text = text;
    }

    /** @return the retrieval time of the page */
    public Date getRetrieved() {
        return retrieved;
    }

    /** @param retrieved the retrieval time of the page */
    public void setRetrieved(Date retrieved) {
        this.retrieved = retrieved;
    }

    /** @return the words obtained from the page's parsed data */
    public String[] getWords() {
        return words;
    }

    /** @param words the words obtained from the page's parsed data */
    public void setWords(String[] words) {
        this.words = words;
    }

    /** @return the page title */
    public String getTitle() {
        return title;
    }

    /** @param title the page title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the meta words obtained from the page's parsed data */
    public String[] getWordsMeta() {
        return wordsMeta;
    }

    /** @param wordsMeta the meta words obtained from the page's parsed data */
    public void setWordsMeta(String[] wordsMeta) {
        this.wordsMeta = wordsMeta;
    }

    /** @return the top private domain of the page's host */
    public String getTopPrivateDomain() {
        return topPrivateDomain;
    }

    /** @param topPrivateDomain the top private domain of the page's host */
    public void setTopPrivateDomain(String topPrivateDomain) {
        this.topPrivateDomain = topPrivateDomain;
    }

    /** @return the raw page content */
    public String getHtml() {
        return html;
    }

    /** @param html the raw page content */
    public void setHtml(String html) {
        this.html = html;
    }

    /** @return the relevance label of the page */
    public String getIsRelevant() {
        return isRelevant;
    }

    /** @param isRelevant the relevance label of the page */
    public void setIsRelevant(String isRelevant) {
        this.isRelevant = isRelevant;
    }

    /** @return the relevance score of the page */
    public double getRelevance() {
        return relevance;
    }

    /** @param relevance the relevance score of the page */
    public void setRelevance(double relevance) {
        this.relevance = relevance;
    }
}