/*
* Copyright 2011 Marek Pilecky
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.github.mefi.jkuuza.crawler;
import com.github.mefi.jkuuza.app.db.DbConnector;
import com.github.mefi.jkuuza.crawler.gui.CrawlerConsole;
import com.github.mefi.jkuuza.model.BodyContent;
import com.github.mefi.jkuuza.model.Page;
import com.github.mefi.jkuuza.model.CrawledPageController;
import com.github.mefi.jkuuza.parser.ContentExtractor;
import java.io.UnsupportedEncodingException;
import java.net.SocketTimeoutException;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.niocchi.core.Crawler;
import org.niocchi.core.Worker;
import com.github.mefi.jkuuza.parser.LinksExtractor;
import org.niocchi.core.MemoryResource;
import org.niocchi.core.Query;
/**
 * Niocchi {@link Worker} that persists crawled HTML pages into the database
 * and feeds links pointing back to the crawled host into the URL pool so the
 * crawl can continue.
 *
 * @author Marek Pilecky
 */
public class DbSaveWorker extends Worker {

    TimeoutURLPool pool = null;
    private final DbConnector connector;

    /**
     * Creates a worker bound to the given crawler, URL pool and DB connector.
     *
     * @param crawler the owning niocchi crawler
     * @param pool URL pool used as the sink for newly discovered internal links
     * @param connector provider of the database connection used for saving
     */
    public DbSaveWorker(Crawler crawler, TimeoutURLPool pool, DbConnector connector) {
        super(crawler);
        this.pool = pool;
        this.connector = connector;
    }

    /**
     * Saves crawled content into the db and enqueues internal links for
     * further crawling. Resources with a null MIME subtype are ignored;
     * non-HTML subtypes are only echoed to stdout.
     *
     * @param query the finished crawl query holding the fetched resource
     */
    public void processResource(Query query) {
        String mimeSubType = query.getResource().getContentMimeSubType();
        if (mimeSubType == null) {
            return;
        }
        if (!mimeSubType.contains("html")) {
            System.out.println(mimeSubType);
            return;
        }

        String html = getCrawledHtml(query);
        String url = query.getOriginalURL().toString();
        String host = query.getHost();

        // "http://example.com" splits into only 3 parts — append the missing
        // trailing slash so the base-URL computation below is well defined.
        if (url.split("/").length < 4) {
            url = url + "/";
        }
        String baseUrl = url.substring(0, url.lastIndexOf('/') + 1);

        Document doc = Jsoup.parse(html, baseUrl);

        // Extract links pointing back to the host and add them into the URL
        // pool. The pool cast is loop-invariant, so it is hoisted out.
        LinksExtractor extractor = new LinksExtractor(doc);
        ExpandableURLPool expPool = (ExpandableURLPool) pool.getUrlPool();
        for (String link : extractor.getInternalLinks(host)) {
            expPool.addURL(link);
        }

        Page page = createPage(query, doc, extractor, host);
        BodyContent bodyContent =
                new BodyContent(page.getUrl(), doc.body().toString(), doc.body().text());

        CrawledPageController controller = new CrawledPageController(connector.getConnection());
        try {
            controller.save(page, bodyContent);
            CrawlerConsole.print("[crawled] - " + query.getOriginalURL().toString() + " [" + query.getStatus() + "]");
        } catch (SocketTimeoutException ex) {
            CrawlerConsole.print("[error] - " + query.getOriginalURL().toString() + " [DB RESPONSE ERROR]");
        }
    }

    /**
     * Builds a {@link Page} for the crawled document, copying the meta
     * description, keywords and charset over when they are present.
     *
     * @param query source of the original URL
     * @param doc parsed document of the crawled page
     * @param extractor link extractor used only to canonize the host name
     * @param host host the page was crawled from
     * @return populated page model ready to be saved
     */
    private Page createPage(Query query, Document doc, LinksExtractor extractor, String host) {
        ContentExtractor contentExtractor = new ContentExtractor(doc);
        Page page = new Page(query.getOriginalURL().toString(), extractor.canonizeHost(host));
        if (contentExtractor.hasMetaDescription()) {
            page.setDescription(contentExtractor.getMetaDescription());
        }
        if (contentExtractor.hasMetaKeywords()) {
            page.setKeywords(contentExtractor.getMetaKeywords());
        }
        if (contentExtractor.hasMetaCharset()) {
            page.setCharset(contentExtractor.getMetaCharset());
        }
        return page;
    }

    /**
     * Extracts html code from Query and returns it as a string.
     * The charset is taken from the HTTP headers when available, otherwise
     * from the page's own meta charset declaration. If neither is present,
     * or the declared charset isn't supported by the JVM, the platform
     * default encoding is used.
     *
     * @param query the crawl query whose resource bytes are decoded
     * @return html - String with html from webpage
     */
    public String getCrawledHtml(Query query) {
        MemoryResource resource = (MemoryResource) query.getResource();
        byte[] bytes = resource.getBytes();

        String charset = query.getResource().getContentEncoding();
        if (charset == null || charset.isEmpty()) {
            // No charset in the HTTP headers: decode with the platform default
            // just to locate a <meta> charset declaration in the markup.
            ContentExtractor contentExtractor = new ContentExtractor(Jsoup.parse(new String(bytes)));
            if (contentExtractor.hasMetaCharset()) {
                charset = contentExtractor.getMetaCharset();
            }
        }

        if (charset == null || charset.isEmpty()) {
            // Neither headers nor markup declared an encoding — use the
            // platform default. (The original forced this path by decoding
            // with "" and catching UnsupportedEncodingException.)
            return new String(bytes);
        }
        try {
            return new String(bytes, charset);
        } catch (UnsupportedEncodingException e) {
            // Declared charset is unknown to this JVM; fall back to default.
            return new String(bytes);
        }
    }
}