package com.cse10.crawler;
import com.cse10.database.HibernateUtil;
import org.apache.log4j.Logger;
import org.hibernate.Session;
import org.hibernate.Transaction;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
/**
 * Created by sampath liyanage on 8/31/14.
 * edited by Tharindu on 2014-10-29.
 * <p>
 * Gives resuming support for the crawler: works out the date a crawl should
 * continue from, based on the newest {@code created_date} stored in an
 * article table, and removes the rows of that (possibly unfinished) date.
 */
public class DateHandler {

    private static final Logger logger = Logger.getLogger(DateHandler.class);

    /* tables whose sources are crawled a month at a time */
    private static final String HIRU_NEWS_TABLE = "article_hiru_news";
    private static final String NEW_YORK_TIMES_TABLE = "article_new_york_times";

    /**
     * Determines the date from which crawling should resume for a table.
     * <p>
     * If the table holds no rows, or its newest {@code created_date} is
     * earlier than {@code startingDate}, {@code startingDate} is returned and
     * nothing is deleted. Otherwise every row with
     * {@code created_date >=} the resume date is deleted (that date may have
     * been crawled only partially) and the resume date is returned. For the
     * tables crawled a month at a time the resume date is moved back to the
     * first day of its month before the comparison and the delete.
     *
     * @param startingDate earliest date the crawl may start from; must not be null
     * @param tableName    name of the article table to inspect. It is
     *                     concatenated into the SQL text, so it must come from
     *                     trusted code and never from user input.
     * @return {@code startingDate}, or the date whose rows were deleted and
     *         must be crawled again
     */
    public static Date getFromDateToResume(Date startingDate, String tableName) {
        Date latestDateCrawled = findLatestCrawledDate(tableName);
        if (latestDateCrawled == null) { // table is empty
            return startingDate;
        }

        if (isCrawledMonthAtATime(tableName)) {
            latestDateCrawled = firstDayOfMonth(latestDateCrawled);
        }

        if (latestDateCrawled.compareTo(startingDate) < 0) { // starting date > latest date
            return startingDate;
        }

        /* delete the latest date because it may be unfinished */
        deleteArticlesFrom(tableName, latestDateCrawled);
        logger.info("DELETED unfinished date");
        return latestDateCrawled;
    }

    /* Returns max(created_date) of the table, or null when the table has no rows. */
    private static Date findLatestCrawledDate(String tableName) {
        Session session = HibernateUtil.getSessionFactory().openSession();
        try {
            List<?> results = session.createSQLQuery("select max(created_date) from " + tableName).list();
            // an aggregate query yields one row; the guard only avoids an obscure failure if it does not
            return results.isEmpty() ? null : (Date) results.get(0);
        } finally {
            session.close(); // always release the session, also when the query fails
        }
    }

    /* Tells whether the table belongs to a source that is crawled a month at a time. */
    private static boolean isCrawledMonthAtATime(String tableName) {
        // equals(), not ==: the name may be a runtime-built string that is not interned
        return HIRU_NEWS_TABLE.equals(tableName) || NEW_YORK_TIMES_TABLE.equals(tableName);
    }

    /* Returns the given date with its day-of-month set to 1; the other fields are kept. */
    private static Date firstDayOfMonth(Date date) {
        Calendar cal = Calendar.getInstance();
        cal.setTime(date);
        cal.set(Calendar.DATE, 1);
        return cal.getTime();
    }

    /* Deletes, in its own transaction, every row of the table with created_date >= fromDate. */
    private static void deleteArticlesFrom(String tableName, Date fromDate) {
        Session session = HibernateUtil.getSessionFactory().openSession();
        Transaction transaction = null;
        try {
            transaction = session.beginTransaction();
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
            // logically only the equivalence is significant here (except for the monthly tables)
            String q = "delete from " + tableName + " where created_date >= '" + sdf.format(fromDate) + "'";
            session.createSQLQuery(q).executeUpdate();
            transaction.commit();
        } catch (RuntimeException e) {
            if (transaction != null) {
                try {
                    transaction.rollback();
                } catch (RuntimeException rollbackFailure) {
                    // keep the original failure as the one that propagates
                    logger.error("Rollback failed after delete error on " + tableName, rollbackFailure);
                }
            }
            throw e;
        } finally {
            session.close();
        }
    }
}