package guang.crawler.siteManager.daemon;
import guang.crawler.commons.WebURL;
import guang.crawler.siteManager.SiteConfig;
import guang.crawler.siteManager.SiteManager;
import guang.crawler.siteManager.jobQueue.MapQueue;
import guang.crawler.siteManager.jobQueue.MapQueueIterator;
import guang.crawler.siteManager.urlFilter.ObjectFilter;
import java.io.EOFException;
import java.io.IOException;
import java.net.URI;
import java.util.TimerTask;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.alibaba.fastjson.JSON;
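/**
 * A timer task that periodically backs up part of the site manager's state.
 * All other work should be paused while a backup is in progress, to avoid
 * inconsistencies.
 *
 * @author sun
 *
 */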
public class SiteBackupDaemon extends TimerTask {
    /** Reports backup failures instead of dropping them silently. */
    private static final Logger LOG = Logger.getLogger(SiteBackupDaemon.class.getName());

    /**
     * Creates a new backup daemon. The instance has no file system attached
     * until {@link #init()} is called.
     *
     * @return a fresh, uninitialized daemon
     */
    public static SiteBackupDaemon newDaemon() {
        return new SiteBackupDaemon();
    }

    /**
     * Builds the backup root directory for the site handled by this manager:
     * {@code <hadoopPath>/<siteToHandle>/backup}.
     *
     * @param config the site configuration to read the path components from
     * @return the backup root directory as a string
     */
    private static String backupRootDir(final SiteConfig config) {
        return config.getHadoopPath() + "/"
                + config.getSiteManagerInfo().getSiteToHandle() + "/backup";
    }

    /**
     * Serializes {@link #run()} (and therefore {@link #forceBackup()}) and
     * {@link #clearBackups()} so that two threads never write or delete the
     * same backup files at the same time.
     */
    private final Object backupLock = new Object();

    /** File system the backups live on; assigned by {@link #init()}. */
    private FileSystem fileSystem;

    private SiteBackupDaemon() {
    }

    /**
     * Backs up the URL filter by writing its backup string to a single file.
     *
     * NOTE(review): {@code writeUTF} is specified by {@code java.io.DataOutput}
     * to reject strings whose encoded form exceeds 65535 bytes. Confirm that
     * filter backups stay below that limit, or change the on-disk format here
     * and in {@link #readFilter(ObjectFilter, String)} together.
     *
     * @param filter the filter to back up
     * @param path   destination file
     * @throws IOException if the file cannot be created or written
     */
    private void backupFilter(final ObjectFilter filter, final String path)
            throws IOException {
        FSDataOutputStream fsout = this.fileSystem.create(new Path(path));
        try {
            fsout.writeUTF(filter.toBackupString());
        } finally {
            fsout.close();
        }
    }

    /**
     * Backs up one URL queue: every URL is serialized to JSON and written as
     * one {@code writeUTF} record.
     *
     * @param listToBackup the queue to back up
     * @param path         destination file
     * @throws IOException if the file cannot be created or written
     */
    private void backupList(final MapQueue<WebURL> listToBackup,
            final String path) throws IOException {
        MapQueueIterator<WebURL> iterator = listToBackup.iterator();
        try {
            FSDataOutputStream fsout = this.fileSystem.create(new Path(path));
            try {
                while (iterator.hasNext()) {
                    fsout.writeUTF(JSON.toJSONString(iterator.next()));
                }
            } finally {
                fsout.close();
            }
        } finally {
            // Outer finally: the iterator is released even when closing the
            // output stream itself fails.
            iterator.close();
        }
    }

    /**
     * Removes all backup data. Backups should be cleared once a site has been
     * fully crawled; otherwise restarting that site may run into problems.
     * Failures are logged and the method returns normally.
     */
    public void clearBackups() {
        synchronized (this.backupLock) {
            SiteConfig config = SiteConfig.me();
            // Enter "backup time" first (per the original author's note: users
            // stop receiving new tasks and the cleaner thread pauses).
            config.setBackTime(true);
            try {
                String rootDir = backupRootDir(config);
                try {
                    Path path = new Path(rootDir);
                    if (this.fileSystem.exists(path)
                            && !this.fileSystem.delete(path, true)) {
                        LOG.log(Level.WARNING, "Could not delete backup directory " + rootDir);
                    }
                } catch (IOException e) {
                    LOG.log(Level.WARNING, "Failed to clear backups under " + rootDir, e);
                } catch (IllegalArgumentException e) {
                    LOG.log(Level.WARNING, "Invalid backup directory " + rootDir, e);
                }
            } finally {
                // Always leave "backup time", even when clearing failed.
                config.setBackTime(false);
            }
        }
    }

    /**
     * Runs a backup immediately on the calling thread. Because {@link #run()}
     * is serialized on an internal lock, this blocks until any backup already
     * in progress has finished, then performs its own.
     */
    public void forceBackup() {
        this.run();
    }

    /**
     * Initializes the daemon by opening the file system named by
     * {@code SiteConfig.me().getHadoopURL()}.
     *
     * @return this daemon, for call chaining
     * @throws IOException if the file system cannot be obtained
     */
    public SiteBackupDaemon init() throws IOException {
        Configuration configuration = new Configuration();
        URI hadoopUri = URI.create(SiteConfig.me().getHadoopURL());
        this.fileSystem = FileSystem.get(hadoopUri, configuration);
        return this;
    }

    /**
     * Loads the most recent backup, if there is one, into the site manager's
     * three task lists and its URL filter, and records the loaded version in
     * the configuration.
     *
     * @return {@code true} if a backup was found and loaded; {@code false} if
     *         there is no {@code max-version} file or its existence could not
     *         be checked
     * @throws IOException if reading the version file or any backup file fails
     */
    public boolean loadBackupData() throws IOException {
        SiteConfig config = SiteConfig.me();
        String rootDir = backupRootDir(config);
        Path maxVersionFilePath = new Path(rootDir + "/max-version");
        boolean exists;
        try {
            exists = this.fileSystem.exists(maxVersionFilePath);
        } catch (IOException e) {
            LOG.log(Level.WARNING, "Cannot check for " + maxVersionFilePath, e);
            return false;
        }
        if (!exists) {
            return false;
        }
        int version;
        FSDataInputStream fsin = this.fileSystem.open(maxVersionFilePath);
        try {
            version = fsin.readInt();
        } finally {
            fsin.close();
        }
        String backDir = rootDir + "/" + version;
        SiteManager siteManager = SiteManager.me();
        this.readBackupList(siteManager.getToDoTaskList(), backDir + "/todoList");
        this.readBackupList(siteManager.getWorkingTaskList(), backDir + "/workingList");
        this.readBackupList(siteManager.getFailedTaskList(), backDir + "/failedList");
        this.readFilter(siteManager.getUrlsFilter(), backDir + "/filter");
        config.setBackupVersion(version);
        return true;
    }

    /**
     * Reads a backed-up URL list: JSON records are read with {@code readUTF}
     * until end of file, and each decoded URL is put into {@code list}.
     *
     * @param list           the queue to fill
     * @param backupFilePath the backup file to read
     * @throws IOException if the file cannot be opened or read
     */
    private void readBackupList(final MapQueue<WebURL> list,
            final String backupFilePath) throws IOException {
        FSDataInputStream fsin = this.fileSystem.open(new Path(backupFilePath));
        try {
            while (true) {
                String urlJson;
                try {
                    urlJson = fsin.readUTF();
                } catch (EOFException endOfFile) {
                    // End of file is the normal loop terminator.
                    break;
                }
                list.put(JSON.parseObject(urlJson, WebURL.class));
            }
        } finally {
            fsin.close();
        }
    }

    /**
     * Reads the backed-up URL filter. An empty backup file leaves the filter
     * untouched.
     *
     * @param filter         the filter to restore into
     * @param backupFilePath the backup file to read
     * @throws IOException if the file cannot be opened or read
     */
    private void readFilter(final ObjectFilter filter,
            final String backupFilePath) throws IOException {
        FSDataInputStream fsin = this.fileSystem.open(new Path(backupFilePath));
        try {
            filter.fromBackupString(fsin.readUTF());
        } catch (EOFException emptyBackup) {
            // Nothing was stored; keep the filter as it is.
        } finally {
            fsin.close();
        }
    }

    /**
     * Puts every URL of the given list back into the to-do list, after
     * resetting its try count. The source list itself is not modified here.
     *
     * @param fromList the list whose URLs should be rescheduled
     */
    public void rescheduleTaskList(final MapQueue<WebURL> fromList) {
        if (fromList.getLength() <= 0) {
            return;
        }
        MapQueue<WebURL> todoList = SiteManager.me().getToDoTaskList();
        MapQueueIterator<WebURL> iterator = fromList.iterator();
        try {
            while (iterator.hasNext()) {
                todoList.put(iterator.next().resetTryTime());
            }
        } finally {
            iterator.close();
        }
    }

    /**
     * Timer entry point: performs one backup. The system is put into "backup
     * time" for the duration and is always taken out of it afterwards, whether
     * the backup succeeded or not. Failures are logged, not thrown.
     */
    @Override
    public void run() {
        synchronized (this.backupLock) {
            SiteConfig config = SiteConfig.me();
            // Enter "backup time" first (per the original author's note: users
            // stop receiving new tasks and the cleaner thread pauses).
            config.setBackTime(true);
            try {
                this.doBackup(config);
            } finally {
                // Without this, any failed step would leave the system
                // permanently in "backup time".
                config.setBackTime(false);
            }
        }
    }

    /**
     * Performs the backup steps: picks the next version number, (re)creates
     * its directory, writes the three task lists and the filter, then records
     * the version in the {@code max-version} file and in the configuration.
     *
     * @param config the site configuration
     */
    private void doBackup(final SiteConfig config) {
        // Locate (and create if needed) the backup root directory.
        String rootDir = backupRootDir(config);
        try {
            this.fileSystem.mkdirs(new Path(rootDir));
        } catch (IOException e) {
            LOG.log(Level.WARNING, "Backup aborted: cannot create " + rootDir, e);
            return;
        } catch (IllegalArgumentException e) {
            LOG.log(Level.WARNING, "Backup aborted: invalid directory " + rootDir, e);
            return;
        }

        // Determine the version number for this backup.
        Path maxVersionFilePath = new Path(rootDir + "/max-version");
        boolean exists;
        try {
            exists = this.fileSystem.exists(maxVersionFilePath);
        } catch (IOException e) {
            LOG.log(Level.WARNING, "Backup aborted: cannot check " + maxVersionFilePath, e);
            return;
        }
        int maxVersion = config.getBackupVersion();
        if (exists) {
            FSDataInputStream fsin = null;
            try {
                fsin = this.fileSystem.open(maxVersionFilePath);
                // Next version = larger of the stored and configured one, plus 1.
                maxVersion = Math.max(maxVersion, fsin.readInt()) + 1;
            } catch (IOException e) {
                LOG.log(Level.WARNING, "Backup aborted: cannot read " + maxVersionFilePath, e);
                return;
            } finally {
                if (fsin != null) {
                    try {
                        fsin.close();
                    } catch (IOException ignored) {
                        // The value has already been read (or the failure
                        // already reported); a close failure changes nothing.
                    }
                }
            }
        }

        // Start from an empty directory for this version.
        String backDir = rootDir + "/" + maxVersion;
        Path backDirPath = new Path(backDir);
        try {
            if (this.fileSystem.exists(backDirPath)) {
                this.fileSystem.delete(backDirPath, true);
            }
        } catch (IOException e) {
            // Best effort, as before: continue and let the writes below decide.
            LOG.log(Level.WARNING, "Cannot remove stale backup directory " + backDir, e);
        }
        try {
            this.fileSystem.mkdirs(backDirPath);
        } catch (IllegalArgumentException e) {
            LOG.log(Level.WARNING, "Backup aborted: invalid directory " + backDir, e);
            return;
        } catch (IOException e) {
            LOG.log(Level.WARNING, "Backup aborted: cannot create " + backDir, e);
            return;
        }

        // Back up the three queues and the URL filter.
        SiteManager siteManager = SiteManager.me();
        try {
            this.backupList(siteManager.getToDoTaskList(), backDir + "/todoList");
            this.backupList(siteManager.getWorkingTaskList(), backDir + "/workingList");
            this.backupList(siteManager.getFailedTaskList(), backDir + "/failedList");
            this.backupFilter(siteManager.getUrlsFilter(), backDir + "/filter");
        } catch (IOException e) {
            LOG.log(Level.WARNING, "Backup aborted: cannot write data to " + backDir, e);
            return;
        }

        // Publish the new version number.
        try {
            FSDataOutputStream fsout = this.fileSystem.create(maxVersionFilePath, true);
            try {
                fsout.writeInt(maxVersion);
            } finally {
                fsout.close();
            }
        } catch (IOException e) {
            LOG.log(Level.WARNING, "Cannot update " + maxVersionFilePath, e);
        }

        // The backup is complete at this point.
        config.setBackupVersion(maxVersion);
    }
}