/** * License Agreement for OpenSearchServer * <p> * Copyright (C) 2008-2014 Emmanuel Keller / Jaeksoft * <p> * http://www.open-search-server.com * <p> * This file is part of OpenSearchServer. * <p> * OpenSearchServer is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * <p> * OpenSearchServer is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * <p> * You should have received a copy of the GNU General Public License * along with OpenSearchServer. If not, see <http://www.gnu.org/licenses/>. **/ package com.jaeksoft.searchlib.web.controller.crawler.web; import com.jaeksoft.searchlib.Client; import com.jaeksoft.searchlib.SearchLibException; import com.jaeksoft.searchlib.crawler.common.database.FetchStatus; import com.jaeksoft.searchlib.crawler.common.database.IndexStatus; import com.jaeksoft.searchlib.crawler.common.database.ParserStatus; import com.jaeksoft.searchlib.crawler.web.database.RobotsTxtStatus; import com.jaeksoft.searchlib.crawler.web.database.UrlItem; import com.jaeksoft.searchlib.crawler.web.database.UrlManager; import com.jaeksoft.searchlib.crawler.web.database.UrlManager.SearchTemplate; import com.jaeksoft.searchlib.request.AbstractSearchRequest; import com.jaeksoft.searchlib.scheduler.TaskItem; import com.jaeksoft.searchlib.scheduler.TaskManager; import com.jaeksoft.searchlib.scheduler.task.TaskUrlManagerAction; import com.jaeksoft.searchlib.web.controller.AlertController; import com.jaeksoft.searchlib.web.controller.CommonController; import com.jaeksoft.searchlib.web.controller.ScopeAttribute; import org.xml.sax.SAXException; import org.zkoss.bind.annotation.AfterCompose; import org.zkoss.bind.annotation.Command; import org.zkoss.bind.annotation.NotifyChange; import org.zkoss.zul.Filedownload; import javax.xml.transform.TransformerConfigurationException; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Date; import java.util.List; @AfterCompose(superclass = true) public class UrlController extends CommonController { public static enum BatchCommandEnum { NOTHING("Select an action"), EXPORT_TXT("Export URLs"), XML_SITEMAP("Export XML SiteMap"), EXPORT_CRAWLCACHE("Export CrawlCache"), SET_TO_UNFETCHED("Set selected URLs to Unfetched"), SET_TO_FETCH_FIRST("Set selected URLs to fetch first"), LOAD_SITEMAP("Load Sitemap(s)"), DELETE_URL("Delete selected URLs"), OPTIMIZE("Optimize URL database"), SYNCHRONIZE_INDEX("Synchronize the selected URLs with the index"), DELETE_ALL("Delete all URLs"); private final String label; BatchCommandEnum(String label) { this.label = label; } public String getLabel() { return label; } } private transient List<UrlItem> urlList; private transient int totalSize; private transient int activePage; private transient BatchCommandEnum batchCommand; public UrlController() throws SearchLibException { super(); } @Override protected void reset() { urlList = null; totalSize = 0; activePage = 0; batchCommand = BatchCommandEnum.NOTHING; } public int getActivePage() { return activePage; } public void setActivePage(int page) throws SearchLibException { synchronized (this) { activePage = page; computeUrlList(); reload(); } } public int getTotalSize() { return totalSize; } public long getRecordNumber() throws SearchLibException { synchronized (this) { UrlManager urlManager = getUrlManager(); if (urlManager == null) return 0; return urlManager.getSize(); } } public void setHost(String v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_HOST, v); } } public String getHost() { synchronized (this) { return (String) getAttribute(ScopeAttribute.SEARCH_URL_HOST); } } public void setBufferSize(int v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_BUFFER_SIZE, new Integer(v)); } } public int getBufferSize() { synchronized (this) { Integer v = (Integer) getAttribute(ScopeAttribute.SEARCH_URL_BUFFER_SIZE); return v == null ? 10000 : v; } } public boolean isWithSubDomain() { synchronized (this) { Boolean b = (Boolean) getAttribute(ScopeAttribute.SEARCH_URL_SUBHOST); if (b != null) return b; return false; } } public void setWithSubDomain(boolean b) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_SUBHOST, new Boolean(b)); } } public void setResponseCode(Integer v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_RESPONSE_CODE, v); } } public Integer getResponseCode() { synchronized (this) { return (Integer) getAttribute(ScopeAttribute.SEARCH_URL_RESPONSE_CODE); } } public void setLang(String v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_LANG, v); } } public String getLang() { synchronized (this) { return (String) getAttribute(ScopeAttribute.SEARCH_URL_LANG); } } public void setLangMethod(String v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_LANG_METHOD, v); } } public String getLangMethod() { synchronized (this) { return (String) getAttribute(ScopeAttribute.SEARCH_URL_LANG_METHOD); } } public void setContentBaseType(String v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_CONTENT_BASE_TYPE, v); } } public String getContentBaseType() { synchronized (this) { return (String) getAttribute(ScopeAttribute.SEARCH_URL_CONTENT_BASE_TYPE); } } public void setContentTypeCharset(String v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_CONTENT_TYPE_CHARSET, v); } } public String getContentTypeCharset() { synchronized (this) { return (String) getAttribute(ScopeAttribute.SEARCH_URL_CONTENT_TYPE_CHARSET); } } public void setContentEncoding(String v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_CONTENT_ENCODING, v); } } public String getContentEncoding() { synchronized (this) { return (String) getAttribute(ScopeAttribute.SEARCH_URL_CONTENT_ENCODING); } } public void setMinContentLength(Integer v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_MIN_CONTENT_LENGTH, v); } } public Integer getMinContentLength() { synchronized (this) { return (Integer) getAttribute(ScopeAttribute.SEARCH_URL_MIN_CONTENT_LENGTH); } } public void setMaxContentLength(Integer v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_MAX_CONTENT_LENGTH, v); } } public Integer getMaxContentLength() { synchronized (this) { return (Integer) getAttribute(ScopeAttribute.SEARCH_URL_MAX_CONTENT_LENGTH); } } public void setFetchStatus(FetchStatus v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_FETCH_STATUS, v); } } public FetchStatus getFetchStatus() { synchronized (this) { FetchStatus st = (FetchStatus) getAttribute(ScopeAttribute.SEARCH_URL_FETCH_STATUS); if (st == null) return FetchStatus.ALL; return st; } } public void setParserStatus(ParserStatus v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_PARSER_STATUS, v); } } public ParserStatus getParserStatus() { synchronized (this) { ParserStatus status = (ParserStatus) getAttribute(ScopeAttribute.SEARCH_URL_PARSER_STATUS); if (status == null) return ParserStatus.ALL; return status; } } public void setIndexStatus(IndexStatus v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_INDEX_STATUS, v); } } public IndexStatus getIndexStatus() { synchronized (this) { IndexStatus status = (IndexStatus) getAttribute(ScopeAttribute.SEARCH_URL_INDEX_STATUS); if (status == null) return IndexStatus.ALL; return status; } } public void setRobotsTxtStatus(RobotsTxtStatus v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_ROBOTSTXT_STATUS, v); } } public RobotsTxtStatus getRobotsTxtStatus() { synchronized (this) { RobotsTxtStatus status = (RobotsTxtStatus) getAttribute(ScopeAttribute.SEARCH_URL_ROBOTSTXT_STATUS); if (status == null) return RobotsTxtStatus.ALL; return status; } } public void setLike(String v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_LIKE, v); } } public String getLike() { synchronized (this) { return (String) getAttribute(ScopeAttribute.SEARCH_URL_LIKE); } } public void setPageSize(Integer v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_SHEET_ROWS, v); } } public Integer getPageSize() { synchronized (this) { Integer v = (Integer) getAttribute(ScopeAttribute.SEARCH_URL_SHEET_ROWS); if (v == null) v = 10; return v; } } public void setEventDateStart(Date v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_DATE_START, v); } } public Date getEventDateStart() { synchronized (this) { return (Date) getAttribute(ScopeAttribute.SEARCH_URL_DATE_START); } } public void setEventDateEnd(Date v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_DATE_END, v); } } public Date getEventDateEnd() { synchronized (this) { return (Date) getAttribute(ScopeAttribute.SEARCH_URL_DATE_END); } } public void setModifiedDateStart(Date v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_DATE_MODIFIED_START, v); } } public Date getModifiedDateStart() { synchronized (this) { return (Date) getAttribute(ScopeAttribute.SEARCH_URL_DATE_MODIFIED_START); } } public void setModifiedDateEnd(Date v) { synchronized (this) { setAttribute(ScopeAttribute.SEARCH_URL_DATE_MODIFIED_END, v); } } public Date getModifiedDateEnd() { synchronized (this) { return (Date) getAttribute(ScopeAttribute.SEARCH_URL_DATE_MODIFIED_END); } } @Command public void onSearch() throws SearchLibException { synchronized (this) { activePage = 0; computeUrlList(); reload(); } } private AbstractSearchRequest getSearchRequest(SearchTemplate urlSearchTemplate) throws SearchLibException { UrlManager urlManager = getUrlManager(); if (urlManager == null) return null; return urlManager.getSearchRequest(urlSearchTemplate, getLike(), getHost(), isWithSubDomain(), getLang(), getLangMethod(), getContentBaseType(), getContentTypeCharset(), getContentEncoding(), getMinContentLength(), getMaxContentLength(), getRobotsTxtStatus(), getFetchStatus(), getResponseCode(), getParserStatus(), getIndexStatus(), getEventDateStart(), getEventDateEnd(), getModifiedDateStart(), getModifiedDateEnd()); } private void computeUrlList() throws SearchLibException { synchronized (this) { UrlManager urlManager = getUrlManager(); if (urlManager == null) return; urlList = new ArrayList<UrlItem>(); AbstractSearchRequest searchRequest = getSearchRequest(SearchTemplate.urlSearch); totalSize = (int) urlManager.getUrlList(searchRequest, getPageSize() * getActivePage(), getPageSize(), urlList); } } public List<UrlItem> getUrlList() throws SearchLibException { synchronized (this) { return urlList; } } public UrlManager getUrlManager() throws SearchLibException { synchronized (this) { Client client = getClient(); if (client == null) return null; return client.getUrlManager(); } } public FetchStatus[] getFetchStatusList() { synchronized (this) { return FetchStatus.values(); } } public RobotsTxtStatus[] getRobotsTxtStatusList() { synchronized (this) { return RobotsTxtStatus.values(); } } public ParserStatus[] getParserStatusList() { synchronized (this) { return ParserStatus.values(); } } public IndexStatus[] getIndexStatusList() { synchronized (this) { return IndexStatus.values(); } } public void onExportSiteMap() throws IOException, SearchLibException, TransformerConfigurationException, SAXException { synchronized (this) { UrlManager urlManager = getUrlManager(); if (urlManager == null) return; AbstractSearchRequest searchRequest = getSearchRequest(SearchTemplate.urlExport); File file = urlManager.exportSiteMap(searchRequest); Filedownload.save(new FileInputStream(file), "text/xml; charset-UTF-8", "OSS_SiteMap.xml"); } } public void onExportURLs() throws IOException, SearchLibException { synchronized (this) { UrlManager urlManager = getUrlManager(); if (urlManager == null) return; AbstractSearchRequest searchRequest = getSearchRequest(SearchTemplate.urlExport); File file = urlManager.exportURLs(searchRequest); try { Filedownload.save(new FileInputStream(file), "text/plain; charset-UTF-8", "OSS_URLs_Export.txt"); } finally { file.delete(); } } } public void onExportCrawlCache() throws SearchLibException, IOException { synchronized (this) { UrlManager urlManager = getUrlManager(); if (urlManager == null) return; AbstractSearchRequest searchRequest = getSearchRequest(SearchTemplate.urlExport); File file = urlManager.exportCrawlCache(searchRequest); try { Filedownload.save(new FileInputStream(file), "application/zip", "OSS_Crawl_Export.zip"); } finally { file.delete(); } } } private void onTask(TaskUrlManagerAction taskUrlManagerAction) throws SearchLibException, InterruptedException { Client client = getClient(); if (client == null) return; TaskItem taskItem = new TaskItem(client, taskUrlManagerAction); TaskManager.executeTask(client, taskItem, null); client.getUrlManager().waitForTask(taskUrlManagerAction, 30); } public void onSetToUnfetched() throws SearchLibException, InterruptedException { synchronized (this) { AbstractSearchRequest searchRequest = getSearchRequest(SearchTemplate.urlSearch); TaskUrlManagerAction taskUrlManagerAction = new TaskUrlManagerAction(); taskUrlManagerAction.setManual(searchRequest, TaskUrlManagerAction.CommandSetToUnfetched, getBufferSize()); onTask(taskUrlManagerAction); } } public void onSetToFetchFirst() throws SearchLibException, InterruptedException { synchronized (this) { AbstractSearchRequest searchRequest = getSearchRequest(SearchTemplate.urlSearch); TaskUrlManagerAction taskUrlManagerAction = new TaskUrlManagerAction(); taskUrlManagerAction.setManual(searchRequest, TaskUrlManagerAction.CommandSetToFetchFirst, getBufferSize()); onTask(taskUrlManagerAction); } } public void onLoadSitemap() throws SearchLibException, InterruptedException { synchronized (this) { TaskUrlManagerAction taskUrlManagerAction = new TaskUrlManagerAction(); taskUrlManagerAction.setManual(null, TaskUrlManagerAction.CommandLoadSitemap, getBufferSize()); onTask(taskUrlManagerAction); } } public void onDeleteURLs() throws SearchLibException, InterruptedException { synchronized (this) { AbstractSearchRequest searchRequest = getSearchRequest(SearchTemplate.urlExport); TaskUrlManagerAction taskUrlManagerAction = new TaskUrlManagerAction(); taskUrlManagerAction.setManual(searchRequest, TaskUrlManagerAction.CommandDeleteSelection, getBufferSize()); onTask(taskUrlManagerAction); } } public void onSynchronizedIndex() throws SearchLibException, InterruptedException { synchronized (this) { AbstractSearchRequest searchRequest = getSearchRequest(SearchTemplate.urlExport); TaskUrlManagerAction taskUrlManagerAction = new TaskUrlManagerAction(); taskUrlManagerAction.setManual(searchRequest, TaskUrlManagerAction.CommandSynchronize, getBufferSize()); onTask(taskUrlManagerAction); } } public void onOptimize() throws SearchLibException, InterruptedException { synchronized (this) { TaskUrlManagerAction taskUrlManagerAction = new TaskUrlManagerAction(); taskUrlManagerAction.setManual(null, TaskUrlManagerAction.CommandOptimize, getBufferSize()); onTask(taskUrlManagerAction); } } public void onDeleteAll() throws SearchLibException, InterruptedException { synchronized (this) { TaskUrlManagerAction taskUrlManagerAction = new TaskUrlManagerAction(); taskUrlManagerAction.setManual(null, TaskUrlManagerAction.CommandDeleteAll, getBufferSize()); onTask(taskUrlManagerAction); } } public BatchCommandEnum getBatchCommand() { return batchCommand; } public void setBatchCommand(BatchCommandEnum batchCommand) { this.batchCommand = batchCommand; } public BatchCommandEnum[] getBatchCommandEnum() { return BatchCommandEnum.values(); } @Command public void onGo() throws SearchLibException, IOException, TransformerConfigurationException, SAXException, InterruptedException { synchronized (this) { Client client = getClient(); if (client == null) return; if (client.getWebCrawlMaster().isRunning()) { new AlertController("Please stop the Web crawler first."); return; } if (batchCommand == null) return; switch (batchCommand) { case NOTHING: break; case EXPORT_TXT: onExportURLs(); break; case EXPORT_CRAWLCACHE: onExportCrawlCache(); break; case XML_SITEMAP: onExportSiteMap(); break; case SET_TO_UNFETCHED: onSetToUnfetched(); break; case SET_TO_FETCH_FIRST: onSetToFetchFirst(); break; case DELETE_URL: onDeleteURLs(); break; case OPTIMIZE: onOptimize(); break; case DELETE_ALL: onDeleteAll(); break; case LOAD_SITEMAP: onLoadSitemap(); break; case SYNCHRONIZE_INDEX: onSynchronizedIndex(); break; } batchCommand = BatchCommandEnum.NOTHING; reload(); } } @Command @NotifyChange("urlManager") public void onRefreshCurrentTaskLog() { } }