/**
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2010-2013 Emmanuel Keller / Jaeksoft
 *
 * http://www.open-search-server.com
 *
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with OpenSearchServer.
 * If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.scheduler.task;

import java.io.IOException;

import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.ClassPropertyEnum;
import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.crawler.common.database.FetchStatus;
import com.jaeksoft.searchlib.crawler.common.database.IndexStatus;
import com.jaeksoft.searchlib.crawler.common.database.ParserStatus;
import com.jaeksoft.searchlib.crawler.web.database.RobotsTxtStatus;
import com.jaeksoft.searchlib.crawler.web.database.UrlManager;
import com.jaeksoft.searchlib.request.AbstractSearchRequest;
import com.jaeksoft.searchlib.scheduler.TaskAbstract;
import com.jaeksoft.searchlib.scheduler.TaskLog;
import com.jaeksoft.searchlib.scheduler.TaskProperties;
import com.jaeksoft.searchlib.scheduler.TaskPropertyDef;
import com.jaeksoft.searchlib.scheduler.TaskPropertyType;
import com.jaeksoft.searchlib.util.Variables;

/**
 * Scheduler task that runs one maintenance command against the web crawler
 * URL database (the {@link UrlManager} of the client).
 * <p>
 * The command and the URL selection filters normally come from the task
 * properties. Alternatively, {@link #setManual} can provide a ready-made
 * selection request, command and buffer size, in which case the task
 * properties are ignored by {@link #execute}.
 */
public class TaskUrlManagerAction extends TaskAbstract {

	/** Buffer size proposed by default and used when the property is blank. */
	private static final String DEFAULT_BUFFER_SIZE = "10000";

	private final TaskPropertyDef propCommand = new TaskPropertyDef(
			TaskPropertyType.comboBox, "Command", "Command",
			"Select the command to execute", 30);

	private final TaskPropertyDef propFilterUrl = new TaskPropertyDef(
			TaskPropertyType.textBox, "URL prefix", "URL prefix",
			"Filter prefix on the URL", 50);

	private final TaskPropertyDef propFilterLang = new TaskPropertyDef(
			TaskPropertyType.textBox, "Lang", "Lang", "Filter on the lang", 5);

	private final TaskPropertyDef propFilterContentBaseType = new TaskPropertyDef(
			TaskPropertyType.textBox, "ContentBaseType", "Content type",
			"Filter on the content type", 30);

	private final TaskPropertyDef propFilterContentTypeCharset = new TaskPropertyDef(
			TaskPropertyType.textBox, "ContentTypeCharset", "Content charset",
			"Filter on the content type charset", 10);

	private final TaskPropertyDef propFilterContentEncoding = new TaskPropertyDef(
			TaskPropertyType.textBox, "ContentEncoding", "Content encoding",
			"Filter on the content encoding", 10);

	private final TaskPropertyDef propFilterMinContentLength = new TaskPropertyDef(
			TaskPropertyType.textBox, "MinContentLength", "Min length",
			"Filter on the minimum content length", 10);

	private final TaskPropertyDef propFilterMaxContentLength = new TaskPropertyDef(
			TaskPropertyType.textBox, "MaxContentLength", "Max length",
			"Filter on the maximum content length", 10);

	private final TaskPropertyDef propFilterHost = new TaskPropertyDef(
			TaskPropertyType.textBox, "Hostname", "Hostname",
			"Filter on the hostname", 30);

	private final TaskPropertyDef propFilterWithSubDomain = new TaskPropertyDef(
			TaskPropertyType.listBox, "WithSubDomain", "With sub domain",
			"Filter on the sub domain", 10);

	private final TaskPropertyDef propRobotsTxtStatus = new TaskPropertyDef(
			TaskPropertyType.listBox, "Robots.txt status", "Robots.txt status",
			"Filter on the Robots.txt status", 20);

	private final TaskPropertyDef propFetchStatus = new TaskPropertyDef(
			TaskPropertyType.listBox, "Fetch status", "Fetch status",
			"Filter on the fetch status", 20);

	private final TaskPropertyDef propParserStatus = new TaskPropertyDef(
			TaskPropertyType.listBox, "Parser status", "Parser status",
			"Filter on the Parser status", 20);

	private final TaskPropertyDef propIndexStatus = new TaskPropertyDef(
			TaskPropertyType.listBox, "Index status", "Index status",
			"Filter on the index status", 20);

	private final TaskPropertyDef propBufferSize = new TaskPropertyDef(
			TaskPropertyType.textBox, "Buffer size", "Buffer size",
			"Buffer size", 10);

	/** Every property of this task, in the order they are declared above. */
	private final TaskPropertyDef[] taskPropertyDefs = { propCommand,
			propFilterUrl, propFilterLang, propFilterContentBaseType,
			propFilterContentTypeCharset, propFilterContentEncoding,
			propFilterMinContentLength, propFilterMaxContentLength,
			propFilterHost, propFilterWithSubDomain, propRobotsTxtStatus,
			propFetchStatus, propParserStatus, propIndexStatus, propBufferSize };

	public static final String CommandDoNothing = "Do nothing";

	public static final String CommandSetToUnfetched = "Set to unfetched";

	public static final String CommandSetToFetchFirst = "Set to fetch first";

	public static final String CommandDeleteAll = "Delete all";

	public static final String CommandDeleteSelection = "Delete selection";

	public static final String CommandLoadSitemap = "Load Sitemap(s)";

	public static final String CommandOptimize = "Optimize";

	public static final String CommandSynchronize = "Synchronize";

	/** Values offered for the "Command" property; the first is the default. */
	private static final String[] CommandList = { CommandDoNothing,
			CommandSetToUnfetched, CommandSetToFetchFirst,
			CommandDeleteSelection, CommandDeleteAll, CommandLoadSitemap,
			CommandSynchronize, CommandOptimize };

	@Override
	public String getName() {
		return "Web crawler - URL database";
	}

	@Override
	public TaskPropertyDef[] getPropertyList() {
		return taskPropertyDefs;
	}

	/**
	 * Returns the selectable values of a list-like property.
	 *
	 * @return the list of values, or {@code null} when the property is not
	 *         one of the enumerated properties of this task
	 */
	@Override
	public String[] getPropertyValues(Config config,
			TaskPropertyDef propertyDef, TaskProperties taskProperties) {
		// Identity comparison is intended: the definitions are this
		// instance's own field objects.
		if (propertyDef == propCommand) {
			return CommandList;
		}
		if (propertyDef == propRobotsTxtStatus) {
			return RobotsTxtStatus.getNames();
		}
		if (propertyDef == propFetchStatus) {
			return FetchStatus.getNames();
		}
		if (propertyDef == propParserStatus) {
			return ParserStatus.getNames();
		}
		if (propertyDef == propIndexStatus) {
			return IndexStatus.getNames();
		}
		if (propertyDef == propFilterWithSubDomain) {
			return ClassPropertyEnum.BOOLEAN_LIST;
		}
		return null;
	}

	/**
	 * Returns the default value of a property.
	 *
	 * @return the default value, or {@code null} when the property has none
	 */
	@Override
	public String getDefaultValue(Config config, TaskPropertyDef propertyDef) {
		if (propertyDef == propCommand) {
			return CommandList[0];
		}
		if (propertyDef == propRobotsTxtStatus) {
			return RobotsTxtStatus.ALL.name;
		}
		if (propertyDef == propFetchStatus) {
			return FetchStatus.ALL.name;
		}
		if (propertyDef == propParserStatus) {
			return ParserStatus.ALL.name;
		}
		if (propertyDef == propIndexStatus) {
			return IndexStatus.ALL.name;
		}
		if (propertyDef == propBufferSize) {
			return DEFAULT_BUFFER_SIZE;
		}
		if (propertyDef == propFilterWithSubDomain) {
			return Boolean.FALSE.toString();
		}
		return null;
	}

	private AbstractSearchRequest selectionRequest = null;

	private String manualCommand = null;

	private Integer manualBufferSize = null;

	/**
	 * Switches the task to manual mode: the next {@link #execute} uses the
	 * given values instead of reading them from the task properties.
	 *
	 * @param selectionRequest
	 *            the request selecting the URLs the command applies to
	 * @param manualCommand
	 *            one of the {@code Command*} constants; when {@code null},
	 *            {@link #execute} keeps reading the task properties
	 * @param bufferSize
	 *            the buffer size handed to the URL manager
	 */
	public void setManual(AbstractSearchRequest selectionRequest,
			String manualCommand, int bufferSize) {
		this.selectionRequest = selectionRequest;
		this.manualCommand = manualCommand;
		this.manualBufferSize = bufferSize;
	}

	/**
	 * Runs the configured command on the URL manager of the given client.
	 *
	 * @throws NumberFormatException
	 *             if the buffer size property is set but is not an integer
	 */
	@Override
	public void execute(Client client, TaskProperties properties,
			Variables variables, TaskLog taskLog) throws SearchLibException,
			IOException {
		UrlManager urlManager = client.getUrlManager();
		taskLog.setInfo("URL manager Action started");

		final String command;
		final int bufferSize;
		// Kept local so that a scheduled run never overwrites the request
		// stored by setManual.
		final AbstractSearchRequest request;
		if (manualCommand != null) {
			command = manualCommand;
			bufferSize = manualBufferSize;
			request = selectionRequest;
		} else {
			command = properties.getValue(propCommand);
			bufferSize = parseBufferSize(properties.getValue(propBufferSize));
			request = buildSelectionRequest(urlManager, properties);
		}

		// NOTE(review): CommandOptimize is offered in CommandList but no
		// branch below handles it, so like CommandDoNothing it currently has
		// no effect. Confirm whether an optimize call is missing here.
		if (CommandLoadSitemap.equals(command)) {
			taskLog.setInfo("URL manager: Handle SiteMaps");
			urlManager.updateSiteMap(taskLog);
		} else if (CommandSetToFetchFirst.equals(command)) {
			updateFetchStatus(urlManager, request, FetchStatus.FETCH_FIRST,
					bufferSize, taskLog);
		} else if (CommandSetToUnfetched.equals(command)) {
			updateFetchStatus(urlManager, request, FetchStatus.UN_FETCHED,
					bufferSize, taskLog);
		} else if (CommandDeleteAll.equals(command)) {
			taskLog.setInfo("URL manager: Delete All");
			urlManager.deleteAll(taskLog);
		} else if (CommandDeleteSelection.equals(command)) {
			taskLog.setInfo("URL manager: Delete selection");
			urlManager.deleteUrls(request, bufferSize, taskLog);
		} else if (CommandSynchronize.equals(command)) {
			taskLog.setInfo("URL manager: synchronize");
			urlManager.synchronizeIndex(request, bufferSize, taskLog);
		}
	}

	/**
	 * Parses the buffer size property, using the task default when the value
	 * is missing or blank.
	 */
	private static int parseBufferSize(String value) {
		String text = value == null ? "" : value.trim();
		if (text.isEmpty()) {
			text = DEFAULT_BUFFER_SIZE;
		}
		return Integer.parseInt(text);
	}

	/** Builds the URL selection request from the filter properties. */
	private AbstractSearchRequest buildSelectionRequest(UrlManager urlManager,
			TaskProperties properties) throws SearchLibException, IOException {
		String urlLike = properties.getValue(propFilterUrl);
		String lang = properties.getValue(propFilterLang);
		String contentBaseType = properties
				.getValue(propFilterContentBaseType);
		String contentTypeCharset = properties
				.getValue(propFilterContentTypeCharset);
		String contentEncoding = properties
				.getValue(propFilterContentEncoding);
		Integer minContentLength = properties
				.getValueInteger(propFilterMinContentLength);
		Integer maxContentLength = properties
				.getValueInteger(propFilterMaxContentLength);
		String host = properties.getValue(propFilterHost);
		boolean withSubDomain = properties.getValueBoolean(
				propFilterWithSubDomain, false);
		RobotsTxtStatus robotsTxtStatus = RobotsTxtStatus
				.findByName(properties.getValue(propRobotsTxtStatus));
		FetchStatus fetchStatus = FetchStatus.findByName(properties
				.getValue(propFetchStatus));
		ParserStatus parserStatus = ParserStatus.findByName(properties
				.getValue(propParserStatus));
		IndexStatus indexStatus = IndexStatus.findByName(properties
				.getValue(propIndexStatus));
		return urlManager.getSearchRequest(
				UrlManager.SearchTemplate.urlSearch, urlLike, host,
				withSubDomain, lang, null, contentBaseType,
				contentTypeCharset, contentEncoding, minContentLength,
				maxContentLength, robotsTxtStatus, fetchStatus, null,
				parserStatus, indexStatus, null, null, null, null);
	}

	/** Logs the target status, then applies it to the selected URLs. */
	private static void updateFetchStatus(UrlManager urlManager,
			AbstractSearchRequest request, FetchStatus targetStatus,
			int bufferSize, TaskLog taskLog) throws SearchLibException,
			IOException {
		taskLog.setInfo("URL manager: Update status to " + targetStatus.name);
		urlManager.updateFetchStatus(request, targetStatus, bufferSize,
				taskLog);
	}
}