/*
This file is part of RouteConverter.
RouteConverter is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
RouteConverter is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with RouteConverter; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
Copyright (C) 2007 Christian Pesch. All Rights Reserved.
*/
package slash.navigation.download.tools;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import slash.navigation.datasources.DataSource;
import slash.navigation.datasources.File;
import slash.navigation.datasources.Map;
import slash.navigation.datasources.Theme;
import slash.navigation.datasources.binding.DatasourceType;
import slash.navigation.datasources.binding.FileType;
import slash.navigation.datasources.binding.MapType;
import slash.navigation.datasources.binding.ThemeType;
import slash.navigation.datasources.helpers.DataSourcesUtil;
import slash.navigation.download.tools.base.BaseDownloadTool;
import slash.navigation.download.tools.helpers.AnchorFilter;
import slash.navigation.download.tools.helpers.AnchorParser;
import slash.navigation.download.tools.helpers.DownloadableType;
import slash.navigation.rest.Delete;
import slash.navigation.rest.Get;
import slash.navigation.rest.Post;
import javax.xml.bind.JAXBException;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Logger;
import static java.lang.String.format;
import static java.lang.System.exit;
import static java.util.Arrays.asList;
import static java.util.Arrays.sort;
import static slash.common.io.Transfer.UTF8_ENCODING;
import static slash.navigation.datasources.helpers.DataSourcesUtil.asDatasourceType;
import static slash.navigation.datasources.helpers.DataSourcesUtil.createFileType;
import static slash.navigation.datasources.helpers.DataSourcesUtil.createMapType;
import static slash.navigation.datasources.helpers.DataSourcesUtil.createThemeType;
import static slash.navigation.download.tools.helpers.DownloadableType.File;
import static slash.navigation.rest.HttpRequest.APPLICATION_JSON;
/**
* Scans a website for resources for the DataSources catalog.
*
* @author Christian Pesch
*/
public class ScanWebsite extends BaseDownloadTool {
private static final Logger log = Logger.getLogger(ScanWebsite.class.getName());
private static final String BASE_URL_ARGUMENT = "baseUrl";
private static final String TYPE_ARGUMENT = "type";
private static final String EXTENSION_ARGUMENT = "extension";
private static final String INCLUDE_ARGUMENT = "include";
private static final String EXCLUDE_ARGUMENT = "exclude";
private String baseUrl;
private DownloadableType type;
private Set<String> extensions, includes, excludes;
private int addCount = 0, removeCount = 0;
private String appendURIs(String uri, String anchor) {
int index = uri.lastIndexOf('/');
return index != -1 ? uri.substring(0, index + 1) + anchor : anchor;
}
private void recursiveCollect(String uri, Set<String> uris, Set<String> visitedUris) throws IOException {
if (visitedUris.contains(uri))
return;
visitedUris.add(uri);
log.info(format("Downloading %s", getUrl() + uri));
Get get = new Get(getUrl() + uri);
String result = get.executeAsString();
List<String> anchors = new AnchorParser().parseAnchors(result.replaceAll("<area", "<a"));
List<String> included = new AnchorFilter().filterAnchors(baseUrl, anchors, extensions, includes, excludes);
for (String anchor : included) {
// create the anchor relative to the current uri
String nextUri = appendURIs(uri, anchor);
uris.add(nextUri);
}
List<String> recurse = new AnchorFilter().filterAnchors(baseUrl, anchors, new HashSet<>(asList(".html", "/")), null, null);
for (String anchor : recurse) {
if((getUrl() + anchor).equals(baseUrl) || baseUrl.endsWith(anchor))
continue;
// create the anchor relative to the current uri
String nextUri = appendURIs(uri, anchor);
recursiveCollect(nextUri, uris, visitedUris);
}
}
private List<String> collectUris() throws IOException {
Set<String> uris = new HashSet<>();
recursiveCollect("", uris, new HashSet<String>());
String[] sortedUris = uris.toArray(new String[uris.size()]);
sort(sortedUris);
return asList(sortedUris);
}
private Set<String> collectURIs(DataSource source) {
Set<String> result = new HashSet<>();
switch (type) {
case File:
for (File file : source.getFiles())
result.add(file.getUri());
break;
case Map:
for (Map map : source.getMaps())
result.add(map.getUri());
break;
case Theme:
for (Theme theme : source.getThemes())
result.add(theme.getUri());
break;
}
return result;
}
private void scan() throws IOException, JAXBException {
List<String> collectedUris = collectUris();
log.info(format("Collected URIs: %s (%d elements)", collectedUris, collectedUris.size()));
DataSource source = loadDataSource(getId());
if (!getUrl().equals(source.getBaseUrl()) && !baseUrl.equals(source.getBaseUrl()))
log.warning("Data source URL: " + source.getBaseUrl() + " doesn't match URL: " + getUrl());
Set<String> files = collectURIs(source);
Set<String> addedUris = new HashSet<>(collectedUris);
addedUris.removeAll(files);
Set<String> removedUris = new HashSet<>(files);
removedUris.removeAll(collectedUris);
if (hasDataSourcesServer()) {
if (addedUris.size() > 0)
addUrisInChunks(source, addedUris);
if (removedUris.size() > 0)
removeUris(source, removedUris);
}
log.info(format("Added %d URIs, removed %d URIs out of %d URIs", addCount, removeCount, collectedUris.size()));
}
private String toXml(DataSource dataSource, Collection<String> uris, DownloadableType type) throws IOException {
DatasourceType datasourceType = asDatasourceType(dataSource);
for (String uri : uris) {
switch (type) {
case File:
FileType fileType = createFileType(uri, null, null);
datasourceType.getFile().add(fileType);
break;
case Map:
MapType mapType = createMapType(uri, null, null);
datasourceType.getMap().add(mapType);
break;
case Theme:
ThemeType themeType = createThemeType(uri, null, null);
datasourceType.getTheme().add(themeType);
break;
}
}
return DataSourcesUtil.toXml(datasourceType);
}
private void addUrisInChunks(DataSource dataSource, Collection<String> uris) throws IOException {
Collection<String> chunk = new HashSet<>();
for(String uri : uris) {
chunk.add(uri);
if(chunk.size() >= MAXIMUM_UPDATE_COUNT) {
addUris(dataSource, chunk);
chunk.clear();
}
}
if (chunk.size() > 0)
addUris(dataSource, chunk);
}
private String addUris(DataSource dataSource, Collection<String> uris) throws IOException {
String xml = toXml(dataSource, uris, type);
log.info(format("Adding URIs:%n%s", xml));
String dataSourcesUrl = getDataSourcesUrl();
Post request = new Post(dataSourcesUrl, getCredentials());
request.addFile("file", xml.getBytes(UTF8_ENCODING));
request.setAccept(APPLICATION_JSON);
request.setSocketTimeout(SOCKET_TIMEOUT);
String result = null;
try {
result = request.executeAsString();
log.info(format("Added URIs with result:%n%s", result));
addCount += uris.size();
}
catch(Exception e) {
log.severe(format("Cannot add URIs: %s", e));
}
return result;
}
private String removeUris(DataSource dataSource, Set<String> uris) throws IOException {
String xml = toXml(dataSource, uris, type);
log.info(format("Removing URIs:%n%s", xml));
String dataSourcesUrl = getDataSourcesUrl();
Delete request = new Delete(dataSourcesUrl, getCredentials());
request.addFile("file", xml.getBytes(UTF8_ENCODING));
request.setAccept(APPLICATION_JSON);
String result = null;
try {
result = request.executeAsString();
log.info(format("Removed URIs with result:%n%s", result));
removeCount += uris.size();
}
catch(Exception e) {
log.severe(format("Cannot remove URIs: %s", e));
}
return result;
}
private void run(String[] args) throws Exception {
CommandLine line = parseCommandLine(args);
String typeArgument = line.getOptionValue(TYPE_ARGUMENT);
setId(line.getOptionValue(ID_ARGUMENT));
setUrl(line.getOptionValue(URL_ARGUMENT));
baseUrl = line.getOptionValue(BASE_URL_ARGUMENT);
if (baseUrl == null)
baseUrl = getUrl();
String[] extensionArguments = line.getOptionValues(EXTENSION_ARGUMENT);
extensions = extensionArguments != null ? new HashSet<>(asList(extensionArguments)) : null;
String[] includeArguments = line.getOptionValues(INCLUDE_ARGUMENT);
includes = includeArguments != null ? new HashSet<>(asList(includeArguments)) : null;
String[] excludeArguments = line.getOptionValues(EXCLUDE_ARGUMENT);
excludes = excludeArguments != null ? new HashSet<>(asList(excludeArguments)) : null;
type = typeArgument != null ? DownloadableType.fromValue(typeArgument) : File;
setDataSourcesServer(line.getOptionValue(DATASOURCES_SERVER_ARGUMENT));
setDataSourcesUserName(line.getOptionValue(DATASOURCES_USERNAME_ARGUMENT));
setDataSourcesPassword(line.getOptionValue(DATASOURCES_PASSWORD_ARGUMENT));
scan();
}
@SuppressWarnings("AccessStaticViaInstance")
private CommandLine parseCommandLine(String[] args) throws ParseException {
CommandLineParser parser = new DefaultParser();
Options options = new Options();
options.addOption(Option.builder().argName(ID_ARGUMENT).hasArgs().required().longOpt("id").
desc("ID of the data source").build());
options.addOption(Option.builder().argName(URL_ARGUMENT).numberOfArgs(1).required().longOpt("url").
desc("URL to scan for resources").build());
options.addOption(Option.builder().argName(BASE_URL_ARGUMENT).numberOfArgs(1).longOpt("baseUrl").
desc("URL to use as a base for resources").build());
options.addOption(Option.builder().argName(EXTENSION_ARGUMENT).hasArgs().longOpt("extension").
desc("Extensions to scan for").build());
options.addOption(Option.builder().argName(INCLUDE_ARGUMENT).hasArgs().longOpt("include").
desc("Regex for resources to include").build());
options.addOption(Option.builder().argName(EXCLUDE_ARGUMENT).hasArgs().longOpt("exclude").
desc("Regex for resources to exclude").build());
options.addOption(Option.builder().argName(TYPE_ARGUMENT).numberOfArgs(1).longOpt("type").
desc("Type of the resources").build());
options.addOption(Option.builder().argName(DATASOURCES_SERVER_ARGUMENT).numberOfArgs(1).longOpt("server").
desc("Data sources server").build());
options.addOption(Option.builder().argName(DATASOURCES_USERNAME_ARGUMENT).numberOfArgs(1).longOpt("username").
desc("Data sources server user name").build());
options.addOption(Option.builder().argName(DATASOURCES_PASSWORD_ARGUMENT).numberOfArgs(1).longOpt("password").
desc("Data sources server password").build());
try {
return parser.parse(options, args);
} catch (ParseException e) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(getClass().getSimpleName(), options);
throw e;
}
}
public static void main(String[] args) throws Exception {
new ScanWebsite().run(args);
exit(0);
}
}