/**
* License Agreement for OpenSearchServer
* <p>
* Copyright (C) 2017 Emmanuel Keller / Jaeksoft
* <p>
* http://www.open-search-server.com
* <p>
* This file is part of OpenSearchServer.
* <p>
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* <p>
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* <p>
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.crawler.web.sitemap;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.crawler.web.GenericCache;
import com.jaeksoft.searchlib.crawler.web.spider.DownloadItem;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader;
import com.jaeksoft.searchlib.util.DomUtils;
import com.jaeksoft.searchlib.util.IOUtils;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Date;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
/**
 * Cache of downloaded sitemaps, keyed by the sitemap {@link URI}.
 * <p>
 * A single shared instance is obtained through {@link #getInstance()}. Each
 * cached {@link Item} holds the URLs extracted from one sitemap document and
 * reports an expiration time 15 minutes after it was downloaded.
 * <p>
 * NOTE(review): the actual eviction / reload policy lives in
 * {@code GenericCache}, which is not visible from this file.
 */
public class SiteMapCache extends GenericCache<URI, SiteMapCache.Item> {

    /** Lifetime of a cached sitemap, in milliseconds (15 minutes). */
    private static final long EXPIRATION_DELAY_MS = 15L * 60L * 1000L;

    /** Legacy, widely used media type for gzip-compressed content. */
    private static final String GZIP_LEGACY_CONTENT_TYPE = "application/x-gzip";

    /** Registered media type for gzip-compressed content (RFC 6713). */
    private static final String GZIP_CONTENT_TYPE = "application/gzip";

    /** Lazily created shared instance; volatile so the unsynchronized read below is safe. */
    private static volatile SiteMapCache INSTANCE;

    /**
     * Returns the shared cache instance, creating it on first use
     * (double-checked locking on the class monitor).
     *
     * @return the unique {@code SiteMapCache} instance, never {@code null}
     */
    public static SiteMapCache getInstance() {
        // Read the volatile field only once on the fast path.
        SiteMapCache instance = INSTANCE;
        if (instance == null) {
            synchronized (SiteMapCache.class) {
                instance = INSTANCE;
                if (instance == null) {
                    instance = new SiteMapCache();
                    INSTANCE = instance;
                }
            }
        }
        return instance;
    }

    /**
     * Returns the cached sitemap content for the given sitemap URI, downloading
     * and parsing it through the cache when required.
     * <p>
     * Download and parsing failures do not make this method throw: they are
     * recorded on the returned item, see {@link Item#getError()}.
     *
     * @param uri            the location of the sitemap document
     * @param httpDownloader the downloader used to fetch the sitemap when a new
     *                       item has to be built
     * @param forceReload    forwarded to {@code GenericCache.getOrCreate};
     *                       presumably bypasses a still-valid cached entry
     * @return the item holding the URLs found in the sitemap
     * @throws SearchLibException if the cache lookup fails; also wraps any
     *                            {@link URISyntaxException} or {@link IOException}
     *                            raised by the cache
     */
    Item getSiteMapItemUrls(final URI uri, final HttpDownloader httpDownloader, final boolean forceReload)
            throws SearchLibException {
        try {
            return getOrCreate(uri, forceReload, new ItemSupplier<Item>() {
                @Override
                public Item get() throws IOException, SearchLibException {
                    return new Item(uri, httpDownloader);
                }
            });
        } catch (URISyntaxException | IOException e) {
            throw new SearchLibException(e);
        }
    }

    /**
     * Tells whether the given content type denotes a gzip-compressed body.
     *
     * @param contentBaseType the content type reported for the download, may be {@code null}
     * @return {@code true} for {@code application/x-gzip} and {@code application/gzip}
     */
    private static boolean isGzipContentType(final String contentBaseType) {
        return GZIP_LEGACY_CONTENT_TYPE.equals(contentBaseType) || GZIP_CONTENT_TYPE.equals(contentBaseType);
    }

    /**
     * Downloads the sitemap at {@code uri}, transparently decompressing it when it
     * is served as gzip, and adds one {@link SiteMapUrl} per {@code url} element to
     * {@code siteMapUrlSet}.
     * <p>
     * An unexpected HTTP status (anything but 200) is only logged as a warning:
     * the set is left untouched and no exception is thrown.
     *
     * @param uri            the location of the sitemap document
     * @param httpDownloader the downloader used to fetch the document
     * @param siteMapUrlSet  the set receiving the extracted URLs
     * @throws SearchLibException if the download, the decompression or the XML
     *                            parsing fails
     */
    private static void load(final URI uri, final HttpDownloader httpDownloader, final Set<SiteMapUrl> siteMapUrlSet)
            throws SearchLibException {
        InputStream inputStream = null;
        try {
            final DownloadItem downloadItem = httpDownloader.get(uri, null);
            downloadItem.checkNoErrorList(200);
            final String contentBaseType = downloadItem.getContentBaseType();
            // Track the raw stream BEFORE wrapping it: if the gzip wrapper cannot be
            // created, the finally block still closes the underlying stream.
            inputStream = downloadItem.getContentInputStream();
            if (isGzipContentType(contentBaseType)) {
                inputStream = new CompressorStreamFactory().createCompressorInputStream(
                        CompressorStreamFactory.GZIP, inputStream);
            }
            final Document doc = DomUtils.readXml(new InputSource(inputStream), true);
            if (doc != null) {
                final List<Node> nodes = DomUtils.getAllNodes(doc, "url");
                if (nodes != null) {
                    for (Node node : nodes) {
                        siteMapUrlSet.add(new SiteMapUrl(node));
                    }
                }
            }
        } catch (SearchLibException.WrongStatusCodeException e) {
            // Deliberately not propagated: the sitemap is simply treated as empty.
            Logging.warn("Error while loading the sitemap: " + uri, e);
        } catch (IllegalStateException | IOException | ParserConfigurationException | URISyntaxException | CompressorException | SAXException e) {
            throw new SearchLibException(e);
        } finally {
            IOUtils.close(inputStream);
        }
    }

    /**
     * The cached result of one sitemap download: the extracted URLs, the
     * download date, and the error message when loading failed.
     */
    final class Item implements GenericCache.Expirable {

        private final Date crawlDate;
        private final long expirableTime;
        private final Set<SiteMapUrl> siteMapUrls;
        private final String error;

        /**
         * Downloads and parses the sitemap immediately. A loading failure is not
         * thrown but stored, and can be read with {@link #getError()}.
         *
         * @param uri            the location of the sitemap document
         * @param httpDownloader the downloader used to fetch the document
         */
        Item(final URI uri, final HttpDownloader httpDownloader) {
            crawlDate = new Date();
            expirableTime = crawlDate.getTime() + EXPIRATION_DELAY_MS;
            siteMapUrls = new LinkedHashSet<>();
            String err = null;
            try {
                load(uri, httpDownloader, siteMapUrls);
            } catch (SearchLibException e) {
                err = e.getMessage();
                if (err == null) {
                    // Keep a failure distinguishable from a success even without a message.
                    err = e.toString();
                }
            }
            error = err;
        }

        /**
         * Adds every URL of this sitemap to the given set.
         *
         * @param set the set receiving the URLs
         */
        void fill(final Set<SiteMapUrl> set) {
            set.addAll(siteMapUrls);
        }

        /**
         * @return the time, in milliseconds since the epoch, after which this item
         * is considered expired (15 minutes after the download)
         */
        @Override
        public long getExpirationTime() {
            return expirableTime;
        }

        /**
         * @return always {@code true}
         */
        @Override
        public boolean isCacheable() {
            return true;
        }

        /**
         * @return the message of the failure met while loading the sitemap, or
         * {@code null} when loading raised no exception
         */
        public String getError() {
            return error;
        }

        /**
         * @return a copy of the date at which this item was created
         */
        public Date getCrawlDate() {
            // Defensive copy: Date is mutable and this item is shared through the cache.
            return new Date(crawlDate.getTime());
        }
    }
}