/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2013-2015 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.crawler.rest;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.io.filefilter.FileFileFilter;
import org.apache.commons.lang3.StringUtils;
import org.apache.cxf.helpers.FileUtils;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.utils.URIBuilder;
import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.crawler.FieldMapContext;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatus;
import com.jaeksoft.searchlib.crawler.common.process.CrawlThreadAbstract;
import com.jaeksoft.searchlib.crawler.rest.RestCrawlItem.CallbackMode;
import com.jaeksoft.searchlib.crawler.web.spider.DownloadItem;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader.Method;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.schema.SchemaField;
import com.jaeksoft.searchlib.util.InfoCallback;
import com.jaeksoft.searchlib.util.Variables;
import com.jaeksoft.searchlib.webservice.CommonListResult;
import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.PathNotFoundException;
public class RestCrawlThread extends CrawlThreadAbstract<RestCrawlThread, RestCrawlMaster> {
/** Client whose index receives the crawled documents and whose schema provides the unique field. */
protected final Client client;
/** Private duplicate of the crawl item passed to the constructor, with the variables applied to it. */
private final RestCrawlItem restCrawlItem;
/** Number of documents that have been mapped but not yet sent to the index. */
private final AtomicLong pendingIndexDocumentCount;
/** Number of documents that have been sent to the index so far. */
private final AtomicLong updatedIndexDocumentCount;
/** Receives the unique keys of the indexed documents; null unless the info callback is a CommonListResult. */
private final Collection<String> idsCallback;
/** Field mapping context built from the client and the language of the crawl item. */
private final FieldMapContext fieldMapContext;
/**
 * Working state of one crawl run: the HTTP downloader, the compiled JSON path that selects the
 * documents, the field mapping, and the buffer of documents waiting to be indexed.
 */
private static class RestCrawlContext {

    private final HttpDownloader downloader;
    private final JsonPath jsonPath;
    private final RestFieldMap restFieldMap;
    private final int bufferSize;
    private final List<IndexDocument> indexDocumentList;

    /**
     * @param downloader
     *            the downloader used for both the crawl requests and the callbacks
     * @param restCrawlItem
     *            the crawl definition providing the document path, the field map and the buffer size
     */
    private RestCrawlContext(HttpDownloader downloader, RestCrawlItem restCrawlItem) throws SearchLibException {
        this.downloader = downloader;
        this.jsonPath = JsonPath.compile(restCrawlItem.getPathDocument());
        this.restFieldMap = restCrawlItem.getFieldMap();
        this.bufferSize = restCrawlItem.getBufferSize();
        // The buffer is pre-sized to the number of documents sent per indexing batch.
        this.indexDocumentList = new ArrayList<IndexDocument>(this.bufferSize);
    }
}
/**
 * Creates a crawl thread working on a private copy of the given crawl item.
 *
 * @param client
 *            the client whose index receives the documents
 * @param crawlMaster
 *            the master this thread belongs to
 * @param restCrawlItem
 *            the crawl definition; it is duplicated and the variables are applied to the copy
 * @param variables
 *            the variables applied to the duplicated crawl item
 * @param infoCallback
 *            optional callback; when it is a CommonListResult its item list also collects the
 *            unique keys of the indexed documents
 */
@SuppressWarnings("unchecked")
public RestCrawlThread(Client client, RestCrawlMaster crawlMaster, RestCrawlItem restCrawlItem, Variables variables,
        InfoCallback infoCallback) throws SearchLibException {
    super(client, crawlMaster, restCrawlItem, infoCallback);
    this.client = client;
    this.restCrawlItem = restCrawlItem.duplicate();
    this.restCrawlItem.apply(variables);
    this.pendingIndexDocumentCount = new AtomicLong();
    this.updatedIndexDocumentCount = new AtomicLong();
    // The language is read from the item received as parameter, not from the private copy.
    this.fieldMapContext = new FieldMapContext(client, restCrawlItem.getLang());
    // instanceof is already false for null, so no separate null test is required.
    if (infoCallback instanceof CommonListResult<?>) {
        this.idsCallback = ((CommonListResult<String>) infoCallback).items;
    } else {
        this.idsCallback = null;
    }
}
/**
 * Returns both counters as text, in the form {@code updated (pending)}.
 *
 * @return the updated document count followed by the pending document count in parentheses
 */
public String getCountInfo() {
    return getUpdatedIndexDocumentCount() + " (" + getPendingIndexDocumentCount() + ")";
}
/**
 * @return the number of documents mapped but not yet sent to the index
 */
public final long getPendingIndexDocumentCount() {
    return this.pendingIndexDocumentCount.get();
}
/**
 * @return the number of documents sent to the index so far
 */
public final long getUpdatedIndexDocumentCount() {
    return this.updatedIndexDocumentCount.get();
}
/**
 * @return the private copy of the crawl item used by this thread (not the instance given to the
 *         constructor)
 */
public RestCrawlItem getRestCrawlItem() {
    return this.restCrawlItem;
}
/**
 * This crawler reports no per-step information.
 *
 * @return always the empty string
 */
@Override
protected String getCurrentInfo() {
    return "";
}
/**
 * Sends one callback request to the given URI, using the given query string in place of the
 * query of that URI.
 *
 * @param downloader
 *            the downloader executing the request
 * @param uri
 *            the callback URI; scheme, host, port, path and fragment are kept, user-info is dropped
 * @param query
 *            the query string of the request
 */
private void callback(HttpDownloader downloader, URI uri, String query)
        throws URISyntaxException, ClientProtocolException, IllegalStateException, IOException, SearchLibException {
    // Rebuild the target from its components: user-info is passed as null, the query is replaced.
    final URI target = new URI(uri.getScheme(), null, uri.getHost(), uri.getPort(), uri.getPath(), query,
            uri.getFragment());
    final DownloadItem response = downloader.request(target, restCrawlItem.getCallbackMethod(),
            restCrawlItem.getCredential(), null, null, null);
    // Presumably raises an error when the status is not one of the listed codes - confirm in DownloadItem.
    response.checkNoErrorList(200, 201, 202, 203);
}
/**
 * Sends the callback for a single document. The query of the callback URI is kept and, when a
 * query prefix is configured, {@code prefix=key} is appended to it (only {@code prefix} when the
 * key is empty).
 *
 * NOTE(review): the key is appended as is; a key containing '&amp;' or '=' is not escaped here -
 * confirm whether the unique keys can contain such characters.
 *
 * @param downloader
 *            the downloader executing the request
 * @param uri
 *            the callback URI
 * @param queryPrefix
 *            the name of the query parameter carrying the key; nothing is appended when empty
 * @param key
 *            the unique key of the document
 */
private final void callbackPerDoc(HttpDownloader downloader, URI uri, String queryPrefix, String key)
        throws ClientProtocolException, IllegalStateException, IOException, URISyntaxException, SearchLibException {
    final String baseQuery = uri.getQuery();
    final StringBuilder sb = new StringBuilder();
    if (baseQuery != null) {
        sb.append(baseQuery);
    }
    if (StringUtils.isNotEmpty(queryPrefix)) {
        if (sb.length() > 0) {
            sb.append('&');
        }
        sb.append(queryPrefix);
        if (StringUtils.isNotEmpty(key)) {
            sb.append('=').append(key);
        }
    }
    callback(downloader, uri, sb.toString());
}
/**
 * Sends a single callback for a whole batch. The query of the callback URI is kept and one
 * {@code prefix=key} pair is appended per key. The request is sent even when there is no key
 * or no prefix; in that case the query is left untouched.
 *
 * NOTE(review): the keys are appended as is; a key containing '&amp;' or '=' is not escaped here -
 * confirm whether the unique keys can contain such characters.
 *
 * @param downloader
 *            the downloader executing the request
 * @param uri
 *            the callback URI
 * @param queryPrefix
 *            the name of the query parameter repeated for each key
 * @param pkList
 *            the unique keys of the indexed documents, may be null
 */
private final void callbackAllDocs(HttpDownloader downloader, URI uri, String queryPrefix, List<String> pkList)
        throws ClientProtocolException, IllegalStateException, IOException, URISyntaxException, SearchLibException {
    final StringBuilder sb = new StringBuilder();
    final String baseQuery = uri.getQuery();
    if (baseQuery != null) {
        sb.append(baseQuery);
    }
    final boolean appendKeys = pkList != null && StringUtils.isNotEmpty(queryPrefix);
    if (appendKeys) {
        for (String primaryKey : pkList) {
            if (sb.length() > 0) {
                sb.append('&');
            }
            sb.append(queryPrefix).append('=').append(primaryKey);
        }
    }
    callback(downloader, uri, sb.toString());
}
/**
 * Notifies the configured callback URL about a batch of indexed documents, according to the
 * callback mode of the crawl item: no call, one call per document, or one call for the batch.
 *
 * @param downloader
 *            the downloader executing the callback requests
 * @param pkList
 *            the unique keys of the indexed documents; null when the schema has no unique field
 */
private final void doCallBack(HttpDownloader downloader, List<String> pkList)
        throws ClientProtocolException, IllegalStateException, IOException, URISyntaxException, SearchLibException {
    final CallbackMode callbackMode = restCrawlItem.getCallbackMode();
    if (CallbackMode.NO_CALL == callbackMode) {
        return;
    }
    final String callbackUrl = restCrawlItem.getCallbackUrl();
    final String parameterName = restCrawlItem.getCallbackQueryParameter();
    final URI callbackUri = new URI(callbackUrl);
    switch (callbackMode) {
    case ONE_CALL_FOR_ALL_DOCUMENTS:
        callbackAllDocs(downloader, callbackUri, parameterName, pkList);
        break;
    case ONE_CALL_PER_DOCUMENT:
        // Without keys there is nothing to call back for.
        if (pkList == null) {
            break;
        }
        for (String primaryKey : pkList) {
            callbackPerDoc(downloader, callbackUri, parameterName, primaryKey);
        }
        break;
    default:
        break;
    }
}
/**
 * Sends the buffered documents to the index when the buffer holds at least {@code limit}
 * documents, then runs the callback, updates the counters and empties the buffer.
 *
 * @param context
 *            the crawl context holding the document buffer and the downloader
 * @param limit
 *            the minimal buffer size that triggers the indexing; 0 flushes any non-empty buffer
 * @return true when the buffer has been indexed, false when it was empty or below the limit
 */
private final boolean index(RestCrawlContext context, int limit)
        throws NoSuchAlgorithmException, IOException, URISyntaxException, SearchLibException,
        InstantiationException, IllegalAccessException, ClassNotFoundException {
    final List<IndexDocument> buffer = context.indexDocumentList;
    final int batchSize = buffer.size();
    if (batchSize == 0 || batchSize < limit) {
        return false;
    }
    setStatus(CrawlStatus.INDEXATION);
    client.updateDocuments(buffer);

    // Collect the unique keys of the batch; they feed the ids callback and the HTTP callback.
    List<String> pkList = null;
    final SchemaField uniqueField = client.getSchema().getFieldList().getUniqueField();
    if (uniqueField != null) {
        final String uniqueFieldName = uniqueField.getName();
        pkList = new ArrayList<String>(batchSize);
        for (IndexDocument bufferedDocument : buffer) {
            pkList.add(bufferedDocument.getFieldValueString(uniqueFieldName, 0));
        }
        if (idsCallback != null) {
            idsCallback.addAll(pkList);
        }
    }
    doCallBack(context.downloader, pkList);

    // The batch moves from "pending" to "updated" only once the callback has succeeded.
    pendingIndexDocumentCount.addAndGet(-batchSize);
    updatedIndexDocumentCount.addAndGet(batchSize);
    buffer.clear();
    if (infoCallback != null) {
        infoCallback.setInfo(updatedIndexDocumentCount + " document(s) indexed");
    }
    return true;
}
/**
 * Maps one JSON document to an index document, buffers it, and indexes the buffer once it has
 * reached the buffer size of the context.
 *
 * @param context
 *            the crawl context holding the field map and the document buffer
 * @param document
 *            the JSON document to map
 */
private void runDocument(RestCrawlContext context, Object document) throws Exception {
    setStatus(CrawlStatus.CRAWL);
    final IndexDocument mappedDocument = new IndexDocument(fieldMapContext.lang);
    context.restFieldMap.mapJson(fieldMapContext, document, mappedDocument);
    context.indexDocumentList.add(mappedDocument);
    pendingIndexDocumentCount.incrementAndGet();
    final boolean flushed = index(context, context.bufferSize);
    if (flushed) {
        // index() switched the status to INDEXATION; go back to crawling.
        setStatus(CrawlStatus.CRAWL);
    }
}
/**
 * Processes the result of the JSON path evaluation, which is either a single document (a map)
 * or a list of documents.
 *
 * @param context
 *            the crawl context
 * @param jsonDoc
 *            the JSON path result, may be null
 * @return 1 for a map, the size of the list for a list, 0 for null or any other type
 */
private int runDocumentList(RestCrawlContext context, Object jsonDoc) throws Exception {
    if (jsonDoc instanceof Map<?, ?>) {
        runDocument(context, jsonDoc);
        return 1;
    }
    // instanceof is false for null, so this guard also covers the missing result.
    if (!(jsonDoc instanceof List<?>)) {
        return 0;
    }
    final List<?> jsonDocuments = (List<?>) jsonDoc;
    for (Object jsonDocument : jsonDocuments) {
        runDocument(context, jsonDocument);
    }
    return jsonDocuments.size();
}
/**
 * Requests the given URI with the method and credential of the crawl item, evaluates the JSON
 * path on the response body and processes the selected documents.
 *
 * NOTE(review): the status of the response is not checked before the body is parsed - confirm
 * that this is intended.
 *
 * @param context
 *            the crawl context holding the downloader and the compiled JSON path
 * @param uri
 *            the URI to request
 * @return the number of documents found, 0 when the JSON path matches nothing
 */
private int runDownload(RestCrawlContext context, URI uri) throws Exception {
    DownloadItem dlItem = context.downloader.request(uri, restCrawlItem.getMethod(), restCrawlItem.getCredential(),
            null, null, null);
    try {
        // The result is held as Object on purpose: JsonPath.read is generic, so assigning it to a
        // List makes the compiler insert a cast that throws ClassCastException whenever the path
        // selects a single JSON object (a Map), a case runDocumentList handles explicitly.
        Object documents = context.jsonPath.read(dlItem.getContentInputStream());
        return runDocumentList(context, documents);
    } catch (PathNotFoundException e) {
        // The path matched nothing: not an error, there is simply no document.
        return 0;
    }
}
/**
 * Evaluates the JSON path on a local file and processes the selected documents. The file is
 * deleted afterwards when the method of the crawl item is DELETE.
 *
 * NOTE(review): the deletion happens once the documents are buffered; they may not have been
 * sent to the index yet, since the buffer is only flushed when full or at the end of the run.
 *
 * @param context
 *            the crawl context holding the compiled JSON path
 * @param file
 *            the JSON file to read
 * @return the number of documents found, 0 when the JSON path matches nothing
 */
private int runFile(RestCrawlContext context, File file) throws Exception {
    int documentCount;
    try {
        final Object parsed = context.jsonPath.read(file);
        documentCount = runDocumentList(context, parsed);
    } catch (PathNotFoundException e) {
        documentCount = 0;
    }
    final boolean deleteAfterRead = Method.DELETE == restCrawlItem.getMethod();
    if (deleteAfterRead) {
        FileUtils.delete(file);
    }
    return documentCount;
}
/**
 * Processes a {@code file:} URI: either the single file it designates, or every regular file
 * directly contained in the directory it designates (sub-directories are not visited).
 *
 * @param context
 *            the crawl context
 * @param uri
 *            a {@code file:} URI designating a file or a directory
 * @return the total number of documents found
 * @throws IOException
 *             if the path is not a regular file and its content cannot be listed (it does not
 *             exist, is not a directory, or cannot be read)
 */
private int runFiles(RestCrawlContext context, URI uri) throws Exception {
    File rootFile = new File(uri);
    if (rootFile.isFile())
        return runFile(context, rootFile);
    // listFiles returns null instead of throwing when the path is not a readable directory;
    // fail with an explicit message instead of a NullPointerException in the loop below.
    File[] files = rootFile.listFiles((FileFilter) FileFileFilter.FILE);
    if (files == null)
        throw new IOException("Unable to list the files of: " + rootFile.getAbsolutePath());
    int res = 0;
    for (File file : files)
        res += runFile(context, file);
    return res;
}
/**
 * Crawls one URI: {@code file:} URIs are read from the local file system, any other scheme goes
 * through the downloader.
 *
 * @param context
 *            the crawl context
 * @param uri
 *            the URI to crawl
 * @return the number of documents found
 */
private int runURL(RestCrawlContext context, URI uri) throws Exception {
    setStatus(CrawlStatus.CRAWL);
    final boolean localFile = "file".equals(uri.getScheme());
    return localFile ? runFiles(context, uri) : runDownload(context, uri);
}
/**
 * Crawls the URL of the crawl item repeatedly, adding the sequence parameter with successive
 * values. The sequence goes from the start value (inclusive, default 0) to the end value
 * (exclusive, default 100) by the increment (default 1), and stops early as soon as a request
 * yields no document.
 *
 * @param context
 *            the crawl context
 * @throws IllegalArgumentException
 *             if the increment is zero or negative, because the sequence would then never reach
 *             its end value
 */
private void runSequence(RestCrawlContext context) throws Exception {
    final Integer from = restCrawlItem.getSequenceFromInclusive();
    final Integer to = restCrawlItem.getSequenceToExclusive();
    final Integer step = restCrawlItem.getSequenceIncrement();
    final int start = from == null ? 0 : from;
    final int end = to == null ? 100 : to;
    final int inc = step == null ? 1 : step;
    // A zero increment requests the same URL forever and a negative one moves away from the
    // end value; both only stopped when a response happened to contain no document.
    if (inc <= 0)
        throw new IllegalArgumentException("The sequence increment must be greater than zero: " + inc);
    // Both values are invariant during the loop.
    final String url = restCrawlItem.getUrl();
    final String sequenceParameter = restCrawlItem.getSequenceParameter();
    for (int i = start; i < end; i += inc) {
        // A fresh builder per iteration: addParameter accumulates parameters.
        URIBuilder uriBuilder = new URIBuilder(url);
        uriBuilder.addParameter(sequenceParameter, Integer.toString(i));
        if (runURL(context, uriBuilder.build()) == 0)
            break;
    }
}
/**
 * Runs the crawl: obtains a downloader, crawls either the single URL or the sequence of URLs
 * (when a sequence parameter is configured), flushes the documents still buffered, and finally
 * releases the downloader.
 */
@Override
public void runner() throws Exception {
    final HttpDownloader httpDownloader = getConfig().getWebCrawlMaster().getNewHttpDownloader(true);
    setStatus(CrawlStatus.STARTING);
    try {
        final RestCrawlContext crawlContext = new RestCrawlContext(httpDownloader, restCrawlItem);
        final boolean sequenced = !StringUtils.isEmpty(restCrawlItem.getSequenceParameter());
        if (sequenced) {
            runSequence(crawlContext);
        } else {
            runURL(crawlContext, new URI(restCrawlItem.getUrl()));
        }
        // A limit of 0 indexes whatever is left in the buffer.
        index(crawlContext, 0);
    } finally {
        if (httpDownloader != null) {
            httpDownloader.release();
        }
    }
}
}