/*
* Licensed to David Pilato (the "Author") under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Author licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package fr.pilato.elasticsearch.crawler.fs;
import fr.pilato.elasticsearch.crawler.fs.client.BulkProcessor;
import fr.pilato.elasticsearch.crawler.fs.client.DeleteRequest;
import fr.pilato.elasticsearch.crawler.fs.client.ElasticsearchClientManager;
import fr.pilato.elasticsearch.crawler.fs.client.IndexRequest;
import fr.pilato.elasticsearch.crawler.fs.client.SearchResponse;
import fr.pilato.elasticsearch.crawler.fs.fileabstractor.FileAbstractModel;
import fr.pilato.elasticsearch.crawler.fs.fileabstractor.FileAbstractor;
import fr.pilato.elasticsearch.crawler.fs.fileabstractor.FileAbstractorFile;
import fr.pilato.elasticsearch.crawler.fs.fileabstractor.FileAbstractorSSH;
import fr.pilato.elasticsearch.crawler.fs.meta.doc.Attributes;
import fr.pilato.elasticsearch.crawler.fs.meta.doc.Doc;
import fr.pilato.elasticsearch.crawler.fs.meta.doc.DocParser;
import fr.pilato.elasticsearch.crawler.fs.meta.doc.PathParser;
import fr.pilato.elasticsearch.crawler.fs.meta.job.FsJob;
import fr.pilato.elasticsearch.crawler.fs.meta.job.FsJobFileHandler;
import fr.pilato.elasticsearch.crawler.fs.meta.settings.FsSettings;
import fr.pilato.elasticsearch.crawler.fs.meta.settings.FsSettingsFileHandler;
import fr.pilato.elasticsearch.crawler.fs.rest.RestServer;
import fr.pilato.elasticsearch.crawler.fs.tika.XmlDocParser;
import fr.pilato.elasticsearch.crawler.fs.util.FsCrawlerUtil;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.time.LocalDateTime;
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import static fr.pilato.elasticsearch.crawler.fs.FsCrawlerValidator.validateSettings;
import static fr.pilato.elasticsearch.crawler.fs.client.ElasticsearchClient.extractFromPath;
import static fr.pilato.elasticsearch.crawler.fs.tika.TikaDocParser.generate;
/**
 * File System crawler which scans a local or remote (SSH) directory tree on a regular basis
 * and indexes the documents it finds into elasticsearch. A typical lifecycle:
 * <pre>
 * FsCrawlerImpl crawler = new FsCrawlerImpl(config, settings);
 * crawler.start();
 * // ... the crawler runs in its own thread ...
 * crawler.close();
 * </pre>
 *
 * @author dadoonet (David Pilato)
 */
public class FsCrawlerImpl {
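/**
 * Protocols supported to access the files: local filesystem or SSH.
 */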
public static final class PROTOCOL {
public static final String LOCAL = "local";
public static final String SSH = "ssh";
public static final int SSH_PORT = 22;
}
private static final Logger logger = LogManager.getLogger(FsCrawlerImpl.class);
private static final String PATH_ROOT = Doc.FIELD_NAMES.PATH + "." + fr.pilato.elasticsearch.crawler.fs.meta.doc.Path.FIELD_NAMES.ROOT;
private static final String FILE_FILENAME = Doc.FIELD_NAMES.FILE + "." + fr.pilato.elasticsearch.crawler.fs.meta.doc.File.FIELD_NAMES.FILENAME;
private final AtomicInteger runNumber = new AtomicInteger(0);
private static final String FSCRAWLER_PROPERTIES = "fscrawler.properties";
public static final Properties properties;
static {
properties = new Properties();
try {
properties.load(FsCrawler.class.getClassLoader().getResourceAsStream(FSCRAWLER_PROPERTIES));
} catch (IOException e) {
logger.error("Can not find [{}] resource in the class loader", FSCRAWLER_PROPERTIES);
throw new RuntimeException(e);
}
}
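// Maximum number of hits we fetch per search when looking for existing files/folders.
// Note that elasticsearch caps a single request at 10000 hits by default (index.max_result_window).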
private static final int REQUEST_SIZE = 10000;
public static final int LOOP_INFINITE = -1;
private volatile boolean closed = false;
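// Lets close() wake up the crawler thread while it sleeps between two runs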
private final Object semaphore = new Object();
private final FsSettings settings;
private final FsJobFileHandler fsJobFileHandler;
private final Integer loop;
private final boolean updateMapping;
private final boolean rest;
private MessageDigest messageDigest = null;
private Thread fsCrawlerThread;
private final ElasticsearchClientManager esClientManager;
public FsCrawlerImpl(Path config, FsSettings settings) {
this(config, settings, LOOP_INFINITE, false, false);
}
public FsCrawlerImpl(Path config, FsSettings settings, Integer loop, boolean updateMapping, boolean rest) {
/*
 * We store config files here...
 * Default to ~/.fscrawler
 * The dir will be created if needed by the following constructor call.
 */
new FsSettingsFileHandler(config);
this.fsJobFileHandler = new FsJobFileHandler(config);
this.settings = settings;
this.loop = loop;
this.rest = rest;
this.updateMapping = updateMapping;
this.esClientManager = new ElasticsearchClientManager(config, settings);
closed = validateSettings(logger, settings, rest);
if (closed) {
// We don't go further as we have critical errors
return;
}
// Generate the directory where we write status and other files
Path jobSettingsFolder = config.resolve(settings.getName());
try {
Files.createDirectories(jobSettingsFolder);
} catch (IOException e) {
throw new RuntimeException("Can not create the job config directory", e);
}
// Create MessageDigest instance
if (settings.getFs().getChecksum() != null) {
try {
messageDigest = MessageDigest.getInstance(settings.getFs().getChecksum());
} catch (NoSuchAlgorithmException e) {
throw new RuntimeException("This should never happen as we checked that previously");
}
}
}
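/**
 * Start the crawler: open the elasticsearch client, create the index and mappings if needed,
 * start the REST server when enabled, and launch the crawler thread unless we run in REST-only mode (loop == 0).
 */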
public void start() throws Exception {
logger.info("Starting FS crawler");
if (loop < 0) {
logger.info("FS crawler started in watch mode. It will run unless you stop it with CTRL+C.");
}
esClientManager.start();
esClientManager.createIndexAndMappings(settings, updateMapping);
if (loop == 0 && !rest) {
closed = true;
}
if (closed) {
logger.info("Fs crawler is closed. Exiting");
return;
}
// Start the REST Server if needed
if (rest) {
RestServer.start(settings, esClientManager);
logger.info("FS crawler Rest service started on [{}]", settings.getRest().url());
}
// Start the crawler thread - but not if only in rest mode
if (loop != 0) {
fsCrawlerThread = new Thread(new FSParser(settings), "fs-crawler");
fsCrawlerThread.start();
}
}
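/**
 * Close the crawler: mark it as closed, wake up the crawler thread if it is sleeping,
 * wait for it to stop, then shut down the REST server and the elasticsearch client.
 */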
public void close() throws InterruptedException {
logger.debug("Closing FS crawler [{}]", settings.getName());
closed = true;
synchronized(semaphore) {
semaphore.notify();
}
if (this.fsCrawlerThread != null) {
while (fsCrawlerThread.isAlive()) {
// We check that the crawler has been closed effectively
logger.debug("FS crawler thread is still running");
Thread.sleep(500);
}
logger.debug("FS crawler thread is now stopped");
}
// Stop the REST Server if needed
RestServer.close();
logger.debug("FS crawler Rest service stopped");
esClientManager.close();
logger.debug("ES Client Manager stopped");
logger.info("FS crawler [{}] stopped", settings.getName());
}
public boolean isClosed() {
return closed;
}
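/**
 * The job executed by the crawler thread: on each run it scans the directory tree,
 * indexes new or modified documents, optionally removes deleted ones, then sleeps
 * for the configured update rate before the next run.
 */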
private class FSParser implements Runnable {
private final FsSettings fsSettings;
private ScanStatistic stats;
public FSParser(FsSettings fsSettings) {
this.fsSettings = fsSettings;
logger.debug("creating fs crawler thread [{}] for [{}] every [{}]", fsSettings.getName(),
fsSettings.getFs().getUrl(),
fsSettings.getFs().getUpdateRate());
}
@Override
public void run() {
logger.info("FS crawler started for [{}] for [{}] every [{}]", fsSettings.getName(),
fsSettings.getFs().getUrl(),
fsSettings.getFs().getUpdateRate());
while (true) {
if (closed) {
logger.debug("FS crawler thread [{}] is now marked as closed...", fsSettings.getName());
return;
}
int run = runNumber.incrementAndGet();
FileAbstractor path = null;
try {
logger.debug("Fs crawler thread [{}] is now running. Run #{}...", fsSettings.getName(), run);
stats = new ScanStatistic(fsSettings.getFs().getUrl());
path = buildFileAbstractor();
path.open();
if (!path.exists(fsSettings.getFs().getUrl())) {
throw new RuntimeException(fsSettings.getFs().getUrl() + " does not exist.");
}
String rootPathId = SignTool.sign(fsSettings.getFs().getUrl());
stats.setRootPathId(rootPathId);
LocalDateTime scanDateNew = LocalDateTime.now();
LocalDateTime scanDate = getLastDateFromMeta(fsSettings.getName());
// We only index the root directory once (first run)
// That means that we don't have a scanDate yet
if (scanDate == null && fsSettings.getFs().isIndexFolders()) {
indexDirectory(fsSettings.getFs().getUrl());
}
if (scanDate == null) {
scanDate = LocalDateTime.MIN;
}
addFilesRecursively(path, fsSettings.getFs().getUrl(), scanDate);
updateFsJob(fsSettings.getName(), scanDateNew);
} catch (Exception e) {
logger.warn("Error while crawling {}: {}", fsSettings.getFs().getUrl(), e.getMessage());
if (logger.isDebugEnabled()) {
logger.warn("Full stacktrace", e);
}
} finally {
if (path != null) {
try {
path.close();
} catch (Exception e) {
logger.warn("Error while closing the connection: {}", e, e.getMessage());
}
}
}
if (loop > 0 && run >= loop) {
logger.info("FS crawler is stopping after {} run{}", run, run > 1 ? "s" : "");
closed = true;
return;
}
try {
logger.debug("Fs crawler is going to sleep for {}", fsSettings.getFs().getUpdateRate());
// The problem here is that there is no way to close the thread while we are sleeping,
// which leads to zombie threads in our tests.
synchronized (semaphore) {
semaphore.wait(fsSettings.getFs().getUpdateRate().millis());
logger.debug("Fs crawler is now waking up again...");
}
} catch (InterruptedException e) {
logger.debug("Fs crawler thread has been interrupted: [{}]", e.getMessage());
}
}
}
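/**
 * Read the last run date from the job metadata file, or return null if the job has never run yet.
 */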
@SuppressWarnings("unchecked")
private LocalDateTime getLastDateFromMeta(String jobName) throws IOException {
try {
FsJob fsJob = fsJobFileHandler.read(jobName);
return fsJob.getLastrun();
} catch (NoSuchFileException e) {
// The file does not exist yet
}
return null;
}
/**
* Update the job metadata
* @param jobName job name
* @param scanDate date of the last scan
* @throws Exception In case of error
*/
private void updateFsJob(String jobName, LocalDateTime scanDate) throws Exception {
// We remove 2 seconds from the scan date so that we don't skip files whose
// modification date falls within the same second as the end of the scan.
// See #82: https://github.com/dadoonet/fscrawler/issues/82
scanDate = scanDate.minus(2, ChronoUnit.SECONDS);
FsJob fsJob = FsJob.builder()
.setName(jobName)
.setLastrun(scanDate)
.setIndexed(stats.getNbDocScan())
.setDeleted(stats.getNbDocDeleted())
.build();
fsJobFileHandler.write(jobName, fsJob);
}
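/**
 * Build the file abstraction layer matching the configured protocol:
 * local filesystem by default, SSH when requested.
 */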
private FileAbstractor buildFileAbstractor() {
// What is the protocol used?
if (fsSettings.getServer() == null || PROTOCOL.LOCAL.equals(fsSettings.getServer().getProtocol())) {
// Local FS
return new FileAbstractorFile(fsSettings);
} else if (PROTOCOL.SSH.equals(fsSettings.getServer().getProtocol())) {
// Remote SSH FS
return new FileAbstractorSSH(fsSettings);
}
// Unsupported protocol
throw new RuntimeException(fsSettings.getServer().getProtocol() + " is not supported yet. Please use " +
PROTOCOL.LOCAL + " or " + PROTOCOL.SSH);
}
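/**
 * Walk the directory tree starting at filepath: index files that are new or modified since
 * lastScanDate, recurse into subdirectories, and finally compare with elasticsearch to
 * remove documents and folders which have disappeared from disk.
 */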
private void addFilesRecursively(FileAbstractor<?> path, String filepath, LocalDateTime lastScanDate)
throws Exception {
logger.debug("indexing [{}] content", filepath);
final Collection<FileAbstractModel> children = path.getFiles(filepath);
Collection<String> fsFiles = new ArrayList<>();
Collection<String> fsFolders = new ArrayList<>();
if (children != null) {
for (FileAbstractModel child : children) {
String filename = child.name;
// https://github.com/dadoonet/fscrawler/issues/1 : Filter documents
boolean isIndexable = FsCrawlerUtil.isIndexable(filename, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes());
// It can happen that we have a dir "foo" which does not match an include pattern like "*.txt".
// We still need to go into it unless it has been explicitly excluded by the user.
if (child.directory && !FsCrawlerUtil.isExcluded(filename, fsSettings.getFs().getExcludes())) {
isIndexable = true;
}
logger.debug("[{}] can be indexed: [{}]", filename, isIndexable);
if (isIndexable) {
if (child.file) {
logger.debug(" - file: {}", filename);
fsFiles.add(filename);
if (child.lastModifiedDate.isAfter(lastScanDate) ||
(child.creationDate != null && child.creationDate.isAfter(lastScanDate))) {
try {
indexFile(child, stats, filepath,
fsSettings.getFs().isIndexContent() ? path.getInputStream(child) : null, child.size);
stats.addFile();
} catch (java.io.FileNotFoundException e) {
if (fsSettings.getFs().isContinueOnError()) {
logger.warn("Unable to open Input Stream for {}, skipping...", e.getMessage());
} else {
throw e;
}
}
} else {
logger.debug(" - not modified: creation date {} , file date {}, last scan date {}",
child.creationDate, child.lastModifiedDate, lastScanDate);
}
} else if (child.directory) {
logger.debug(" - folder: {}", filename);
if (settings.getFs().isIndexFolders()) {
fsFolders.add(child.fullpath);
indexDirectory(child.fullpath);
}
addFilesRecursively(path, child.fullpath, lastScanDate);
} else {
logger.debug(" - other: {}", filename);
logger.debug("Not a file nor a dir. Skipping {}", child.fullpath);
}
} else {
logger.debug(" - ignored file/dir: {}", filename);
}
}
}
// TODO Optimize
// if (path.isDirectory() && path.lastModified() > lastScanDate
// && lastScanDate != 0) {
if (fsSettings.getFs().isRemoveDeleted()) {
logger.debug("Looking for removed files in [{}]...", filepath);
Collection<String> esFiles = getFileDirectory(filepath);
// Remove files which have been deleted on disk
for (String esfile : esFiles) {
logger.trace("Checking file [{}]", esfile);
if (FsCrawlerUtil.isIndexable(esfile, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes())
&& !fsFiles.contains(esfile)) {
logger.trace("Removing file [{}] in elasticsearch", esfile);
esDelete(fsSettings.getElasticsearch().getIndex(), fsSettings.getElasticsearch().getType(),
generateIdFromFilename(esfile, filepath));
stats.removeFile();
}
}
if (settings.getFs().isIndexFolders()) {
logger.debug("Looking for removed directories in [{}]...", filepath);
Collection<String> esFolders = getFolderDirectory(filepath);
// Remove folders which have been deleted on disk
for (String esfolder : esFolders) {
if (FsCrawlerUtil.isIndexable(esfolder, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes())) {
logger.trace("Checking directory [{}]", esfolder);
if (!fsFolders.contains(esfolder)) {
logger.trace("Removing recursively directory [{}] in elasticsearch", esfolder);
removeEsDirectoryRecursively(esfolder);
}
}
}
}
}
}
// TODO Optimize it. We can probably use a single search on a big array of filenames instead of
// searching for 10000 files at a time (which is somewhat limited).
private Collection<String> getFileDirectory(String path)
throws Exception {
Collection<String> files = new ArrayList<>();
// If the crawler is being closed, we return
if (closed) {
return files;
}
logger.trace("Querying elasticsearch for files in dir [{}:{}]", PATH_ROOT, SignTool.sign(path));
SearchResponse response = esClientManager.client().search(
fsSettings.getElasticsearch().getIndex(),
fsSettings.getElasticsearch().getType(),
PATH_ROOT + ":" + SignTool.sign(path),
REQUEST_SIZE, // TODO: WHAT? DID I REALLY WRITE THAT? :p
"_source", FILE_FILENAME
);
logger.trace("Response [{}]", response.toString());
if (response.getHits() != null && response.getHits().getHits() != null) {
for (SearchResponse.Hit hit : response.getHits().getHits()) {
String name;
if (hit.getSource() != null
&& extractFromPath(hit.getSource(), Doc.FIELD_NAMES.FILE).get(fr.pilato.elasticsearch.crawler.fs.meta.doc
.File.FIELD_NAMES.FILENAME) != null) {
name = (String) extractFromPath(hit.getSource(), Doc.FIELD_NAMES.FILE).get(fr.pilato.elasticsearch.crawler.fs.meta.doc.File.FIELD_NAMES.FILENAME);
} else if (hit.getFields() != null
&& hit.getFields().get(FILE_FILENAME) != null) {
// In case someone disabled _source which is not recommended
name = getName(hit.getFields().get(FILE_FILENAME));
} else {
// Houston, we have a problem! We can't get the old files from ES
logger.warn("Can't find in _source nor fields the existing filenames in path [{}]. " +
"Please enable _source or store field [{}]", path, FILE_FILENAME);
throw new RuntimeException("Mapping is incorrect: please enable _source or store field [" +
FILE_FILENAME + "].");
}
files.add(name);
}
}
return files;
}
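// Stored fields come back from elasticsearch as a list of values, so we unwrap the single expected filename.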
private String getName(Object nameObject) {
if (nameObject instanceof List) {
return String.valueOf(((List<?>) nameObject).get(0));
}
throw new RuntimeException("Search result [" + nameObject + "] is not of type List but [" +
nameObject.getClass().getName() + "]");
}
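/**
 * Query elasticsearch for the subfolders which were previously indexed under the given path.
 */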
private Collection<String> getFolderDirectory(String path) throws Exception {
Collection<String> files = new ArrayList<>();
// If the crawler is being closed, we return
if (closed) {
return files;
}
SearchResponse response = esClientManager.client().search(
fsSettings.getElasticsearch().getIndex(),
FsCrawlerUtil.INDEX_TYPE_FOLDER,
fr.pilato.elasticsearch.crawler.fs.meta.doc.Path.FIELD_NAMES.ROOT + ":" + SignTool.sign(path),
REQUEST_SIZE // TODO: WHAT? DID I REALLY WRITE THAT? :p
);
if (response.getHits() != null && response.getHits().getHits() != null) {
for (SearchResponse.Hit hit : response.getHits().getHits()) {
String name = hit.getSource().get(fr.pilato.elasticsearch.crawler.fs.meta.doc.Path.FIELD_NAMES.REAL).toString();
files.add(name);
}
}
return files;
}
/**
 * Index a file: build the Doc metadata (file, path, attributes), extract the content
 * (Tika by default, raw json/xml when enabled) and send it to the bulk processor.
 */
private void indexFile(FileAbstractModel fileAbstractModel, ScanStatistic stats, String dirname, InputStream inputStream,
long filesize) throws Exception {
final String filename = fileAbstractModel.name;
final LocalDateTime lastmodified = fileAbstractModel.lastModifiedDate;
final String extension = fileAbstractModel.extension;
final long size = fileAbstractModel.size;
logger.debug("fetching content from [{}],[{}]", dirname, filename);
try {
// Create the Doc object (only needed when we have add_as_inner_object: true (default) or when we don't index json or xml)
if (fsSettings.getFs().isAddAsInnerObject() || (!fsSettings.getFs().isJsonSupport() && !fsSettings.getFs().isXmlSupport())) {
String fullFilename = new File(dirname, filename).toString();
Doc doc = new Doc();
// File
doc.getFile().setFilename(filename);
doc.getFile().setLastModified(lastmodified);
doc.getFile().setIndexingDate(LocalDateTime.now());
doc.getFile().setUrl("file://" + fullFilename);
doc.getFile().setExtension(extension);
if (fsSettings.getFs().isAddFilesize()) {
doc.getFile().setFilesize(size);
}
// File
// Path
// Encoded version of the dir this file belongs to
doc.getPath().setRoot(SignTool.sign(dirname));
// The virtual URL (not including the initial root dir)
doc.getPath().setVirtual(FsCrawlerUtil.computeVirtualPathName(stats.getRootPath(), fullFilename));
// The real and complete filename
doc.getPath().setReal(fullFilename);
// Path
// Attributes
if (fsSettings.getFs().isAttributesSupport()) {
doc.setAttributes(new Attributes());
doc.getAttributes().setOwner(fileAbstractModel.owner);
doc.getAttributes().setGroup(fileAbstractModel.group);
}
// Attributes
if (fsSettings.getFs().isIndexContent()) {
// If needed, we generate the content in addition to metadata
if (fsSettings.getFs().isJsonSupport()) {
// https://github.com/dadoonet/fscrawler/issues/5 : Support JSon files
doc.setObject(DocParser.asMap(read(inputStream)));
} else if (fsSettings.getFs().isXmlSupport()) {
// https://github.com/dadoonet/fscrawler/issues/185 : Support Xml files
doc.setObject(XmlDocParser.generateMap(inputStream));
} else {
// Extracting content with Tika
generate(fsSettings, inputStream, filename, doc, messageDigest, filesize);
}
}
// We index the data structure
esIndex(esClientManager.bulkProcessor(), fsSettings.getElasticsearch().getIndex(),
fsSettings.getElasticsearch().getType(),
generateIdFromFilename(filename, dirname),
doc);
} else if (fsSettings.getFs().isIndexContent()) {
if (fsSettings.getFs().isJsonSupport()) {
// We index the json content directly
esIndex(esClientManager.bulkProcessor(), fsSettings.getElasticsearch().getIndex(),
fsSettings.getElasticsearch().getType(),
generateIdFromFilename(filename, dirname),
read(inputStream));
} else if (fsSettings.getFs().isXmlSupport()) {
// We index the xml content directly
esIndex(esClientManager.bulkProcessor(), fsSettings.getElasticsearch().getIndex(),
fsSettings.getElasticsearch().getType(),
generateIdFromFilename(filename, dirname),
XmlDocParser.generate(inputStream));
}
}
} finally {
// Let's close the stream
if (inputStream != null) {
inputStream.close();
}
}
}
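// The document id is either the raw filename (when filename_as_id is set) or a hash of the full path.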
private String generateIdFromFilename(String filename, String filepath) throws NoSuchAlgorithmException {
return fsSettings.getFs().isFilenameAsId() ? filename : SignTool.sign((new File(filepath, filename)).toString());
}
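// Read the whole stream as a single UTF-8 string; used to index raw json documents.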
private String read(InputStream input) throws IOException {
try (BufferedReader buffer = new BufferedReader(new InputStreamReader(input, StandardCharsets.UTF_8))) {
return buffer.lines().collect(Collectors.joining("\n"));
}
}
/**
* Index a Path object (AKA a folder) in elasticsearch
* @param id id of the path
* @param path path object
* @throws Exception in case of error
*/
private void indexDirectory(String id, fr.pilato.elasticsearch.crawler.fs.meta.doc.Path path) throws Exception {
esIndex(esClientManager.bulkProcessor(), fsSettings.getElasticsearch().getIndex(),
FsCrawlerUtil.INDEX_TYPE_FOLDER,
id,
path);
}
/**
* Index a directory
* @param path complete path like /path/to/subdir
*/
private void indexDirectory(String path) throws Exception {
fr.pilato.elasticsearch.crawler.fs.meta.doc.Path pathObject = new fr.pilato.elasticsearch.crawler.fs.meta.doc.Path();
// The real and complete path
pathObject.setReal(path);
String rootdir = path.substring(0, path.lastIndexOf(File.separator));
// Encoded version of the parent dir
pathObject.setRoot(SignTool.sign(rootdir));
// The virtual URL (not including the initial root dir)
pathObject.setVirtual(FsCrawlerUtil.computeVirtualPathName(stats.getRootPath(), path));
indexDirectory(SignTool.sign(path), pathObject);
}
/**
 * Remove a folder and all its subfolders and files from elasticsearch, recursively
 */
private void removeEsDirectoryRecursively(final String path) throws Exception {
logger.debug("Delete folder [{}]", path);
Collection<String> listFile = getFileDirectory(path);
for (String esfile : listFile) {
esDelete(
fsSettings.getElasticsearch().getIndex(),
fsSettings.getElasticsearch().getType(),
SignTool.sign(path.concat(File.separator).concat(esfile)));
}
Collection<String> listFolder = getFolderDirectory(path);
for (String esfolder : listFolder) {
removeEsDirectoryRecursively(esfolder);
}
esDelete(fsSettings.getElasticsearch().getIndex(), FsCrawlerUtil.INDEX_TYPE_FOLDER, SignTool.sign(path));
}
/**
* Add to bulk an IndexRequest
*/
public void esIndex(BulkProcessor bulkProcessor, String index, String type, String id,
Doc doc) throws Exception {
esIndex(bulkProcessor, index, type, id, DocParser.toJson(doc));
}
public void esIndex(BulkProcessor bulkProcessor, String index, String type, String id, fr.pilato.elasticsearch.crawler.fs.meta.doc.Path path)
throws Exception {
esIndex(bulkProcessor, index, type, id, PathParser.toJson(path));
}
/**
* Add to bulk an IndexRequest in JSon format
*/
public void esIndex(BulkProcessor bulkProcessor, String index, String type, String id, String json) {
logger.debug("Indexing in ES " + index + ", " + type + ", " + id);
logger.trace("JSon indexed : {}", json);
if (!closed) {
bulkProcessor.add(new IndexRequest(index, type, id).source(json));
} else {
logger.warn("trying to add new file while closing crawler. Document [{}]/[{}]/[{}] has been ignored", index, type, id);
}
}
/**
* Add to bulk a DeleteRequest
*/
public void esDelete(String index, String type, String id) {
logger.debug("Deleting from ES " + index + ", " + type + ", " + id);
if (!closed) {
esClientManager.bulkProcessor().add(new DeleteRequest(index, type, id));
} else {
logger.warn("trying to remove a file while closing crawler. Document [{}]/[{}]/[{}] has been ignored", index, type, id);
}
}
}
/**
* Check whether the given CharSequence has actual text.
* More specifically, returns <code>true</code> if the string is not <code>null</code>,
* its length is greater than 0, and it contains at least one non-whitespace character.
* <p><pre>
* StringUtils.hasText(null) = false
* StringUtils.hasText("") = false
* StringUtils.hasText(" ") = false
* StringUtils.hasText("12345") = true
* StringUtils.hasText(" 12345 ") = true
* </pre>
*
* @param str the CharSequence to check (may be <code>null</code>)
* @return <code>true</code> if the CharSequence is not <code>null</code>,
* its length is greater than 0, and it does not contain whitespace only
* @see java.lang.Character#isWhitespace
*/
public static boolean hasText(CharSequence str) {
if (!hasLength(str)) {
return false;
}
int strLen = str.length();
for (int i = 0; i < strLen; i++) {
if (!Character.isWhitespace(str.charAt(i))) {
return true;
}
}
return false;
}
private static boolean hasLength(CharSequence str) {
return str != null && str.length() > 0;
}
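/**
 * Number of crawler runs executed so far.
 */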
public int getRunNumber() {
return runNumber.get();
}
}