package focusedCrawler.target;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import focusedCrawler.config.ConfigService;
import focusedCrawler.target.classifier.TargetClassifier;
import focusedCrawler.target.classifier.TargetClassifierException;
import focusedCrawler.target.classifier.TargetClassifierFactory;
import focusedCrawler.target.classifier.TargetRelevance;
import focusedCrawler.target.model.Page;
import focusedCrawler.target.repository.ElasticSearchRestTargetRepository;
import focusedCrawler.target.repository.ElasticSearchTargetRepository;
import focusedCrawler.target.repository.FileSystemTargetRepository;
import focusedCrawler.target.repository.FileSystemTargetRepository.DataFormat;
import focusedCrawler.target.repository.FilesTargetRepository;
import focusedCrawler.target.repository.TargetRepository;
import focusedCrawler.target.repository.elasticsearch.ElasticSearchConfig;
import focusedCrawler.util.CommunicationException;
import focusedCrawler.util.LangDetection;
import focusedCrawler.util.storage.Storage;
import focusedCrawler.util.storage.StorageConfig;
import focusedCrawler.util.storage.StorageDefault;
import focusedCrawler.util.storage.StorageException;
import focusedCrawler.util.storage.distribution.StorageBinder;
import focusedCrawler.util.storage.distribution.StorageCreator;
/**
 * Storage that receives downloaded pages, optionally classifies them, persists them in a
 * {@link TargetRepository} and forwards them to the link storage.
 *
 * <p>A {@code null} classifier is allowed; in that case every page is treated as
 * {@link TargetRelevance#RELEVANT}.
 */
public class TargetStorage extends StorageDefault {

    public static final Logger logger = LoggerFactory.getLogger(TargetStorage.class);

    private final TargetRepository targetRepository;
    private final Storage linkStorage;
    /** May be {@code null}, meaning that no classification is performed. */
    private final TargetClassifier targetClassifier;
    private final TargetStorageConfig config;
    private final LangDetection langDetector = new LangDetection();
    private final TargetStorageMonitor monitor;

    /**
     * Creates a target storage from its collaborators.
     *
     * @param targetClassifier classifier used to compute page relevance, or {@code null} to
     *        treat every page as relevant
     * @param targetRepository repository where accepted pages are stored
     * @param linkStorage storage that receives the pages selected for forwarding
     * @param monitor monitor used to count processed pages
     * @param config settings that drive the decisions taken in {@link #insert(Object)}
     */
    public TargetStorage(TargetClassifier targetClassifier,
                         TargetRepository targetRepository,
                         Storage linkStorage,
                         TargetStorageMonitor monitor,
                         TargetStorageConfig config) {
        this.targetClassifier = targetClassifier;
        this.targetRepository = targetRepository;
        this.linkStorage = linkStorage;
        this.config = config;
        this.monitor = monitor;
    }

    /**
     * Inserts a page into the repository.
     *
     * <p>Processing steps:
     * <ol>
     * <li>If English language detection is enabled and the page is not detected as English,
     * the page is ignored.</li>
     * <li>The page is classified (or assumed relevant when there is no classifier) and the
     * relevance is recorded on the page.</li>
     * <li>The page is stored in the target repository when it is relevant or when saving of
     * negative pages is enabled.</li>
     * <li>Relevant pages are always forwarded to the link storage (and flagged as authority
     * in bipartite mode). Non-relevant pages are forwarded only when hard focus is disabled
     * and, in bipartite mode, only if the page is a hub.</li>
     * <li>The page is counted by the monitor; if the total number of pages is greater than
     * the configured visited page limit, the JVM is terminated via {@code System.exit(0)}.</li>
     * </ol>
     *
     * <p>A {@link TargetClassifierException} is logged and not rethrown, so the remaining
     * steps are skipped for that page.
     *
     * @param obj the page to insert; must be an instance of {@link Page}
     * @return always {@code null}
     * @throws StorageException if a {@link CommunicationException} happens while inserting
     */
    @Override
    public Object insert(Object obj) throws StorageException {
        Page page = (Page) obj;

        // Only accept English language
        if (config.isEnglishLanguageDetectionEnabled() && !langDetector.isEnglish(page)) {
            logger.info("Ignoring non-English page: {}", page.getURL());
            return null;
        }

        try {
            TargetRelevance relevance = (targetClassifier != null)
                    ? targetClassifier.classify(page)
                    : TargetRelevance.RELEVANT;
            page.setTargetRelevance(relevance);

            if (relevance.isRelevant() || config.isSaveNegativePages()) {
                targetRepository.insert(page);
            }

            if (relevance.isRelevant()) {
                if (config.isBipartite()) {
                    // mark the page as authority if using backlinks
                    page.setAuth(true);
                }
                linkStorage.insert(page);
            } else if (!config.isHardFocus() && (!config.isBipartite() || page.isHub())) {
                // Without hard focus, non-relevant pages are still forwarded; in bipartite
                // mode this is restricted to hub pages.
                linkStorage.insert(page);
            }

            monitor.countPage(page, relevance.isRelevant(), relevance.getRelevance());

            if (monitor.getTotalOfPages() > config.getVisitedPageLimit()) {
                logger.info("Visited page limit exceeded. Exiting crawler. pagelimit={}",
                        config.getVisitedPageLimit());
                System.exit(0);
            }
        } catch (CommunicationException ex) {
            logger.error("Communication error while inserting.", ex);
            throw new StorageException(ex.getMessage(), ex);
        } catch (TargetClassifierException tce) {
            // Classification failures are logged and the page is skipped on purpose.
            logger.error("Classification error while inserting.", tce);
        }
        return null;
    }

    /**
     * Creates a target storage connected to the configured link storage and binds it using
     * the target storage server configuration. Any exception raised during startup is logged
     * and not propagated.
     *
     * @param configPath passed through to {@link #createTargetStorage}
     * @param modelPath path given to the classifier factory when a classifier is used
     * @param dataPath base path used for the target repository and the monitor
     * @param indexName Elasticsearch index name (used by the {@code ELASTICSEARCH} format)
     * @param typeName Elasticsearch type name (used by the {@code ELASTICSEARCH} format)
     * @param config service providing the target and link storage configurations
     */
    public static void runServer(String configPath, String modelPath, String dataPath,
                                 String indexName, String typeName, ConfigService config) {
        try {
            TargetStorageConfig targetStorageConfig = config.getTargetStorageConfig();
            StorageConfig linkStorageConfig =
                    config.getLinkStorageConfig().getStorageServerConfig();
            Storage linkStorage = new StorageCreator(linkStorageConfig).produce();

            Storage targetStorage = createTargetStorage(configPath, modelPath, dataPath,
                    indexName, typeName, targetStorageConfig, linkStorage);

            StorageBinder binder =
                    new StorageBinder(targetStorageConfig.getStorageServerConfig());
            binder.bind(targetStorage);
        } catch (Exception e) {
            logger.error("Error while starting TargetStorage", e);
        }
    }

    /**
     * Builds a {@link TargetStorage} with its classifier (if enabled in the configuration),
     * target repository and monitor.
     *
     * @param configPath not used by this method
     * @param modelPath path given to the classifier factory when a classifier is used
     * @param dataPath base path used for the target repository and the monitor
     * @param esIndexName Elasticsearch index name (used by the {@code ELASTICSEARCH} format)
     * @param esTypeName Elasticsearch type name (used by the {@code ELASTICSEARCH} format)
     * @param config target storage configuration
     * @param linkStorage storage that will receive the forwarded pages
     * @return the newly created target storage
     * @throws IOException declared by the method signature for failures while building the
     *         storage components
     * @throws IllegalArgumentException if the configured data format is missing or unknown,
     *         or if the Elasticsearch index name is missing for the {@code ELASTICSEARCH}
     *         format
     */
    public static Storage createTargetStorage(String configPath, String modelPath,
                                              String dataPath, String esIndexName,
                                              String esTypeName, TargetStorageConfig config,
                                              Storage linkStorage)
            throws IOException {

        // The classifier is optional; null means "accept every page as relevant".
        TargetClassifier targetClassifier = null;
        if (config.isUseClassifier()) {
            targetClassifier = TargetClassifierFactory.create(modelPath);
        }

        TargetRepository targetRepository =
                createTargetRepository(dataPath, esIndexName, esTypeName, config);

        TargetStorageMonitor monitor = new TargetStorageMonitor(dataPath);

        return new TargetStorage(targetClassifier, targetRepository, linkStorage, monitor,
                config);
    }

    /**
     * Instantiates the {@link TargetRepository} that matches the configured data format.
     *
     * @throws IllegalArgumentException if the data format is {@code null} or not one of the
     *         supported values, or if the Elasticsearch index name is {@code null} or empty
     *         when the format is {@code ELASTICSEARCH}
     */
    private static TargetRepository createTargetRepository(String dataPath,
                                                           String esIndexName,
                                                           String esTypeName,
                                                           TargetStorageConfig config) {
        Path targetDirectory = Paths.get(dataPath, config.getTargetStorageDirectory());
        String dataFormat = config.getDataFormat();
        boolean compressData = config.getCompressData();
        boolean hashFilename = config.getHashFileName();

        logger.info("Using DATA_FORMAT: {}", dataFormat);

        // A switch over a null String throws NullPointerException; report a missing format
        // with the same exception used for unknown formats instead.
        if (dataFormat == null) {
            throw new IllegalArgumentException("Invalid data format provided: " + dataFormat);
        }

        switch (dataFormat) {
            case "FILES":
                return new FilesTargetRepository(targetDirectory, config.getMaxFileSize());
            case "FILESYSTEM_JSON":
                return new FileSystemTargetRepository(targetDirectory, DataFormat.JSON,
                        hashFilename, compressData);
            case "FILESYSTEM_CBOR":
                return new FileSystemTargetRepository(targetDirectory, DataFormat.CBOR,
                        hashFilename, compressData);
            case "FILESYSTEM_HTML":
                return new FileSystemTargetRepository(targetDirectory, DataFormat.HTML,
                        hashFilename, compressData);
            case "ELASTICSEARCH":
                if (esIndexName == null || esIndexName.isEmpty()) {
                    throw new IllegalArgumentException(
                            "ElasticSearch index name not provided!");
                }
                if (esTypeName == null || esTypeName.isEmpty()) {
                    esTypeName = "page";
                }
                ElasticSearchConfig esconfig = config.getElasticSearchConfig();
                // The REST repository is used only when REST API hosts are configured.
                if (esconfig.getRestApiHosts() == null) {
                    return new ElasticSearchTargetRepository(esconfig, esIndexName,
                            esTypeName);
                } else {
                    return new ElasticSearchRestTargetRepository(esconfig, esIndexName,
                            esTypeName);
                }
            default:
                throw new IllegalArgumentException(
                        "Invalid data format provided: " + dataFormat);
        }
    }

    /**
     * Closes the target repository and the monitor. The monitor is closed even if closing
     * the repository throws.
     */
    public void close() {
        try {
            targetRepository.close();
        } finally {
            monitor.close();
        }
    }
}