package focusedCrawler.tools;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.zip.InflaterInputStream;
import org.apache.commons.codec.binary.Base64;
import org.apache.tika.Tika;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.dataformat.cbor.CBORFactory;
import com.google.common.base.Preconditions;
import focusedCrawler.memex.cdr.CDR2Document;
import focusedCrawler.target.model.Page;
import focusedCrawler.target.model.ParsedData;
import focusedCrawler.target.model.TargetModelCbor;
import focusedCrawler.target.model.TargetModelElasticSearch;
import focusedCrawler.target.model.TargetModelJson;
import focusedCrawler.target.repository.elasticsearch.ElasticSearchClientFactory;
import focusedCrawler.target.repository.elasticsearch.ElasticSearchConfig;
import focusedCrawler.util.CliTool;
import focusedCrawler.util.parser.PaginaURL;
import io.airlift.airline.Command;
import io.airlift.airline.Option;
//
// TODO: Refactor this class to something simpler and easily maintainable
//
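// Example invocation (jar name, directory, and index/type names below are hypothetical):
//
//   java -cp ache.jar focusedCrawler.tools.ElasticSearchIndexer \
//       -if FILE -id /data/crawled-pages -of CDR -oi memex_index -ot page
//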
@Command(name="ElasticSearchIndexer", description="Index crawled data in ElasticSearch")
public class ElasticSearchIndexer extends CliTool {
static final ObjectMapper cborMapper = new ObjectMapper(new CBORFactory());
static final ObjectMapper jsonMapper = new ObjectMapper();
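// Date pattern used to parse the --start-date and --end-date filter options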
static final String format = "yyyy-MM-dd'T'HH:mm:ss";
// Output format option
@Option(name={"-of", "--output-format"}, description="Format used for output data: {ACHE,CDR}")
String outputFormat = "ACHE";
// Elastic Search output options
@Option(name={"-oi", "--output-es-index"}, required=true, description="ElasticSearch index name (output)")
String outputIndex;
@Option(name={"-ot", "--output-es-type"}, required=true, description="ElasticSearch index type (output)")
String outputType;
@Option(name={"-ou", "--output-es-url"}, description="ElasticSearch full HTTP URL address")
String elasticSearchServer = "http://localhost:9200";
@Option(name={"-oa", "--output-es-auth"}, description="User and password for ElasticSearch in format: user:pass")
String userPass = null;
@Option(name={"-obs", "--output-es-bulk-size"}, description="ElasticSearch bulk size")
int bulkSize = 25;
// Input options
@Option(name={"-if", "--input-format"}, description="Format of input data: {CBOR,FILE,ELASTICSEARCH}")
String inputFormat = "FILE";
@Option(name={"-id", "--input-dir"}, description="Input directory, if using CBOR or FILE")
String inputDirectory;
// Elastic Search input options
@Option(name={"-ii", "--input-es-index"}, description="Input ES index, if using ELASTICSEARCH")
String inputIndex;
@Option(name={"-it", "--input-es-type"}, description="Input ES type, if using ELASTICSEARCH")
String inputType;
@Option(name={"-ih", "--input-es-hostname"}, description="Input ES hostname, if using ELASTICSEARCH")
String inputHostname = "localhost";
@Option(name={"-ic", "--input-es-cluster"}, description="Input ES cluster name, if using ELASTICSEARCH")
String inputClusterName = "elasticsearch";
@Option(name={"-ip", "--input-es-port"}, description="Input ES port number, if using ELASTICSEARCH")
int inputPort = 9300;
// Filtering options
@Option(name={"-sd", "--start-date"}, description="Only index data fetcher after this date")
String startStr = null;
@Option(name={"-en", "--end-date"}, description="Only index data fetched before this date")
String endStr = null;
public static void main(String[] args) throws Exception {
CliTool.run(args, new ElasticSearchIndexer());
}
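// CliTool entry point: dispatches to ElasticSearch-to-ElasticSearch re-indexing
// or to the file-based indexing path, depending on --input-format.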
public void execute() throws Exception {
SimpleBulkIndexer bulkIndexer = new SimpleBulkIndexer(elasticSearchServer, userPass, bulkSize);
if(inputFormat.equals("ELASTICSEARCH")) {
indexFromElasticSearch(bulkIndexer, outputFormat, outputIndex, outputType);
}
else {
Date startDate = startStr != null ? new SimpleDateFormat(format).parse(startStr) : null;
Date endDate = endStr != null ? new SimpleDateFormat(format).parse(endStr) : null;
Preconditions.checkNotNull(inputDirectory, "Input directory option can't be null");
Path inputPath = Paths.get(inputDirectory);
indexFromFile(bulkIndexer, outputIndex, outputType, startDate, endDate,
inputPath, outputFormat, inputFormat);
}
bulkIndexer.close();
}
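// Walks the input directory (recursing into subfolders) and indexes each file,
// converting it from the given input format into the requested output schema.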
private void indexFromFile(SimpleBulkIndexer bulkIndexer, String indexName,
String typeName, Date startDate, Date endDate,
Path inputPath, String outputFormat, String inputFormat)
throws IOException {
DirectoryStream<Path> fileStream = Files.newDirectoryStream(inputPath);
for (Path filePath : fileStream) {
File f = filePath.toFile();
if(f.isDirectory()) {
// Recursively index files in the subfolder
indexFromFile(bulkIndexer, indexName, typeName, startDate, endDate,
filePath, outputFormat, inputFormat);
continue;
}
try {
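// Skip files whose last-modified time falls outside the requested date window.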
Date fileDate = new Date(f.lastModified());
if(startDate != null && fileDate.before(startDate)) continue;
if(endDate != null && fileDate.after(endDate)) continue;
String id;
Object doc;
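// CBOR input: each file holds a serialized TargetModelCbor record.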
if (inputFormat.equals("CBOR")) {
TargetModelCbor input = cborMapper.readValue(f, TargetModelCbor.class);
if (outputFormat.equals("ACHE")) {
id = input.url;
doc = new TargetModelElasticSearch(input);
}
else if (outputFormat.equals("CDR")) {
id = null;
doc = new MemexCrawlSchema(input);
}
else {
throw new IllegalArgumentException("Invalid output format = "+outputFormat);
}
}
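// FILE input: each file holds raw page content, with the URL encoded in the file name.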
else if(inputFormat.equals("FILE")){
final byte[] bytes = Files.readAllBytes(filePath);
String fileAsString = new String(bytes, StandardCharsets.UTF_8);
String url = URLDecoder.decode(f.getName(), "UTF-8");
if (outputFormat.equals("ACHE")) {
Page page = new Page(new URL(url), fileAsString);
page.setParsedData(new ParsedData(new PaginaURL(page)));
id = url;
doc = new TargetModelElasticSearch(page);
}
else if (outputFormat.equals("CDR")) {
Tika tika = new Tika();
String mediaType = tika.detect(bytes);
if(mediaType != null && !mediaType.startsWith("text")) {
fileAsString = Base64.encodeBase64String(bytes);
}
id = url;
doc = new MemexCrawlSchema(
url,
f.lastModified(),
"NYU",
"ACHE-script",
fileAsString,
mediaType,
null
);
}
else {
throw new IllegalArgumentException("Invalid output schema");
}
}
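// FILESYSTEM_JSON_ZIP input: each file is a deflate-compressed JSON serialization
// of TargetModelJson; only the CDR2 output schema is supported for this format.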
else if(inputFormat.equals("FILESYSTEM_JSON_ZIP")){
if (outputFormat.equals("CDR2")) {
final byte[] bytes = Files.readAllBytes(filePath);
TargetModelJson pageModel = null;
try(InputStream inflated = new InflaterInputStream(new ByteArrayInputStream(bytes))) {
pageModel = jsonMapper.readValue(inflated, TargetModelJson.class);
}
if(pageModel == null) {
continue;
}
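// Header names are not case-normalized, so try both common spellings;
// only pages served as text/html are indexed.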
List<String> contentTypeHeader = pageModel.getResponseHeaders().get("Content-Type");
if(contentTypeHeader == null) {
contentTypeHeader = pageModel.getResponseHeaders().get("content-type");
}
if(contentTypeHeader == null || contentTypeHeader.isEmpty()) {
continue;
}
if(!contentTypeHeader.get(0).contains("text/html")) {
continue;
}
id = pageModel.getUrl();
HashMap<String, Object> crawlData = new HashMap<>();
crawlData.put("response_headers", pageModel.getResponseHeaders());
doc = new CDR2Document.Builder()
.setUrl(pageModel.getUrl())
.setTimestamp(pageModel.getFetchTime())
.setContentType("text/html")
.setTeam("NYU")
.setCrawler("ACHE")
.setRawContent(pageModel.getContentAsString())
.setCrawlData(crawlData)
.build();
} else {
throw new IllegalArgumentException("Invalid output schema");
}
}
else {
throw new IllegalArgumentException("Invalid input format = "+inputFormat);
}
bulkIndexer.addDocument(indexName, typeName, doc, id);
}
catch(Exception e) {
System.err.println("Problem while indexing file: "+f.getCanonicalPath());
e.printStackTrace();
}
}
}
}
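// Re-indexes documents from an existing ElasticSearch index into the output index
// using the scan/scroll API, optionally converting them to the CDR schema.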
private void indexFromElasticSearch(SimpleBulkIndexer bulkIndexer, String outputFormat,
String outputIndex, String outputType)
throws IOException {
Preconditions.checkNotNull(inputIndex, "Input index can't be null");
Preconditions.checkNotNull(inputType, "Input type can't be null");
ElasticSearchConfig config = new ElasticSearchConfig(inputHostname, inputPort, inputClusterName);
Client client = ElasticSearchClientFactory.createClient(config);
SearchResponse scrollResp = client.prepareSearch(inputIndex)
.setQuery(QueryBuilders.matchAllQuery())
.setTypes(inputType)
.setSearchType(SearchType.SCAN)
.setScroll(new TimeValue(60000))
.setSize(100)
.execute().actionGet();
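// Iterate over the scroll cursor until a scroll request returns no hits.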
while (true) {
for (SearchHit hit : scrollResp.getHits().getHits()) {
String id;
Object doc;
TargetModelElasticSearch pageModel;
final String json = hit.getSourceAsString();
try {
pageModel = jsonMapper.readValue(json, TargetModelElasticSearch.class);
} catch (IOException e) {
throw new RuntimeException("Failed to unserialize json object="+json);
}
if (outputFormat.equals("ACHE")) {
id = pageModel.getUrl();
doc = pageModel;
}
else if (outputFormat.equals("CDR")) {
id = null;
doc = new MemexCrawlSchema(
pageModel.getUrl(),
pageModel.getRetrieved().getTime(),
"NYU",
"ACHE-script",
pageModel.getHtml(),
"text/html",
null
);
}
else {
throw new IllegalArgumentException("Invalid output format ("+outputFormat+")");
}
bulkIndexer.addDocument(outputIndex, outputType, doc, id);
}
scrollResp = client.prepareSearchScroll(scrollResp.getScrollId())
.setScroll(new TimeValue(600000))
.execute().actionGet();
// Break condition: the last scroll request returned no hits.
if (scrollResp.getHits().getHits().length == 0) {
break;
}
}
}
}