package focusedCrawler.memex.cdr;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.zip.GZIPOutputStream;
import focusedCrawler.target.model.TargetModelJson;
import focusedCrawler.target.repository.FileSystemTargetRepository;
import focusedCrawler.target.repository.FileSystemTargetRepository.DataFormat;
import focusedCrawler.target.repository.FilesTargetRepository;
import focusedCrawler.tools.SimpleBulkIndexer;
import focusedCrawler.util.CliTool;
import io.airlift.airline.Command;
import io.airlift.airline.Option;
/**
 * Command-line tool that reads pages stored in an ACHE target repository and
 * exports each textual page as a CDR (v2 or v3) document.
 *
 * <p>Documents can be written to a gzip-compressed file ({@code --output-file}),
 * sent to Elasticsearch through a {@link SimpleBulkIndexer}
 * ({@code --output-es-index}), or both. At least one of the two outputs must be
 * configured.
 */
@Command(name="AcheToCdrExporter", description="Exports crawled data to CDR format")
public class AcheToCdrExporter extends CliTool {

    //
    // Input data options
    //

    @Option(name = "--input-path", description="Path to ACHE data target folder", required=true)
    private String inputPath;

    @Option(name={"--repository-type", "-rt"}, description="Which repository type should be used", required=true)
    private RepositoryType repositoryType;

    /** Kind of ACHE repository found at {@code --input-path}. */
    public enum RepositoryType {
        FILES, FILESYSTEM_JSON;
    }

    @Option(name="--fs-hashed", description="Whether ACHE filesystem repository files names are hashed")
    private boolean hashFilename = false;

    @Option(name="--fs-compressed", description="Whether ACHE filesystem repository files is compressed")
    private boolean compressData = false;

    //
    // Options for output data format
    //

    @Option(name="--cdr-version", description="Which CDR version should be used")
    private CDRVersion cdrVersion = CDRVersion.CDRv2;

    /** CDR schema version used for the exported documents. */
    public enum CDRVersion {
        CDRv2, CDRv3
    }

    @Option(name="--output-file", description="Gziped output file containing data formmated as per CDR schema")
    private String outputFile;

    // Elastic Search output options

    @Option(name={"--output-es-index", "-oi"}, description="ElasticSearch index name (output)")
    String outputIndex;

    @Option(name={"--output-es-type", "-ot"}, description="ElasticSearch index type (output)")
    String outputType;

    @Option(name={"--output-es-url", "-ou"}, description="ElasticSearch full HTTP URL address")
    String elasticSearchServer = "http://localhost:9200";

    @Option(name={"--output-es-auth", "-oa"}, description="User and password for ElasticSearch in format: user:pass")
    String userPass = null;

    @Option(name={"--output-es-bulk-size", "-obs"}, description="ElasticSearch bulk size")
    int bulkSize = 25;

    //
    // Runtime variables
    //

    /** Number of repository records handled without an exception (skipped records included). */
    private int processedPages = 0;

    /** Writer for the gzip output file; {@code null} when no output file was requested. */
    private PrintWriter out;

    /** Elasticsearch indexer; {@code null} when no output index was requested. */
    private SimpleBulkIndexer bulkIndexer;

    /** Id of the document most recently built by one of the {@code createCDR*} methods. */
    private String id;

    /** Document most recently built by one of the {@code createCDR*} methods. */
    private Object doc;

    public static void main(String[] args) throws Exception {
        CliTool.run(args, new AcheToCdrExporter());
    }

    /**
     * Runs the export: opens the configured outputs, iterates over every record
     * of the input repository, and always closes the outputs afterwards, even
     * when opening the repository or iterating fails.
     *
     * @throws IllegalArgumentException if neither an output file nor an
     *         Elasticsearch index was configured
     * @throws Exception if an output or the repository cannot be opened or closed
     */
    @Override
    public void execute() throws Exception {
        System.out.println("Reading ACHE data from: "+inputPath);
        System.out.println("Generating CDR file at: "+outputFile);
        System.out.println(" Compressed repository: "+compressData);
        System.out.println(" Hashed file name: "+hashFilename);

        boolean writeToFile = outputFile != null;
        boolean writeToIndex = outputIndex != null && !outputIndex.isEmpty();
        if (!writeToFile && !writeToIndex) {
            throw new IllegalArgumentException(
                "No output configured: provide --output-file and/or --output-es-index");
        }

        try {
            if (writeToFile) {
                out = openGzipWriter(outputFile);
            }
            // The server URL has a non-null default, so it cannot be used to decide
            // whether Elasticsearch output was requested; the index name is used instead.
            if (writeToIndex) {
                bulkIndexer = new SimpleBulkIndexer(elasticSearchServer, userPass, bulkSize);
            }
            exportAll(openRepositoryIterator());
        } finally {
            closeOutputs();
        }
        System.out.println("done.");
    }

    /** Opens {@code path} as a gzip-compressed, UTF-8 encoded, auto-flushing writer. */
    private static PrintWriter openGzipWriter(String path) throws IOException {
        FileOutputStream fileStream = new FileOutputStream(path);
        try {
            GZIPOutputStream gzipStream = new GZIPOutputStream(fileStream);
            return new PrintWriter(new OutputStreamWriter(gzipStream, StandardCharsets.UTF_8), true);
        } catch (IOException | RuntimeException e) {
            // Do not leak the file handle when the gzip wrapper cannot be created.
            fileStream.close();
            throw e;
        }
    }

    /** Creates the repository selected by {@code --repository-type} and returns its iterator. */
    private Iterator<TargetModelJson> openRepositoryIterator() throws Exception {
        if (repositoryType == RepositoryType.FILESYSTEM_JSON) {
            FileSystemTargetRepository repository = new FileSystemTargetRepository(
                inputPath, DataFormat.JSON, hashFilename, compressData);
            return repository.iterator();
        }
        FilesTargetRepository repository = new FilesTargetRepository(inputPath);
        return repository.iterator();
    }

    /**
     * Processes every record from {@code it}. A failure on a single record is
     * reported on stderr and does not abort the export.
     */
    private void exportAll(Iterator<TargetModelJson> it) {
        while (it.hasNext()) {
            TargetModelJson pageModel = it.next();
            try {
                processRecord(pageModel);
                processedPages++;
                if (processedPages % 100 == 0) {
                    System.out.printf("Processed %d pages\n", processedPages);
                }
            } catch (Exception e) {
                // Best effort: keep exporting the remaining records.
                System.err.println("Failed to process record.\n" + e.toString());
            }
        }
        System.out.printf("Processed %d pages\n", processedPages);
    }

    /** Closes the file writer and the bulk indexer, attempting both even if one fails. */
    private void closeOutputs() throws Exception {
        try {
            if (out != null) {
                // PrintWriter never throws on write; surface swallowed I/O errors once here.
                if (out.checkError()) {
                    System.err.println("Errors occurred while writing to output file: " + outputFile);
                }
                out.close();
            }
        } finally {
            if (bulkIndexer != null) {
                bulkIndexer.close();
            }
        }
    }

    /**
     * Converts one repository record into a CDR document and sends it to the
     * configured outputs. Records without a content type, or whose content type
     * does not start with {@code text/}, are skipped.
     *
     * @param pageModel the record read from the repository
     * @throws IOException declared for failures while emitting the document
     */
    private void processRecord(TargetModelJson pageModel) throws IOException {
        String contentType = pageModel.getContentType();
        if (contentType == null || contentType.isEmpty()) {
            System.err.println("Ignoring URL with no content-type: "+pageModel.getUrl());
            return;
        }
        if (!contentType.startsWith("text/")) {
            return;
        }

        // Clear the previous record's result so it can never be emitted for this record.
        this.id = null;
        this.doc = null;

        if (cdrVersion == CDRVersion.CDRv2) {
            createCDR2DocumentJson(pageModel);
        } else {
            createCDR3DocumentJson(pageModel);
        }

        if (doc == null) {
            return;
        }
        if (out != null) {
            // Written via doc.toString(); presumably the CDR document serializes
            // itself as JSON there -- confirm against CDR2Document/CDR3Document.
            out.println(doc);
        }
        if (bulkIndexer != null) {
            bulkIndexer.addDocument(outputIndex, outputType, doc, id);
        }
    }

    /**
     * Builds a CDR v2 document from {@code pageModel} and stores it, together
     * with its id, in this exporter's {@code doc} and {@code id} fields.
     *
     * @param pageModel the crawled page to convert
     */
    public void createCDR2DocumentJson(TargetModelJson pageModel) {
        HashMap<String, Object> crawlData = new HashMap<>();
        crawlData.put("response_headers", pageModel.getResponseHeaders());

        CDR2Document.Builder builder = new CDR2Document.Builder()
            .setUrl(pageModel.getUrl())
            .setTimestamp(pageModel.getFetchTime())
            .setContentType(pageModel.getContentType())
            .setVersion("2.0")
            .setTeam("NYU")
            .setCrawler("ACHE")
            .setRawContent(pageModel.getContentAsString())
            .setCrawlData(crawlData);

        CDR2Document doc = builder.build();
        this.id = doc.getId();
        this.doc = doc;
    }

    /**
     * Builds a CDR v3 document from {@code pageModel} and stores it, together
     * with its id, in this exporter's {@code doc} and {@code id} fields. The
     * index timestamp is the time at which this method runs.
     *
     * @param pageModel the crawled page to convert
     */
    public void createCDR3DocumentJson(TargetModelJson pageModel) {
        // NOTE(review): crawlData is populated but never attached to the v3 builder;
        // kept as in the original -- confirm whether CDR v3 should carry it.
        HashMap<String, Object> crawlData = new HashMap<>();
        crawlData.put("response_headers", pageModel.getResponseHeaders());

        CDR3Document.Builder builder = new CDR3Document.Builder()
            .setUrl(pageModel.getUrl())
            .setTimestampCrawl(new Date(pageModel.getFetchTime()))
            .setTimestampIndex(new Date())
            .setContentType(pageModel.getContentType())
            .setTeam("NYU")
            .setCrawler("ACHE")
            .setRawContent(pageModel.getContentAsString());

        CDR3Document doc = builder.build();
        this.id = doc.getId();
        this.doc = doc;
    }
}