package focusedCrawler.tools;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Date;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.dataformat.cbor.CBORFactory;
import focusedCrawler.target.repository.elasticsearch.ElasticSearchClientFactory;
import focusedCrawler.target.repository.elasticsearch.ElasticSearchConfig;
/**
 * Command-line tool that scrolls over every document of a hard-coded
 * Elasticsearch index/type and writes the HTML of each labeled page to disk.
 *
 * <p>Documents tagged {@code "Relevant"} are written under
 * {@code <basePath>/positive}, documents tagged {@code "Irrelevant"} under
 * {@code <basePath>/negative}. Any other document is reported on stderr and
 * skipped. Each file is named after the URL-encoded (UTF-8) page URL.
 */
public class DumpDataFromElasticSearch {

    // Not referenced anywhere in this class; kept because it is package-visible.
    static final ObjectMapper cborMapper = new ObjectMapper(new CBORFactory());

    // Used to bind the JSON source of each search hit to a DDTDocument.
    static final ObjectMapper jsonMapper = new ObjectMapper();

    /**
     * JSON binding for one indexed page. Only {@link #url}, {@link #html} and
     * {@link #tag} are read by this tool; the remaining fields exist so the
     * mapper can bind them.
     */
    static class DDTDocument {
        /** Page URL; its URL-encoded form becomes the output file name. */
        public String url;
        /** Page content that is written to the output file. */
        public String html;
        public String text;
        public int length;
        public String query;
        public Date retrieved;
        /** Label; {@code "Relevant"} and {@code "Irrelevant"} select the output folder. */
        public String tag;
    }

    /**
     * Connects to Elasticsearch, scrolls through all documents of the
     * configured index/type and dumps every labeled one to disk.
     *
     * @param args ignored; all settings are hard-coded below
     * @throws Exception if the search fails, a document cannot be parsed, or a
     *                   file cannot be written
     */
    public static void main(String[] args) throws Exception {
        String inputIndex = "patent_trolls";
        String inputType = "page";
        String inputHostname = "localhost";
        String inputClusterName = "elasticsearch";
        int inputPort = 9300;
        final String basePath = "/data/classifiers/patent_trolls/training_data";

        ElasticSearchConfig config = new ElasticSearchConfig(inputHostname, inputPort, inputClusterName);
        Client client = ElasticSearchClientFactory.createClient(config);
        try {
            // NOTE(review): with SearchType.SCAN the first response is expected to
            // carry no hits, only a scroll id - confirm for the ES version in use.
            SearchResponse scrollResp = client.prepareSearch(inputIndex)
                    .setQuery(QueryBuilders.matchAllQuery())
                    .setTypes(inputType)
                    .setSearchType(SearchType.SCAN)
                    .setScroll(new TimeValue(60000))
                    .setSize(100)
                    .execute().actionGet();

            while (true) {
                for (SearchHit hit : scrollResp.getHits().getHits()) {
                    dumpHit(hit, basePath);
                }
                // NOTE(review): this keep-alive (600000 ms) differs from the initial
                // one (60000 ms); both values are preserved as found - confirm intent.
                scrollResp = client.prepareSearchScroll(scrollResp.getScrollId())
                        .setScroll(new TimeValue(600000))
                        .execute().actionGet();
                // Break condition: no hits are returned.
                if (scrollResp.getHits().getHits().length == 0) {
                    break;
                }
            }
        } finally {
            // Always release the client, on both normal and error exit.
            client.close();
        }
    }

    /** Parses one search hit and, if it is labeled and complete, writes its HTML to disk. */
    private static void dumpHit(SearchHit hit, String basePath) throws IOException {
        final DDTDocument doc = parseDocument(hit.getSourceAsString());

        final Path folderPath;
        if ("Relevant".equals(doc.tag)) {
            folderPath = Paths.get(basePath, "positive");
        } else if ("Irrelevant".equals(doc.tag)) {
            folderPath = Paths.get(basePath, "negative");
        } else {
            System.err.println("Found unlabeled document.");
            return;
        }

        // A missing url or html would otherwise abort the whole dump with an NPE,
        // and an empty url would resolve to the folder itself.
        if (doc.url == null || doc.url.isEmpty() || doc.html == null) {
            System.err.println("Skipping labeled document without url or html.");
            return;
        }

        // Idempotent, and unlike File.mkdirs() it fails loudly when creation is impossible.
        Files.createDirectories(folderPath);

        final String filename = URLEncoder.encode(doc.url, "UTF-8");
        final File target = folderPath.resolve(filename).toFile();
        // Explicit UTF-8 (matching the file name encoding) instead of the platform
        // default charset; try-with-resources closes the file even if write() fails.
        try (Writer out = new OutputStreamWriter(new FileOutputStream(target), StandardCharsets.UTF_8)) {
            out.write(doc.html);
        }
    }

    /** Binds a hit's JSON source to a {@link DDTDocument}, keeping the parse error as the cause. */
    private static DDTDocument parseDocument(String json) {
        try {
            return jsonMapper.readValue(json, DDTDocument.class);
        } catch (IOException e) {
            throw new RuntimeException("Failed to unserialize json object=" + json, e);
        }
    }
}