package edu.isi.dig.elasticsearch; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.security.KeyManagementException; import java.security.KeyStoreException; import java.security.NoSuchAlgorithmException; import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import javax.net.ssl.HostnameVerifier; import javax.net.ssl.SSLContext; import io.searchbox.client.JestClient; import io.searchbox.client.JestClientFactory; import io.searchbox.client.JestResult; import io.searchbox.client.config.HttpClientConfig; import io.searchbox.core.Search; import io.searchbox.core.SearchScroll; import io.searchbox.params.Parameters; import io.searchbox.params.SearchType; import net.sf.json.JSONArray; import net.sf.json.JSONObject; import net.sf.json.JSONSerializer; import org.apache.commons.cli.BasicParser; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.http.conn.ssl.NoopHostnameVerifier; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.ssl.SSLContextBuilder; import org.apache.http.ssl.TrustStrategy; import org.apache.tika.language.LanguageIdentifier; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.sax.BodyContentHandler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; public class ScanAndScroll { private static Logger LOG = LoggerFactory.getLogger(ScanAndScroll.class); private final JestClient client; private final static String SCROLL = "5m"; private int outputType=0; private String outputFile; private int runTika; private String htmlField; public static void main(String args[]) throws IOException{ Options options = createCommandLineOptions(); CommandLine cl = parse(args, options, ScanAndScroll.class.getSimpleName()); if(cl == null) { return; } String esUrl; String esHostName ; String esPort; String esProtocol; String esUserName; String esPassword; int pageSize; String esQueryFile=null; String esQuery; int docLimit; int outputtype; int runtika; String htmlField; if(cl.hasOption("htmlfield")){ htmlField = (String) cl.getOptionValue("htmlfield"); }else{ htmlField = "raw_content"; //this is where it is from CDR by default } if(cl.hasOption("runtika")){ runtika = Integer.parseInt(cl.getOptionValue("runtika")); }else{ runtika = 1; // run tika by default } if(cl.hasOption("outputtype")){ outputtype = Integer.parseInt(cl.getOptionValue("outputtype")); }else{ outputtype = 0; //create json array by default } if(cl.hasOption("esurl")){ esUrl = (String) cl.getOptionValue("esurl"); }else { if (cl.hasOption("eshostname")){ esHostName = (String) cl.getOptionValue("eshostname"); }else{ esHostName = "localhost"; } if(cl.hasOption("esport")){ esPort = (String) cl.getOptionValue("esport"); }else{ esPort = "9200"; } if(cl.hasOption("esprotocol")){ esProtocol = (String) cl.getOptionValue("esprotocol"); }else{ esProtocol = "http"; } esUrl = esProtocol + "://" + esHostName + ":" + esPort; } if(cl.hasOption("esusername")){ esUserName = (String) cl.getOptionValue("esusername"); }else{ esUserName = ""; } if(cl.hasOption("espassword")){ esPassword = (String) cl.getOptionValue("espassword"); }else{ esPassword = ""; } if(cl.hasOption("pagesize")){ pageSize = Integer.parseInt(cl.getOptionValue("pagesize")); }else{ pageSize = 25; //lucky number? } if(cl.hasOption("esquery")){ esQueryFile = (String) cl.getOptionValue("esquery"); } if(esQueryFile == null){ esQuery = "{\"query\" : {\"match_all\" : {}}}"; //get everything }else{ BufferedReader brQuery = new BufferedReader(new FileReader(new File(esQueryFile))); String line = null; StringBuilder sbQuery = new StringBuilder(); while((line = brQuery.readLine()) != null){ sbQuery.append(line.trim()); } esQuery = sbQuery.toString(); brQuery.close(); } if(cl.hasOption("doclimit")){ docLimit = Integer.parseInt(cl.getOptionValue("doclimit")); }else{ docLimit = 100; //limit the scroller to get only 100 documents out of the potential trillions } String esIndex = (String) cl.getOptionValue("esindex"); String esDocType = (String) cl.getOptionValue("esdoctype"); String outPutFilePath = (String) cl.getOptionValue("outputfile"); ScanAndScroll sas; try { sas = new ScanAndScroll(esUrl, esUserName, esPassword,outPutFilePath,outputtype,outPutFilePath,runtika,htmlField); sas.executeQuery(esQuery, pageSize, esIndex, esDocType,docLimit); } catch (FileNotFoundException | UnsupportedEncodingException e) { LOG.error("Error executing query:" + e); } } private static Options createCommandLineOptions() { Options options = new Options(); options.addOption(new Option("esprotocol", "esprotocol",true, "http or https")); options.addOption(new Option("esindex", "esindex",true, "elasticsearch index name")); options.addOption(new Option("esdoctype", "esdoctype",true, "elasticsearch doc type")); options.addOption(new Option("esusername", "esusername",true, "elasticsearch username")); options.addOption(new Option("espassword", "espassword",true, "elasticsearch password")); options.addOption(new Option("esport", "esport",true, "elasticsearch port")); options.addOption(new Option("eshostname", "eshostname",true, "elasticsearch hostname")); options.addOption(new Option("esquery","esquery",true,"elasticsearch query file")); options.addOption(new Option("pagesize", "pagesize", true,"number of documents per shard to get at one time")); options.addOption(new Option("outputfile","outputfile",true,"output file path")); options.addOption(new Option("doclimit","doclimit",true, "number of documents retrieved, -1 to get trillion")); options.addOption(new Option("outputtype","outputtype",true,"0 for json array, 1 for json lines")); options.addOption(new Option("runtika","runtika",true,"0 for no, 1 for yes")); options.addOption(new Option("esurl","esurl",true,"url for the es server, should be used instead of esprotocol, esport and eshostname")); options.addOption(new Option("htmlfield","htmlfield",true,"name of the html field in json which contains raw html")); return options; } public static CommandLine parse(String args[], Options options, String commandName) { CommandLineParser parser = new BasicParser(); CommandLine cl = null; try { /** * PARSE THE COMMAND LINE ARGUMENTS * */ cl = parser.parse(options, args); if (cl == null || cl.getOptions().length == 0 || cl.hasOption("help")) { HelpFormatter hf = new HelpFormatter(); hf.printHelp(commandName, options); return null; } } catch (Exception e) { LOG.error("Error occured while parsing arguments!", e); return cl; } return cl; } public ScanAndScroll(String url,String username, String password,String outputFilePath,int outputType,String outputFile,int runTika,String htmlField) throws FileNotFoundException, UnsupportedEncodingException{ SSLContext sslContext; try { sslContext = new SSLContextBuilder().loadTrustMaterial(null, new TrustStrategy() { public boolean isTrusted(X509Certificate[] arg0, String arg1) throws CertificateException { return true; } }).build(); } catch (NoSuchAlgorithmException | KeyManagementException | KeyStoreException e) { throw new IllegalStateException(e); } // Skip hostname checks HostnameVerifier hostnameVerifier = NoopHostnameVerifier.INSTANCE; SSLConnectionSocketFactory sslSocketFactory = new SSLConnectionSocketFactory(sslContext, hostnameVerifier); HttpClientConfig.Builder httpClientBuilder = new HttpClientConfig.Builder(url.toString()) .sslSocketFactory(sslSocketFactory) .readTimeout(30000) // Milliseconds .multiThreaded(false); System.out.println(url); if(username.trim() != "" && password.trim() != ""){ httpClientBuilder.defaultCredentials(username, password); } JestClientFactory jcf = new JestClientFactory(); jcf.setHttpClientConfig(httpClientBuilder.build()); this.client = jcf.getObject(); this.outputType = outputType; this.outputFile = outputFile; this.runTika = runTika; this.htmlField = htmlField; } private JSONObject extractTika(String contents){ JSONObject jObj = (JSONObject)JSONSerializer.toJSON(contents); if(jObj.containsKey("_source")) { JSONObject jObjSource = jObj.getJSONObject("_source"); if(jObjSource.containsKey(htmlField)) { String rawHtml = jObjSource.getString(htmlField); ByteArrayInputStream bIs = new ByteArrayInputStream(rawHtml.getBytes()); Metadata metadata = new Metadata(); AutoDetectParser adp = new AutoDetectParser(); ContentHandler handler = new BodyContentHandler(10*1024*1024); try { adp.parse(bIs, handler, metadata); String[] metadataNames = metadata.names(); JSONObject jObjMetadata = new JSONObject(); for(String metadataName:metadataNames) { String[] values = metadata.getValues(metadataName); JSONArray jArray = new JSONArray(); for(String mValue: values) { jArray.add(mValue); } jObjMetadata.accumulate(metadataName, jArray); } //remove empty lines from the text String rawTextAdjusted = handler.toString().replaceAll("(?m)^[ \t]*\r?\n", ""); //detect language LanguageIdentifier li = new LanguageIdentifier(rawTextAdjusted); jObjSource.accumulate("tikametadata", jObjMetadata); jObjSource.accumulate("raw_text", rawTextAdjusted); jObjSource.accumulate("rawtextdetectedlanguage", li.getLanguage()); } catch (Exception e) { LOG.error("Error:",e);; } } } return jObj; } public void executeQuery(String query, int pageSize,String index, String docType,int docLimit){ Search search = new Search.Builder(query) .addIndex(index) .addType(docType) .setParameter(Parameters.SEARCH_TYPE,SearchType.SCAN) .setParameter(Parameters.SIZE, pageSize) .setParameter(Parameters.SCROLL, SCROLL) .build(); System.out.println(query + "$$$$"); boolean runTikaExtractor = true; if(this.runTika == 0){ runTikaExtractor = false; } else if(this.runTika == 1) { runTikaExtractor= true; } try { JestResult searchResult = client.execute(search); //System.out.println(searchResult.getJsonString()); String scrollId = searchResult.getJsonObject().get("_scroll_id").getAsString(); int currentResultSize = 0; int numDocs = 0; do { JSONArray jArrayResult = new JSONArray(); SearchScroll scrollRequest = new SearchScroll.Builder(scrollId, SCROLL) .setParameter(Parameters.SIZE, pageSize) .build(); JestResult scrollResult = client.execute(scrollRequest); scrollId = scrollResult.getJsonObject().get("_scroll_id").getAsString(); JSONObject jObj = (JSONObject) JSONSerializer.toJSON(scrollResult.getJsonString()); JSONArray jArrayHits = jObj.getJSONObject("hits").getJSONArray("hits"); for(int i=0;i<jArrayHits.size();i++){ if(runTikaExtractor){ jArrayResult.add(extractTika(jArrayHits.getString(i)).toString()); }else{ jArrayResult.add(jArrayHits.getString(i).toString()); } } writeToFile(jArrayResult); // Note: Current result size will be Page Size * number of shards currentResultSize = jArrayHits.size(); numDocs+=currentResultSize; System.out.println("num docs:" + String.valueOf(numDocs)); if(docLimit != -1 && numDocs >= docLimit){ break; } } while (currentResultSize != 0); } catch (IOException e) { LOG.error("Error retrieving from Elasticsearch", e); } } private void writeToFile(JSONArray jArray){ try{ FileWriter fw = new FileWriter(outputFile, true); BufferedWriter writer = new BufferedWriter(fw); if(outputType == 0){ writer.write(jArray.toString()); }else if(outputType == 1){ for(int i=0;i<jArray.size();i++){ writer.write(jArray.getJSONObject(i).getJSONObject("_source").getString("url").trim() + "\t" + jArray.getString(i)); writer.newLine(); } } writer.close(); }catch(Exception e){ System.out.println(e.getMessage()); } } }