package org.lobid.lodmill; import java.io.Reader; import java.io.StringReader; import java.util.Calendar; import java.util.concurrent.TimeUnit; import org.culturegraph.mf.exceptions.MetafactureException; import org.culturegraph.mf.framework.DefaultObjectPipe; import org.culturegraph.mf.framework.ObjectReceiver; import org.culturegraph.mf.framework.annotations.Description; import org.culturegraph.mf.framework.annotations.In; import org.culturegraph.mf.framework.annotations.Out; import org.culturegraph.mf.stream.source.Opener; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.action.search.SearchType; import org.elasticsearch.client.transport.TransportClient; import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.transport.InetSocketTransportAddress; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.search.SearchHit; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Reads an elasticsearch index and emits the _source field of all documents . * * @author Pascal Christoph (dr0i) */ @In(Void.class) @Out(Reader.class) @Description("Reads an elasticsearch index and emits the _source field of all documents.") public final class ElasticsearchReader extends DefaultObjectPipe<String, ObjectReceiver<Reader>>implements Opener { private static final Logger LOG = LoggerFactory.getLogger(ElasticsearchReader.class); private String hostname; private String clustername; private String indexname; private int batchSize = 10; private int to = Integer.MAX_VALUE; private TransportClient transportClient; private SearchResponse response; private final long lastTime = Calendar.getInstance().getTimeInMillis(); private String shards; /** * Sets the elasticsearch hostname * * @param hostname may be an IP or a domain name */ public void setHostname(final String hostname) { this.hostname = hostname; } /** * Sets the elasticsearch cluster name. * * @param clustername the name of the cluster */ public void setClustername(final String clustername) { this.clustername = clustername; } /** * Sets the elasticsearch index name. * * @param indexname the name of the index */ public void setIndexname(final String indexname) { this.indexname = indexname; } /** * Sets the size of the result set . Will be multiplicated with number of * shards. * * @param batchSize the size of the result set fetched at once */ public void setBatchSize(final int batchSize) { this.batchSize = batchSize; } /** * Sets which shards should be searched May be comma separated list . * * @param shards the beginning of the range of the result set */ public void setShards(final String shards) { this.shards = shards; } /** * Sets end of range of the result set . * * @param to the end of the range of the result set */ public void setTo(final int to) { this.to = to; } @Override public void process(String ignore) { if (hostname == null || clustername == null || indexname == null) { LOG.error("Pass 3 params: <hostname> <clustername> <indexname>"); return; } initScrollSearch(); logStatus(); harvestAndProcess(); } private void initScrollSearch() { transportClient = new TransportClient(ImmutableSettings.settingsBuilder() .put("cluster.name", clustername).put("client.transport.sniff", false) .put("client.transport.ping_timeout", 20, TimeUnit.SECONDS).build()); transportClient .addTransportAddress(new InetSocketTransportAddress(hostname, 9300)); response = transportClient.prepareSearch(indexname) .setSearchType(SearchType.SCAN).setPreference("_shards:" + shards) .setScroll(TimeValue.timeValueHours(20)) .setQuery(QueryBuilders.matchAllQuery()).setExplain(false) .setSize(batchSize).execute().actionGet(); } private void logStatus() { LOG.info("Amount of shards: " + transportClient.prepareSearch(indexname) .execute().actionGet().getTotalShards()); LOG.info("Starting querying in partitions of " + (batchSize * response.getTotalShards())); } private void harvestAndProcess() throws ElasticsearchException { int cnt = 0; while (true) { try { response = transportClient.prepareSearchScroll(response.getScrollId()) .setScroll("1h").execute().actionGet(); java.util.Iterator<SearchHit> hitIt = response.getHits().iterator(); while (hitIt.hasNext()) { getReceiver().process(new StringReader( hitIt.next().getSource().get("mabXml").toString())); cnt++; } } catch (MetafactureException e) { LOG.error("Problems with elasticsearch, index '" + indexname + "' at doc number '" + cnt + "'", e); getReceiver().closeStream(); break; } LOG.info("Doc " + cnt + " ,sec:" + ((Calendar.getInstance().getTimeInMillis() - lastTime) / 1000)); // Break condition: No hits are returned or range is exceeded if (response.getHits().getHits().length == 0 || cnt >= to) { getReceiver().closeStream(); break; } } } }