/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.replicator.nrt;

import java.io.BufferedOutputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.net.ServerSocket;
import java.net.Socket;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.RateLimitedIndexOutput;
import org.apache.lucene.store.RateLimiter;
import org.apache.lucene.util.LuceneTestCase;

class SimpleReplicaNode extends ReplicaNode {
  final int tcpPort;
  final Jobs jobs;

  // Rate limits incoming bytes/sec when fetching files:
  final RateLimiter fetchRateLimiter;
  final AtomicLong bytesSinceLastRateLimiterCheck = new AtomicLong();
  final Random random;

  /** Changes over time, as the primary node crashes and moves around */
  int curPrimaryTCPPort;

  public SimpleReplicaNode(Random random, int id, int tcpPort, Path indexPath, long curPrimaryGen, int primaryTCPPort,
                           SearcherFactory searcherFactory, boolean doCheckIndexOnClose) throws IOException {
    super(id, getDirectory(random, id, indexPath, doCheckIndexOnClose), searcherFactory, System.out);
    this.tcpPort = tcpPort;
    this.random = new Random(random.nextLong());

    // Random IO throttling on file copies: 5 - 20 MB/sec:
    double mbPerSec = 5 * (1.0 + 3 * random.nextDouble());
    message(String.format(Locale.ROOT, "top: will rate limit file fetch to %.2f MB/sec", mbPerSec));
    fetchRateLimiter = new RateLimiter.SimpleRateLimiter(mbPerSec);
    this.curPrimaryTCPPort = primaryTCPPort;

    start(curPrimaryGen);

    // Handles fetching files from primary:
    jobs = new Jobs(this);
    jobs.setName("R" + id + ".copyJobs");
    jobs.setDaemon(true);
    jobs.start();
  }

  @Override
  protected void launch(CopyJob job) {
    jobs.launch(job);
  }

  @Override
  public void close() throws IOException {
    // Can't be sync'd when calling jobs since it can lead to deadlock:
    jobs.close();
    message("top: jobs closed");
    synchronized (mergeCopyJobs) {
      for (CopyJob job : mergeCopyJobs) {
        message("top: cancel merge copy job " + job);
        job.cancel("jobs closing", null);
      }
    }
    super.close();
  }
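
  // A rough sketch of the fetch-files handshake implemented by newCopyJob below, as
  // inferred from the reads/writes in this method and the CMD_* constants it uses:
  //
  //   replica -> primary: CMD_FETCH_FILES, then vInt replica id, then one byte:
  //                       1 = "send me your current CopyState" (when files == null),
  //                       0 = "I already know which files to copy"
  //   primary -> replica: the serialized CopyState (only when byte 1 was sent)
  //
  // The still-open connection is then handed to the SimpleCopyJob, which copies the
  // actual file bytes over it.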
  @Override
  protected CopyJob newCopyJob(String reason, Map<String,FileMetaData> files, Map<String,FileMetaData> prevFiles,
                               boolean highPriority, CopyJob.OnceDone onceDone) throws IOException {
    Connection c;
    CopyState copyState;

    // Exceptions in here mean something went wrong talking over the socket; that's fine (e.g. the primary node crashed):
    try {
      c = new Connection(curPrimaryTCPPort);
      c.out.writeByte(SimplePrimaryNode.CMD_FETCH_FILES);
      c.out.writeVInt(id);
      if (files == null) {
        // No incoming CopyState: ask primary for the latest one now
        c.out.writeByte((byte) 1);
        c.flush();
        copyState = SimpleServer.readCopyState(c.in);
        files = copyState.files;
      } else {
        c.out.writeByte((byte) 0);
        copyState = null;
      }
    } catch (Throwable t) {
      throw new NodeCommunicationException("exc while reading files to copy", t);
    }

    return new SimpleCopyJob(reason, c, copyState, this, files, highPriority, onceDone);
  }

  static Directory getDirectory(Random random, int id, Path path, boolean doCheckIndexOnClose) throws IOException {
    MockDirectoryWrapper dir = LuceneTestCase.newMockFSDirectory(path);
    dir.setAssertNoUnrefencedFilesOnClose(true);
    dir.setCheckIndexOnClose(doCheckIndexOnClose);

    // Corrupt any index files not referenced by the current commit point; this is important (increases test evilness)
    // because we may have hard-crashed the previous JVM writing to this directory, so MDW's
    // corrupt-unknown-files-on-close never ran:
    Node.nodeMessage(System.out, id, "top: corrupt unknown files");
    dir.corruptUnknownFiles();

    return dir;
  }

  static final byte CMD_NEW_NRT_POINT = 0;

  // Sent by primary to replica to pre-copy merge files:
  static final byte CMD_PRE_COPY_MERGE = 17;
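
  // Summary of the commands dispatched by handleOneConnection below; CMD_* values not
  // defined in this class come from SimplePrimaryNode:
  //
  //   CMD_NEW_NRT_POINT         - the primary published a new NRT point; sync to it
  //   CMD_GET_SEARCHING_VERSION - report our current searching version, waiting first for
  //                               any in-flight copy (used to elect a new primary)
  //   CMD_SEARCH / CMD_SEARCH_ALL / CMD_MARKER_SEARCH
  //                             - run a test query; return searcher version + hit count
  //   CMD_COMMIT                - commit this replica's current state
  //   CMD_CLOSE                 - close the server socket
  //   CMD_PRE_COPY_MERGE        - pre-copy merged segment files, with keep-alive bytes
  //                               written back until the copy finishes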
  /** Handles an incoming request to the naive TCP server wrapping this node */
  void handleOneConnection(ServerSocket ss, AtomicBoolean stop, InputStream is, Socket socket,
                           DataInput in, DataOutput out, BufferedOutputStream bos) throws IOException, InterruptedException {
    //message("one connection: " + socket);
    outer:
    while (true) {
      byte cmd;
      while (true) {
        if (is.available() > 0) {
          break;
        }
        if (stop.get()) {
          return;
        }
        Thread.sleep(10);
      }

      try {
        cmd = in.readByte();
      } catch (EOFException eofe) {
        break;
      }

      switch (cmd) {

      case CMD_NEW_NRT_POINT:
        {
          long version = in.readVLong();
          long newPrimaryGen = in.readVLong();
          Thread.currentThread().setName("recv-" + version);
          curPrimaryTCPPort = in.readInt();
          message("newNRTPoint primaryTCPPort=" + curPrimaryTCPPort + " version=" + version + " newPrimaryGen=" + newPrimaryGen);
          newNRTPoint(newPrimaryGen, version);
        }
        break;

      case SimplePrimaryNode.CMD_GET_SEARCHING_VERSION:
        // This is called when the primary has crashed and we need to elect a new primary from all the still-running replicas:

        // Tricky: if a sync is just finishing up, i.e. it managed to finish copying all files just before we crashed the
        // primary, and is now in the process of opening a new reader, we must wait for it, to be sure we really pick the
        // most current replica:
        if (isCopying()) {
          message("top: getSearchingVersion: now wait for finish sync");
          // TODO: use immediate concurrency instead of polling:
          while (isCopying() && stop.get() == false) {
            Thread.sleep(10);
            message("top: curNRTCopy=" + curNRTCopy);
          }
          message("top: getSearchingVersion: done wait for finish sync");
        }
        if (stop.get() == false) {
          out.writeVLong(getCurrentSearchingVersion());
        } else {
          message("top: getSearchingVersion: stop waiting for finish sync: stop is set");
        }
        break;

      case SimplePrimaryNode.CMD_SEARCH:
        {
          Thread.currentThread().setName("search");
          IndexSearcher searcher = mgr.acquire();
          try {
            long version = ((DirectoryReader) searcher.getIndexReader()).getVersion();
            int hitCount = searcher.search(new TermQuery(new Term("body", "the")), 1).totalHits;
            //node.message("version=" + version + " searcher=" + searcher);
            out.writeVLong(version);
            out.writeVInt(hitCount);
            bos.flush();
          } finally {
            mgr.release(searcher);
          }
        }
        continue outer;

      case SimplePrimaryNode.CMD_SEARCH_ALL:
        {
          Thread.currentThread().setName("search all");
          IndexSearcher searcher = mgr.acquire();
          try {
            long version = ((DirectoryReader) searcher.getIndexReader()).getVersion();
            int hitCount = searcher.search(new MatchAllDocsQuery(), 1).totalHits;
            //node.message("version=" + version + " searcher=" + searcher);
            out.writeVLong(version);
            out.writeVInt(hitCount);
            bos.flush();
          } finally {
            mgr.release(searcher);
          }
        }
        continue outer;

      case SimplePrimaryNode.CMD_MARKER_SEARCH:
        {
          Thread.currentThread().setName("msearch");
          int expectedAtLeastCount = in.readVInt();
          IndexSearcher searcher = mgr.acquire();
          try {
            long version = ((DirectoryReader) searcher.getIndexReader()).getVersion();
            int hitCount = searcher.count(new TermQuery(new Term("marker", "marker")));
            if (hitCount < expectedAtLeastCount) {
              message("marker search: expectedAtLeastCount=" + expectedAtLeastCount + " but hitCount=" + hitCount);
              TopDocs hits = searcher.search(new TermQuery(new Term("marker", "marker")), expectedAtLeastCount);
              List<Integer> seen = new ArrayList<>();
              for (ScoreDoc hit : hits.scoreDocs) {
                Document doc = searcher.doc(hit.doc);
                seen.add(Integer.parseInt(doc.get("docid").substring(1)));
              }
              Collections.sort(seen);
              message("saw markers:");
              for (int marker : seen) {
                message("saw m" + marker);
              }
            }

            out.writeVLong(version);
            out.writeVInt(hitCount);
            bos.flush();
          } finally {
            mgr.release(searcher);
          }
        }
        continue outer;

      case SimplePrimaryNode.CMD_COMMIT:
        Thread.currentThread().setName("commit");
        commit();
        out.writeByte((byte) 1);
        break;

      case SimplePrimaryNode.CMD_CLOSE:
        Thread.currentThread().setName("close");
        ss.close();
        out.writeByte((byte) 1);
        break outer;
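
      // Pre-copying merged files lets the primary push a freshly merged segment's files to
      // replicas before the merge is committed, so a later NRT point that references the
      // merge need not stall while the large merged files are copied. The keep-alive bytes
      // written in the loop below are how the primary detects a replica that died mid-copy.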
      case CMD_PRE_COPY_MERGE:
        Thread.currentThread().setName("merge copy");

        long newPrimaryGen = in.readVLong();
        curPrimaryTCPPort = in.readVInt();
        Map<String,FileMetaData> files = SimpleServer.readFilesMetaData(in);
        message("done reading files to copy files=" + files.keySet());
        AtomicBoolean finished = new AtomicBoolean();
        CopyJob job = launchPreCopyMerge(finished, newPrimaryGen, files);
        message("done launching copy job files=" + files.keySet());

        // Silly keep-alive mechanism: without it, if e.g. we (the replica node) crash, the primary
        // won't notice for a very long time:
        boolean success = false;
        try {
          int count = 0;
          while (true) {
            if (finished.get() || stop.get()) {
              break;
            }
            Thread.sleep(10);
            count++;
            if (count == 100) {
              // Once per second or so, we send a keep-alive:
              message("send merge pre copy keep alive... files=" + files.keySet());

              // To be evil, we sometimes fail to keep-alive, e.g. simulating a long GC pausing us:
              if (random.nextBoolean()) {
                out.writeByte((byte) 0);
                count = 0;
              }
            }
          }

          out.writeByte((byte) 1);
          bos.flush();
          success = true;
        } finally {
          message("done merge copy files=" + files.keySet() + " success=" + success);
        }
        break;

      default:
        throw new IllegalArgumentException("unrecognized cmd=" + cmd);
      }

      bos.flush();

      break;
    }
  }

  @Override
  protected void sendNewReplica() throws IOException {
    message("send new_replica to primary tcpPort=" + curPrimaryTCPPort);
    try (Connection c = new Connection(curPrimaryTCPPort)) {
      c.out.writeByte(SimplePrimaryNode.CMD_NEW_REPLICA);
      c.out.writeVInt(tcpPort);
      c.flush();
      c.s.shutdownOutput();
    } catch (Throwable t) {
      message("ignoring exc " + t + " sending new_replica to primary tcpPort=" + curPrimaryTCPPort);
    }
  }

  @Override
  public IndexOutput createTempOutput(String prefix, String suffix, IOContext ioContext) throws IOException {
    return new RateLimitedIndexOutput(fetchRateLimiter, super.createTempOutput(prefix, suffix, ioContext));
  }
}
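
/*
 * Usage sketch (hypothetical wiring, for orientation only): a SimpleReplicaNode is
 * constructed pointing at the current primary's TCP port, and the naive TCP server
 * wrapping it (see SimpleServer, which this class already depends on) invokes
 * handleOneConnection for each accepted socket:
 *
 *   SimpleReplicaNode replica = new SimpleReplicaNode(random, id, tcpPort, indexPath,
 *                                                     curPrimaryGen, primaryTCPPort,
 *                                                     searcherFactory, true);
 *   // ... server accepts connections and dispatches to replica.handleOneConnection ...
 *   replica.close();
 */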