ceph deep scrub 源码 分析

ceph为了保持数据的一致性,默认,每天会进行一次scrub,每周会进行一次deep scrub。这里将就何时进行deep scrub进行分析

首先看一下关于scrub的一些参数

$ ceph daemon osd.0 config show|grep scrub

"osd_scrub_invalid_stats": "true",   ##标记scrub是否有效
"osd_max_scrubs": "1",               ##标记一个ceph OSD daemon内能够同时进行scrubbing的操作数
"osd_scrub_begin_hour": "22",        ##标记scrub开始的时间,晚上22:00
"osd_scrub_end_hour": "7",           ##标记scrub结束的时间,早上7:00
"osd_scrub_load_threshold": "0.5",   ##标记最大负载,超过这个负载scrub就不执行
"osd_scrub_min_interval": "86400",   ##标记最小执行scrub间隔,86400秒=1天
"osd_scrub_max_interval": "604800",  ##标记最大执行scrub间隔,604800秒=7天
"osd_scrub_interval_randomize_ratio": "0.5", ##标记scrub间隔的随机化比例,0.5表示在最小间隔的基础上随机延长最多50%,用于打散各pg的scrub时间
"osd_scrub_chunk_min": "5",          ##标记每次scrub的最小数据块
"osd_scrub_chunk_max": "25",         ##标记每次scrub的最大数据块
"osd_scrub_sleep": "0",              ##标记相邻两次scrub数据块(chunk)操作之间的休眠时间,增加该值会使scrub变慢,但对客户端io的影响会减小
"osd_scrub_auto_repair": "false",    ##标记深度清洗发现不一致后,是否自动触发修复操作
"osd_scrub_auto_repair_num_errors": "5",   ##标记当清洗后出现的errors低于该阈值,会自动触发修复操作
"osd_deep_scrub_interval": "604800",    ##标记深度清洗间隔,604800秒=7天
"osd_deep_scrub_randomize_ratio": "0.15", ##标记随机深度清洗概率, 0.15=15%
"osd_deep_scrub_stride": "524288",               ##标记深度清洗时读取数据大小,512K
"osd_deep_scrub_update_digest_min_age": "7200",  ##标记要进行深度清洗的对象上次清洗时间戳最小要超过7200秒
"osd_debug_scrub_chance_rewrite_digest": "0",    ##
"osd_scrub_priority": "5",                 ##标记进行scrub的优先级
"osd_scrub_cost": "52428800",              ##标记进行scrub的io 50M

下面查看具体代码src/osd/PG.cc

行957:

     PG::Scrubber::Scrubber()
  : reserved(false), reserve_failed(false),
   epoch_start(0),
   active(false), queue_snap_trim(false),
   waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
   must_scrub(false), must_deep_scrub(false), must_repair(false),
   auto_repair(false),
   num_digest_updates_pending(0),
   state(INACTIVE),
   deep(false),
   seed(0)
{
主要关注must_scrub must_deep_scrub must_repair auto_repair deep 设置

行2072:

bool PG::queue_scrub()
{
  assert(_lock.is_locked());
  if (is_scrubbing()) {
    return false;
  }
  scrubber.must_scrub = false;  must_scrub 设置为false
  state_set(PG_STATE_SCRUBBING);
  if (scrubber.must_deep_scrub) {
    state_set(PG_STATE_DEEP_SCRUB);
    scrubber.must_deep_scrub = false;  ##must_deep_scrub设置为false
  }
  if (scrubber.must_repair || scrubber.auto_repair) {
    state_set(PG_STATE_REPAIR);
    scrubber.must_repair = false;  ##must_repair 设置为false
  }
  requeue_scrub();
  return true;
}

行3319:

bool PG::sched_scrub()
{
  assert(_lock.is_locked());
  if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
    return false;
  }

  double deep_scrub_interval = 0;
  pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
  if (deep_scrub_interval <= 0) {
    deep_scrub_interval = cct->_conf->osd_deep_scrub_interval; ##从配置获取深度清洗时间间隔
  }
  bool time_for_deep = ceph_clock_now(cct) >=
    info.history.last_deep_scrub_stamp + deep_scrub_interval;  ##如果当前时间大于(上次深度清洗时间+深度清洗间隔),则设置time_for_deep为 true

  bool deep_coin_flip = false;
  // Only add random deep scrubs when NOT user initiated scrub
  if (!scrubber.must_scrub)
          ##随机执行deep scrub,如果随机值小于osd_deep_scrub_randomize_ratio * 100,deep_coin_flip 设置为true
      deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
  dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;

  time_for_deep = (time_for_deep || deep_coin_flip);  ##根据time_for_deep和deep_coin_flip的值设置time_for_deep

  //NODEEP_SCRUB so ignore time initiated deep-scrub
  if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
      pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB))
    time_for_deep = false;

  if (!scrubber.must_scrub) {
    assert(!scrubber.must_deep_scrub);

    //NOSCRUB so skip regular scrubs
    if ((osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
     pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
      if (scrubber.reserved) {
    // cancel scrub if it is still in scheduling,
    // so pgs from other pools where scrub are still legal
    // have a chance to go ahead with scrubbing.
    clear_scrub_reserved();
    scrub_unreserve_replicas();
      }
      return false;
    }
  }

  if (cct->_conf->osd_scrub_auto_repair
      && get_pgbackend()->auto_repair_supported()
      && time_for_deep
      // respect the command from user, and not do auto-repair
      && !scrubber.must_repair
      && !scrubber.must_scrub
      && !scrubber.must_deep_scrub) {
    dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
    scrubber.auto_repair = true;
  } else {
    // this happens when user issue the scrub/repair command during
    // the scheduling of the scrub/repair (e.g. request reservation)
    scrubber.auto_repair = false;
  }

  bool ret = true;
  if (!scrubber.reserved) {
    assert(scrubber.reserved_peers.empty());
    if (osd->inc_scrubs_pending()) {
      dout(20) << "sched_scrub: reserved locally, reserving replicas" << dendl;
      scrubber.reserved = true;
      scrubber.reserved_peers.insert(pg_whoami);
      scrub_reserve_replicas();
    } else {
      dout(20) << "sched_scrub: failed to reserve locally" << dendl;
      ret = false;
    }
  }
  if (scrubber.reserved) {
    if (scrubber.reserve_failed) {
      dout(20) << "sched_scrub: failed, a peer declined" << dendl;
      clear_scrub_reserved();
      scrub_unreserve_replicas();
      ret = false;
    } else if (scrubber.reserved_peers.size() == acting.size()) {
      dout(20) << "sched_scrub: success, reserved self and replicas" << dendl;
      if (time_for_deep) {  ##执行deep scrub
    dout(10) << "sched_scrub: scrub will be deep" << dendl;
    state_set(PG_STATE_DEEP_SCRUB);
      }
      queue_scrub();
    } else {
      // none declined, since scrubber.reserved is set
      dout(20) << "sched_scrub: reserved " << scrubber.reserved_peers << ", waiting for replicas" << dendl;
    }
  }

  return ret;
}

总结一下,ceph产生deep-scrub的原因有如下:

  1. osd_deep_scrub_interval 到了,会执行deep-scrub
  2. osd_deep_scrub_randomize_ratio 概率随机执行
  3. 人为命令执行
  4. osd_scrub_auto_repair 设置
  5. recovery 任务完成后,必定执行deep-scrub

注意:

集群初始化完成,deep-scrub的时间默认是pg创建的时间(src/mon/PGMonitor.cc行1000左右),在非人为干预的情况下,会通过osd_deep_scrub_randomize_ratio 随机对pg进行deep-scrub,直到osd_deep_scrub_interval时间点,会对所有到达该时间的pg执行deep-scrub