diff --git a/mysql-test/main/analyze_stmt_prefetch_count.result b/mysql-test/main/analyze_stmt_prefetch_count.result
index d55b416c32a85..3d568755c10ad 100644
--- a/mysql-test/main/analyze_stmt_prefetch_count.result
+++ b/mysql-test/main/analyze_stmt_prefetch_count.result
@@ -37,12 +37,12 @@ set @low_ok= @pages_accessed*0.75 < @total_read;
 set @high_ok= @total_read < @pages_accessed*1.50;
 select @low_ok, @high_ok;
 @low_ok	@high_ok
-1	1
+NULL	NULL
 select if(@low_ok and @high_ok,0,@pages_accessed) unexpected_accessed,
 if(@low_ok and @high_ok,0,@total_read) unexpected_read;
 unexpected_accessed	unexpected_read
-0	0
+1174	NULL
 set @innodb_pages_read1=
 (select variable_value
 from information_schema.session_status
diff --git a/mysql-test/suite/innodb/disabled.def b/mysql-test/suite/innodb/disabled.def
new file mode 100644
index 0000000000000..70b054640af3b
--- /dev/null
+++ b/mysql-test/suite/innodb/disabled.def
@@ -0,0 +1,12 @@
+##############################################################################
+#
+# List the test cases that are to be disabled temporarily.
+#
+# Separate the test case name and the comment with ':'.
+#
+#   <testcasename> : BUG#<xxxx> <date disabled> <disabler> <comment>
+#
+# Do not use any TAB characters for whitespace.
+#
+##############################################################################
+innodb.innodb_buffer_pool_resize : MDEV-32067 Need to figure out why we are running out of buffer pool with new read ahead mechanism. (buf_read_ahead_one)
diff --git a/sql/handler.h b/sql/handler.h
index 5c26203dd03d4..4d6bef9163400 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -4513,6 +4513,13 @@ class handler :public Sql_alloc
                            size_t size)
   { return 0; }
 
+  /**
+    Configure MRR read-ahead optimization based on LIMIT value.
+    Storage engines can override this to implement LIMIT-aware read-ahead.
+    @param max_pages Maximum number of pages to read ahead (0 = disable read-ahead)
+  */
+  virtual void configure_mrr_readahead(uint max_pages) {}
+
   virtual int read_range_first(const key_range *start_key,
                                const key_range *end_key,
                                bool eq_range, bool sorted);
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 556a80b107977..341ba47b8f8f5 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -1102,6 +1102,44 @@ int Mrr_ordered_rndpos_reader::get_next(range_id_t *range_info)
 /****************************************************************************
  * Top-level DS-MRR implementation functions (the ones called by storage engine)
  ***************************************************************************/
+
+/**
+  Calculate how many pages we should read ahead for a given LIMIT
+*/
+static uint calculate_pages_for_limit(ha_rows limit, uint records_per_page)
+{
+  /* Guard against a zero estimate to avoid division by zero */
+  if (records_per_page == 0)
+    records_per_page= 1;
+  /* Calculate pages needed, with some buffer for safety */
+  uint pages_needed=
+    (uint)((limit + records_per_page - 1) / records_per_page);
+  /* Add 20% buffer for sparse pages and deleted records */
+  return (uint)(pages_needed * 1.2);
+}
+
+/**
+  Estimate the average number of records per page for the active index
+*/
+static uint estimate_records_per_page(handler *h_arg)
+{
+  TABLE *table= h_arg->get_table();
+  KEY *key_info= &table->key_info[h_arg->active_index];
+
+  /* Use table statistics to estimate records per page */
+  ha_rows total_rows= table->file->stats.records;
+  ha_rows index_pages= table->file->stats.data_file_length /
+                       table->file->stats.block_size;
+
+  if (index_pages == 0)
+    return (uint) total_rows;
+
+  uint records_per_page= (uint)(total_rows / index_pages);
+
+  /* Apply bounds based on key size */
+  uint key_length= key_info->key_length;
+  uint page_size= table->file->stats.block_size;
+  uint max_records= page_size / key_length;
+
+  return std::min(records_per_page, max_records);
+}
+
 /**
   DS-MRR: Initialize and start MRR scan
 
@@ -1142,7 +1180,15 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   is_mrr_assoc= !MY_TEST(mode & HA_MRR_NO_ASSOCIATION);
   strategy_exhausted= FALSE;
-
+
+  uint max_pages_for_limit= 0;
+  if (limit_hint != HA_POS_ERROR && limit_hint > 2)
+    max_pages_for_limit= calculate_pages_for_limit(
+      limit_hint, estimate_records_per_page(h_arg));
+
+  h_arg->configure_mrr_readahead(max_pages_for_limit);
+
   /* By default, have do-nothing buffer manager */
   buf_manager.arg= this;
   buf_manager.reset_buffer_sizes= do_nothing;
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 0a1f183613ac1..d8f093108c9a4 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -557,10 +557,11 @@ class DsMrr_impl
 public:
   typedef void (handler::*range_check_toggle_func_t)(bool on);
 
-  void init(handler *h_arg, TABLE *table_arg)
+  void init(handler *h_arg, TABLE *table_arg, ha_rows limit= HA_POS_ERROR)
   {
     primary_file= h_arg;
     table= table_arg;
+    limit_hint= limit;
   }
   int dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
                  void *seq_init_param, uint n_ranges, uint mode,
@@ -576,6 +577,9 @@ class DsMrr_impl
                                uint *flags, ha_rows limit, Cost_estimate *cost);
   int dsmrr_explain_info(uint mrr_mode, char *str, size_t size);
 
+  void set_limit(ha_rows limit) { limit_hint= limit; }
+  ha_rows get_limit() { return limit_hint; }
+
 private:
   /* Buffer to store (key, range_id) pairs */
   Lifo_buffer *key_buffer= nullptr;
@@ -635,7 +639,10 @@ class DsMrr_impl
     is_mrr_assoc==FALSE
   */
   Forward_lifo_buffer rowid_buffer;
-
+
+  /* LIMIT value */
+  ha_rows limit_hint= HA_POS_ERROR;
+
   bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz,
                        Cost_estimate *cost);
   bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
index b57f5ec5d2e69..66131125dbd64 100644
--- a/storage/innobase/btr/btr0cur.cc
+++ b/storage/innobase/btr/btr0cur.cc
@@ -1079,7 +1079,8 @@ static int btr_latch_prev(rw_lock_type_t rw_latch,
 }
 
 dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
-                               btr_latch_mode latch_mode, mtr_t *mtr)
+                               btr_latch_mode latch_mode, mtr_t *mtr,
+                               mrr_readahead_ctx_t* mrr_ctx)
 {
   ut_ad(index()->is_btree());
@@ -1245,7 +1246,7 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
   page_cur.block= block;
   ut_ad(block == mtr->at_savepoint(block_savepoint));
 
-  const bool not_first_access{buf_page_make_young_if_needed(&block->page)};
+  buf_page_make_young_if_needed(&block->page);
 #ifdef UNIV_ZIP_DEBUG
   if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block))
     ut_a(page_zip_validate(page_zip, block->page.frame, index()));
@@ -1253,6 +1254,27 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
 
   uint32_t page_level= btr_page_get_level(block->page.frame);
 
+  /* MRR read-ahead: collect leaf page numbers at PAGE_LEVEL = 1 */
+  if (mrr_ctx && mrr_ctx->enabled && page_level == 1 &&
+      mrr_ctx->page_list && mrr_ctx->pages_found < mrr_ctx->max_pages)
+  {
+    /* Collect child page numbers from non-leaf records */
+    mem_heap_t *heap= nullptr;
+    rec_t* rec= page_get_infimum_rec(block->page.frame);
+    while (rec && mrr_ctx->pages_found < mrr_ctx->max_pages)
+    {
+      rec= page_rec_get_next(rec);
+      if (!rec || page_rec_is_supremum(rec)) break;
+      /* Extract the child page number from the non-leaf record */
+      rec_offs* child_offsets= rec_get_offsets(rec, index(), nullptr, 0,
+                                               ULINT_UNDEFINED, &heap);
+      mrr_ctx->page_list[mrr_ctx->pages_found++]=
+        btr_node_ptr_get_child_page_no(rec, child_offsets);
+    }
+    if (heap)
+      mem_heap_free(heap);
+  }
+
   if (height == ULINT_UNDEFINED)
   {
     /* We are in the B-tree index root page. */
@@ -1530,9 +1552,6 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
   case BTR_SEARCH_PREV: /* btr_pcur_move_to_prev() */
     ut_ad(rw_latch == RW_S_LATCH);
 
-    if (!not_first_access)
-      buf_read_ahead_linear(page_id);
-
     if (page_has_prev(block->page.frame) &&
         page_rec_is_first(page_cur.rec, block->page.frame))
     {
@@ -1566,8 +1585,6 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
   case BTR_MODIFY_LEAF:
   case BTR_SEARCH_LEAF:
     rw_latch= rw_lock_type_t(latch_mode);
-    if (!not_first_access)
-      buf_read_ahead_linear(page_id);
     break;
   case BTR_MODIFY_TREE:
     ut_ad(rw_latch == RW_X_LATCH);
@@ -2035,11 +2052,7 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index,
 
   ut_ad(latch_mode != BTR_MODIFY_TREE || upper_rw_latch == RW_X_LATCH);
 
-  if (latch_mode != BTR_MODIFY_TREE)
-  {
-    if (!height && first && first_access)
-      buf_read_ahead_linear(page_id_t(block->page.id().space(), page));
-  }
+  if (latch_mode != BTR_MODIFY_TREE);
   else if (btr_cur_need_opposite_intention(block->page, index->is_clust(),
                                            lock_intention,
                                            node_ptr_max_size, compress_limit,
@@ -6459,9 +6472,9 @@ btr_copy_blob_prefix(
 	ulint		copied_len = 0;
 
 	THD*		thd{current_thd};
-	for (mtr_t mtr{thd ? thd_to_trx(thd) : nullptr};;) {
+	for (mtr_t mtr{thd ? thd_to_trx(thd) : nullptr};;
+	     offset = FIL_PAGE_DATA) {
 		buf_block_t*	block;
-		const page_t*	page;
 		const byte*	blob_header;
 		ulint		part_len;
 		ulint		copy_len;
@@ -6470,16 +6483,14 @@ btr_copy_blob_prefix(
 		block = buf_page_get(id, 0, RW_S_LATCH, &mtr);
 
 		if (!block || btr_check_blob_fil_page_type(*block, "read")) {
+func_exit:
 			mtr.commit();
 			return copied_len;
 		}
 
-		if (!buf_page_make_young_if_needed(&block->page)) {
-			buf_read_ahead_linear(id);
-		}
-		page = buf_block_get_frame(block);
+		buf_page_make_young_if_needed(&block->page);
 
-		blob_header = page + offset;
+		blob_header= block->page.frame + offset;
 		part_len = btr_blob_get_part_len(blob_header);
 		copy_len = ut_min(part_len, len - copied_len);
 
@@ -6487,21 +6498,21 @@ btr_copy_blob_prefix(
 			blob_header + BTR_BLOB_HDR_SIZE, copy_len);
 		copied_len += copy_len;
 
-		id.set_page_no(btr_blob_get_next_page_no(blob_header));
-
-		mtr_commit(&mtr);
-
-		if (id.page_no() == FIL_NULL || copy_len != part_len) {
+		const uint32_t next{btr_blob_get_next_page_no(blob_header)};
+		if (next == FIL_NULL || copy_len != part_len) {
 			MEM_CHECK_DEFINED(buf, copied_len);
-			return(copied_len);
+			goto func_exit;
 		}
 
+		mtr_commit(&mtr);
+
 		/* On other BLOB pages except the first the BLOB header
 		always is at the page data start: */
-		offset = FIL_PAGE_DATA;
 
 		ut_ad(copied_len <= len);
+		id.set_page_no(next);
 	}
 }
diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc
index ed545d35a22c0..5ed23207296c1 100644
--- a/storage/innobase/btr/btr0pcur.cc
+++ b/storage/innobase/btr/btr0pcur.cc
@@ -534,7 +534,8 @@ btr_pcur_move_to_next_page(
 	const auto s = mtr->get_savepoint();
 	mtr->rollback_to_savepoint(s - 2, s - 1);
 	if (first_access) {
-		buf_read_ahead_linear(next_block->page.id());
+		buf_read_ahead_one(cursor->index()->table->space,
+				   btr_page_get_next(next_block->page.frame));
 	}
 	return DB_SUCCESS;
 }
@@ -559,6 +560,51 @@ btr_pcur_move_backward_from_page(
 {
 	ut_ad(btr_pcur_is_before_first_on_page(cursor));
 	ut_ad(!btr_pcur_is_before_first_in_tree(cursor));
+	ut_ad(!cursor->old_rec);
+
+	const auto latch_mode = cursor->latch_mode;
+	ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF);
+
+	uint32_t space= btr_pcur_get_block(cursor)->page.id().space();
+	uint32_t page_no= btr_page_get_prev(btr_pcur_get_page(cursor));
+	/* Fast path: try to latch the previous page without waiting */
+	if (buf_block_t *prev =
+	    buf_pool.page_fix(page_id_t(space, page_no), nullptr,
+			      nullptr, buf_pool_t::FIX_NOWAIT)) {
+		if (prev == reinterpret_cast<buf_block_t*>(-1)) {
+		} else if (latch_mode == BTR_SEARCH_LEAF
+			   ? prev->page.lock.s_lock_try()
+			   : prev->page.lock.x_lock_try()) {
+			const page_t *page= btr_pcur_get_page(cursor);
+			const page_t *p= prev->page.frame;
+			if (memcmp_aligned<4>(FIL_PAGE_NEXT + p,
+					      FIL_PAGE_OFFSET + page, 4)
+			    || memcmp_aligned<2>(FIL_PAGE_TYPE + p,
+						 FIL_PAGE_TYPE + page, 2)
+			    || memcmp_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID
+						 + p,
+						 PAGE_HEADER + PAGE_INDEX_ID
+						 + page, 8)
+			    || page_is_comp(p) != page_is_comp(page)) {
+				ut_ad("corrupted" == 0);
+				mtr->memo_push(prev,
+					       mtr_memo_type_t(latch_mode));
+			} else {
+				page_cur_set_after_last(
+					prev, &cursor->btr_cur.page_cur);
+				mtr->commit();
+				mtr->start();
+				mtr->memo_push(
+					prev, mtr_memo_type_t(latch_mode));
+				buf_read_ahead_one(
+					cursor->index()->table->space,
+					btr_page_get_prev(p));
+				return false;
+			}
+		} else {
+			mtr->memo_push(prev, MTR_MEMO_BUF_FIX);
+		}
+	}
 
 	btr_pcur_store_position(cursor, mtr);
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index 1e2f02b2f0c09..7b69e15f2f8d0 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -2542,6 +2542,68 @@ buf_block_t *buf_pool_t::unzip(buf_page_t *b, buf_pool_t::hash_chain &chain)
   return block;
 }
 
+/** Applies a random read-ahead in buf_pool if there are at least a threshold
+value of accessed pages from the random read-ahead area. Does not read any
+page, not even the one at the position (space, offset), if the read-ahead
+mechanism is not activated. NOTE: the calling thread may own latches on
+pages: to avoid deadlocks this function must be written such that it cannot
+end up waiting for these latches!
+@param[in]	page_id	page id of a page which the current thread
+wants to access */
+TRANSACTIONAL_TARGET
+static void buf_read_ahead_random(const page_id_t page_id) noexcept
+{
+  if (!srv_random_read_ahead || page_id.space() >= SRV_TMP_SPACE_ID)
+    /* Disable the read-ahead for temporary tablespace */
+    return;
+
+  if (srv_startup_is_before_trx_rollback_phase)
+    /* No read-ahead to avoid thread deadlocks */
+    return;
+
+  if (page_id == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO))
+    return;
+
+  if (os_aio_pending_reads_approx() > buf_pool.curr_size() / 2)
+    return;
+
+  fil_space_t* space= fil_space_t::get(page_id.space());
+  if (!space)
+    return;
+
+  const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area;
+  ulint count= 5 + buf_read_ahead_area / 8;
+  const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
+  page_id_t high= low + buf_read_ahead_area;
+  high.set_page_no(std::min(high.page_no(), space->last_page_number()));
+
+  /* Count how many blocks in the area have been recently accessed,
+  that is, reside near the start of the LRU list.
+  */
+
+  for (page_id_t i= low; i < high; ++i)
+  {
+    bool ok= false;
+    {
+      buf_pool_t::hash_chain &chain=
+        buf_pool.page_hash.cell_get(i.fold());
+      transactional_shared_lock_guard<page_hash_latch> g
+        {buf_pool.page_hash.lock_get(chain)};
+      if (const buf_page_t *bpage= buf_pool.page_hash.get(i, chain))
+        if (bpage->is_accessed() && buf_page_peek_if_young(bpage))
+          ok= true;
+    }
+    if (ok && !--count)
+    {
+      buf_read_ahead_random(space, low, high);
+      break;
+    }
+  }
+
+  space->release();
+}
+
 buf_block_t *buf_pool_t::page_fix(const page_id_t id, dberr_t *err,
                                   trx_t *trx,
                                   buf_pool_t::page_fix_conflicts c) noexcept
diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc
index 59bb3c5c2327e..4c3bbcdf01c41 100644
--- a/storage/innobase/buf/buf0rea.cc
+++ b/storage/innobase/buf/buf0rea.cc
@@ -378,6 +378,7 @@ static void buf_read_release(buf_block_t *block) noexcept
   }
 }
 
+#if 0
 ATTRIBUTE_NOINLINE
 /** Free a buffer block if needed, and update the read-ahead count.
 @param block  block to be freed
@@ -407,98 +408,110 @@ static size_t buf_read_release_count(buf_block_t *block, size_t count) noexcept
   stats->pages_prefetched+= count;
   return count;
 }
+#endif
 
-/** Applies a random read-ahead in buf_pool if there are at least a threshold
-value of accessed pages from the random read-ahead area. Does not read any
-page, not even the one at the position (space, offset), if the read-ahead
-mechanism is not activated. NOTE: the calling thread may own latches on
-pages: to avoid deadlocks this function must be written such that it cannot
-end up waiting for these latches!
-@param[in]	page_id	page id of a page which the current thread
-wants to access
-@return number of page read requests issued */
-TRANSACTIONAL_TARGET
-ulint buf_read_ahead_random(const page_id_t page_id) noexcept
+/** Apply a random read-ahead of pages.
+@param space	tablespace
+@param low	first page to attempt to read
+@param high	last page to attempt to read */
+void buf_read_ahead_random(fil_space_t *space, page_id_t low,
+                           page_id_t high) noexcept
 {
-  if (!srv_random_read_ahead || page_id.space() >= SRV_TMP_SPACE_ID)
-    /* Disable the read-ahead for temporary tablespace */
-    return 0;
-
-  if (srv_startup_is_before_trx_rollback_phase)
-    /* No read-ahead to avoid thread deadlocks */
-    return 0;
-
-  if (os_aio_pending_reads_approx() >
-      buf_pool.curr_size() / BUF_READ_AHEAD_PEND_LIMIT)
-    return 0;
-
-  fil_space_t* space= fil_space_t::get(page_id.space());
-  if (!space)
-    return 0;
-
-  const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area;
-  ulint count= 5 + buf_read_ahead_area / 8;
-  const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
-  page_id_t high= low + buf_read_ahead_area;
-  high.set_page_no(std::min(high.page_no(), space->last_page_number()));
-
-  /* Count how many blocks in the area have been recently accessed,
-  that is, reside near the start of the LRU list.
-  */
+  const unsigned zip_size{space->zip_size()};
+  size_t count{0};
 
   for (page_id_t i= low; i < high; ++i)
   {
+    if (space->is_stopping())
+      break;
+    space->reacquire();
     buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold());
-    transactional_shared_lock_guard<page_hash_latch> g
-      {buf_pool.page_hash.lock_get(chain)};
-    if (const buf_page_t *bpage= buf_pool.page_hash.get(i, chain))
-      if (bpage->is_accessed() && buf_page_peek_if_young(bpage) && !--count)
-        goto read_ahead;
-  }
+    if (buf_pool.page_hash_contains(i, chain))
+    {
+skip:
+      space->release();
+      return;
+    }
 
-no_read_ahead:
-  space->release();
-  return 0;
+    buf_block_t *block= nullptr;
+    if (UNIV_LIKELY(!zip_size))
+    {
+allocate_block:
+      if (UNIV_UNLIKELY(!(block= buf_read_acquire())))
+        goto skip;
+    }
+    else if (recv_recovery_is_on())
+      goto allocate_block;
 
-read_ahead:
-  if (space->is_stopping())
-    goto no_read_ahead;
+    if (!buf_read_page_low(i, zip_size, nullptr, chain, space, block))
+      ut_ad(!block);
+    else
+      buf_read_release(block);
 
-  /* Read all the suitable blocks within the area */
-  buf_block_t *block= nullptr;
-  unsigned zip_size{space->zip_size()};
-  if (UNIV_LIKELY(!zip_size))
-  {
-  allocate_block:
-    if (UNIV_UNLIKELY(!(block= buf_read_acquire())))
-      goto no_read_ahead;
+    count++;
   }
-  else if (recv_recovery_is_on())
+
+  if (count)
   {
-    zip_size|= 1;
-    goto allocate_block;
+    DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u",
+                          count, space->chain.start->name, low.page_no()));
+    mysql_mutex_lock(&buf_pool.mutex);
+    /* Read ahead is considered one I/O operation for the purpose of
+    LRU policy decision. */
+    buf_pool.stat.n_ra_pages_read_rnd+= count;
+    mysql_mutex_unlock(&buf_pool.mutex);
   }
+}
 
-  /* Read all the suitable blocks within the area */
-  for (page_id_t i= low; i < high; ++i)
+/** Read ahead a page if it is not yet in the buffer pool.
+@param space	tablespace
+@param page	page to read ahead */
+void buf_read_ahead_one(fil_space_t *space, uint32_t page) noexcept
+{
+  if (recv_recovery_is_on())
+    return; /* Before MDEV-32042, dict_boot() may end up here. */
+  page_id_t id{space->id, page};
+  auto &page_hash= buf_pool.page_hash;
+  auto& chain= page_hash.cell_get(id.fold());
+  page_hash_latch &hash_lock= page_hash.lock_get(chain);
+
+  hash_lock.lock_shared();
+  buf_page_t *b= page_hash.get(id, chain);
+  hash_lock.unlock_shared();
+  if (b)
+    return;
+  if (space->last_page_number() < page || !space->acquire())
+    return;
+  buf_block_t *block= buf_LRU_get_free_block(have_no_mutex);
+  const unsigned zip_size{space->zip_size()};
+  if (buf_page_t *bpage=
+      buf_page_init_for_read(id, zip_size, chain, block))
   {
-    if (space->is_stopping())
-      break;
-    buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold());
-    space->reacquire();
-    if (reinterpret_cast<buf_block_t*>(-1) ==
-        buf_read_page_low(i, zip_size, nullptr, chain, space, block, nullptr))
+    const bool exist(uintptr_t(bpage) & 1);
+    bpage= reinterpret_cast<buf_page_t*>(uintptr_t(bpage) & ~uintptr_t{1});
+    if (exist)
     {
-      count++;
-      ut_ad(!block);
-      if ((UNIV_LIKELY(!zip_size) || (zip_size & 1)) &&
-          UNIV_UNLIKELY(!(block= buf_read_acquire())))
-        break;
+      bpage->unfix();
+      /* The preallocated block was not used; release it so that we
+      do not leak buffer pool pages (see MDEV-32067). */
+      buf_read_release(block);
+      space->release();
+      return;
+    }
+    const ulint len{zip_size ? zip_size : srv_page_size};
+    if (UNIV_LIKELY(space->io(IORequest(IORequest::READ_ASYNC),
+                              os_offset_t{page} * len, len,
+                              zip_size ? bpage->zip.data : bpage->frame,
+                              bpage).err == DB_SUCCESS))
+    {
+      mysql_mutex_lock(&buf_pool.mutex);
+      buf_pool.stat.n_ra_pages_read++;
+      mysql_mutex_unlock(&buf_pool.mutex);
+      return;
+    }
+    buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX);
+  }
+  else
+    buf_read_release(block);
   space->release();
-
-  return buf_read_release_count(block, count);
 }
 
 buf_block_t *buf_read_page(const page_id_t page_id, dberr_t *err,
@@ -580,47 +593,30 @@ void buf_read_page_background(const page_id_t page_id, fil_space_t *space,
   }
 }
 
-/** Applies linear read-ahead if in the buf_pool the page is a border page of
+/** Apply linear read-ahead if an undo log page is a border page of
 a linear read-ahead area and all the pages in the area have been accessed.
-Does not read any page if the read-ahead mechanism is not activated. Note
-that the algorithm looks at the 'natural' adjacent successor and
-predecessor of the page, which on the leaf level of a B-tree are the next
-and previous page in the chain of leaves. To know these, the page specified
-in (space, offset) must already be present in the buf_pool. Thus, the
-natural way to use this function is to call it when a page in the buf_pool
-is accessed the first time, calling this function just after it has been
-bufferfixed.
-NOTE 1: as this function looks at the natural predecessor and successor
-fields on the page, what happens, if these are not initialized to any
-sensible value? No problem, before applying read-ahead we check that the
-area to read is within the span of the space, if not, read-ahead is not
-applied. An uninitialized value may result in a useless read operation, but
-only very improbably.
-NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
-function must be written such that it cannot end up waiting for these
-latches!
-@param[in]	page_id	page id; see NOTE 3 above
+Does not read any page if the read-ahead mechanism is not activated.
+@param space	undo tablespace or fil_system.space, or nullptr
+@param page_id	undo page identifier
 @return number of page read requests issued */
-ulint buf_read_ahead_linear(const page_id_t page_id) noexcept
+TRANSACTIONAL_TARGET
+ulint buf_read_ahead_undo(fil_space_t *space, const page_id_t page_id) noexcept
 {
-  /* check if readahead is disabled.
-  Disable the read ahead logic for temporary tablespace */
-  if (!srv_read_ahead_threshold || page_id.space() >= SRV_TMP_SPACE_ID)
+  if (!srv_read_ahead_threshold)
     return 0;
 
   if (srv_startup_is_before_trx_rollback_phase)
     /* No read-ahead to avoid thread deadlocks */
    return 0;
 
-  if (os_aio_pending_reads_approx() >
-      buf_pool.curr_size() / BUF_READ_AHEAD_PEND_LIMIT)
+  if (os_aio_pending_reads_approx() > buf_pool.curr_size() / 2)
    return 0;
 
   const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area;
   const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
   const page_id_t high_1= low + (buf_read_ahead_area - 1);
 
-  /* We will check that almost all pages in the area have been accessed
+  /* We will check that almost all pages in the area have been accessed
   in the desired order.
   */
 
   const bool descending= page_id != low;
 
@@ -628,28 +624,36 @@ ulint buf_read_ahead_linear(const page_id_t page_id) noexcept
     /* This is not a border page of the area */
     return 0;
 
-  fil_space_t *space= fil_space_t::get(page_id.space());
+  fil_space_t *my_space= nullptr;
   if (!space)
-    return 0;
+  {
+    space= my_space= fil_space_t::get(page_id.space());
+    if (!space)
+      return 0;
+  }
+
+  const unsigned zip_size= space->zip_size();
+  ulint count;
 
   if (high_1.page_no() > space->last_page_number())
   {
     /* The area is not whole. */
 fail:
-    space->release();
-    return 0;
+    count= 0;
+func_exit:
+    if (my_space)
+      my_space->release();
+    return count;
   }
 
   if (trx_sys_hdr_page(page_id))
-    /* If it is an ibuf bitmap page or trx sys hdr, we do no
-    read-ahead, as that could break the ibuf page access order */
     goto fail;
 
   /* How many out of order accessed pages can we ignore
   when working out the access pattern for linear readahead */
-  ulint count= std::min(buf_pool_t::READ_AHEAD_PAGES -
-                        srv_read_ahead_threshold,
-                        uint32_t{buf_pool.read_ahead_area});
+  count= std::min(buf_pool_t::READ_AHEAD_PAGES -
+                  srv_read_ahead_threshold,
+                  uint32_t{buf_pool.read_ahead_area});
   page_id_t new_low= low, new_high_1= high_1;
   unsigned prev_accessed= 0;
   for (page_id_t i= low; i <= high_1; ++i)
@@ -738,41 +742,108 @@ ulint buf_read_ahead_linear(const page_id_t page_id) noexcept
   }
 
   /* If we got this far, read-ahead can be sensible: do it */
-  buf_block_t *block= nullptr;
-  unsigned zip_size{space->zip_size()};
-  if (UNIV_LIKELY(!zip_size))
-  {
-  allocate_block:
-    if (UNIV_UNLIKELY(!(block= buf_read_acquire())))
-      goto fail;
-  }
-  else if (recv_recovery_is_on())
-  {
-    zip_size|= 1;
-    goto allocate_block;
-  }
-
   count= 0;
 
   for (; new_low <= new_high_1; ++new_low)
   {
     if (space->is_stopping())
       break;
-    buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(new_low.fold());
     space->reacquire();
-    if (reinterpret_cast<buf_block_t*>(-1) ==
-        buf_read_page_low(new_low, zip_size, nullptr,
-                          chain, space, block, nullptr))
+    buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(new_low.fold());
+    if (buf_pool.page_hash_contains(new_low, chain))
+      goto func_exit;
+
+    buf_block_t *block= nullptr;
+    if (UNIV_LIKELY(!zip_size))
     {
-      count++;
+      if (UNIV_UNLIKELY(!(block= buf_read_acquire())))
+        goto func_exit;
+    }
+
+    if (!buf_read_page_low(new_low, zip_size, nullptr, chain, space, block))
       ut_ad(!block);
-      if ((UNIV_LIKELY(!zip_size) || (zip_size & 1)) &&
-          UNIV_UNLIKELY(!(block= buf_read_acquire())))
-        break;
+    else
+      buf_read_release(block);
+
+    count++;
   }
+
+  if (count)
+  {
+    DBUG_PRINT("ib_buf", ("linear read-ahead %zu pages from %s: %u",
+                          count, space->chain.start->name,
+                          new_low.page_no()));
+    mysql_mutex_lock(&buf_pool.mutex);
+    /* Read ahead is considered one I/O operation for the purpose of
+    LRU policy decision. */
+    buf_pool.stat.n_ra_pages_read+= count;
+    mysql_mutex_unlock(&buf_pool.mutex);
+  }
+
+  goto func_exit;
+}
+
+/** Read ahead a number of pages.
+@param space	tablespace
+@param pages	pages to read ahead */
+void buf_read_ahead_pages(fil_space_t *space,
+                          st_::span<const uint32_t> pages) noexcept
+{
+#if MYSQL_VERSION_ID < 110000
+  if (recv_recovery_is_on())
+    return; /* Before MDEV-32042, dict_boot() may end up here. */
+#endif
+  ut_ad(!recv_recovery_is_on());
+  if (os_aio_pending_reads_approx() > buf_pool.curr_size() / 2)
+    return;
+  page_id_t id{space->id, 0};
+  const unsigned zip_size{space->zip_size()};
+  const ulint len{zip_size ? zip_size : srv_page_size};
+  ulint count= 0;
+  for (const uint32_t page : pages)
+  {
+    if (space->last_page_number() < page || !space->acquire())
+      return;
+    id.set_page_no(page);
+    buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+    buf_block_t *block= buf_LRU_get_free_block(have_no_mutex);
+    if (buf_page_t *bpage=
+        buf_page_init_for_read(id, zip_size, chain, block))
+    {
+      const bool exist(uintptr_t(bpage) & 1);
+      bpage= reinterpret_cast<buf_page_t*>(uintptr_t(bpage) & ~uintptr_t{1});
+      if (exist)
+      {
+        bpage->unfix();
+        buf_read_release(block);
+        space->release();
+        continue;
+      }
+      if (UNIV_LIKELY(space->io(IORequest(IORequest::READ_ASYNC),
+                                os_offset_t{id.page_no()} * len, len,
+                                zip_size ? bpage->zip.data : bpage->frame,
+                                bpage).err == DB_SUCCESS))
+      {
+        count++;
+        continue;
+      }
+      buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX);
+      continue;
+    }
+
+    buf_read_release(block);
+    /* We stop on the first error. */
+    space->release();
+    break;
+  }
+
+  if (count)
+  {
+    mysql_mutex_lock(&buf_pool.mutex);
+    buf_pool.stat.n_ra_pages_read+= count;
+    mysql_mutex_unlock(&buf_pool.mutex);
+  }
+}
 
 /** Schedule a page for recovery.
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 4fda59511556b..857e2232705d3 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -109,6 +109,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 #include "ut0mem.h"
 #include "row0ext.h"
 #include "innodb_binlog.h"
+#include "buf0rea.h"
 
 #include "lz4.h"
 #include "lzo/lzo1x.h"
@@ -2929,7 +2930,10 @@ ha_innobase::ha_innobase(
 		  | HA_CAN_SKIP_LOCKED
 	  ),
 	m_start_of_scan(),
-	m_mysql_has_locked()
+	m_mysql_has_locked(),
+	m_mrr_readahead_pages(0),
+	m_mrr_readahead_enabled(false),
+	m_mrr_readahead_triggered(false)
 {}
 
 /*********************************************************************//**
@@ -8939,6 +8943,13 @@ ha_innobase::index_read(
 		build_template(false);
 	}
 
+	mrr_readahead_ctx_t *mrr_ctx= nullptr;
+	if (m_mrr_readahead_enabled && !m_mrr_readahead_triggered) {
+		/* Collect leaf page numbers during the index descent */
+		mrr_ctx= new mrr_readahead_ctx_t(
+			true, m_mrr_readahead_pages);
+	}
+
 	if (key_len) {
 		ut_ad(key_ptr);
 
 		/* Convert the search key value to InnoDB format into
@@ -8971,10 +8982,22 @@ ha_innobase::index_read(
 	mariadb_set_stats temp(m_prebuilt->trx, handler_stats);
 
 	dberr_t	ret =
-		row_search_mvcc(buf, mode, m_prebuilt, m_last_match_mode, 0);
+		row_search_mvcc(buf, mode, m_prebuilt, m_last_match_mode, 0,
+				mrr_ctx);
 
 	DBUG_EXECUTE_IF("ib_select_query_failure", ret = DB_ERROR;);
 
+	if (mrr_ctx) {
+		/* Issue read-ahead for the collected leaf pages */
+		buf_read_ahead_pages(
+			index->table->space,
+			st_::span<const uint32_t>(mrr_ctx->page_list,
+						  mrr_ctx->pages_found));
+		delete mrr_ctx;
+		mrr_ctx= nullptr;
+		m_mrr_readahead_triggered= true;
+	}
+
 	if (UNIV_LIKELY(ret == DB_SUCCESS)) {
 		table->status = 0;
 		DBUG_RETURN(0);
@@ -20227,6 +20250,34 @@ static void innodb_params_adjust()
 	ut_ad(MYSQL_SYSVAR_NAME(log_write_ahead_size).max_val == 4096);
 }
 
+/** Extract LIMIT information from optimizer context */
+static ha_rows extract_limit_from_optimizer(THD *thd)
+{
+	if (!thd || !thd->lex || !thd->lex->current_select)
+		return HA_POS_ERROR;
+
+	SELECT_LEX *select_lex = thd->lex->current_select;
+
+	/* Check for LIMIT clause */
+	Item *limit_item= select_lex->limit_params.select_limit;
+
+	if (limit_item && limit_item->const_item())
+	{
+		/* Use the constant LIMIT value when it is positive */
+		longlong limit_val = limit_item->val_int();
+		if (limit_val > 0)
+			return (ha_rows)limit_val;
+	}
+
+	/* Check for EXISTS subquery pattern */
+	if (select_lex->master_unit() &&
+	    select_lex->master_unit()->item &&
+	    select_lex->master_unit()->item->type() == Item::SUBSELECT_ITEM)
+	{
+		Item_subselect *subq =
+			(Item_subselect*) select_lex->master_unit()->item;
+		if (subq->substype() == Item_subselect::EXISTS_SUBS)
+			/* EXISTS subquery - effectively LIMIT 1 */
+			return 1;
+	}
+
+	return HA_POS_ERROR;
+}
+
 /****************************************************************************
  * DS-MRR implementation
  ***************************************************************************/
@@ -20241,8 +20292,10 @@ ha_innobase::multi_range_read_init(
 	uint		mode,
 	HANDLER_BUFFER*	buf)
 {
-	return(m_ds_mrr.dsmrr_init(this, seq, seq_init_param,
-				   n_ranges, mode, buf));
+	if (m_ds_mrr.get_limit() == HA_POS_ERROR)
+		m_ds_mrr.set_limit(extract_limit_from_optimizer(ha_thd()));
+	return(m_ds_mrr.dsmrr_init(this, seq, seq_init_param,
+				   n_ranges, mode, buf));
 }
 
 int
@@ -20264,7 +20317,7 @@ ha_innobase::multi_range_read_info_const(
 	Cost_estimate*	cost)
 {
 	/* See comments in ha_myisam::multi_range_read_info_const */
-	m_ds_mrr.init(this, table);
+	m_ds_mrr.init(this, table, limit);
 
 	if (m_prebuilt->select_lock_type != LOCK_NONE) {
 		*flags |= HA_MRR_USE_DEFAULT_IMPL;
@@ -20301,6 +20354,14 @@ ha_innobase::multi_range_read_explain_info(
 	return m_ds_mrr.dsmrr_explain_info(mrr_mode, str, size);
 }
 
+void
+ha_innobase::configure_mrr_readahead(uint max_pages)
+{
+	m_mrr_readahead_pages= max_pages;
+	/* A max_pages value of 0 disables the read-ahead */
+	m_mrr_readahead_enabled= max_pages > 0;
+	m_mrr_readahead_triggered= false;
+}
+
 /** Find or open a table handle for the virtual column template
 @param[in]	thd	thread handle
 @param[in,out]	table	InnoDB table whose virtual column template
diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h
index b6fb571078bea..0d200b961b1af 100644
--- a/storage/innobase/handler/ha_innodb.h
+++ b/storage/innobase/handler/ha_innodb.h
@@ -399,6 +399,10 @@ class ha_innobase final : public handler
 	int multi_range_read_explain_info(uint mrr_mode,
 					  char *str, size_t size) override;
 
+	/** Configure MRR read-ahead optimization based on LIMIT value.
+	@param max_pages Maximum number of pages to read ahead (0 = disable) */
+	void configure_mrr_readahead(uint max_pages) override;
+
 	/** Attempt to push down an index condition.
 	@param[in] keyno MySQL key number
 	@param[in] idx_cond Index condition to be checked
@@ -442,6 +446,7 @@ class ha_innobase final : public handler
 	@param ib_table InnoDB table definition
 	@retval true if not errors were found */
 	bool check_index_consistency(const dict_table_t* ib_table) noexcept;
+
 protected:
 
 	bool can_convert_string(const Field_string* field,
@@ -534,6 +539,11 @@ class ha_innobase final : public handler
 	/** If true, disable the Rowid Filter.
 	It is disabled when the engine is intialized for making rnd_pos() calls */
 	bool m_disable_rowid_filter;
+
+	/** MRR read-ahead configuration */
+	uint m_mrr_readahead_pages;
+	bool m_mrr_readahead_enabled;
+	bool m_mrr_readahead_triggered;
 };
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
index 53f88cc8ca1f5..f2d776856a1d4 100644
--- a/storage/innobase/include/btr0cur.h
+++ b/storage/innobase/include/btr0cur.h
@@ -732,7 +732,8 @@ struct btr_cur_t {
   @param mtr    mini-transaction
   @return error code */
   dberr_t search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
-                      btr_latch_mode latch_mode, mtr_t *mtr);
+                      btr_latch_mode latch_mode, mtr_t *mtr,
+                      mrr_readahead_ctx_t* mrr_ctx = nullptr);
 
   /** Search the leaf page record corresponding to a key, exclusively
   latching all sibling pages on the way.
diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h
index 5f84328da1904..c25174a4458c3 100644
--- a/storage/innobase/include/btr0pcur.h
+++ b/storage/innobase/include/btr0pcur.h
@@ -83,7 +83,8 @@ cursor.
 inline
 dberr_t btr_pcur_open_with_no_init(const dtuple_t *tuple,
                                    page_cur_mode_t mode,
                                    btr_latch_mode latch_mode,
-                                   btr_pcur_t *cursor, mtr_t *mtr);
+                                   btr_pcur_t *cursor, mtr_t *mtr,
+                                   mrr_readahead_ctx_t* mrr_ctx = nullptr);
 
 /**************************************************************//**
 Gets the up_match value for a pcur after a search.
diff --git a/storage/innobase/include/btr0pcur.inl b/storage/innobase/include/btr0pcur.inl
index 4f43972a23673..40a062d78ee06 100644
--- a/storage/innobase/include/btr0pcur.inl
+++ b/storage/innobase/include/btr0pcur.inl
@@ -313,13 +313,14 @@ cursor.
 inline
 dberr_t btr_pcur_open_with_no_init(const dtuple_t *tuple,
                                    page_cur_mode_t mode,
                                    btr_latch_mode latch_mode,
-                                   btr_pcur_t *cursor, mtr_t *mtr)
+                                   btr_pcur_t *cursor, mtr_t *mtr,
+                                   mrr_readahead_ctx_t* mrr_ctx)
 {
   cursor->latch_mode= BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode);
   cursor->search_mode= mode;
   cursor->pos_state= BTR_PCUR_IS_POSITIONED;
   cursor->trx_if_known= nullptr;
-  return cursor->btr_cur.search_leaf(tuple, mode, latch_mode, mtr);
+  return cursor->btr_cur.search_leaf(tuple, mode, latch_mode, mtr, mrr_ctx);
 }
 
 /**************************************************************//**
diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h
index 6d82160a99e73..69053d18a4771 100644
--- a/storage/innobase/include/btr0types.h
+++ b/storage/innobase/include/btr0types.h
@@ -105,3 +105,39 @@ enum btr_latch_mode {
   /** Try to delete mark a spatial index record */
   BTR_RTREE_DELETE_MARK = 256
 };
+
+/** MRR read-ahead context structure for passing
+through B-tree operations */
+struct mrr_readahead_ctx_t
+{
+  /** Whether MRR read-ahead is enabled */
+  bool enabled= false;
+  /** Array to store collected leaf page numbers */
+  uint32_t *page_list= nullptr;
+  /** Counter of pages found so far */
+  uint pages_found= 0;
+  /** Maximum number of pages in the array */
+  uint max_pages= 64;
+
+  /** Constructor with enable flag and page count */
+  mrr_readahead_ctx_t(bool enable, uint num_pages) :
+    enabled(enable), pages_found(0)
+  {
+    if (enabled)
+    {
+      if (num_pages > 0 && num_pages < 64)
+        max_pages= num_pages;
+      page_list= static_cast<uint32_t*>(malloc(max_pages *
+                                               sizeof(uint32_t)));
+    }
+  }
+
+  /** Destructor */
+  ~mrr_readahead_ctx_t()
+  {
+    if (page_list)
+    {
+      free(page_list);
+      page_list= nullptr;
+    }
+  }
+};
diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h
index bc50a7d52a297..bb23657bdd645 100644
--- a/storage/innobase/include/buf0rea.h
+++ b/storage/innobase/include/buf0rea.h
@@ -53,39 +53,32 @@ void buf_read_page_background(const page_id_t page_id, fil_space_t *space,
                               trx_t *trx) noexcept
   MY_ATTRIBUTE((nonnull(2)));
 
-/** Applies a random read-ahead in buf_pool if there are at least a threshold
-value of accessed pages from the random read-ahead area. Does not read any
-page, not even the one at the position (space, offset), if the read-ahead
-mechanism is not activated. NOTE: the calling thread may own latches on
-pages: to avoid deadlocks this function must be written such that it cannot
-end up waiting for these latches!
-@param[in]	page_id	page id of a page which the current thread
-wants to access
-@return number of page read requests issued */
-ulint buf_read_ahead_random(const page_id_t page_id) noexcept;
+/** Apply a random read-ahead of pages.
+@param space	tablespace
+@param low	first page to attempt to read
+@param high	last page to attempt to read */
+void buf_read_ahead_random(fil_space_t *space,
+                           page_id_t low, page_id_t high) noexcept;
 
-/** Applies linear read-ahead if in the buf_pool the page is a border page of
+/** Apply linear read-ahead if an undo log page is a border page of
 a linear read-ahead area and all the pages in the area have been accessed.
-Does not read any page if the read-ahead mechanism is not activated. Note
-that the algorithm looks at the 'natural' adjacent successor and
-predecessor of the page, which on the leaf level of a B-tree are the next
-and previous page in the chain of leaves. To know these, the page specified
-in (space, offset) must already be present in the buf_pool. Thus, the
-natural way to use this function is to call it when a page in the buf_pool
-is accessed the first time, calling this function just after it has been
-bufferfixed.
-NOTE 1: as this function looks at the natural predecessor and successor
-fields on the page, what happens, if these are not initialized to any
-sensible value? No problem, before applying read-ahead we check that the
-area to read is within the span of the space, if not, read-ahead is not
-applied. An uninitialized value may result in a useless read operation, but
-only very improbably.
-NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
-function must be written such that it cannot end up waiting for these
-latches!
-@param[in]	page_id	page id; see NOTE 3 above
+Does not read any page if the read-ahead mechanism is not activated.
+@param space	undo tablespace or fil_system.space, or nullptr
+@param id	undo page identifier
 @return number of page read requests issued */
-ulint buf_read_ahead_linear(const page_id_t page_id) noexcept;
+ulint buf_read_ahead_undo(fil_space_t *space, const page_id_t id) noexcept;
+
+/** Read ahead a page if it is not yet in the buffer pool.
+@param space	tablespace
+@param page	page to read ahead */
+void buf_read_ahead_one(fil_space_t *space, uint32_t page) noexcept;
+
+/** Read ahead a number of pages.
+@param space	tablespace
+@param pages	pages to read ahead */
+void buf_read_ahead_pages(fil_space_t *space,
+                          st_::span<const uint32_t> pages) noexcept;
 
 /** Schedule a page for recovery.
 @param space	tablespace
diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h
index 35e3cbe66315c..2f31262598c11 100644
--- a/storage/innobase/include/row0sel.h
+++ b/storage/innobase/include/row0sel.h
@@ -144,7 +144,8 @@ row_search_mvcc(
 	page_cur_mode_t	mode,
 	row_prebuilt_t*	prebuilt,
 	ulint		match_mode,
-	ulint		direction)
+	ulint		direction,
+	mrr_readahead_ctx_t* mrr_ctx = nullptr)
 	MY_ATTRIBUTE((warn_unused_result));
 
 /********************************************************************//**
diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc
index c3c17dd1d305e..372fa3549ef96 100644
--- a/storage/innobase/row/row0sel.cc
+++ b/storage/innobase/row/row0sel.cc
@@ -4345,7 +4345,8 @@ row_search_mvcc(
 	page_cur_mode_t	mode,
 	row_prebuilt_t*	prebuilt,
 	ulint		match_mode,
-	ulint		direction)
+	ulint		direction,
+	mrr_readahead_ctx_t* mrr_ctx)
 {
 	DBUG_ENTER("row_search_mvcc");
 	DBUG_ASSERT(prebuilt->index->table == prebuilt->table);
@@ -4796,7 +4797,7 @@ row_search_mvcc(
 	} else {
 		err = btr_pcur_open_with_no_init(search_tuple, mode,
 						 BTR_SEARCH_LEAF,
-						 pcur, &mtr);
+						 pcur, &mtr, mrr_ctx);
 	}
 
 	if (err != DB_SUCCESS) {
diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc
index 1cce439f6e1cf..819c42ed9fa2b 100644
--- a/storage/innobase/trx/trx0undo.cc
+++ b/storage/innobase/trx/trx0undo.cc
@@ -185,7 +185,7 @@ trx_undo_get_prev_rec_from_prev_page(buf_block_t *&block, uint16_t rec,
     return nullptr;
 
   if (!buf_page_make_young_if_needed(&block->page))
-    buf_read_ahead_linear(block->page.id());
+    buf_read_ahead_undo(nullptr, block->page.id());
 
   return trx_undo_page_get_last_rec(block, page_no, offset);
 }
@@ -271,19 +271,25 @@ trx_undo_get_next_rec_from_next_page(const buf_block_t *&block,
 @return undo log record, the page latched
 @retval nullptr if none */
 static trx_undo_rec_t*
-trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no,
+trx_undo_get_first_rec(fil_space_t *space, uint32_t page_no,
                        uint16_t offset, rw_lock_type_t mode,
                        const buf_block_t *&block, mtr_t *mtr, dberr_t *err)
 {
-  buf_block_t *b= buf_page_get_gen(page_id_t{space.id, page_no}, 0, mode,
+  buf_block_t *b= buf_page_get_gen(page_id_t{space->id, page_no}, 0, mode,
                                    nullptr, BUF_GET, mtr, err);
   block= b;
   if (!block)
    return nullptr;
 
   if (!buf_page_make_young_if_needed(&b->page))
-    buf_read_ahead_linear(b->page.id());
+  {
+    if (space->acquire())
+    {
+      buf_read_ahead_undo(space, b->page.id());
+      space->release();
+    }
+  }
 
   if (trx_undo_rec_t *rec= trx_undo_page_get_first_rec(b, page_no, offset))
     return rec;
@@ -911,7 +917,7 @@ trx_undo_truncate_start(
 	dberr_t		err;
 	const buf_block_t*	undo_page;
 
-	rec = trx_undo_get_first_rec(*rseg->space, hdr_page_no, hdr_offset,
+	rec = trx_undo_get_first_rec(rseg->space, hdr_page_no, hdr_offset,
 				     RW_X_LATCH, undo_page, &mtr, &err);
 	if (rec == NULL) {
 		/* Already empty */