forked from yuanrongxi/innodb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbtr0btr.cc
1858 lines (1439 loc) · 49.1 KB
/
btr0btr.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include "btr0btr.h"
#include "fsp0fsp.h"
#include "page0page.h"
#include "btr0cur.h"
#include "btr0sea.h"
#include "btr0pcur.h"
#include "rem0cmp.h"
#include "lock0lock.h"
#include "ibuf0ibuf.h"
/* Threshold used by the split-point heuristics in this file: the page's
PAGE_N_DIRECTION counter is compared against this value when deciding whether
the recent inserts form a sequential pattern (original note: "record-count
factor for page splits under sequential insert"). */
#define BTR_PAGE_SEQ_INSERT_LIMIT 5
/* Forward declarations of helpers that are defined later in this file. */
static void btr_page_create(page_t* page, dict_tree_t* tree, mtr_t* mtr);
UNIV_INLINE void btr_node_ptr_set_child_page_no(rec_t* rec, ulint page_no, mtr_t* mtr);
static rec_t* btr_page_get_father_node_ptr(dict_tree_t* tree, page_t* page, mtr_t* mtr);
static void btr_page_empty(page_t* page, mtr_t* mtr);
static ibool btr_page_insert_fits(btr_cur_t* cursor, rec_t* split_rec, dtuple_t* tuple);
/**************************************************************************/
/* Returns the root page of an index tree.
The page is fetched through btr_page_get() with an X-latch request that is
registered in mtr.  The caller must already hold the tree lock in X or S
mode within the same mtr (checked by a debug assertion). */
page_t* btr_root_get(dict_tree_t* tree, mtr_t* mtr)
{
	ut_ad(mtr_memo_contains(mtr, dict_tree_get_lock(tree), MTR_MEMO_X_LOCK)
	      || mtr_memo_contains(mtr, dict_tree_get_lock(tree), MTR_MEMO_S_LOCK));

	return btr_page_get(dict_tree_get_space(tree), dict_tree_get_page(tree), RW_X_LATCH, mtr);
}
/* Returns the record that precedes rec in B-tree order.
If rec has a predecessor on its own page that is not the infimum, that
record is returned.  Otherwise the record just before the supremum of the
previous page is returned, or NULL when the page has no previous page
(FIL_NULL).  The previous page is fetched without taking a new latch, so the
caller must already hold an S- or X-latch on it in mtr (debug assertion). */
rec_t* btr_get_prev_user_rec(rec_t* rec, mtr_t* mtr)
{
	page_t*	cur_page;
	page_t*	left_page;
	rec_t*	infimum;
	rec_t*	candidate;
	ulint	left_page_no;
	ulint	space_id;

	cur_page = buf_frame_align(rec);
	infimum = page_get_infimum_rec(cur_page);

	/* Fast path: the predecessor lives on the same page. */
	if (rec != infimum) {
		candidate = page_rec_get_prev(rec);

		if (candidate != infimum) {
			return candidate;
		}
	}

	left_page_no = btr_page_get_prev(cur_page, mtr);
	space_id = buf_frame_get_space_id(cur_page);

	if (left_page_no == FIL_NULL) {
		/* No page to the left on this level. */
		return NULL;
	}

	left_page = buf_page_get_with_no_latch(space_id, left_page_no, mtr);

	/* The caller is expected to have latched the left sibling already. */
	ut_ad((mtr_memo_contains(mtr, buf_block_align(left_page), MTR_MEMO_PAGE_S_FIX))
	      || (mtr_memo_contains(mtr, buf_block_align(left_page), MTR_MEMO_PAGE_X_FIX)));

	return page_rec_get_prev(page_get_supremum_rec(left_page));
}
/* Returns the record that follows rec in B-tree order.
If rec has a successor on its own page that is not the supremum, that record
is returned.  Otherwise the record right after the infimum of the NEXT page
is returned, or NULL when the page has no next page (FIL_NULL).  The next
page is fetched without taking a new latch, so the caller must already hold
an S- or X-latch on it in mtr (checked by a debug assertion). */
rec_t* btr_get_next_user_rec(rec_t* rec, mtr_t* mtr)
{
	page_t*	page;
	page_t*	next_page;
	ulint	next_page_no;
	rec_t*	next_rec;
	ulint	space;

	page = buf_frame_align(rec);

	/* Fast path: the successor lives on the same page. */
	if (page_get_supremum_rec(page) != rec) {
		next_rec = page_rec_get_next(rec);

		if (page_get_supremum_rec(page) != next_rec) {
			return next_rec;
		}
	}

	next_page_no = btr_page_get_next(page, mtr);
	space = buf_frame_get_space_id(page);

	if (next_page_no != FIL_NULL) {
		next_page = buf_page_get_with_no_latch(space, next_page_no, mtr);

		ut_ad((mtr_memo_contains(mtr, buf_block_align(next_page), MTR_MEMO_PAGE_S_FIX))
		      || (mtr_memo_contains(mtr, buf_block_align(next_page), MTR_MEMO_PAGE_X_FIX)));

		/* Step off the infimum of the next page, not of the current
		page: using the current page here would hand back a record
		that sorts before rec. */
		next_rec = page_rec_get_next(page_get_infimum_rec(next_page));

		return next_rec;
	}

	return NULL;
}
/* Turns an X-latched page into an empty index page that belongs to tree:
page_create() builds the page structure, then the tree's id is written into
the page as its index id.  Both steps are passed mtr. */
static void btr_page_create(page_t* page, dict_tree_t* tree, mtr_t* mtr)
{
	/* The caller must hold an X-latch on the page within this mtr. */
	ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX));

	page_create(page, mtr);

	/* Stamp the owning tree's id onto the new page. */
	btr_page_set_index_id(page, tree->id, mtr);
}
/* Allocates a page for an insert buffer tree.
Pages for such a tree come from the free list kept in the root page
(PAGE_BTR_IBUF_FREE_LIST): the first list entry is taken, the page it names
is X-latched, and its list node is unlinked from the free list.  The list
must not be empty (ut_a).  Returns the X-latched page. */
static page_t* btr_page_alloc_for_ibuf(dict_tree_t* tree, mtr_t* mtr)
{
	page_t*		root_page;
	page_t*		free_list;
	page_t*		allocated;
	fil_addr_t	first_free;

	root_page = btr_root_get(tree, mtr);
	free_list = root_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST;

	/* Address of the first entry on the free list. */
	first_free = flst_get_first(free_list, mtr);
	ut_a(first_free.page != FIL_NULL);

	/* Latch the page that holds that list node. */
	allocated = buf_page_get(dict_tree_get_space(tree), first_free.page, RW_X_LATCH, mtr);
	buf_page_dbg_add_level(allocated, SYNC_TREE_NODE_NEW);

	/* Unlink the page's node from the free list. */
	flst_remove(free_list, allocated + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
	ut_ad(flst_validate(free_list, mtr));

	return allocated;
}
/*B-TREE上分配页空间*/
page_t* btr_page_alloc(dict_index_t* tree, ulint hint_page_no, byte file_direction, ulint level, mtr_t* mtr)
{
fseg_header_t* seg_header;
page_t* root;
page_t* new_page;
ulint new_page_no;
ut_ad(mtr_memo_contains(mtr, dict_tree_get_lock(tree), MTR_MEMO_X_LOCK));
/*直接在ibuf上分配一个页*/
if(tree->type & DICT_IBUF)
return btr_page_alloc_for_ibuf(tree, mtr);
/*获得root节点对应的页*/
root = btr_root_get(tree, mtr);
if(level == 0)
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
else
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
/*在对应的表空间中分配一个页ID*/
new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no, file_direction, TRUE, mtr);
if(new_page_no == FIL_NULL) /*为获得合法的页ID*/
return NULL;
/*在IBUF上获得一个页空间,这个页还未真正初始化,需要调用btr_page_create对其初始化*/
new_page = buf_page_get(dict_tree_get_space(tree), new_page_no, RW_X_LATCH, mtr);
return new_page;
}
/* Returns a page count for the B-tree of index.
  flag == BTR_N_LEAF_PAGES  the count that fseg_n_reserved_pages() writes
                            through its second argument for the leaf
                            segment (its return value is ignored)
  flag == BTR_TOTAL_SIZE    the sum of the return values of
                            fseg_n_reserved_pages() for the non-leaf and
                            the leaf segment
Any other flag trips ut_a(0).  The work runs in a private mini-transaction
that S-locks the tree and is committed before returning. */
ulint btr_get_size(dict_index_t* index, ulint flag)
{
	mtr_t		mtr;
	page_t*		root_page;
	fseg_header_t*	segment;
	ulint		n_pages;
	ulint		ignored;

	mtr_start(&mtr);
	mtr_s_lock(dict_tree_get_lock(index->tree), &mtr);

	root_page = btr_root_get(index->tree, &mtr);

	if (flag == BTR_N_LEAF_PAGES) {
		segment = root_page + PAGE_HEADER + PAGE_BTR_SEG_LEAF;

		/* Only the out-parameter is of interest here. */
		fseg_n_reserved_pages(segment, &n_pages, &mtr);
	} else if (flag == BTR_TOTAL_SIZE) {
		segment = root_page + PAGE_HEADER + PAGE_BTR_SEG_TOP;
		n_pages = fseg_n_reserved_pages(segment, &ignored, &mtr);

		segment = root_page + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
		n_pages += fseg_n_reserved_pages(segment, &ignored, &mtr);
	} else {
		/* Unknown flag. */
		ut_a(0);
	}

	mtr_commit(&mtr);

	return n_pages;
}
/* Frees a page of an insert buffer tree by pushing its list node onto the
front of the free list stored in the tree's root page.  The caller must hold
an X-latch on the page in mtr.  The list is validated afterwards with ut_a. */
static void btr_page_free_for_ibuf(dict_tree_t* tree, page_t* page, mtr_t* mtr)
{
	page_t*	root_page;
	page_t*	free_list;

	ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX));

	root_page = btr_root_get(tree, mtr);
	free_list = root_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST;

	flst_add_first(free_list, page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);

	ut_a(flst_validate(free_list, mtr));
}
/* Frees an index page whose tree level is given by the caller.
  tree   tree that owns the page
  page   page to free; must be X-latched in mtr
  level  tree level of the page: 0 frees from the leaf segment, anything
         else from the non-leaf segment
  mtr    mini-transaction
The frame's modify clock is incremented first (presumably so that holders of
stale references can notice the change -- confirm against the buffer
module).  Insert buffer trees return the page to the root's free list;
other trees give it back to the owning file segment. */
void btr_page_free_low(dict_tree_t* tree, page_t* page, ulint level, mtr_t* mtr)
{
	page_t*		root_page;
	fseg_header_t*	owner_seg;

	ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX));

	buf_frame_modify_clock_inc(page);

	if (tree->type & DICT_IBUF) {
		btr_page_free_for_ibuf(tree, page, mtr);
		return;
	}

	/* The segment headers are stored in the root page. */
	root_page = btr_root_get(tree, mtr);
	owner_seg = root_page + PAGE_HEADER + (level == 0 ? PAGE_BTR_SEG_LEAF : PAGE_BTR_SEG_TOP);

	fseg_free_page(owner_seg, buf_frame_get_space_id(page), buf_frame_get_page_no(page), mtr);
}
/*释放page空间*/
void btr_page_free(dict_index_t* tree, page_t* page, mtr_t* mtr)
{
ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX));
level = btr_page_get_level(page, mtr);
btr_page_free_low(tree, page, level, mtr);
}
/* Overwrites the child page number stored in a node pointer record.
The number is kept in the last field of the record, which must be 4 bytes
long; the write goes through mlog_write_ulint() with mtr.  The record must
live on a non-leaf page (level > 0, debug assertion). */
UNIV_INLINE void btr_node_ptr_set_child_page_no(rec_t* rec, ulint page_no, mtr_t* mtr)
{
	byte*	child_field;
	ulint	field_len;

	ut_ad(btr_page_get_level(buf_frame_align(rec), mtr) > 0);

	/* The child page number is the last field of the record. */
	child_field = rec_get_nth_field(rec, rec_get_n_fields(rec) - 1, &field_len);
	ut_ad(field_len == 4);

	mlog_write_ulint(child_field, page_no, MLOG_4BYTES, mtr);
}
/* Returns the child page that a node pointer record refers to.  The child is
looked up in the same tablespace as the page holding node_ptr and is fetched
with an X-latch request registered in mtr. */
static page_t* btr_node_ptr_get_child(rec_t* node_ptr, mtr_t* mtr)
{
	ulint	space_id = buf_frame_get_space_id(node_ptr);
	ulint	child_page_no = btr_node_ptr_get_child_page_no(node_ptr);

	return btr_page_get(space_id, child_page_no, RW_X_LATCH, mtr);
}
/* Returns the node pointer record on the parent level that points to page.
  tree      tree that owns the page; its lock must be held in X mode in mtr
  page      the child page; must not be the root page (debug assertion)
  user_rec  a user record on page (neither infimum nor supremum) whose key
            is used to locate the parent entry
  mtr       mini-transaction
A node pointer tuple is built from user_rec and searched for on level
(page level + 1) with PAGE_CUR_LE.  If the record found does not point to
page, diagnostics are printed to stderr and the final ut_a fails. */
static rec_t* btr_page_get_father_for_rec(dict_tree_t* tree, page_t* page, rec_t* user_rec, mtr_t* mtr)
{
	mem_heap_t*	heap;
	dtuple_t*	tuple;
	btr_cur_t	cursor;
	rec_t*		node_ptr;

	ut_ad(mtr_memo_contains(mtr, dict_tree_get_lock(tree), MTR_MEMO_X_LOCK));
	ut_a(user_rec != page_get_supremum_rec(page));
	ut_a(user_rec != page_get_infimum_rec(page));
	ut_ad(dict_tree_get_page(tree) != buf_frame_get_page_no(page));

	heap = mem_heap_create(100);

	/* Build a search tuple from user_rec; the child page number passed
	here is 0 because only the position found by the search is used. */
	tuple = dict_tree_build_node_ptr(tree, user_rec, 0, heap, btr_page_get_level(page, mtr));

	btr_cur_search_to_nth_level(UT_LIST_GET_FIRST(tree->tree_indexes), btr_page_get_level(page, mtr) + 1,
				    tuple, PAGE_CUR_LE, BTR_CONT_MODIFY_TREE,
				    &cursor, 0, mtr);

	node_ptr = btr_cur_get_rec(&cursor);

	/* The parent entry must point back at page; otherwise dump both
	pages and the offending records before the assertion below fires. */
	if (btr_node_ptr_get_child_page_no(node_ptr) != buf_frame_get_page_no(page)) {
		fprintf(stderr, "InnoDB: Dump of the child page:\n");
		buf_page_print(buf_frame_align(page));
		fprintf(stderr,"InnoDB: Dump of the parent page:\n");
		buf_page_print(buf_frame_align(node_ptr));
		fprintf(stderr,
			"InnoDB: Corruption of an index tree: table %s, index %s,\n"
			"InnoDB: father ptr page no %lu, child page no %lu\n",
			(UT_LIST_GET_FIRST(tree->tree_indexes))->table_name,
			(UT_LIST_GET_FIRST(tree->tree_indexes))->name,
			btr_node_ptr_get_child_page_no(node_ptr),
			buf_frame_get_page_no(page));
		page_rec_print(page_rec_get_next(page_get_infimum_rec(page)));
		page_rec_print(node_ptr);
		fprintf(stderr,
			"InnoDB: You should dump + drop + reimport the table to fix the\n"
			"InnoDB: corruption. If the crash happens at the database startup, see\n"
			"InnoDB: section 6.1 of http://www.innodb.com/ibman.html about forcing\n"
			"InnoDB: recovery. Then dump + drop + reimport.\n");
	}

	ut_a(btr_node_ptr_get_child_page_no(node_ptr) == buf_frame_get_page_no(page));

	mem_heap_free(heap);

	return node_ptr;
}
/* Returns the node pointer record on the parent level that points to page.
The record following the page's infimum is used as the key for the lookup
done by btr_page_get_father_for_rec(). */
static rec_t* btr_page_get_father_node_ptr(dict_tree_t* tree, page_t* page, mtr_t* mtr)
{
	rec_t*	first_rec = page_rec_get_next(page_get_infimum_rec(page));

	return btr_page_get_father_for_rec(tree, page, first_rec, mtr);
}
/* Creates the root page of a new B-tree and returns its page number, or
FIL_NULL if the frame for the root could not be obtained.
  type      tree type flags; DICT_IBUF selects the insert buffer layout
  space     tablespace id
  index_id  id written into the root page
  mtr       mini-transaction
For an insert buffer tree a segment is created in the ibuf header page, the
root page is allocated from it and the root's free list is initialized.
For any other tree the root is the page returned by creating the non-leaf
segment (PAGE_BTR_SEG_TOP), and a second segment for leaf pages is created
with its header in the same page.  The root is then set up as an empty
level-0 page with no siblings. */
ulint btr_create(ulint type, ulint space, dulint index_id, mtr_t* mtr)
{
	buf_frame_t*	ibuf_hdr_frame;
	buf_frame_t*	root_frame;
	page_t*		root_page;
	ulint		root_page_no;
	ulint		is_ibuf_tree;

	is_ibuf_tree = type & DICT_IBUF;

	if (is_ibuf_tree) {
		/* Create the segment whose header lives in the ibuf header page. */
		ibuf_hdr_frame = fseg_create(space, 0, IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr);
		buf_page_dbg_add_level(ibuf_hdr_frame, SYNC_TREE_NODE_NEW);
		ut_ad(buf_frame_get_page_no(ibuf_hdr_frame) == IBUF_HEADER_PAGE_NO);

		/* Allocate the root page from that segment. */
		root_page_no = fseg_alloc_free_page(ibuf_hdr_frame + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
						    IBUF_TREE_ROOT_PAGE_NO, FSP_UP, mtr);
		root_frame = buf_page_get(space, root_page_no, RW_X_LATCH, mtr);
	} else {
		/* Create the non-leaf segment; the frame it returns becomes
		the root and carries the header at PAGE_BTR_SEG_TOP. */
		root_frame = fseg_create(space, 0, PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
	}

	if (root_frame == NULL) {
		return FIL_NULL;
	}

	root_page_no = buf_frame_get_page_no(root_frame);
	buf_page_dbg_add_level(root_frame, SYNC_TREE_NODE_NEW);

	if (is_ibuf_tree) {
		ut_ad(root_page_no == IBUF_TREE_ROOT_PAGE_NO);

		/* Empty free list that will hold pages released by the tree. */
		flst_init(root_frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
	} else {
		/* Segment for leaf pages, header stored in the root page. */
		fseg_create(space, root_page_no, PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr);
		buf_page_dbg_add_level(root_frame, SYNC_TREE_NODE_NEW);
	}

	/* Build the index page structure on the root frame. */
	root_page = page_create(root_frame, mtr);

	btr_page_set_index_id(root_page, index_id, mtr);
	btr_page_set_level(root_page, 0, mtr);

	/* A fresh root has no siblings. */
	btr_page_set_next(root_page, FIL_NULL, mtr);
	btr_page_set_prev(root_page, FIL_NULL, mtr);

	/* NOTE(review): the purpose of this reset is not evident from this
	file; the original author also left it as an open question. */
	ibuf_reset_free_bits_with_type(type, root_page);

	ut_ad(page_get_max_insert_size(root_page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE);

	return root_page_no;
}
/* Frees the pages of a B-tree except for the root page.
The leaf segment is released step by step with fseg_free_step(), then the
non-leaf segment with fseg_free_step_not_header().  Every step runs in its
own mini-transaction, which X-latches the root and is committed before the
next step; each loop repeats until its step function reports completion. */
void btr_free_but_not_root(ulint space, ulint root_page_no)
{
	mtr_t	mtr;
	page_t*	root_page;
	ibool	done;

	/* Phase 1: the leaf segment. */
	do {
		mtr_start(&mtr);

		root_page = btr_page_get(space, root_page_no, RW_X_LATCH, &mtr);
		done = fseg_free_step(root_page + PAGE_HEADER + PAGE_BTR_SEG_LEAF, &mtr);

		mtr_commit(&mtr);
	} while (!done);

	/* Phase 2: the non-leaf segment, using the step variant that
	(by its name) leaves the segment header page alone. */
	do {
		mtr_start(&mtr);

		root_page = btr_page_get(space, root_page_no, RW_X_LATCH, &mtr);
		done = fseg_free_step_not_header(root_page + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr);

		mtr_commit(&mtr);
	} while (!done);
}
/* Frees the root page of a B-tree.
The root is X-latched in mtr, the hash index entries that refer to the page
are dropped, and the non-leaf segment (PAGE_BTR_SEG_TOP) is released by
calling fseg_free_step() until it reports completion.  All steps run in the
caller's mtr. */
void btr_free_root(ulint space, ulint root_page_no, mtr_t* mtr)
{
	ibool	finished;
	page_t*	root;

	root = btr_page_get(space, root_page_no, RW_X_LATCH, mtr);

	/* Same page-level hash index drop that the other functions in this
	file use before a page is recreated or emptied. */
	btr_search_drop_page_hash_index(root);

	do {
		finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
	} while (!finished);
}
/* Rebuilds an index page in place.
  recovery  TRUE when called from btr_parse_page_reorganize(); the hash
            index drop and the lock move are then skipped
  page      page to rebuild; must be X-latched in mtr
  mtr       mini-transaction
The page contents are copied to a temporary frame, the page is recreated
empty and the records are copied back from the temporary frame.  Only the
initial MLOG_PAGE_REORGANIZE record is written with the caller's log mode;
the remaining work runs with the mtr log mode set to MTR_LOG_NONE. */
static void btr_page_reorganize_low(ibool recovery, page_t* page, mtr_t* mtr)
{
page_t* new_page;
ulint log_mode;
ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX));
/* Write the log record that stands for the whole reorganization. */
mlog_write_initial_log_record(page, MLOG_PAGE_REORGANIZE, mtr);
/* Switch mtr logging off; the previous mode is kept so that it can be restored below. */
log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
/* Copy the current page into a temporary frame. */
new_page = buf_frame_alloc();
buf_frame_copy(new_page, page);
/* Outside of recovery, drop the hash index entries that refer to the page. */
if(!recovery)
btr_search_drop_page_hash_index(page);
/* Recreate the page structure on the original frame. */
page_create(page, mtr);
/* Copy the records, starting from the infimum of the temporary copy, back onto the recreated page (record by record, unlike the raw buf_frame_copy above). */
page_copy_rec_list_end_no_locks(page, new_page, page_get_infimum_rec(new_page), mtr);
/* Carry the max trx id over from the copy. */
page_set_max_trx_id(page, page_get_max_trx_id(new_page));
/* Outside of recovery, let the lock module move its data for the reorganized page. */
if(!recovery)
lock_move_reorganize_page(page, new_page);
/* Release the temporary frame. */
buf_frame_free(new_page);
/* Restore the log mode saved above. */
mtr_set_log_mode(mtr, log_mode);
}
/* Reorganizes an index page during normal operation, i.e. with the
recovery flag of btr_page_reorganize_low() set to FALSE. */
void btr_page_reorganize(page_t* page, mtr_t* mtr)
{
	const ibool	in_recovery = FALSE;

	btr_page_reorganize_low(in_recovery, page, mtr);
}
/* Log-parsing entry point for a page reorganize record.
If a page is supplied it is reorganized with the recovery flag set to TRUE.
ptr is returned unchanged; end_ptr is only checked to be non-NULL by the
debug assertion. */
byte* btr_parse_page_reorganize(byte* ptr, byte* end_ptr, page_t* page, mtr_t* mtr)
{
	ut_ad(ptr && end_ptr);

	if (page != NULL) {
		btr_page_reorganize_low(TRUE, page, mtr);
	}

	return ptr;
}
/* Empties an index page: the hash index entries that refer to the page are
dropped and the page structure is recreated with page_create().  The caller
must hold an X-latch on the page in mtr. */
static void btr_page_empty(page_t* page, mtr_t* mtr)
{
	ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX));

	/* Drop the hash index entries before the records disappear. */
	btr_search_drop_page_hash_index(page);

	page_create(page, mtr);
}
/* Makes the tree one level higher and inserts tuple.
All records of the root are moved to a newly allocated page on the root's
old level, the root's level is increased by one, and the root is left with a
single node pointer to the new page.  The cursor is then repositioned on the
new page and the insert is completed by btr_page_split_and_insert(), whose
result is returned.
  cursor  cursor positioned on the root page; the tree lock and the root
          page must be X-latched in mtr (debug assertions)
  tuple   tuple to insert
  mtr     mini-transaction
NOTE(review): the result of btr_page_alloc() is used without a NULL check;
callers are presumably expected to have reserved space beforehand. */
rec_t* btr_root_raise_and_insert(btr_cur_t* cursor, dtuple_t* tuple, mtr_t* mtr)
{
	dict_tree_t*	tree;
	page_t*		root;
	page_t*		new_page;
	ulint		new_page_no;
	rec_t*		rec;
	mem_heap_t*	heap;
	dtuple_t*	node_ptr;
	ulint		level;
	rec_t*		node_ptr_rec;
	page_cur_t*	page_cursor;

	root = btr_cur_get_page(cursor);
	tree = btr_cur_get_tree(cursor);

	ut_ad(dict_tree_get_page(tree) == buf_frame_get_page_no(root));
	ut_ad(mtr_memo_contains(mtr, dict_tree_get_lock(tree), MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, buf_block_align(root), MTR_MEMO_PAGE_X_FIX));

	btr_search_drop_page_hash_index(root);

	level = btr_page_get_level(root, mtr);

	/* Allocate the page that takes over the root's records. */
	new_page = btr_page_alloc(tree, 0, FSP_NO_DIR, level, mtr);
	btr_page_create(new_page, tree, mtr);

	/* The new page takes the root's old level; the root moves up. */
	btr_page_set_level(new_page, level, mtr);
	btr_page_set_level(root, level + 1, mtr);

	/* The new page is the only page on its level, so it has no
	siblings.  The links must be set explicitly: the split below reads
	them through btr_attach_half_pages(). */
	btr_page_set_next(new_page, FIL_NULL, mtr);
	btr_page_set_prev(new_page, FIL_NULL, mtr);

	/* Move every record of the root to the new page. */
	page_move_rec_list_end(new_page, root, page_get_infimum_rec(root), mtr);

	/* Let the lock module follow the move. */
	lock_update_root_raise(new_page, root);

	heap = mem_heap_create(100);

	/* Build a node pointer from the first record of the new page. */
	rec = page_rec_get_next(page_get_infimum_rec(new_page));
	new_page_no = buf_frame_get_page_no(new_page);
	node_ptr = dict_tree_build_node_ptr(tree, rec, new_page_no, heap, level);

	/* Rebuild the (now empty) root page before inserting into it. */
	btr_page_reorganize(root, mtr);

	page_cursor = btr_cur_get_page_cur(cursor);

	/* Insert the node pointer as the first record of the root. */
	page_cur_set_before_first(root, page_cursor);
	node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr, mtr);
	ut_ad(node_ptr_rec);

	btr_set_min_rec_mark(node_ptr_rec, mtr);

	mem_heap_free(heap);

	ibuf_reset_free_bits(UT_LIST_GET_FIRST(tree->tree_indexes), new_page);

	/* Reposition the cursor on the new page at the place for tuple. */
	page_cur_search(new_page, tuple, PAGE_CUR_LE, page_cursor);

	/* Split the new page and insert tuple. */
	return btr_page_split_and_insert(cursor, tuple, mtr);
}
/* Decides whether the page under the cursor should be split "to the left"
and, if so, where.
The answer is TRUE only when all of the following hold:
  - the page's last insert is the record right after the cursor record,
  - the page's recorded insert direction is PAGE_LEFT,
  - the direction counter has reached BTR_PAGE_SEQ_INSERT_LIMIT, or the
    counter plus one is at least the number of records on the page.
On TRUE, *split_rec is set to the cursor record, unless that record is the
infimum or the record right after it, in which case the successor of the
cursor record is used.  On FALSE, *split_rec is left untouched. */
ibool btr_page_get_split_rec_to_left(btr_cur_t* cursor, rec_t** split_rec)
{
	page_t*	page = btr_cur_get_page(cursor);
	rec_t*	insert_point = btr_cur_get_rec(cursor);
	rec_t*	infimum;

	if (page_header_get_ptr(page, PAGE_LAST_INSERT) != page_rec_get_next(insert_point)) {
		return FALSE;
	}

	if (page_header_get_field(page, PAGE_DIRECTION) != PAGE_LEFT) {
		return FALSE;
	}

	if (page_header_get_field(page, PAGE_N_DIRECTION) < BTR_PAGE_SEQ_INSERT_LIMIT
	    && page_header_get_field(page, PAGE_N_DIRECTION) + 1 < page_get_n_recs(page)) {
		return FALSE;
	}

	infimum = page_get_infimum_rec(page);

	if (infimum == insert_point || page_rec_get_next(infimum) == insert_point) {
		/* Too close to the start of the page: split one record later. */
		*split_rec = page_rec_get_next(insert_point);
	} else {
		*split_rec = insert_point;
	}

	return TRUE;
}
/* Decides whether the page under the cursor should be split "to the right"
and, if so, where.
The answer is TRUE only when all of the following hold:
  - the page's last insert is the cursor record itself,
  - the page's recorded insert direction is PAGE_RIGHT,
  - the direction counter has reached BTR_PAGE_SEQ_INSERT_LIMIT, or the
    counter plus one is at least the number of records on the page.
On TRUE, *split_rec is the third record after the cursor record when none of
the three records after it is the supremum; otherwise it is NULL.  On FALSE,
*split_rec is left untouched. */
ibool btr_page_get_split_rec_to_right(btr_cur_t* cursor, rec_t** split_rec)
{
	page_t*	page = btr_cur_get_page(cursor);
	rec_t*	insert_point = btr_cur_get_rec(cursor);
	rec_t*	supremum;
	rec_t*	probe;
	ulint	steps;

	if (page_header_get_ptr(page, PAGE_LAST_INSERT) != insert_point) {
		return FALSE;
	}

	if (page_header_get_field(page, PAGE_DIRECTION) != PAGE_RIGHT) {
		return FALSE;
	}

	if (page_header_get_field(page, PAGE_N_DIRECTION) < BTR_PAGE_SEQ_INSERT_LIMIT
	    && page_header_get_field(page, PAGE_N_DIRECTION) + 1 < page_get_n_recs(page)) {
		return FALSE;
	}

	supremum = page_get_supremum_rec(page);

	/* Walk up to three records forward, stopping early at the supremum. */
	probe = insert_point;

	for (steps = 0; steps < 3; steps++) {
		probe = page_rec_get_next(probe);

		if (probe == supremum) {
			break;
		}
	}

	if (probe != supremum) {
		/* At least three records follow the insert point. */
		*split_rec = probe;
	} else {
		*split_rec = NULL;
	}

	return TRUE;
}
/* Chooses a split record for the page under the cursor by walking the
records in order, with tuple counted at its insert position, and stopping
once the accumulated space reaches half of the total.
  cursor  cursor positioned on the record after which tuple is inserted
  tuple   tuple that is about to be inserted
Returns the record chosen as split point.  NULL stands for tuple itself; the
caller in this file treats a NULL split record as "tuple becomes the first
record of the upper half".
Space is measured as record data plus the page directory space reported by
page_dir_calc_reserved_space() for the given number of records. */
static rec_t* btr_page_get_sure_split_rec(btr_cur_t* cursor, dtuple_t* tuple)
{
	page_t*	page;
	ulint	insert_size;
	ulint	free_space;
	ulint	total_data;
	ulint	total_n_recs;
	ulint	total_space;
	ulint	incl_data;
	ulint	incl_space;
	rec_t*	ins_rec;
	rec_t*	rec;
	rec_t*	next_rec;
	ulint	n;

	page = btr_cur_get_page(cursor);

	insert_size = rec_get_converted_size(tuple);
	free_space = page_get_free_space_of_empty();

	/* Totals as if tuple were already on the page. */
	total_data = page_get_data_size(page) + insert_size;
	total_n_recs = page_get_n_recs(page) + 1;
	ut_ad(total_n_recs >= 2);
	total_space = total_data + page_dir_calc_reserved_space(total_n_recs);

	n = 0;
	incl_data = 0;
	ins_rec = btr_cur_get_rec(cursor);
	rec = page_get_infimum_rec(page);

	/* Include records one by one until half of total_space is covered.
	The accounting and the decision must happen on every iteration;
	a loop that only advanced rec would never terminate. */
	for (;;) {
		/* Pick the next record to include; NULL denotes tuple. */
		if (rec == ins_rec) {
			rec = NULL;
		} else if (rec == NULL) {
			rec = page_rec_get_next(ins_rec);
		} else {
			rec = page_rec_get_next(rec);
		}

		if (rec == NULL) {
			incl_data += insert_size;
		} else {
			incl_data += rec_get_size(rec);
		}

		n++;

		incl_space = incl_data + page_dir_calc_reserved_space(n);

		if (incl_space >= total_space / 2) {

			if (incl_space <= free_space) {
				/* The included records still fit on one
				page, so the split can start at the record
				after the current one, unless that is the
				supremum. */
				if (rec == ins_rec) {
					next_rec = NULL;
				} else if (rec == NULL) {
					next_rec = page_rec_get_next(ins_rec);
				} else {
					next_rec = page_rec_get_next(rec);
				}

				if (next_rec != page_get_supremum_rec(page)) {
					return next_rec;
				}
			}

			return rec;
		}
	}
}
/* Checks whether tuple is certain to fit once the page under the cursor has
been split at split_rec.
A range of records [rec, end_rec) is chosen:
  - split_rec == NULL: from the first user record up to and including the
    cursor record,
  - tuple >= split_rec: from the first user record up to split_rec,
  - otherwise: from split_rec up to the supremum.
(Presumably these are the records that end up on the other half from tuple
-- confirm against the split routine.)  Starting from the page's data size
plus tuple, the sizes of those records are subtracted one at a time, and
TRUE is returned as soon as the remainder, including page directory space,
fits into the free space of an empty page.  Returns FALSE if that never
happens. */
static ibool btr_page_insert_fits(btr_cur_t* cursor, rec_t* split_rec, dtuple_t* tuple)
{
	page_t*	page = btr_cur_get_page(cursor);
	ulint	tuple_size = rec_get_converted_size(tuple);
	ulint	empty_page_space = page_get_free_space_of_empty();
	ulint	remaining_data = page_get_data_size(page) + tuple_size;
	ulint	remaining_recs = page_get_n_recs(page) + 1;
	rec_t*	rec;
	rec_t*	end_rec;

	if (split_rec == NULL) {
		rec = page_rec_get_next(page_get_infimum_rec(page));
		end_rec = page_rec_get_next(btr_cur_get_rec(cursor));
	} else if (cmp_dtuple_rec(tuple, split_rec) >= 0) {
		rec = page_rec_get_next(page_get_infimum_rec(page));
		end_rec = split_rec;
	} else {
		rec = split_rec;
		end_rec = page_get_supremum_rec(page);
	}

	/* Everything fits even without removing a single record. */
	if (remaining_data + page_dir_calc_reserved_space(remaining_recs) <= empty_page_space) {
		return TRUE;
	}

	for (; rec != end_rec; rec = page_rec_get_next(rec)) {
		/* Account for rec leaving this half. */
		remaining_data -= rec_get_size(rec);
		remaining_recs--;

		if (remaining_data + page_dir_calc_reserved_space(remaining_recs) <= empty_page_space) {
			return TRUE;
		}
	}

	return FALSE;
}
/* Inserts a tuple on a non-leaf level of the tree.
  tree   index tree
  level  level to insert on; must be > 0 (debug assertion)
  tuple  tuple to insert
  mtr    mini-transaction
The position is found with a PAGE_CUR_LE search on the given level and the
tuple is inserted with btr_cur_pessimistic_insert(), with the no-locking,
keep-sys and no-undo-log flags set.  Anything but DB_SUCCESS trips ut_a. */
void btr_insert_on_non_leaf_level(dict_tree_t* tree, ulint level, dtuple_t* tuple, mtr_t* mtr)
{
	btr_cur_t	cursor;
	big_rec_t*	unused_big_rec;
	rec_t*		inserted_rec;
	ulint		insert_flags;
	ulint		err;

	ut_ad(level > 0);

	btr_cur_search_to_nth_level(UT_LIST_GET_FIRST(tree->tree_indexes), level, tuple,
				    PAGE_CUR_LE, BTR_CONT_MODIFY_TREE, &cursor, 0, mtr);

	insert_flags = BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG | BTR_NO_UNDO_LOG_FLAG;

	err = btr_cur_pessimistic_insert(insert_flags, &cursor, tuple,
					 &inserted_rec, &unused_big_rec, NULL, mtr);

	ut_a(err == DB_SUCCESS);
}
/* Links the two halves of a split page into the tree.
  tree       index tree
  page       the page being split; X-latched in mtr
  split_rec  first record of the upper half; used as the key of the node
             pointer that is inserted on the parent level
  new_page   the newly allocated half; X-latched in mtr
  direction  FSP_DOWN: new_page becomes the lower half and the existing
             parent node pointer of page is redirected to it; any other
             value: page stays the lower half and new_page is the upper
  mtr        mini-transaction
A node pointer for the upper half is inserted on level + 1, then the
prev/next links of the neighbouring pages and of both halves are updated,
and both halves get the level of page. */
static void btr_attach_half_pages(dict_tree_t* tree, page_t* page, rec_t* split_rec, page_t* new_page, ulint direction, mtr_t* mtr)
{
	ulint		space;
	rec_t*		node_ptr;
	page_t*		prev_page;
	page_t*		next_page;
	ulint		prev_page_no;
	ulint		next_page_no;
	ulint		level;
	page_t*		lower_page;
	page_t*		upper_page;
	ulint		lower_page_no;
	ulint		upper_page_no;
	dtuple_t*	node_ptr_upper;
	mem_heap_t*	heap;

	ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr_memo_contains(mtr, buf_block_align(new_page), MTR_MEMO_PAGE_X_FIX));

	/* Decide which half is the lower and which the upper one. */
	if (direction == FSP_DOWN) {
		lower_page_no = buf_frame_get_page_no(new_page);
		upper_page_no = buf_frame_get_page_no(page);
		lower_page = new_page;
		upper_page = page;

		/* The parent's existing pointer to page now has to lead to
		the new lower half; a pointer to the upper half (page) is
		inserted below. */
		node_ptr = btr_page_get_father_node_ptr(tree, page, mtr);
		btr_node_ptr_set_child_page_no(node_ptr, lower_page_no, mtr);
	} else {
		/* The parent's existing pointer to page stays as it is. */
		lower_page_no = buf_frame_get_page_no(page);
		upper_page_no = buf_frame_get_page_no(new_page);
		lower_page = page;
		upper_page = new_page;
	}

	heap = mem_heap_create(100);

	level = btr_page_get_level(page, mtr);

	/* Build the node pointer for the upper half and insert it on the
	parent level. */
	node_ptr_upper = dict_tree_build_node_ptr(tree, split_rec, upper_page_no, heap, level);

	btr_insert_on_non_leaf_level(tree, level + 1, node_ptr_upper, mtr);

	/* The tuple has been inserted; release the heap that backed it. */
	mem_heap_free(heap);

	/* Sibling links of page as they were before the split. */
	prev_page_no = btr_page_get_prev(page, mtr);
	next_page_no = btr_page_get_next(page, mtr);
	space = buf_frame_get_space_id(page);

	/* The left neighbour now points to the lower half. */
	if (prev_page_no != FIL_NULL) {
		prev_page = btr_page_get(space, prev_page_no, RW_X_LATCH, mtr);
		btr_page_set_next(prev_page, lower_page_no, mtr);
	}

	/* The right neighbour now points back to the upper half. */
	if (next_page_no != FIL_NULL) {
		next_page = btr_page_get(space, next_page_no, RW_X_LATCH, mtr);
		btr_page_set_prev(next_page, upper_page_no, mtr);
	}

	btr_page_set_prev(lower_page, prev_page_no, mtr);
	btr_page_set_next(lower_page, upper_page_no, mtr);
	btr_page_set_level(lower_page, level, mtr);

	btr_page_set_prev(upper_page, lower_page_no, mtr);
	btr_page_set_next(upper_page, next_page_no, mtr);
	btr_page_set_level(upper_page, level, mtr);
}
rec_t* btr_page_split_and_insert(btr_cur_t* cursor, dtuple_t* tuple, mtr_t* mtr)
{
dict_tree_t* tree;
page_t* page;
ulint page_no;
byte direction;
ulint hint_page_no;
page_t* new_page;
rec_t* split_rec;
page_t* left_page;
page_t* right_page;
page_t* insert_page;
page_cur_t* page_cursor;
rec_t* first_rec;
byte* buf;
rec_t* move_limit;
ibool insert_will_fit;
ulint n_iterations = 0;
rec_t* rec;
func_start:
tree = btr_cur_get_tree(cursor);
ut_ad(mtr_memo_contains(mtr, dict_tree_get_lock(tree), MTR_MEMO_X_LOCK));
ut_ad(rw_lock_own(dict_tree_get_lock(tree), RW_LOCK_EX));
page = btr_cur_get_page(cursor);
ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX));
ut_ad(page_get_n_recs(page) >= 2);
page_no = buf_frame_get_page_no(page);
/*如果split_rec == NULL,意味着tuple是插入到upper page的第一条记录上,half-page*/
if(n_iterations > 0){
direction = FSP_UP;
hint_page_no = page_no + 1;
/*确定分裂的位置*/
split_rec = btr_page_get_sure_split_rec(cursor, tuple);
}
else if(btr_page_get_split_rec_to_right(cursor, &split_rec)){ /*向右聚集分裂*/
direction = FSP_UP;
hint_page_no = page_no + 1;
}
else if(btr_page_get_split_rec_to_left(cursor, &split_rec)){ /*向左聚集分裂*/
direction = FSP_DOWN;
hint_page_no = page_no - 1;
}
else{
direction = FSP_UP;
hint_page_no = page_no + 1; /*表空间申请页的起始页号,作为分配页的一个预测依据*/
/*从中间分裂,预测中间记录作为分裂点*/
split_rec = page_get_middle_rec(page);
}
/*分配一个新page空间并初始化page*/
new_page = btr_page_alloc(tree, hint_page_no, direction, btr_page_get_level(page, mtr), mtr);
btr_page_create(new_page, tree, mtr);
/*确定高位page的第一条记录*/
if(split_rec != NULL){
first_rec = split_rec;
move_limit = split_rec;
}
else{
buf = mem_alloc(rec_get_converted_size(tuple));
/*将tuple转化成upper page的第一条记录*/
first_rec = rec_convert_dtuple_to_rec(buf, tuple);
move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
}
/*进行page关联关系的修改*/
btr_attach_half_pages(tree, page, first_rec, new_page, direction, mtr);
if(split_rec == NULL)
mem_free(buf);
/*再次确定是否可以在split rec上插入tuple触发分裂,确定插入是否适合*/
insert_will_fit = btr_page_insert_fits(cursor, split_rec, tuple);
if(insert_will_fit && (btr_page_get_level(page, mtr) == 0)){ /*leaf page层*/
mtr_memo_release(mtr, dict_tree_get_lock(tree), MTR_MEMO_X_LOCK);
}
/*记录和行锁的转移*/
if(direction == FSP_DOWN){ /*记录转移,向左聚集*/
page_move_rec_list_start(new_page, page, move_limit, mtr);
left_page = new_page;
right_page = page;
/*事务锁的继承和转移*/
lock_update_split_left(right_page, left_page);
}
else{ /*向右聚集分裂*/
page_move_rec_list_end(new_page, page, move_limit, mtr);
left_page = page;
right_page = new_page;
lock_update_split_right(right_page, left_page);
}
/*确定新记录插入的页,因为页分裂了*/
if(split_rec == NULL)
insert_page = right_page;
else if(cmp_tuple_rec(tuple, first_rec) >= 0)
insert_page = right_page;
else
insert_page = left_page;
/*进行tuple插入*/
page_cursor = btr_cur_get_page_cur(cursor);
page_cur_search(insert_page, tuple, PAGE_CUR_LE, page_cursor);
rec = page_cur_tuple_insert(page_cursor, tuple, mtr);
if(rec != NULL){
ibuf_update_free_bits_for_two_pages_low(cursor->index, left_page, right_page, mtr);
return rec;
}
/*如果tuple插入是不适合的,进行reorganization*/
btr_page_reorganize(insert_page, mtr);
page_cur_search(insert_page, tuple, PAGE_CUR_LE, page_cursor);
rec = page_cur_tuple_insert(page_cursor, tuple, mtr);
if(rec == NULL){ /*重组后再次尝试插入,还是不适合,可能要进行再次分裂*/
ibuf_reset_free_bits(cursor->index, new_page);
n_iterations++;
ut_ad(n_iterations < 2);
ut_ad(!insert_will_fit);