Skip to content

Commit fd6e8c1

Browse files
amotinbehlendorf
authored andcommitted
BRT: Rework structures and locks to be per-vdev
While block cloning operation from the beginning was made per-vdev, before this change most of its data were protected by two pool- wide locks. It created lots of lock contention in many workload. This change makes most of block cloning data structures per-vdev, which allows to lock them separately. The only pool-wide lock now it spa_brt_lock, protecting array of per-vdev pointers and in most cases taken as reader. Also this splits per-vdev locks into three different ones: bv_pending_lock protects the AVL-tree of pending operations in open context, bv_mos_entries_lock protects BRT ZAP object from while being prefetched, and bv_lock protects the rest of per-vdev context during TXG commit process. There should be no functional difference aside of some optimizations. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Pawel Jakub Dawidek <pjd@FreeBSD.org> Reviewed-by: Brian Atkinson <batkinson@lanl.gov> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes openzfs#16740
1 parent 309ce63 commit fd6e8c1

File tree

6 files changed

+400
-532
lines changed

6 files changed

+400
-532
lines changed

cmd/zdb/zdb.c

Lines changed: 13 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2119,9 +2119,6 @@ dump_brt(spa_t *spa)
21192119
return;
21202120
}
21212121

2122-
brt_t *brt = spa->spa_brt;
2123-
VERIFY(brt);
2124-
21252122
char count[32], used[32], saved[32];
21262123
zdb_nicebytes(brt_get_used(spa), used, sizeof (used));
21272124
zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved));
@@ -2132,11 +2129,8 @@ dump_brt(spa_t *spa)
21322129
if (dump_opt['T'] < 2)
21332130
return;
21342131

2135-
for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
2136-
brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
2137-
if (brtvd == NULL)
2138-
continue;
2139-
2132+
for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
2133+
brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
21402134
if (!brtvd->bv_initiated) {
21412135
printf("BRT: vdev %" PRIu64 ": empty\n", vdevid);
21422136
continue;
@@ -2160,20 +2154,21 @@ dump_brt(spa_t *spa)
21602154
if (!do_histo)
21612155
printf("\n%-16s %-10s\n", "DVA", "REFCNT");
21622156

2163-
for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
2164-
brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
2165-
if (brtvd == NULL || !brtvd->bv_initiated)
2157+
for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
2158+
brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
2159+
if (!brtvd->bv_initiated)
21662160
continue;
21672161

21682162
uint64_t counts[64] = {};
21692163

21702164
zap_cursor_t zc;
21712165
zap_attribute_t *za = zap_attribute_alloc();
2172-
for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries);
2166+
for (zap_cursor_init(&zc, spa->spa_meta_objset,
2167+
brtvd->bv_mos_entries);
21732168
zap_cursor_retrieve(&zc, za) == 0;
21742169
zap_cursor_advance(&zc)) {
21752170
uint64_t refcnt;
2176-
VERIFY0(zap_lookup_uint64(brt->brt_mos,
2171+
VERIFY0(zap_lookup_uint64(spa->spa_meta_objset,
21772172
brtvd->bv_mos_entries,
21782173
(const uint64_t *)za->za_name, 1,
21792174
za->za_integer_length, za->za_num_integers,
@@ -8227,14 +8222,11 @@ dump_mos_leaks(spa_t *spa)
82278222
}
82288223
}
82298224

8230-
if (spa->spa_brt != NULL) {
8231-
brt_t *brt = spa->spa_brt;
8232-
for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
8233-
brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
8234-
if (brtvd != NULL && brtvd->bv_initiated) {
8235-
mos_obj_refd(brtvd->bv_mos_brtvdev);
8236-
mos_obj_refd(brtvd->bv_mos_entries);
8237-
}
8225+
for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
8226+
brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
8227+
if (brtvd->bv_initiated) {
8228+
mos_obj_refd(brtvd->bv_mos_brtvdev);
8229+
mos_obj_refd(brtvd->bv_mos_entries);
82388230
}
82398231
}
82408232

include/sys/brt_impl.h

Lines changed: 43 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -86,28 +86,38 @@ typedef struct brt_vdev_phys {
8686
uint64_t bvp_savedspace;
8787
} brt_vdev_phys_t;
8888

89-
typedef struct brt_vdev {
89+
struct brt_vdev {
9090
/*
91-
* VDEV id.
91+
* Pending changes from open contexts.
9292
*/
93-
uint64_t bv_vdevid;
93+
kmutex_t bv_pending_lock;
94+
avl_tree_t bv_pending_tree[TXG_SIZE];
9495
/*
95-
* Is the structure initiated?
96-
* (bv_entcount and bv_bitmap are allocated?)
96+
* Protects bv_mos_*.
9797
*/
98-
boolean_t bv_initiated;
98+
krwlock_t bv_mos_entries_lock ____cacheline_aligned;
99+
/*
100+
* Protects all the fields starting from bv_initiated.
101+
*/
102+
krwlock_t bv_lock ____cacheline_aligned;
103+
/*
104+
* VDEV id.
105+
*/
106+
uint64_t bv_vdevid ____cacheline_aligned;
99107
/*
100108
* Object number in the MOS for the entcount array and brt_vdev_phys.
101109
*/
102110
uint64_t bv_mos_brtvdev;
103111
/*
104-
* Object number in the MOS for the entries table.
112+
* Object number in the MOS and dnode for the entries table.
105113
*/
106114
uint64_t bv_mos_entries;
115+
dnode_t *bv_mos_entries_dnode;
107116
/*
108-
* Entries to sync.
117+
* Is the structure initiated?
118+
* (bv_entcount and bv_bitmap are allocated?)
109119
*/
110-
avl_tree_t bv_tree;
120+
boolean_t bv_initiated;
111121
/*
112122
* Does the bv_entcount[] array needs byte swapping?
113123
*/
@@ -120,6 +130,26 @@ typedef struct brt_vdev {
120130
* This is the array with BRT entry count per BRT_RANGESIZE.
121131
*/
122132
uint16_t *bv_entcount;
133+
/*
134+
* bv_entcount[] potentially can be a bit too big to sychronize it all
135+
* when we just changed few entcounts. The fields below allow us to
136+
* track updates to bv_entcount[] array since the last sync.
137+
* A single bit in the bv_bitmap represents as many entcounts as can
138+
* fit into a single BRT_BLOCKSIZE.
139+
* For example we have 65536 entcounts in the bv_entcount array
140+
* (so the whole array is 128kB). We updated bv_entcount[2] and
141+
* bv_entcount[5]. In that case only first bit in the bv_bitmap will
142+
* be set and we will write only first BRT_BLOCKSIZE out of 128kB.
143+
*/
144+
ulong_t *bv_bitmap;
145+
/*
146+
* bv_entcount[] needs updating on disk.
147+
*/
148+
boolean_t bv_entcount_dirty;
149+
/*
150+
* brt_vdev_phys needs updating on disk.
151+
*/
152+
boolean_t bv_meta_dirty;
123153
/*
124154
* Sum of all bv_entcount[]s.
125155
*/
@@ -133,45 +163,10 @@ typedef struct brt_vdev {
133163
*/
134164
uint64_t bv_savedspace;
135165
/*
136-
* brt_vdev_phys needs updating on disk.
137-
*/
138-
boolean_t bv_meta_dirty;
139-
/*
140-
* bv_entcount[] needs updating on disk.
141-
*/
142-
boolean_t bv_entcount_dirty;
143-
/*
144-
* bv_entcount[] potentially can be a bit too big to sychronize it all
145-
* when we just changed few entcounts. The fields below allow us to
146-
* track updates to bv_entcount[] array since the last sync.
147-
* A single bit in the bv_bitmap represents as many entcounts as can
148-
* fit into a single BRT_BLOCKSIZE.
149-
* For example we have 65536 entcounts in the bv_entcount array
150-
* (so the whole array is 128kB). We updated bv_entcount[2] and
151-
* bv_entcount[5]. In that case only first bit in the bv_bitmap will
152-
* be set and we will write only first BRT_BLOCKSIZE out of 128kB.
166+
* Entries to sync.
153167
*/
154-
ulong_t *bv_bitmap;
155-
uint64_t bv_nblocks;
156-
} brt_vdev_t;
157-
158-
/*
159-
* In-core brt
160-
*/
161-
typedef struct brt {
162-
krwlock_t brt_lock;
163-
spa_t *brt_spa;
164-
#define brt_mos brt_spa->spa_meta_objset
165-
uint64_t brt_rangesize;
166-
uint64_t brt_usedspace;
167-
uint64_t brt_savedspace;
168-
avl_tree_t brt_pending_tree[TXG_SIZE];
169-
kmutex_t brt_pending_lock[TXG_SIZE];
170-
/* Sum of all entries across all bv_trees. */
171-
uint64_t brt_nentries;
172-
brt_vdev_t *brt_vdevs;
173-
uint64_t brt_nvdevs;
174-
} brt_t;
168+
avl_tree_t bv_tree;
169+
};
175170

176171
/* Size of bre_offset / sizeof (uint64_t). */
177172
#define BRT_KEY_WORDS (1)
@@ -188,7 +183,7 @@ typedef struct brt_entry {
188183

189184
typedef struct brt_pending_entry {
190185
blkptr_t bpe_bp;
191-
int bpe_count;
186+
uint64_t bpe_count;
192187
avl_node_t bpe_node;
193188
} brt_pending_entry_t;
194189

include/sys/spa.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ extern "C" {
5353
/*
5454
* Forward references that lots of things need.
5555
*/
56+
typedef struct brt_vdev brt_vdev_t;
5657
typedef struct spa spa_t;
5758
typedef struct vdev vdev_t;
5859
typedef struct metaslab metaslab_t;

include/sys/spa_impl.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -412,8 +412,12 @@ struct spa {
412412
uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */
413413
uint64_t spa_dedup_checksum; /* default dedup checksum */
414414
uint64_t spa_dspace; /* dspace in normal class */
415+
uint64_t spa_rdspace; /* raw (non-dedup) --//-- */
415416
boolean_t spa_active_ddt_prune; /* ddt prune process active */
416-
struct brt *spa_brt; /* in-core BRT */
417+
brt_vdev_t **spa_brt_vdevs; /* array of per-vdev BRTs */
418+
uint64_t spa_brt_nvdevs; /* number of vdevs in BRT */
419+
uint64_t spa_brt_rangesize; /* pool's BRT range size */
420+
krwlock_t spa_brt_lock; /* Protects brt_vdevs/nvdevs */
417421
kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
418422
kmutex_t spa_proc_lock; /* protects spa_proc* */
419423
kcondvar_t spa_proc_cv; /* spa_proc_state transitions */

0 commit comments

Comments
 (0)