Skip to content
1 change: 1 addition & 0 deletions ft/cachetable/cachetable-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ struct cachefile {
// If set then fclose will not be logged in recovery log.
bool skip_log_recover_on_close;
int fd; /* Bug: If a file is opened read-only, then it is stuck in read-only. If it is opened read-write, then subsequent writers can write to it too. */
unsigned int blocksize; /* Filesystem block size for O_DIRECT operations */
CACHETABLE cachetable;
struct fileid fileid;
// the filenum is used as an identifier of the cachefile
Expand Down
13 changes: 13 additions & 0 deletions ft/cachetable/cachetable.cc
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,11 @@ toku_cachetable_reserve_filenum(CACHETABLE ct) {
return ct->cf_list.reserve_filenum();
}

// Returns the block size that fstat() reports for fd (st_blksize).
//
// Falls back to 512 when fstat() fails or when the reported block size is not
// positive, so the result is always a usable non-zero value. Callers use the
// result as a modulus and as a rounding/alignment base, where 0 would be
// invalid.
//
// fd: an open file descriptor.
// Returns: st_blksize converted to unsigned int, or 512 as a fallback.
static unsigned int fd_blocksize(int fd) {
    const unsigned int fallback_blocksize = 512;
    toku_struct_stat st;

    if (toku_os_fstat(fd, &st) != 0) {
        return fallback_blocksize;
    }
    if (st.st_blksize <= 0) {
        return fallback_blocksize;
    }
    // Convert the selected value itself (not the fstat() return code).
    return (unsigned int) st.st_blksize;
}

static void create_new_cachefile(
CACHETABLE ct,
FILENUM filenum,
Expand All @@ -387,6 +392,7 @@ static void create_new_cachefile(

newcf->filenum = filenum;
newcf->fd = fd;
newcf->blocksize = fd_blocksize(fd);
newcf->fname_in_env = toku_xstrdup(fname_in_env);
bjm_init(&newcf->bjm);
*cfptr = newcf;
Expand Down Expand Up @@ -427,6 +433,7 @@ int toku_cachetable_openfd_with_filenum (CACHEFILE *cfptr, CACHETABLE ct, int fd
// fix up the fields in the cachefile
existing_cf->filenum = filenum;
existing_cf->fd = fd;
existing_cf->blocksize = fd_blocksize(fd);
existing_cf->fname_in_env = toku_xstrdup(fname_in_env);
bjm_init(&existing_cf->bjm);

Expand Down Expand Up @@ -498,6 +505,11 @@ toku_cachefile_get_fd (CACHEFILE cf) {
return cf->fd;
}

// Accessor for the block size currently recorded on the cachefile.
// Returns the value of cf->blocksize unchanged.
unsigned int toku_cachefile_get_blocksize(CACHEFILE cf) {
    const unsigned int recorded_blocksize = cf->blocksize;
    return recorded_blocksize;
}

static void cachefile_destroy(CACHEFILE cf) {
if (cf->free_userdata) {
cf->free_userdata(cf, cf->userdata);
Expand Down Expand Up @@ -535,6 +547,7 @@ void toku_cachefile_close(CACHEFILE *cfp, bool oplsn_valid, LSN oplsn) {
int r = close(cf->fd);
assert(r == 0);
cf->fd = -1;
cf->blocksize = 512;

// destroy the parts of the cachefile
// that do not persist across opens/closes
Expand Down
3 changes: 3 additions & 0 deletions ft/cachetable/cachetable.h
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,9 @@ void toku_cachefile_close (CACHEFILE*, bool oplsn_valid, LSN oplsn);
// Grabs a read lock protecting the fd
int toku_cachefile_get_fd (CACHEFILE);

// Get the blocksize associated with the cachefile
unsigned int toku_cachefile_get_blocksize (CACHEFILE);

// Get the iname (within the environment) associated with the cachefile
// Return the filename
char * toku_cachefile_fname_in_env (CACHEFILE cf);
Expand Down
4 changes: 2 additions & 2 deletions ft/ft-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -454,8 +454,8 @@ void toku_ft_get_status(FT_STATUS);
void toku_flusher_thread_set_callback(void (*callback_f)(int, void*), void* extra);

// For upgrade
int toku_upgrade_subtree_estimates_to_stat64info(int fd, FT ft) __attribute__((nonnull));
int toku_upgrade_msn_from_root_to_header(int fd, FT ft) __attribute__((nonnull));
int toku_upgrade_subtree_estimates_to_stat64info(int fd, unsigned int block_size, FT ft) __attribute__((nonnull));
int toku_upgrade_msn_from_root_to_header(int fd, unsigned int block_size, FT ft) __attribute__((nonnull));

// A callback function is invoked with the key, and the data.
// The pointers (to the bytevecs) must not be modified. The data must be copied out before the callback function returns.
Expand Down
4 changes: 2 additions & 2 deletions ft/ft-ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -796,7 +796,7 @@ toku_ft_status_update_pivot_fetch_reason(ftnode_fetch_extra *bfe)
}
}

int toku_ftnode_fetch_callback(CACHEFILE UU(cachefile),
int toku_ftnode_fetch_callback(CACHEFILE cachefile,
PAIR p,
int fd,
BLOCKNUM blocknum,
Expand All @@ -815,7 +815,7 @@ int toku_ftnode_fetch_callback(CACHEFILE UU(cachefile),
// evaluate what piece of the the node is necessary until we get it at
// least partially into memory
int r =
toku_deserialize_ftnode_from(fd, blocknum, fullhash, node, ndd, bfe);
toku_deserialize_ftnode_from(fd, toku_cachefile_get_blocksize(cachefile), blocknum, fullhash, node, ndd, bfe);
if (r != 0) {
if (r == TOKUDB_BAD_CHECKSUM) {
fprintf(
Expand Down
2 changes: 1 addition & 1 deletion ft/ft-ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ void toku_ft_serialize_layer_destroy(void);
void toku_maybe_truncate_file (int fd, uint64_t size_used, uint64_t expected_size, uint64_t *new_size);
// Effect: truncate file if overallocated by at least 32MiB

void toku_maybe_preallocate_in_file (int fd, int64_t size, int64_t expected_size, int64_t *new_size);
void toku_maybe_preallocate_in_file (int fd, unsigned int block_size, int64_t size, int64_t expected_size, int64_t *new_size);
// Effect: make the file bigger by either doubling it or growing by 16MiB whichever is less, until it is at least size
// Return 0 on success, otherwise an error number.

Expand Down
10 changes: 5 additions & 5 deletions ft/ft.cc
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ void toku_ft_create(FT *ftp, FT_OPTIONS options, CACHEFILE cf, TOKUTXN txn) {
toku_ft_init_reflock(ft);

// Assign blocknum for root block, also dirty the header
ft->blocktable.create();
ft->blocktable.create(toku_cachefile_get_blocksize(cf));
ft->blocktable.allocate_blocknum(&ft->h->root_blocknum, ft);

ft_init(ft, options, cf);
Expand All @@ -431,9 +431,8 @@ int toku_read_ft_and_store_in_cachefile (FT_HANDLE ft_handle, CACHEFILE cf, LSN
return 0;
}

int fd = toku_cachefile_get_fd(cf);
const char *fn = toku_cachefile_fname_in_env(cf);
int r = toku_deserialize_ft_from(fd, fn, max_acceptable_lsn, &ft);
int r = toku_deserialize_ft_from(toku_cachefile_get_fd(cf), toku_cachefile_get_blocksize(cf), fn, max_acceptable_lsn, &ft);
if (r == TOKUDB_BAD_CHECKSUM) {
fprintf(stderr, "Checksum failure while reading header in file %s.\n", toku_cachefile_fname_in_env(cf));
assert(false); // make absolutely sure we crash before doing anything else
Expand Down Expand Up @@ -868,9 +867,10 @@ toku_ft_update_descriptor_with_fd(FT ft, DESCRIPTOR desc, int fd) {
// the checksum is four bytes, so that's where the magic number comes from
// make space for the new descriptor and write it out to disk
DISKOFF offset, size;
assert(fd == toku_cachefile_get_fd(ft->cf));
size = toku_serialize_descriptor_size(desc) + 4;
ft->blocktable.realloc_descriptor_on_disk(size, &offset, ft, fd);
toku_serialize_descriptor_contents_to_fd(fd, desc, offset);
toku_serialize_descriptor_contents_to_fd(fd, toku_cachefile_get_blocksize(ft->cf), desc, offset);

// cleanup the old descriptor and set the in-memory descriptor to the new one
toku_destroy_dbt(&ft->descriptor.dbt);
Expand Down Expand Up @@ -1036,7 +1036,7 @@ garbage_helper(BLOCKNUM blocknum, int64_t UU(size), int64_t UU(address), void *e
ftnode_fetch_extra bfe;
bfe.create_for_full_read(info->ft);
int fd = toku_cachefile_get_fd(info->ft->cf);
int r = toku_deserialize_ftnode_from(fd, blocknum, 0, &node, &ndd, &bfe);
int r = toku_deserialize_ftnode_from(fd, toku_cachefile_get_blocksize(info->ft->cf), blocknum, 0, &node, &ndd, &bfe);
if (r != 0) {
goto no_node;
}
Expand Down
16 changes: 9 additions & 7 deletions ft/serialize/block_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,10 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
#endif

void BlockAllocator::CreateInternal(uint64_t reserve_at_beginning,
uint64_t alignment) {
// the alignment must be at least 512 and aligned with 512 to work with
uint64_t alignment, unsigned int blocksize) {
// the alignment must be at least blocksize and aligned with blocksize to work with
// direct I/O
invariant(alignment >= 512 && (alignment % 512) == 0);
invariant(alignment >= blocksize && (alignment % blocksize) == 0);

_reserve_at_beginning = reserve_at_beginning;
_alignment = alignment;
Expand All @@ -68,8 +68,9 @@ void BlockAllocator::CreateInternal(uint64_t reserve_at_beginning,
_tree = new MhsRbTree::Tree(alignment);
}

void BlockAllocator::Create(uint64_t reserve_at_beginning, uint64_t alignment) {
CreateInternal(reserve_at_beginning, alignment);
// Creates an empty block allocator.
//
// Runs the shared initialization (which checks alignment against blocksize),
// then inserts the pair {reserve_at_beginning, MAX_BYTE} into the tree.
// NOTE(review): presumably that pair is one free range covering everything
// after the reserved header -- confirm against MhsRbTree.
void BlockAllocator::Create(uint64_t reserve_at_beginning,
                            uint64_t alignment,
                            unsigned int blocksize) {
    this->CreateInternal(reserve_at_beginning, alignment, blocksize);
    this->_tree->Insert({reserve_at_beginning, MAX_BYTE});
    VALIDATE();
}
Expand All @@ -80,9 +81,10 @@ void BlockAllocator::Destroy() {

void BlockAllocator::CreateFromBlockPairs(uint64_t reserve_at_beginning,
uint64_t alignment,
unsigned int blocksize,
struct BlockPair *translation_pairs,
uint64_t n_blocks) {
CreateInternal(reserve_at_beginning, alignment);
CreateInternal(reserve_at_beginning, alignment, blocksize);
_n_blocks = n_blocks;

struct BlockPair *XMALLOC_N(n_blocks, pairs);
Expand Down Expand Up @@ -124,7 +126,7 @@ static inline uint64_t Align(uint64_t value, uint64_t ba_alignment) {

// Effect: Allocate a block. The resulting block must be aligned on the
// ba->alignment (which to make direct_io happy must be a positive multiple of
// 512).
// blocksize).
void BlockAllocator::AllocBlock(uint64_t size,
uint64_t *offset) {
// Allocator does not support size 0 blocks. See block_allocator_free_block.
Expand Down
7 changes: 5 additions & 2 deletions ft/serialize/block_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@ class BlockAllocator {
// reserve_at_beginning (IN) Size of reserved block at beginning.
// This size does not have to be aligned.
// alignment (IN) Block alignment.
void Create(uint64_t reserve_at_beginning, uint64_t alignment);
// blocksize (IN) Filesystem block size; alignment must be a positive multiple of it.
void Create(uint64_t reserve_at_beginning, uint64_t alignment, unsigned int blocksize);

// Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING
// bytes are not put into a block.
Expand All @@ -110,8 +111,10 @@ class BlockAllocator {
// reserve_at_beginning (IN) Size of reserved block at beginning.
// This size does not have to be aligned.
// alignment (IN) Block alignment.
// blocksize (IN) Filesystem block size; alignment must be a positive multiple of it.
void CreateFromBlockPairs(uint64_t reserve_at_beginning,
uint64_t alignment,
unsigned int blocksize,
struct BlockPair *pairs,
uint64_t n_blocks);

Expand Down Expand Up @@ -173,7 +176,7 @@ class BlockAllocator {
virtual ~BlockAllocator(){};

private:
void CreateInternal(uint64_t reserve_at_beginning, uint64_t alignment);
void CreateInternal(uint64_t reserve_at_beginning, uint64_t alignment, unsigned int blocksize);

// How much to reserve at the beginning
uint64_t _reserve_at_beginning;
Expand Down
41 changes: 25 additions & 16 deletions ft/serialize/block_table.cc
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,13 @@ int block_table::create_from_buffer(

// Determine the file size
int64_t file_size = 0;
r = toku_os_get_file_size(fd, &file_size);
lazy_assert_zero(r);
int blocksize;
toku_struct_stat st;

// A single fstat() supplies both the block size and the file size.
r = toku_os_fstat(fd, &st);
lazy_assert_zero(r);
// r == 0 means fstat() succeeded: use the values it reported. Otherwise fall
// back to a 512-byte block size and a zero file size.
blocksize = (r == 0) ? (int) st.st_blksize : 512;
file_size = (r == 0) ? st.st_size : 0;
invariant(file_size >= 0);
_safe_file_size = file_size;

Expand All @@ -159,13 +164,14 @@ int block_table::create_from_buffer(
_bt_block_allocator->CreateFromBlockPairs(
BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE,
BlockAllocator::BLOCK_ALLOCATOR_ALIGNMENT,
blocksize,
pairs,
n_pairs);

return 0;
}

void block_table::create() {
void block_table::create(unsigned int blocksize) {
// Does not initialize the block allocator
_create_internal();

Expand All @@ -187,7 +193,7 @@ void block_table::create() {
// Create an empty block allocator.
_bt_block_allocator->Create(
BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE,
BlockAllocator::BLOCK_ALLOCATOR_ALIGNMENT);
BlockAllocator::BLOCK_ALLOCATOR_ALIGNMENT, blocksize);
}

// TODO: Refactor with FT-303
Expand Down Expand Up @@ -484,6 +490,7 @@ void block_table::_realloc_on_disk_internal(BLOCKNUM b,
}

void block_table::_ensure_safe_write_unlocked(int fd,
unsigned int disk_block_size,
DISKOFF block_size,
DISKOFF block_offset) {
// Requires: holding _mutex
Expand All @@ -496,7 +503,7 @@ void block_table::_ensure_safe_write_unlocked(int fd,

int64_t size_after;
toku_maybe_preallocate_in_file(
fd, size_needed, _safe_file_size, &size_after);
fd, disk_block_size, size_needed, _safe_file_size, &size_after);

_mutex_lock();
_safe_file_size = size_after;
Expand All @@ -516,7 +523,7 @@ void block_table::realloc_on_disk(BLOCKNUM b,
_verify_valid_freeable_blocknum(t, b);
_realloc_on_disk_internal(b, size, offset, ft, for_checkpoint);

_ensure_safe_write_unlocked(fd, size, *offset);
_ensure_safe_write_unlocked(fd, toku_cachefile_get_blocksize(ft->cf), size, *offset);
_mutex_unlock();
}

Expand Down Expand Up @@ -550,14 +557,16 @@ void block_table::_alloc_inprogress_translation_on_disk_unlocked() {
// Effect: Serializes the blocktable to a wbuf (which starts uninitialized)
// A clean shutdown runs checkpoint start so that current and inprogress are
// copies.
// The resulting wbuf buffer is guaranteed to be be 512-byte aligned and the
// total length is a multiple of 512 (so we pad with zeros at the end if
// The resulting wbuf buffer is guaranteed to be blocksize-byte aligned and the
// total length is a multiple of blocksize (so we pad with zeros at the end if
// needed).
// The address is guaranteed to be 512-byte aligned, but the size is not
// The address is guaranteed to be blocksize-byte aligned, but the size is not
// guaranteed.
// It *is* guaranteed that we can read up to the next 512-byte boundary,
// It *is* guaranteed that we can read up to the next blocksize-byte boundary,
// however
// blocksize equates to the blocksize of the filesystem cf is on.
void block_table::serialize_translation_to_wbuf(int fd,
unsigned int blocksize,
struct wbuf *w,
int64_t *address,
int64_t *size) {
Expand All @@ -566,11 +575,11 @@ void block_table::serialize_translation_to_wbuf(int fd,

BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION);
_alloc_inprogress_translation_on_disk_unlocked(); // The allocated block
// must be 512-byte
// must be blocksize-byte
// aligned to make
// O_DIRECT happy.
uint64_t size_translation = _calculate_size_on_disk(t);
uint64_t size_aligned = roundup_to_multiple(512, size_translation);
uint64_t size_aligned = roundup_to_multiple(blocksize, size_translation);
invariant((int64_t)size_translation == t->block_translation[b.b].size);
{
// Init wbuf
Expand All @@ -582,7 +591,7 @@ void block_table::serialize_translation_to_wbuf(int fd,
__LINE__,
size_translation,
t->block_translation[b.b].u.diskoff);
char *XMALLOC_N_ALIGNED(512, size_aligned, buf);
char *XMALLOC_N_ALIGNED(blocksize, size_aligned, buf);
for (uint64_t i = size_translation; i < size_aligned; i++)
buf[i] = 0; // fill in the end of the buffer with zeros.
wbuf_init(w, buf, size_aligned);
Expand All @@ -604,9 +613,9 @@ void block_table::serialize_translation_to_wbuf(int fd,
wbuf_int(w, checksum);
*address = t->block_translation[b.b].u.diskoff;
*size = size_translation;
invariant((*address) % 512 == 0);
invariant((*address) % blocksize == 0);

_ensure_safe_write_unlocked(fd, size_aligned, *address);
_ensure_safe_write_unlocked(fd, blocksize, size_aligned, *address);
_mutex_unlock();
}

Expand Down Expand Up @@ -1028,7 +1037,7 @@ void block_table::realloc_descriptor_on_disk(DISKOFF size,
int fd) {
_mutex_lock();
_realloc_descriptor_on_disk_unlocked(size, offset, ft);
_ensure_safe_write_unlocked(fd, size, *offset);
_ensure_safe_write_unlocked(fd, toku_cachefile_get_blocksize(ft->cf), size, *offset);
_mutex_unlock();
}

Expand Down
4 changes: 3 additions & 1 deletion ft/serialize/block_table.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ class block_table {
TRANSLATION_DEBUG
};

void create();
void create(unsigned int blocksize);

int create_from_buffer(int fd,
DISKOFF location_on_disk,
Expand Down Expand Up @@ -146,6 +146,7 @@ class block_table {

// Serialization
void serialize_translation_to_wbuf(int fd,
unsigned int blocksize,
struct wbuf *w,
int64_t *address,
int64_t *size);
Expand Down Expand Up @@ -258,6 +259,7 @@ class block_table {
// File management
void _maybe_truncate_file(int fd, uint64_t size_needed_before);
void _ensure_safe_write_unlocked(int fd,
unsigned int disk_block_size,
DISKOFF block_size,
DISKOFF block_offset);

Expand Down
Loading