Merge pull request #6 from ffengc/dev
dev to main
ffengc authored May 8, 2024
2 parents 0feae7d + 286b82b commit 2091a83
Showing 12 changed files with 581 additions and 44 deletions.
109 changes: 106 additions & 3 deletions README.md
@@ -1,9 +1,10 @@
# Google-tcmalloc-simulation-implementation
# Google-tcmalloc-simulation-implementation (unfinished)
A study and simulated implementation of Google's open-source tcmalloc high-concurrency memory pool.

Project started: 2024-05-04

- [Google-tcmalloc-simulation-implementation](#google-tcmalloc-simulation-implementation)
- [Google-tcmalloc-simulation-implementation (unfinished)](#google-tcmalloc-simulation-implementation未完成)
- [==Bugs to fix (open issues in the project)==](#bugs-to-fix-项目目前待解决的问题)
- [Preface](#前言)
- [Overall threadCache design](#threadcache整体框架)
- [Writing the threadCache code](#开始写threadcache代码)
@@ -22,9 +23,23 @@
- [page\_cache memory release](#page_cache内存释放)
- [Allocations larger than 256 KB](#大于256k的情况)
- [Dealing with `new` in the code](#处理代码中new的问题)
- [Making free work without passing a size](#解决free使其不用传大小)
- [Deeper multi-threaded testing](#多线程场景下深度测试)
- [Profiling the performance bottleneck](#分析性能瓶颈)
- [Optimizing with a Radix Tree](#用radix-tree进行优化)

***

## ==Bugs to fix (open issues in the project)==

1. On ubuntu_arm64, multi-threaded runs hit a segmentation fault (cause unknown, to be fixed).
2. On ubuntu_arm64, only the third radix tree works; the first two cannot be used, which needs to be resolved.
3. On 32-bit Windows, runs occasionally succeed, but intermittent segmentation faults occur; cause unknown, to be fixed.

After the radix-tree optimization, this simulated tcmalloc is faster than malloc (tested on win32, where intermittent segmentation faults still occur).

![](./assets/5.png)

## Preface

This project implements a high-concurrency memory pool modeled on Google's open-source project tcmalloc (Thread-Caching Malloc), which provides efficient multi-threaded memory management as a replacement for the system allocation functions (malloc, free).
@@ -1196,4 +1211,92 @@ void page_cache::release_span_to_page(span* s) {

## Dealing with `new` in the code

Some places in the code use `new span`, which is wrong: this tcmalloc is meant to replace malloc, so the code itself must not use `new`, because `new` calls `malloc` under the hood. This has to be changed.

We previously wrote a fixed-size object pool, which can stand in for `new` here.

**Blog post: [How does a memory pool work? A simple simulated memory-pool implementation, in preparation for studying the high-concurrency tcmalloc](https://blog.csdn.net/Yu_Cblog/article/details/131741601)**

page_cache.hpp
```cpp
class page_cache {
private:
    span_list __span_lists[PAGES_NUM];
    static page_cache __s_inst;
    page_cache() = default;
    page_cache(const page_cache&) = delete;
    std::unordered_map<PAGE_ID, span*> __id_span_map;
    object_pool<span> __span_pool;
    // ... (public interface unchanged)
};
```
Add an `object_pool<span> __span_pool;` member, then replace every place that does `new span` with the pool's `new_()`, and every matching `delete` with `delete_()`.
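The idea behind the pool can be sketched as follows (a simplified, self-contained sketch of `object_pool.hpp`; `span_like` is a stand-in type, not the project's real `span`):

```cpp
#include <cassert>
#include <cstdlib>
#include <new>

// Simplified sketch of the fixed-size pool that replaces new/delete for spans.
template <class T>
class object_pool_sketch {
    char* memory_ = nullptr;    // current big block, carved off byte by byte
    size_t remain_ = 0;         // bytes left in the big block
    void* free_list_ = nullptr; // blocks handed back via delete_()
public:
    T* new_() {
        T* obj;
        if (free_list_) { // reuse a returned block first
            void* next = *(void**)free_list_;
            obj = (T*)free_list_;
            free_list_ = next;
        } else {
            // each slot must be big enough to hold a next-pointer
            size_t need = sizeof(T) < sizeof(void*) ? sizeof(void*) : sizeof(T);
            if (remain_ < need) { // grab a fresh 128 KB chunk from the system
                remain_ = 128 * 1024;
                memory_ = (char*)malloc(remain_);
                if (!memory_) throw std::bad_alloc();
            }
            obj = (T*)memory_;
            memory_ += need;
            remain_ -= need;
        }
        new (obj) T; // placement-new: construct without calling malloc
        return obj;
    }
    void delete_(T* obj) {
        obj->~T();
        *(void**)obj = free_list_; // push onto the free list
        free_list_ = obj;
    }
};

struct span_like { size_t page_id = 0; }; // stand-in for the real span
```

So `span* s = new span;` becomes `span* s = __span_pool.new_();`, and `delete s;` becomes `__span_pool.delete_(s);`.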
The following needs the same change.
tcmalloc.hpp
```cpp
static void* tcmalloc(size_t size) {
    if (size > MAX_BYTES) {
        // handle requests larger than MAX_BYTES
        size_t align_size = size_class::round_up(size);
        size_t k_page = align_size >> PAGE_SHIFT;
        page_cache::get_instance()->__page_mtx.lock();
        span* cur_span = page_cache::get_instance()->new_span(k_page); // go straight to the page cache
        page_cache::get_instance()->__page_mtx.unlock();
        void* ptr = (void*)(cur_span->__page_id << PAGE_SHIFT); // convert the span to an address
        return ptr;
    }
    if (p_tls_thread_cache == nullptr) {
        // effectively a per-thread singleton
        // p_tls_thread_cache = new thread_cache;
        static object_pool<thread_cache> tc_pool;
        p_tls_thread_cache = tc_pool.new_();
    }
#ifdef PROJECT_DEBUG
    LOG(DEBUG) << "tcmalloc find tc from mem" << std::endl;
#endif
    return p_tls_thread_cache->allocate(size);
}
```
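For the large-allocation branch above, the page arithmetic works like this (a sketch; `round_up_pages` is a page-granularity stand-in for the real `size_class::round_up`, using the project's `PAGE_SHIFT = 13`, i.e. 8 KB pages):

```cpp
#include <cassert>
#include <cstddef>

static const size_t PAGE_SHIFT = 13; // 8 KB pages, as in common.hpp
static const size_t PAGE_SIZE = (size_t)1 << PAGE_SHIFT;

// Round a byte count up to a whole number of pages.
size_t round_up_pages(size_t size) {
    return (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

// Number of pages requested from the page cache (k_page above).
size_t pages_for(size_t size) {
    return round_up_pages(size) >> PAGE_SHIFT;
}
```

For example, a 300 KB request rounds up to 304 KB, i.e. a 38-page span.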

## Making free work without passing a size

Since we already have the mapping from page number to span, we just add an `obj_size` field to `span`.
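The lookup can be sketched like this (field and function names follow the snippets in this README; a plain `unordered_map` stands in for the real page map):

```cpp
#include <cassert>
#include <cstddef>
#include <unordered_map>

static const size_t PAGE_SHIFT = 13;
typedef unsigned long long PAGE_ID;

struct span {
    PAGE_ID __page_id = 0;
    size_t __obj_size = 0; // size of the small objects carved from this span
};

std::unordered_map<PAGE_ID, span*> id_span_map; // stand-in for page_cache's map

// stand-in for page_cache::map_obj_to_span
span* map_obj_to_span(void* obj) {
    PAGE_ID id = (PAGE_ID)obj >> PAGE_SHIFT; // address -> page number
    return id_span_map[id];
}

// free no longer needs a size argument: the span remembers it
size_t lookup_free_size(void* obj) {
    return map_obj_to_span(obj)->__obj_size;
}
```

The real `tcfree` then routes the block back through thread_cache/page_cache using this recovered size.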

## Deeper multi-threaded testing

**To be clear up front: we are not building a production replacement. We compare against malloc, but we are not claiming to beat it by some margin, since in many details this project is still far from the real tcmalloc.**

See bench\_mark.cc for the test code.

Result:
```bash
parallels@ubuntu-linux-22-04-desktop:~/Project/Google-tcmalloc-simulation-implementation$ ./out
==========================================================
4 threads ran 10 rounds, 1000 concurrent alloc calls per round: cost: 27877 ms
4 threads ran 10 rounds, 1000 concurrent dealloc calls per round: cost: 52190 ms
4 threads ran concurrent alloc & dealloc 40000 times in total: cost: 80067 ms


4 threads ran 10 rounds, 1000 malloc calls per round: cost: 2227 ms
4 threads ran 10 rounds, 1000 free calls per round: cost: 1385 ms
4 threads ran malloc & free 40000 times in total: cost: 3612 ms
==========================================================
parallels@ubuntu-linux-22-04-desktop:~/Project/Google-tcmalloc-simulation-implementation$
```

Slower than malloc.

## Profiling the performance bottleneck

Both Linux and Windows (Visual Studio) provide plenty of profiling tools that show where the time is spent.

Cutting straight to the conclusion: the locks account for most of the time.

A radix tree can be used to optimize this.

## Optimizing with a Radix Tree

For the radix tree we can take the implementation straight from the tcmalloc source: `page_map.hpp`.
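To see why this removes the lock from the read path: a radix-tree page map is just nested arrays, so `get` is a couple of plain loads into memory that never moves once written (unlike `std::unordered_map`, which can rehash under a writer). A minimal two-level sketch of the idea follows; the project itself uses the three-level `TCMalloc_PageMap3<SYS_BYTES - PAGE_SHIFT>` from the tcmalloc source:

```cpp
#include <cassert>
#include <cstdlib>
#include <cstring>

// Minimal two-level radix-tree page map: keys are page ids with BITS significant bits.
template <int BITS>
class page_map2 {
    static const int ROOT_BITS = BITS / 2;
    static const int ROOT_LEN = 1 << ROOT_BITS;
    static const int LEAF_BITS = BITS - ROOT_BITS;
    static const int LEAF_LEN = 1 << LEAF_BITS;
    struct leaf { void* values[LEAF_LEN]; };
    leaf* root_[ROOT_LEN]; // top level: pointers to leaves, allocated on demand
public:
    page_map2() { memset(root_, 0, sizeof(root_)); }
    void set(size_t k, void* v) {
        if (k >> BITS) return; // key out of range: ignore (sketch-level handling)
        size_t i1 = k >> LEAF_BITS;     // index into the root array
        size_t i2 = k & (LEAF_LEN - 1); // index into the leaf
        if (!root_[i1])
            root_[i1] = (leaf*)calloc(1, sizeof(leaf)); // allocate leaf on demand
        root_[i1]->values[i2] = v;
    }
    void* get(size_t k) const { // read path: two array loads, no lock needed
        if (k >> BITS) return nullptr;
        size_t i1 = k >> LEAF_BITS;
        size_t i2 = k & (LEAF_LEN - 1);
        if (!root_[i1]) return nullptr;
        return root_[i1]->values[i2];
    }
};
```

In the real implementation the leaves covering a span's pages are ensured up front, so concurrent `get` calls during free need no lock; this sketch omits that `ensure` step and error handling.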

Binary file added assets/5.png
85 changes: 85 additions & 0 deletions bench_mark.cc
@@ -0,0 +1,85 @@


#include "./include/tcmalloc.hpp"
#include <atomic>
#include <thread>

// ntimes: number of allocations and frees per round
// rounds: number of rounds
void BenchmarkMalloc(size_t ntimes, size_t nworks, size_t rounds) {
std::vector<std::thread> vthread(nworks);
std::atomic<size_t> malloc_costtime(0);
std::atomic<size_t> free_costtime(0);
for (size_t k = 0; k < nworks; ++k) {
vthread[k] = std::thread([&, k]() {
std::vector<void*> v;
v.reserve(ntimes);
for (size_t j = 0; j < rounds; ++j) {
size_t begin1 = clock();
for (size_t i = 0; i < ntimes; i++) {
v.push_back(malloc(16));
// v.push_back(malloc((16 + i) % 8192 + 1));
}
size_t end1 = clock();
size_t begin2 = clock();
for (size_t i = 0; i < ntimes; i++) {
free(v[i]);
}
size_t end2 = clock();
v.clear();
malloc_costtime += (end1 - begin1);
free_costtime += (end2 - begin2);
}
});
}
for (auto& t : vthread) {
t.join();
}
std::cout << nworks << "threads run" << rounds << " times, each round malloc " << ntimes << " times, cost: " << malloc_costtime.load() << "ms\n";
std::cout << nworks << "threads run" << rounds << " times, each round free " << ntimes << " times, cost: " << free_costtime.load() << " ms\n";
std::cout << nworks << "threads run malloc and free " << nworks * rounds * ntimes << " time, total cost: " << malloc_costtime.load() + free_costtime.load() << " ms\n";
}

// args: allocations/frees per round, thread count, rounds
void BenchmarkConcurrentMalloc(size_t ntimes, size_t nworks, size_t rounds) {
std::vector<std::thread> vthread(nworks);
std::atomic<size_t> malloc_costtime(0);
std::atomic<size_t> free_costtime(0);
for (size_t k = 0; k < nworks; ++k) {
vthread[k] = std::thread([&]() {
std::vector<void*> v;
v.reserve(ntimes);
for (size_t j = 0; j < rounds; ++j) {
size_t begin1 = clock();
for (size_t i = 0; i < ntimes; i++) {
v.push_back(tcmalloc(16));
// v.push_back(ConcurrentAlloc((16 + i) % 8192 + 1));
}
size_t end1 = clock();
size_t begin2 = clock();
for (size_t i = 0; i < ntimes; i++) {
tcfree(v[i]);
}
size_t end2 = clock();
v.clear();
malloc_costtime += (end1 - begin1);
free_costtime += (end2 - begin2);
}
});
}
for (auto& t : vthread) {
t.join();
}
std::cout << nworks << "threads run" << rounds << " times, each round malloc " << ntimes << " times, cost: " << malloc_costtime.load() << "ms\n";
std::cout << nworks << "threads run" << rounds << " times, each round free " << ntimes << " times, cost: " << free_costtime.load() << " ms\n";
std::cout << nworks << "threads run tcmalloc and tcfree " << nworks * rounds * ntimes << " time, total cost: " << malloc_costtime.load() + free_costtime.load() << " ms\n";
}

int main() {
size_t n = 1000;
BenchmarkConcurrentMalloc(n, 4, 10);
std::cout << std::endl
<< std::endl;
BenchmarkMalloc(n, 4, 10);
return 0;
}
3 changes: 3 additions & 0 deletions include/common.hpp
@@ -28,8 +28,10 @@ static const size_t PAGE_SHIFT = 13;

#if defined(_WIN64) || defined(__x86_64__) || defined(__ppc64__) || defined(__aarch64__)
typedef unsigned long long PAGE_ID;
#define SYS_BYTES 64
#else
typedef size_t PAGE_ID;
#define SYS_BYTES 32
#endif

inline static void* system_alloc(size_t kpage) {
@@ -199,6 +201,7 @@ class span {
    size_t __use_count = 0; // count of small blocks handed out to threadCache
    void* __free_list = nullptr; // free list of the carved-up small blocks
    bool __is_use = false; // whether the span is in use
    size_t __obj_size; // size of the small objects carved from this span
};

// doubly linked circular list with a sentinel head
53 changes: 53 additions & 0 deletions include/object_pool.hpp
@@ -0,0 +1,53 @@


#ifndef __YUFC_OBJECT_POOL_HPP__
#define __YUFC_OBJECT_POOL_HPP__

#include <iostream>
#include <vector>
#include "./common.hpp"

#define __DEFAULT_KB__ 128



template <class T>
class object_pool {
private:
    char* __memory = nullptr; // char* makes it easy to carve the block byte by byte
    size_t __remain_bytes = 0; // bytes left in the current big block while carving
    void* __free_list = nullptr; // free list formed from returned objects
public:
T* new_() {
T* obj = nullptr;
        // first choice: reuse memory blocks that have been returned to the pool
if (__free_list) {
            // pop from the head of the free list
void* next = *((void**)__free_list);
obj = (T*)__free_list;
__free_list = next;
return obj;
}
if (__remain_bytes < sizeof(T)) {
            // not enough space left: allocate a new big block
__remain_bytes = __DEFAULT_KB__ * 1024;
__memory = (char*)malloc(__remain_bytes);
if (__memory == nullptr) {
throw std::bad_alloc();
}
}
obj = (T*)__memory;
size_t obj_size = sizeof(T) < sizeof(void*) ? sizeof(void*) : sizeof(T);
__memory += obj_size;
__remain_bytes -= obj_size;
new (obj) T;
return obj;
}
void delete_(T* obj) {
obj->~T();
*(void**)obj = __free_list;
__free_list = obj;
}
};

#endif
7 changes: 6 additions & 1 deletion include/page_cache.hpp
@@ -4,14 +4,18 @@
#define __YUFC_PAGE_CACHE_HPP__

#include "./common.hpp"
#include "./object_pool.hpp"
#include "./page_map.hpp"

class page_cache {
private:
span_list __span_lists[PAGES_NUM];
static page_cache __s_inst;
page_cache() = default;
page_cache(const page_cache&) = delete;
std::unordered_map<PAGE_ID, span*> __id_span_map;
// std::unordered_map<PAGE_ID, span*> __id_span_map;
TCMalloc_PageMap3<SYS_BYTES - PAGE_SHIFT> __id_span_map;
object_pool<span> __span_pool;

public:
std::mutex __page_mtx;
@@ -21,6 +25,7 @@ class page_cache {
span* map_obj_to_span(void* obj);
    // return an idle span to the page cache, merging adjacent spans
void release_span_to_page(span* s, size_t size = 0);

public:
    // get a span of k pages
span* new_span(size_t k);
