audio_buffer新增缓冲区池, 优化内存命中率, 新增simd加速计算

This commit is contained in:
Nanako 2024-06-29 20:42:09 +08:00
parent f69f3088f2
commit 3ea2f0ab4f
10 changed files with 327 additions and 50 deletions

View File

@ -10,6 +10,8 @@ if (WIN32)
set(RTAUDIO_API_DS OFF CACHE BOOL "" FORCE)
elseif(APPLE)
set(RTAUDIO_API_CORE ON CACHE BOOL "" FORCE)
else()
set(RTAUDIO_API_ALSA ON CACHE BOOL "" FORCE)
endif()
# setup spdlog
@ -37,4 +39,3 @@ add_subdirectory(third_party/spdlog)
add_subdirectory(third_party/mempool)
add_subdirectory(third_party/taskflow)
add_subdirectory(third_party/glfw)

View File

@ -1,6 +1,6 @@
# Minimum CMake version accepted for this subproject.
cmake_minimum_required(VERSION 3.5)
project(core)
# The C++ standard is assigned twice in a row; the later set() overrides the
# earlier one, so 26 is the value in effect.
# NOTE(review): CMake only recognises the value 26 in releases much newer than
# 3.5 - confirm the minimum version above is still accurate.
set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD 26)
# Start from an empty list and hand it to retrieve_files().
# NOTE(review): retrieve_files is defined elsewhere; presumably it collects the
# source files under this directory into ALL_FILES - confirm.
set(ALL_FILES "")
retrieve_files(${CMAKE_CURRENT_SOURCE_DIR} ALL_FILES)
@ -33,6 +33,13 @@ elseif(UNIX AND NOT APPLE)
target_compile_definitions(${PROJECT_NAME} PUBLIC PLATFORM_WINDOWS=0 PLATFORM_MACOS=0 PLATFORM_LINUX=1 GLFW_EXPOSE_NATIVE_X11)
endif()
# Tell the sources which CPU family they are being built for.
# Both macros are always defined (0 or 1) so that `#if CPU_AMD64` / `#if CPU_ARM`
# never evaluates an undefined identifier.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
    target_compile_definitions(${PROJECT_NAME} PUBLIC CPU_AMD64=1 CPU_ARM=0)
# 64-bit ARM is reported as "aarch64" on Linux and as "arm64"/"ARM64" on other
# platforms, so all three spellings have to be matched.
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "arm.*|ARM.*|aarch64.*")
    target_compile_definitions(${PROJECT_NAME} PUBLIC CPU_AMD64=0 CPU_ARM=1)
else()
    # Unknown architecture: the code falls back to its non-SIMD paths.
    message(STATUS "core: unrecognised CMAKE_SYSTEM_PROCESSOR '${CMAKE_SYSTEM_PROCESSOR}', SIMD dispatch disabled")
    target_compile_definitions(${PROJECT_NAME} PUBLIC CPU_AMD64=0 CPU_ARM=0)
endif()
if (CMAKE_BUILD_TYPE MATCHES "Debug")
target_compile_definitions(${PROJECT_NAME} PUBLIC BUILD_DEBUG=1)
else()

View File

@ -1,57 +1,159 @@
#include "audio_buffer.h"
#include <cstring>
#include <experimental/simd>
void audio_buffer::resize(uint32_t channel_num, uint32_t block_size) {
std::scoped_lock lock(lock_);
buffer_.resize(channel_num);
headers_.resize(channel_num);
for (uint32_t i = 0; i < channel_num; i++) {
buffer_[i].resize(block_size);
headers_[i] = buffer_[i].data();
#include "audio_buffer_pool.h"
#include "misc/cpu_simd.h"
#include "misc/likely.h"
// Out-of-class definitions of the static kernel pointers declared in
// audio_buffer. They have static storage duration, so both start out null
// until the audio_buffer constructor assigns a SIMD or scalar implementation.
void(*audio_buffer::add_func)(audio_buffer& in_buffer, audio_buffer& from_buffer, float percent);
void(*audio_buffer::multiple_func)(audio_buffer& in_buffer, float percent);
// Mixes `from_buffer` into `in_buffer`, channel by channel:
//     in_buffer[c][i] += from_buffer[c][i] * percent
// processing `simd_size` samples per step with std::experimental::simd.
//
// @tparam simd_size   number of samples handled per vector operation
// @param in_buffer    destination buffer, modified in place
// @param from_buffer  source buffer, only read
// @param percent      gain applied to every source sample before adding
//
// Only channels and samples present in BOTH buffers are processed.
template<int simd_size>
void add_simd(audio_buffer& in_buffer, audio_buffer& from_buffer, float percent) {
    using namespace std::experimental;
    using vec_t = simd<sample_t, simd_abi::fixed_size<simd_size>>;
    static_assert(simd_size > 0, "simd_size must be positive");
    constexpr uint32_t lanes = simd_size;

    const vec_t percent_simd(percent);

    const uint32_t dst_channels = in_buffer.get_num_channels();
    const uint32_t src_channels = from_buffer.get_num_channels();
    const uint32_t num_channels = dst_channels < src_channels ? dst_channels : src_channels;

    const uint32_t dst_samples = in_buffer.get_num_samples();
    const uint32_t src_samples = from_buffer.get_num_samples();
    const uint32_t num_samples = dst_samples < src_samples ? dst_samples : src_samples;

    // The vector loop may only cover whole groups of `lanes` samples; running
    // it up to num_samples would load and store past the end of the block
    // whenever num_samples is not a multiple of `lanes`.
    const uint32_t vector_end = num_samples - num_samples % lanes;

    for (uint32_t channel_index = 0; channel_index < num_channels; ++channel_index) {
        sample_t* dst = in_buffer.get_headers()[channel_index];
        const sample_t* src = from_buffer.get_headers()[channel_index];

        uint32_t i = 0;
        for (; i < vector_end; i += lanes) {
            vec_t acc(dst + i, element_aligned);
            const vec_t in(src + i, element_aligned);
            acc += in * percent_simd;
            acc.copy_to(dst + i, element_aligned);
        }
        // Scalar tail for the samples that do not fill a whole vector. The
        // base pointers are never advanced, so `i` indexes them directly.
        for (; i < num_samples; ++i) {
            dst[i] += src[i] * percent;
        }
    }
}
// Scalar fallback for mixing `from_buffer` into `in_buffer`:
//     in_buffer[c][i] += from_buffer[c][i] * percent
//
// @param in_buffer    destination buffer, modified in place
// @param from_buffer  source buffer, only read
// @param percent      gain applied to every source sample before adding
//
// Only channels and samples present in BOTH buffers are processed, so a
// smaller source buffer is never read out of bounds.
void add_no_simd(audio_buffer& in_buffer, audio_buffer& from_buffer, float percent) {
    const uint32_t dst_channels = in_buffer.get_num_channels();
    const uint32_t src_channels = from_buffer.get_num_channels();
    const uint32_t num_channels = dst_channels < src_channels ? dst_channels : src_channels;

    const uint32_t dst_samples = in_buffer.get_num_samples();
    const uint32_t src_samples = from_buffer.get_num_samples();
    const uint32_t num_samples = dst_samples < src_samples ? dst_samples : src_samples;

    for (uint32_t channel_index = 0; channel_index < num_channels; ++channel_index) {
        sample_t* dst = in_buffer.get_headers()[channel_index];
        const sample_t* src = from_buffer.get_headers()[channel_index];
        // Unsigned index: the sample count is a uint32_t, so a signed counter
        // would be compared after an implicit conversion.
        for (uint32_t i = 0; i < num_samples; ++i) {
            dst[i] += src[i] * percent;
        }
    }
}
// Scales every sample of `in_buffer` in place by `percent`, processing
// `simd_size` samples per step with std::experimental::simd.
//
// @tparam simd_size  number of samples handled per vector operation
// @param in_buffer   buffer to scale, modified in place
// @param percent     gain multiplied into every sample
template<int simd_size>
void multiple_simd(audio_buffer& in_buffer, float percent) {
    using namespace std::experimental;
    using vec_t = simd<sample_t, simd_abi::fixed_size<simd_size>>;
    static_assert(simd_size > 0, "simd_size must be positive");
    constexpr uint32_t lanes = simd_size;

    const vec_t percent_simd(percent);
    const uint32_t num_samples = in_buffer.get_num_samples();

    // Restrict the vector loop to whole groups of `lanes` samples; going up to
    // num_samples would touch memory past the block when the sample count is
    // not a multiple of `lanes`.
    const uint32_t vector_end = num_samples - num_samples % lanes;

    for (sample_t* channel : in_buffer.get_headers_vector()) {
        uint32_t i = 0;
        for (; i < vector_end; i += lanes) {
            vec_t value(channel + i, element_aligned);
            value *= percent_simd;
            value.copy_to(channel + i, element_aligned);
        }
        // Scalar tail; `channel` is never advanced, so `i` indexes it directly.
        for (; i < num_samples; ++i) {
            channel[i] *= percent;
        }
    }
}
// Scalar fallback: multiplies every sample of every channel in `in_buffer`
// by `percent`, in place.
void multiple_no_simd(audio_buffer& in_buffer, float percent) {
    for (sample_t* samples : in_buffer.get_headers_vector()) {
        // Walk the channel with a cursor instead of an index.
        sample_t* const channel_end = samples + in_buffer.get_num_samples();
        for (sample_t* cursor = samples; cursor != channel_end; ++cursor) {
            *cursor *= percent;
        }
    }
}
audio_buffer::audio_buffer() {
using namespace std::experimental;
static bool func_initialized = false;
if (UNLIKELY(!func_initialized)) {
cpuid cpu;
#define DEFINE_SIMD_FUNC(simd_max) \
constexpr size_t simd_size = simd_max / sizeof(sample_t) / 8; \
add_func = &add_simd<simd_size>; \
multiple_func = &multiple_simd<simd_size>;
#if CPU_AMD64
if (cpu.support_avx512()) {
DEFINE_SIMD_FUNC(512)
} else if (cpu.support_avx() || cpu.support_avx2()) {
DEFINE_SIMD_FUNC(256)
} else if (cpu.support_sse()) {
DEFINE_SIMD_FUNC(128)
}
#endif
#if CPU_ARM
if (cpu.support_neon128()) {
DEFINE_SIMD_FUNC(128)
} else if (cpu.support_neon64()) {
DEFINE_SIMD_FUNC(64)
}
#endif
if (!add_func) {
add_func = &add_no_simd;
multiple_func = &multiple_no_simd;
}
func_initialized = true;
}
#undef DEFINE_SIMD_FUNC
}
// Destructor: releases the channel blocks through the private free() helper.
audio_buffer::~audio_buffer() {
    this->free();
}
// Re-shapes the buffer to `channel_num` channels of `frame_size` samples.
//
// Existing channel blocks are released first, then one block per channel is
// taken from the shared audio buffer pool and the whole buffer is zeroed.
//
// @param channel_num  number of channels to allocate
// @param frame_size   number of samples per channel
//
// NOTE(review): the pool allocation result is not checked; if alloc() can
// return null on exhaustion, clear() below would write through a null
// pointer - confirm the pool's failure behaviour.
// NOTE(review): unlike clear()/add()/multiple(), this function does not hold
// lock_ while it rebuilds headers_ - confirm callers never resize concurrently.
void audio_buffer::resize(uint32_t channel_num, uint32_t frame_size) {
    frame_size_ = frame_size;
    free();

    audio_buffer_pool* pool = get_audio_buffer_pool();
    headers_.reserve(channel_num);  // one growth instead of repeated reallocations
    for (uint32_t i = 0; i < channel_num; ++i) {
        headers_.push_back(pool->alloc(frame_size));
    }
    clear();
}
void audio_buffer::clear() {
std::scoped_lock lock(lock_);
for (auto& channel : buffer_) {
std::memset(channel.data(), 0, channel.size() * sizeof(sample_t));
for (sample_t* channel : headers_) {
std::memset(channel, 0, frame_size_ * sizeof(sample_t));
}
}
void audio_buffer::mix(audio_buffer& from_buffer, float percent) {
void audio_buffer::add(audio_buffer& from_buffer, float percent) {
std::scoped_lock lock(lock_);
// will be optimized by compiler
for (uint32_t channel_index = 0; channel_index < buffer_.size(); channel_index++) {
auto& channel = buffer_[channel_index];
auto& in_channel = from_buffer.buffer_[channel_index];
for (uint32_t sample_index = 0; sample_index < channel.size(); sample_index++) {
channel[sample_index] += in_channel[sample_index] * percent;
}
}
add_func(*this, from_buffer, percent);
}
// Scales every sample of this buffer by `percent` through the kernel selected
// in multiple_func, holding lock_ for the duration of the call.
//
// @param percent  gain multiplied into every sample
void audio_buffer::multiple(float percent) {
    std::scoped_lock lock(lock_);
    multiple_func(*this, percent);
}
std::vector<sample_t> audio_buffer::get_interleaved_buffer() {
std::scoped_lock lock(lock_);
std::vector<sample_t> audio_buffer::get_interleaved_buffer() const {
std::vector<sample_t> result;
result.reserve(buffer_[0].size() * buffer_.size());
for (uint32_t sample_index = 0; sample_index < buffer_[0].size(); sample_index++) {
for (uint32_t channel_index = 0; channel_index < buffer_.size(); channel_index++) {
result.push_back(buffer_[channel_index][sample_index]);
result.reserve(headers_.size() * frame_size_);
for (int i = 0; i < frame_size_; ++i) {
for (const sample_t* channel : headers_) {
result.push_back(channel[i]);
}
}
return result;
}
void audio_buffer::free() {
for (sample_t* header : headers_)
get_audio_buffer_pool()->free(header);
headers_.clear();
}

View File

@ -3,24 +3,32 @@
#include <mutex>
#include <vector>
#include "extern.h"
#include <functional>
class CORE_API audio_buffer {
public:
static void(*add_func)(audio_buffer& in_buffer, audio_buffer& from_buffer, float percent);
static void(*multiple_func)(audio_buffer& in_buffer, float percent);
audio_buffer();
~audio_buffer();
sample_t** get_headers() { return headers_.data(); }
const std::vector<sample_t*>& get_headers_vector() { return headers_; }
[[nodiscard]] uint32_t get_num_channels() const { return buffer_.size(); }
[[nodiscard]] uint32_t get_num_samples() const { return buffer_[0].size(); }
[[nodiscard]] uint32_t get_num_channels() const { return headers_.size(); }
[[nodiscard]] uint32_t get_num_samples() const { return frame_size_; }
void resize(uint32_t channel_num, uint32_t block_size);
void resize(uint32_t channel_num, uint32_t frame_size);
void clear();
void mix(audio_buffer& from_buffer, float percent = 1.f);
void add(audio_buffer& from_buffer, float percent = 1.f);
void multiple(float percent);
[[nodiscard]] std::vector<sample_t> get_interleaved_buffer();
[[nodiscard]] std::vector<sample_t> get_interleaved_buffer() const;
private:
void free();
private:
std::vector<std::vector<sample_t>> buffer_;
std::vector<sample_t*> headers_{};
std::mutex lock_{};
uint32_t frame_size_ = 0;
};

View File

@ -0,0 +1,15 @@
#include "audio_buffer_pool.h"
IMPL_SINGLETON_INSTANCE(audio_buffer_pool)
// Singleton initialisation hook. The body is empty: no work is performed
// here and the `initliazer` argument is not used.
void audio_buffer_pool::init(singleton_initliazer& initliazer) {
}
// Requests room for `block_size` samples from pool_ and returns it typed as a
// sample pointer.
//
// @param block_size  number of samples (not bytes) the block must hold
// @return whatever pool_.alloc() yields for that many bytes, cast to sample_t*
sample_t * audio_buffer_pool::alloc(uint32_t block_size) {
    const auto byte_count = block_size * sizeof(sample_t);
    void* raw_block = pool_.alloc(byte_count);
    return static_cast<sample_t*>(raw_block);
}
// Hands `block` back to pool_.
// NOTE(review): presumably `block` must have been obtained from alloc() on
// this same pool - confirm against the mempool implementation.
void audio_buffer_pool::free(sample_t* block) {
    pool_.free(block);
}

View File

@ -0,0 +1,18 @@
#pragma once
#include "mempool.h"
#include "misc/singleton/singleton.h"
// Singleton that serves sample blocks for audio buffers out of one mempool.
class CORE_API audio_buffer_pool : public singleton_t<audio_buffer_pool> {
public:
    const char* get_name() override { return "audio_buffer_pool"; }
    void init(singleton_initliazer& initliazer) override;

    // Allocates a block able to hold `block_size` samples.
    sample_t* alloc(uint32_t block_size);
    // Returns a block to the pool.
    void free(sample_t* block);
protected:
    [[nodiscard]] mempool<>& get_pool() { return pool_; }
private:
    // Pool of 1024000 * 4 bytes (about 4 MB).
    // NOTE(review): the original note said a pool this size holds 1024 * 4
    // buffers of 1024 samples; with 4-byte samples that would be roughly 1000
    // buffers before any allocator overhead - confirm the intended capacity.
    mempool<> pool_{1024000 * 4};
};
DEFINE_SINGLETON_INSTANCE(audio_buffer_pool)

121
core/misc/cpu_simd.h Normal file
View File

@ -0,0 +1,121 @@
#pragma once
#include <algorithm>
#include <cpuid.h>
#include <vector>
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <cpuid.h>
#endif
// Executes the x86 CPUID instruction and stores the result registers.
//
// @param info      receives EAX, EBX, ECX, EDX in info[0..3]
// @param infoType  CPUID leaf (value loaded into EAX)
// @param subLeaf   CPUID sub-leaf (value loaded into ECX); defaults to 0, which
//                  is what every existing two-argument call gets
//
// The sub-leaf is a parameter because some leaves (for example leaf 7) return
// different data per sub-leaf.
inline void get_cpuid(int info[4], int infoType, int subLeaf = 0) {
#ifdef _MSC_VER
    __cpuidex(info, infoType, subLeaf);
#else
    __cpuid_count(infoType, subLeaf, info[0], info[1], info[2], info[3]);
#endif
}
// Instruction-set extensions that the CPU probe can report.
// Enumerators are numbered consecutively from 0 in declaration order.
enum class simd_type {
    // x86 / x86-64 families
    sse2,
    sse3,
    ssse3,
    sse41,
    sse42,
    avx,
    avx2,
    avx512,

    // ARM NEON, split by the register width the code wants to use
    neon64,
    neon128,
};
// Reports which SIMD extensions the current CPU advertises.
//
// @return the supported simd_type values, in enum order; empty when the build
//         targets neither CPU_AMD64 nor CPU_ARM
//
// x86: all CPUID access goes through get_cpuid(), and only inside the
// CPU_AMD64 branch, so nothing x86-specific is compiled for other targets.
// The feature bits are spelled out locally instead of relying on compiler
// specific macro names.
//
// NOTE(review): CPUID only says what the CPU implements. Using AVX / AVX-512
// also requires the operating system to have enabled the wider register state
// (OSXSAVE + XGETBV); that is not checked here.
inline std::vector<simd_type> get_simd_support_type() {
    std::vector<simd_type> simd_types;

#if CPU_AMD64
    // CPUID leaf 1 feature bits.
    constexpr unsigned int leaf1_edx_sse2  = 1u << 26;  // SSE2 is reported in EDX, not ECX
    constexpr unsigned int leaf1_ecx_sse3  = 1u << 0;
    constexpr unsigned int leaf1_ecx_ssse3 = 1u << 9;
    constexpr unsigned int leaf1_ecx_sse41 = 1u << 19;
    constexpr unsigned int leaf1_ecx_sse42 = 1u << 20;
    constexpr unsigned int leaf1_ecx_avx   = 1u << 28;
    // CPUID leaf 7, sub-leaf 0 feature bits (AVX2 / AVX-512F are not in leaf 1).
    constexpr unsigned int leaf7_ebx_avx2    = 1u << 5;
    constexpr unsigned int leaf7_ebx_avx512f = 1u << 16;

    int regs[4] = {0, 0, 0, 0};

    // Leaf 0 returns the highest supported basic leaf in EAX; querying a leaf
    // above it yields unrelated data.
    get_cpuid(regs, 0);
    const unsigned int max_leaf = static_cast<unsigned int>(regs[0]);

    if (max_leaf >= 1) {
        get_cpuid(regs, 1);
        const unsigned int ecx = static_cast<unsigned int>(regs[2]);
        const unsigned int edx = static_cast<unsigned int>(regs[3]);
        if (edx & leaf1_edx_sse2) {
            simd_types.push_back(simd_type::sse2);
        }
        if (ecx & leaf1_ecx_sse3) {
            simd_types.push_back(simd_type::sse3);
        }
        if (ecx & leaf1_ecx_ssse3) {
            simd_types.push_back(simd_type::ssse3);
        }
        if (ecx & leaf1_ecx_sse41) {
            simd_types.push_back(simd_type::sse41);
        }
        if (ecx & leaf1_ecx_sse42) {
            simd_types.push_back(simd_type::sse42);
        }
        if (ecx & leaf1_ecx_avx) {
            simd_types.push_back(simd_type::avx);
        }
    }
    if (max_leaf >= 7) {
        get_cpuid(regs, 7);  // get_cpuid queries sub-leaf 0
        const unsigned int ebx = static_cast<unsigned int>(regs[1]);
        if (ebx & leaf7_ebx_avx2) {
            simd_types.push_back(simd_type::avx2);
        }
        if (ebx & leaf7_ebx_avx512f) {
            simd_types.push_back(simd_type::avx512);
        }
    }
#endif

#if CPU_ARM
    // Decide from the compiler's target macros rather than reading the
    // ID_AA64* system registers: those reads are privileged and can trap in
    // user space, and the fields previously tested there describe
    // unrelated features rather than Advanced SIMD.
    // NOTE(review): this assumes every AArch64 target of this project has
    // Advanced SIMD (NEON), whose registers are usable as 64- and 128-bit
    // vectors - confirm for any unusual targets.
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__ARM_NEON)
    simd_types.push_back(simd_type::neon64);
    simd_types.push_back(simd_type::neon128);
#endif
#endif

    return simd_types;
}
class CORE_API cpuid {
public:
cpuid() {
simd_types = get_simd_support_type();
}
[[nodiscard]] bool support_simd(simd_type simd) const {
return std::ranges::find(simd_types, simd) != simd_types.end();
}
[[nodiscard]] bool support_sse() const {
return support_simd(simd_type::sse42) || support_simd(simd_type::sse41) || support_simd(simd_type::ssse3) || support_simd(simd_type::sse3) || support_simd(simd_type::sse2);
}
[[nodiscard]] bool support_avx() const {
return support_simd(simd_type::avx);
}
[[nodiscard]] bool support_avx2() const {
return support_simd(simd_type::avx2);
}
[[nodiscard]] bool support_avx512() const {
return support_simd(simd_type::avx512);
}
[[nodiscard]] bool support_neon() const {
return support_neon64() || support_neon128();
}
[[nodiscard]] bool support_neon64() const {
return support_simd(simd_type::neon64);
}
[[nodiscard]] bool support_neon128() const {
return support_simd(simd_type::neon128);
}
private:
std::vector<simd_type> simd_types;
};

View File

@ -21,11 +21,11 @@ public:
messages_.push(message);
}
// Wraps `func` in a lamba_thread_message obtained from mem_pool_ and queues it
// through the pointer overload of push_message.
void push_message(const std::function<void()>& func) {
    // Declared once; a second declaration of `message` in this scope would be
    // a redefinition.
    auto* message = mem_pool_.alloc<lamba_thread_message>(func);
    push_message(message);
}
void process_messages();
private:
std::queue<thread_message*> messages_;
mempool mem_pool_ = mempool(1024000);
mempool<> mem_pool_ = mempool(1024000);
};

View File

@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.5)
project(mempool)
set(CMAKE_C_STANDARD 99)
# The C++ standard is controlled by CMAKE_CXX_STANDARD; a variable named
# CMAKE_CPP_STANDARD is not read by CMake and therefore has no effect.
set(CMAKE_CXX_STANDARD 17)
add_library(${PROJECT_NAME} SHARED
mempool/ncx_core.h

View File

@ -1,19 +1,24 @@
#pragma once
#include "ncx_slab.h"
#include <cstdlib>
template<int32_t alignment = 64>
class mempool {
public:
mempool(size_t pool_size = 1024000) {
const auto space = (u_char*)malloc(pool_size);
mempool(size_t pool_size = 1024000) { // 1024KB
// 分配一个缓存对齐的内存
// const auto space = static_cast<u_char*>(operator new(pool_size, std::align_val_t(alignment)));
const auto space = static_cast<u_char*>(malloc(pool_size));
mem_pool_ = (ncx_slab_pool_t*)space;
mem_pool_->addr = space;
mem_pool_ = reinterpret_cast<ncx_slab_pool_t*>(space);
mem_pool_->addr = space;
mem_pool_->min_shift = 3;
mem_pool_->end = space + pool_size;
mem_pool_->end = space + pool_size;
ncx_slab_init(mem_pool_);
}
~mempool() {
// operator delete(mem_pool_->addr, std::align_val_t(alignment));
free(mem_pool_->addr);
}
[[nodiscard]] void* alloc(size_t size) const {