资讯
标准委员会动态/ide/编译器信息放在这里
编译器信息最新动态推荐关注hellogcc公众号 本周 的
文章
隆重介绍proxy 3.0版本,更好用的fat pointer库
https://devblogs.microsoft.com/cppblog/announcing-the-proxy-3-library-for-dynamic-polymorphism/
看个乐呵,其实还是要面向需求
这个概念起码16/17年就有了,隔壁rust有用,folly::Poly没听说有谁用
为什么 const 无法让 C 代码跑得更快?
https://linux.cn/article-11339-1.html
省流,约束自己的,编译器足够聪明能分析出优化点,用const指引没啥帮助
使用static 快十倍
https://mazzo.li/posts/c-performance-anecdote.html
看看作者给的B代码
代码语言:javascript复制
// Modulus of the recurrence below. It is deliberately a plain (non-const,
// externally visible) global: the article's point is that swapping this line
// for the `static` variant underneath is what makes the loop fast.
uint64_t modulus = 1ULL << 31; // 2^31
// static uint64_t modulus = 1ULL << 31; // 2^31

// Applies the recurrence S = (S * P + Q) % modulus N times and returns the
// final state.
//   N - number of iterations (0 returns S unchanged)
//   S - initial state
//   P - multiplier
//   Q - increment
uint64_t loop(uint64_t N, uint64_t S, uint64_t P, uint64_t Q) {
  for (uint64_t i = 0; i < N; i++) {
    S = (S * P + Q) % modulus;
  }
  return S;
}
代码语言:javascript复制
把modulus改成static就内联了。
不是哥们,你直接用宏/constexpr得了呗,又不改动,全局变量影响多大没有数?
回顾shared ptr实现
https://andreasfertig.blog/2024/09/understanding-the-inner-workings-of-cpp-smart-pointers-the-shared_ptr/
代码语言:javascript复制template<typename T>
class shared_ptr {
ctrl_blk_base* ctrl_blk_{};
T* t_{};
shared_ptr(ctrl_blk_with_storage<T>* cb)
: shared_ptr{cb, cb->get()}
{}
shared_ptr(ctrl_blk_base* cb, T* t)
: ctrl_blk_{cb}
, t_{t}
{}
template<typename U, typename... Args>
friend shared_ptr<U> make_shared(Args&&... vals);
public:
shared_ptr() = default;
shared_ptr(T* t)
: shared_ptr{new ctrl_blk<T>{t}, t}
{}
~shared_ptr()
{
if(ctrl_blk_) { ctrl_blk_->release_shared(); }
}
shared_ptr(const shared_ptr& rhs)
: ctrl_blk_{rhs.ctrl_blk_}
, t_{rhs.t_}
{
if(ctrl_blk_) { ctrl_blk_->add_shared(); }
}
shared_ptr(shared_ptr&& rhs)
: ctrl_blk_{rhs.ctrl_blk_}
, t_{rhs.t_}
{
rhs.ctrl_blk_ = nullptr;
rhs.t_ = nullptr;
}
shared_ptr& operator=(const shared_ptr& rhs)
{
shared_ptr{rhs}.swap(*this); // forward to copy ctor
return *this;
}
shared_ptr& operator=(shared_ptr&& rhs)
{
shared_ptr{std::move(rhs)}.swap(*this); // forward to move-ctor
return *this;
}
void swap(shared_ptr& rhs)
{
std::swap(t_, rhs.t_);
std::swap(ctrl_blk_, rhs.ctrl_blk_);
}
};
// Builds a T from `vals` inside a single heap allocation that also holds the
// control block, and returns a shared_ptr owning it.
template<typename T, typename... Args>
shared_ptr<T> make_shared(Args&&... vals)
{
  // Control block and object live together in one allocation.
  auto* combined = new ctrl_blk_with_storage<T>(std::forward<Args>(vals)...);

  // Converts through shared_ptr's private constructor (this function is a friend).
  return combined;
}
// Common part of every control block: the atomic count of shared owners plus
// the virtual hook that drops one reference.
struct ctrl_blk_base {
  // Starts at 1: the shared_ptr that creates the block is the first owner.
  std::atomic_uint64_t shared_ref_count_{1};

  // Polymorphic base, so give it a virtual destructor.
  virtual ~ctrl_blk_base() = default;

  // Registers one more owner.
  void add_shared() { ++shared_ref_count_; }

  // Removes one owner and returns the number of owners left.
  auto dec() { return --shared_ref_count_; }

  // Drops one reference; implementations destroy the managed object and the
  // block itself once the count reaches zero.
  virtual void release_shared() = 0;
};
// Control block for an object that was allocated separately and handed to
// shared_ptr as a raw pointer: the block only stores the pointer.
template<typename T>
struct ctrl_blk : ctrl_blk_base {
  T* data_;  // separately allocated object owned through this block

  explicit ctrl_blk(T* data)
  : ctrl_blk_base{}
  , data_{data}
  {}

  // Gives up one reference. The last owner destroys the object first and
  // then the control block.
  void release_shared() override
  {
    if(dec() != 0) { return; }  // other owners remain

    delete data_;
    delete this;  // self delete
  }
};
// Control block that embeds the managed object itself, so object and
// bookkeeping share one allocation.
template<typename T>
struct ctrl_blk_with_storage : ctrl_blk_base {
  T in_place_;  // the managed object, stored inside the block

  // Constructs the embedded object from the forwarded arguments.
  template<typename... Args>
  explicit ctrl_blk_with_storage(Args&&... vals)
  : ctrl_blk_base{}
  , in_place_{std::forward<Args>(vals)...}
  {}

  // Address of the embedded object.
  T* get() { return &in_place_; }

  // Gives up one reference. Deleting the block also destroys the embedded
  // object, so no separate delete is needed.
  void release_shared() override
  {
    if(dec() != 0) { return; }  // other owners remain

    delete this;  // self delete
  }
};
代码语言:javascript复制
非常简单,大家看懂了吗
RealtimeSanitizer
https://clang.llvm.org/docs/RealtimeSanitizer.html
llvm引入了新的sanitizer:RTSan(RealtimeSanitizer)。被标记为 [[clang::nonblocking]] 的函数,只要在其调用路径中检测到 malloc、free、pthread_mutex_lock 等可能阻塞的调用,就会报错
看样例
代码语言:javascript复制
#include <vector>
// Deliberately breaks the real-time contract: the function is annotated
// [[clang::nonblocking]], yet it grows a vector, which allocates memory.
void violation() [[clang::nonblocking]]{
std::vector<float> v;
v.resize(100); // allocates -> reaches malloc, which RTSan intercepts and reports
}
// Entry point of the sample: calls the offending function once so that
// RealtimeSanitizer prints its report.
int main() {
violation();
return 0;
}
//clang -fsanitize=realtime -g example_realtime_violation.cpp
输出
代码语言:javascript复制
clang -fsanitize=realtime -g example_realtime_violation.cpp
./a.out
Real-time violation: intercepted call to real-time unsafe function `malloc` in real-time context! Stack trace:
0 0x000102893034 in __rtsan::PrintStackTrace() rtsan_stack.cpp:45
1 0x000102892e64 in __rtsan::Context::ExpectNotRealtime(char const*) rtsan_context.cpp:78
2 0x00010289397c in malloc rtsan_interceptors.cpp:286
3 0x000195bd7bd0 in operator new(unsigned long)+0x1c (libc++abi.dylib:arm64+0x16bd0)
4 0x5c7f00010230f07c (<unknown module>)
5 0x00010230f058 in std::__1::__libcpp_allocate[abi:ue170006](unsigned long, unsigned long) new:324
6 0x00010230effc in std::__1::allocator<float>::allocate[abi:ue170006](unsigned long) allocator.h:114
... snip ...
10 0x00010230e4bc in std::__1::vector<float, std::__1::allocator<float>>::__append(unsigned long) vector:1162
11 0x00010230dcdc in std::__1::vector<float, std::__1::allocator<float>>::resize(unsigned long) vector:1981
12 0x00010230dc28 in violation() main.cpp:5
13 0x00010230dd64 in main main.cpp:9
14 0x0001958960dc (<unknown module>)
15 0x2f557ffffffffffc (<unknown module>)
代码语言:javascript复制
还是很准的,可以更好的控制快速路径中的可能阻塞的函数调用
如何使用?最新llvm
代码语言:javascript复制cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_RUNTIMES="compiler-rt" <path to source>/llvm
使用bfloat16来压缩浮点数
https://lemire.me/blog/2024/09/02/compressing-floating-point-numbers-quickly-by-converting-them-to-brain-floats/
lemire新活,
很多场景并不需要 double 那么高的精度,可以改用 brain float 16(bfloat16):每个数只占 16 位,是 double 的四分之一,同样的内存带宽可以搬运四倍的数据;如果再用上SIMD,转换速度还可以更快
代码语言:javascript复制
#include <immintrin.h>
#include <cstddef>
#include <cstdint>
// Compresses `length` doubles from `src` into 16-bit values written to `dst`.
//
// Despite the name, the output format is bfloat16 ("brain float"), not IEEE
// half precision: each double is rounded to float (round-to-nearest, floating
// point exceptions suppressed) and then narrowed by _mm256_cvtneps_pbh.
//
// Uses AVX-512 intrinsics (F, BW, VL and BF16), so the translation unit must
// be built with those extensions enabled and run on a CPU that supports them.
//
//   dst    - output buffer with room for `length` 16-bit values
//   src    - input buffer holding `length` doubles
//   length - number of elements to convert (0 is a no-op)
void to_float16(uint16_t *dst, const double *src, size_t length) {
  size_t i = 0;

  // Main loop: full groups of 8 elements.
  for (; i + 7 < length; i += 8) {
    // Load 8 double-precision floats
    const __m512d src_vec = _mm512_loadu_pd(&src[i]);
    // Convert to 16-bit floats with rounding
    __m128bh dst_vec = _mm256_cvtneps_pbh(
        _mm512_cvt_roundpd_ps(src_vec, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
    // Store the result
    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[i]),
                     *reinterpret_cast<__m128i*>(&dst_vec));
  }

  // Tail: 1 to 7 leftover elements, handled with a lane mask so nothing is
  // read or written past the end of either buffer.
  if (i < length) {
    // One bit per remaining element (length - i is in [1, 7] here).
    const __mmask8 mask = static_cast<__mmask8>((1u << (length - i)) - 1u);
    // Load remaining double-precision floats; masked-off lanes read as zero
    const __m512d src_vec = _mm512_maskz_loadu_pd(mask, &src[i]);
    // Convert to 16-bit floats with rounding
    __m128bh dst_vec = _mm256_cvtneps_pbh(
        _mm512_cvt_roundpd_ps(src_vec, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
    // Store the result with masking
    _mm_mask_storeu_epi16(&dst[i], mask, *reinterpret_cast<__m128i*>(&dst_vec));
  }
}
// Expands `length` 16-bit values from `src` back into doubles written to
// `dst`.
//
// Despite the name, the input is interpreted as bfloat16 ("brain float"): each
// value is widened to float by _mm256_cvtpbh_ps and then to double.
//
// Uses AVX-512 intrinsics (F, BW, VL and BF16), so the translation unit must
// be built with those extensions enabled and run on a CPU that supports them.
//
//   dst    - output buffer with room for `length` doubles
//   src    - input buffer holding `length` 16-bit values
//   length - number of elements to convert (0 is a no-op)
void from_float16(double *dst, const uint16_t *src, size_t length) {
  size_t i = 0;

  // Main loop: full groups of 8 elements.
  for (; i + 7 < length; i += 8) {
    // Load 8 16-bit floats
    __m128i src_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
    // Convert to double-precision floats
    const __m512d dst_vec =
        _mm512_cvtps_pd(_mm256_cvtpbh_ps(*reinterpret_cast<__m128bh*>(&src_vec)));
    // Store the result
    _mm512_storeu_pd(&dst[i], dst_vec);
  }

  // Tail: 1 to 7 leftover elements, handled with a lane mask so nothing is
  // read or written past the end of either buffer.
  if (i < length) {
    // One bit per remaining element (length - i is in [1, 7] here).
    const __mmask8 mask = static_cast<__mmask8>((1u << (length - i)) - 1u);
    // Load remaining 16-bit floats; masked-off lanes read as zero
    __m128i src_vec = _mm_maskz_loadu_epi16(mask, &src[i]);
    // Convert to double-precision floats
    const __m512d dst_vec =
        _mm512_cvtps_pd(_mm256_cvtpbh_ps(*reinterpret_cast<__m128bh*>(&src_vec)));
    // Store the result with masking
    _mm512_mask_storeu_pd(&dst[i], mask, dst_vec);
  }
}
代码语言:javascript复制
作者在zen4服务器上测试压缩2亿条每秒,解压0.9亿条每秒
考虑geo/AI之类的不精确场景,这种压缩带来的提升是非常迅速的
简单代码段分享
一个c的enum to string
代码语言:javascript复制#include <stdio.h>
#include <string.h>
#define NUMARGS(...) (sizeof((int[]){__VA_ARGS__})/sizeof(int))
#define ENUM_TO_STRING(ENUM_NAME, ...)
enum ENUM_NAME { __VA_ARGS__ };
char ENUM_NAME##_strings[] = #__VA_ARGS__ ;
long ENUM_NAME##strings_indices[NUMARGS(__VA_ARGS__)];
char *ENUM_NAME##_to_string(enum ENUM_NAME value) {
static int init = 0;
if(init == 0){
int n = 0;
ENUM_NAME##strings_indices[n ] = 0;
char* curr_pos = strchr(ENUM_NAME##_strings,',');
while(curr_pos){
*curr_pos = '