C++ 中文周刊 2024-09-07 第168期

2024-09-10 21:20:52 浏览数 (1)

资讯

标准委员会动态/ide/编译器信息放在这里

编译器信息最新动态推荐关注hellogcc公众号 本周 的

文章

隆重介绍proxy 3.0版本,更好用的fat pointer库

https://devblogs.microsoft.com/cppblog/announcing-the-proxy-3-library-for-dynamic-polymorphism/

看个乐呵,其实还是要面向需求

这个概念起码16/17年就有了,隔壁rust有用,folly::Poly没听说有谁用

为什么 const 无法让 C 代码跑得更快?

https://linux.cn/article-11339-1.html

省流,约束自己的,编译器足够聪明能分析出优化点,用const指引没啥帮助

使用static 快十倍

https://mazzo.li/posts/c-performance-anecdote.html

看看作者给的B代码

代码语言:javascript复制
代码语言:javascript复制
uint64_t modulus = 1ULL << 31; // 2^31
// static uint64_t modulus = 1ULL << 31; // 2^31
uint64_t loop(uint64_t N, uint64_t S, uint64_t P, uint64_t Q) {
  for (uint64_t i = 0; i < N; i  ) {
    S = (S*P Q) % modulus;
  }
  return S;
}
代码语言:javascript复制

把modulus改成static就内联了。

不是哥们,你直接用宏/constexpr得了呗,又不改动,全局变量影响多大没有数?

回顾shared ptr实现

https://andreasfertig.blog/2024/09/understanding-the-inner-workings-of-cpp-smart-pointers-the-shared_ptr/

代码语言:javascript复制
template<typename T>
class shared_ptr {
  ctrl_blk_base* ctrl_blk_{};
  T*             t_{};

  shared_ptr(ctrl_blk_with_storage<T>* cb)
  : shared_ptr{cb, cb->get()}
  {}

  shared_ptr(ctrl_blk_base* cb, T* t)
  : ctrl_blk_{cb}
  , t_{t}
  {}

  template<typename U, typename... Args>
  friend shared_ptr<U> make_shared(Args&&... vals);

public:
  shared_ptr() = default;

  shared_ptr(T* t)
  : shared_ptr{new ctrl_blk<T>{t}, t}
  {}

  ~shared_ptr()
  {
    if(ctrl_blk_) { ctrl_blk_->release_shared(); }
  }

  shared_ptr(const shared_ptr& rhs)
  : ctrl_blk_{rhs.ctrl_blk_}
  , t_{rhs.t_}
  {
    if(ctrl_blk_) { ctrl_blk_->add_shared(); }
  }

  shared_ptr(shared_ptr&& rhs)
  : ctrl_blk_{rhs.ctrl_blk_}
  , t_{rhs.t_}
  {
    rhs.ctrl_blk_ = nullptr;
    rhs.t_        = nullptr;
  }

  shared_ptr& operator=(const shared_ptr& rhs)
  {
    shared_ptr{rhs}.swap(*this);  // forward to copy ctor
    return *this;
  }

  shared_ptr& operator=(shared_ptr&& rhs)
  {
    shared_ptr{std::move(rhs)}.swap(*this);  // forward to move-ctor
    return *this;
  }

  void swap(shared_ptr& rhs)
  {
    std::swap(t_, rhs.t_);
    std::swap(ctrl_blk_, rhs.ctrl_blk_);
  }
};

template<typename T, typename... Args>
shared_ptr<T> make_shared(Args&&... vals)
{
  return new ctrl_blk_with_storage<T>(std::forward<Args>(vals)...);
}

struct ctrl_blk_base {
  std::atomic_uint64_t shared_ref_count_{1};

  void add_shared() {   shared_ref_count_; }
  auto dec() { return --shared_ref_count_; }

  virtual void release_shared() = 0;
};

template<typename T>
struct ctrl_blk : ctrl_blk_base {
  T* data_;

  explicit ctrl_blk(T* data)
  : ctrl_blk_base{}
  , data_{data}
  {}

  void release_shared() override
  {
    if(0 == dec()) {
      delete data_;
      delete this;  // self delete
    }
  }
};

template<typename T>
struct ctrl_blk_with_storage : ctrl_blk_base {
  T in_place_;

  template<typename... Args>
  explicit ctrl_blk_with_storage(Args&&... vals)
  : ctrl_blk_base{}
  , in_place_{std::forward<Args>(vals)...}
  {}

  T* get() { return &in_place_; }

  void release_shared() override
  {
    if(0 == dec()) {
      delete this;  // self delete
    }
  }
};
代码语言:javascript复制

非常简单,大家看懂了吗

RealtimeSanitizer

https://clang.llvm.org/docs/RealtimeSanitizer.html

llvm引入了新的sanitizer, RTSan 标记了noblocking的函数只要监测到路径中存在 malloc, free, pthread_mutex_lock,就会报错

看样例

代码语言:javascript复制

#include <vector>

void violation() [[clang::nonblocking]]{
  std::vector<float> v;
  v.resize(100);
}

int main() {
  violation();
  return 0;
}

//clang   -fsanitize=realtime -g example_realtime_violation.cpp

输出

代码语言:javascript复制
代码语言:javascript复制
clang   -fsanitize=realtime -g example_realtime_violation.cpp

./a.out
Real-time violation: intercepted call to real-time unsafe function `malloc` in real-time context! Stack trace:

0 0x000102893034 in __rtsan::PrintStackTrace() rtsan_stack.cpp:45
1 0x000102892e64 in __rtsan::Context::ExpectNotRealtime(char const*) rtsan_context.cpp:78
2 0x00010289397c in malloc rtsan_interceptors.cpp:286
3 0x000195bd7bd0 in operator new(unsigned long) 0x1c (libc  abi.dylib:arm64 0x16bd0)
4 0x5c7f00010230f07c  (<unknown module>)
5 0x00010230f058 in std::__1::__libcpp_allocate[abi:ue170006](unsigned long, unsigned long) new:324
6 0x00010230effc in std::__1::allocator<float>::allocate[abi:ue170006](unsigned long) allocator.h:114
 ... snip ...
10 0x00010230e4bc in std::__1::vector<float, std::__1::allocator<float>>::__append(unsigned long) vector:1162
11 0x00010230dcdc in std::__1::vector<float, std::__1::allocator<float>>::resize(unsigned long) vector:1981
12 0x00010230dc28 in violation() main.cpp:5
13 0x00010230dd64 in main main.cpp:9
14 0x0001958960dc  (<unknown module>)
15 0x2f557ffffffffffc  (<unknown module>)
代码语言:javascript复制

还是很准的,可以更好的控制快速路径中的可能阻塞的函数调用

如何使用?最新llvm

代码语言:javascript复制
cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_RUNTIMES="compiler-rt" <path to source>/llvm

使用bfloat16来压缩浮点数

https://lemire.me/blog/2024/09/02/compressing-floating-point-numbers-quickly-by-converting-them-to-brain-floats/

lemire新活,

并不需要那么的精确,可以使用brain float 16 显然计算带宽四倍,如果用上SIMD,那速度可以更快

代码语言:javascript复制
代码语言:javascript复制
#include <immintrin.h>
#include <cstddef>
#include <cstdint>

void to_float16(uint16_t *dst, const double *src, size_t length) {
    size_t i = 0;
    __mmask8 mask;

    // Process 8 elements at a time
    for (; i   7 < length; i  = 8) {
        // Load 8 double-precision floats
        __m512d src_vec = _mm512_loadu_pd(&src[i]);

        // Convert to 16-bit floats with rounding
        __m128bh dst_vec = _mm256_cvtneps_pbh(_mm512_cvt_roundpd_ps(src_vec, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));

        // Store the result
        _mm_storeu_si128((__m128i*)&dst[i], *(__m128i*)&dst_vec);
    }

    // Handle remaining elements
    if (i < length) {
        // Create a mask for the remaining elements
        mask = (1 << (length - i)) - 1;

        // Load remaining double-precision floats
        __m512d src_vec = _mm512_maskz_loadu_pd(mask, &src[i]);

        // Convert to 16-bit floats with rounding
        __m128bh dst_vec = _mm256_cvtneps_pbh(_mm512_cvt_roundpd_ps(src_vec, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));

        // Store the result with masking
        _mm_mask_storeu_epi16(&dst[i], mask, *(__m128i*)&dst_vec);
    }
}


void from_float16(double *dst, const uint16_t *src, size_t length) {
    size_t i = 0;
    __mmask8 mask;

    // Process 8 elements at a time
    for (; i   7 < length; i  = 8) {
        // Load 8 half-precision floats
        __m128i src_vec = _mm_loadu_si128((__m128i*)&src[i]);

        // Convert to double-precision floats
        __m512d dst_vec = _mm512_cvtps_pd(_mm256_cvtpbh_ps(*(__m128bh*)&src_vec));

        // Store the result
        _mm512_storeu_pd(&dst[i], dst_vec);
    }

    // Handle remaining elements
    if (i < length) {
        // Create a mask for the remaining elements
        mask = (1 << (length - i)) - 1;

        // Load remaining half-precision floats
        __m128i src_vec = _mm_maskz_loadu_epi16(mask, &src[i]);

        // Convert to double-precision floats
        __m512d dst_vec = _mm512_cvtps_pd(_mm256_cvtpbh_ps( *(__m128bh*)&src_vec));

        // Store the result with masking
        _mm512_mask_storeu_pd(&dst[i], mask, dst_vec);
    }
}
代码语言:javascript复制

作者在zen4服务器上测试压缩2亿条每秒,解压0.9亿条每秒

考虑geo/AI之类的不精确场景,这种压缩带来的提升是非常迅速的

简单代码段分享

一个c的enum to string

代码语言:javascript复制
#include <stdio.h>
#include <string.h>

#define NUMARGS(...)  (sizeof((int[]){__VA_ARGS__})/sizeof(int))

#define ENUM_TO_STRING(ENUM_NAME, ...)                                                   
    enum ENUM_NAME { __VA_ARGS__ };                                                      
    char ENUM_NAME##_strings[] = #__VA_ARGS__ ;                                          
    long ENUM_NAME##strings_indices[NUMARGS(__VA_ARGS__)];                               
    char *ENUM_NAME##_to_string(enum ENUM_NAME value) {                                  
        static int init = 0;                                                             
        if(init == 0){                                                                   
            int n = 0;                                                                   
            ENUM_NAME##strings_indices[n  ] = 0;                                         
            char* curr_pos = strchr(ENUM_NAME##_strings,',');                            
            while(curr_pos){                                                             
                *curr_pos = '';                                                        
                ENUM_NAME##strings_indices[n  ]= (  curr_pos - ENUM_NAME##_strings);     
                curr_pos = strchr(curr_pos,',');                                         
            }                                                                            
            init  ;                                                                      
        }                                                                                
        return  (char *)ENUM_NAME##_strings ENUM_NAME##strings_indices[value];           
    }

/* Usage just create the enum */
ENUM_TO_STRING(Color,RED,GREEN,BLUE,VIOLET)

int main(void) 
{
    printf("%sn",Color_to_string(RED));
    printf("%sn",Color_to_string(BLUE));
    printf("%sn",Color_to_string(GREEN));
    printf("%sn",Color_to_string(VIOLET));
    printf("%sn",Color_to_string(GREEN));
    printf("%sn",Color_to_string(BLUE));

    return 0;
}
代码语言:javascript复制

godbolt https://godbolt.org/z/fcnaxdsGa

c 有magic_enum,别学哈

互动环节

最近感觉文章越来越少了,重复度越来越高,难道说我的周刊到瓶颈了?已经通过剥削群友的方式拿到了供稿,又有东西可以发了

0 人点赞