Rust 不同方式创建Arc对象性能对比

背景

在C++（C++11）中有了移动语义后，标准库中的很多容器都实现了原地构建对象的接口，例如 vector::emplace_back。得益于此，在适当的场合下，可以大量减少临时对象的创建和对象拷贝，从而提升程序性能。因此想到Rust应该也有类似的机制，来避免不必要的对象创建和拷贝开销。

C++移动语义：将一个右值（比如一个临时对象表达式）传给一个函数的时候，是优先绑定到参数形式是右值引用的函数上的

测试验证

以Arc对象创建为测试例子：
+ 创建临时变量，传入临时变量创建Arc： create_arc_with_temp_var
+ 原地创建，直接在new入参里创建对象：create_arc_inplace
猜测原地创建方式会使用移动语义，不会有对象拷贝发生。通过临时变量创建，会发生一次对象拷贝，性能下降。
以下为完整测试代码：

use std::sync::atomic::AtomicI64;

// Global counter incremented by each factory function; its value is stored in
// field `a` of every object created.
static GVALUE: AtomicI64 = AtomicI64::new(0);
// Length in bytes of the `buf` array inside `TestClass`.
const BUFFER_SIZE:usize = 10240;

/// Payload type constructed by the benchmark: two integers plus a large
/// inline byte array.
struct TestClass {
    a: i64,
    b: i64,
    buf: [u8; BUFFER_SIZE],  // large array used to simulate a very large object
}

/// Builds a plain `TestClass` by value (no `Arc`); this is the reference group
/// for the benchmark.
fn create_normal() -> TestClass {
    use std::sync::atomic::Ordering;

    // The atomic increment is there to keep the compiler from optimizing the
    // construction away (same purpose in the other factory functions).
    let seq = GVALUE.fetch_add(1, Ordering::SeqCst);
    TestClass {
        a: seq,
        b: 0,
        buf: [0u8; BUFFER_SIZE],
    }
}

fn create_arc_with_temp_var() -> std::sync::Arc {
    let v = GVALUE.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
    let obj = TestClass{ a: v, b:0, buf: [0;BUFFER_SIZE]};
    std::sync::Arc::new(obj)
}

fn create_arc_inplace() -> std::sync::Arc {
    let v = GVALUE.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
    std::sync::Arc::new(TestClass{ a: v, b:0, buf: [0; BUFFER_SIZE]})
}

/// Runs the three benchmark loops (plain value, `Arc` via a temporary
/// variable, `Arc` built in place) and prints the elapsed time of each.
///
/// Each loop performs `repeat` constructions. The accumulators are reset
/// before the second and third loops, so the final `done` line reports the
/// values gathered by the last loop only.
pub fn test_main() {
    use std::time::Instant;

    let repeat: usize = 1000000;
    let mut sum: i64 = 0;
    let mut bufsize: usize = 0;

    // `Instant` is a monotonic clock, so the measured interval cannot be
    // skewed by wall-clock adjustments and `elapsed()` cannot fail.
    let start_time = Instant::now();
    for _ in 0..repeat {
        let obj = create_normal();
        // Reading the fields avoids compiler warnings and is meant to stop the
        // compiler from optimizing the object away (same in the loops below).
        sum += obj.a + obj.b;
        bufsize += obj.buf.len();
    }
    println!("create normal: {:?}", start_time.elapsed());

    sum = 0;
    bufsize = 0;
    let start_time = Instant::now();
    for _ in 0..repeat {
        let obj = create_arc_with_temp_var();
        sum += obj.a + obj.b;
        bufsize += obj.buf.len();
    }
    println!("create arc with temp var: {:?}", start_time.elapsed());

    sum = 0;
    bufsize = 0;
    let start_time = Instant::now();
    for _ in 0..repeat {
        let obj = create_arc_inplace();
        sum += obj.a + obj.b;
        bufsize += obj.buf.len();
    }
    println!("create arc inplace: {:?}", start_time.elapsed());
    println!("done {}-{}", sum, bufsize);
}

/// Program entry point: runs the benchmark in `test_main`.
fn main() {
    test_main();
}

保存为 test.rs, 直接使用rustc进行默认选项编译：rustc test.rs -o test.exe, 执行test.exe结果：

create normal: 634.3058ms
create arc with temp var: 2.1003936s
create arc inplace: 1.5110131s

可以看出，确实如猜测一样，原地创建的方式更快。
生成中间汇编文件rustc --emit asm=test.S test.rs，从汇编看是否有对象拷贝发生:

create_arc_with_temp_var

# Unoptimized code emitted for create_arc_with_temp_var.
# Call arguments below are read as %rcx, %rdx, %r8 in that order (Win64 convention).
.seh_proc _ZN7arctest24create_arc_with_temp_var17heca6a2e43008164bE
	# Reserve a 30808-byte stack frame.
	movl	$30808, %eax
	callq	__chkstk
	subq	%rax, %rsp
	.seh_stackalloc 30808
	.seh_endprologue
	# Third argument of fetch_add is the constant 4 (presumably the SeqCst ordering value - confirm).
	movb	$4, 55(%rsp)
	movb	55(%rsp), %r8b
	# fetch_add(&GVALUE, 1, ordering); the result is kept at 40(%rsp).
	leaq	_ZN7arctest6GVALUE17h56fc0879255456eaE(%rip), %rcx
	movl	$1, %edx
	callq	_ZN4core4sync6atomic9AtomicI649fetch_add17hc2e5755d85a2d4e3E
	movq	%rax, 40(%rsp)
	# memset(10312(%rsp), 0, 10240): zero-fill a 10240-byte temporary array.
	leaq	10312(%rsp), %rcx
	xorl	%edx, %edx
	movl	$10240, %r8d
	callq	memset
	# Store the fetch_add result and the constant 0 at 56(%rsp) and 64(%rsp).
	movq	40(%rsp), %rax
	movq	%rax, 56(%rsp)
	movq	$0, 64(%rsp)
	# memcpy(56(%rsp)+16, 10312(%rsp), 10240): copy the zeroed array right after those two words.
	leaq	56(%rsp), %rcx
	addq	$16, %rcx
	leaq	10312(%rsp), %rdx
	movl	$10240, %r8d
	callq	memcpy                
	# memcpy(20552(%rsp), 56(%rsp), 10256): copy the whole 10256-byte object to a second stack slot.
	leaq	20552(%rsp), %rcx
	leaq	56(%rsp), %rdx
	movl	$10256, %r8d
	callq	memcpy                 # one extra copy compared with create_arc_inplace
	# Arc::new is called with the address of the second copy.
	leaq	20552(%rsp), %rcx
	callq	_ZN5alloc4sync12Arc$LT$T$GT$3new17hbf63ab203eef9e84E
	movq	%rax, 32(%rsp)
	movq	32(%rsp), %rax
	addq	$30808, %rsp
	retq
	.seh_endproc

create_arc_inplace

# Unoptimized code emitted for create_arc_inplace.
# Call arguments below are read as %rcx, %rdx, %r8 in that order (Win64 convention).
.seh_proc _ZN7arctest18create_arc_inplace17hfe33c4907cd49bc4E
	# Reserve a 20552-byte stack frame (10256 bytes less than the temp-variable variant).
	movl	$20552, %eax
	callq	__chkstk
	subq	%rax, %rsp
	.seh_stackalloc 20552
	.seh_endprologue
	# Third argument of fetch_add is the constant 4 (presumably the SeqCst ordering value - confirm).
	movb	$4, 55(%rsp)
	movb	55(%rsp), %r8b
	# fetch_add(&GVALUE, 1, ordering); the result is kept at 40(%rsp).
	leaq	_ZN7arctest6GVALUE17h56fc0879255456eaE(%rip), %rcx
	movl	$1, %edx
	callq	_ZN4core4sync6atomic9AtomicI649fetch_add17hc2e5755d85a2d4e3E
	movq	%rax, 40(%rsp)
	# memset(10312(%rsp), 0, 10240): zero-fill a 10240-byte temporary array.
	leaq	10312(%rsp), %rcx
	xorl	%edx, %edx
	movl	$10240, %r8d
	callq	memset
	# Store the fetch_add result and the constant 0 at 56(%rsp) and 64(%rsp).
	movq	40(%rsp), %rax
	movq	%rax, 56(%rsp)
	movq	$0, 64(%rsp)
	# memcpy(56(%rsp)+16, 10312(%rsp), 10240): copy the zeroed array right after those two words.
	leaq	56(%rsp), %rcx
	addq	$16, %rcx
	leaq	10312(%rsp), %rdx
	movl	$10240, %r8d
	callq	memcpy
	# Arc::new is called directly with the address of the object at 56(%rsp); no second copy.
	leaq	56(%rsp), %rcx
	callq	_ZN5alloc4sync12Arc$LT$T$GT$3new17hbf63ab203eef9e84E
	movq	%rax, 32(%rsp)
	movq	32(%rsp), %rax
	addq	$20552, %rsp
	retq
	.seh_endproc

从汇编来看，临时变量的创建方式确实是多了一次拷贝。但是栈上到堆上的拷贝依然发生了：
std::sync::Arc::new

# Unoptimized code emitted for Arc<T>::new (listing truncated by the author).
# Call arguments below are read as %rcx, %rdx, %r8 in that order (Win64 convention).
.seh_proc _ZN5alloc4sync12Arc$LT$T$GT$3new17hbf63ab203eef9e84E
	.seh_handler __CxxFrameHandler3, @unwind, @except
	pushq	%rbp
	.seh_pushreg %rbp
	movl	$10416, %eax
	callq	__chkstk
	subq	%rax, %rsp
	.seh_stackalloc 10416
	leaq	128(%rsp), %rbp
	.seh_setframe %rbp, 128
	.seh_endprologue
	movq	$-2, 10280(%rbp)
	# Save the incoming pointer argument (address of the caller's object) at -40(%rbp).
	movq	%rcx, -40(%rbp)
	movb	$0, 10263(%rbp)
	movb	$1, 10263(%rbp)
	# exchange_malloc(10272, 8): the returned pointer is kept at -32(%rbp).
	movl	$10272, %ecx
	movl	$8, %edx
	callq	_ZN5alloc5alloc15exchange_malloc17h3c474a0026f2bb85E
	movq	%rax, -32(%rbp)
.Ltmp10:
	# First AtomicUsize::new(1); its result ends up at -16(%rbp) (presumably a reference count - confirm).
	movl	$1, %ecx
	callq	_ZN4core4sync6atomic11AtomicUsize3new17hf7ef8b97eb7f268bE
.Ltmp11:
	movq	%rax, -24(%rbp)
	jmp	.LBB74_1
.LBB74_1:
	movq	-24(%rbp), %rax
	movq	%rax, 10264(%rbp)
	movq	10264(%rbp), %rax
	movq	%rax, -16(%rbp)
.Ltmp12:
	# Second AtomicUsize::new(1); its result ends up at -8(%rbp) (presumably the other reference count - confirm).
	movl	$1, %ecx
	callq	_ZN4core4sync6atomic11AtomicUsize3new17hf7ef8b97eb7f268bE
.Ltmp13:
	movq	%rax, -48(%rbp)
	jmp	.LBB74_3
.LBB74_3:
	movq	-40(%rbp), %rdx
	movq	-48(%rbp), %rax
	movq	%rax, 10272(%rbp)
	movq	10272(%rbp), %rax
	movq	%rax, -8(%rbp)
	movb	$0, 10263(%rbp)
	# memcpy(dst=%rbp, src=saved argument pointer, len=10256).
	# NOTE(review): the destination register holds %rbp, a frame address just after the two
	# words at -16/-8(%rbp); the copy into the exchange_malloc buffer saved at -32(%rbp) is
	# presumably in the elided part below - confirm against the full listing.
	movq	%rbp, %rcx
	movq	%rcx, -72(%rbp)
	movl	$10256, %r8d
	movq	%r8, -64(%rbp)
	callq	memcpy         # annotated by the author as the stack-to-heap copy
	# ...

说明rust并没有实现真正的原地创建（类似C++原地构造机制）。

开启编译优化，再来看结论是否一致：rustc test.rs -o test_opt.exe -O

create normal: 11.3087ms
create arc with temp var: 422.8822ms
create arc inplace: 436.6645ms

时间消耗基本一致，应该是编译器把这个拷贝优化掉了。感兴趣的可以自行分析汇编。

结论

rustc不支持原地构建，不管以何种方式创建Arc依然会发生栈到堆的拷贝，但其编译器优化对临时变量的优化还是比较到位的。尽管编译器优化足够优秀，我们在写代码时，还是应该尽量减少临时变量的创建，尽可能使用原地构建的方式，以免在特定场景下编译器优化未能生效。

注：测试所用的rustc版本 rustc 1.56.0 (09c42c458 2021-10-18)

Rust 不同方式创建Arc对象性能对比

C/C++/C#相关栏目本月热门文章