七叶笔记 » golang编程 » Golang的GC性能优化技巧

Golang的GC性能优化技巧

分类: golang编程 | 浏览: 567

slice 预先分配内存

当 slice 的容量小于 1024 时，每次扩容后的容量是原来的 2 倍；当容量不小于 1024 时，每次扩容后的容量约为原来的 1.25 倍。看下面的例子：

 // appendOne returns the slice [0, 1, ..., num-1], built by appending to a nil
// slice with no preallocated capacity, so append has to grow the backing
// array whenever it runs out of room. It returns nil when num <= 0.
func appendOne(num int) []int {
    var out []int
    next := 0
    for next < num {
        out = append(out, next)
        next++
    }
    return out
}

// appendMany returns the slice [0, 1, ..., num-1], reserving capacity for all
// num elements up front so that the appends never have to grow the backing
// array. The result is an empty, non-nil slice when num is 0; make panics if
// num is negative.
func appendMany(num int) []int {
    out := make([]int, 0, num)
    // The current length is always the next value to store.
    for len(out) < num {
        out = append(out, len(out))
    }
    return out
}
复制代码

函数 appendOne 没有指定初始容量大小， appendMany 指定了初始的容量大小。进行一下benchmark测试：

 // BenchmarkAppendOne measures appendOne building a 10000-element slice
// without a preallocated capacity.
func BenchmarkAppendOne(b *testing.B) {
    const num = 10000
    for remaining := b.N; remaining > 0; remaining-- {
        _ = appendOne(num)
    }
}

// BenchmarkAppendMany measures appendMany building a 10000-element slice
// whose capacity is reserved up front.
func BenchmarkAppendMany(b *testing.B) {
    const num = 10000
    done := 0
    for done < b.N {
        _ = appendMany(num)
        done++
    }
}
复制代码

运行测试

 $ go test -bench=. -benchmem                                                      
goos: darwin
goarch: amd64
pkg: com.learn/gormLearn/gc_gc
BenchmarkAppendOne-4               23163             50675 ns/op          386296 B/op         20 allocs/op
BenchmarkAppendMany-4              96781             12241 ns/op           81920 B/op          1 allocs/op
PASS
复制代码

可以看到， AppendMany 每次操作只进行 1 次内存分配，共分配 81920 B ，每次操作耗时 12241 ns ，这些指标都好于 AppendOne 。一次性分配好所需的内存后， slice 在追加元素时不需要因为扩容而重新分配内存，原有的底层数组可以一直复用，这显然减少了GC的压力。

同样地,新建 map 也可以指定大小。

 // makeMap creates a map[int]int with a size hint of num and stores the
// entries i -> i for every i in [0, num).
//
// The loop bound is num itself: the previous len(num) does not compile,
// because len cannot be applied to an int.
func makeMap(num int) {
    m := make(map[int]int, num)
    for i := 0; i < num; i++ {
        m[i] = i
    }
}
复制代码

这可以减少内存拷贝的开销，也可以减少rehash开销。

map中保存值，而不是指针，使用分段map

看下面的例子，map 分别保存指针和值。

 // timeGC forces a garbage collection with runtime.GC and reports how long
// that call took.
func timeGC() time.Duration {
    began := time.Now()
    runtime.GC()
    elapsed := time.Since(began)
    return elapsed
}

// mapPointer fills a map[int]*int with num entries, runs one GC cycle, and
// then prints the duration of a second GC cycle as measured by timeGC while
// the map is still reachable.
//
// Every value points at its own copy of the key. Storing &i directly would,
// before Go 1.22, put the address of the single shared loop variable into
// every entry, so the map would not reference num distinct integers and the
// result would depend on the Go version.
func mapPointer(num int) {
    m := make(map[int]*int, num)
    for i := 0; i < num; i++ {
        v := i // per-iteration copy so each entry gets a distinct pointee
        m[i] = &v
    }
    runtime.GC()
    fmt.Printf("With %T, GC took %s\n", m, timeGC())
    // Keep m reachable until after the timed collection has finished.
    runtime.KeepAlive(m)
}

// mapValue fills a map[int]int with num entries (key i -> value i), runs one
// GC cycle, and then prints the duration of a second GC cycle as measured by
// timeGC.
func mapValue(num int) {
    values := make(map[int]int, num)
    for key := 0; key < num; key++ {
        values[key] = key
    }

    runtime.GC()
    elapsed := timeGC()
    fmt.Printf("With %T, GC took %s\n", values, elapsed)

    // Reference the map after the timed collection.
    _ = values[0]
}

// mapPointerShard spreads num pointer-valued entries over shardCount maps
// (key i goes to shard i%shardCount), runs one GC cycle, and then prints the
// duration of a second GC cycle as measured by timeGC.
//
// Every value points at its own copy of the key. Storing &i directly would,
// before Go 1.22, put the address of the single shared loop variable into
// every entry, so the result would depend on the Go version.
func mapPointerShard(num int) {
    const shardCount = 100

    shards := make([]map[int]*int, shardCount)
    for i := range shards {
        shards[i] = make(map[int]*int)
    }
    for i := 0; i < num; i++ {
        v := i // per-iteration copy so each entry gets a distinct pointee
        shards[i%shardCount][i] = &v
    }
    runtime.GC()
    fmt.Printf("With map shards (%T), GC took %s\n", shards, timeGC())
    // Keep the shards reachable until after the timed collection has finished.
    runtime.KeepAlive(shards)
}

// mapValueShard spreads num int-valued entries over 100 maps (key i goes to
// shard i%100 with value i), runs one GC cycle, and then prints the duration
// of a second GC cycle as measured by timeGC.
func mapValueShard(num int) {
    const shardCount = 100

    shards := make([]map[int]int, 0, shardCount)
    for len(shards) < shardCount {
        shards = append(shards, make(map[int]int))
    }

    for key := 0; key < num; key++ {
        bucket := shards[key%shardCount]
        bucket[key] = key
    }

    runtime.GC()
    elapsed := timeGC()
    fmt.Printf("With map shards (%T), GC took %s\n", shards, elapsed)

    // Reference the shards after the timed collection.
    _ = shards[0][0]
}

// N is the number of map entries passed to the map GC benchmarks.
const N = 5e7 // 50 million

// BenchmarkMapPointer runs mapPointer with N entries; mapPointer itself prints
// the measured GC time.
//
// The call is repeated b.N times, as the testing package requires of every
// benchmark; the body previously ignored b.N.
func BenchmarkMapPointer(b *testing.B) {
    for i := 0; i < b.N; i++ {
        mapPointer(N)
    }
}

// BenchmarkMapValue runs mapValue with N entries; mapValue itself prints the
// measured GC time.
//
// The call is repeated b.N times, as the testing package requires of every
// benchmark; the body previously ignored b.N.
func BenchmarkMapValue(b *testing.B) {
    for i := 0; i < b.N; i++ {
        mapValue(N)
    }
}

// BenchmarkMapPointerShard runs mapPointerShard with N entries;
// mapPointerShard itself prints the measured GC time.
//
// The call is repeated b.N times, as the testing package requires of every
// benchmark; the body previously ignored b.N.
func BenchmarkMapPointerShard(b *testing.B) {
    for i := 0; i < b.N; i++ {
        mapPointerShard(N)
    }
}

// BenchmarkMapValueShard runs mapValueShard with N entries; mapValueShard
// itself prints the measured GC time.
//
// The call is repeated b.N times, as the testing package requires of every
// benchmark; the body previously ignored b.N.
func BenchmarkMapValueShard(b *testing.B) {
    for i := 0; i < b.N; i++ {
        mapValueShard(N)
    }
}
复制代码

运行

 $ go test -bench=^BenchmarkMapPointer$ -benchmem
With map[int]*int, GC took 545.139836ms
goos: darwin
goarch: amd64
pkg: com.learn/gormLearn/gc_gc
BenchmarkMapPointer-4                  1        9532798100 ns/op        1387850488 B/op   724960 allocs/op

$ go test -bench=^BenchmarkMapPointerShard$ -benchmem
With map shards ([]map[int]*int), GC took 688.39764ms
goos: darwin
goarch: amd64
pkg: com.learn/gormLearn/gc_gc
BenchmarkMapPointerShard-4             1        20670458639 ns/op       4286763416 B/op  1901279 allocs/op

$ go test -bench=^BenchmarkMapValueShard$ -benchmem
With map shards ([]map[int]int), GC took 1.965519ms
goos: darwin
goarch: amd64
pkg: com.learn/gormLearn/gc_gc
BenchmarkMapValueShard-4               1        16190847776 ns/op       4385268936 B/op  1918445 allocs/op

$ go test -bench=^BenchmarkMapValue$ -benchmem 
With map[int]int, GC took 22.993926ms
goos: darwin
goarch: amd64
pkg: com.learn/gormLearn/gc_gc
BenchmarkMapValue-4            1        8253025035 ns/op        1444338752 B/op   724512 allocs/op
复制代码

可以看到，使用分段的，保存值的map的GC耗时最小。加上 GODEBUG=gctrace=1 分析GC轨迹：

 $ GODEBUG=gctrace=1 go test -bench=^BenchmarkMapPointer$ -benchmem
...
gc 3 @0.130s 19%: 0.006+424+0.013 ms clock, 0.027+0.18/424/848+0.055 ms cpu, 1224->1224->1224 MB, 1225 MB goal, 4 P
gc 4 @9.410s 2%: 0.005+543+0.002 ms clock, 0.022+0/543/1628+0.011 ms cpu, 1325->1325->1323 MB, 2448 MB goal, 4 P (forced)
gc 5 @9.957s 3%: 0.003+547+0.003 ms clock, 0.013+0/547/1631+0.013 ms cpu, 1323->1323->1323 MB, 2647 MB goal, 4 P (forced)
With map[int]*int, GC took 550.40821ms

复制代码

为了理解打印的日志，我们要理解 gctrace , 以 0.013+0/547/1631+0.013 ms cpu 为例子，GC分为三个阶段。

Mark Prepare (STW) 。 0.013 表示标记准备阶段的全局暂停（stop the world）时间。
Marking 。 0/547/1631 ，其中 0 表示 mutator assist 的耗时， 547 表示后台（background）GC 标记的耗时， 1631 表示空闲（idle）GC 标记的耗时。
Mark Termination (STW) 。 0.013 表示标记结束阶段的全局暂停（stop the world）时间。

 $ GODEBUG=gctrace=1 go test -bench=^BenchmarkMapValue$ -benchmem
...
gc 3 @0.018s 0%: 0.005+0.14+0.015 ms clock, 0.021+0.054/0.020/0.19+0.060 ms cpu, 1224->1224->1224 MB, 1225 MB goal, 4 P
gc 4 @8.334s 0%: 0.006+21+0.003 ms clock, 0.027+0/6.4/21+0.013 ms cpu, 1379->1379->1334 MB, 2448 MB goal, 4 P (forced)
gc 5 @8.358s 0%: 0.003+19+0.003 ms clock, 0.014+0/5.0/20+0.015 ms cpu, 1334->1334->1334 MB, 2668 MB goal, 4 P (forced)
复制代码

可以看到， map 保存值比保存指针的耗时少，主要是在GC的标记阶段耗时更少。

string与[]byte的转换

Golang中， string 从设计上是不可变（immutable）的。因此, string 和 []byte 的类型转换，都是产生一份新的副本。

 // Example shows the ordinary string-to-[]byte conversion, which gives b its
// own copy of the bytes of s.
func Example() {
    s := "Hello,world"
    b := []byte(s)
    // b exists only for illustration; discard it explicitly, because Go
    // rejects a local variable that is declared and never used.
    _ = b
}
复制代码

如果确定转换的 string / []byte 不会被修改，可以进行直接的转换，这样不会生成原有变量的副本。新的变量共享底层的数据指针。

 // String2Bytes returns a []byte that shares the backing memory of s instead
// of copying it. The result has len and cap equal to len(s).
//
// The caller must never write to the returned slice: strings are immutable,
// and the bytes may live in read-only memory.
//
// The headers are only accessed through pointers to a real string and a real
// slice. Building a standalone reflect.SliceHeader value and converting it to
// a slice is the pattern the unsafe package documents as invalid, because the
// garbage collector does not treat the Data field of such a value as a
// reference. On Go 1.20+ the same result is available via
// unsafe.Slice(unsafe.StringData(s), len(s)).
func String2Bytes(s string) []byte {
    var b []byte
    src := (*reflect.StringHeader)(unsafe.Pointer(&s))
    dst := (*reflect.SliceHeader)(unsafe.Pointer(&b))
    dst.Data = src.Data
    dst.Len = src.Len
    dst.Cap = src.Len
    // s must stay reachable until b's header refers to the same memory.
    runtime.KeepAlive(s)
    return b
}

// Bytes2String returns a string that shares the backing memory of b instead
// of copying it. A nil or empty slice yields "".
//
// The caller must not modify b while the returned string is in use, because
// the string would change with it.
//
// The slice header is reinterpreted in place as a string header (the two
// share their leading data-pointer and length words). This replaces an
// incomplete statement that did not compile, and avoids building a standalone
// reflect.StringHeader value, whose Data field the garbage collector does not
// treat as a reference.
func Bytes2String(b []byte) string {
    return *(*string)(unsafe.Pointer(&b))
}
复制代码

函数返回值使用值，不使用指针

对占用空间少，频繁分配的函数，如果函数返回指针，会带来内存逃逸，使得原来可以分配在栈（stack）上的内存，需要分配在堆（heap）上。在栈上进行小对象拷贝的性能很好，比分配对象在堆上要好得多。看下面的例子，2个函数分别返回值，和指针。

 // S is a sample struct with three int64, three string and three float64
// fields; byCopy and byPointer return it by value and by pointer respectively.
type S struct {
    // Integer fields.
    a int64
    b int64
    c int64

    // String fields.
    d string
    e string
    f string

    // Floating-point fields.
    g float64
    h float64
    i float64
}

// byCopy returns a populated S by value. Field d is left at its zero value.
func byCopy() S {
    var s S
    s.a, s.b, s.c = 1, 1, 1
    s.e, s.f = "lyp", "lyp"
    s.g, s.h, s.i = 1.0, 1.0, 1.0
    return s
}

// byPointer returns a pointer to a newly populated S. Field d is left at its
// zero value.
func byPointer() *S {
    p := new(S)
    p.a, p.b, p.c = 1, 1, 1
    p.e, p.f = "lyp", "lyp"
    p.g, p.h, p.i = 1.0, 1.0, 1.0
    return p
}
复制代码

benchmark函数

 // BenchmarkMemoryStack measures byCopy, which returns S by value, while an
// execution trace is written to stack.out.
//
// Creating the file and starting the trace happen before b.ResetTimer, and
// the timer is stopped before the trace is flushed, so only the byCopy loop
// is timed. Setup failures are reported with b.Fatal instead of panic, so
// they fail this benchmark rather than crashing the test binary.
func BenchmarkMemoryStack(b *testing.B) {
    var s S

    f, err := os.Create("stack.out")
    if err != nil {
        b.Fatal(err)
    }
    defer f.Close()

    if err := trace.Start(f); err != nil {
        b.Fatal(err)
    }

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        s = byCopy()
    }
    b.StopTimer()

    trace.Stop()

    // Use the last result after the loop.
    _ = fmt.Sprintf("%v", s.a)
}

// BenchmarkMemoryHeap measures byPointer, which returns *S, while an
// execution trace is written to heap.out.
//
// Creating the file and starting the trace happen before b.ResetTimer, and
// the timer is stopped before the trace is flushed, so only the byPointer
// loop is timed. Setup failures are reported with b.Fatal instead of panic,
// so they fail this benchmark rather than crashing the test binary.
func BenchmarkMemoryHeap(b *testing.B) {
    var s *S

    f, err := os.Create("heap.out")
    if err != nil {
        b.Fatal(err)
    }
    defer f.Close()

    if err := trace.Start(f); err != nil {
        b.Fatal(err)
    }

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        s = byPointer()
    }
    b.StopTimer()

    trace.Stop()

    // Use the last result after the loop.
    _ = fmt.Sprintf("%v", s.a)
}
复制代码

运行

  go test ./... -bench=BenchmarkMemoryHeap -benchmem -run=^$ -count=10           
goos: darwin
goarch: amd64
pkg: com.learn/gormLearn/gc_gc
BenchmarkMemoryHeap-4           19625536                53.0 ns/op            96 B/op          1 allocs/op

go test ./... -bench=BenchmarkMemoryStack -benchmem -run=^$ -count=10                                   
goos: darwin
goarch: amd64
pkg: com.learn/gormLearn/gc_gc
BenchmarkMemoryStack-4          163253341                7.22 ns/op            0 B/op          0 allocs/op
复制代码

可以看到，栈分配(返回值)的耗时是 7.22 ns/op ,而堆分配（返回指针）的耗时是 53.0 ns/op 。

使用struct{}优化

Golang中，没有内置的集合(set)类型。如果要实现一个集合，可以使用 map ，并以 struct{} 作为值的类型。

 // assign builds a set of the integers in [0, num) as a map[int]bool, storing
// true for every member. The map is discarded when the function returns.
func assign(num int) {
    members := make(map[int]bool, num)
    key := 0
    for key < num {
        members[key] = true
        key++
    }
}

// assignStruct builds a set of the integers in [0, num) as a
// map[int]struct{}, storing the empty struct for every member. The map is
// discarded when the function returns.
func assignStruct(num int) {
    var present struct{}
    members := make(map[int]struct{}, num)
    for key := 0; key < num; key++ {
        members[key] = present
    }
}
复制代码

struct{} 经过编译器特殊优化，指向同一个内存地址(runtime.zerobase)，不占用空间。

GC分析的工具

go tool pprof
go tool trace
go build -gcflags="-m"
GODEBUG="gctrace=1"

Golang GC 技巧

七叶笔记

Golang的GC性能优化技巧

slice 预先分配内存

map中保存值，而不是指针，使用分段map

string与[]byte的转换

函数返回值使用值，不使用指针

使用struct{}优化

GC分析的工具

相关文章

更多编程视频和电子书关注公众号

最近发表

标签列表