Fix TestSnapshotV3RestoreMultiMemberAdd flakes (leaks)

- most important: unix's socket transport should not keep idle
connections. For top-level Transport we close them using:
    f3c518025e/server/etcdserver/api/rafthttp/transport.go (L226)
    but currently we don't have access to close them witing the nest (unix) transport. Short idle deadline is good enough.

  - Use dialContext (instead of dial) to make sure context is passed down the stack
  - Make sure Context is cancelled as soon as the operation is done in pipeline
  - nit: use dedicated method to yeld goroutines.

Tested with:
```
d=$(date +"%Y%m%d_%H%M")
(cd tests && go test --timeout=60m ./integration/snapshot -run TestSnapshotV3RestoreMultiMemberAdd -v --count=180 2>&1 | tee log_${d}.log)
```

There were transports & cmux leaked:

```
   leak.go:118: Test appears to have leaked a Transport:
        internal/poll.runtime_pollWait(0x7f6c5c3784c8, 0x72, 0xffffffffffffffff)
        	/usr/lib/google-golang/src/runtime/netpoll.go:222 +0x55
        internal/poll.(*pollDesc).wait(0xc003296298, 0x72, 0x0, 0x18, 0xffffffffffffffff)
        	/usr/lib/google-golang/src/internal/poll/fd_poll_runtime.go:87 +0x45
        internal/poll.(*pollDesc).waitRead(...)
        	/usr/lib/google-golang/src/internal/poll/fd_poll_runtime.go:92
        internal/poll.(*FD).Read(0xc003296280, 0xc0031f60a8, 0x18, 0x18, 0x0, 0x0, 0x0)
        	/usr/lib/google-golang/src/internal/poll/fd_unix.go:166 +0x1d5
        net.(*netFD).Read(0xc003296280, 0xc0031f60a8, 0x18, 0x18, 0x18, 0xc0009056e2, 0x203000)
        	/usr/lib/google-golang/src/net/fd_posix.go:55 +0x4f
        net.(*conn).Read(0xc000010258, 0xc0031f60a8, 0x18, 0x18, 0x0, 0x0, 0x0)
        	/usr/lib/google-golang/src/net/net.go:183 +0x91
        github.com/soheilhy/cmux.(*bufferedReader).Read(0xc0003d24e0, 0xc0031f60a8, 0x18, 0x18, 0xc0003d24d0, 0xc0009056e2, 0xc000278400)
        	/home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/buffer.go:53 +0x12d
        github.com/soheilhy/cmux.hasHTTP2Preface(0x1367e20, 0xc0003d24e0, 0x7f6c5c699f40)
        	/home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/matchers.go:195 +0x8a
        github.com/soheilhy/cmux.matchersToMatchWriters.func1(0x7f6c5c699f40, 0xc000010258, 0x1367e20, 0xc0003d24e0, 0xc000010258)
        	/home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/cmux.go:128 +0x39
        github.com/soheilhy/cmux.(*cMux).serve(0xc003228690, 0x138c410, 0xc000010258, 0xc00327f740, 0xc0059ba860)
        	/home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/cmux.go:192 +0x1e7
        created by github.com/soheilhy/cmux.(*cMux).Serve
        	/home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/cmux.go:179 +0x191

        internal/poll.runtime_pollWait(0x7f6c5c60f3f0, 0x72, 0xffffffffffffffff)
        	/usr/lib/google-golang/src/runtime/netpoll.go:222 +0x55
        internal/poll.(*pollDesc).wait(0xc000d53018, 0x72, 0x1000, 0x1000, 0xffffffffffffffff)
        	/usr/lib/google-golang/src/internal/poll/fd_poll_runtime.go:87 +0x45
        internal/poll.(*pollDesc).waitRead(...)
        	/usr/lib/google-golang/src/internal/poll/fd_poll_runtime.go:92
        internal/poll.(*FD).Read(0xc000d53000, 0xc000cfd000, 0x1000, 0x1000, 0x0, 0x0, 0x0)
        	/usr/lib/google-golang/src/internal/poll/fd_unix.go:166 +0x1d5
        net.(*netFD).Read(0xc000d53000, 0xc000cfd000, 0x1000, 0x1000, 0x3, 0x3, 0x1000000000001)
        	/usr/lib/google-golang/src/net/fd_posix.go:55 +0x4f
        net.(*conn).Read(0xc00031a570, 0xc000cfd000, 0x1000, 0x1000, 0x0, 0x0, 0x0)
        	/usr/lib/google-golang/src/net/net.go:183 +0x91
        net/http.(*persistConn).Read(0xc00093b320, 0xc000cfd000, 0x1000, 0x1000, 0x577750, 0x60, 0x0)
        	/usr/lib/google-golang/src/net/http/transport.go:1933 +0x77
        bufio.(*Reader).fill(0xc005702fc0)
        	/usr/lib/google-golang/src/bufio/bufio.go:101 +0x108
        bufio.(*Reader).Peek(0xc005702fc0, 0x1, 0xc00077c660, 0xc003b082a0, 0xc000d08de0, 0x5ae586, 0x11dd6c0)
        	/usr/lib/google-golang/src/bufio/bufio.go:139 +0x4f
        net/http.(*persistConn).readLoop(0xc00093b320)
        	/usr/lib/google-golang/src/net/http/transport.go:2094 +0x1a8
        created by net/http.(*Transport).dialConn
        	/usr/lib/google-golang/src/net/http/transport.go:1754 +0xdaa

        net/http.(*persistConn).writeLoop(0xc00093b320)
        	/usr/lib/google-golang/src/net/http/transport.go:2393 +0xf7
        created by net/http.(*Transport).dialConn
        	/usr/lib/google-golang/src/net/http/transport.go:1755 +0xdcf

        sync.runtime_Semacquire(0xc0059ba868)
        	/usr/lib/google-golang/src/runtime/sema.go:56 +0x45
        sync.(*WaitGroup).Wait(0xc0059ba860)
        	/usr/lib/google-golang/src/sync/waitgroup.go:130 +0x65
        github.com/soheilhy/cmux.(*cMux).Serve.func1(0xc003228690, 0xc0059ba860)
        	/home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/cmux.go:158 +0x56
        github.com/soheilhy/cmux.(*cMux).Serve(0xc003228690, 0x13698c0, 0xc00377a0f0)
        	/home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/cmux.go:173 +0x115
        go.etcd.io/etcd/server/v3/embed.(*Etcd).servePeers.func1(0xc0007cc360, 0x122b75f)
        	/home/ptab/corp/etcd/server/embed/etcd.go:518 +0x2b9
        go.etcd.io/etcd/server/v3/embed.(*Etcd).servePeers.func3(0xc00036d080, 0xc0059330a0)
        	/home/ptab/corp/etcd/server/embed/etcd.go:549 +0x182
        created by go.etcd.io/etcd/server/v3/embed.(*Etcd).servePeers
        	/home/ptab/corp/etcd/server/embed/etcd.go:543 +0x73a
--- FAIL: TestSnapshotV3RestoreMultiMemberAdd (17.74s)
```
release-3.5
Piotr Tabor 2021-04-16 18:55:56 +02:00
parent f3c518025e
commit 17b982382e
4 changed files with 20 additions and 9 deletions

View File

@ -15,6 +15,7 @@
package transport
import (
"context"
"net"
"net/http"
"strings"
@ -31,11 +32,11 @@ func NewTransport(info TLSInfo, dialtimeoutd time.Duration) (*http.Transport, er
t := &http.Transport{
Proxy: http.ProxyFromEnvironment,
Dial: (&net.Dialer{
DialContext: (&net.Dialer{
Timeout: dialtimeoutd,
// value taken from http.DefaultTransport
KeepAlive: 30 * time.Second,
}).Dial,
}).DialContext,
// value taken from http.DefaultTransport
TLSHandshakeTimeout: 10 * time.Second,
TLSClientConfig: cfg,
@ -45,15 +46,20 @@ func NewTransport(info TLSInfo, dialtimeoutd time.Duration) (*http.Transport, er
Timeout: dialtimeoutd,
KeepAlive: 30 * time.Second,
}
dial := func(net, addr string) (net.Conn, error) {
return dialer.Dial("unix", addr)
}
dialContext := func(ctx context.Context, net, addr string) (net.Conn, error) {
return dialer.DialContext(ctx, "unix", addr)
}
tu := &http.Transport{
Proxy: http.ProxyFromEnvironment,
Dial: dial,
DialContext: dialContext,
TLSHandshakeTimeout: 10 * time.Second,
TLSClientConfig: cfg,
// Cost of reopening connection on sockets is low, and they are mostly used in testing.
// Long living unix-transport connections were leading to 'leak' test flakes.
// Alternativly the returned Transport (t) should override CloseIdleConnections to
// forward it to 'tu' as well.
IdleConnTimeout: time.Microsecond,
}
ut := &unixTransport{tu}

View File

@ -15,6 +15,7 @@
package proxy
import (
"context"
"fmt"
"io"
mrand "math/rand"
@ -295,6 +296,7 @@ func (s *server) To() string {
func (s *server) listenAndServe() {
defer s.closeWg.Done()
ctx := context.Background()
s.lg.Info("proxy is listening on", zap.String("from", s.From()))
close(s.readyc)
@ -380,7 +382,7 @@ func (s *server) listenAndServe() {
}
continue
}
out, err = tp.Dial(s.to.Scheme, s.to.Host)
out, err = tp.DialContext(ctx, s.to.Scheme, s.to.Host)
} else {
out, err = net.Dial(s.to.Scheme, s.to.Host)
}

View File

@ -16,6 +16,7 @@ package proxy
import (
"bytes"
"context"
"crypto/tls"
"fmt"
"io/ioutil"
@ -615,7 +616,7 @@ func send(t *testing.T, data []byte, scheme, addr string, tlsInfo transport.TLSI
if terr != nil {
t.Fatal(terr)
}
out, err = tp.Dial(scheme, addr)
out, err = tp.DialContext(context.Background(), scheme, addr)
} else {
out, err = net.Dial(scheme, addr)
}

View File

@ -19,6 +19,7 @@ import (
"context"
"errors"
"io/ioutil"
"runtime"
"sync"
"time"
@ -139,6 +140,7 @@ func (p *pipeline) post(data []byte) (err error) {
go func() {
select {
case <-done:
cancel()
case <-p.stopc:
waitSchedule()
cancel()
@ -173,4 +175,4 @@ func (p *pipeline) post(data []byte) (err error) {
}
// waitSchedule waits other goroutines to be scheduled for a while
func waitSchedule() { time.Sleep(time.Millisecond) }
func waitSchedule() { runtime.Gosched() }