Fix TestSnapshotV3RestoreMultiMemberAdd flakes (leaks)
- most important: unix's socket transport should not keep idle
connections. For top-level Transport we close them using:
f3c518025e/server/etcdserver/api/rafthttp/transport.go (L226)
but currently we don't have access to close them witing the nest (unix) transport. Short idle deadline is good enough.
- Use dialContext (instead of dial) to make sure context is passed down the stack
- Make sure Context is cancelled as soon as the operation is done in pipeline
- nit: use dedicated method to yeld goroutines.
Tested with:
```
d=$(date +"%Y%m%d_%H%M")
(cd tests && go test --timeout=60m ./integration/snapshot -run TestSnapshotV3RestoreMultiMemberAdd -v --count=180 2>&1 | tee log_${d}.log)
```
There were transports & cmux leaked:
```
leak.go:118: Test appears to have leaked a Transport:
internal/poll.runtime_pollWait(0x7f6c5c3784c8, 0x72, 0xffffffffffffffff)
/usr/lib/google-golang/src/runtime/netpoll.go:222 +0x55
internal/poll.(*pollDesc).wait(0xc003296298, 0x72, 0x0, 0x18, 0xffffffffffffffff)
/usr/lib/google-golang/src/internal/poll/fd_poll_runtime.go:87 +0x45
internal/poll.(*pollDesc).waitRead(...)
/usr/lib/google-golang/src/internal/poll/fd_poll_runtime.go:92
internal/poll.(*FD).Read(0xc003296280, 0xc0031f60a8, 0x18, 0x18, 0x0, 0x0, 0x0)
/usr/lib/google-golang/src/internal/poll/fd_unix.go:166 +0x1d5
net.(*netFD).Read(0xc003296280, 0xc0031f60a8, 0x18, 0x18, 0x18, 0xc0009056e2, 0x203000)
/usr/lib/google-golang/src/net/fd_posix.go:55 +0x4f
net.(*conn).Read(0xc000010258, 0xc0031f60a8, 0x18, 0x18, 0x0, 0x0, 0x0)
/usr/lib/google-golang/src/net/net.go:183 +0x91
github.com/soheilhy/cmux.(*bufferedReader).Read(0xc0003d24e0, 0xc0031f60a8, 0x18, 0x18, 0xc0003d24d0, 0xc0009056e2, 0xc000278400)
/home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/buffer.go:53 +0x12d
github.com/soheilhy/cmux.hasHTTP2Preface(0x1367e20, 0xc0003d24e0, 0x7f6c5c699f40)
/home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/matchers.go:195 +0x8a
github.com/soheilhy/cmux.matchersToMatchWriters.func1(0x7f6c5c699f40, 0xc000010258, 0x1367e20, 0xc0003d24e0, 0xc000010258)
/home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/cmux.go:128 +0x39
github.com/soheilhy/cmux.(*cMux).serve(0xc003228690, 0x138c410, 0xc000010258, 0xc00327f740, 0xc0059ba860)
/home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/cmux.go:192 +0x1e7
created by github.com/soheilhy/cmux.(*cMux).Serve
/home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/cmux.go:179 +0x191
internal/poll.runtime_pollWait(0x7f6c5c60f3f0, 0x72, 0xffffffffffffffff)
/usr/lib/google-golang/src/runtime/netpoll.go:222 +0x55
internal/poll.(*pollDesc).wait(0xc000d53018, 0x72, 0x1000, 0x1000, 0xffffffffffffffff)
/usr/lib/google-golang/src/internal/poll/fd_poll_runtime.go:87 +0x45
internal/poll.(*pollDesc).waitRead(...)
/usr/lib/google-golang/src/internal/poll/fd_poll_runtime.go:92
internal/poll.(*FD).Read(0xc000d53000, 0xc000cfd000, 0x1000, 0x1000, 0x0, 0x0, 0x0)
/usr/lib/google-golang/src/internal/poll/fd_unix.go:166 +0x1d5
net.(*netFD).Read(0xc000d53000, 0xc000cfd000, 0x1000, 0x1000, 0x3, 0x3, 0x1000000000001)
/usr/lib/google-golang/src/net/fd_posix.go:55 +0x4f
net.(*conn).Read(0xc00031a570, 0xc000cfd000, 0x1000, 0x1000, 0x0, 0x0, 0x0)
/usr/lib/google-golang/src/net/net.go:183 +0x91
net/http.(*persistConn).Read(0xc00093b320, 0xc000cfd000, 0x1000, 0x1000, 0x577750, 0x60, 0x0)
/usr/lib/google-golang/src/net/http/transport.go:1933 +0x77
bufio.(*Reader).fill(0xc005702fc0)
/usr/lib/google-golang/src/bufio/bufio.go:101 +0x108
bufio.(*Reader).Peek(0xc005702fc0, 0x1, 0xc00077c660, 0xc003b082a0, 0xc000d08de0, 0x5ae586, 0x11dd6c0)
/usr/lib/google-golang/src/bufio/bufio.go:139 +0x4f
net/http.(*persistConn).readLoop(0xc00093b320)
/usr/lib/google-golang/src/net/http/transport.go:2094 +0x1a8
created by net/http.(*Transport).dialConn
/usr/lib/google-golang/src/net/http/transport.go:1754 +0xdaa
net/http.(*persistConn).writeLoop(0xc00093b320)
/usr/lib/google-golang/src/net/http/transport.go:2393 +0xf7
created by net/http.(*Transport).dialConn
/usr/lib/google-golang/src/net/http/transport.go:1755 +0xdcf
sync.runtime_Semacquire(0xc0059ba868)
/usr/lib/google-golang/src/runtime/sema.go:56 +0x45
sync.(*WaitGroup).Wait(0xc0059ba860)
/usr/lib/google-golang/src/sync/waitgroup.go:130 +0x65
github.com/soheilhy/cmux.(*cMux).Serve.func1(0xc003228690, 0xc0059ba860)
/home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/cmux.go:158 +0x56
github.com/soheilhy/cmux.(*cMux).Serve(0xc003228690, 0x13698c0, 0xc00377a0f0)
/home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/cmux.go:173 +0x115
go.etcd.io/etcd/server/v3/embed.(*Etcd).servePeers.func1(0xc0007cc360, 0x122b75f)
/home/ptab/corp/etcd/server/embed/etcd.go:518 +0x2b9
go.etcd.io/etcd/server/v3/embed.(*Etcd).servePeers.func3(0xc00036d080, 0xc0059330a0)
/home/ptab/corp/etcd/server/embed/etcd.go:549 +0x182
created by go.etcd.io/etcd/server/v3/embed.(*Etcd).servePeers
/home/ptab/corp/etcd/server/embed/etcd.go:543 +0x73a
--- FAIL: TestSnapshotV3RestoreMultiMemberAdd (17.74s)
```
release-3.5
parent
f3c518025e
commit
17b982382e
|
@ -15,6 +15,7 @@
|
|||
package transport
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
@ -31,11 +32,11 @@ func NewTransport(info TLSInfo, dialtimeoutd time.Duration) (*http.Transport, er
|
|||
|
||||
t := &http.Transport{
|
||||
Proxy: http.ProxyFromEnvironment,
|
||||
Dial: (&net.Dialer{
|
||||
DialContext: (&net.Dialer{
|
||||
Timeout: dialtimeoutd,
|
||||
// value taken from http.DefaultTransport
|
||||
KeepAlive: 30 * time.Second,
|
||||
}).Dial,
|
||||
}).DialContext,
|
||||
// value taken from http.DefaultTransport
|
||||
TLSHandshakeTimeout: 10 * time.Second,
|
||||
TLSClientConfig: cfg,
|
||||
|
@ -45,15 +46,20 @@ func NewTransport(info TLSInfo, dialtimeoutd time.Duration) (*http.Transport, er
|
|||
Timeout: dialtimeoutd,
|
||||
KeepAlive: 30 * time.Second,
|
||||
}
|
||||
dial := func(net, addr string) (net.Conn, error) {
|
||||
return dialer.Dial("unix", addr)
|
||||
}
|
||||
|
||||
dialContext := func(ctx context.Context, net, addr string) (net.Conn, error) {
|
||||
return dialer.DialContext(ctx, "unix", addr)
|
||||
}
|
||||
tu := &http.Transport{
|
||||
Proxy: http.ProxyFromEnvironment,
|
||||
Dial: dial,
|
||||
DialContext: dialContext,
|
||||
TLSHandshakeTimeout: 10 * time.Second,
|
||||
TLSClientConfig: cfg,
|
||||
// Cost of reopening connection on sockets is low, and they are mostly used in testing.
|
||||
// Long living unix-transport connections were leading to 'leak' test flakes.
|
||||
// Alternativly the returned Transport (t) should override CloseIdleConnections to
|
||||
// forward it to 'tu' as well.
|
||||
IdleConnTimeout: time.Microsecond,
|
||||
}
|
||||
ut := &unixTransport{tu}
|
||||
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
package proxy
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
mrand "math/rand"
|
||||
|
@ -295,6 +296,7 @@ func (s *server) To() string {
|
|||
func (s *server) listenAndServe() {
|
||||
defer s.closeWg.Done()
|
||||
|
||||
ctx := context.Background()
|
||||
s.lg.Info("proxy is listening on", zap.String("from", s.From()))
|
||||
close(s.readyc)
|
||||
|
||||
|
@ -380,7 +382,7 @@ func (s *server) listenAndServe() {
|
|||
}
|
||||
continue
|
||||
}
|
||||
out, err = tp.Dial(s.to.Scheme, s.to.Host)
|
||||
out, err = tp.DialContext(ctx, s.to.Scheme, s.to.Host)
|
||||
} else {
|
||||
out, err = net.Dial(s.to.Scheme, s.to.Host)
|
||||
}
|
||||
|
|
|
@ -16,6 +16,7 @@ package proxy
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
|
@ -615,7 +616,7 @@ func send(t *testing.T, data []byte, scheme, addr string, tlsInfo transport.TLSI
|
|||
if terr != nil {
|
||||
t.Fatal(terr)
|
||||
}
|
||||
out, err = tp.Dial(scheme, addr)
|
||||
out, err = tp.DialContext(context.Background(), scheme, addr)
|
||||
} else {
|
||||
out, err = net.Dial(scheme, addr)
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@ import (
|
|||
"context"
|
||||
"errors"
|
||||
"io/ioutil"
|
||||
"runtime"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
|
@ -139,6 +140,7 @@ func (p *pipeline) post(data []byte) (err error) {
|
|||
go func() {
|
||||
select {
|
||||
case <-done:
|
||||
cancel()
|
||||
case <-p.stopc:
|
||||
waitSchedule()
|
||||
cancel()
|
||||
|
@ -173,4 +175,4 @@ func (p *pipeline) post(data []byte) (err error) {
|
|||
}
|
||||
|
||||
// waitSchedule waits other goroutines to be scheduled for a while
|
||||
func waitSchedule() { time.Sleep(time.Millisecond) }
|
||||
func waitSchedule() { runtime.Gosched() }
|
||||
|
|
Loading…
Reference in New Issue