etcd/etcdmain/etcd.go

// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package etcdmain

import (
	"crypto/tls"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net"
	"net/http"
	_ "net/http/pprof"
	"os"
	"path"
	"reflect"
	"runtime"
	"strings"
	"time"

	"github.com/coreos/etcd/discovery"
	"github.com/coreos/etcd/etcdserver"
	"github.com/coreos/etcd/etcdserver/api/v2http"
	"github.com/coreos/etcd/pkg/cors"
	"github.com/coreos/etcd/pkg/fileutil"
	pkgioutil "github.com/coreos/etcd/pkg/ioutil"
	"github.com/coreos/etcd/pkg/osutil"
	runtimeutil "github.com/coreos/etcd/pkg/runtime"
	"github.com/coreos/etcd/pkg/transport"
	"github.com/coreos/etcd/pkg/types"
	"github.com/coreos/etcd/proxy/httpproxy"
	"github.com/coreos/etcd/rafthttp"
	"github.com/coreos/etcd/version"
	"github.com/coreos/go-systemd/daemon"
	systemdutil "github.com/coreos/go-systemd/util"
	"github.com/coreos/pkg/capnslog"
	"github.com/prometheus/client_golang/prometheus"
)

type dirType string

var plog = capnslog.NewPackageLogger("github.com/coreos/etcd", "etcdmain")

const (
	// the owner can make/remove files inside the directory
	privateDirMode = 0700

	// internal fd usage includes disk usage and transport usage.
	// To read/write a snapshot, the snap pkg needs 1. In the normal case, the wal
	// pkg needs at most 2 to read/lock/write WALs. One case where it needs 2 is to
	// read all logs after some snapshot index, which are located at the end of the
	// second-to-last WAL and the head of the last one. For purging, it needs 1 to
	// read the directory. For the fd monitor, it needs 1.
	// For transport, rafthttp builds two long-polling connections and at most
	// four temporary connections with each member. There are at most 9 members
	// in a cluster, so it should reserve 96.
	// For safety, we set the total reserved number to 150.
	reservedInternalFDNum = 150
)

var (
	dirMember = dirType("member")
	dirProxy  = dirType("proxy")
	dirEmpty  = dirType("empty")
)
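
// startEtcdOrProxyV2 parses the command-line flags and then starts this
// process either as an etcd member or as a v2 proxy, depending on what the
// existing data directory contains and on the proxy-related configuration.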
func startEtcdOrProxyV2() {
	cfg := NewConfig()
	err := cfg.Parse(os.Args[1:])
	if err != nil {
		plog.Errorf("error verifying flags, %v. See 'etcd --help'.", err)
		switch err {
		case errUnsetAdvertiseClientURLsFlag:
			plog.Errorf("When listening on specific address(es), this etcd process must advertise accessible url(s) to each connected client.")
		}
		os.Exit(1)
	}
	setupLogging(cfg)

	var stopped <-chan struct{}

	plog.Infof("etcd Version: %s\n", version.Version)
	plog.Infof("Git SHA: %s\n", version.GitSHA)
	plog.Infof("Go Version: %s\n", runtime.Version())
	plog.Infof("Go OS/Arch: %s/%s\n", runtime.GOOS, runtime.GOARCH)

	GoMaxProcs := runtime.GOMAXPROCS(0)
	plog.Infof("setting maximum number of CPUs to %d, total number of available CPUs is %d", GoMaxProcs, runtime.NumCPU())

	// TODO: check whether fields are set instead of whether fields have default value
	if cfg.Name != defaultName && cfg.InitialCluster == initialClusterFromName(defaultName) {
		cfg.InitialCluster = initialClusterFromName(cfg.Name)
	}

	if cfg.Dir == "" {
		cfg.Dir = fmt.Sprintf("%v.etcd", cfg.Name)
		plog.Warningf("no data-dir provided, using default data-dir ./%s", cfg.Dir)
	}

	which := identifyDataDirOrDie(cfg.Dir)
	if which != dirEmpty {
		plog.Noticef("the server is already initialized as %v before, starting as etcd %v...", which, which)
		switch which {
		case dirMember:
			stopped, err = startEtcd(cfg)
		case dirProxy:
			err = startProxy(cfg)
		default:
			plog.Panicf("unhandled dir type %v", which)
		}
	} else {
		shouldProxy := cfg.isProxy()
		if !shouldProxy {
			stopped, err = startEtcd(cfg)
			if derr, ok := err.(*etcdserver.DiscoveryError); ok && derr.Err == discovery.ErrFullCluster {
				if cfg.shouldFallbackToProxy() {
					plog.Noticef("discovery cluster full, falling back to %s", fallbackFlagProxy)
					shouldProxy = true
				}
			}
		}
		if shouldProxy {
			err = startProxy(cfg)
		}
	}

	if err != nil {
		if derr, ok := err.(*etcdserver.DiscoveryError); ok {
			switch derr.Err {
			case discovery.ErrDuplicateID:
				plog.Errorf("member %q has previously registered with discovery service token (%s).", cfg.Name, cfg.Durl)
				plog.Errorf("But etcd could not find valid cluster configuration in the given data dir (%s).", cfg.Dir)
				plog.Infof("Please check the given data dir path if the previous bootstrap succeeded")
				plog.Infof("or use a new discovery token if the previous bootstrap failed.")
			case discovery.ErrDuplicateName:
				plog.Errorf("member with duplicated name has registered with discovery service token(%s).", cfg.Durl)
				plog.Errorf("please check (cURL) the discovery token for more information.")
				plog.Errorf("please do not reuse the discovery token and generate a new one to bootstrap the cluster.")
			default:
				plog.Errorf("%v", err)
				plog.Infof("discovery token %s was used, but failed to bootstrap the cluster.", cfg.Durl)
				plog.Infof("please generate a new discovery token and try to bootstrap again.")
			}
			os.Exit(1)
		}

		if strings.Contains(err.Error(), "include") && strings.Contains(err.Error(), "--initial-cluster") {
			plog.Infof("%v", err)
			if cfg.InitialCluster == initialClusterFromName(cfg.Name) {
				plog.Infof("forgot to set --initial-cluster flag?")
			}
			if types.URLs(cfg.apurls).String() == defaultInitialAdvertisePeerURLs {
				plog.Infof("forgot to set --initial-advertise-peer-urls flag?")
			}
			if cfg.InitialCluster == initialClusterFromName(cfg.Name) && len(cfg.Durl) == 0 {
				plog.Infof("if you want to use discovery service, please set --discovery flag.")
			}
			os.Exit(1)
		}
		plog.Fatalf("%v", err)
	}

	osutil.HandleInterrupts()

	if systemdutil.IsRunningSystemd() {
		// At this point, the initialization of etcd is done.
		// The listeners are listening on the TCP ports and ready
		// for accepting connections. The etcd instance should be
		// joined with the cluster and ready to serve incoming
		// connections.
		err := daemon.SdNotify("READY=1")
		if err != nil {
			plog.Errorf("failed to notify systemd for readiness: %v", err)
			if err == daemon.SdNotifyNoSocket {
				plog.Errorf("forgot to set Type=notify in systemd service file?")
			}
		}
	}

	<-stopped
	osutil.Exit(0)
}

// startEtcd launches the etcd server and HTTP handlers for client/server communication.
func startEtcd(cfg *config) (<-chan struct{}, error) {
	urlsmap, token, err := getPeerURLsMapAndToken(cfg, "etcd")
	if err != nil {
		return nil, fmt.Errorf("error setting up initial cluster: %v", err)
	}
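
	// If peer auto TLS is enabled and no peer certs were supplied, generate a
	// self-signed certificate covering every host in cfg.lpurls (the
	// listen-peer URLs).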
	if cfg.PeerAutoTLS && cfg.peerTLSInfo.Empty() {
		var phosts []string
		for _, u := range cfg.lpurls {
			phosts = append(phosts, u.Host)
		}
		cfg.peerTLSInfo, err = transport.SelfCert(path.Join(cfg.Dir, "fixtures/peer"), phosts)
		if err != nil {
			plog.Fatalf("could not get certs (%v)", err)
		}
	} else if cfg.PeerAutoTLS {
		plog.Warningf("ignoring peer auto TLS since certs given")
	}

	if !cfg.peerTLSInfo.Empty() {
		plog.Infof("peerTLS: %s", cfg.peerTLSInfo)
	}
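
	// Open a raft transport listener for each listen-peer URL; on a later
	// error the deferred closers below shut these listeners down again.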
	var plns []net.Listener
	for _, u := range cfg.lpurls {
		if u.Scheme == "http" {
			if !cfg.peerTLSInfo.Empty() {
				plog.Warningf("The scheme of peer url %s is HTTP while peer key/cert files are presented. Ignored peer key/cert files.", u.String())
			}
			if cfg.peerTLSInfo.ClientCertAuth {
				plog.Warningf("The scheme of peer url %s is HTTP while client cert auth (--peer-client-cert-auth) is enabled. Ignored client cert auth for this url.", u.String())
			}
		}
		var (
			l      net.Listener
			tlscfg *tls.Config
		)
		if !cfg.peerTLSInfo.Empty() {
			tlscfg, err = cfg.peerTLSInfo.ServerConfig()
			if err != nil {
				return nil, err
			}
		}
		l, err = rafthttp.NewListener(u, tlscfg)
		if err != nil {
			return nil, err
		}

		urlStr := u.String()
		plog.Info("listening for peers on ", urlStr)
		defer func() {
			if err != nil {
				l.Close()
				plog.Info("stopping listening for peers on ", urlStr)
			}
		}()
		plns = append(plns, l)
	}
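
	// As with the peers, client auto TLS generates a self-signed certificate
	// when no client certs have been configured.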
	if cfg.ClientAutoTLS && cfg.clientTLSInfo.Empty() {
		var chosts []string
		for _, u := range cfg.lcurls {
			chosts = append(chosts, u.Host)
		}
		cfg.clientTLSInfo, err = transport.SelfCert(path.Join(cfg.Dir, "fixtures/client"), chosts)
		if err != nil {
			plog.Fatalf("could not get certs (%v)", err)
		}
	} else if cfg.ClientAutoTLS {
		plog.Warningf("ignoring client auto TLS since certs given")
	}

	var ctlscfg *tls.Config
	if !cfg.clientTLSInfo.Empty() {
		plog.Infof("clientTLS: %s", cfg.clientTLSInfo)
		ctlscfg, err = cfg.clientTLSInfo.ServerConfig()
		if err != nil {
			return nil, err
		}
	}
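
	// Build one serveCtx per listen-client host; URLs that share a host are
	// merged so a single listener can serve both secure and insecure schemes.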
	sctxs := make(map[string]*serveCtx)
	for _, u := range cfg.lcurls {
		if u.Scheme == "http" {
			if !cfg.clientTLSInfo.Empty() {
				plog.Warningf("The scheme of client url %s is HTTP while peer key/cert files are presented. Ignored key/cert files.", u.String())
			}
			if cfg.clientTLSInfo.ClientCertAuth {
				plog.Warningf("The scheme of client url %s is HTTP while client cert auth (--client-cert-auth) is enabled. Ignored client cert auth for this url.", u.String())
			}
		}
		if u.Scheme == "https" && ctlscfg == nil {
			return nil, fmt.Errorf("TLS key/cert (--cert-file, --key-file) must be provided for client url %s with HTTPs scheme", u.String())
		}

		ctx := &serveCtx{host: u.Host}
		if u.Scheme == "https" {
			ctx.secure = true
		} else {
			ctx.insecure = true
		}

		if sctxs[u.Host] != nil {
			if ctx.secure {
				sctxs[u.Host].secure = true
			}
			if ctx.insecure {
				sctxs[u.Host].insecure = true
			}
			continue
		}

		var l net.Listener
		l, err = net.Listen("tcp", u.Host)
		if err != nil {
			return nil, err
		}
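
		// Limit the number of accepted client connections so that internal
		// users of file descriptors (WAL, snapshots, rafthttp, fd monitor)
		// always keep reservedInternalFDNum descriptors available.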
		var fdLimit uint64
		if fdLimit, err = runtimeutil.FDLimit(); err == nil {
			if fdLimit <= reservedInternalFDNum {
				plog.Fatalf("file descriptor limit[%d] of etcd process is too low, and should be set higher than %d to ensure internal usage", fdLimit, reservedInternalFDNum)
			}
			l = transport.LimitListener(l, int(fdLimit-reservedInternalFDNum))
		}

		l, err = transport.NewKeepAliveListener(l, "tcp", nil)
		ctx.l = l
		if err != nil {
			return nil, err
		}

		plog.Info("listening for client requests on ", u.Host)
		defer func() {
			if err != nil {
				l.Close()
				plog.Info("stopping listening for client requests on ", u.Host)
			}
		}()
		sctxs[u.Host] = ctx
	}

	srvcfg := &etcdserver.ServerConfig{
		Name:                    cfg.Name,
		ClientURLs:              cfg.acurls,
		PeerURLs:                cfg.apurls,
		DataDir:                 cfg.Dir,
		DedicatedWALDir:         cfg.WalDir,
		SnapCount:               cfg.SnapCount,
		MaxSnapFiles:            cfg.MaxSnapFiles,
		MaxWALFiles:             cfg.MaxWalFiles,
		InitialPeerURLsMap:      urlsmap,
		InitialClusterToken:     token,
		DiscoveryURL:            cfg.Durl,
		DiscoveryProxy:          cfg.Dproxy,
		NewCluster:              cfg.isNewCluster(),
		ForceNewCluster:         cfg.ForceNewCluster,
		PeerTLSInfo:             cfg.peerTLSInfo,
		TickMs:                  cfg.TickMs,
		ElectionTicks:           cfg.electionTicks(),
		AutoCompactionRetention: cfg.autoCompactionRetention,
		QuotaBackendBytes:       cfg.QuotaBackendBytes,
		StrictReconfigCheck:     cfg.StrictReconfigCheck,
		EnablePprof:             cfg.enablePprof,
	}

	var s *etcdserver.EtcdServer
	s, err = etcdserver.NewServer(srvcfg)
	if err != nil {
		return nil, err
	}
	s.Start()
	osutil.RegisterInterruptHandler(s.Stop)

	if cfg.corsInfo.String() != "" {
		plog.Infof("cors = %s", cfg.corsInfo)
	}
	ch := http.Handler(&cors.CORSHandler{
		Handler: v2http.NewClientHandler(s, srvcfg.ReqTimeout()),
		Info:    cfg.corsInfo,
	})
	ph := v2http.NewPeerHandler(s)

	// Start the peer server in a goroutine
	for _, l := range plns {
		go func(l net.Listener) {
			plog.Fatal(servePeerHTTP(l, ph))
		}(l)
	}
	// Start a client server goroutine for each listen address
	for _, sctx := range sctxs {
		go func(sctx *serveCtx) {
			// read timeout does not work with http close notify
			// TODO: https://github.com/golang/go/issues/9524
			plog.Fatal(serve(sctx, s, ctlscfg, ch))
		}(sctx)
	}
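
	// Wait until the server signals readiness via ReadyNotify, then hand the
	// stop channel back to the caller.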
	<-s.ReadyNotify()
	return s.StopNotify(), nil
}

// startProxy launches an HTTP proxy for client communication which proxies to other etcd nodes.
func startProxy(cfg *config) error {
	pt, err := transport.NewTimeoutTransport(cfg.peerTLSInfo, time.Duration(cfg.ProxyDialTimeoutMs)*time.Millisecond, time.Duration(cfg.ProxyReadTimeoutMs)*time.Millisecond, time.Duration(cfg.ProxyWriteTimeoutMs)*time.Millisecond)
	if err != nil {
		return err
	}
	pt.MaxIdleConnsPerHost = httpproxy.DefaultMaxIdleConnsPerHost

	tr, err := transport.NewTimeoutTransport(cfg.peerTLSInfo, time.Duration(cfg.ProxyDialTimeoutMs)*time.Millisecond, time.Duration(cfg.ProxyReadTimeoutMs)*time.Millisecond, time.Duration(cfg.ProxyWriteTimeoutMs)*time.Millisecond)
	if err != nil {
		return err
	}

	cfg.Dir = path.Join(cfg.Dir, "proxy")
	err = os.MkdirAll(cfg.Dir, privateDirMode)
	if err != nil {
		return err
	}
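
	// The proxy keeps the cluster's peer URLs in a "cluster" file inside its
	// data dir; when that file exists, it wins over the discovery token and
	// DNS SRV settings.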
	var peerURLs []string
	clusterfile := path.Join(cfg.Dir, "cluster")

	b, err := ioutil.ReadFile(clusterfile)
	switch {
	case err == nil:
		if cfg.Durl != "" {
			plog.Warningf("discovery token ignored since the proxy has already been initialized. Valid cluster file found at %q", clusterfile)
		}
		if cfg.DnsCluster != "" {
			plog.Warningf("DNS SRV discovery ignored since the proxy has already been initialized. Valid cluster file found at %q", clusterfile)
		}
		urls := struct{ PeerURLs []string }{}
		err = json.Unmarshal(b, &urls)
		if err != nil {
			return err
		}
		peerURLs = urls.PeerURLs
		plog.Infof("proxy: using peer urls %v from cluster file %q", peerURLs, clusterfile)
	case os.IsNotExist(err):
		var urlsmap types.URLsMap
		urlsmap, _, err = getPeerURLsMapAndToken(cfg, "proxy")
		if err != nil {
			return fmt.Errorf("error setting up initial cluster: %v", err)
		}

		if cfg.Durl != "" {
			var s string
			s, err = discovery.GetCluster(cfg.Durl, cfg.Dproxy)
			if err != nil {
				return err
			}
			if urlsmap, err = types.NewURLsMap(s); err != nil {
				return err
			}
		}
		peerURLs = urlsmap.URLs()
		plog.Infof("proxy: using peer urls %v ", peerURLs)
	default:
		return err
	}
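
	// uf is the proxy's refresh function: it re-reads cluster membership from
	// the current peers, rewrites the cluster file, and returns the latest
	// client URLs to forward requests to.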
	clientURLs := []string{}
	uf := func() []string {
		gcls, gerr := etcdserver.GetClusterFromRemotePeers(peerURLs, tr)
		// TODO: remove the 2nd check when we fix GetClusterFromRemotePeers
		// GetClusterFromRemotePeers should not return nil error with an invalid empty cluster
		if gerr != nil {
			plog.Warningf("proxy: %v", gerr)
			return []string{}
		}
		if len(gcls.Members()) == 0 {
			return clientURLs
		}
		clientURLs = gcls.ClientURLs()

		urls := struct{ PeerURLs []string }{gcls.PeerURLs()}
		b, jerr := json.Marshal(urls)
		if jerr != nil {
			plog.Warningf("proxy: error on marshal peer urls %s", jerr)
			return clientURLs
		}

		err = pkgioutil.WriteAndSyncFile(clusterfile+".bak", b, 0600)
		if err != nil {
			plog.Warningf("proxy: error on writing urls %s", err)
			return clientURLs
		}
		err = os.Rename(clusterfile+".bak", clusterfile)
		if err != nil {
			plog.Warningf("proxy: error on updating clusterfile %s", err)
			return clientURLs
		}
		if !reflect.DeepEqual(gcls.PeerURLs(), peerURLs) {
			plog.Noticef("proxy: updated peer urls in cluster file from %v to %v", peerURLs, gcls.PeerURLs())
		}
		peerURLs = gcls.PeerURLs()

		return clientURLs
	}

	ph := httpproxy.NewHandler(pt, uf, time.Duration(cfg.ProxyFailureWaitMs)*time.Millisecond, time.Duration(cfg.ProxyRefreshIntervalMs)*time.Millisecond)
	ph = &cors.CORSHandler{
		Handler: ph,
		Info:    cfg.corsInfo,
	}

	if cfg.isReadonlyProxy() {
		ph = httpproxy.NewReadonlyHandler(ph)
	}
	// Start a proxy server goroutine for each listen address
	for _, u := range cfg.lcurls {
		var (
			l      net.Listener
			tlscfg *tls.Config
		)
		if !cfg.clientTLSInfo.Empty() {
			tlscfg, err = cfg.clientTLSInfo.ServerConfig()
			if err != nil {
				return err
			}
		}

		l, err := transport.NewListener(u.Host, u.Scheme, tlscfg)
		if err != nil {
			return err
		}

		host := u.String()
		go func() {
			plog.Info("proxy: listening for client requests on ", host)
			mux := http.NewServeMux()
			mux.Handle("/metrics", prometheus.Handler())
			mux.Handle("/", ph)
			plog.Fatal(http.Serve(l, mux))
		}()
	}
	return nil
}

// getPeerURLsMapAndToken sets up an initial peer URLsMap and cluster token for bootstrap or discovery.
func getPeerURLsMapAndToken(cfg *config, which string) (urlsmap types.URLsMap, token string, err error) {
	switch {
	case cfg.Durl != "":
		urlsmap = types.URLsMap{}
		// If using discovery, generate a temporary cluster based on
		// self's advertised peer URLs
		urlsmap[cfg.Name] = cfg.apurls
		token = cfg.Durl
	case cfg.DnsCluster != "":
		var clusterStr string
		clusterStr, token, err = discovery.SRVGetCluster(cfg.Name, cfg.DnsCluster, cfg.InitialClusterToken, cfg.apurls)
		if err != nil {
			return nil, "", err
		}
		urlsmap, err = types.NewURLsMap(clusterStr)
		// only etcd member must belong to the discovered cluster.
		// proxy does not need to belong to the discovered cluster.
		if which == "etcd" {
			if _, ok := urlsmap[cfg.Name]; !ok {
				return nil, "", fmt.Errorf("cannot find local etcd member %q in SRV records", cfg.Name)
			}
		}
	default:
		// We're statically configured, and cluster has appropriately been set.
		urlsmap, err = types.NewURLsMap(cfg.InitialCluster)
		token = cfg.InitialClusterToken
	}
	return urlsmap, token, err
}

// identifyDataDirOrDie returns the type of the data dir.
// Dies if the datadir is invalid.
func identifyDataDirOrDie(dir string) dirType {
	names, err := fileutil.ReadDir(dir)
	if err != nil {
		if os.IsNotExist(err) {
			return dirEmpty
		}
		plog.Fatalf("error listing data dir: %s", dir)
	}

	var m, p bool
	for _, name := range names {
		switch dirType(name) {
		case dirMember:
			m = true
		case dirProxy:
			p = true
		default:
			plog.Warningf("found invalid file/dir %s under data dir %s (Ignore this if you are upgrading etcd)", name, dir)
		}
	}

	if m && p {
		plog.Fatal("invalid datadir. Both member and proxy directories exist.")
	}
	if m {
		return dirMember
	}
	if p {
		return dirProxy
	}
	return dirEmpty
}
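
// setupLogging configures the global capnslog level (INFO, or DEBUG when
// cfg.Debug is set) and any per-package levels given in cfg.LogPkgLevels.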
func setupLogging(cfg *config) {
	capnslog.SetGlobalLogLevel(capnslog.INFO)
	if cfg.Debug {
		capnslog.SetGlobalLogLevel(capnslog.DEBUG)
	}
	if cfg.LogPkgLevels != "" {
		repoLog := capnslog.MustRepoLogger("github.com/coreos/etcd")
		settings, err := repoLog.ParseLogLevelConfig(cfg.LogPkgLevels)
		if err != nil {
			plog.Warningf("couldn't parse log level string: %s, continuing with default levels", err.Error())
			return
		}
		repoLog.SetLogLevel(settings)
	}
}
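
// checkSupportArch exits the process on architectures other than amd64,
// unless ETCD_UNSUPPORTED_ARCH is set to the running architecture.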
func checkSupportArch() {
	// TODO qualify arm64
	if runtime.GOARCH == "amd64" {
		return
	}
	if env, ok := os.LookupEnv("ETCD_UNSUPPORTED_ARCH"); ok && env == runtime.GOARCH {
		plog.Warningf("running etcd on unsupported architecture %q since ETCD_UNSUPPORTED_ARCH is set", env)
		return
	}
	plog.Errorf("etcd on unsupported platform without ETCD_UNSUPPORTED_ARCH=%s set.", runtime.GOARCH)
	os.Exit(1)
}