Skip to content

Commit 27b1b1f

Browse files
bradfitzraggi
andcommitted
control/controlhttp: simplify, fix race dialing, remove priority concept
controlhttp has the responsibility of dialing a set of candidate control endpoints in a way that minimizes user facing latency. If one control endpoint is unavailable we promptly dial another, racing across the dimensions of: IPv6, IPv4, port 80, and port 443, over multiple server endpoints. In the case that the top priority endpoint was not available, the prior implementation would hang waiting for other results, so as to try to return the highest priority successful connection to the rest of the client code. This hang would take too long with a large dialplan and sufficient client to endpoint latency as to cause the server to timeout the connection due to inactivity in the intermediate state. Instead of trying to prioritize non-ideal candidate connections, the first successful connection is now used unconditionally, improving user facing latency and avoiding any delays that would encroach on the server-side timeout. The tests are converted to memnet and synctest, running on all platforms. Fixes tailscale#8442 Fixes tailscale/corp#32534 Co-authored-by: James Tucker <james@tailscale.com> Change-Id: I4eb57f046d8b40403220e40eb67a31c41adb3a38 Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com> Signed-off-by: James Tucker <james@tailscale.com> (cherry picked from commit db048e9)
1 parent c99901e commit 27b1b1f

File tree

4 files changed

+304
-323
lines changed

4 files changed

+304
-323
lines changed

cmd/tailscale/depaware.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ tailscale.com/cmd/tailscale dependencies: (generated by github.com/tailscale/dep
182182
tailscale.com/util/lineiter from tailscale.com/hostinfo+
183183
L tailscale.com/util/linuxfw from tailscale.com/net/netns
184184
tailscale.com/util/mak from tailscale.com/cmd/tailscale/cli+
185-
tailscale.com/util/multierr from tailscale.com/control/controlhttp+
185+
tailscale.com/util/multierr from tailscale.com/health+
186186
tailscale.com/util/must from tailscale.com/clientupdate/distsign+
187187
tailscale.com/util/nocasemaps from tailscale.com/types/ipproto
188188
tailscale.com/util/prompt from tailscale.com/cmd/tailscale/cli

control/controlhttp/client.go

Lines changed: 41 additions & 136 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,12 @@ import (
2626
"errors"
2727
"fmt"
2828
"io"
29-
"math"
3029
"net"
3130
"net/http"
3231
"net/http/httptrace"
3332
"net/netip"
3433
"net/url"
3534
"runtime"
36-
"sort"
3735
"sync/atomic"
3836
"time"
3937

@@ -51,7 +49,6 @@ import (
5149
"tailscale.com/syncs"
5250
"tailscale.com/tailcfg"
5351
"tailscale.com/tstime"
54-
"tailscale.com/util/multierr"
5552
)
5653

5754
var stdDialer net.Dialer
@@ -108,158 +105,63 @@ func (a *Dialer) dial(ctx context.Context) (*ClientConn, error) {
108105
}
109106
candidates := a.DialPlan.Candidates
110107

111-
// Otherwise, we try dialing per the plan. Store the highest priority
112-
// in the list, so that if we get a connection to one of those
113-
// candidates we can return quickly.
114-
var highestPriority int = math.MinInt
115-
for _, c := range candidates {
116-
if c.Priority > highestPriority {
117-
highestPriority = c.Priority
118-
}
119-
}
120-
121-
// This context allows us to cancel in-flight connections if we get a
122-
// highest-priority connection before we're all done.
108+
// Create a context to be canceled as we return, so once we get a good connection,
109+
// we can drop all the other ones.
123110
ctx, cancel := context.WithCancel(ctx)
124111
defer cancel()
125112

126113
// Now, for each candidate, kick off a dial in parallel.
127114
type dialResult struct {
128-
conn *ClientConn
129-
err error
130-
addr netip.Addr
131-
priority int
132-
}
133-
resultsCh := make(chan dialResult, len(candidates))
134-
135-
var pending atomic.Int32
136-
pending.Store(int32(len(candidates)))
137-
for _, c := range candidates {
138-
go func(ctx context.Context, c tailcfg.ControlIPCandidate) {
139-
var (
140-
conn *ClientConn
141-
err error
142-
)
143-
144-
// Always send results back to our channel.
145-
defer func() {
146-
resultsCh <- dialResult{conn, err, c.IP, c.Priority}
147-
if pending.Add(-1) == 0 {
148-
close(resultsCh)
149-
}
150-
}()
151-
152-
// If non-zero, wait the configured start timeout
153-
// before we do anything.
154-
if c.DialStartDelaySec > 0 {
155-
a.logf("[v2] controlhttp: waiting %.2f seconds before dialing %q @ %v", c.DialStartDelaySec, a.Hostname, c.IP)
156-
tmr, tmrChannel := a.clock().NewTimer(time.Duration(c.DialStartDelaySec * float64(time.Second)))
157-
defer tmr.Stop()
158-
select {
159-
case <-ctx.Done():
160-
err = ctx.Err()
161-
return
162-
case <-tmrChannel:
163-
}
164-
}
115+
conn *ClientConn
116+
err error
117+
}
118+
resultsCh := make(chan dialResult) // unbuffered, never closed
165119

166-
// Now, create a sub-context with the given timeout and
167-
// try dialing the provided host.
168-
ctx, cancel := context.WithTimeout(ctx, time.Duration(c.DialTimeoutSec*float64(time.Second)))
169-
defer cancel()
120+
dialCand := func(cand tailcfg.ControlIPCandidate) (*ClientConn, error) {
121+
a.logf("[v2] controlhttp: waited %.2f seconds, dialing %q @ %s", cand.DialStartDelaySec, a.Hostname, cand.IP.String())
170122

171-
// This will dial, and the defer above sends it back to our parent.
172-
a.logf("[v2] controlhttp: trying to dial %q @ %v", a.Hostname, c.IP)
173-
conn, err = a.dialHost(ctx, c.IP)
174-
}(ctx, c)
123+
ctx, cancel := context.WithTimeout(ctx, time.Duration(cand.DialTimeoutSec*float64(time.Second)))
124+
defer cancel()
125+
return a.dialHost(ctx, cand.IP)
175126
}
176127

177-
var results []dialResult
178-
for res := range resultsCh {
179-
// If we get a response that has the highest priority, we don't
180-
// need to wait for any of the other connections to finish; we
181-
// can just return this connection.
182-
//
183-
// TODO(andrew): we could make this better by keeping track of
184-
// the highest remaining priority dynamically, instead of just
185-
// checking for the highest total
186-
if res.priority == highestPriority && res.conn != nil {
187-
a.logf("[v1] controlhttp: high-priority success dialing %q @ %v from dial plan", a.Hostname, res.addr)
188-
189-
// Drain the channel and any existing connections in
190-
// the background.
128+
for _, cand := range candidates {
129+
timer := time.AfterFunc(time.Duration(cand.DialStartDelaySec*float64(time.Second)), func() {
191130
go func() {
192-
for _, res := range results {
193-
if res.conn != nil {
194-
res.conn.Close()
131+
conn, err := dialCand(cand)
132+
select {
133+
case resultsCh <- dialResult{conn, err}:
134+
if err == nil {
135+
a.logf("[v1] controlhttp: succeeded dialing %q @ %v from dial plan", a.Hostname, cand.IP.String())
195136
}
196-
}
197-
for res := range resultsCh {
198-
if res.conn != nil {
199-
res.conn.Close()
137+
case <-ctx.Done():
138+
if conn != nil {
139+
conn.Close()
200140
}
201141
}
202-
if a.drainFinished != nil {
203-
close(a.drainFinished)
204-
}
205142
}()
206-
return res.conn, nil
207-
}
208-
209-
// This isn't a highest-priority result, so just store it until
210-
// we're done.
211-
results = append(results, res)
143+
})
144+
defer timer.Stop()
212145
}
213146

214-
// After we finish this function, close any remaining open connections.
215-
defer func() {
216-
for _, result := range results {
217-
// Note: below, we nil out the returned connection (if
218-
// any) in the slice so we don't close it.
219-
if result.conn != nil {
220-
result.conn.Close()
147+
var errs []error
148+
for {
149+
select {
150+
case res := <-resultsCh:
151+
if res.err == nil {
152+
return res.conn, nil
221153
}
154+
errs = append(errs, res.err)
155+
if len(errs) == len(candidates) {
156+
// If we get here, then we didn't get anywhere with our dial plan; fall back to just using DNS.
157+
a.logf("controlhttp: failed dialing using DialPlan, falling back to DNS; errs=%s", errors.Join(errs...))
158+
return a.dialHost(ctx, netip.Addr{})
159+
}
160+
case <-ctx.Done():
161+
a.logf("controlhttp: context aborted dialing")
162+
return nil, ctx.Err()
222163
}
223-
224-
// We don't drain asynchronously after this point, so notify our
225-
// channel when we return.
226-
if a.drainFinished != nil {
227-
close(a.drainFinished)
228-
}
229-
}()
230-
231-
// Sort by priority, then take the first non-error response.
232-
sort.Slice(results, func(i, j int) bool {
233-
// NOTE: intentionally inverted so that the highest priority
234-
// item comes first
235-
return results[i].priority > results[j].priority
236-
})
237-
238-
var (
239-
conn *ClientConn
240-
errs []error
241-
)
242-
for i, result := range results {
243-
if result.err != nil {
244-
errs = append(errs, result.err)
245-
continue
246-
}
247-
248-
a.logf("[v1] controlhttp: succeeded dialing %q @ %v from dial plan", a.Hostname, result.addr)
249-
conn = result.conn
250-
results[i].conn = nil // so we don't close it in the defer
251-
return conn, nil
252164
}
253-
if ctx.Err() != nil {
254-
a.logf("controlhttp: context aborted dialing")
255-
return nil, ctx.Err()
256-
}
257-
258-
merr := multierr.New(errs...)
259-
260-
// If we get here, then we didn't get anywhere with our dial plan; fall back to just using DNS.
261-
a.logf("controlhttp: failed dialing using DialPlan, falling back to DNS; errs=%s", merr.Error())
262-
return a.dialHost(ctx, netip.Addr{})
263165
}
264166

265167
// The TS_FORCE_NOISE_443 envknob forces the controlclient noise dialer to
@@ -388,6 +290,9 @@ func (a *Dialer) dialHost(ctx context.Context, optAddr netip.Addr) (*ClientConn,
388290
}
389291

390292
var err80, err443 error
293+
if forceTLS {
294+
err80 = errors.New("TLS forced: no port 80 dialed")
295+
}
391296
for {
392297
select {
393298
case <-ctx.Done():

control/controlhttp/constants.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,6 @@ type Dialer struct {
9898
logPort80Failure atomic.Bool
9999

100100
// For tests only
101-
drainFinished chan struct{}
102101
omitCertErrorLogging bool
103102
testFallbackDelay time.Duration
104103

0 commit comments

Comments
 (0)