Skip to content

Commit eef93a0

Browse files
committed
ipn/proxies: retry getting missing proxies
When proxies are removed to be re-added, the exported ProxyTo/ProxyFor fns may return proxy-not-found errors, which may result in either dropped conn or worse use of local proxies (when not in proxy-lockdown mode). To avoid the subtle race, which apparently is reproducible every now & then in our lab test setting, add a retry to re-fetch missing proxies after waiting for a fixed 5sec period (for now), in the hope that re-add goes through by then and the proxy makes itself available.
1 parent ebc1b17 commit eef93a0

2 files changed

Lines changed: 50 additions & 12 deletions

File tree

intra/ipn/proxies.go

Lines changed: 49 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -138,15 +138,16 @@ var (
138138
const (
139139
udptimeoutsec = 5 * 60 // 5m
140140
tcptimeoutsec = (2 * 60 * 60) + (40 * 60) // 2h40m
141-
getproxytimeout = 5 * time.Second
142-
tlsHandshakeTimeout = 30 * time.Second // some proxies take a long time to handshake
141+
tlsHandshakeTimeout = 30 * time.Second // some proxies take a long time to handshake
143142
responseHeaderTimeout = 60 * time.Second
144143
tzzTimeout = 2 * time.Minute // time between new connections before proxies transition to idle
145144
lastOKThreshold = 10 * time.Minute // time between last OK and now before pinging & un-pinning
146145
pintimeout = 10 * time.Minute // time to keep a pin
147146
alwaysPin = true // always pin to a proxy no matter the errors
148147
maxFailingPinTrackTTl = 30 * time.Second // max period to track a failing to-be-pinned proxy
149148
maxStallPeriodSec = 10 // max duration to stall a failing proxy
149+
maxWaitPeriodSec = 5 // max duration to wait for a missing proxy to be added
150+
getproxytimeout = 5 * time.Second
150151
)
151152

152153
// type checks
@@ -449,6 +450,8 @@ func (px *proxifier) removeProxy(id string, force bool) bool {
449450
// May return both a Proxy and an error, in which case, the error
450451
// denotes that while the Proxy is not healthy, it is still registered.
451452
func (px *proxifier) ProxyTo(ipp netip.AddrPort, uid string, pids []string) (theone Proxy, err error) {
453+
waitedForMissingProxy := false
454+
452455
ippstr := ipp.String()
453456
e := func(err error) error {
454457
return fmt.Errorf("%v for %s to %s among %v", err, uid, ippstr, pids)
@@ -463,13 +466,23 @@ func (px *proxifier) ProxyTo(ipp netip.AddrPort, uid string, pids []string) (the
463466
stalledSec := uint32(0)
464467

465468
if len(pids) == 1 { // there's no other pid to choose from
469+
retryPin:
466470
p, err := px.pinID(uid, ipp, pids[0]) // repin
467471
if err != nil || p == nil {
468472
err = core.OneErr(err, errProxyNotFound)
469-
stalledSec = px.stall(uid + ippstr)
473+
if !waitedForMissingProxy {
474+
// wait for the missing proxy to be added before returning error
475+
waitedForMissingProxy = true
476+
stalledSec = px.stall(uid + ippstr)
477+
if stalledSec < maxWaitPeriodSec {
478+
time.Sleep(time.Duration(maxWaitPeriodSec-stalledSec) * time.Second)
479+
stalledSec = maxWaitPeriodSec
480+
}
481+
goto retryPin
482+
}
470483
}
471-
logev(err)("proxy: pin: %s+%s; pin pid0: %s (stalled? %ds); err? %v",
472-
uid, ippstr, pids[0], stalledSec, err)
484+
logev(err)("proxy: pin: %s+%s; pin pid0: %s (stalled? %ds / waited? %t); err? %v",
485+
uid, ippstr, pids[0], stalledSec, waitedForMissingProxy, err)
473486
if p != nil {
474487
if !hasroute(p, ippstr) {
475488
px.delpin(uid, ipp)
@@ -486,7 +499,6 @@ func (px *proxifier) ProxyTo(ipp netip.AddrPort, uid string, pids []string) (the
486499
}
487500

488501
var lopinned string
489-
var notok []Proxy
490502

491503
pinnedpid, pinok := px.getpin(uid, ipp)
492504
chosen := has(pids, pinnedpid)
@@ -516,6 +528,7 @@ func (px *proxifier) ProxyTo(ipp netip.AddrPort, uid string, pids []string) (the
516528
px.delpin(uid, ipp)
517529
}
518530

531+
var notok []Proxy
519532
notokproxies := make([]string, 0)
520533
endproxies := make([]string, 0)
521534
pausedproxies := make([]string, 0)
@@ -531,6 +544,7 @@ func (px *proxifier) ProxyTo(ipp netip.AddrPort, uid string, pids []string) (the
531544
uid, ipp, idstr(theone), stalledSec, loproxies, missproxies, notokproxies, norouteproxies, pausedproxies, endproxies)
532545
}()
533546

547+
retrySearch:
534548
for _, pid := range pids {
535549
if pinok && pid == pinnedpid { // already tried above
536550
continue
@@ -540,8 +554,9 @@ func (px *proxifier) ProxyTo(ipp netip.AddrPort, uid string, pids []string) (the
540554
continue // process later
541555
}
542556

543-
p, err := px.ProxyFor(pid)
557+
p, err := px.proxyFor(pid)
544558
if err != nil || p == nil { // proxy 404
559+
// TODO: errors.Is(err, errProxyNotFound)?
545560
missproxies = append(missproxies, pid)
546561
continue
547562
}
@@ -592,7 +607,20 @@ func (px *proxifier) ProxyTo(ipp netip.AddrPort, uid string, pids []string) (the
592607
missproxies = append(missproxies, pid)
593608
}
594609

595-
stalledSec = px.stall(uid + ippstr)
610+
if len(missproxies) > 0 && !waitedForMissingProxy {
611+
// wait for the missing proxy to be added before returning error
612+
waitedForMissingProxy = true
613+
stalledSec = px.stall(uid + ippstr)
614+
if stalledSec < maxWaitPeriodSec {
615+
time.Sleep(time.Duration(maxWaitPeriodSec-stalledSec) * time.Second)
616+
stalledSec = maxWaitPeriodSec
617+
}
618+
log.W("proxy: pin: %s+%s; missing: %v; notok: %v; noroute: %v; paused: %v; ended: %v; waited: %ds",
619+
uid, ippstr, missproxies, notokproxies, norouteproxies, pausedproxies, endproxies, stalledSec)
620+
pids = missproxies
621+
missproxies = make([]string, 0)
622+
goto retrySearch
623+
}
596624

597625
if len(notokproxies) > 0 {
598626
return nil, e(errNoProxyHealthy)
@@ -624,7 +652,7 @@ func (px *proxifier) stall(k string) (secs uint32) {
624652
}
625653

626654
func (px *proxifier) pinID(uid string, ipp netip.AddrPort, id string) (Proxy, error) {
627-
p, err := px.ProxyFor(id)
655+
p, err := px.proxyFor(id)
628656
if err != nil || p == nil {
629657
err = core.OneErr(err, errProxyNotFound)
630658
return p, fmt.Errorf("proxy: pin: id %s; err: %v", id, err)
@@ -673,10 +701,20 @@ func (px *proxifier) clearpins() (int, int) {
673701
// As a special case, if it takes longer than getproxytimeout, it returns an error.
674702
// ProxyFor implements Proxies.
675703
func (px *proxifier) ProxyFor(id string) (Proxy, error) {
676-
defer core.Recover(core.Exit11, "pxr.ProxyFor."+id)
704+
p, err := px.proxyFor(id)
705+
if errors.Is(err, errProxyNotFound) {
706+
log.W("proxy: for: %s; not found; waiting for %ds...", maxWaitPeriodSec, id)
707+
time.Sleep(maxWaitPeriodSec)
708+
}
709+
p, err = px.proxyFor(id)
710+
return p, err
711+
}
712+
713+
func (px *proxifier) proxyFor(id string) (Proxy, error) {
714+
defer core.Recover(core.Exit11, "pxr.proxyFor."+id)
677715

678716
if len(id) <= 0 {
679-
return nil, errProxyNotFound
717+
return nil, errMissingProxyID
680718
}
681719

682720
if immutable(id) { // fast path for immutable proxies

intra/ipn/proxy.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ func (pxr *proxifier) addProxy(id, txt string) (p Proxy, err error) {
193193
pxr.Lock()
194194
lp := pxr.lp
195195
pxr.Unlock()
196-
if p, _ = pxr.ProxyFor(id); p != nil {
196+
if p, _ = pxr.proxyFor(id); p != nil {
197197
if wgp, ok := p.(WgProxy); ok && wgp.update(id, txt) {
198198
newcfg, readd := wgp.OnProtoChange(lp)
199199
if readd || len(newcfg) > 0 {

0 commit comments

Comments
 (0)