@@ -407,6 +407,8 @@ type logQueuer struct {
	loggerTTL time.Duration
	loggers   map[string]agentLoggerLifecycle
	logCache  logCache
+
+	retries map[string]*retryState
}

func (l *logQueuer) work(ctx context.Context) {
@@ -427,87 +429,120 @@ func (l *logQueuer) work(ctx context.Context) {
	}
}

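+// newLogger builds the per-agent logging pipeline: it registers the
+// Kubernetes log source, dials the agent RPC API, starts the send loop, and
+// returns a lifecycle whose close function flushes and tears everything down.
+// On failure it schedules a retry for the token and returns the error.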
+func (l *logQueuer) newLogger(ctx context.Context, log agentLog, queuedLogs []agentsdk.Log) (agentLoggerLifecycle, error) {
+	client := agentsdk.New(l.coderURL)
+	client.SetSessionToken(log.agentToken)
+	logger := l.logger.With(slog.F("resource_name", log.resourceName))
+	client.SDK.SetLogger(logger)
+
+	_, err := client.PostLogSource(ctx, agentsdk.PostLogSourceRequest{
+		ID:          sourceUUID,
+		Icon:        "/icon/k8s.png",
+		DisplayName: "Kubernetes",
+	})
+	if err != nil {
+		// This shouldn't fail sending the logs, as it only affects how they
+		// appear.
+		logger.Error(ctx, "post log source", slog.Error(err))
+		l.scheduleRetry(ctx, log.agentToken)
+		return agentLoggerLifecycle{}, err
+	}
+
+	ls := agentsdk.NewLogSender(logger)
+	sl := ls.GetScriptLogger(sourceUUID)
+
+	gracefulCtx, gracefulCancel := context.WithCancel(context.Background())
+
+	// Connect to the Agent v2.0 API, since we don't need features added
+	// later. This maximizes compatibility.
+	arpc, err := client.ConnectRPC20(gracefulCtx)
+	if err != nil {
+		logger.Error(ctx, "drpc connect", slog.Error(err))
+		gracefulCancel()
+		l.scheduleRetry(ctx, log.agentToken)
+		return agentLoggerLifecycle{}, err
+	}
+	go func() {
+		err := ls.SendLoop(gracefulCtx, arpc)
+		// If the send loop exits on its own without the context being
+		// canceled, time out the logger and force it to be recreated.
+		if err != nil && ctx.Err() == nil {
+			l.loggerTimeout(log.agentToken)
+		}
+	}()
+
+	closeTimer := l.clock.AfterFunc(l.loggerTTL, func() {
+		logger.Info(ctx, "logger timeout firing")
+		l.loggerTimeout(log.agentToken)
+	})
+	lifecycle := agentLoggerLifecycle{
+		scriptLogger: sl,
+		close: func() {
+			// We could be stopping for reasons other than the timeout. If
+			// so, stop the timer.
+			closeTimer.Stop()
+			defer gracefulCancel()
+			timeout := l.clock.AfterFunc(5*time.Second, gracefulCancel)
+			defer timeout.Stop()
+			logger.Info(ctx, "logger closing")
+
+			if err := sl.Flush(gracefulCtx); err != nil {
+				// Context canceled: the graceful shutdown window elapsed.
+				logger.Warn(gracefulCtx, "timeout reached while flushing")
+				return
+			}
+
+			if err := ls.WaitUntilEmpty(gracefulCtx); err != nil {
+				// Context canceled: the graceful shutdown window elapsed.
+				logger.Warn(gracefulCtx, "timeout reached while waiting for log queue to empty")
+			}
+
+			_ = arpc.DRPCConn().Close()
+			client.SDK.HTTPClient.CloseIdleConnections()
+		},
+	}
+	lifecycle.closeTimer = closeTimer
+	return lifecycle, nil
+}
+
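+// processLog queues the incoming log in the cache and flushes everything
+// cached for the agent, creating the logger on first use. On failure the
+// logs stay cached and a retry is scheduled for the token.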
func (l *logQueuer) processLog(ctx context.Context, log agentLog) {
	l.mu.Lock()
	defer l.mu.Unlock()
-	queuedLogs := l.logCache.push(log)
+
+	queuedLogs := l.logCache.get(log.agentToken)
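+	// An empty log is the wake-up enqueued by scheduleRetry rather than a
+	// real log line; it only matters if something is still cached for this
+	// token.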
+	if isAgentLogEmpty(log) {
+		if queuedLogs == nil {
+			return
+		}
+	} else {
+		queuedLogs = l.logCache.push(log)
+	}
+
	lgr, ok := l.loggers[log.agentToken]
	if !ok {
-		client := agentsdk.New(l.coderURL)
-		client.SetSessionToken(log.agentToken)
-		logger := l.logger.With(slog.F("resource_name", log.resourceName))
-		client.SDK.SetLogger(logger)
-
-		_, err := client.PostLogSource(ctx, agentsdk.PostLogSourceRequest{
-			ID:          sourceUUID,
-			Icon:        "/icon/k8s.png",
-			DisplayName: "Kubernetes",
-		})
-		if err != nil {
-			// This shouldn't fail sending the log, as it only affects how they
-			// appear.
-			logger.Error(ctx, "post log source", slog.Error(err))
+		// Skip if we're in a retry cooldown window.
+		if rs := l.retries[log.agentToken]; rs != nil && rs.timer != nil {
+			return
		}

-		ls := agentsdk.NewLogSender(logger)
-		sl := ls.GetScriptLogger(sourceUUID)
-
-		gracefulCtx, gracefulCancel := context.WithCancel(context.Background())
-
-		// connect to Agent v2.0 API, since we don't need features added later.
-		// This maximizes compatibility.
-		arpc, err := client.ConnectRPC20(gracefulCtx)
+		var err error
+		lgr, err = l.newLogger(ctx, log, queuedLogs)
		if err != nil {
-			logger.Error(ctx, "drpc connect", slog.Error(err))
-			gracefulCancel()
+			l.scheduleRetry(ctx, log.agentToken)
			return
		}
-		go func() {
-			err := ls.SendLoop(gracefulCtx, arpc)
-			// if the send loop exits on its own without the context
-			// canceling, timeout the logger and force it to recreate.
-			if err != nil && ctx.Err() == nil {
-				l.loggerTimeout(log.agentToken)
-			}
-		}()
-
-		closeTimer := l.clock.AfterFunc(l.loggerTTL, func() {
-			logger.Info(ctx, "logger timeout firing")
-			l.loggerTimeout(log.agentToken)
-		})
-		lifecycle := agentLoggerLifecycle{
-			scriptLogger: sl,
-			close: func() {
-				// We could be stopping for reasons other than the timeout. If
-				// so, stop the timer.
-				closeTimer.Stop()
-				defer gracefulCancel()
-				timeout := l.clock.AfterFunc(5*time.Second, gracefulCancel)
-				defer timeout.Stop()
-				logger.Info(ctx, "logger closing")
-
-				if err := sl.Flush(gracefulCtx); err != nil {
-					// ctx err
-					logger.Warn(gracefulCtx, "timeout reached while flushing")
-					return
-				}
-
-				if err := ls.WaitUntilEmpty(gracefulCtx); err != nil {
-					// ctx err
-					logger.Warn(gracefulCtx, "timeout reached while waiting for log queue to empty")
-				}
-
-				_ = arpc.DRPCConn().Close()
-				client.SDK.HTTPClient.CloseIdleConnections()
-			},
-		}
-		lifecycle.closeTimer = closeTimer
-		l.loggers[log.agentToken] = lifecycle
-		lgr = lifecycle
+		l.loggers[log.agentToken] = lgr
	}

	lgr.resetCloseTimer(l.loggerTTL)
-	_ = lgr.scriptLogger.Send(ctx, queuedLogs...)
+	if len(queuedLogs) == 0 {
+		return
+	}
+	if err := lgr.scriptLogger.Send(ctx, queuedLogs...); err != nil {
+		l.scheduleRetry(ctx, log.agentToken)
+		return
+	}
+	l.clearRetry(log.agentToken)
	l.logCache.delete(log.agentToken)
}
@@ -518,6 +553,8 @@ func (l *logQueuer) processDelete(log agentLog) {
		delete(l.loggers, log.agentToken)

	}
+	l.clearRetry(log.agentToken)
+	l.logCache.delete(log.agentToken)
	l.mu.Unlock()

	if ok {
@@ -549,6 +586,64 @@ func (l *agentLoggerLifecycle) resetCloseTimer(ttl time.Duration) {
	}
}

+// retryState tracks exponential backoff for an agent token.
+type retryState struct {
+	delay time.Duration
+	timer *quartz.Timer
+}
+
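+// ensureRetryMap lazily initializes the retries map so callers don't have to.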
+func (l *logQueuer) ensureRetryMap() {
+	if l.retries == nil {
+		l.retries = make(map[string]*retryState)
+	}
+}
+
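+// scheduleRetry schedules a wake-up for the token's cached logs using
+// exponential backoff: the delay starts at one second and doubles after each
+// attempt, capped at 30 seconds. It is a no-op if a retry is already pending.
+// Callers are expected to hold l.mu.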
+func (l *logQueuer) scheduleRetry(ctx context.Context, token string) {
+	l.ensureRetryMap()
+
+	rs := l.retries[token]
+	if rs == nil {
+		rs = &retryState{delay: time.Second}
+		l.retries[token] = rs
+	}
+
+	if rs.timer != nil {
+		return
+	}
+
+	if rs.delay < time.Second {
+		rs.delay = time.Second
+	} else if rs.delay > 30*time.Second {
+		rs.delay = 30 * time.Second
+	}
+
+	l.logger.Info(ctx, "scheduling retry", slog.F("delay", rs.delay.String()))
+
+	rs.timer = l.clock.AfterFunc(rs.delay, func() {
+		l.mu.Lock()
+		if cur := l.retries[token]; cur != nil {
+			cur.timer = nil
+		}
+		l.mu.Unlock()
+
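+		// Wake processLog up with an empty log for this token; it will
+		// re-send whatever is still cached.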
+		l.q <- agentLog{op: opLog, agentToken: token}
+	})
+
+	rs.delay *= 2
+	if rs.delay > 30*time.Second {
+		rs.delay = 30 * time.Second
+	}
+}
+
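+// clearRetry stops any pending retry timer and resets the backoff for the
+// token, so the next failure starts again at the minimum delay.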
+func (l *logQueuer) clearRetry(token string) {
+	if rs := l.retries[token]; rs != nil {
+		if rs.timer != nil {
+			rs.timer.Stop()
+		}
+		delete(l.retries, token)
+	}
+}
+

func newColor(value ...color.Attribute) *color.Color {
	c := color.New(value...)
	c.EnableColor()
@@ -572,3 +667,15 @@ func (l *logCache) push(log agentLog) []agentsdk.Log {
func (l *logCache) delete(token string) {
	delete(l.logs, token)
}
+
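+// get returns the logs currently cached for the token, or nil if none are
+// queued.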
+func (l *logCache) get(token string) []agentsdk.Log {
+	logs, ok := l.logs[token]
+	if !ok {
+		return nil
+	}
+	return logs
+}
+
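+// isAgentLogEmpty reports whether the log carries no payload, which is how a
+// retry wake-up from scheduleRetry looks to processLog.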
+func isAgentLogEmpty(log agentLog) bool {
+	return log.resourceName == "" && log.log.Output == "" && log.log.CreatedAt.IsZero()
+}