-
Notifications
You must be signed in to change notification settings - Fork 0
feat(reliability): add retry governor controls and telemetry #40
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -44,6 +44,7 @@ import { | |
| getFastSessionMaxInputItems, | ||
| getRateLimitToastDebounceMs, | ||
| getRetryAllAccountsMaxRetries, | ||
| getRetryAllAccountsAbsoluteCeilingMs, | ||
| getRetryAllAccountsMaxWaitMs, | ||
| getRetryAllAccountsRateLimited, | ||
| getFallbackToGpt52OnUnsupportedGpt53, | ||
|
|
@@ -156,6 +157,10 @@ import { | |
| } from "./lib/request/rate-limit-backoff.js"; | ||
| import { isEmptyResponse } from "./lib/request/response-handler.js"; | ||
| import { addJitter } from "./lib/rotation.js"; | ||
| import { | ||
| decideRetryAllAccountsRateLimited, | ||
| type RetryAllAccountsRateLimitDecisionReason, | ||
| } from "./lib/request/retry-governor.js"; | ||
| import { SessionAffinityStore } from "./lib/session-affinity.js"; | ||
| import { LiveAccountSync } from "./lib/live-account-sync.js"; | ||
| import { RefreshGuardian } from "./lib/refresh-guardian.js"; | ||
|
|
@@ -344,6 +349,9 @@ export const OpenAIOAuthPlugin: Plugin = async ({ client }: PluginInput) => { | |
| streamFailoverAttempts: number; | ||
| streamFailoverRecoveries: number; | ||
| streamFailoverCrossAccountRecoveries: number; | ||
| retryGovernorStopsWaitExceedsMax: number; | ||
| retryGovernorStopsRetryLimitReached: number; | ||
| retryGovernorStopsAbsoluteCeilingExceeded: number; | ||
| cumulativeLatencyMs: number; | ||
| lastRequestAt: number | null; | ||
| lastError: string | null; | ||
|
|
@@ -365,11 +373,32 @@ export const OpenAIOAuthPlugin: Plugin = async ({ client }: PluginInput) => { | |
| streamFailoverAttempts: 0, | ||
| streamFailoverRecoveries: 0, | ||
| streamFailoverCrossAccountRecoveries: 0, | ||
| retryGovernorStopsWaitExceedsMax: 0, | ||
| retryGovernorStopsRetryLimitReached: 0, | ||
| retryGovernorStopsAbsoluteCeilingExceeded: 0, | ||
| cumulativeLatencyMs: 0, | ||
| lastRequestAt: null, | ||
| lastError: null, | ||
| }; | ||
|
|
||
| const recordRetryGovernorStopReason = ( | ||
| reason: RetryAllAccountsRateLimitDecisionReason, | ||
| ): void => { | ||
| switch (reason) { | ||
| case "wait-exceeds-max": | ||
| runtimeMetrics.retryGovernorStopsWaitExceedsMax += 1; | ||
| return; | ||
| case "retry-limit-reached": | ||
| runtimeMetrics.retryGovernorStopsRetryLimitReached += 1; | ||
| return; | ||
| case "absolute-ceiling-exceeded": | ||
| runtimeMetrics.retryGovernorStopsAbsoluteCeilingExceeded += 1; | ||
| return; | ||
| default: | ||
| return; | ||
| } | ||
| }; | ||
|
|
||
| type TokenSuccess = Extract<TokenResult, { type: "success" }>; | ||
| type TokenSuccessWithAccount = TokenSuccess & { | ||
| accountIdOverride?: string; | ||
|
|
@@ -1124,6 +1153,8 @@ export const OpenAIOAuthPlugin: Plugin = async ({ client }: PluginInput) => { | |
| const retryAllAccountsRateLimited = getRetryAllAccountsRateLimited(pluginConfig); | ||
| const retryAllAccountsMaxWaitMs = getRetryAllAccountsMaxWaitMs(pluginConfig); | ||
| const retryAllAccountsMaxRetries = getRetryAllAccountsMaxRetries(pluginConfig); | ||
| const retryAllAccountsAbsoluteCeilingMs = | ||
| getRetryAllAccountsAbsoluteCeilingMs(pluginConfig); | ||
| const unsupportedCodexPolicy = getUnsupportedCodexPolicy(pluginConfig); | ||
| const fallbackOnUnsupportedCodexModel = unsupportedCodexPolicy === "fallback"; | ||
| const fallbackToGpt52OnUnsupportedGpt53 = | ||
|
|
@@ -1397,6 +1428,7 @@ export const OpenAIOAuthPlugin: Plugin = async ({ client }: PluginInput) => { | |
| }; | ||
|
|
||
| let allRateLimitedRetries = 0; | ||
| let accumulatedAllRateLimitedWaitMs = 0; | ||
| let emptyResponseRetries = 0; | ||
| const attemptedUnsupportedFallbackModels = new Set<string>(); | ||
| if (model) { | ||
|
|
@@ -2368,20 +2400,53 @@ while (attempted.size < Math.max(1, accountCount)) { | |
|
|
||
| const waitMs = accountManager.getMinWaitTimeForFamily(modelFamily, model); | ||
| const count = accountManager.getAccountCount(); | ||
| const jitteredWaitMs = waitMs > 0 ? addJitter(waitMs, 0.2) : 0; | ||
| const plannedWaitMs = | ||
| retryAllAccountsAbsoluteCeilingMs > 0 | ||
| ? Math.min( | ||
| jitteredWaitMs, | ||
| Math.max( | ||
| 0, | ||
| retryAllAccountsAbsoluteCeilingMs - accumulatedAllRateLimitedWaitMs, | ||
| ), | ||
| ) | ||
| : jitteredWaitMs; | ||
|
Comment on lines
+2404
to
+2413
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The pre-capping of Fix: Remove the pre-capping logic and let the retry governor make the full decision: const jitteredWaitMs = waitMs > 0 ? addJitter(waitMs, 0.2) : 0;
const retryDecision = decideRetryAllAccountsRateLimited({
enabled: retryAllAccountsRateLimited,
accountCount: count,
waitMs: jitteredWaitMs, // Pass jittered wait, not pre-capped
maxWaitMs: retryAllAccountsMaxWaitMs,
currentRetryCount: allRateLimitedRetries,
maxRetries: retryAllAccountsMaxRetries,
accumulatedWaitMs: accumulatedAllRateLimitedWaitMs,
absoluteCeilingMs: retryAllAccountsAbsoluteCeilingMs,
});
if (retryDecision.shouldRetry) {
// Cap the actual wait time here after the decision
const plannedWaitMs = retryAllAccountsAbsoluteCeilingMs > 0
? Math.min(jitteredWaitMs, Math.max(0, retryAllAccountsAbsoluteCeilingMs - accumulatedAllRateLimitedWaitMs))
: jitteredWaitMs;
await sleepWithCountdown(plannedWaitMs, countdownMessage);
// ... rest of the logic
}Spotted by Graphite |
||
| const retryDecision = decideRetryAllAccountsRateLimited({ | ||
| enabled: retryAllAccountsRateLimited, | ||
| accountCount: count, | ||
| waitMs: plannedWaitMs, | ||
| maxWaitMs: retryAllAccountsMaxWaitMs, | ||
| currentRetryCount: allRateLimitedRetries, | ||
| maxRetries: retryAllAccountsMaxRetries, | ||
| accumulatedWaitMs: accumulatedAllRateLimitedWaitMs, | ||
| absoluteCeilingMs: retryAllAccountsAbsoluteCeilingMs, | ||
| }); | ||
|
Comment on lines
+2414
to
+2423
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
the old guard was with this also makes the the raw Prompt To Fix With AIThis is a comment left during a code review.
Path: index.ts
Line: 2414-2423
Comment:
**`wait-exceeds-max` check is now jitter-aware, breaking determinism near the threshold**
the old guard was `waitMs <= retryAllAccountsMaxWaitMs` where `waitMs` was the raw, pre-jitter value from `accountManager.getMinWaitTimeForFamily`. the new code passes `plannedWaitMs` (jitter already applied) to the governor, so the check becomes:
```
jitteredWaitMs > maxWaitMs
```
with `addJitter(waitMs, 0.2)` producing a value in `[0.8×waitMs, 1.2×waitMs]`, a raw account wait of exactly `maxWaitMs` now has a ~50% chance of being rejected by `wait-exceeds-max` depending on `Math.random()`. an operator who sets `CODEX_AUTH_RETRY_ALL_MAX_WAIT_MS = 5000` expecting retries to proceed whenever the account's raw wait is ≤ 5000 ms will see roughly half those retries blocked non-deterministically.
this also makes the `retryGovernorStopsWaitExceedsMax` counter hard to interpret — it inflates under high jitter even without any actual threshold breach.
the raw `waitMs` is already logged (`logDebug` on line 2441) but isn't what the governor evaluates. passing the raw value as `maxWaitMs` input and keeping `plannedWaitMs` only for the actual sleep would restore the original deterministic semantics.
How can I resolve this? If you propose a fix, please make it concise. |
||
|
|
||
| if ( | ||
| retryAllAccountsRateLimited && | ||
| count > 0 && | ||
| waitMs > 0 && | ||
| (retryAllAccountsMaxWaitMs === 0 || | ||
| waitMs <= retryAllAccountsMaxWaitMs) && | ||
| allRateLimitedRetries < retryAllAccountsMaxRetries | ||
| ) { | ||
| if (retryDecision.shouldRetry) { | ||
| const countdownMessage = `All ${count} account(s) rate-limited. Waiting`; | ||
| await sleepWithCountdown(addJitter(waitMs, 0.2), countdownMessage); | ||
| await sleepWithCountdown(plannedWaitMs, countdownMessage); | ||
| allRateLimitedRetries++; | ||
| accumulatedAllRateLimitedWaitMs += plannedWaitMs; | ||
| continue; | ||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
| recordRetryGovernorStopReason(retryDecision.reason); | ||
| if ( | ||
| retryDecision.reason !== "disabled" && | ||
| retryDecision.reason !== "no-accounts" && | ||
| retryDecision.reason !== "no-wait" | ||
| ) { | ||
| logDebug("Retry governor blocked all-rate-limited retry", { | ||
| reason: retryDecision.reason, | ||
| accountCount: count, | ||
| waitMs, | ||
| plannedWaitMs, | ||
| retryCount: allRateLimitedRetries, | ||
| accumulatedWaitMs: accumulatedAllRateLimitedWaitMs, | ||
| maxWaitMs: retryAllAccountsMaxWaitMs, | ||
| maxRetries: retryAllAccountsMaxRetries, | ||
| absoluteCeilingMs: retryAllAccountsAbsoluteCeilingMs, | ||
| }); | ||
| } | ||
|
|
||
| const waitLabel = waitMs > 0 ? formatWaitTime(waitMs) : "a bit"; | ||
| const message = | ||
|
|
@@ -3763,6 +3828,9 @@ while (attempted.size < Math.max(1, accountCount)) { | |
| `Stream failover attempts: ${runtimeMetrics.streamFailoverAttempts}`, | ||
| `Stream failover recoveries: ${runtimeMetrics.streamFailoverRecoveries}`, | ||
| `Stream failover cross-account recoveries: ${runtimeMetrics.streamFailoverCrossAccountRecoveries}`, | ||
| `Retry governor stops (wait>max): ${runtimeMetrics.retryGovernorStopsWaitExceedsMax}`, | ||
| `Retry governor stops (retry limit): ${runtimeMetrics.retryGovernorStopsRetryLimitReached}`, | ||
| `Retry governor stops (absolute ceiling): ${runtimeMetrics.retryGovernorStopsAbsoluteCeilingExceeded}`, | ||
| `Empty-response retries: ${runtimeMetrics.emptyResponseRetries}`, | ||
| `Session affinity entries: ${sessionAffinityEntries}`, | ||
| `Live sync: ${liveSyncSnapshot?.running ? "on" : "off"} (${liveSyncSnapshot?.reloadCount ?? 0} reloads)`, | ||
|
|
@@ -3798,6 +3866,24 @@ while (attempted.size < Math.max(1, accountCount)) { | |
| String(runtimeMetrics.streamFailoverCrossAccountRecoveries), | ||
| "accent", | ||
| ), | ||
| formatUiKeyValue( | ||
| ui, | ||
| "Retry governor stops (wait>max)", | ||
| String(runtimeMetrics.retryGovernorStopsWaitExceedsMax), | ||
| "warning", | ||
| ), | ||
| formatUiKeyValue( | ||
| ui, | ||
| "Retry governor stops (retry limit)", | ||
| String(runtimeMetrics.retryGovernorStopsRetryLimitReached), | ||
| "warning", | ||
| ), | ||
| formatUiKeyValue( | ||
| ui, | ||
| "Retry governor stops (absolute ceiling)", | ||
| String(runtimeMetrics.retryGovernorStopsAbsoluteCeilingExceeded), | ||
| "warning", | ||
| ), | ||
| formatUiKeyValue(ui, "Empty-response retries", String(runtimeMetrics.emptyResponseRetries), "warning"), | ||
| formatUiKeyValue(ui, "Session affinity entries", String(sessionAffinityEntries), "muted"), | ||
| formatUiKeyValue( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,72 @@ | ||
| export interface RetryAllAccountsRateLimitDecisionInput { | ||
| enabled: boolean; | ||
| accountCount: number; | ||
| waitMs: number; | ||
| maxWaitMs: number; | ||
| currentRetryCount: number; | ||
| maxRetries: number; | ||
| accumulatedWaitMs: number; | ||
| absoluteCeilingMs: number; | ||
| } | ||
|
|
||
| export type RetryAllAccountsRateLimitDecisionReason = | ||
| | "allowed" | ||
| | "disabled" | ||
| | "no-accounts" | ||
| | "no-wait" | ||
| | "wait-exceeds-max" | ||
| | "retry-limit-reached" | ||
| | "absolute-ceiling-exceeded"; | ||
|
|
||
| export interface RetryAllAccountsRateLimitDecision { | ||
| shouldRetry: boolean; | ||
| reason: RetryAllAccountsRateLimitDecisionReason; | ||
| } | ||
|
|
||
| function clampNonNegative(value: number): number { | ||
| if (!Number.isFinite(value)) return 0; | ||
| return Math.max(0, Math.floor(value)); | ||
| } | ||
|
|
||
| function normalizeRetryLimit(value: number): number { | ||
| if (!Number.isFinite(value)) return Number.POSITIVE_INFINITY; | ||
| return clampNonNegative(value); | ||
| } | ||
|
|
||
| /** | ||
| * Decide whether "retry all accounts when rate-limited" should run for the current loop. | ||
| * | ||
| * This helper is pure and deterministic so retry behavior can be tested without | ||
| * exercising the full request pipeline. | ||
| */ | ||
| export function decideRetryAllAccountsRateLimited( | ||
| input: RetryAllAccountsRateLimitDecisionInput, | ||
| ): RetryAllAccountsRateLimitDecision { | ||
| const accountCount = clampNonNegative(input.accountCount); | ||
| const waitMs = clampNonNegative(input.waitMs); | ||
| const maxWaitMs = clampNonNegative(input.maxWaitMs); | ||
| const currentRetryCount = clampNonNegative(input.currentRetryCount); | ||
| const maxRetries = normalizeRetryLimit(input.maxRetries); | ||
| const accumulatedWaitMs = clampNonNegative(input.accumulatedWaitMs); | ||
| const absoluteCeilingMs = clampNonNegative(input.absoluteCeilingMs); | ||
|
|
||
| if (!input.enabled) { | ||
| return { shouldRetry: false, reason: "disabled" }; | ||
| } | ||
| if (accountCount === 0) { | ||
| return { shouldRetry: false, reason: "no-accounts" }; | ||
| } | ||
| if (waitMs === 0) { | ||
| return { shouldRetry: false, reason: "no-wait" }; | ||
| } | ||
| if (maxWaitMs > 0 && waitMs > maxWaitMs) { | ||
| return { shouldRetry: false, reason: "wait-exceeds-max" }; | ||
| } | ||
| if (currentRetryCount >= maxRetries) { | ||
| return { shouldRetry: false, reason: "retry-limit-reached" }; | ||
| } | ||
| if (absoluteCeilingMs > 0 && accumulatedWaitMs + waitMs > absoluteCeilingMs) { | ||
| return { shouldRetry: false, reason: "absolute-ceiling-exceeded" }; | ||
| } | ||
ndycode marked this conversation as resolved.
Show resolved
Hide resolved
Comment on lines
+68
to
+70
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
the caller in const plannedWaitMs =
retryAllAccountsAbsoluteCeilingMs > 0
? Math.min(jitteredWaitMs, Math.max(0, retryAllAccountsAbsoluteCeilingMs - accumulatedAllRateLimitedWaitMs))
: jitteredWaitMs;so by the time the governor sees
consequence: to fix, either:
Prompt To Fix With AIThis is a comment left during a code review.
Path: lib/request/retry-governor.ts
Line: 68-70
Comment:
**`absolute-ceiling-exceeded` is unreachable — telemetry counter always 0**
the caller in `index.ts` pre-clamps `plannedWaitMs` before passing it here:
```typescript
const plannedWaitMs =
retryAllAccountsAbsoluteCeilingMs > 0
? Math.min(jitteredWaitMs, Math.max(0, retryAllAccountsAbsoluteCeilingMs - accumulatedAllRateLimitedWaitMs))
: jitteredWaitMs;
```
so by the time the governor sees `waitMs`, two invariants hold when `absoluteCeilingMs > 0`:
1. if `remaining > 0`: `plannedWaitMs ≤ remaining`, so `accumulatedWaitMs + waitMs ≤ absoluteCeilingMs` — this branch can never be true
2. if `remaining ≤ 0`: `plannedWaitMs = 0` — the `no-wait` guard on line 59 fires first, the ceiling check is never reached
consequence: `retryGovernorStopsAbsoluteCeilingExceeded` stays at 0 in every production run. the "Retry governor stops (absolute ceiling)" dashboard counter is dead telemetry — when the ceiling _is_ being hit it is silently reported as `no-wait` instead. the test at `test/index-retry.test.ts:520` even hard-asserts the value is `0`, confirming the counter never fires.
to fix, either:
- drop the pre-clamping in `index.ts` and let the governor own the ceiling decision (the governor already caps `accum + wait > ceiling` correctly), or
- remove the governor's ceiling check and only fire the `absolute-ceiling-exceeded` metric explicitly in `index.ts` when `remaining ≤ 0` and `jitteredWaitMs > 0`
How can I resolve this? If you propose a fix, please make it concise. |
||
| return { shouldRetry: true, reason: "allowed" }; | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.