fix(gateway): harden manager calls and guild circuit-breaker ETS handling

This commit is contained in:
Hampus Kraft 2026-02-19 19:49:56 +00:00
parent 2264f1decb
commit b178c90879
No known key found for this signature in database
GPG Key ID: 6090864C465A454D
3 changed files with 74 additions and 31 deletions

View File

@ -136,7 +136,7 @@ update_circuit_state(GuildPid, Result, PrevState) ->
IsSuccess = is_success_result(Result), IsSuccess = is_success_result(Result),
case {IsSuccess, PrevState} of case {IsSuccess, PrevState} of
{true, half_open} -> {true, half_open} ->
ets:delete(?CIRCUIT_BREAKER_TABLE, GuildPid), safe_delete(GuildPid),
ok; ok;
{true, closed} -> {true, closed} ->
reset_failures(GuildPid); reset_failures(GuildPid);
@ -152,7 +152,7 @@ is_success_result(_) -> false.
reset_failures(GuildPid) -> reset_failures(GuildPid) ->
case safe_lookup(GuildPid) of case safe_lookup(GuildPid) of
[{_, State}] -> [{_, State}] ->
ets:insert(?CIRCUIT_BREAKER_TABLE, {GuildPid, State#{failures => 0}}), safe_insert(GuildPid, State#{failures => 0}),
ok; ok;
[] -> [] ->
ok ok
@ -163,27 +163,21 @@ record_failure(GuildPid) ->
Now = erlang:system_time(millisecond), Now = erlang:system_time(millisecond),
case safe_lookup(GuildPid) of case safe_lookup(GuildPid) of
[] -> [] ->
ets:insert( safe_insert(GuildPid, #{
?CIRCUIT_BREAKER_TABLE, state => closed,
{GuildPid, #{ failures => 1,
state => closed, concurrent => 0
failures => 1, }),
concurrent => 0
}}
),
ok; ok;
[{_, #{failures := F} = State}] when F + 1 >= ?FAILURE_THRESHOLD -> [{_, #{failures := F} = State}] when F + 1 >= ?FAILURE_THRESHOLD ->
ets:insert( safe_insert(GuildPid, State#{
?CIRCUIT_BREAKER_TABLE, state => open,
{GuildPid, State#{ failures => F + 1,
state => open, opened_at => Now
failures => F + 1, }),
opened_at => Now
}}
),
ok; ok;
[{_, #{failures := F} = State}] -> [{_, #{failures := F} = State}] ->
ets:insert(?CIRCUIT_BREAKER_TABLE, {GuildPid, State#{failures => F + 1}}), safe_insert(GuildPid, State#{failures => F + 1}),
ok ok
end. end.
@ -191,19 +185,16 @@ record_failure(GuildPid) ->
acquire_slot(GuildPid) -> acquire_slot(GuildPid) ->
case safe_lookup(GuildPid) of case safe_lookup(GuildPid) of
[] -> [] ->
ets:insert( safe_insert(GuildPid, #{
?CIRCUIT_BREAKER_TABLE, state => closed,
{GuildPid, #{ failures => 0,
state => closed, concurrent => 1
failures => 0, }),
concurrent => 1
}}
),
ok; ok;
[{_, #{concurrent := C}}] when C >= ?MAX_CONCURRENT -> [{_, #{concurrent := C}}] when C >= ?MAX_CONCURRENT ->
{error, too_many_requests}; {error, too_many_requests};
[{_, #{concurrent := C} = State}] -> [{_, #{concurrent := C} = State}] ->
ets:insert(?CIRCUIT_BREAKER_TABLE, {GuildPid, State#{concurrent => C + 1}}), safe_insert(GuildPid, State#{concurrent => C + 1}),
ok ok
end. end.
@ -211,12 +202,36 @@ acquire_slot(GuildPid) ->
release_slot(GuildPid) -> release_slot(GuildPid) ->
case safe_lookup(GuildPid) of case safe_lookup(GuildPid) of
[{_, #{concurrent := C} = State}] when C > 0 -> [{_, #{concurrent := C} = State}] when C > 0 ->
ets:insert(?CIRCUIT_BREAKER_TABLE, {GuildPid, State#{concurrent => C - 1}}), safe_insert(GuildPid, State#{concurrent => C - 1}),
ok; ok;
_ -> _ ->
ok ok
end. end.
%% Best-effort write of a guild's circuit-breaker state into
%% ?CIRCUIT_BREAKER_TABLE.
%%
%% ensure_table/0 is invoked before every attempt (it is defined elsewhere;
%% presumably it (re)creates the table when missing -- confirm). At most two
%% attempts are made: a `badarg' from ets:insert/2 on the first attempt leads
%% to one retry, and a `badarg' on the retry is swallowed. The function always
%% returns `ok', so callers cannot tell whether the entry was stored.
-spec safe_insert(pid(), map()) -> ok.
safe_insert(GuildPid, State) ->
    insert_with_retry({GuildPid, State}, 2).

%% Inserts Entry, calling ensure_table/0 ahead of each attempt, and gives up
%% quietly once AttemptsLeft reaches zero.
-spec insert_with_retry({pid(), map()}, non_neg_integer()) -> ok.
insert_with_retry(_Entry, 0) ->
    ok;
insert_with_retry(Entry, AttemptsLeft) ->
    ensure_table(),
    try ets:insert(?CIRCUIT_BREAKER_TABLE, Entry) of
        true -> ok
    catch
        error:badarg -> insert_with_retry(Entry, AttemptsLeft - 1)
    end.
%% Best-effort removal of a guild's entry from ?CIRCUIT_BREAKER_TABLE.
%%
%% ensure_table/0 runs first (defined elsewhere). A `badarg' raised by
%% ets:delete/2 is treated as "nothing to delete", so the result is always
%% `ok'. Unlike the insert path there is no retry.
-spec safe_delete(pid()) -> ok.
safe_delete(GuildPid) ->
    ensure_table(),
    try ets:delete(?CIRCUIT_BREAKER_TABLE, GuildPid) of
        true -> ok
    catch
        error:badarg -> ok
    end.
-spec safe_lookup(pid()) -> list(). -spec safe_lookup(pid()) -> list().
safe_lookup(GuildPid) -> safe_lookup(GuildPid) ->
try ets:lookup(?CIRCUIT_BREAKER_TABLE, GuildPid) of try ets:lookup(?CIRCUIT_BREAKER_TABLE, GuildPid) of
@ -369,6 +384,18 @@ record_failure_opens_circuit_test() ->
?assertEqual(open, maps:get(state, State)), ?assertEqual(open, maps:get(state, State)),
Pid ! done. Pid ! done.
%% Checks that record_failure/1 still succeeds and stores a first failure
%% when ?CIRCUIT_BREAKER_TABLE does not exist at call time.
record_failure_recreates_missing_table_test() ->
    %% Drop the table if it exists; `badarg' only means it was already gone.
    try
        ets:delete(?CIRCUIT_BREAKER_TABLE)
    catch
        error:badarg -> ok
    end,
    %% The pid serves purely as a table key; the process idles until stopped.
    Pid = spawn(fun() ->
        receive
            done -> ok
        end
    end),
    try
        ?assertEqual(ok, record_failure(Pid)),
        [{Pid, State}] = ets:lookup(?CIRCUIT_BREAKER_TABLE, Pid),
        ?assertEqual(1, maps:get(failures, State))
    after
        %% Release the helper process even when an assertion or match fails,
        %% so a failing run does not leave it blocked in receive.
        Pid ! done
    end.
is_success_result_test() -> is_success_result_test() ->
?assertEqual(true, is_success_result({ok, #{}})), ?assertEqual(true, is_success_result({ok, #{}})),
?assertEqual(false, is_success_result({error, timeout})), ?assertEqual(false, is_success_result({error, timeout})),

View File

@ -248,7 +248,14 @@ call_shard(GuildId, Request, Timeout) ->
-spec call_via_manager(term(), pos_integer()) -> term(). -spec call_via_manager(term(), pos_integer()) -> term().
call_via_manager(Request, Timeout) -> call_via_manager(Request, Timeout) ->
gen_server:call(?MODULE, Request, Timeout + 1000). case catch gen_server:call(?MODULE, Request, Timeout + 1000) of
{'EXIT', {timeout, _}} ->
{error, timeout};
{'EXIT', _} ->
{error, unavailable};
Reply ->
Reply
end.
-spec forward_call(guild_id(), term(), state()) -> {term(), state()}. -spec forward_call(guild_id(), term(), state()) -> {term(), state()}.
forward_call(GuildId, {start_or_lookup, _} = Request, State) -> forward_call(GuildId, {start_or_lookup, _} = Request, State) ->

View File

@ -136,6 +136,8 @@ call_shard(SessionId, Request, Timeout) ->
case shard_pid_from_table(SessionId) of case shard_pid_from_table(SessionId) of
{ok, Pid} -> {ok, Pid} ->
case catch gen_server:call(Pid, Request, Timeout) of case catch gen_server:call(Pid, Request, Timeout) of
{'EXIT', {timeout, _}} ->
{error, timeout};
{'EXIT', _} -> {'EXIT', _} ->
call_via_manager(SessionId, Request, Timeout); call_via_manager(SessionId, Request, Timeout);
Reply -> Reply ->
@ -147,7 +149,14 @@ call_shard(SessionId, Request, Timeout) ->
-spec call_via_manager(session_id(), term(), pos_integer()) -> term(). -spec call_via_manager(session_id(), term(), pos_integer()) -> term().
call_via_manager(SessionId, Request, Timeout) -> call_via_manager(SessionId, Request, Timeout) ->
gen_server:call(?MODULE, {proxy_call, SessionId, Request, Timeout}, Timeout + 1000). case catch gen_server:call(?MODULE, {proxy_call, SessionId, Request, Timeout}, Timeout + 1000) of
{'EXIT', {timeout, _}} ->
{error, timeout};
{'EXIT', _} ->
{error, unavailable};
Reply ->
Reply
end.
-spec forward_call(session_id(), term(), pos_integer(), state()) -> {term(), state()}. -spec forward_call(session_id(), term(), pos_integer(), state()) -> {term(), state()}.
forward_call(SessionId, Request, Timeout, State) -> forward_call(SessionId, Request, Timeout, State) ->