Software Safety
Published in

Software Safety

Using TLA+ to Model Cascading Failures

Business vector created by fullvector

Modeling and its Discontents

Example One: The Naive Cluster

------------------- MODULE provisioning_naive -------------------
EXTENDS TLC, Integers, Sequences
VMs == {"Server1","Server2","Server3"}(*--algorithm provisioning
variables
Cluster = [v \in VMs |-> 4],
define
ServersHealthy == \A vm \in VMs: Cluster[vm] <= 9
end define;
end algorithm *)
==================================================================
Events == [target: VMs, size: 0..4]
Incidents(set) == set \X set \X set \X set...incident \in Incidents(Events)
macro trigger_event(event) begin
Cluster[event.target] := Cluster[event.target] + event.size
end macro;
begin
while incident /= <<>> do
curr := Head(incident);
incident := Tail(incident);
trigger_event(curr)
end while;
end algorithm *)
------------------ MODULE provisioning_naive --------------------
EXTENDS TLC, Integers, Sequences
VMs == {"Server1","Server2","Server3"}
Incidents(set) == set \X set \X set \X set
Events == [target: VMs, size: 0..4]
(*--algorithm provisioning
variables
incident \in Incidents(Events),
Cluster = [v \in VMs |-> 4],
curr = ""; \* helper: current event
define
ServersHealthy == \A vm \in VMs: Cluster[vm] <= 9
end define;
macro trigger_event(event) begin
Cluster[event.target] := Cluster[event.target] + event.size
end macro;
begin
while incident /= <<>> do
curr := Head(incident);
incident := Tail(incident);
trigger_event(curr)
end while;
end algorithm *)===================================================================
Finished computing initial states: 531441 distinct states generated.
Error: Invariant ServersHealthy is violated.
Error: The behavior up to this point is:
/\ incident = << [target |-> "Server1", size |-> 2],
[target |-> "Server1", size |-> 4],
[target |-> "Server1", size |-> 4],
[target |-> "Server1", size |-> 4] >>
/\ curr = ""
/\ pc = "Lbl_1"
/\ Cluster = [Server1 |-> 4, Server2 |-> 4, Server3 |-> 4]
State 2: <Action line 67, col 10 to line 74, col 62 of module provisioning_naive>
/\ incident = << [target |-> "Server1", size |-> 4],
[target |-> "Server1", size |-> 4],
[target |-> "Server1", size |-> 4] >>
/\ curr = [target |-> "Server1", size |-> 2]
/\ pc = "Lbl_1"
/\ Cluster = [Server1 |-> 6, Server2 |-> 4, Server3 |-> 4]
State 3: <Action line 67, col 10 to line 74, col 62 of module provisioning_naive>
/\ incident = <<[target |-> "Server1", size |-> 4], [target |-> "Server1", size |-> 4]>>
/\ curr = [target |-> "Server1", size |-> 4]
/\ pc = "Lbl_1"
/\ Cluster = [Server1 |-> 10, Server2 |-> 4, Server3 |-> 4]

Example 2: Load Balancing

------------------- MODULE provisioning_load ---------------------
EXTENDS TLC, Integers, Sequences
VMs == {1,2,3}
Incidents(set) == set \X set \X set \X set
Events == [size: 1..4]
(*--algorithm provisioning
variables
incident \in Incidents(Events),
Cluster = [v \in VMs |-> 4],
curr = "", \* helper: current event
rrobin = 1;
define
ServersHealthy == \A vm \in VMs: Cluster[vm] <= 9
end define;
macro trigger_event(event) begin
Cluster[rrobin] := Cluster[rrobin] + event.size;
if rrobin = 3 then
rrobin := 1
else
rrobin := rrobin + 1
end if;
end macro;
begin
while incident /= <<>> do
curr := Head(incident);
incident := Tail(incident);
trigger_event(curr)
end while;
end algorithm *)==================================================================
-------------------- MODULE provisioning_load ---------------------
EXTENDS TLC, Integers, Sequences
VMs == {1,2,3}
Incidents(set) == set \X set \X set \X set
Events == [size: 1..4]
(*--algorithm provisioning
variables
Cluster = [v \in VMs |-> 4],
define
ServersHealthy == \A vm \in VMs: Cluster[vm] <= 9
end define;
macro trigger_event(event) begin
Cluster[rrobin] := Cluster[rrobin] + event.size;
if rrobin = 3 then
rrobin := 1;
else
rrobin := rrobin + 1
end if;
end macro;
process garbage_collection = "recover resources"
variables
Gcollection \in [1..3 -> 1..4],
begin Collect:
with i \in VMs do
if Gcollection[i] >= Cluster[i] then
Cluster[i] := 0;
else
Cluster[i] := Cluster[i] - Gcollection[i];
end if;
end with;
end process;
process incoming = "increasing resource usage"
variables
incident \in Incidents(Events),
curr = "", \* helper: current event
rrobin = 1;
begin Increase:
while incident /= <<>> do
curr := Head(incident);
incident := Tail(incident);
trigger_event(curr);
end while;
end process;
end algorithm; *)
====================================================================
ServersHealthy == \E vm \in VMs: Cluster[vm] <= 9

Example 3: Autoscaling

(*--algorithm provisioning
variables
VMs \in [1..2 -> 0..10]
define
ServersHealthy == <>(\E v \in 1..Len(VMs): VMs[v] <= 9)
end define;
fair process VM_dies = "kill a VM"
begin Fail:
await Len(VMs) > 1;
VMs := SelectSeq(VMs, LAMBDA x: x < 8);
end process;
fair process scale_up = "autoscale up"
variable
load = FALSE;
begin Scale_up:
load := \E v \in 1..Len(VMs): VMs[v] > 6;
if load then
VMs := Append(VMs, 4);
end if;
end process;
fair process scale_down = "autoscale down"
variables
load = FALSE
begin Scale_Down:
await Len(VMs) > 1;
load := \A v \in 1..Len(VMs): VMs[v] < 4;
if load then
VMs := Tail(VMs);
end if;
end process;
Error: Deadlock reached.
Error: The behavior up to this point is:
State 1: <Initial predicate>
/\ load_ = FALSE
/\ pc = ( "kill a VM" :> "Fail" @@
"autoscale up" :> "Scale_up" @@
"autoscale down" :> "Scale_Down" @@
"modify load on VMs" :> "modify_load" )
/\ VMs = <<8, 8>>
/\ load = FALSE
State 2: <Action line 83, col 9 to line 87, col 38 of module provisioning_autoscale>
/\ load_ = FALSE
/\ pc = ( "kill a VM" :> "Done" @@
"autoscale up" :> "Scale_up" @@
"autoscale down" :> "Scale_Down" @@
"modify load on VMs" :> "modify_load" )
/\ VMs = <<>>
/\ load = FALSE
State 3: <Action line 91, col 13 to line 98, col 27 of module provisioning_autoscale>
/\ load_ = FALSE
/\ pc = ( "kill a VM" :> "Done" @@
"autoscale up" :> "Done" @@
"autoscale down" :> "Scale_Down" @@
"modify load on VMs" :> "modify_load" )
/\ VMs = <<>>
/\ load = FALSE
----------------- MODULE provisioning_autoscale -------------------
EXTENDS TLC, Integers, Sequences
(*--algorithm provisioning
variables
VMs \in [1..2 -> 0..10]
define
ServersHealthy == <>(\E v \in 1..Len(VMs): VMs[v] <= 9)
end define;
fair process VM_dies = "kill a VM"
begin Fail:
await Len(VMs) > 1;
VMs := SelectSeq(VMs, LAMBDA x: x < 8);
end process;
fair process scale_up = "autoscale up"
variable
load = FALSE;
begin Scale_up:
load := Len(VMs) < 2 \/ \E v \in 1..Len(VMs): VMs[v] > 6;
if load then
VMs := Append(VMs, 4);
end if;
end process;
fair process scale_down = "autoscale down"
variables
load = FALSE
begin Scale_Down:
await Len(VMs) > 1;
load := \A v \in 1..Len(VMs): VMs[v] < 4;
if load then
VMs := Tail(VMs);
end if;
end process;
end algorithm; *)==================================================================

Using Modeling as Part of the Development Process

  1. Technology selection. Too often we choose tools based on what we’re already familiar with or what is trendy and cool. Modeling can help you determine what behaviors you’re looking for from the components of your architecture
  2. Monitoring and alerting strategy. Knowing how systems are likely to fail can help you figure out what to monitor without having to wait for failure to happen.
  3. SLOs. Part of the process of modeling is figuring out what the desired outcomes actually are before you’ve written any code. You can take those conclusions and draft your SLOs from them.

Other Models To Look At

--

--

How do we build software that is safe? Exploration of formula specifications, safety science and algorithm ethics. Header art courtesy of Freepik.com

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store