Introduction to vectorized models¶
Vectorized models let us quickly and easily run a model for many policies at the same time.
Example without vectorization¶
Here we run a model for a single policy.
from heavylight import LightModel
import pandas as pd
from collections import defaultdict
import numpy as np
class EasyModel(LightModel):
    def __init__(self, mortality_rate):
        super().__init__()
        self.mortality_rate = mortality_rate

    def pols_if(self, t):
        if t == 0:
            return 1
        return self.pols_if(t-1) - self.pols_death(t-1)

    def pols_death(self, t):
        return self.mortality_rate * self.pols_if(t)
single = EasyModel(0.01)
single.RunModel(100)
single.df
| | pols_death | pols_if |
|---|---|---|
| 0 | 0.010000 | 1.000000 |
| 1 | 0.009900 | 0.990000 |
| 2 | 0.009801 | 0.980100 |
| 3 | 0.009703 | 0.970299 |
| 4 | 0.009606 | 0.960596 |
| ... | ... | ... |
| 96 | 0.003810 | 0.381047 |
| 97 | 0.003772 | 0.377237 |
| 98 | 0.003735 | 0.373464 |
| 99 | 0.003697 | 0.369730 |
| 100 | 0.003660 | 0.366032 |
101 rows × 2 columns
Many mortality rates, no vectorization¶
Suppose we want to run the model for rate_count equally spaced mortality rates between 0 and 1. Without vectorization we need to write some logic to
- Run the model for the different rates
- Collect the results from the different model runs
This is shown below.
def get_rates_results_no_vectorization(rate_count: int, projection_length: int):
    # run a separate model for each mortality rate
    all_models = []
    for i in range(rate_count):
        mortality_rate = i / rate_count
        model_with_rate = EasyModel(mortality_rate)
        model_with_rate.RunModel(projection_length)
        all_models.append(model_with_rate)
    # collect the cached results of the individual runs into a single DataFrame
    combined_cache = defaultdict(lambda: defaultdict(list))
    for model in all_models:
        for function_key, function_results in model.cache.items():
            for result_key, result_value in function_results.items():
                combined_cache[function_key][result_key].append(result_value)
    combined_df = pd.DataFrame(combined_cache)
    return combined_df
get_rates_results_no_vectorization(rate_count=1000, projection_length=100)
| | pols_if | pols_death |
|---|---|---|
| 0 | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [0.0, 0.001, 0.002, 0.003, 0.004, 0.005, 0.006... |
| 1 | [1.0, 0.999, 0.998, 0.997, 0.996, 0.995, 0.994... | [0.0, 0.000999, 0.001996, 0.002991, 0.003984, ... |
| 2 | [1.0, 0.998001, 0.996004, 0.994009, 0.992016, ... | [0.0, 0.000998001, 0.001992008, 0.002982027, 0... |
| 3 | [1.0, 0.997002999, 0.994011992, 0.991026973, 0... | [0.0, 0.000997002999, 0.001988023984, 0.002973... |
| 4 | [1.0, 0.996005996001, 0.992023968016, 0.988053... | [0.0, 0.000996005996001, 0.0019840479360320002... |
| ... | ... | ... |
| 96 | [1.0, 0.9084203817511969, 0.8251482132286804, ... | [0.0, 0.0009084203817511969, 0.001650296426457... |
| 97 | [1.0, 0.9075119613694457, 0.8234979168022231, ... | [0.0, 0.0009075119613694457, 0.001646995833604... |
| 98 | [1.0, 0.9066044494080763, 0.8218509209686187, ... | [0.0, 0.0009066044494080763, 0.001643701841937... |
| 99 | [1.0, 0.9056978449586682, 0.8202072191266815, ... | [0.0, 0.0009056978449586683, 0.001640414438253... |
| 100 | [1.0, 0.9047921471137096, 0.8185668046884281, ... | [0.0, 0.0009047921471137096, 0.001637133609376... |
101 rows × 2 columns
Vectorization¶
What is vectorization?¶
Vectorization just means using an array library like NumPy so that operations generally happen between arrays and not between single numbers.
For example, in the expression self.pols_if(t-1) - self.pols_death(t-1), the subtraction happens between the NumPy arrays returned by self.pols_if(t-1) and self.pols_death(t-1).
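As a minimal sketch of this idea (the array names here are illustrative, not part of the model), subtracting one NumPy array from another operates on every policy at once:

import numpy as np
mortality_rates = np.array([0.01, 0.02, 0.03])    # one rate per policy
pols_if_prev = np.ones(3)                         # policies in force at t-1
pols_death_prev = mortality_rates * pols_if_prev  # deaths at t-1, elementwise
pols_if_prev - pols_death_prev                    # array([0.99, 0.98, 0.97])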
Returning to our example¶
Notice that we no longer need to write any logic to iterate over the policies; we simply pass an array of mortality rates to the model.
vectorized_model = EasyModel(np.linspace(0, .999, 1000))
vectorized_model.RunModel(100)
vectorized_model.df
| | pols_death | pols_if |
|---|---|---|
| 0 | [0.0, 0.001, 0.002, 0.003, 0.004, 0.005, 0.006... | 1 |
| 1 | [0.0, 0.000999, 0.001996, 0.002991, 0.003984, ... | [1.0, 0.999, 0.998, 0.997, 0.996, 0.995, 0.994... |
| 2 | [0.0, 0.000998001, 0.001992008, 0.002982027, 0... | [1.0, 0.998001, 0.996004, 0.994009, 0.992016, ... |
| 3 | [0.0, 0.000997002999, 0.001988023984, 0.002973... | [1.0, 0.997002999, 0.994011992, 0.991026973, 0... |
| 4 | [0.0, 0.000996005996001, 0.0019840479360320002... | [1.0, 0.996005996001, 0.992023968016, 0.988053... |
| ... | ... | ... |
| 96 | [0.0, 0.0009084203817511969, 0.001650296426457... | [1.0, 0.9084203817511969, 0.8251482132286804, ... |
| 97 | [0.0, 0.0009075119613694457, 0.001646995833604... | [1.0, 0.9075119613694457, 0.8234979168022231, ... |
| 98 | [0.0, 0.0009066044494080763, 0.001643701841937... | [1.0, 0.9066044494080763, 0.8218509209686187, ... |
| 99 | [0.0, 0.0009056978449586683, 0.001640414438253... | [1.0, 0.9056978449586682, 0.8202072191266815, ... |
| 100 | [0.0, 0.0009047921471137096, 0.001637133609376... | [1.0, 0.9047921471137096, 0.8185668046884281, ... |
101 rows × 2 columns
Performance benefits¶
Timing both approaches shows that the vectorized code runs much faster.
%%timeit
get_rates_results_no_vectorization(rate_count=1000, projection_length=100)
1.45 s ± 63.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%%timeit
vectorized_model.RunModel(proj_len=100)
2.09 ms ± 146 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Why vectorized code is faster¶
Function call overhead¶
Without vectorization, if we have a million policies then each function has to be called a million times. There might be 1,000 unique function calls in the calculation, so that is a billion function calls in total. Making a billion function calls is not performant because of function call overheads: on top of Python's usual call overhead, actuarial modeling frameworks perform additional work on each call, such as checking for cached values.
With vectorization we avoid making these large numbers of function calls: each function is called once per time step and operates on an array containing every policy.
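To make the contrast concrete, here is a rough sketch (the one_step helper is illustrative, not part of heavylight): the scalar approach pays the call overhead once per policy, while the vectorized approach pays it once per time step.

def one_step(pols_if_prev, mortality_rate):
    # one projection step: survivors = in force minus deaths
    return pols_if_prev - mortality_rate * pols_if_prev

rates = np.linspace(0, .999, 1_000_000)

# scalar: roughly a million Python function calls for a single time step
scalar_results = [one_step(1.0, rate) for rate in rates]

# vectorized: one function call operating on all policies at once
vector_results = one_step(np.ones_like(rates), rates)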
Python overheads¶
Summing a Python list of 1,000,000 numbers is much slower than summing a NumPy array of 1,000,000 numbers, because NumPy uses compiled code rather than interpreted Python. Even hand-written C code might still be slower than NumPy, because NumPy's routines are highly optimized.
With vectorization we avoid using Python itself to iterate over a large collection of policies.
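As a rough illustration (exact timings will vary by machine), we can compare the two directly:

python_list = list(range(1_000_000))
numpy_array = np.arange(1_000_000)
%timeit sum(python_list)     # pure-Python iteration over a million objects
%timeit numpy_array.sum()    # a single call into NumPy's compiled loop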
Initialization conditions¶
The t == 0 condition on pols_if returns the scalar 1 rather than an array with one entry per policy, and this causes a bug in aggregated results.
def pols_if(self, t):
    if t == 0:
        return 1
    return self.pols_if(t-1) - self.pols_death(t-1)
vectorized_model.df_agg[:2]
| | pols_death | pols_if |
|---|---|---|
| 0 | 499.5000 | 1.0 |
| 1 | 166.6665 | 500.5 |
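The aggregated pols_if at t = 0 is 1.0 rather than 1000.0 because the scalar 1 is summed as-is, while every later time step sums an array with one entry per policy. A minimal sketch of the difference, assuming the aggregation sums each cached value (which matches the tables here):

np.sum(1)              # 1 -> the buggy aggregate at t == 0
np.sum(np.ones(1000))  # 1000.0 -> the aggregate we actually want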
We can fix the bug by initializing pols_if as an array with the correct shape.
def pols_if(self, t):
    if t == 0:
        return np.ones_like(self.mortality_rate)
    return self.pols_if(t-1) - self.pols_death(t-1)
class GoodInitializationModel(LightModel):
    def __init__(self, mortality_rate):
        super().__init__()
        self.mortality_rate = mortality_rate

    def pols_if(self, t):
        if t == 0:
            return np.ones_like(self.mortality_rate)
        return self.pols_if(t-1) - self.pols_death(t-1)

    def pols_death(self, t):
        return self.mortality_rate * self.pols_if(t)
bugfix_model = GoodInitializationModel(np.linspace(0, .999, 1000))
bugfix_model.RunModel(100)
bugfix_model.df_agg[:2]
| | pols_death | pols_if |
|---|---|---|
| 0 | 499.5000 | 1000.0 |
| 1 | 166.6665 | 500.5 |
Running multiple scenarios with broadcasting¶
This is a bit more advanced, but it can be useful in some situations. The NumPy documentation on broadcasting is recommended reading.
Typical models operate on arrays of size P (policies). If you want to run multiple scenarios you can either

- Set up a loop over S scenarios
- Use broadcasting to operate on arrays with shape (S, P) without looping

In our code, we demonstrate the broadcasting approach with a multiplicative factor that we apply to the mortality rates; a short sketch of the broadcasting follows this paragraph. Maybe we want to adjust the rates by factors of [-.01, 0, .01].
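As a minimal sketch of the broadcasting rule we rely on (using the same arrays that are passed to BroadcastedModel below), a (3, 1) array combined with a (3,) array broadcasts to shape (3, 3):

factors = np.array([[-.01], [0.], [.01]])  # shape (3, 1): one row per scenario
rates = np.array([.01, .02, .03])          # shape (3,): one rate per policy
(1 + factors) * rates                      # shape (3, 3): adjusted rate per scenario and policy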
class BroadcastedModel(LightModel):
    def __init__(self, mortality_rate: np.ndarray, factors: np.ndarray):
        super().__init__()
        self.mortality_rate = mortality_rate  # shape (P,)
        self.factors = factors                # shape (S, 1)

    def pols_if(self, t):
        if t == 0:
            # shape (S, P): one row per scenario, one column per policy
            return np.ones((len(self.factors), len(self.mortality_rate)))
        return self.pols_if(t-1) - self.pols_death(t-1)

    def pols_death(self, t):
        # (P,) * (S, 1) broadcasts to shape (S, P)
        return self.mortality_rate * (1 + self.factors) * self.pols_if(t)
broadcasted_model = BroadcastedModel(
    np.array([.01, .02, .03]),       # shape (3,)
    np.array([[-.01], [0.], [.01]])  # shape (3, 1)
)
broadcasted_model.RunModel(100)
broadcasted_model.df
| | pols_death | pols_if |
|---|---|---|
| 0 | [[0.0099, 0.0198, 0.029699999999999997], [0.01... | [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, ... |
| 1 | [[0.00980199, 0.019407960000000002, 0.02881791... | [[0.9901, 0.9802, 0.9703], [0.99, 0.98, 0.97],... |
| 2 | [[0.009704950299, 0.019023682392000002, 0.0279... | [[0.9802980099999999, 0.96079204, 0.94148209],... |
| 3 | [[0.0096088712910399, 0.0186470134806384, 0.02... | [[0.970593059701, 0.9417683576079999, 0.913520... |
| 4 | [[0.009513743465258606, 0.01827780261372176, 0... | [[0.96098418840996, 0.9231213441273616, 0.8863... |
| ... | ... | ... |
| 96 | [[0.003809123061986519, 0.002903195053362312, ... | [[0.3847599052511635, 0.14662601279607634, 0.0... |
| 97 | [[0.0037714127436728525, 0.002845711791305738,... | [[0.380950782189177, 0.14372281774271403, 0.05... |
| 98 | [[0.0037340757575104913, 0.0027893666978378844... | [[0.3771793694455041, 0.1408771059514083, 0.05... |
| 99 | [[0.003697108407511137, 0.0027341372372206942,... | [[0.3734452936879936, 0.1380877392535704, 0.05... |
| 100 | [[0.0036605070342767766, 0.0026800013199237247... | [[0.3697481852804825, 0.13535360201634972, 0.0... |
101 rows × 2 columns
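Each cell of the resulting DataFrame holds a (3, 3) array with one row per scenario and one column per policy, so a single scenario can be read off by row (the indexing below is just one way to do this):

final_pols_if = broadcasted_model.df["pols_if"][100]  # shape (3, 3): scenarios by policies
final_pols_if[1]  # the unadjusted (factor 0.) scenario across all three policies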