########################################################################
#
# Functions for calculating Total Return, Annualized Returns, etc.
#
########################################################################
#
# This file is part of FinanceOps:
#
# https://github.com/Hvass-Labs/FinanceOps
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2018 by Magnus Erik Hvass Pedersen
#
########################################################################
import numpy as np
import pandas as pd
from numba import jit
import data
from data_keys import *
########################################################################
# Constants.
# Average number of days in a year. Normal years have 365 days,
# but every 4th year is a leap-year with 366 days.
DAYS_PER_YEAR = 365.25
# Average number of business- or trading-days in a year.
BDAYS_PER_YEAR = 251.67
########################################################################
# Public functions.
def total_return(df):
"""
Calculate the "Total Return" of a stock when dividends are
reinvested in the stock.
The formula is:
Total_Return[t] = Total_Return[t-1] * (Dividend[t] + Share_Price[t]) / Share_Price[t-1]
:param df:
Pandas data-frame assumed to contain SHARE_PRICE and DIVIDEND.
:return:
Pandas series with the Total Return.
"""
# Copy the relevant data so we don't change it.
df2 = df[[SHARE_PRICE, DIVIDEND]].copy()
# Fill NA-values in the Dividend-column with zeros.
    df2[DIVIDEND] = df2[DIVIDEND].fillna(0)
# Calculate the daily Total Return.
tot_ret_daily = (df2[DIVIDEND] + df2[SHARE_PRICE]) / df2[SHARE_PRICE].shift(1)
# Calculate the cumulative Total Return.
tot_ret = tot_ret_daily.cumprod()
# Replace the first row's NA with 1.0
tot_ret.iloc[0] = 1.0
return tot_ret
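# Example usage (a minimal sketch): `df_prices` is assumed to be a daily
# DataFrame that already contains the SHARE_PRICE and DIVIDEND columns,
# e.g. loaded with this repository's `data` module.
#
#     df_prices[TOTAL_RETURN] = total_return(df=df_prices)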
def annualized_returns(series, years):
"""
Calculate the annualized returns for all possible
periods of the given number of years.
For example, given the Total Return of a stock we want
to know the annualized returns of all holding-periods
of 10 years.
:param series:
Pandas series e.g. with the Total Return of a stock.
Assumed to be daily data.
:param years:
Number of years in each period.
:return:
Pandas series of same length as the input series. Each
day has the annualized return of the period starting
that day and for the given number of years. The end of
the series has NA for the given number of years.
"""
# Number of days to shift data. All years have 365 days
# except leap-years which have 366 and occur every 4th year.
# So on average a year has 365.25 days.
days = int(years * 365.25)
# Calculate annualized returns for all periods of this length.
# Note: It is important we have daily (interpolated) data,
# otherwise the series.shift(365) would shift much more than
# a year, if the data only contains e.g. 250 days per year.
ann_return = (series.shift(-days) / series) ** (1 / years) - 1.0
return ann_return
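# Example usage (a minimal sketch): `df_prices[TOTAL_RETURN]` is assumed to
# hold daily interpolated Total Return data.
#
#     # Annualized returns for all possible 10-year holding periods.
#     ann_ret_10y = annualized_returns(series=df_prices[TOTAL_RETURN], years=10)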
def prepare_ann_returns(df, years, key=PSALES, subtract=None):
"""
Prepare annualized returns e.g. for making a scatter-plot.
The x-axis is given by the key (e.g. PSALES) and the y-axis
would be the annualized returns.
:param df:
Pandas DataFrame with columns named key and TOTAL_RETURN.
:param years:
Number of years for annualized returns.
:param key:
Name of the data-column for x-axis e.g. PSALES or PBOOK.
:param subtract:
Pandas Series to be subtracted from ann-returns
to adjust for e.g. growth in sales-per-share.
:return:
(x, y) Pandas Series with key and adjusted ANN_RETURN.
"""
# Create a new data-frame so we don't modify the original.
# We basically just use this to sync the data we are
# interested in for the common dates and avoid NA-data.
df2 = pd.DataFrame()
# Copy the key-data e.g. PSALES.
df2[key] = df[key]
# Calculate all annualized returns for all periods of
# the given number of years using the Total Return.
ann_return = annualized_returns(series=df[TOTAL_RETURN], years=years)
if subtract is None:
# Add the ann-returns to the new data-frame.
df2[ANN_RETURN] = ann_return
else:
# Calculate all annualized returns for the series
# that must be subtracted e.g. sales-per-share.
ann_return_subtract = annualized_returns(series=subtract, years=years)
# Subtract the ann. returns for the total return
# and the adjustment (e.g. sales-per-share).
# Then add the result to the new data-frame.
df2[ANN_RETURN] = ann_return - ann_return_subtract
# Drop all rows with NA.
df2.dropna(axis=0, how='any', inplace=True)
# Retrieve the relevant data.
x = df2[key]
y = df2[ANN_RETURN]
return x, y
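# Example usage (a minimal sketch): `df_prices` is assumed to hold daily
# PSALES and TOTAL_RETURN columns; the scatter-plot itself would be made
# with e.g. matplotlib.
#
#     x, y = prepare_ann_returns(df=df_prices, years=10, key=PSALES)
#     # plt.scatter(x, y) then shows P/Sales vs. 10-year annualized returns.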
@jit(forceobj=True)
def mean_std_ann_returns(series, min_years=7, max_years=15,
days_per_year=DAYS_PER_YEAR):
"""
Calculate mean and std.dev. annualized returns for a range
of investment periods.
For each day we calculate the annualized returns for a whole
range of periods between e.g. 7 and 15 years into the future,
then we take the mean and std.dev. of all those annualized
returns. This smoothens the effect of random mispricing at
the time of sale, so it is more obvious if there is a
relationship between a predictive signal and future returns.
This function uses "Numba jit" so the for-loop is compiled
into machine-code which gives a 10x speed increase after the
function has been called once.
Note the use of jit with "forceobj=True" which allows for a mix
of ordinary Python and optimized code, which makes the code easier
to read. The fastest jit-mode is actually "fallback" which is about
twice as fast, but it generates several warnings and will be
deprecated in the future. The slowest mode is "nopython=True" where
the for-loop is made inside a nested function that is fully
compatible with Numba jit, but that is actually twice as slow as
"forceobj=True" in this case. So this is the best compromise and
still gives a 10x speed-up.
:param series:
Pandas Series with e.g. the Total Return of a stock.
:param min_years:
Min number of years for investment periods.
:param max_years:
Max number of years for investment periods.
:param days_per_year:
Number of days in a year for the data in `series`. Use one of these:
- DAYS_PER_YEAR for daily interpolated data with leap-years.
- BDAYS_PER_YEAR for the average number of trading-days per year.
:return:
Pandas DataFrame where each day has the mean and std.dev. for the
annualized returns of the future investment periods starting
        that day and for all periods between `min_years` and `max_years`.
"""
# The idea of this algorithm is to step through the data
# one day at a time. For each day we lookup an array of
# e.g. Total Return values in the future and calculate the
# annualized returns, and then take the mean and std.dev.
# of that. There may be a faster and more clever way of
# implementing this, but it is fast enough for our purposes
# when using Numba jit to speed-up the for-loop.
# Min / max number of days for the periods we consider.
# For example, between 7 and 15 years into the future.
min_days = int(min_years * days_per_year)
max_days = int(max_years * days_per_year)
# Ensure series is long enough.
if len(series) < max_days:
msg = 'len(series)={0} is too short for max_days={1}'
msg = msg.format(len(series), max_days)
raise ValueError(msg)
# Exponent used for calculating annualized returns.
exponent = days_per_year / np.arange(min_days, max_days)
# Number of days in the output arrays.
num_days = len(series) - max_days
# Pre-allocate output arrays for the mean and std.dev. ann. returns.
    mean_ann_rets = np.zeros(num_days, dtype=np.float64)
    std_ann_rets = np.zeros(num_days, dtype=np.float64)
# Convert the Pandas Series into a numpy array
# which is supported by the numba jit compiler.
values = series.to_numpy()
# For each day do the following.
# Note: This for-loop will be optimized by the numba jit compiler.
for i in range(num_days):
# Get the Total Return value for the i'th day.
tot_ret_today = values[i]
# Get array of Total Return values for future days.
tot_ret_future = values[i + min_days:i + max_days]
# Annualized Returns between today and those future days.
ann_rets = (tot_ret_future / tot_ret_today) ** exponent - 1.0
# Mean annualized returns.
mean_ann_rets[i] = np.mean(ann_rets)
# Standard deviation for the annualized returns.
std_ann_rets[i] = np.std(ann_rets)
# Create a Pandas DataFrame with the result.
data = {MEAN_ANN_RETURN: mean_ann_rets, STD_ANN_RETURN: std_ann_rets}
df_result = pd.DataFrame(data=data, index=series.index[0:num_days])
return df_result
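# Example usage (a minimal sketch): `df_prices[TOTAL_RETURN]` is assumed to
# hold daily interpolated data covering more than `max_years` of history.
#
#     df_stats = mean_std_ann_returns(series=df_prices[TOTAL_RETURN],
#                                     min_years=7, max_years=15,
#                                     days_per_year=DAYS_PER_YEAR)
#     # df_stats[MEAN_ANN_RETURN] / df_stats[STD_ANN_RETURN] are the mean and
#     # std.dev. of the 7-15 year annualized returns starting on each day.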
def prepare_mean_ann_returns(df, min_years=7, max_years=15, key=PSALES):
"""
Prepare mean annualized returns e.g. for making a scatter-plot.
The x-axis is given by the key (e.g. PSALES) and the y-axis
would be the mean annualized returns.
For each day we calculate the annualized returns for a whole
range of periods between e.g. 7 and 15 years into the future,
then we take the mean of all those annualized returns. This
smoothens the effect of random mispricing at the time of sale,
so it is more obvious if there is a relationship between the
predictive signal and future returns.
:param df:
Pandas DataFrame with columns named key and TOTAL_RETURN
both assumed to have daily interpolated data.
:param min_years:
Min number of years for investment periods.
:param max_years:
Max number of years for investment periods.
:param key:
Name of the data-column for x-axis e.g. PSALES or PBOOK.
:return:
(x, y) Pandas Series with key and mean ANN_RETURN_MEAN.
"""
# Get the common start-dates for the data-columns.
dfs = [df[TOTAL_RETURN].dropna(), df[key].dropna()]
start_date, _ = data.common_period(dfs=dfs)
# Get the individual data-columns from their common start-dates.
# NOTE: We use dropna() on them individually so as to keep as much
# data as possible when calculating the mean ann. returns.
# This means we have to sync the array-lengths further below.
df_key = df[key][start_date:].dropna()
df_tot_ret = df[TOTAL_RETURN][start_date:].dropna()
# Calculate both the mean and std.dev. ann. returns.
# Assume the Total Return has daily interpolated data.
df_mean_ann_rets = mean_std_ann_returns(series=df_tot_ret,
min_years=min_years,
max_years=max_years,
days_per_year=DAYS_PER_YEAR)
# Common length for the two arrays.
common_len = min(len(df_mean_ann_rets), len(df_key))
# The predictive signal e.g. P/Sales.
    x = df_key.iloc[0:common_len]
# The mean annualized returns.
y = df_mean_ann_rets[MEAN_ANN_RETURN].iloc[0:common_len]
return x, y
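# Example usage (a minimal sketch): `df_prices` is assumed to hold daily
# PSALES and TOTAL_RETURN columns.
#
#     x, y = prepare_mean_ann_returns(df=df_prices, min_years=7,
#                                     max_years=15, key=PSALES)
#     # x is the P/Sales signal, y the mean 7-15 year annualized returns.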
def bond_annualized_returns(df, num_years):
"""
Calculate the annualized returns from investing and reinvesting in a bond.
This results in a list of Pandas Series ann_returns with the annualized
returns for [1, 2, ..., num_years] investment years.
For example ann_returns[0] are for 1-year investment periods and
ann_returns[9] are for 10-year periods.
:param df:
Pandas DataFrame with BOND_YIELD data for 1-year maturity.
:param num_years:
Max number of investment years.
:return:
List of Pandas Series.
"""
# The idea is to repeatedly shift the bond-yields
# and update a cumulative product so as to get the
# compounded return through the years.
# Init the list of annualized returns. For 1-year
# investment periods these are just the bond-yields.
ann_returns = [df[BOND_YIELD].copy()]
# Init the cumulative product of bond-yields,
# which will be updated when reinvesting in the
# bonds through the years.
cum_prod = df[BOND_YIELD].copy() + 1.0
# Init the bond-yields shifted one year.
# These will be shifted 365 steps for each year.
shifted = cum_prod.copy()
# For increasing number of investment years.
# The bond-yields were used as the 1st year above.
for years in range(2, num_years + 1):
# Shift the bond-yields one year.
# Note leap-years are not taken into account so
# there will be a slight drift for longer periods,
# but it probably only causes a very small error.
shifted = shifted.shift(-365)
# Accumulate the bond-yields so cum_prod holds the
# cumulative return from reinvesting in the bonds.
cum_prod *= shifted
# Remove NA from the end of the series.
cum_prod.dropna(inplace=True)
# Calculate the annualized returns.
ann_ret = cum_prod ** (1 / years) - 1.0
# Rename the data-column.
        ann_ret = ann_ret.rename(ANN_RETURN)
# Add to the list of annualized returns for all years.
ann_returns.append(ann_ret)
return ann_returns
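# Example usage (a minimal sketch): `df_bond` is assumed to hold a daily
# BOND_YIELD column for 1-year maturity bonds.
#
#     ann_returns = bond_annualized_returns(df=df_bond, num_years=10)
#     ann_ret_5y = ann_returns[4]  # Annualized returns for 5-year periods.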
def daily_returns(df, start_date, end_date):
"""
Calculate the daily returns for the TOTAL_RETURN of the given
DataFrame between the given dates.
:param df: Pandas DataFrame containing TOTAL_RETURN for all days.
:param start_date: Only use data from this date.
:param end_date: Only use data to this date.
    :return: Pandas Series with the daily returns.
"""
# Get the Total Return for this date-range.
tot_ret = df[TOTAL_RETURN][start_date:end_date]
# Calculate the daily returns assuming we have daily data.
daily_ret = tot_ret.shift(-1) / tot_ret
return daily_ret
def reinvestment_growth(df, smooth=True):
"""
Estimate the growth in the Total Return from reinvestment
of dividends (and ignoring taxes). This is done by subtracting
the yearly changes in Total Return and Share-Price.
The resulting growth-rates are quite erratic, possibly
because of slight changes in the dividend-dates. This
can be smoothed by a moving average.
:param df: Pandas DataFrame with SHARE_PRICE and TOTAL_RETURN.
:param smooth: Boolean whether to smooth the growth-rates.
:return: Pandas Series with the reinvestment growth.
"""
# Yearly percentage change in share-price. Assume the data is daily.
price_change = df[SHARE_PRICE].pct_change(periods=365)
# Yearly percentage change in Total Return. Assume the data is daily.
tot_ret_change = df[TOTAL_RETURN].pct_change(periods=365)
# The difference is the growth from reinvestment of dividends.
growth = tot_ret_change - price_change
# Smoothen the growth-rates using moving average?
if smooth:
growth = growth.rolling(window=20).mean()
# Remove empty rows.
growth = growth.dropna()
return growth
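# Example usage (a minimal sketch): `df_prices` is assumed to hold daily
# SHARE_PRICE and TOTAL_RETURN columns.
#
#     growth = reinvestment_growth(df=df_prices, smooth=True)
#     # growth is roughly the dividend yield that was reinvested each year.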
def dividend_yield(df):
"""
Calculate the daily Dividend Yield of a stock, as the forward-filled
TTM Dividend divided by the daily Share-Price.
:param df: Pandas DataFrame with SHARE_PRICE and DIVIDEND_TTM.
:return: Pandas Series with the data.
"""
return (df[DIVIDEND_TTM].ffill() / df[SHARE_PRICE]).rename(DIVIDEND_YIELD)
def max_drawdown(df, window=None):
"""
Calculate the Maximum Drawdown for all the individual columns in `df`.
This gives the Max Drawdown for each time-step, calculated either
from the beginning or from the preceding window of the given length.
If you want the worst drawdown that has occurred through all time-steps,
then you should call the `min()` function on the resulting DataFrame,
for example: `max_drawdown(df=df_prices).min()`
:param df:
Pandas DataFrame typically with share-prices but could have any data.
If the DataFrame contains data for more stocks, then the columns are
the stock-tickers and the rows are the time-steps.
:param window:
If `None` then calculate the Max Drawdown from the beginning.
If an integer then calculate the Max Drawdown for a rolling window
of that length.
:return:
Pandas DataFrame with the Max Drawdown time-series.
"""
if window is None:
# Calculate Max Drawdown from the beginning.
max_dd = df / df.cummax() - 1.0
else:
# Calculate Max Drawdown for a rolling window.
max_dd = df / df.rolling(window=window).max() - 1.0
return max_dd
def max_pullup(df, window=None):
"""
Calculate the Maximum Pullup for all the individual columns in `df`.
This gives the Max Pullup for each time-step, calculated either
from the beginning or from the preceding window of the given length.
If you want the highest pullup that has occurred through all time-steps,
then you should call the `max()` function on the resulting DataFrame,
for example: `max_pullup(df=df_prices).max()`
Unlike the function `max_drawdown` which is typically calculated for the
entire DataFrame `df` by setting `window=None`, it is often more useful
to set the `window` parameter when calculating the Max Pullup using this
function. This is because many stocks tend to have increasing prices over
time, so the "pullup" or recovery from their historical lows is potentially
infinite over time. So it may be more useful to only consider the pullup
over e.g. 1-year periods by setting the `window` parameter accordingly.
:param df:
Pandas DataFrame typically with share-prices but could have any data.
If the DataFrame contains data for more stocks, then the columns are
the stock-tickers and the rows are the time-steps.
:param window:
If `None` then calculate the Max Pullup from the beginning.
If an integer then calculate the Max Pullup for a rolling window
of that length.
:return:
Pandas DataFrame with the Max Pullup time-series.
"""
if window is None:
# Calculate Max Pullup from the beginning.
max_pu = df / df.cummin() - 1.0
else:
# Calculate Max Pullup for a rolling window.
max_pu = df / df.rolling(window=window).min() - 1.0
return max_pu
########################################################################