August 9, 2025
Running the stairs at the Harvard stadium always feels rough (and yet I keep doing it because it's really fun to do with friends). In a previous post, I described how I automatically retrieve statistics and weather data for all my stadium runs. Today I want to see whether one can learn anything from these data: can I predict the run time, given weather and sleep scores? What parameters are most important?
So far I have collected only a limited number of data points: we are still firmly in the low-data regime, so deep learning won't work. But maybe we can learn a lot with a simple linear model \( y = \theta^\top x \), where \( y \) is the duration of a stadium run, \( x \) is the vector of input features (temperature, dew point, sleep score, number of stadium runs in the 2 months prior to the run, or combinations of these variables), and \( \theta \) contains the parameters of the model, to be inferred. The input features also contain an entry of value \( 1 \), to allow for a bias.
It is clear that the run duration depends on many more variables than these features: what I ate before running, whether I feel like pushing hard, whether someone is in front of me and I cannot pass them, whether a friend pushes/pulls me to go faster... We model these unknowns with Gaussian noise added to the output of the linear model:
\[ y_i = \theta^\top x_i + \sigma \xi_i , \]
where \( \sigma \) is a positive scalar to infer and the \( \xi_i \) are i.i.d. standard normal random variables. The likelihood corresponding to the data reads
\[ p(Y \mid X, \theta, \sigma) = \prod_{i=1}^{N} \frac{1}{\sqrt{2\pi\sigma^2}} \exp\!\left( -\frac{\left( y_i - \theta^\top x_i \right)^2}{2\sigma^2} \right) . \]
Taking the logarithm, we get the log likelihood
\[ \log p(Y \mid X, \theta, \sigma) = -\frac{N}{2} \log\left( 2\pi\sigma^2 \right) - \frac{1}{2\sigma^2} \sum_{i=1}^{N} \left( y_i - \theta^\top x_i \right)^2 . \]
Maximizing the likelihood gives us the best parameters to fit the data:
\[ \hat{\theta} = \left( X^\top X \right)^{-1} X^\top Y , \]
where \( X \) is a matrix whose rows contain the features of each sample, and \( Y \) is a vector containing the durations of all samples. Here is the list of features I have used:

- temperature (temp) and dew point (dwpt) at the time of the run,
- sleep score of the previous night (sleep_score),
- number of stadium runs within the previous 2 months (num_runs_2m),
- month of the year (month) and hour of the day (hour),
- all pairwise products of the above, including squares,
- a constant feature of value 1, for the bias.
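To make the maximum likelihood step concrete, here is a minimal, self-contained sketch on synthetic data. The feature values and the "true" parameters below are made up purely for illustration; the actual fit on my run log is done by the full script at the end of this post.

import numpy as np

rng = np.random.default_rng(0)
n_samples = 40

# Made-up base features (the real ones come from the run, weather, and sleep logs)
temp = rng.uniform(0.0, 30.0, n_samples)           # temperature in Celsius
sleep_score = rng.uniform(60.0, 100.0, n_samples)  # sleep score between 0 and 100
bias = np.ones(n_samples)                          # constant entry for the bias

X = np.column_stack([temp, sleep_score, bias])     # one row of features per run

# Synthetic durations in seconds, generated with made-up "true" parameters
theta_true = np.array([6.0, -3.0, 1500.0])
sigma_true = 30.0
Y = X @ theta_true + sigma_true * rng.standard_normal(n_samples)

# Maximum likelihood estimate: solve the normal equations (X^T X) theta = X^T Y
theta_hat = np.linalg.solve(X.T @ X, X.T @ Y)
# Maximum likelihood estimate of the noise scale: root mean squared residual
sigma_hat = np.sqrt(np.mean((Y - X @ theta_hat) ** 2))

print(theta_hat)
print(sigma_hat)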
The program is very simple and only requires solving a linear system whose size is the number of features. Below are the results of the predictions:
Predictions of the model against the real values used to calibrate this model. Error bars correspond to 1 \(\sigma\).
The model seems to do reasonably well, at least in capturing the general trends. The important question now is the following: which variables matter most for predicting the run time?
We assume that until now we have explored a reasonably wide set of conditions, so we can expect each input feature to keep taking values with a standard deviation close to the empirical one. The importance of feature \( i \) on the duration can then be quantified with first order Sobol indices. For a linear model with independent inputs, the variance of the prediction decomposes into a sum of contributions \( \theta_i^2 \operatorname{Var}(x_i) \), so the first order Sobol index of feature \( i \) is
\[ S_i = \frac{\theta_i^2 \operatorname{Var}(x_i)}{\sum_j \theta_j^2 \operatorname{Var}(x_j)} , \]
where \( \operatorname{Var}(x_i) \) is the empirical variance of feature \( i \). The list of features and their first order Sobol index are summarized below:
Feature | First order Sobol index |
---|---|
temp | 0.297 |
sleep_score | 0.164 |
sleep_score * sleep_score | 0.157 |
sleep_score * temp | 0.104 |
dwpt | 0.066 |
hour * temp | 0.029 |
hour * dwpt | 0.028 |
temp * temp | 0.026 |
num_runs_2m * num_runs_2m | 0.021 |
hour * sleep_score | 0.020 |
month * month | 0.015 |
num_runs_2m * sleep_score | 0.012 |
month * num_runs_2m | 0.011 |
hour | 0.010 |
dwpt * temp | 0.008 |
month * sleep_score | 0.005 |
month * temp | 0.004 |
hour * hour | 0.004 |
month * dwpt | 0.004 |
hour * month | 0.004 |
num_runs_2m * temp | 0.004 |
dwpt * dwpt | 0.002 |
month | 0.002 |
hour * num_runs_2m | 0.002 |
sleep_score * dwpt | 0.001 |
num_runs_2m * dwpt | 0.001 |
num_runs_2m | 0.000 |
1 | 0.000 |
Perhaps unsurprisingly, the temperature is the most important feature for predicting the run time, followed by the sleep score and the dew point (which is related to humidity).
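In code, these indices boil down to a couple of lines. The sketch below assumes X is the feature matrix (one row per run, one column per feature, including the products and the constant) and theta the fitted parameters; it mirrors the computation in the full script at the end of the post.

import numpy as np

def first_order_sobol(X, theta):
    """First order Sobol indices of a linear model y = X @ theta.

    Each feature contributes theta_i**2 * Var(x_i) to the variance of the
    prediction; the indices are normalized so that they sum to one.
    """
    contributions = theta**2 * np.var(X, axis=0)
    return contributions / np.sum(contributions)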
Now we can ask: given this model and these data, what are the perfect conditions to run the stairs? What I did here is to replace, in all my feature vectors, one quantity of interest at a time (recomputing the product features accordingly), and compute the average of the predicted durations. Here is what I got for the temperature, sleep score, and dew point:
So according to this simple model, the best temperature to run the stadium would be around 15 degrees Celsius, with a dew point of 3 degrees Celsius, and after a relatively good night of sleep (score of 80). Obviously this analysis relies on crude assumptions, such as the form of this simple model, but we still obtain reasonable values. This certainly won't limit my stadium runs to these "optimal" conditions; I'm sure that the main factor is more training!
The results were produced with the following code:
#!/usr/bin/env python
import argparse
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('csv_data', type=str, help='Path to csv file')
    parser.add_argument('--skip', type=int, default=7, help='number of first points to skip for the analysis')
    parser.add_argument('--out-dir', type=str, default=None, help='output directory')
    args = parser.parse_args()

    out_dir = args.out_dir
    skip = args.skip

    df = pd.read_csv(args.csv_data, parse_dates=['datetime'])

    Xvars = ['temp', 'dwpt', 'sleep_score']
    Yvar = 'duration'

    X = []
    Y = df[Yvar].to_numpy()[skip:]
    for var in Xvars:
        X.append(df[var].to_numpy()[skip:])

    # Engineered variable: number of runs in the past 2 months
    time = df['datetime'].to_numpy()
    num_runs_2m = []
    for t in time[skip:]:
        t = pd.Timestamp(t)
        n = np.count_nonzero(np.logical_and(time <= t, t - pd.DateOffset(days=60) < time))
        num_runs_2m.append(n)
    X.append(np.array(num_runs_2m))
    Xvars.append('num_runs_2m')

    # Engineered variables: time of the year and time of the day
    X.append(df['datetime'].dt.month.to_numpy()[skip:])
    Xvars.append('month')
    X.append(df['datetime'].dt.hour.to_numpy()[skip:])
    Xvars.append('hour')

    # add quadratic terms (products of pairs of base features, including squares)
    nbase = len(X)
    for i in range(nbase):
        for j in range(i+1):
            X.append(X[i] * X[j])
            Xvars.append(f'{Xvars[i]} * {Xvars[j]}')

    # bias
    X.append(np.ones_like(X[0]))
    Xvars.append('1')

    # max likelihood estimate: solve the normal equations (X^T X) theta = X^T Y
    X = np.array(X, dtype=float).T
    Y = np.array(Y, dtype=float)
    A = X.T @ X
    b = X.T @ Y
    theta = np.linalg.solve(A, b)
    # max likelihood estimate of the noise scale: root mean squared residual
    sigma = np.sqrt(np.mean((Y - X @ theta)**2))

    # test sensitivity of the model: first order Sobol indices
    varx = np.var(X, axis=0)
    sobol_indices = theta**2 * varx
    sobol_indices /= np.sum(sobol_indices)
    print("Feature | First order Sobol index")
    print(":------------------------ | :----------------------")
    for i in reversed(np.argsort(sobol_indices)):
        print(f"{Xvars[i].ljust(26)}| {sobol_indices[i]:.3f}")
    # plot predictions
    Ypred = X @ theta
    tlo = 22
    thi = 26
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.errorbar(Y/60, Ypred/60, yerr=sigma/60, fmt='o', capsize=2)
    ax.plot([tlo, thi], [tlo, thi], '--k')
    ax.set_xlim(tlo, thi)
    ax.set_ylim(tlo, thi)
    ax.set_aspect('equal')
    ax.set_xlabel('run time (min)')
    ax.set_ylabel('predicted run time (min)')
    plt.tight_layout()
    if out_dir:
        plt.savefig(os.path.join(out_dir, 'predictions.svg'))
    else:
        plt.show()
    figsize = (3, 2.2)

    # best temperature?
    Tmin = -10
    Tmax = 40
    temps = np.linspace(Tmin, Tmax, 500)
    mean_duration = []
    std_duration = []
    for T in temps:
        X_ = X.copy()
        X_[:, 0] = T
        # recompute the product features with the modified temperature
        k = nbase
        for i in range(nbase):
            for j in range(i+1):
                X_[:, k] = X_[:, i] * X_[:, j]
                k += 1
        mean_duration.append(np.mean(X_ @ theta))
        std_duration.append(np.sqrt(np.var(X_ @ theta) + sigma**2))
    mean_duration = np.array(mean_duration) / 60
    std_duration = np.array(std_duration) / 60
    id_T_opt = np.argmin(mean_duration)
    T_opt = temps[id_T_opt]
    fig, ax = plt.subplots(figsize=figsize)
    ax.fill_between(temps, mean_duration-std_duration, mean_duration+std_duration, lw=0, alpha=0.1, color='C0')
    ax.plot(temps, mean_duration)
    ax.plot([temps[id_T_opt]], [mean_duration[id_T_opt]], 'o', color='C0', markersize=10)
    ax.set_xlabel('temperature (degrees Celsius)')
    ax.set_ylabel('run time (min)')
    ax.set_xlim(Tmin, Tmax)
    plt.tight_layout()
    if out_dir:
        plt.savefig(os.path.join(out_dir, 'optimal_temp.svg'))
    else:
        plt.show()
    plt.close()
    # best sleep score?
    ssmin = 50
    ssmax = 100
    sleep_scores = np.linspace(ssmin, ssmax, 500)
    mean_duration = []
    std_duration = []
    for ss in sleep_scores:
        X_ = X.copy()
        X_[:, 2] = ss
        k = nbase
        for i in range(nbase):
            for j in range(i+1):
                X_[:, k] = X_[:, i] * X_[:, j]
                k += 1
        mean_duration.append(np.mean(X_ @ theta))
        std_duration.append(np.sqrt(np.var(X_ @ theta) + sigma**2))
    mean_duration = np.array(mean_duration) / 60
    std_duration = np.array(std_duration) / 60
    id_ss_opt = np.argmin(mean_duration)
    ss_opt = sleep_scores[id_ss_opt]
    fig, ax = plt.subplots(figsize=figsize)
    ax.fill_between(sleep_scores, mean_duration-std_duration, mean_duration+std_duration, lw=0, alpha=0.1, color='C0')
    ax.plot(sleep_scores, mean_duration)
    ax.plot([sleep_scores[id_ss_opt]], [mean_duration[id_ss_opt]], 'o', color='C0', markersize=10)
    ax.set_xlabel('sleep score')
    ax.set_ylabel('run time (min)')
    ax.set_xlim(ssmin, ssmax)
    plt.tight_layout()
    if out_dir:
        plt.savefig(os.path.join(out_dir, 'optimal_sleep_score.svg'))
    else:
        plt.show()
    plt.close()
    # best dew point?
    dpmin = -20
    dpmax = 20
    dewpoints = np.linspace(dpmin, dpmax, 500)
    mean_duration = []
    std_duration = []
    for dp in dewpoints:
        X_ = X.copy()
        X_[:, 1] = dp
        k = nbase
        for i in range(nbase):
            for j in range(i+1):
                X_[:, k] = X_[:, i] * X_[:, j]
                k += 1
        mean_duration.append(np.mean(X_ @ theta))
        std_duration.append(np.sqrt(np.var(X_ @ theta) + sigma**2))
    mean_duration = np.array(mean_duration) / 60
    std_duration = np.array(std_duration) / 60
    id_dp_opt = np.argmin(mean_duration)
    dp_opt = dewpoints[id_dp_opt]
    fig, ax = plt.subplots(figsize=figsize)
    ax.fill_between(dewpoints, mean_duration-std_duration, mean_duration+std_duration, lw=0, alpha=0.1, color='C0')
    ax.plot(dewpoints, mean_duration)
    ax.plot([dewpoints[id_dp_opt]], [mean_duration[id_dp_opt]], 'o', color='C0', markersize=10)
    ax.set_xlabel('dew point (degrees Celsius)')
    ax.set_ylabel('run time (min)')
    ax.set_xlim(dpmin, dpmax)
    plt.tight_layout()
    if out_dir:
        plt.savefig(os.path.join(out_dir, 'optimal_dewpoint.svg'))
    else:
        plt.show()
    plt.close()
print(f"Best temperature: {T_opt}")
print(f"Best sleep score: {ss_opt}")
print(f"Best dew point: {dp_opt}")
if __name__ == '__main__':
main()
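A note on usage: the script expects a CSV file with at least the columns datetime, temp, dwpt, sleep_score, and duration (in seconds). Assuming it is saved as stadium_analysis.py (the file and directory names below are just placeholders), it can be run as

python stadium_analysis.py runs.csv --skip 7 --out-dir figures

where --out-dir controls where the SVG figures are saved; without it, the figures are shown interactively instead.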