This is a self-correcting activity generated by nbgrader. Fill in any place that says YOUR CODE HERE or YOUR ANSWER HERE. Run subsequent cells to check your code.
Household power consumption¶
The goal of this activity is to analyze a time series in order to predict the electric consumption of a home.
It uses a dataset gathering measurements for a house located in Sceaux (France) between December 2006 and November 2010.
Attribute description is as follows:
date: date in format dd/mm/yyyy
time: time in format hh:mm:ss
global_active_power: household global minute-averaged active power (in kilowatt)
global_reactive_power: household global minute-averaged reactive power (in kilowatt)
voltage: minute-averaged voltage (in volt)
global_intensity: household global minute-averaged current intensity (in ampere)
sub_metering_1: energy sub-metering No. 1 (in watt-hour of active energy). It corresponds to the kitchen, containing mainly a dishwasher, an oven and a microwave (hot plates are not electric but gas powered).
sub_metering_2: energy sub-metering No. 2 (in watt-hour of active energy). It corresponds to the laundry room, containing a washing-machine, a tumble-drier, a refrigerator and a light.
sub_metering_3: energy sub-metering No. 3 (in watt-hour of active energy). It corresponds to an electric water-heater and an air-conditioner.
The active energy consumed every minute (in watt-hour) in the household by electrical equipment not measured by sub-meterings 1, 2 and 3 is given by the following formula:
global_active_power*1000/60 - sub_metering_1 - sub_metering_2 - sub_metering_3
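For illustration, once the dataset is loaded (see step 1 below), this remaining consumption could be derived as a new column. The column name Sub_metering_remainder is a hypothetical choice, not part of the original dataset:
# Illustrative sketch: energy not covered by the three sub-meters
# ("Sub_metering_remainder" is a hypothetical column name, assuming df_power
# has been loaded as in step 1)
df_power["Sub_metering_remainder"] = (
    df_power["Global_active_power"] * 1000 / 60
    - df_power["Sub_metering_1"]
    - df_power["Sub_metering_2"]
    - df_power["Sub_metering_3"]
)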
Environment setup¶
import platform
print(f"Python version: {platform.python_version()}")
# Compare version components numerically: a string comparison would fail for Python 3.10+
assert tuple(int(n) for n in platform.python_version_tuple()[:2]) >= (3, 6)
import os # To access locally extracted file
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
print(f"NumPy version: {np.__version__}")
# Setup plots
%matplotlib inline
plt.rcParams["figure.figsize"] = 10, 8
%config InlineBackend.figure_format = 'retina'
sns.set()
import sklearn
print(f"scikit-learn version: {sklearn.__version__}")
assert sklearn.__version__ >= "0.20"
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {tf.keras.__version__}")
from tensorflow.keras.utils import get_file
from tensorflow.keras.losses import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Reshape, Lambda, LSTM
def plot_series(series, y_true, y_pred=None, x_label="$t$", y_label="$x(t)$"):
"""Plot a time series with actual and predicted future values
series: vector of shape (time steps, )
y_true: scalar (if only 1 ahead step) or vector of shape (ahead steps,)
y_pred: scalar (if only 1 ahead step) or vector of shape (ahead steps,)"""
plt.plot(series, ".-", label="Inputs")
n_steps = series.shape[0]
# Calculate the number of steps ahead (= number of future values)
n_steps_ahead = 1
if not np.isscalar(y_true):
n_steps_ahead = y_true.shape[0]
# Plot actual future values
plt.plot(np.arange(n_steps, n_steps + n_steps_ahead), y_true, "ro-", label="Labels")
if y_pred is not None:
# Plot predicted future values
plt.plot(
np.arange(n_steps, n_steps + n_steps_ahead),
y_pred,
"bx-",
label="Predicted",
markersize=10,
)
if x_label:
plt.xlabel(x_label, fontsize=16)
if y_label:
plt.ylabel(y_label, fontsize=16)
plt.legend(fontsize=14)
def plot_loss(history):
"""Plot training loss for a Keras model
Takes a Keras History object as parameter"""
loss = history.history["loss"]
epochs = range(1, len(loss) + 1)
plt.figure(figsize=(10, 5))
plt.plot(epochs, loss, ".--", label="Training loss")
final_loss = loss[-1]
title = "Training loss: {:.4f}".format(final_loss)
plt.ylabel("Loss")
if "val_loss" in history.history:
val_loss = history.history["val_loss"]
plt.plot(epochs, val_loss, "o-", label="Validation loss")
final_val_loss = val_loss[-1]
title += ", Validation loss: {:.4f}".format(final_val_loss)
plt.title(title)
plt.legend()
Step 1: loading the data¶
# Download and extract the dataset
zip_path = get_file(
origin="https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip",
fname="household_power_consumption.zip",
extract=True,
)
file_path, _ = os.path.splitext(zip_path)
file_path += ".txt"
print(f"Dataset extracted at {file_path}")
# Load the dataset into a DataFrame
# - "Date" and "Time" columns are merged into a "Date_time" attribute, which is used as index column
# - Missing values ("nan" and "?") are converted into NumPy NaNs
df_power = pd.read_csv(
file_path,
sep=";",
parse_dates={"Date_time": ["Date", "Time"]},
infer_datetime_format=True,
low_memory=False,
na_values=["?"],
index_col="Date_time",
)
print(f"df_power: {df_power.shape}")
Step 2: exploring the data¶
Use pandas to gain insights about the dataset.
# YOUR CODE HERE
# YOUR CODE HERE
# YOUR CODE HERE
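A possible starting point for this exploration, using standard pandas inspection methods (a sketch, not the official solution):
# Possible exploration sketch
df_power.head(n=10)  # First samples
df_power.info()  # Column types and non-null counts
df_power.describe()  # Summary statistics for each feature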
Step 3: preparing the data¶
# Compute number and percent of missing values among features
def find_missing_values(df):
total_missing = df.isnull().sum()
percent_missing = (total_missing * 100 / df.isnull().count()).sort_values(
ascending=False
)
return pd.concat(
[total_missing, percent_missing], axis=1, keys=["Total", "Percent"]
)
find_missing_values(df_power).head(n=10)
# Show the first samples with missing values
df_power[df_power.isnull().any(axis=1)].head(n=10)
# Fill missing values with mean for all features
def fill_na(df):
n_features = df.shape[1]
for j in range(0, n_features):
df.iloc[:, j] = df.iloc[:, j].fillna(df.iloc[:, j].mean())
return df
df_power = fill_na(df_power)
# Check that there are no remaining missing values
df_power.isnull().sum()
# Resample dataset over hours rather than minutes, to speed up computations
df_power = df_power.resample("h").mean()
print(f"df_power: {df_power.shape}")
df_power.head(n=10)
plot_cols = ["Global_active_power", "Voltage", "Global_intensity"]
# Plot several hourly-resampled features over the whole dataset
df_plotted_cols = df_power[plot_cols]
_ = df_plotted_cols.plot(subplots=True)
# Plot the same features over the first 20 days (480 hours) of the dataset
_ = df_plotted_cols[:480].plot(subplots=True)
Question¶
Split the dataset using (70%, 20%, 10%) ratios.
# Split dataset between training, validation and test sets
# No shuffling to preserve time dependencies
n_samples = len(df_power)
# YOUR CODE HERE
print(f"df_train: {df_train.shape}")
print(f"df_val: {df_val.shape}")
print(f"df_test: {df_test.shape}")
Question¶
Standardize the split sets.
# Standardize the sets using metrics computed on training set
train_mean = df_train.mean()
train_std = df_train.std()
# YOUR CODE HERE
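A minimal sketch for the cell above, reusing the training-set statistics for all three sets:
# Possible solution sketch: standardize with training-set mean and std only
df_train = (df_train - train_mean) / train_std
df_val = (df_val - train_mean) / train_std
df_test = (df_test - train_mean) / train_std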
# Look at the distribution of the features
df_std = (df_power - train_mean) / train_std
df_std = df_std.melt(var_name="Features", value_name="Normalized_values")
plt.figure(figsize=(12, 6))
ax = sns.violinplot(x="Features", y="Normalized_values", data=df_std)
_ = ax.set_xticklabels(df_power.keys(), rotation=90)
# Split a dataset into time windows
# input_width is the number of input time steps
# label_width is the number of predicted time steps
def split_into_windows(dataset, input_width, label_width):
inputs = []
labels = []
start_index = input_width
end_index = len(dataset) - label_width
for i in range(start_index, end_index):
input_indices = range(i - input_width, i)
inputs.append(dataset[input_indices])
label_indices = range(i, i + label_width)
labels.append(dataset[label_indices])
return np.array(inputs), np.array(labels)
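As a quick illustration of the shapes this function produces, applied to a toy univariate series:
# Toy shape check for split_into_windows (illustrative only)
toy_series = np.arange(10).reshape(-1, 1)  # 10 time steps, 1 feature
toy_x, toy_y = split_into_windows(toy_series, input_width=3, label_width=2)
print(toy_x.shape, toy_y.shape)  # (5, 3, 1) (5, 2, 1)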
def plot_features(series, y_true, y_pred=None, title=None):
plot_cols = [0, 2, 3]
fig, axes = plt.subplots(
nrows=len(plot_cols), ncols=1, sharey=True, figsize=(12, 8)
)
if title:
fig.suptitle(title, fontsize=18)
for i, col in enumerate(plot_cols):
plt.sca(axes[i])
plot_series(
series=series[:, col],
y_true=y_true[:, col],
y_pred=y_pred[:, col] if y_pred is not None else None,
x_label="$Time (h)$",
y_label=df_train.columns[col],
)
Question¶
Complete the definition of the train()
function.
def train(model, x_train, y_train, x_val, y_val):
# Train a model using Adam, mean_squared_error for loss and mae for metric
# YOUR CODE HERE
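One possible implementation (a sketch: it relies on the global n_epochs hyperparameter defined in step 4 and uses only standard Keras compile/fit calls):
# Possible solution sketch for the cell above
def train(model, x_train, y_train, x_val, y_val):
    # Train a model using Adam, mean_squared_error for loss and mae for metric
    model.compile(optimizer="adam", loss="mean_squared_error", metrics=["mae"])
    return model.fit(
        x_train,
        y_train,
        epochs=n_epochs,
        validation_data=(x_val, y_val),
        verbose=0,
    )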
Step 4: training models¶
n_features = df_power.shape[1]
# Hyperparameters
n_steps_before = 24
n_steps_ahead = 5
n_epochs = 20
x_train, y_train = split_into_windows(df_train.values, n_steps_before, n_steps_ahead)
x_val, y_val = split_into_windows(df_val.values, n_steps_before, n_steps_ahead)
x_test, y_test = split_into_windows(df_test.values, n_steps_before, n_steps_ahead)
print(f"x_train: {x_train.shape}, y_train: {y_train.shape}")
print(f"x_val: {x_val.shape}, y_val: {y_val.shape}")
print(f"x_test: {x_test.shape}, y_test: {y_test.shape}")
# Plot last validation series
plot_features(x_val[-1], y_val[-1])
Naïve forecasting¶
# Naïve forecast: repeat the last time step of each input series n_steps_ahead times
# (np.tile promotes the (n_steps_ahead, 1) reps to (1, n_steps_ahead, 1) for this 3D array)
y_pred_naive = np.tile(x_val[:, -1:, :], (n_steps_ahead, 1))
print(f"y_pred_naive: {y_pred_naive.shape}")
print(f"Naïve predictor MSE: {np.mean(mean_squared_error(y_val, y_pred_naive)):0.05f}")
# Plot forecasting for last validation series
# YOUR CODE HERE
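A possible way to fill in this cell, reusing the plot_features() helper defined above:
# Possible sketch: naïve forecast for the last validation series
plot_features(x_val[-1], y_val[-1], y_pred_naive[-1], title="Naïve forecasting")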
Dense network¶
dense_model = Sequential(
[
# Take the last time-step.
# Shape [batch, time, features] => [batch, 1, features]
Lambda(lambda x: x[:, -1:, :]),
# Shape => [batch, 1, dense_units]
Dense(units=512, activation="relu"),
# Shape => [batch, n_steps_ahead*n_features]
Dense(
units=n_steps_ahead * n_features, kernel_initializer=tf.initializers.zeros()
),
# Shape => [batch, n_steps_ahead, n_features]
Reshape([n_steps_ahead, n_features]),
]
)
history = train(dense_model, x_train, y_train, x_val, y_val)
plot_loss(history)
y_pred_dense = dense_model.predict(x_val)
print(f"Dense network MSE: {np.mean(mean_squared_error(y_val, y_pred_dense)):0.05f}")
# Plot forecasting for last validation series
# YOUR CODE HERE
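The same helper can plot the dense network's forecast (a sketch mirroring the naïve case):
# Possible sketch: dense network forecast for the last validation series
plot_features(x_val[-1], y_val[-1], y_pred_dense[-1], title="Dense network")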
Recurrent network¶
Using the architecture of your choice, define a recurrent neural network able to beat the dense model.
# YOUR CODE HERE
# YOUR CODE HERE
# YOUR CODE HERE
# YOUR CODE HERE
# YOUR CODE HERE
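One possible solution sketch, following the same pattern as the dense model. The single 64-unit LSTM layer is an illustrative assumption, not a reference architecture:
# Possible solution sketch: one LSTM layer, then a projection to the output shape
recurrent_model = Sequential(
    [
        # Shape [batch, time, features] => [batch, lstm_units]
        LSTM(units=64, input_shape=(n_steps_before, n_features)),
        # Shape => [batch, n_steps_ahead*n_features]
        Dense(units=n_steps_ahead * n_features),
        # Shape => [batch, n_steps_ahead, n_features]
        Reshape([n_steps_ahead, n_features]),
    ]
)
history = train(recurrent_model, x_train, y_train, x_val, y_val)
plot_loss(history)
y_pred_recurrent = recurrent_model.predict(x_val)
print(
    f"Recurrent network MSE: {np.mean(mean_squared_error(y_val, y_pred_recurrent)):0.05f}"
)
# Plot forecasting for last validation series
plot_features(x_val[-1], y_val[-1], y_pred_recurrent[-1], title="Recurrent network")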