ENT306
Project
Logistic regression
# Install packages
import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install scikit-learn
import numpy as np
import matplotlib.pyplot as plt
from time import time
import random
import sklearn
1. Device Type
Choose a device type (Lights, Camera, Smart Speaker, Thermostat, …) and plot the data.
#import kagglehub #if from kagglehub
#path = kagglehub.dataset_download("rabieelkharoua/predict-smart-home-device-efficiency-dataset")
from csv import reader

filename = "smart_home_device_usage_data.csv"
# open the file and load all rows
with open(filename, mode='r') as file:
    data = list(reader(file))  # convert to a list so that all lines stay accessible
n = len(data) - 1  # number of data lines (the first row is the header)
DeviceType = '...'
count = sum(1 for row in data if len(row) > 1 and row[1] == DeviceType)
print(f"Number of occurrences of the chosen device in column 2: {count}")
xi_list = np.zeros((count, 3))  # three features
yi_list = np.zeros(count)       # labels
i = 0
for row in data[1:]:
    if row[1] == DeviceType:
        yi_list[i] = row[7]     # label: 0 - Inefficient, 1 - Efficient
        xi_list[i, 0] = row[2]  # UsageHoursPerDay
        xi_list[i, 1] = row[3]  # EnergyConsumption
        xi_list[i, 2] = row[6]  # DeviceAgeMonths
        i += 1
assert i == count
n = i  # number of data points = count
# normalize data (features scaled to [0, 1]):
min_values = np.array([0, 0, 0])     # feature minima
max_values = np.array([25, 10, 60])  # feature maxima
x_normalized = (xi_list - min_values) / (max_values - min_values)
xi_list = x_normalized
#print(np.shape(x_normalized))
Number of occurrences of the chosen device in column 2: 1039
We are given a family of $n$ points $x_i=(x^0_i,x^1_i,x^2_i)$ in $\mathbb{R}^3$, along with associated $\textit{labels}$ $(y_i)_{1 \leq i \leq n}$ in $\{0,1\}$.
In previous work, we aimed to establish a relationship between $x$ and $y$ in the form:
- if $\sigma(\langle w, x\rangle) \geq 0.5$, then $y=1$,
- else $y=0$,
where $w = (w^0, w^1, w^2)\in \mathbb{R}^3$ was the unknown of the problem and $\sigma$ was the sigmoid function $z \mapsto \tfrac{1}{1+e^{-z}}$. Then, for a new unlabeled point $x$, we computed $\sigma(\langle w,x\rangle) \in [0,1]$, allowing us to classify it as either $y=0$ or $y=1$ while incorporating a measure of uncertainty in the prediction.
To simplify the model, we assumed that the data were centered at $0$ and set the bias term $b=0$: we only had to minimize the log-loss function with respect to the weights.
In general, a bias term $b$ is required, meaning that we introduce an additional parameter $b \in \mathbb{R}$ and seek a relationship of the form:
- if $\sigma(\langle w, x\rangle + b) \geq 0.5$, then $y=1$,
- else $y=0$,
since there is no reason to assume that the separating hyperplane should pass through the origin.
However, we can eliminate the bias term by increasing the dimensionality from $d$ to $d + 1$. In our case, this means increasing the dimensions of both $x$ and $w$ from $3$ to $4$: we redefine $$x=(x^0,x^1,x^2,1) \textrm{ and } w=(w^0,w^1,w^2,b).$$ This transformation allows us to maintain the same matrix notation as before in the optimization step.
The log-loss function without a penalization term is thus given by: \begin{equation*} f(w)=-\frac{1}{n} \sum_{i=1}^n \left( y_i \log \big(\sigma(\langle w,x_i \rangle)\big)+ (1-y_i)\log \big(1-\sigma(\langle w,x_i \rangle)\big) \right). \end{equation*}
Data plot
# data plot
from mpl_toolkits.mplot3d import Axes3D

xi_list_bias = np.zeros((n, 4))  # increase the dimension from 3 to 4
for i in range(n):  # redefine the data x as x = (x0, x1, x2, 1)
    xi_list_bias[i, 0:3] = ...
    xi_list_bias[i, 3] = ...
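    # One possible fill, following the redefinition x = (x0, x1, x2, 1) above:
    #   xi_list_bias[i, 0:3] = xi_list[i, :]
    #   xi_list_bias[i, 3] = 1.0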
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for i in range(n//2):  # note: only the first half of the points is plotted here
    if yi_list[i] == 1:  # class-1 points in red: 1 - Efficient
        ax.scatter(xi_list[i, 0], xi_list[i, 1], xi_list[i, 2], color="red", marker=".")
    else:  # class-0 points in green: 0 - Inefficient
        ax.scatter(xi_list[i, 0], xi_list[i, 1], xi_list[i, 2], color="green", marker=".")
elevation_angle = 60
azimuthal_angle = 15
ax.view_init(elevation_angle, azimuthal_angle)
ax.set_xlabel("X axis, usage hours/day")
ax.set_ylabel("Y axis, energy consumption")
ax.set_zlabel("Z axis, device age (months)")
plt.title(u"Data representation.\n" + r"$x_i$ is a green point if $y_i = 0$, red point if $y_i = 1$." + u"\n" + r"0 - Inefficient, 1 - Efficient")
#plt.axis("tight")
plt.show()
2. Log-loss function
Define a log-loss function with an L2 penalty term or with a barrier function.
...
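A minimal sketch of one possible answer, choosing the L2 option: the penalty coefficient lam below is an assumption, and xi_list_bias, yi_list are assumed to be the augmented data and labels built in the plotting cell above.
lam = 0.01  # hypothetical penalty strength

def f_sketch(w):
    # penalized log-loss: mean cross-entropy + lam * ||w||^2
    p = 1.0 / (1.0 + np.exp(-(xi_list_bias @ w)))
    p = np.clip(p, 1e-12, 1 - 1e-12)  # guard against log(0)
    return -np.mean(yi_list * np.log(p) + (1 - yi_list) * np.log(1 - p)) + lam * np.dot(w, w)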
3. Gradient-descent algorithm
Write a gradient-descent algorithm with the Armijo or the Wolfe-Armijo rule.
# gradient
...
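# A possible gradient, matching the penalized sketch above (lam as defined there):
#   grad f(w) = (1/n) X^T (sigma(Xw) - y) + 2 * lam * w
def gradf_sketch(w):
    p = 1.0 / (1.0 + np.exp(-(xi_list_bias @ w)))
    return xi_list_bias.T @ (p - yi_list) / n + 2 * lam * w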
w = np.random.rand(4)
epsilon = 1e-8
# compare the analytic gradient with forward finite differences in each direction
fd = np.array([(f(w + epsilon * np.eye(4)[k]) - f(w)) / epsilon for k in range(4)])
print("This quantity must be of order 1e-8 : ", np.linalg.norm(gradf(w) - fd))
This quantity must be of order 1e-8 : 3.603214732619758e-08
# Gradient descent
...
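A minimal sketch of the descent loop with Armijo backtracking; the constants alpha0, c1, rho, the tolerance, and the iteration cap are assumptions, not values prescribed by the project.
def gradient_descent_armijo(f, gradf, w0, alpha0=1.0, c1=1e-4, rho=0.5, tol=1e-6, max_iter=5000):
    w = w0.copy()
    wlist = [w.copy()]
    for _ in range(max_iter):
        g = gradf(w)
        if np.linalg.norm(g) < tol:  # stop when the gradient is small
            break
        alpha = alpha0
        # Armijo rule: shrink the step until a sufficient decrease holds
        while f(w - alpha * g) > f(w) - c1 * alpha * np.dot(g, g) and alpha > 1e-12:
            alpha *= rho
        w = w - alpha * g
        wlist.append(w.copy())
    return wlist
The timing cell below would then call it with, e.g., xlist_approx = gradient_descent_armijo(f, gradf, x0).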
Best weights/bias
%%time
x0 = np.array([1.,1.,1.,1.])
...
xlist_approx = ...
print(len(xlist_approx))
print(xlist_approx[-1])
Visualization: separating hyperplane
## P(y=1|x)
def opt_f(wstar, z):
    return ...
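# A minimal sketch (hypothetical), assuming wstar = (w0, w1, w2, b) stacks the
# weights and the bias as in the optimization above: P(y=1|x) = sigma(<w, z> + b)
def opt_f_sketch(wstar, z):
    return 1.0 / (1.0 + np.exp(-(np.dot(wstar[:3], z) + wstar[3])))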
## Display the separating hyperplane from wstar, bstar (optimal weights/bias)
dom_points = [0, 1, 0, 1, 0, 1]
grid_size = 50
# 3D grid
x = np.linspace(dom_points[0], dom_points[1], grid_size)
y = np.linspace(dom_points[2], dom_points[3], grid_size)
z = np.linspace(dom_points[4], dom_points[5], grid_size)
X, Y, Z = np.meshgrid(x, y, z)
## plot the separating hyperplane
X2, Y2 = np.meshgrid(x, y)
if xlist_approx[-1][2] != 0:
    Z2 = (-xlist_approx[-1][0] * X2 - xlist_approx[-1][1] * Y2 - xlist_approx[-1][3]) / xlist_approx[-1][2]
    mask = (Z2 >= 0) & (Z2 <= 1)
    Z2[~mask] = np.nan
# Compute P(y=1|x) on the grid
P = np.zeros((grid_size, grid_size, grid_size))
for i in range(grid_size):
    for j in range(grid_size):
        for k in range(grid_size):
            P[i, j, k] = ...
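            # One possible fill (hypothetical), evaluating the model at each grid point:
            #   P[i, j, k] = opt_f(xlist_approx[-1], [X[i, j, k], Y[i, j, k], Z[i, j, k]])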
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X2, Y2, Z2, color='cyan')  # separating hyperplane
scatter = ax.scatter(X.flatten(), Y.flatten(), Z.flatten(), c=P.flatten(), cmap='viridis', s=0.2, alpha=0.3)  # change opacity with alpha
cbar = plt.colorbar(scatter, ax=ax, shrink=0.5)
cbar.set_label("P(y=1|x)")
for i in range(n):
    if yi_list[i] == 1:
        ax.scatter(xi_list[i, 0], xi_list[i, 1], xi_list[i, 2], color="red", label="y=1" if i == 0 else "")
    else:
        ax.scatter(xi_list[i, 0], xi_list[i, 1], xi_list[i, 2], color="green", label="y=0" if i == 0 else "")
elevation_angle = 70  # 90
azimuthal_angle = 0   # 35
ax.view_init(elevation_angle, azimuthal_angle)
ax.set_xlabel("X axis, usage hours/day")
ax.set_ylabel("Y axis, energy consumption")
ax.set_zlabel("Z axis, device age (months)")
ax.set_title(r"Data separation, green points y=0, red points y=1")
ax.set_aspect("equal", adjustable="box")  # so that the axes share the same scale
plt.show()
4. Multi-layer perceptron
Compare the results you obtained with a multi-layer perceptron (no hidden layer, one hidden layer, and five hidden layers, respectively).
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss
############
## No hidden layer
mlp_no_hidden = ...  # for comparison, use activation="logistic"
...
log_loss_no_hidden = ...
print(f"Log-loss (no hidden layer): {log_loss_no_hidden:.4f}")
weights_no_hidden = ...
bias_no_hidden = ...
print(weights_no_hidden.transpose(), " ", bias_no_hidden)
## Compare to our model:
print(f"Log-loss (our model): {...:.3f}")
print(xlist_approx[-1])
############
## One hidden layer, 3 neurons, activation="logistic"
mlp_one_hidden = ...
...
log_loss_one_hidden = ...
print(f"Log-loss (one hidden layer): {log_loss_one_hidden:.4f}")
weights_one_hidden = ...
bias_one_hidden = ...
############
## Five hidden layers, 9 neurons each, activation="relu"
mlp_five_hidden = ...
...
log_loss_five_hidden = ...
print(f"Log-loss (five hidden layers): {log_loss_five_hidden:.4f}")
5. Visualization
With one hidden layer and three neurons, compute $P(y=1|x)$ in a function opt_f and visualize the results. You should recover the results of mlp_one_hidden.predict_proba.
## P(y=1|x)
def opt_f(wstar, bstar, z):
    W1 = wstar[:9].reshape(3, 3).transpose()  # hidden-layer weights (3 neurons, 3 inputs)
    W2 = wstar[9:].reshape(1, 3)              # output weights (1 neuron, 3 hidden inputs)
    B1 = np.array([bstar[0], bstar[1], bstar[2]]).reshape(3, 1)
    B2 = bstar[3]
    z = np.array(z).reshape(3, 1)
    H = ...  # hidden layer
    P = ...
    return P  # careful: must return a float
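# A minimal sketch of the forward pass (hypothetical), assuming a logistic
# activation in the hidden layer; for binary classification, scikit-learn
# applies a logistic activation on the output layer in any case:
def opt_f_sketch_mlp(wstar, bstar, z):
    W1 = wstar[:9].reshape(3, 3).transpose()
    W2 = wstar[9:].reshape(1, 3)
    B1 = np.array(bstar[:3]).reshape(3, 1)
    B2 = bstar[3]
    z = np.array(z).reshape(3, 1)
    H = 1.0 / (1.0 + np.exp(-(W1 @ z + B1)))  # hidden layer, logistic activation
    P = 1.0 / (1.0 + np.exp(-(W2 @ H + B2)))  # output layer, logistic activation
    return float(P[0, 0])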
## Visualization with opt_f
dom_points = [0, 1, 0, 1, 0, 1]
#dom_points = [0, np.max(xi_list[:, 0]), 0, np.max(xi_list[:, 1]), 0, np.max(xi_list[:, 2])]
grid_size = 50
# generate the 3D grid
x = np.linspace(dom_points[0], dom_points[1], grid_size)
y = np.linspace(dom_points[2], dom_points[3], grid_size)
z = np.linspace(dom_points[4], dom_points[5], grid_size)
X, Y, Z = np.meshgrid(x, y, z)
P = np.zeros((grid_size, grid_size, grid_size))
w_opt = np.concatenate((weights_one_hidden[0].flatten(), weights_one_hidden[1].flatten()))
print(w_opt)
b_opt = np.concatenate((bias_one_hidden[0].flatten(), bias_one_hidden[1].flatten()))
print(b_opt)
for i in range(grid_size):
    for j in range(grid_size):
        for k in range(grid_size):
            P[i, j, k] = opt_f(...)
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(X.flatten(), Y.flatten(), Z.flatten(), c=P.flatten(), cmap='viridis', s=0.2, alpha=1)
# colorbar
cbar = plt.colorbar(scatter, ax=ax, shrink=0.5)
cbar.set_label("P(y=1|x)")
for i in range(n):
    if yi_list[i] == 1:
        ax.scatter(xi_list[i, 0], xi_list[i, 1], xi_list[i, 2], color="red", label="y=1" if i == 0 else "")
    else:
        ax.scatter(xi_list[i, 0], xi_list[i, 1], xi_list[i, 2], color="green", label="y=0" if i == 0 else "")
# legends and labels
ax.set_xlabel("X axis, usage hours/day")
ax.set_ylabel("Y axis, energy consumption")
ax.set_zlabel("Z axis, device age (months)")
ax.set_title(r"Data separation, green points y=0, red points y=1")
ax.set_aspect("equal", adjustable="box")  # so that the axes share the same scale
plt.show()
## Visualization with mlp_one_hidden
dom_points = [0, 1, 0, 1, 0, 1]
grid_size = 50
x = np.linspace(dom_points[0], dom_points[1], grid_size)
y = np.linspace(dom_points[2], dom_points[3], grid_size)
z = np.linspace(dom_points[4], dom_points[5], grid_size)
X, Y, Z = np.meshgrid(x, y, z)
P = np.zeros((grid_size, grid_size,grid_size))
for i in range(grid_size):
    for j in range(grid_size):
        for k in range(grid_size):
            P[i, j, k] = ...  # with mlp_one_hidden
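            # One possible fill (hypothetical): predict_proba expects a 2D array
            # and returns the columns [P(y=0|x), P(y=1|x)]:
            #   point = np.array([[X[i, j, k], Y[i, j, k], Z[i, j, k]]])
            #   P[i, j, k] = mlp_one_hidden.predict_proba(point)[0, 1]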
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(X.flatten(), Y.flatten(), Z.flatten(), c=P.flatten(), cmap='viridis', s=0.2, alpha=1)
cbar = plt.colorbar(scatter, ax=ax, shrink=0.5)
cbar.set_label("P(y=1|x)")
for i in range(n):
    if yi_list[i] == 1:
        ax.scatter(xi_list[i, 0], xi_list[i, 1], xi_list[i, 2], color="red", label="y=1" if i == 0 else "")
    else:
        ax.scatter(xi_list[i, 0], xi_list[i, 1], xi_list[i, 2], color="green", label="y=0" if i == 0 else "")
ax.set_xlabel("X axis, usage hours/day")
ax.set_ylabel("Y axis, energy consumption")
ax.set_zlabel("Z axis, device age (months)")
ax.set_title(r"Data separation, green points y=0, red points y=1")
ax.set_aspect("equal", adjustable="box")
plt.show()
Conclusion: in a PDF or with comments
How do the parameters seem to influence the classification?
Bonus: What would you recommend to avoid overfitting? How could we be more confident in our model?