diff --git a/machine_learning/linear_regression.py b/machine_learning/linear_regression.py
index 5b1e663116cc..fc28ffc4aed4 100644
--- a/machine_learning/linear_regression.py
+++ b/machine_learning/linear_regression.py
@@ -1,11 +1,28 @@
-"""
-Linear regression is the most basic type of regression commonly used for
-predictive analysis. The idea is pretty simple: we have a dataset and we have
-features associated with it. Features should be chosen very cautiously
-as they determine how much our model will be able to make future predictions.
-We try to set the weight of these features, over many iterations, so that they best
-fit our dataset. In this particular code, I had used a CSGO dataset (ADR vs
-Rating). We try to best fit a line through dataset and estimate the parameters.
+"""Linear Regression Implementation.
+
+Linear regression is a fundamental supervised machine learning algorithm used for
+predictive analysis. It models the relationship between a dependent variable (y)
+and one or more independent variables (x) by fitting a linear equation.
+
+Mathematical Foundation:
+    The model assumes: y = θ₀ + θ₁x₁ + θ₂x₂ + ... + θₙxₙ + ε
+    where θ are the parameters (weights) and ε is the error term.
+
+    The cost function (Mean Squared Error) is minimized using gradient descent:
+        J(θ) = (1/2m) * Σ(h(x⁽ⁱ⁾) - y⁽ⁱ⁾)²
+
+    Gradient descent update rule:
+        θⱼ := θⱼ - α * (∂J/∂θⱼ)
+
+Time Complexity:
+    - Training: O(n * m * iterations) where n = features, m = samples
+    - Prediction: O(n) per sample
+
+Space Complexity: O(n * m) for storing the dataset
+
+References:
+    - https://en.wikipedia.org/wiki/Linear_regression
+    - https://en.wikipedia.org/wiki/Gradient_descent
 """
 
 # /// script
@@ -18,12 +35,26 @@
 
 import httpx
 import numpy as np
+from numpy.typing import NDArray
+
+
+def collect_dataset() -> NDArray:
+    """Collect dataset of CSGO player statistics.
+    Fetches a CSV dataset containing ADR (Average Damage per Round) vs Rating
+    of CSGO players from an external source.
 
 
-def collect_dataset():
-    """Collect dataset of CSGO
-    The dataset contains ADR vs Rating of a Player
-    :return : dataset obtained from the link, as matrix
+    Returns:
+        NDArray: A numpy matrix containing the dataset with ADR and Rating values.
+
+    Raises:
+        httpx.TimeoutException: If the request times out after 10 seconds.
+        httpx.HTTPError: If there's an error fetching the dataset.
+
+    Example:
+        >>> dataset = collect_dataset()  # doctest: +SKIP
+        >>> dataset.shape[1] == 2  # doctest: +SKIP
+        True
     """
     response = httpx.get(
         "https://raw.githubusercontent.com/yashLadha/The_Math_of_Intelligence/"
@@ -35,31 +66,47 @@ def collect_dataset():
     for item in lines:
         item = item.split(",")
         data.append(item)
-    data.pop(0)  # This is for removing the labels from the list
+    data.pop(0)  # Remove the header labels
     dataset = np.matrix(data)
     return dataset
 
 
-def run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta):
-    """Run steep gradient descent and updates the Feature vector accordingly_
-    :param data_x : contains the dataset
-    :param data_y : contains the output associated with each data-entry
-    :param len_data : length of the data_
-    :param alpha : Learning rate of the model
-    :param theta : Feature vector (weight's for our model)
-    ;param return : Updated Feature's, using
-    curr_features - alpha_ * gradient(w.r.t. feature)
-    >>> import numpy as np
-    >>> data_x = np.array([[1, 2], [3, 4]])
-    >>> data_y = np.array([5, 6])
-    >>> len_data = len(data_x)
-    >>> alpha = 0.01
-    >>> theta = np.array([0.1, 0.2])
-    >>> run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta)
-    array([0.196, 0.343])
+def run_steep_gradient_descent(
+    data_x: NDArray,
+    data_y: NDArray,
+    len_data: int,
+    alpha: float,
+    theta: NDArray,
+) -> NDArray:
+    """Perform one iteration of gradient descent to update feature weights.
+
+    Gradient descent is an optimization algorithm that iteratively adjusts
+    parameters to minimize the cost function.
+
+    Args:
+        data_x: Input feature matrix of shape (m, n) where m = samples, n = features.
+        data_y: Target values array of shape (m,).
+        len_data: Number of training samples.
+        alpha: Learning rate controlling the step size (typically 0.001 to 0.1).
+        theta: Current weight vector of shape (1, n).
+
+    Returns:
+        NDArray: Updated weight vector after one gradient descent step.
+
+    Time Complexity: O(m * n) for matrix operations.
+    Space Complexity: O(m * n) for intermediate calculations.
+
+    Example:
+        >>> import numpy as np
+        >>> data_x = np.array([[1, 2], [3, 4]])
+        >>> data_y = np.array([5, 6])
+        >>> len_data = len(data_x)
+        >>> alpha = 0.01
+        >>> theta = np.array([0.1, 0.2])
+        >>> run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta)
+        array([0.196, 0.343])
     """
     n = len_data
-
     prod = np.dot(theta, data_x.transpose())
     prod -= data_y.transpose()
     sum_grad = np.dot(prod, data_x)
@@ -67,19 +114,41 @@
     return theta
 
 
-def sum_of_square_error(data_x, data_y, len_data, theta):
-    """Return sum of square error for error calculation
-    :param data_x : contains our dataset
-    :param data_y : contains the output (result vector)
-    :param len_data : len of the dataset
-    :param theta : contains the feature vector
-    :return : sum of square error computed from given feature's
+def sum_of_square_error(
+    data_x: NDArray,
+    data_y: NDArray,
+    len_data: int,
+    theta: NDArray,
+) -> float:
+    """Calculate the Sum of Squared Errors (SSE) for the current model.
+
+    SSE measures how well the model fits the data by computing the sum of
+    squared differences between predicted and actual values.
+
+    Args:
+        data_x: Input feature matrix of shape (m, n).
+        data_y: Actual target values of shape (m,).
+        len_data: Number of data samples.
+        theta: Current weight vector of shape (1, n).
+
+    Returns:
+        float: The mean squared error value (SSE divided by 2m).
+
+    Time Complexity: O(m * n) for prediction and error calculation.
+    Space Complexity: O(m) for storing predictions.
 
     Example:
-    >>> vc_x = np.array([[1.1], [2.1], [3.1]])
-    >>> vc_y = np.array([1.2, 2.2, 3.2])
-    >>> round(sum_of_square_error(vc_x, vc_y, 3, np.array([1])),3)
-    np.float64(0.005)
+        >>> import numpy as np
+        >>> vc_x = np.array([[1.1], [2.1], [3.1]])
+        >>> vc_y = np.array([1.2, 2.2, 3.2])
+        >>> round(sum_of_square_error(vc_x, vc_y, 3, np.array([1])), 3)
+        np.float64(0.005)
+
+        >>> # Test with perfect fit
+        >>> x = np.array([[1], [2], [3]])
+        >>> y = np.array([1, 2, 3])
+        >>> sum_of_square_error(x, y, 3, np.array([1]))
+        np.float64(0.0)
     """
     prod = np.dot(theta, data_x.transpose())
     prod -= data_y.transpose()
@@ -88,18 +157,30 @@
     return error
 
 
-def run_linear_regression(data_x, data_y):
-    """Implement Linear regression over the dataset
-    :param data_x : contains our dataset
-    :param data_y : contains the output (result vector)
-    :return : feature for line of best fit (Feature vector)
+def run_linear_regression(data_x: NDArray, data_y: NDArray) -> NDArray:
+    """Train a linear regression model using gradient descent.
+
+    Iteratively optimizes the weight parameters to minimize the cost function
+    (mean squared error) over the training data.
+
+    Args:
+        data_x: Input feature matrix of shape (m, n).
+        data_y: Target values of shape (m,).
+
+    Returns:
+        NDArray: Optimized weight vector (theta) of shape (1, n).
+
+    Time Complexity: O(iterations * m * n) where default iterations = 100000.
+    Space Complexity: O(m * n) for storing the dataset.
+
+    Note:
+        The learning rate (alpha) is set to 0.0001550 and may need tuning
+        for different datasets.
     """
     iterations = 100000
     alpha = 0.0001550
-
    no_features = data_x.shape[1]
     len_data = data_x.shape[0] - 1
-
     theta = np.zeros((1, no_features))
 
     for i in range(iterations):
@@ -110,25 +191,47 @@
     return theta
 
 
-def mean_absolute_error(predicted_y, original_y):
-    """Return sum of square error for error calculation
-    :param predicted_y : contains the output of prediction (result vector)
-    :param original_y : contains values of expected outcome
-    :return : mean absolute error computed from given feature's
+def mean_absolute_error(predicted_y: list, original_y: list) -> float:
+    """Calculate Mean Absolute Error (MAE) between predicted and actual values.
+
+    MAE is a common metric for regression models that measures the average
+    magnitude of errors without considering direction.
+
+    Args:
+        predicted_y: List of predicted values.
+        original_y: List of actual/expected values.
+
+    Returns:
+        float: The mean absolute error.
+
+    Time Complexity: O(n) where n is the number of samples.
+    Space Complexity: O(1) for accumulator.
 
-    >>> predicted_y = [3, -0.5, 2, 7]
-    >>> original_y = [2.5, 0.0, 2, 8]
-    >>> mean_absolute_error(predicted_y, original_y)
-    0.5
+    Example:
+        >>> predicted_y = [3, -0.5, 2, 7]
+        >>> original_y = [2.5, 0.0, 2, 8]
+        >>> mean_absolute_error(predicted_y, original_y)
+        0.5
+
+        >>> # Test with identical values (perfect prediction)
+        >>> mean_absolute_error([1, 2, 3], [1, 2, 3])
+        0.0
+
+        >>> # Test with negative values
+        >>> mean_absolute_error([-1, -2], [1, 2])
+        3.0
     """
     total = sum(abs(y - predicted_y[i]) for i, y in enumerate(original_y))
    return total / len(original_y)
 
 
-def main():
-    """Driver function"""
-    data = collect_dataset()
+def main() -> None:
+    """Driver function to demonstrate linear regression.
+    Loads the CSGO dataset, trains a linear regression model,
+    and prints the resulting feature vector.
+    """
+    data = collect_dataset()
 
     len_data = data.shape[0]
     data_x = np.c_[np.ones(len_data), data[:, :-1]].astype(float)
     data_y = data[:, -1].astype(float)