Unverified commit 74f5a4fb authored by Starfleet-Command, committed by GitHub

add commented files

parent e2fd3fa9
@@ -5,12 +5,19 @@ import numpy as np
from multiprocessing import Process, Queue
import math
# Obtains the symmetric MAPE between a NumPy array of the predicted results and the actual results.
def getSmape(actual, predicted):
    # EPSILON guards against division by zero when actual and predicted are both 0
    EPSILON = 1e-10
    return np.mean(2.0 * np.abs(actual - predicted) / ((np.abs(actual) + np.abs(predicted)) + EPSILON))
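# Worked example with made-up numbers: actual = [100., 50.] and predicted = [110., 45.]
# give np.mean([2*10/210, 2*5/95]) ≈ 0.10, i.e. roughly a 10% symmetric error.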
# Same function as in regs.py: creates the linear regression model with the given X and Y
# Parameter Variables
# X: Data with only causal variable columns.
# Y: Column to predict
def linear_reg(X, Y):
    # Fit the linear regression model and return it
@@ -18,16 +25,29 @@ def linear_reg(X, Y):
    model_k.fit(X, Y)
    return model_k
# In this program, regression model creation and prediction are separated into two steps
def predict(model, X):
    return model.predict(X)
# Main function. Uses the best model per sku-store combination, calculated with regs.py, to create a regression model and predictions.
# Parameter Variables
# regs: the dataframe representing the csv obtained with regs.py
# data: the dataframe representing the sales data with all causal variables
# progress_counter: the counter variable, used to accurately track the progress of each subprocess
# features: the per-model feature lists parsed from the regs csv
# concat_queue/smape_queue: the Queue assigned to each 'pipeline'; needed so each subprocess can report its results back to main
def loopPreds(regs, data, progress_counter, features, concat_queue, smape_queue):
    concats = []
    smape = []
    # CONCAT is the name of our column for the sku-store combinations. If the name is different, replace all
    # references to CONCAT with the name of your column.
    for i in data.CONCAT.unique():
        # We want only the data that matches the current sku-store combo
        dataSubset = data.loc[data['CONCAT'] == i]
        regsSubset = regs.loc[regs['CONCAT'] == i]
@@ -36,9 +56,14 @@ def loopPreds(regs, data, progress_counter, features, concat_queue, smape_queue)
        tempPredict = []
        tempReal = []
        # If there are more columns that are not causal, or they have different names, add/change them here
        Trim_X = dataSubset.drop(columns=["QTY", "DMDUNIT", "DMDGROUP",
                                          "LOC", "STARTDATE", "CONCAT"], axis=1)
        # Obtain a further subset of dataSubset that only includes the features in the model.
        # A small problem: this is 'hardcoded' and does not scale well; we did not find a better way.
        # The issue was that a 'reference' to the column object is needed, not simply its name. If a
        # better way is found, please feel free to replace this (see the sketch after the if/elif chain).
        if len(features[0]) == 1:
            X = Trim_X[[features[0][0]]]
@@ -52,20 +77,32 @@ def loopPreds(regs, data, progress_counter, features, concat_queue, smape_queue)
            X = Trim_X[[features[0][0], features[0][1],
                        features[0][2], features[0][3]]]
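        # A possible scalable alternative (untested sketch; assumes every name in
        # features[0] is a column of Trim_X). pandas accepts a list of column names
        # directly, so the whole if/elif ladder could collapse to one line:
        # X = Trim_X[list(features[0])]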
        # Data column to predict
        Y = dataSubset.QTY
        # Obtain the regression model using the best variables
        model = linear_reg(X, Y)
        # Periodically report this subprocess's progress (out of its own share of combos)
        progress_counter = progress_counter + 1
        if progress_counter % 100 == 0:
            print("Progress:" + str(progress_counter) +
                  "/" + str(len(data.CONCAT.unique())))
        # Make predictions for each entry in the data (we assume a sku-store combo never has two entries with the same date)
        for r in dataSubset.STARTDATE.unique():
            # Obtain the row that corresponds to this date
            entry = dataSubset.loc[dataSubset["STARTDATE"] == r]
            Trim_X = entry.drop(columns=["QTY", "DMDUNIT", "DMDGROUP",
                                         "LOC", "STARTDATE", "CONCAT"], axis=1)
            # Obtain a further subset of entry that only includes the features in the model.
            # As above, this is 'hardcoded' and does not scale well; feel free to replace it.
            if len(features[0]) == 1:
                X = Trim_X[[features[0][0]]]
@@ -79,22 +116,28 @@ def loopPreds(regs, data, progress_counter, features, concat_queue, smape_queue)
                X = Trim_X[[features[0][0], features[0][1],
                            features[0][2], features[0][3]]]
            # Create the prediction and obtain the entry's real value
            predicted = predict(model, X)
            real = entry.QTY
            tempPredict.append(predicted)
            tempReal.append(real)
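        # A vectorized alternative sketch for the loop above (assumes STARTDATE is unique
        # per combo, so row order is preserved): predict every row in one call, e.g.
        # tempPredict = predict(model, Trim_X_all)  # Trim_X_all: hypothetical name for the
        # feature columns of the whole dataSubset, selected as above
        # tempReal = dataSubset.QTY.to_numpy()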
        # Obtain the symmetric MAPE for each sku-store combo
        concat_smape = getSmape(np.array(tempReal), np.array(tempPredict))
        smape.append(concat_smape)
    # When all combos are finished, report the concats and symmetric MAPE for all given combinations back to main
    concat_queue.put(concats)
    smape_queue.put(smape)
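    # For context: on the receiving end, main would typically drain each queue once per
    # subprocess (hedged sketch only; the actual collection code is elided from this diff):
    # all_concats, all_smape = [], []
    # for _ in range(processes):
    #     all_concats.extend(concat_queue.get())
    #     all_smape.extend(smape_queue.get())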
if __name__ == "__main__":
    # Load the per-combo model information created with regs.py
    regs = pd.read_csv("D:/DeepStorage/Regresiones.csv")
    # Load the data. If doing the 80/20 split, this should be the 20% not used in regs.py
    data = pd.read_csv("D:/DeepStorage/Fixed_Full_FilteredSalesData.csv")
    data = data.dropna()
@@ -104,13 +147,19 @@ if __name__ == "__main__":
    process_list = []
    progress_counter = 0
    features = regs["features"].apply(eval)
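    # Safer alternative sketch: ast.literal_eval parses list literals without executing
    # arbitrary code (assumes the "features" column holds plain Python list strings):
    # import ast
    # features = regs["features"].apply(ast.literal_eval)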
    # Number of subprocesses to create. The number of sku-store combos should ideally be divisible
    # by this, and it should not exceed the number of logical processors on the machine running this.
    processes = 8
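    # A portable alternative sketch, capping at the machine's logical core count:
    # from multiprocessing import cpu_count
    # processes = min(8, cpu_count())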
    unique_concats = data.CONCAT.unique()
    chunk = math.floor(len(unique_concats) / processes)
    # Create the queues that the subprocesses will use to send their results back to main
    concat_queue = Queue()
    smape_queue = Queue()
    # Divide the data so that each subprocess only receives the entries whose concats fall within
    # its chunk, then start the subprocess.
    for i in range(0, processes):
        conditions = data['CONCAT'].isin(unique_concats[i*chunk:(i+1)*chunk])
        data_subset = data.loc[conditions]
@@ -118,6 +167,8 @@ if __name__ == "__main__":
            p = Process(target=loopPreds, args=(
                regs, data_subset, progress_counter, features, concat_queue, smape_queue))
        # If it is the last chunk, take every remaining concat (everything from i*chunk to the end
        # of the list), so all concats are included even if the number of sku-store combos is not
        # divisible by the number of processes.
        else:
            conditions = data['CONCAT'].isin(unique_concats[i*chunk:])
@@ -140,6 +191,8 @@ if __name__ == "__main__":
    regInfo = pd.DataFrame(dictio)
    print(regInfo)
    # Output the SMAPE for each sku-store combination to the given csv
    regInfo.to_csv("D:/DeepStorage/RealvsPredict.csv", index=False)
    pass
@@ -4,9 +4,13 @@ from sklearn.metrics import mean_squared_error
import numpy as np
import itertools
# Fit a linear regression model and return its RSS and R squared values
# Parameter Variables
# X: Data with only causal variable columns.
# Y: Column to predict
def fit_linear_reg(X, Y):
    model_k = linear_model.LinearRegression(fit_intercept=True)
    model_k.fit(X, Y)
    # RSS = mean squared error times n, i.e. the residual sum of squares
    RSS = mean_squared_error(Y, model_k.predict(X)) * len(Y)
@@ -14,38 +18,57 @@ def fit_linear_reg(X, Y):
    return RSS, R_squared
# Applies forward stepwise regression and returns the best model and its R2 per combination
# Parameter Variables
# Y: Data column to predict
# X: Causal variable data columns
# k: Maximum number of variables to be used in a model
# UseR2/UseRSS: Which error metric(s) to use when checking whether the model has improved
def best_subset(data, X, Y, k, UseR2=False, UseRSS=True):
    # List all the causal variables
    remaining_features = list(X.columns.values)
    features = []
    # Initialize the residuals lists. Index 0 is a placeholder because the loop below is 1-indexed.
    RSS_list, R_squared_list = [np.inf], [np.inf]
    features_list = dict()
    for i in range(1, k + 1):
        best_RSS = np.inf
        best_R_squared = 0
        # Try adding each remaining feature, one at a time, on top of the features chosen so far
        for combo in itertools.combinations(remaining_features, 1):
            RSS, rSquared = fit_linear_reg(X[list(combo) + features], Y)
            # For a fixed number of features, the lowest RSS is equivalent to the best R2.
            # This branch is used when UseRSS is True and UseR2 is False.
            if UseRSS and not UseR2:
                # Update our best scores and features if the candidate improves on them
                if RSS < best_RSS:
                    best_RSS = RSS
                    best_R_squared = rSquared
                    best_feature = combo[0]
            # Use the best R2 only. This branch is used when UseR2 is True and UseRSS is False.
            elif UseR2 and not UseRSS:
                if rSquared > best_R_squared:
                    best_RSS = RSS
                    best_R_squared = rSquared
                    best_feature = combo[0]
            # If both flags are True, attempt to use both criteria. Not recommended.
            else:
                if RSS < best_RSS or rSquared > best_R_squared:
                    best_RSS = RSS
                    best_R_squared = rSquared
                    best_feature = combo[0]
        # Update variables for the next iteration. best_feature is added permanently.
        features.append(best_feature)
        remaining_features.remove(best_feature)
        # Grow the lists with the results obtained in each iteration
        RSS_list.append(best_RSS)
        R_squared_list.append(best_R_squared)
        features_list[i] = features.copy()
@@ -58,13 +81,26 @@ def best_subset(data):
        best_r = R_squared_list[i]
        best_r_index = i
    # If there is not enough data to obtain a reliable prediction/R2, take the model with the most
    # variables, as those tend to be the most accurate.
    if best_r_index == 0:
        return features_list[k], R_squared_list[k]
    else:
        return features_list[best_r_index], R_squared_list[best_r_index]
if __name__ == "__main__":
    # Steps:
    # 1. Load the data in csv format
    # 2. Check the names/number of causal variables and modify Y, X, and k accordingly
    # 3. Wait
    # 4. Wait some more
    # 5. All information will be written to the given path/csv
    # 6. That csv is the one loaded by the next step of the program, predict.py
    # To create the 80/20 split, the data input here should be 80% of all data, and the data loaded
    # in predict.py should be the remaining 20% (a hedged split sketch follows).
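    # A minimal split sketch (assumptions: scikit-learn is available, data has already been
    # loaded as below, and each sku-store combo should land entirely in one half, hence
    # grouping by CONCAT):
    # from sklearn.model_selection import GroupShuffleSplit
    # gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    # train_idx, test_idx = next(gss.split(data, groups=data["CONCAT"]))
    # data.iloc[train_idx] would feed regs.py; data.iloc[test_idx] would feed predict.py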
    # Load the data
    data = pd.read_csv("D:/DeepStorage/Fixed_Full_FilteredSalesData.csv")
    data = data.dropna()
    print("data read")
@@ -74,13 +110,24 @@ if __name__ == "__main__":
    r_squared = []
    progress_counter = 0
    # CONCAT is the name of our column for the sku-store combinations. If the name is different, replace all
    # references to CONCAT with the name of your column.
    for i in data.CONCAT.unique():
        dataSubset = data.loc[data['CONCAT'] == i]
        Y = dataSubset.QTY
        X = dataSubset.drop(columns=["QTY", "DMDUNIT", "DMDGROUP",
                                     "LOC", "STARTDATE", "CONCAT"], axis=1)
        k = 4
        temp_features, temp_Rsquared = best_subset(
            dataSubset, X, Y, k, False, True)
        concats.append(i)
        features.append(temp_features)
        r_squared.append(temp_Rsquared)
        # Periodically report the progress of the program
        progress_counter = progress_counter + 1
        if progress_counter % 250 == 0:
            print("Progress:" + str(progress_counter) +
@@ -91,6 +138,7 @@ if __name__ == "__main__":
    regInfo = pd.DataFrame(dictio)
    print(regInfo)
    # Write all data to a csv at the specified path
    regInfo.to_csv("D:/DeepStorage/Regresiones.csv", index=False)
    pass