Samantha Gortarez / SIGMA_Ago2020 · Commits

Commit 74f5a4fb (unverified)
Authored Nov 04, 2020 by Starfleet-Command; committed by GitHub, Nov 04, 2020

    add commented files

Parent: e2fd3fa9
Changes: 2 files

predict.py
```diff
@@ -5,12 +5,19 @@ import numpy as np
 from multiprocessing import Process, Queue
 import math

+# obtains the symmetric MAPE between a NumPy array of the predicted results and the actual results
 def getSmape(actual, predicted):
     EPSILON = 1e-10
     return np.mean(2.0 * np.abs(actual - predicted) / ((np.abs(actual) + np.abs(predicted)) + EPSILON))

+# Same function as regs.py, creates the linear regression model with given X and Y
+# Parameter Variables
+# X: Data with only causal variable columns.
+# Y: Column to predict
 def linear_reg(X, Y):
+    # Fit linear regression model and return prediction
```
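A quick sanity check of the SMAPE formula above (standalone, with invented numbers): the EPSILON term only matters when actual and predicted are both zero, where it turns the 0/0 case into 0.

```python
import numpy as np

actual = np.array([100.0, 80.0, 0.0])      # invented values, for illustration only
predicted = np.array([110.0, 80.0, 0.0])

EPSILON = 1e-10
smape = np.mean(2.0 * np.abs(actual - predicted) / ((np.abs(actual) + np.abs(predicted)) + EPSILON))
# per entry: 2*10/210 = 0.0952, 0.0, 0.0 (EPSILON rescues the 0/0 case), so the mean is about 0.0317
print(smape)
```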
```diff
@@ -18,16 +25,29 @@ def linear_reg(X, Y):
     model_k.fit(X, Y)
     return model_k

+# In this program the regression model creation and prediction are separated
 def predict(model, X):
     return model.predict(X)

+# Main function that uses the best model per sku-store combination, calculated with regs.py, to create a regression model and predictions
+# Parameter Variables
+# regs: the dataframe representing the csv obtained with regs.py
+# data: the dataframe representing the sales data with all causal variables
+# progress_counter: the counter variable, to accurately track the progress of each subprocess
+# concat_queue/smape_queue: the Queue assigned to each 'pipeline', needed to make each subprocess report results to main
 def loopPreds(regs, data, progress_counter, features, concat_queue, smape_queue):
     concats = []
     smape = []
+    # CONCAT is the name of our column for the sku-store combinations. If the name is different, replace all
+    # references to CONCAT with the name of your column
     for i in data.CONCAT.unique():
+        # We want only the data that matches the current sku-store combo
         dataSubset = data.loc[data['CONCAT'] == i]
         regsSubset = regs.loc[regs['CONCAT'] == i]
```
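Inside loopPreds each combo is pulled out by re-filtering the full frame. An equivalent pandas idiom, shown as a sketch on a toy frame (column values invented), is groupby, which makes one pass instead of one scan per combo:

```python
import pandas as pd

# Toy frame standing in for the sales data
data = pd.DataFrame({"CONCAT": ["A1", "A1", "B2"], "QTY": [3, 5, 2]})

# Equivalent to: for i in data.CONCAT.unique(): dataSubset = data.loc[data['CONCAT'] == i]
for i, dataSubset in data.groupby("CONCAT"):
    print(i, len(dataSubset))
```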
```diff
@@ -36,9 +56,14 @@ def loopPreds(regs, data, progress_counter, features, concat_queue, smape_queue)
         tempPredict = []
         tempReal = []
+        # If there are more columns that are not causal or they have different names, add/change them here
         Trim_X = dataSubset.drop(columns=["QTY", "DMDUNIT", "DMDGROUP", "LOC", "STARTDATE", "CONCAT"], axis=1)
+        # Obtain a further subset of dataSubset that only includes the features in the model.
+        # A small problem: this is 'hardcoded' and does not scale well. However, we did not find a better way;
+        # the issue was that the 'reference' to the object is needed, not simply the name. If a better way is found,
+        # please feel free to replace this (one candidate is sketched after this hunk)
         if len(features[0]) == 1:
             X = Trim_X[[features[0][0]]]
```
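A possible replacement for the hardcoded if/elif chain, offered as an untested suggestion: pandas indexing accepts a list of column labels directly, and since __main__ builds features with regs["features"].apply(eval), each entry should already be a plain list of column-name strings.

```python
# Sketch: select all model features at once, however many there are.
# Assumes features[0] is a list of column-name strings, e.g. ['PRICE', 'PROMO'].
X = Trim_X[features[0]]   # same result as Trim_X[[features[0][0], ..., features[0][n]]]
```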
```diff
@@ -52,20 +77,32 @@ def loopPreds(regs, data, progress_counter, features, concat_queue, smape_queue)
             X = Trim_X[[features[0][0], features[0][1], features[0][2], features[0][3]]]
+        # Data column to predict
         Y = dataSubset.QTY
+        # obtain the regression model using the best variables
         model = linear_reg(X, Y)
+        # Periodically update the subprocess progress
         progress_counter = progress_counter + 1
         if (progress_counter % 100 == 0):
             print("Progress:" + str(progress_counter) + "/" + str(len(data.CONCAT.unique())))
+        # Make predictions for each entry in the data (we assume the same sku-store combo never repeats a date)
         for r in dataSubset.STARTDATE.unique():
+            # obtain the row that corresponds to a date
             entry = dataSubset.loc[dataSubset["STARTDATE"] == r]
             Trim_X = entry.drop(columns=["QTY", "DMDUNIT", "DMDGROUP", "LOC", "STARTDATE", "CONCAT"], axis=1)
+            # Obtain a further subset of entry that only includes the features in the model.
+            # A small problem: this is 'hardcoded' and does not scale well. However, we did not find a better way;
+            # the issue was that the 'reference' to the object is needed, not simply the name. If a better way is found,
+            # please feel free to replace this
             if len(features[0]) == 1:
                 X = Trim_X[[features[0][0]]]
```
```diff
@@ -79,22 +116,28 @@ def loopPreds(regs, data, progress_counter, features, concat_queue, smape_queue)
                 X = Trim_X[[features[0][0], features[0][1], features[0][2], features[0][3]]]
+            # create the prediction and obtain the entry's real value
             predicted = predict(model, X)
             real = entry.QTY
             tempPredict.append(predicted)
             tempReal.append(real)
+        # Obtain the symmetric MAPE for each sku-store combo
         concat_smape = getSmape(np.array(tempReal), np.array(tempPredict))
         smape.append(concat_smape)
+    # when all is finished, return the concats and symmetric MAPE for all given combinations
     concat_queue.put(concats)
     smape_queue.put(smape)

 if __name__ == "__main__":
+    # loading the model created with regs.py
     regs = pd.read_csv("D:/DeepStorage/Regresiones.csv")
+    # Loading the data. If doing the 80/20 split, this should be the 20% not used in regs.py
     data = pd.read_csv("D:/DeepStorage/Fixed_Full_FilteredSalesData.csv")
     data = data.dropna()
```
```diff
@@ -104,13 +147,19 @@ if __name__ == "__main__":
     process_list = []
     progress_counter = 0
     features = regs["features"].apply(eval)
+    # number of subprocesses to create. The number of data entries should be divisible by this,
+    # and it should not exceed the number of logical processors on the machine running this
     processes = 8
     unique_concats = data.CONCAT.unique()
     chunk = math.floor(len(unique_concats) / processes)
+    # Create the queues the subprocesses will need to send info back
     concat_queue = Queue()
     smape_queue = Queue()
+    # Divide the data so that each subprocess receives only the entries whose concats fall in its chunk,
+    # then start the subprocess
     for i in range(0, processes):
         conditions = data['CONCAT'].isin(unique_concats[i * chunk:(i + 1) * chunk])
         data_subset = data.loc[conditions]
```
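To make the chunk arithmetic concrete (counts invented for illustration): floor division leaves a remainder that the regular slices never reach, which is what the irregular last chunk in the next hunk exists to catch.

```python
import math

n_concats = 1003                                    # hypothetical number of unique sku-store combos
processes = 8
chunk = math.floor(n_concats / processes)           # 125
# regular slices [0:125] ... [750:875] cover 7 * 125 = 875 combos;
# the last process must pick up the remaining 1003 - 875 = 128
print(chunk, n_concats - (processes - 1) * chunk)   # 125 128
```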
```diff
@@ -118,6 +167,8 @@ if __name__ == "__main__":
         p = Process(target=loopPreds, args=(regs, data_subset, progress_counter, features, concat_queue, smape_queue))
+        # If it is the last chunk, create an irregular chunk so that all concats are included
+        # even if the number of sku-store combos is not divisible by the number of processes
         else:
             # slice to the end of unique_concats so the remainder after floor division is included
             conditions = data['CONCAT'].isin(unique_concats[i * chunk:])
```
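The diff elides how __main__ starts the workers and collects their output. Here is a sketch of the usual pattern, under the assumption that each of the `processes` subprocesses puts exactly one list on each queue; the queues are drained before join, since joining a process that still holds a large unflushed queue payload can deadlock:

```python
# Assumed collection step (elided in this diff); names follow the surrounding code.
for p in process_list:
    p.start()
all_concats, all_smapes = [], []
for _ in range(processes):
    all_concats.extend(concat_queue.get())   # blocks until some subprocess reports
    all_smapes.extend(smape_queue.get())
for p in process_list:
    p.join()                                 # safe: queues are already drained
```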
```diff
@@ -140,6 +191,8 @@ if __name__ == "__main__":
     regInfo = pd.DataFrame(dictio)
     print(regInfo)
+    # Output the SMAPE for each sku-store combination to the given csv
     regInfo.to_csv("D:/DeepStorage/RealvsPredict.csv", index=False)
     pass
```
regs.py
```diff
@@ -4,9 +4,13 @@ from sklearn.metrics import mean_squared_error
 import numpy as np
 import itertools

+# Fit linear regression model and return RSS and R squared values
+# Parameter Variables
+# X: Data with only causal variable columns.
+# Y: Column to predict
 def fit_linear_reg(X, Y):
     # Fit linear regression model and return RSS and R squared values
     model_k = linear_model.LinearRegression(fit_intercept=True)
     model_k.fit(X, Y)
     RSS = mean_squared_error(Y, model_k.predict(X)) * len(Y)
```
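The RSS line leans on sklearn's mean_squared_error being RSS divided by the number of samples, so multiplying by len(Y) recovers the residual sum of squares. A self-contained check with toy numbers:

```python
import numpy as np
from sklearn.metrics import mean_squared_error

Y = np.array([1.0, 2.0, 4.0])        # toy targets
Y_hat = np.array([1.5, 2.0, 3.0])    # toy predictions

RSS = mean_squared_error(Y, Y_hat) * len(Y)        # 0.25 + 0.0 + 1.0 = 1.25
assert np.isclose(RSS, np.sum((Y - Y_hat) ** 2))
```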
```diff
@@ -14,38 +18,57 @@ def fit_linear_reg(X, Y):
     return RSS, R_squared

-def best_subset(data):
-    # Initialization variables
-    Y = data.QTY
-    X = data.drop(columns=["QTY", "DMDUNIT", "DMDGROUP", "LOC", "STARTDATE", "CONCAT"], axis=1)
-    k = 4
+# Applies forward stepwise regression and returns the best model and its R2 per combination
+# Parameter Variables
+# Y: Data column to predict
+# X: Causal variable data columns
+# k: Maximum number of variables to be used in a model
+# UseR2/UseRSS: which error(s) to use as the criterion for whether the model has improved
+def best_subset(data, X, Y, k, UseR2=False, UseRSS=True):
+    # List all the causal variables
     remaining_features = list(X.columns.values)
     features = []
     # Due to 1 indexing of the loop...
     # Initialize residuals lists
     RSS_list, R_squared_list = [np.inf], [np.inf]
     features_list = dict()
     for i in range(1, k + 1):
         best_RSS = np.inf
         best_R_squared = 0
         for combo in itertools.combinations(remaining_features, 1):
-            # Store temp result
-            RSS = fit_linear_reg(X[list(combo) + features], Y)
-            if RSS[0] < best_RSS:
-                best_RSS = RSS[0]
-                best_R_squared = RSS[1]
-                best_feature = combo[0]
+            # Fit the candidate model and keep both of its errors
+            RSS, rSquared = fit_linear_reg(X[list(combo) + features], Y)
+            # Lowest RSS is equivalent to best R2 for a fixed number of variables. Only done if UseRSS is True
+            if UseRSS and UseR2 == False:
+                # Update our best scores and features if the condition is met
+                if RSS < best_RSS:
+                    best_RSS = RSS
+                    best_R_squared = rSquared
+                    best_feature = combo[0]
+            # Use best R2 only. Only done if UseR2 is True
+            elif UseR2 and UseRSS == False:
+                if rSquared > best_R_squared:
+                    best_RSS = RSS
+                    best_R_squared = rSquared
+                    best_feature = combo[0]
+            # If both are True, attempt to use both. Not recommended
+            else:
+                if RSS < best_RSS or rSquared > best_R_squared:
+                    best_RSS = RSS
+                    best_R_squared = rSquared
+                    best_feature = combo[0]
         # Updating variables for the next loop: best_feature is added permanently
         features.append(best_feature)
         remaining_features.remove(best_feature)
         # Saving values for plotting
         # Grow the lists with the results obtained each iteration
         RSS_list.append(best_RSS)
         R_squared_list.append(best_R_squared)
         features_list[i] = features.copy()
```
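Since combinations(remaining_features, 1) just yields one-element tuples, each round fits one candidate model per remaining feature and permanently keeps the single best: forward stepwise selection (at most k * p fits) rather than an exhaustive best-subset search (2^p fits). A hypothetical call, with invented columns where one feature clearly drives Y; data is passed as None on the assumption that it is unused in the parts of the function this diff shows:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(50, 6)), columns=list("ABCDEF"))  # invented causal columns
Y = 2.0 * X["A"] + rng.normal(scale=0.1, size=50)                   # Y driven mostly by 'A'

best_features, best_r2 = best_subset(None, X, Y, k=4, UseR2=False, UseRSS=True)
print(best_features, best_r2)   # 'A' should be among the first features selected
```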
```diff
@@ -58,13 +81,26 @@ def best_subset(data):
             best_r = R_squared_list[i]
             best_r_index = i
+    # If not enough data was obtained to create a reliable prediction/R2, we take the prediction with the most
+    # variables, as those tend to be the most accurate
     if best_r_index == 0:
-        return features_list[4], R_squared_list[4]
+        return features_list[k], R_squared_list[k]
     else:
         return features_list[best_r_index], R_squared_list[best_r_index]

 if __name__ == "__main__":
+    # Steps:
+    # 1. Load the data in csv format
+    # 2. Check the name/amount of causal variables and modify Y, X, and k
+    # 3. Wait
+    # 4. Wait some more
+    # 5. All information will be output to the given path/csv
+    # 6. That is the csv to be loaded into the next step of the program, predict.py
+    # To create the 80/20 split, the data input here should be 80% of all data, and the data loaded
+    # into predict.py should be the remaining 20%
+    # Load the data
     data = pd.read_csv("D:/DeepStorage/Fixed_Full_FilteredSalesData.csv")
     data = data.dropna()
     print("data read")
```
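The comments prescribe an 80/20 split across the two scripts but the diff never shows how the split is produced. One way to generate the two input csvs, offered purely as a sketch (train_test_split and the output file names are assumptions, not shown in this repo):

```python
import pandas as pd
from sklearn.model_selection import train_test_split

full = pd.read_csv("D:/DeepStorage/Fixed_Full_FilteredSalesData.csv").dropna()
train, test = train_test_split(full, test_size=0.2, random_state=42)
train.to_csv("D:/DeepStorage/SalesData_80.csv", index=False)  # hypothetical input for regs.py
test.to_csv("D:/DeepStorage/SalesData_20.csv", index=False)   # hypothetical input for predict.py
```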
```diff
@@ -74,13 +110,24 @@ if __name__ == "__main__":
     r_squared = []
     progress_counter = 0
+    # CONCAT is the name of our column for the sku-store combinations. If the name is different, replace all
+    # references to CONCAT with the name of your column
     for i in data.CONCAT.unique():
         dataSubset = data.loc[data['CONCAT'] == i]
-        temp_features, temp_Rsquared = best_subset(dataSubset)
+        Y = dataSubset.QTY
+        X = dataSubset.drop(columns=["QTY", "DMDUNIT", "DMDGROUP", "LOC", "STARTDATE", "CONCAT"], axis=1)
+        k = 4
+        temp_features, temp_Rsquared = best_subset(dataSubset, X, Y, k, False, True)
         concats.append(i)
         features.append(temp_features)
         r_squared.append(temp_Rsquared)
+        # Counter that periodically reports the progress of the program
         progress_counter = progress_counter + 1
         if (progress_counter % 250 == 0):
             print("Progress:" + str(progress_counter) +
```
```diff
@@ -91,6 +138,7 @@ if __name__ == "__main__":
     regInfo = pd.DataFrame(dictio)
     print(regInfo)
+    # Write all data to a csv with the specified path
     regInfo.to_csv("D:/DeepStorage/Regresiones.csv", index=False)
     pass
```