# sales.py (forked from specialprocedures/matanga)
import pandas as pd
from datetime import datetime, timedelta
import numpy as np


def cum_diff(x):
    # Units sold for one listing: the sum of every decrease in the "ready"
    # count between consecutive scrapes.
    vec = x["ready"].diff()
    mask = x["ready"].shift(1) > x["ready"]
    return abs(vec.mul(mask).sum())


def cum_diff_pend(x):
    # As cum_diff, but for the "pending" (preorder) count.
    vec = x["pending"].diff()
    mask = x["pending"].shift(1) > x["pending"]
    return abs(vec.mul(mask).sum())


def caluclate_sales(df=None, timestamp=None, path=None):
    ALL_COLS = [
        "time_stamp",
        "uid",
        "date",
        "sell_code",
        "region",
        "desc_en",
        "description",
        "ready",
        "pending",
        "quantity",
        "group",
        "sub",
        "type",
        "usd",
    ]
    GROUP_COLS = [
        "sell_code",
        "region",
        "desc_en",
        "group",
        "sub",
        "type",
        "quantity",
    ]
    df_original = df.copy()

    # Drop unnecessary listings
    df = df[~df.group.isin(["Work/Reagent/Other", "Combo"])]

    # Trim the dates to size. I just want the one month.
    START_DATE = datetime(2020, 2, 4)
    END_DATE = datetime.now()
    df = df[(df.time_stamp >= START_DATE) & (df.time_stamp <= END_DATE)].sort_values(
        "time_stamp"
    )
""" One of the big challenges with sales estimation is getting sales per day.
This is problematic because when the estimation function is applied per day
any changes between days are lost.
For example, if the last scrape of the day was at 10:00pm with 6 ready
and the next scrape was at 9:00am the next day with 4 ready, the two sales
between scrapes would be ignored.
This section brings forward the last value of each day to the beginning of
the next to avoid this problem.
"""
    # Create a df with one placeholder row per (date, uid), timestamped at
    # the start of that day
    m = df[["date", "uid"]].drop_duplicates().rename(columns={"date": "time_stamp"})
    df = pd.concat([df, m], sort=False).sort_values(["uid", "time_stamp"])

    # Backfill the placeholders' empty dates from the day they open
    df["date"] = df.groupby("uid")["date"].bfill()

    # Bring forward values from the previous day
    df[[i for i in df if i != "uid"]] = df.groupby("uid").ffill()
    df = df.dropna(subset=GROUP_COLS)
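    # Worked example (illustrative values, not from the data): suppose a uid
    # is scraped twice on 2020-02-10 and once the next morning:
    #
    #   time_stamp         date        ready
    #   2020-02-10 09:00   2020-02-10  8
    #   2020-02-10 22:00   2020-02-10  6
    #   2020-02-11 09:00   2020-02-11  4
    #
    # The concat above adds a placeholder row at 2020-02-11 00:00; bfill gives
    # it date 2020-02-11 and ffill copies ready=6 into it, so the overnight
    # drop from 6 to 4 is attributed to 2020-02-11 instead of being lost.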
""" This function is the heart of the sales estimation process. It works by
creating two vectors:
a cumulative difference vector e.g:
f([6, 5, 7, 2, 1]) = [nan, 1, -2, 5, 1]
and a logical vector to spot upwards changes:
f([6, 5, 7, 2, 1]) = [False, True, False, True, True]
As False evaluates to 0, the two can be multiplied leaving only decreases
[nan, 1, -2, 5, 1] * [False, True, False, True, True] = [nan, 1, 0, 5, 1]
This vector can then be summed, giving the total sales:
sum([nan, 1, 0, 5, 1]) = 7
"""
    # We first group by date, uid and GROUP_COLS (which may be duplication,
    # but belt and braces) and then apply the cumulative difference function.
    # This splits the df into many smaller dfs, one per listing per day, to
    # each of which the function is applied.
    # Afterwards, we need to merge 'usd' (price) back in, because price is
    # volatile and thus can't be used for grouping. I take the _minimum_ for a
    # day because, for most listings on most days, all prices will be the same.
    # Where there is a difference, I use the minimum to show that the price
    # changed on that day.
    print("Applying sales calculation on ready packages")
    sold = (
        df.groupby(["date", "uid"] + GROUP_COLS, observed=True)
        .apply(cum_diff)
        .reset_index()
        .merge(df.groupby(["date", "uid"])["usd"].min().reset_index())
        .rename(columns={0: "sold"})
    )
    sold["quantity_sold"] = sold["sold"] * sold["quantity"]
    sold["usd_sold"] = sold["sold"] * sold["usd"]
    sold["unit_price"] = sold["usd"] / sold["quantity"]
    print("Applying sales calculation on preorders")
    sold_pending = (
        df.groupby(["date", "uid"] + GROUP_COLS, observed=True)
        .apply(cum_diff_pend)
        .reset_index()
        .merge(df.groupby(["date", "uid"])["usd"].min().reset_index())
        .rename(columns={0: "sold_pending"})
    )
    sold_pending["quantity_sold_pending"] = (
        sold_pending["sold_pending"] * sold_pending["quantity"]
    )
    sold_pending["usd_sold_pending"] = (
        sold_pending["sold_pending"] * sold_pending["usd"]
    )
    print("Merging and applying scrape count")
    out = sold.merge(sold_pending, on=["date"] + GROUP_COLS, how="outer")

    # Add in scrape_count to adjust for over/under-scraping
    scrape_count = (
        df_original.groupby("date")["source"]
        .nunique()
        .reset_index()
        .rename(columns={"source": "scrape_count"})
    )
    out = out.merge(scrape_count, on="date", how="left")

    # Drop the duplicated '_y' columns from the merge and strip the '_x'
    # suffixes, then out to Excel, as Excel has lousy utf-8 support for csvs,
    # and I do my checking in Excel.
    out = out.drop([i for i in out if "_y" in i], axis=1).rename(
        columns={i: i.split("_")[0] for i in out if "_x" in i}
    )
    # out.to_excel(f"DATA/OUTPUT/SALES/EXCEL/sales_{timestamp}.xlsx", index=False)
    # out.to_csv(f"DATA/OUTPUT/SALES/CSV/sales_{timestamp}.csv", index=False)
    # out.to_feather(f"DATA/OUTPUT/SALES/FEATHER/sales_{timestamp}.feather")
    return out
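# Minimal usage sketch (not part of the original script). The toy DataFrame
# below and all of its values are assumptions, built only from the columns the
# function actually touches (time_stamp, date, uid, ready, pending, usd, the
# GROUP_COLS, and a "source" column identifying each scrape); the real pipeline
# presumably loads scraped listings from elsewhere.
if __name__ == "__main__":
    example = pd.DataFrame(
        {
            "time_stamp": pd.to_datetime(
                ["2020-02-10 09:00", "2020-02-10 22:00", "2020-02-11 09:00"]
            ),
            "date": pd.to_datetime(["2020-02-10", "2020-02-10", "2020-02-11"]),
            "uid": [1, 1, 1],
            "sell_code": ["A1", "A1", "A1"],
            "region": ["north", "north", "north"],
            "desc_en": ["item", "item", "item"],
            "group": ["g", "g", "g"],
            "sub": ["s", "s", "s"],
            "type": ["t", "t", "t"],
            "quantity": [1.0, 1.0, 1.0],
            "ready": [6, 5, 4],
            "pending": [0, 0, 0],
            "usd": [10.0, 10.0, 10.0],
            "source": ["scrape_1", "scrape_2", "scrape_3"],
        }
    )
    # Expect one sale on 2020-02-10 (6 -> 5) and one on 2020-02-11 (5 -> 4,
    # counted thanks to the carry-forward step).
    sales = caluclate_sales(df=example)
    print(sales[["date", "sold", "usd_sold", "scrape_count"]])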