-
Notifications
You must be signed in to change notification settings - Fork 0
/
summary.py
101 lines (69 loc) · 3.05 KB
/
summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# -*- coding: utf-8 -*-
"""summary.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1phEEVS99vSoQu75DwUTRQAVSza0hI8Rm
# Summary
"""
import pandas as pd
import numpy as np
import math
# NOTE: the notebook installed its dependency with the IPython/Colab shell
# escape `!pip install sklearn-ts==0.0.5`. That syntax is a SyntaxError in a
# plain .py module, so the dependency has to be installed from a shell before
# running this script:
#     pip install sklearn-ts==0.0.5
from sklearn_ts.splitter import custom_split
# Load the full OWID COVID-19 table straight from GitHub (network access
# is required every time the script runs).
COVID_DATA_URL = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"
covid = pd.read_csv(COVID_DATA_URL)

# Name of the column that is modelled throughout the script.
target = 'new_cases'
# Base lag offset and rolling-window length used by the feature code.
h = 14

# Keep only the rows whose location is 'World', and only the target and
# date columns; copy so later column assignments do not touch `covid`.
is_world = covid['location'] == 'World'
dataset = covid.loc[is_world, [target, 'date']].copy()
from matplotlib import pyplot as plt
# Draw the raw target series on a single 8x4 axes and save it as ts.png.
fig, axes = plt.subplots(figsize=(8, 4))
target_frame = dataset[[target]]
target_frame.plot(ax=axes)
fig.savefig('ts.png')
# --- Feature engineering ---------------------------------------------------
# `features` is the column list handed to the model; the other lists decide
# which columns are kept in `dataset` and in what order.
lag_rolling_col = f'{h}_lag_rolling'
features = ['year', 'month', f'{h}_lag', lag_rolling_col, 'dayofweek', 'intercept', 'trend', 'log']
categorical_features = ['year', 'month', 'dayofweek']
numerical_features = ['intercept', 'trend', 'log', lag_rolling_col]

# Calendar features derived from the parsed date column.
dataset['date'] = pd.to_datetime(dataset['date'])
date_parts = dataset['date'].dt
dataset['month'] = date_parts.month
dataset['year'] = date_parts.year
dataset['dayofweek'] = date_parts.dayofweek

# Lagged copies of the target for offsets h, h+1, ..., h+13 rows.
lag_offsets = range(h, h + 14)
lag_features = [f'{offset}_lag' for offset in lag_offsets]
for offset, lag_column in zip(lag_offsets, lag_features):
    dataset[lag_column] = dataset[target].shift(offset)

# Rolling mean of the target over h rows, shifted by h rows.
dataset[lag_rolling_col] = dataset[target].rolling(window=h).mean().shift(h)

# Deterministic regressors: constant term, row counter and log(row counter + 1).
dataset['intercept'] = 1
dataset['trend'] = range(len(dataset))
dataset['log'] = dataset['trend'].map(lambda step: math.log(step + 1))

# Keep only the modelling columns, then drop every row with a missing value
# (the shifts and the rolling window leave NaNs at the start of the frame).
kept_columns = ['date', target, *numerical_features, *categorical_features, *lag_features]
dataset = dataset[kept_columns].dropna()
# Build the cross-validation folds. `dates[k][0]` is used below as a row
# selector for `dataset.loc` -- presumably the training rows of fold k;
# TODO confirm against the sklearn_ts documentation.
splits, dates = custom_split(dataset, h=30, n_splits=6, gap=30)

# Mark the rows selected by the first fold as 'train', all others as 'test'.
dataset['flag'] = 'test'
first_fold_rows = dates[0][0]
dataset.loc[first_fold_rows, 'flag'] = 'train'

# One line per flag value, summed per date, saved as split.png.
fig, axes = plt.subplots(figsize=(8, 4))
split_table = pd.pivot_table(
    dataset.reset_index(drop=True),
    values=target,
    index='date',
    columns='flag',
    aggfunc='sum',
)
split_table[['train', 'test']].plot.line(ax=axes)
fig.savefig('split.png')
# Re-label the rows: clear the flag, then stamp the rows selected by folds
# 1-4 with their fold number (a later fold overwrites an earlier one on any
# row they share).
dataset['flag'] = None
fold_ids = range(1, 5)
for fold in fold_ids:
    dataset.loc[dates[fold][0], 'flag'] = fold

# One line per fold number, summed per date, saved as cv.png.
fig, axes = plt.subplots(figsize=(8, 4))
cv_table = pd.pivot_table(
    dataset.reset_index(drop=True),
    values=target,
    index='date',
    columns='flag',
    aggfunc='sum',
)
cv_table[fold_ids].plot.line(ax=axes)
fig.savefig('cv.png')

# Notebook leftover: as a script this only evaluates the expression (and
# would raise if `dates` had fewer than six entries); nothing is displayed.
dates[5][0]
from sklearn_ts.validator import check_model
#dataset = load_covid()['dataset']
from sklearn.linear_model import LinearRegression
# Plain least squares without a fitted intercept -- presumably because the
# feature list already contains an explicit constant 'intercept' column.
regressor = LinearRegression(fit_intercept=False)
# Parameter grid with a single candidate, matching the regressor above.
params = {'fit_intercept': [False]}

# NOTE(review): validation uses h=30 / n_splits=5 here, while the lag
# features were built with h = 14 and the plotted split used n_splits=6 --
# confirm this mismatch is intended.
results = check_model(
    regressor,
    params,
    dataset,
    target='new_cases',
    features=features,
    categorical_features=categorical_features,
    user_transformers=[],
    h=30,
    n_splits=5,
    gap=30,
    plotting=True,
)

from sklearn_ts.features.explainer import plot_features

# Visualise the features of the model returned by check_model.
fitted_model = results['model']
model_features = results['features']
plot_features(fitted_model, model_features)