-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathsimulation.py
More file actions
102 lines (83 loc) · 3.09 KB
/
simulation.py
File metadata and controls
102 lines (83 loc) · 3.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
=================================================================
Simulation Study Comparing Random Boosting with Gradient Boosting
=================================================================
Author: Tobias Krabel
Compares Friedman (2001)'s standard Gradient Boosting framework with
Random Tree Depth Injection.
"""
from random_boost.random_boost import RandomBoostingRegressor, RandomBoostingClassifier
from random_boost.utils import gen_friedman_data
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import time
import datetime
# Homegrown
def rmse(y_true, y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred``."""
    err = np.asarray(y_true) - np.asarray(y_pred)
    return np.sqrt(np.mean(np.square(err)))
def mae(y_true, y_pred):
    """Mean absolute error between ``y_true`` and ``y_pred``."""
    diff = np.asarray(y_true) - np.asarray(y_pred)
    return np.abs(diff).mean()
# --- Simulation configuration -------------------------------------------
N_SAMPLES = 20000        # observations per simulated data set
N_INPUTS = 20            # number of input features
N_COMPONENTS = 20        # number of Friedman components
SIGNAL_TO_NOISE = 1.0    # signal-to-noise ratio of the target
N_ROUNDS = 100           # number of simulation repetitions

# Hyper-parameters shared by both boosting models
PARAMS = dict(learning_rate=0.1, max_depth=5, n_estimators=100)

# Fix NumPy's global RNG so repeated runs are reproducible
np.random.seed(0)

# Result container: one row per (round, model) combination
df_result = pd.DataFrame(columns=['run', 'model', 'rmse', 'mae', 'time_sec'])
# Run the simulation: each round draws fresh Friedman data, fits both
# models on an identical train split, and records test error and fit time.
# Per-round frames are collected in a list and concatenated once at the
# end — repeated pd.concat inside the loop copies the whole frame each
# iteration (quadratic growth).
results = []
for i in range(N_ROUNDS):
    print(f'Round #{i+1} of {N_ROUNDS}')

    # Data
    print('... generate Friedman data')
    X, y = gen_friedman_data(n_samples=N_SAMPLES,
                             n_inputs=N_INPUTS,
                             n_components=N_COMPONENTS,
                             stn=SIGNAL_TO_NOISE)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    # Fit Random Boost
    print('... fit Random Boost')
    start_time = time.time()
    rb = RandomBoostingRegressor(n_estimators=PARAMS['n_estimators'],
                                 learning_rate=PARAMS['learning_rate'],
                                 max_depth=PARAMS['max_depth'])
    rb = rb.fit(X_train, y_train)
    time_rb = time.time() - start_time
    print(f'... took {time_rb} seconds')

    # Fit MART (sklearn's GradientBoostingRegressor)
    print('... fit MART')
    start_time = time.time()
    gb = GradientBoostingRegressor(n_estimators=PARAMS['n_estimators'],
                                   learning_rate=PARAMS['learning_rate'],
                                   max_depth=PARAMS['max_depth'])
    gb = gb.fit(X_train, y_train)
    time_gb = time.time() - start_time
    print(f'... took {time_gb} seconds\n')

    # Predict once per model and reuse for both metrics — predict() is the
    # expensive call, the original ran it twice per model.
    # NOTE(review): the 'xgb' label is kept for backward compatibility with
    # existing result files even though the baseline is sklearn's MART, not
    # XGBoost — confirm before renaming.
    preds = [rb.predict(X_test), gb.predict(X_test)]
    results.append(pd.DataFrame(
        data={
            'run': [i + 1] * 2,
            'model': ['rb', 'xgb'],
            'rmse': [rmse(y_test, p) for p in preds],
            'mae': [mae(y_test, p) for p in preds],
            'time_sec': [time_rb, time_gb]
        }))

# Fold all rounds into the (initially empty) result frame in a single concat.
df_result = pd.concat([df_result, *results], axis=0, ignore_index=True)

# Save to a timestamped CSV so repeated runs never overwrite each other
now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
df_result.to_csv(f'data/simulation_results/rb-vs-xgb-lr{PARAMS["learning_rate"]}-d{PARAMS["max_depth"]}-nest{PARAMS["n_estimators"]}-nocv-{now}.csv',
                 index=False)