jeudi 17 octobre 2019

xgboost: change random_state no effect even using subsample

xgboost seems to produce the same results even when the random_state variable is changed. This is a similar question to XGBRegressor: change random_state no effect, but I am actually using subsample and I am still not getting different results. Below is sample code. If I run the entire thing I get a training and validation MSE. If I then change the 'random_state' to a different number and re-run the code from "xgb_dict={" to the bottom (importantly not regenerating the data but only re-running the model) I get the exact same training and validation mean squared errors. I am using xgboost version 0.90. Any idea why it behaves this way?

import xgboost as xgb
import numpy as np
import pandas as pd
from scipy.stats import beta
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def create_y(features):
    """Create fake data"""
    y = 3 * features['x1'] + features['x2'] + 1e3 * features['x3']
    high_x2_features = features.loc[features['x2'] > 1.4, :]
    y[features['x2'] > 1.4] = 3 * high_x2_features['x1'] \
            + 2 * high_x2_features['x2'] + 1e3 * high_x2_features['x3']
    y += 0.2 * (np.random.rand(features.shape[0]) - 0.4)
    return np.maximum(0, y)


def get_data(n_samples):
    """Fetch the data as a Pandas dataframe.

    The dataframe is structured such that every column is a feature (the last
    column is the target variable, i.e., the y) and every row is associated
    with a certain date/observation time.
    """

    features_df = pd.DataFrame({
        'x1':
        beta.rvs(2, 5, size=n_samples),
        'x2': (19 + 11.2 * (np.random.rand(n_samples)**
                                    (3 / 2) - 0.4)) / np.sqrt(252),
        'x3':
        np.maximum(1e-4, beta.rvs(1, 2000, size=n_samples)),
    })
    features_df['x3'] += (features_df['x1'] / 1000 +
                                      features_df['x2'] / 20000)
    data_df = features_df
    data_df['y'] = create_y(features_df)
    feature_names = list(set(data_df.columns) - {'y'})
    return data_df[feature_names + ['y']]


def split_features_target(data):
    """Split the given dataframe into the features and the target variable."""
    feature_names = [col for col in data.columns if col != 'y']
    features = data[feature_names]
    target = data['y']
    return features, target


def get_datasets(n_samples):
    """Get the data and split it into the training, validation, and test sets.

    The total numbers of samples is given by n_samples.
    """
    all_data = get_data(n_samples)
    train, test = train_test_split(all_data, test_size=0.2, shuffle=False)
    train, valid = train_test_split(train, test_size=0.2, shuffle=False)
    return train, valid, test


#get the training sets
train_data, valid_data, test_data = get_datasets(75_000)

train_features, train_target = split_features_target(train_data)
valid_features, valid_target = split_features_target(valid_data)

dtrain = xgb.DMatrix(train_features, label=train_target)

xgb_dict = {
        #'booster': 'dart',
        'booster': 'gbtree',
        #'booster': 'gblinear',
        'max_depth': 2,
        'random_state': 100,
        'learning_rate': 0.10,
        'objective': 'reg:squarederror',
        'verbosity': 1,
        'sample_type': 'uniform',
        'subsample': 0.6,
        'normalize_type': 'tree',
        'rate_drop': 0.0,
        'skip_drop': 0.0,
        'min_child_weight': 1,

    }
#starting hyperparameter of training rounds
training_rounds = 200

#train the model
bst = xgb.train(
    xgb_dict,
    dtrain,
    training_rounds,
    evals=[(xgb.DMatrix(valid_features, label=valid_target), 'RMSE')],
    early_stopping_rounds=10) #use the validation to tune the model and find early_stopping



#get training predictions and MSE
xgb_train_pred = bst.predict(xgb.DMatrix(train_features), ntree_limit=training_rounds)
xgb_train_mse = mean_squared_error(xgb_train_pred, train_target)

#get valid predictions and MSE
xgb_valid_pred = bst.predict(xgb.DMatrix(valid_features), ntree_limit=training_rounds)
xgb_valid_mse = mean_squared_error(xgb_valid_pred, valid_target)



Aucun commentaire:

Enregistrer un commentaire