[This article was first published on T. Moudiki's Webpage - R, and kindly contributed to R-bloggers]. (You can report an issue about the content on this page here.)
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.
This week’s post is about mlsauce (again), and LSBoost in particular. No new working paper (still working on it), but:
- An updated R version, working at least on Linux and macOS (Windows users: if it does not work on your machine, give the Windows Subsystem for Linux, WSL, a try)
- A new updated documentation page
- My first StackOverflow question ever (still unanswered)
The examples below probably include some kind of leakage (great if you can spot it), so take them as an illustration only.
0 – import packages
Installing mlsauce from GitHub remains the preferred way to get it.
#!pip install numpy matplotlib scikit-learn !pip install git+https://github.com/Techtonique/mlsauce.git --verbose # Importing necessary libraries import mlsauce as ms import numpy as np import matplotlib.pyplot as plt from sklearn.datasets import load_breast_cancer from sklearn.preprocessing import StandardScaler from sklearn.decomposition import KernelPCA # Non-linear dimensionality reduction through the use of kernels from sklearn.model_selection import cross_val_score, train_test_split from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score
1 – Data preprocessing
# Fetch the breast cancer dataset and separate the features from the labels.
# `data` is kept around because its feature names are looked up later on.
data = load_breast_cancer()
X, y = data.data, data.target

# Show the dimensions of the feature matrix, then of the label vector.
print(X.shape, y.shape, sep="\n")

# Output:
# (569, 30)
# (569,)
1 – 1 Kernel PCA features
# Standardize the features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Kernel PCA: keep only 2 'good' features, which are easier to visualize.
kpca = KernelPCA(n_components=2)
X_kpca = kpca.fit_transform(X_scaled)

# NOTE: both the scaler and the Kernel PCA are fitted on the full dataset,
# i.e. before the train/test split performed just below.

# Hold out 20% of the observations as a test set.
X_train_kpca, X_test_kpca, y_train, y_test = train_test_split(
    X_kpca,
    y,
    test_size=0.2,
    random_state=32,
)

# Scatter plot of the test observations in the plane spanned by the two
# kernel principal components, colored by class label.
plt.figure(figsize=(8, 6))
# Transposing yields one row per component, unpacked as the x and y coordinates.
plt.scatter(*X_test_kpca.T, c=y_test, cmap='viridis')
plt.title('Kernel PCA of Breast Cancer Dataset')
plt.xlabel('Kernel Principal Component 1')
plt.ylabel('Kernel Principal Component 2')
plt.colorbar(label='Malignant (0) / Benign (1)')
plt.show()
1 – 2 ‘Important’ features
# Fit a 100-tree Random Forest on the raw features; `fit` returns the
# estimator itself, so construction and training can be chained.
# NOTE: the forest is trained on the full dataset, before any split.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)

# Importance of each feature according to the fitted forest.
importances = rf_classifier.feature_importances_
print(importances)

# Feature indices ordered from most to least important.
indices = np.flip(np.argsort(importances))
print(indices)

# Keep the two most important features only.
top_two_indices = indices[:2]
print(data.feature_names[top_two_indices])
X_rf = X[:, top_two_indices]

# Hold out 20% of the observations as a test set.
X_train_rf, X_test_rf, y_train, y_test = train_test_split(
    X_rf,
    y,
    test_size=0.2,
    random_state=32,
)

# Scatter plot of the test observations in the plane of the two selected
# features, colored by class label.
plt.figure(figsize=(8, 6))
# Transposing yields one row per feature, unpacked as the x and y coordinates.
plt.scatter(*X_test_rf.T, c=y_test, cmap='viridis')
plt.title('Response for Breast Cancer Dataset')
plt.xlabel("Most 'important' feature 1")
plt.ylabel("Most 'important' feature 2")
plt.colorbar(label='Malignant (0) / Benign (1)')
plt.show()

# Output:
# [0.03484323 0.01522515 0.06799034 0.06046164 0.00795845 0.01159704
#  0.06691736 0.10704566 0.00342279 0.00261508 0.0142637 0.00374427
#  0.01008506 0.02955283 0.00472157 0.00561183 0.00581969 0.00375975
#  0.00354597 0.00594233 0.08284828 0.01748526 0.0808497 0.13935694
#  0.01223202 0.01986386 0.03733871 0.13222509 0.00817908 0.00449731]
# [23 27 7 20 22 2 6 3 26 0 13 25 21 1 10 24 5 12 28 4 19 16 15 14 29 17 11 18 8 9]
# ['worst area' 'worst concave points']
2 – Adjust LSBoostClassifier
# GPopt provides the optimizer used for hyperparameter tuning. Install it
# only when it is missing: the original `!pip install GPopt` shell escape ran
# on every execution and is not valid Python outside IPython/Jupyter.
try:
    import GPopt as gp
except ImportError:
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "GPopt"])
    import GPopt as gp

import mlsauce as ms
from sklearn.model_selection import cross_val_score

# One entry per feature set, in the order the optimizations are run below.
# Each entry is the dict returned by `optimize_lsboost`.
opt_objects_lsboost = []


def lsboost_cv(X_train, y_train,
               n_estimators=100,
               learning_rate=0.1,
               n_hidden_features=5,
               reg_lambda=0.1,
               dropout=0,
               tolerance=1e-4,
               n_clusters=2,
               seed=123,
               solver="ridge"):
    """Return the negated cross-validated macro-F1 of an LSBoostClassifier.

    An `ms.LSBoostClassifier` is built from the given hyperparameters and
    scored with 5-fold cross-validation (`scoring='f1_macro'`) on
    `(X_train, y_train)`.

    `n_estimators`, `n_hidden_features` and `n_clusters` are truncated with
    `int()`, so real-valued candidates coming from a continuous optimizer
    are accepted.

    Returns:
        Minus the mean of the fold scores. The sign is flipped presumably
        because the optimizer minimizes its objective -- confirm against
        the GPopt documentation.
    """
    estimator = ms.LSBoostClassifier(n_estimators=int(n_estimators),
                                     learning_rate=learning_rate,
                                     n_hidden_features=int(n_hidden_features),
                                     reg_lambda=reg_lambda,
                                     dropout=dropout,
                                     tolerance=tolerance,
                                     n_clusters=int(n_clusters),
                                     seed=seed,
                                     solver=solver,
                                     verbose=0)
    fold_scores = cross_val_score(estimator, X_train, y_train,
                                  scoring='f1_macro', cv=5)
    return -fold_scores.mean()


def optimize_lsboost(X_train, y_train, solver="ridge", n_init=10, n_iter=190):
    """Tune LSBoostClassifier hyperparameters on `(X_train, y_train)`.

    Args:
        X_train, y_train: training data handed to `lsboost_cv`.
        solver: forwarded to `lsboost_cv` (and from there to the classifier).
        n_init: `n_init` argument of `gp.GPOpt` (default as in the original).
        n_iter: `n_iter` argument of `gp.GPOpt` (default as in the original).

    Returns:
        A dict with two keys:
        'parameters': whatever `gp_opt.optimize(...)` returns,
        'opt_object': the `gp.GPOpt` instance itself.
    """
    # Order of the search dimensions; bounds and objective below follow it.
    params_names = ["n_estimators", "learning_rate", "n_hidden_features",
                    "reg_lambda", "dropout", "tolerance", "n_clusters"]

    # objective function for hyperparams tuning
    def crossval_objective(x):
        # Integer-valued hyperparameters are truncated inside `lsboost_cv`,
        # so the candidate vector is forwarded as is.
        return lsboost_cv(X_train=X_train, y_train=y_train,
                          solver=solver, **dict(zip(params_names, x)))

    gp_opt = gp.GPOpt(objective_func=crossval_objective,
                      lower_bound=np.array([10, 0.001, 5, 1e-2, 0, 0, 0]),
                      upper_bound=np.array([250, 0.4, 250, 1e4, 0.7, 1e-1, 4]),
                      params_names=params_names,
                      n_init=n_init, n_iter=n_iter, seed=123)
    return {'parameters': gp_opt.optimize(verbose=2, abs_tol=1e-2),
            'opt_object': gp_opt}


# Tune once on the Kernel PCA features, once on the Random Forest-selected ones.
opt_objects_lsboost.append(optimize_lsboost(X_train_kpca, y_train, solver="ridge"))
opt_objects_lsboost.append(optimize_lsboost(X_train_rf, y_train, solver="ridge"))
3 – Graphs
# Show the best hyperparameters found for each feature set, as returned by
# the optimizer (still real-valued at this point).
display(opt_objects_lsboost[0]['parameters'].best_params)
display(opt_objects_lsboost[1]['parameters'].best_params)

# These three hyperparameters are cast with int() before reaching
# LSBoostClassifier during tuning; apply the same truncation in place so the
# classifiers built below receive integers.
for opt_result in opt_objects_lsboost:
    best_params = opt_result['parameters'].best_params
    for param_name in ("n_estimators", "n_hidden_features", "n_clusters"):
        best_params[param_name] = int(best_params[param_name])

# Output:
# {'n_estimators': 221.10595703125, 'learning_rate': 0.12772097778320313, 'n_hidden_features': 45.053253173828125, 'reg_lambda': 2496.6505697631837, 'dropout': 0.2851226806640625, 'tolerance': 0.0047698974609375, 'n_clusters': 3.1986083984375}
# {'n_estimators': 193.544921875, 'learning_rate': 0.3466668701171875, 'n_hidden_features': 208.9971923828125, 'reg_lambda': 1866.4632116699217, 'dropout': 0.37947998046875, 'tolerance': 0.01290283203125, 'n_clusters': 3.04443359375}

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingClassifier

# Two baselines with default settings, plus one LSBoost model per tuned
# hyperparameter set (index 0: Kernel PCA features, index 1: RF features).
classifiers = [RandomForestClassifier(),
               GradientBoostingClassifier(),
               ms.LSBoostClassifier(**opt_objects_lsboost[0]['parameters'].best_params),
               ms.LSBoostClassifier(**opt_objects_lsboost[1]['parameters'].best_params)]
names = ["rf", "gb", "lsboost_pca", "lsboost_rf"]

figure = plt.figure(figsize=(27, 9))
i = 1  # 1-based index of the next subplot, filled row by row
datasets = [(X_kpca, y), (X_rf, y)]

# Grid layout: one row per dataset; first column shows the raw data, then
# one column per classifier.
n_rows, n_cols = len(datasets), len(classifiers) + 1

# Colormaps do not depend on the dataset, so build them once.
cm = plt.cm.RdBu
cm_bright = ListedColormap(["#FF0000", "#0000FF"])

# iterate over datasets
# NOTE: loop-local names (X_ds, y_ds, ...) are used on purpose. Rebinding the
# notebook-level X, y, y_train and y_test here would leave them pointing at
# the last dataset / this 60-40 split, and re-running earlier cells would
# then silently use mismatched data.
# NOTE: this split (test_size=0.4, random_state=42) differs from the one the
# hyperparameters were tuned on (test_size=0.2, random_state=32).
for ds_cnt, (X_ds, y_ds) in enumerate(datasets):
    # split into training and test part
    X_ds_train, X_ds_test, y_ds_train, y_ds_test = train_test_split(
        X_ds, y_ds, test_size=0.4, random_state=42
    )

    # Common axis limits for every panel of this row (0.5 margin).
    x_min, x_max = X_ds[:, 0].min() - 0.5, X_ds[:, 0].max() + 0.5
    y_min, y_max = X_ds[:, 1].min() - 0.5, X_ds[:, 1].max() + 0.5

    # just plot the dataset first
    ax = plt.subplot(n_rows, n_cols, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_ds_train[:, 0], X_ds_train[:, 1], c=y_ds_train,
               cmap=cm_bright, edgecolors="k")
    # Plot the testing points (semi-transparent)
    ax.scatter(X_ds_test[:, 0], X_ds_test[:, 1], c=y_ds_test,
               cmap=cm_bright, alpha=0.6, edgecolors="k")
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(n_rows, n_cols, i)

        # Every classifier is preceded by its own standardization step.
        model = make_pipeline(StandardScaler(), clf)
        model.fit(X_ds_train, y_ds_train)

        try:
            score = model.score(X_ds_test, y_ds_test)
        except Exception:
            # no scoring method available yet for prediction sets:
            # fall back to accuracy computed from the predicted probabilities.
            # `Exception` (not a bare except) so Ctrl-C still interrupts.
            predicted = model.predict_proba(X_ds_test).argmax(axis=1)
            score = np.mean(predicted == y_ds_test)

        DecisionBoundaryDisplay.from_estimator(
            model, X_ds, cmap=cm, alpha=0.8, ax=ax, eps=0.5
        )

        # Plot the training points
        ax.scatter(X_ds_train[:, 0], X_ds_train[:, 1], c=y_ds_train,
                   cmap=cm_bright, edgecolors="k")
        # Plot the testing points (semi-transparent)
        ax.scatter(X_ds_test[:, 0], X_ds_test[:, 1], c=y_ds_test,
                   cmap=cm_bright, edgecolors="k", alpha=0.6)

        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        # Test score in the lower-right corner, without the leading zero.
        ax.text(
            x_max - 0.3,
            y_min + 0.3,
            ("%.2f" % score).lstrip("0"),
            size=15,
            horizontalalignment="right",
        )
        i += 1

plt.tight_layout()
plt.show()

# Output (progress bars printed while fitting):
#  43%|████▎ | 94/221 [00:00<00:00, 178.28it/s]
#  26%|██▋ | 51/193 [00:02<00:07, 18.66it/s]
#  54%|█████▍ | 51/94 [00:00<00:00, 449.07it/s]
# 100%|██████████| 51/51 [00:00<00:00, 61.11it/s]
To leave a comment for the author, please follow the link and comment on their blog: T. Moudiki's Webpage - R.
R-bloggers.com offers daily e-mail updates about R news and tutorials about learning R and many other topics. Click here if you're looking to post or find an R/data-science job.
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.