import numpy as np
import matplotlib.pyplot as plt
%precision 3
ipython_plain = get_ipython().display_formatter.formatters['text/plain']
ipython_plain.for_type(np.float64, ipython_plain.lookup_by_type(float));
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
m23 = np.array([[1, 2, 3], [4, 5, 6]])
from sklearn.datasets import load_boston, load_iris, load_digits
iris, boston, digits = load_iris(), load_boston(), load_digits()  # note: load_boston was removed in scikit-learn 1.2
print(boston.keys())
print(iris.keys())
print(digits.keys())
dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename', 'data_module'])
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])
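# Added sketch (not in the original run): each loader returns a Bunch whose
# .data / .target attributes are NumPy arrays; digits additionally ships the
# raw 8x8 images listed in its keys above.
print(iris.data.shape, iris.target.shape)
print(digits.images.shape)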
from sklearn.datasets import make_blobs
Xb, yb = make_blobs(n_samples = 100,
                    n_features = 2,
                    centers = 3,
                    cluster_std = 1,
                    random_state = 1)
plt.scatter(Xb[:,0], Xb[:,1], c=yb);
from sklearn.datasets import make_regression
Xr, yr = make_regression(n_samples = 200,
                         n_features = 2,
                         n_informative = 2,
                         n_targets = 1,
                         noise = 10,
                         random_state = 1)
plt.figure(figsize=(6, 6))
ax = plt.axes(projection='3d')
ax.scatter3D(Xr[:,0], Xr[:,1], yr);
from sklearn.datasets import make_classification
Xc, yc = make_classification(n_samples = 200,
                             n_features = 2,
                             n_informative = 2,
                             n_redundant = 0,
                             n_classes = 2,
                             weights = [.25, .75],
                             random_state = 3)
plt.scatter(Xc[:,0], Xc[:,1], c=yc);
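# Added sketch (not in the original run): the weights argument above fixes the
# class balance, so the label counts should come out near 25% / 75% of 200.
np.bincount(yc)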
from sklearn.preprocessing import MinMaxScaler
X_new = MinMaxScaler().fit_transform(Xc)
X_new.min(), X_new.max()
(0.000, 1.000)
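# Added sketch: MinMaxScaler is just (x - min) / (max - min) per column, so the
# manual version below should reproduce the transform (expected: True).
X_manual = (Xc - Xc.min(axis=0)) / (Xc.max(axis=0) - Xc.min(axis=0))
np.allclose(X_manual, MinMaxScaler().fit_transform(Xc))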
from sklearn.preprocessing import StandardScaler
X_new = StandardScaler().fit_transform(Xc)
X_new.mean(), X_new.std(), X_new.min(), X_new.max()
(0.000, 1.000, -2.625, 2.358)
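# Added sketch: StandardScaler computes the per-column z-score (x - mean) / std,
# so the manual version below should reproduce the transform (expected: True).
X_manual = (Xc - Xc.mean(axis=0)) / Xc.std(axis=0)
np.allclose(X_manual, StandardScaler().fit_transform(Xc))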
from sklearn.preprocessing import RobustScaler
X_outliers = np.random.normal(0, 0.5, (10, 1))
X_outliers[0, 0] = 3
X_new = RobustScaler().fit_transform(X_outliers)
print(X_outliers.flatten())
print(X_new.flatten())
# RobustScaler centers on the median and scales by the interquartile range (IQR)
[ 3.    -0.682 -0.191  0.425  0.178  0.665  1.334  0.059  1.039  0.295]
[ 3.083 -1.217 -0.644  0.075 -0.212  0.356  1.137 -0.351  0.793 -0.075]
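# Added sketch: RobustScaler subtracts the column median and divides by the IQR
# (75th minus 25th percentile), which is why the outlier at 3 barely affects the
# scale of the other points (expected: True).
q1, q3 = np.percentile(X_outliers, [25, 75], axis=0)
X_manual = (X_outliers - np.median(X_outliers, axis=0)) / (q3 - q1)
np.allclose(X_manual, RobustScaler().fit_transform(X_outliers))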
from sklearn.preprocessing import Normalizer
print(m23, end='\n\n')
X_l2norm = Normalizer().fit_transform(m23)
print(X_l2norm, f', L2 = {np.square(X_l2norm[0, :]).sum()}, '
                f'x3/x1 = {X_l2norm[0,2]/X_l2norm[0,0]}')
X_l1norm = Normalizer(norm='l1').fit_transform(m23)
print(X_l1norm, f', L1 = {X_l1norm[0, :].sum()}, '
                f'x3/x1 = {X_l1norm[0,2]/X_l1norm[0,0]}')
# NOTE: Normalizer defaults to axis=1 (i.e. over samples rather than features)!
[[1 2 3]
 [4 5 6]]

[[0.267 0.535 0.802]
 [0.456 0.57  0.684]] , L2 = 1.0, x3/x1 = 3.0
[[0.167 0.333 0.5  ]
 [0.267 0.333 0.4  ]] , L1 = 1.0, x3/x1 = 3.0
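# Added sketch: Normalizer rescales each row (sample) to unit norm, so dividing
# every row of m23 by its own L2 length reproduces the default output (expected: True).
X_manual = m23 / np.linalg.norm(m23, axis=1, keepdims=True)
np.allclose(X_manual, Normalizer().fit_transform(m23))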
from sklearn.preprocessing import PolynomialFeatures
feat = np.array([[2, 3]])
PolynomialFeatures(degree=2, include_bias=False).fit_transform(feat)
# x1, x2, x1^2, x1*x2, x2^2
array([[2., 3., 4., 6., 9.]])
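# Added sketch (scikit-learn >= 1.0): get_feature_names_out confirms the column
# order noted in the comment above.
PolynomialFeatures(degree=2, include_bias=False).fit(feat).get_feature_names_out(['x1', 'x2'])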
from sklearn.preprocessing import FunctionTransformer
FunctionTransformer(lambda x: print(x.shape)).transform(m23)
(2, 3)
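# Added sketch: a more typical FunctionTransformer use is wrapping an elementwise
# transform such as np.log1p so it can be dropped into a Pipeline.
FunctionTransformer(np.log1p).fit_transform(m23)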
from sklearn.preprocessing import LabelBinarizer
feat = np.array([['A'], ['B'], ['C'], ['A']])
one_hot = LabelBinarizer()
feat_new = one_hot.fit_transform(feat)
print( feat_new )
print( one_hot.inverse_transform(feat_new), one_hot.classes_ )
[[1 0 0]
 [0 1 0]
 [0 0 1]
 [1 0 0]]
['A' 'B' 'C' 'A'] ['A' 'B' 'C']
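# Added sketch: OneHotEncoder is the feature-matrix counterpart of LabelBinarizer
# and handles several categorical columns at once (sparse by default, hence .toarray()).
from sklearn.preprocessing import OneHotEncoder
OneHotEncoder().fit_transform(feat).toarray()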
import pandas as pd
df = pd.DataFrame([['A'], ['B'], ['C'], ['A']])
pd.get_dummies(df.iloc[:, 0])
|   | A | B | C |
|---|---|---|---|
| 0 | 1 | 0 | 0 |
| 1 | 0 | 1 | 0 |
| 2 | 0 | 0 | 1 |
| 3 | 1 | 0 | 0 |
from sklearn.feature_extraction import DictVectorizer
feat = [{'he': 1, 'she': 2}, {'she': 3}]
vectorizer = DictVectorizer(sparse=False)
feat_new = vectorizer.fit_transform(feat)
print( feat_new )
print( vectorizer.get_feature_names() )  # use get_feature_names_out() in scikit-learn >= 1.0 (get_feature_names was removed in 1.2)
[[1. 2.]
 [0. 3.]]
['he', 'she']
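# Added sketch: with the default sparse=True, DictVectorizer returns a SciPy sparse
# matrix; .toarray() recovers the dense form shown above.
DictVectorizer().fit_transform(feat).toarray()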
from sklearn.impute import SimpleImputer
feat = np.array([[1, 2, 2], [3, 3, 3], [np.nan, 2, 2]])
SimpleImputer(strategy='mean').fit_transform(feat)
# 'strategy': 'mean', 'median', 'most_frequent', 'constant'
array([[1., 2., 2.],
[3., 3., 3.],
[2., 2., 2.]])
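# Added sketch: strategy='constant' fills with an explicit value instead of a statistic.
SimpleImputer(strategy='constant', fill_value=0).fit_transform(feat)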
from sklearn.impute import KNNImputer
feat = np.array([[1, 2, 2], [3, 3, 3], [np.nan, 2, 2]])
KNNImputer(n_neighbors=1).fit_transform(feat)
array([[1., 2., 2.],
[3., 3., 3.],
[1., 2., 2.]])
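# Added note/sketch: with n_neighbors=1 the missing value is copied from the single
# closest complete row ([1, 2, 2] here); a larger n_neighbors averages, e.g. (1 + 3) / 2 = 2.
KNNImputer(n_neighbors=2).fit_transform(feat)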
X, y = make_blobs(n_samples = 10,
                  n_features = 2,
                  centers = 3,
                  cluster_std = 1,
                  random_state = 1)
y
array([2, 2, 2, 1, 0, 0, 0, 1, 0, 1])
from sklearn.cluster import KMeans
clt = KMeans(n_clusters=3, random_state=0)
clt.fit_predict(X)
array([2, 2, 2, 1, 0, 0, 0, 1, 0, 1], dtype=int32)
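# Added sketch: KMeans label IDs are arbitrary, so compare clusterings with a
# permutation-invariant score; here it should be 1.0 since the two label vectors
# above describe the same partition.
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(y, clt.labels_)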
from sklearn.metrics import r2_score, mean_squared_error
y, y_hat = np.array([1, 2, 3]), np.array([1.5, 1.5, 3.5])
print( r2_score(y, y_hat) )                           # y_true comes first, then y_pred
print( mean_squared_error(y, y_hat) )
print( mean_squared_error(y, y_hat, squared=False) )  # RMSE
0.625
0.25
0.5
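# Added sketch: R^2 = 1 - SS_res / SS_tot, computed by hand for the same vectors;
# this should match r2_score(y, y_hat) above.
ss_res = np.sum((y - y_hat) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
1 - ss_res / ss_tot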
from statsmodels.regression.linear_model import OLS
from statsmodels.api import add_constant
X_scaled = StandardScaler().fit_transform(Xr)
X_bias = add_constant(X_scaled)
ols = OLS(yr, X_bias).fit()
ols.summary()
| Dep. Variable: | y | R-squared: | 0.961 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.960 |
| Method: | Least Squares | F-statistic: | 2396. |
| Date: | Sun, 27 Mar 2022 | Prob (F-statistic): | 5.55e-139 |
| Time: | 18:18:36 | Log-Likelihood: | -751.81 |
| No. Observations: | 200 | AIC: | 1510. |
| Df Residuals: | 197 | BIC: | 1520. |
| Df Model: | 2 | | |
| Covariance Type: | nonrobust | | |
|   | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| const | 5.3474 | 0.740 | 7.229 | 0.000 | 3.889 | 6.806 |
| x1 | 50.4046 | 0.745 | 67.628 | 0.000 | 48.935 | 51.874 |
| x2 | 17.1283 | 0.745 | 22.981 | 0.000 | 15.658 | 18.598 |
| Omnibus: | 0.542 | Durbin-Watson: | 2.039 |
|---|---|---|---|
| Prob(Omnibus): | 0.763 | Jarque-Bera (JB): | 0.264 |
| Skew: | -0.018 | Prob(JB): | 0.876 |
| Kurtosis: | 3.174 | Cond. No. | 1.13 |
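# Added sketch: fitting the same data with scikit-learn's LinearRegression should
# reproduce the OLS intercept, coefficients and R-squared reported above.
from sklearn.linear_model import LinearRegression
lr = LinearRegression().fit(X_scaled, yr)
lr.intercept_, lr.coef_, lr.score(X_scaled, yr)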