Most ML libraries operate on NumPy arrays behind the scenes. Mastering ndarray operations helps you preprocess data quickly, implement custom transforms, and pass clean matrices to Scikit-learn—all while visualizing with Matplotlib and organizing with Pandas.
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
# 1) Assemble a tiny demo table with Pandas
df = pd.DataFrame(
    {
        'age': [22, 35, 58, 45, 26, 33, 52, 41],
        'income': [35, 62, 120, 85, 40, 59, 110, 77],
        'city': ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'B'],
        'bought': [0, 1, 1, 1, 0, 0, 1, 0],
    }
)

# 2) One-hot encode 'city' and hand the indicator columns over as an ndarray
cities = pd.Categorical(df['city'])
city_dummies = pd.get_dummies(cities)
X_city = city_dummies.to_numpy()  # (n, k): one column per distinct city

# 3) Numeric columns as a float64 ndarray
numeric_frame = df[['age', 'income']]
X_num = numeric_frame.to_numpy(dtype='float64')

# 4) Design matrix: numeric columns first, then the city indicators
X = np.hstack([X_num, X_city])  # (n, d)
y = df['bought'].to_numpy()

# 5) Hold out 25% of the rows for testing
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=42)

# 6) Standardize only the numeric columns (the first two); the statistics
#    are learned from the training rows and reused on the test rows
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(X_tr[:, :2])
X_tr[:, :2] = scaler.transform(X_tr[:, :2])
X_te[:, :2] = scaler.transform(X_te[:, :2])

# 7) Fit a logistic-regression classifier and report held-out accuracy
clf = LogisticRegression(max_iter=1000)
clf.fit(X_tr, y_tr)
accuracy = clf.score(X_te, y_te)
print('Accuracy:', accuracy)

# 8) Plot the decision scores of the test rows against standardized age
scores = clf.decision_function(X_te)
z_age = X_te[:, 0]
plt.scatter(z_age, scores, c=y_te, cmap='bwr', edgecolors='k')
plt.xlabel('z-age')
plt.ylabel('score')
plt.title('Logit scores by standardized age')
plt.show()
Replace slow loops with array ops for feature engineering.
# Numeric columns as float64: column 0 is age, column 1 is income
Z = df[['age','income']].to_numpy(dtype='float64')
age_col, income_col = Z[:, 0], Z[:, 1]

# Feature cross: elementwise age * income, kept as a column vector
cross = (age_col * income_col).reshape(-1, 1)  # (n,1)

# Binning (example): income deciles → one-hot
n_bins = 10
# n_bins + 1 quantile edges running from the minimum to the maximum income
q = np.quantile(income_col, np.linspace(0, 1, n_bins + 1))
# side='right' sends the maximum one past the last bin, so clip it back in
bin_idx = np.clip(
    np.searchsorted(q, income_col, side='right') - 1,
    0,
    n_bins - 1,
)
# Row i of the identity matrix is the one-hot code for bin i
onehot_bins = np.eye(n_bins, dtype=int)[bin_idx]  # (n,10)

# Final design matrix: raw numerics, the cross term, then the bin indicators
features = np.hstack([Z, cross, onehot_bins])
Handy one-liners — standardization: Z = (X - X.mean(0)) / X.std(0); covariance: C = (X - mu).T @ (X - mu) / (n - 1); least squares: w = np.linalg.lstsq(X, y, rcond=None)[0].
X = np.random.randn(100, 5)
# Column means, kept 2-D (keepdims) so they broadcast against the rows of X
mu = X.mean(0, keepdims=True)
centered = X - mu
n_rows = X.shape[0]
# Sample covariance of the columns (n - 1 in the denominator)
S = (centered.T @ centered) / (n_rows - 1)
Scikit-learn expects (n_samples, n_features) arrays. Use astype to align dtypes, and slice columns by position for numeric vs. categorical paths.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
# Example: build on the original df
numeric_cols = ['age', 'income']
categorical_cols = ['city']

# Numeric path: fill gaps with the column median, then standardize
numeric_steps = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='median')),
        ('scale', StandardScaler()),
    ]
)

# Categorical path: one-hot encode; handle_unknown='ignore' lets categories
# unseen during fit pass through transform without raising
categorical_encoder = OneHotEncoder(handle_unknown='ignore')

# Route each group of columns through its own preprocessing path
pre = ColumnTransformer(
    transformers=[
        ('num', numeric_steps, numeric_cols),
        ('cat', categorical_encoder, categorical_cols),
    ]
)

# Preprocessing and the classifier chained into a single estimator
forest = RandomForestClassifier(n_estimators=200, random_state=0)
pipe = Pipeline(steps=[('prep', pre), ('model', forest)])

feature_cols = numeric_cols + categorical_cols
pipe.fit(df[feature_cols], df['bought'])
import matplotlib.pyplot as plt
# 500 standard-normal draws for each of two features
X = np.random.randn(500, 2)
x1, x2 = X[:, 0], X[:, 1]

plt.figure(figsize=(5, 4))
# 2-D histogram with 30 bins per axis, colored with the viridis map
plt.hist2d(x1, x2, bins=30, cmap='viridis')
plt.colorbar(label='count')
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Feature density')
plt.show()
- Use np.isnan/np.isfinite masks for quick cleaning.
- … .fit.
- Prefer float64 during training for stability; downcast at the edges.
X = df[['age','income']].to_numpy(float)
# Flag every entry that is NaN or +/-inf.
mask = ~np.isfinite(X)
# Average only the finite entries of each column. np.nanmean skips NaN but
# not infinities, so the flagged entries are blanked to NaN first; otherwise
# a single inf would make that column's fill value inf (or NaN).
col_means = np.nanmean(np.where(mask, np.nan, X), axis=0)
# np.where(mask)[1] is the column index of each flagged entry, so np.take
# looks up the matching column mean for every entry being replaced.
X[mask] = np.take(col_means, np.where(mask)[1])
- Prefer C_CONTIGUOUS arrays for heavy math; use np.ascontiguousarray.
- … tile/repeat; cast once at the end.
- Use np.random.default_rng(seed) and pass random_state in estimators.
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, RocCurveDisplay
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
# Synthetic dataset (seeded generator, so every run draws the same values)
rng = np.random.default_rng(9)
n = 800
age = rng.normal(35, 8, n)
income = rng.normal(70, 20, n)
city = rng.choice(['A','B','C'], size=n, p=[0.4,0.4,0.2])

# True rule (hidden): linear in centered age and income, a bump for city 'C',
# plus Gaussian noise (drawn last, after the features)
noise = rng.normal(0, 0.5, n)
z = 0.03*(age-35) + 0.02*(income-70) + (city=='C')*0.5 + noise
y = (z > 0.0).astype(int)

# Frame → NumPy
df = pd.DataFrame({'age':age, 'income':income, 'city':city, 'y':y})
X_num = df[['age','income']].to_numpy(float)
city_dummies = pd.get_dummies(df['city'])
X_city = city_dummies.to_numpy()
X = np.hstack([X_num, X_city])
y = df['y'].to_numpy()

# Split, then scale the two numeric columns using training statistics only
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
scaler = StandardScaler()
scaler.fit(X_tr[:, :2])
X_tr[:, :2] = scaler.transform(X_tr[:, :2])
X_te[:, :2] = scaler.transform(X_te[:, :2])

# Train & evaluate
clf = LogisticRegression(max_iter=1000)
clf.fit(X_tr, y_tr)
proba = clf.predict_proba(X_te)[:,1]  # probability of the positive class
auc = roc_auc_score(y_te, proba)
print('AUC:', auc)

# Plot ROC
RocCurveDisplay.from_predictions(y_te, proba)
plt.show()
- Ensure arrays are shaped (n_samples, n_features) before .fit().
- Fit scalers on train, then transform both train and test.
- Use float64; handle categoricals separately.
- Check arr.flags['C_CONTIGUOUS'] when passing to heavy ops.
# 1) Build a NumPy-only one-hot encoder for an integer category array (0..k-1).
# Integer category ids, one per sample
cats = np.array([0, 2, 1, 2, 0])
# Compare each id against every possible category (0..max) via broadcasting;
# the boolean match table cast to int is the one-hot matrix
category_ids = np.arange(cats.max() + 1)
onehot = (cats[:, None] == category_ids).astype(int)
# 2) Write a function that z-scores each column of X using keepdims.
def zscore(X):
    """Standardize ``X`` along axis 0 (column-wise for a 2-D input).

    Args:
        X: Array-like of numbers, e.g. an ndarray or nested lists with
            samples in rows. It is converted with ``np.asarray``.

    Returns:
        ndarray with the same shape as ``X``: each column has its mean
        subtracted and is divided by its standard deviation (NumPy's
        default ``ddof=0``) plus a small epsilon.
    """
    X = np.asarray(X)
    # keepdims=True keeps the reduced axis so mu/sd broadcast against X.
    mu = X.mean(0, keepdims=True)
    sd = X.std(0, keepdims=True)
    # The epsilon avoids division by zero for constant columns (sd == 0),
    # which therefore come out as all zeros.
    return (X - mu) / (sd + 1e-9)
# 3) Using train/test split, compare a baseline logistic regression with and without scaling.
Author
🎥 Join me live on YouTube
Passionate about coding and teaching, I publish practical tutorials on PHP, Python, JavaScript, SQL, and web development. My goal is to make learning simple, engaging, and project‑oriented with real examples and source code.