Vectorization replaces slow Python loops with compiled operations over entire arrays. The results are shorter code, fewer bugs, and large speedups—especially for numeric workloads.
Use %timeit in Jupyter or the timeit module in scripts. The timings below are illustrative; run them locally to see your machine's numbers.
import numpy as np
N = 1_000_000
x = np.random.rand(N)
# Python loop
def loop_square(x):
    out = np.empty_like(x)
    for i, v in enumerate(x):
        out[i] = v*v
    return out
# Vectorized with ufunc
vec = x * x # or np.square(x)
# %timeit loop_square(x)
# %timeit x*x
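Before timing, it is worth confirming that both paths agree; a quick sanity check (not part of the timing itself):
# Sanity check: the loop and the vectorized expression should match
assert np.allclose(loop_square(x), vec)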
x = np.arange(10)
# Loop
out_loop = np.empty_like(x)
for i, v in enumerate(x):
    out_loop[i] = 0 if v % 2 else v
# Vectorized
out_vec = np.where(x % 2 == 1, 0, x)
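An equivalent vectorized route uses a boolean mask with assignment; a minimal sketch that copies first so the original x stays untouched:
# Boolean-mask alternative: copy, then zero out the odd entries
out_mask = x.copy()
out_mask[x % 2 == 1] = 0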
X = np.arange(12, dtype=float).reshape(3,4)
col_means = X.mean(axis=0) # (4,)
Xz = (X - col_means) # broadcast subtract
scales = np.array([1.0, 0.5, 2.0, 1.5])
Xs = X * scales # column-wise scaling
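Broadcasting aligns trailing dimensions, so a per-row vector needs an explicit trailing axis; a minimal sketch with an illustrative row_scales vector:
# Row-wise scaling: reshape (3,) -> (3, 1) so it broadcasts across columns
row_scales = np.array([1.0, 2.0, 0.5])
Xr = X * row_scales[:, None]   # (3, 4) * (3, 1) -> (3, 4)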
M = np.random.randint(0, 100, (1000, 1000))
# Loop sum by column (slow)
col_sum_loop = np.zeros(M.shape[1], dtype=int)
for j in range(M.shape[1]):
    s = 0
    for i in range(M.shape[0]):
        s += M[i, j]
    col_sum_loop[j] = s
# Vectorized
col_sum_vec = M.sum(axis=0)
col_mean_vec = M.mean(axis=0)
col_std_vec = M.std(axis=0)
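The same reductions work along rows by switching the axis argument:
# Row statistics: axis=1 reduces across the columns within each row
row_sum_vec = M.sum(axis=1)
row_mean_vec = M.mean(axis=1)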
N = 10_000
# Bad: repeatedly grows Python list
lst = []
for i in range(N):
    lst.append(i*i)
arr_bad = np.array(lst)
# Better: preallocate and fill
arr = np.empty(N, dtype=np.int64)
for i in range(N):
    arr[i] = i*i
# Best: vectorize completely
arr_vec = np.arange(N, dtype=np.int64)**2
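When the values genuinely have to come from a Python-level generator, np.fromiter is a middle ground between appending to a list and full vectorization; a small sketch:
# Fill an array directly from a generator, skipping the intermediate list
arr_iter = np.fromiter((i*i for i in range(N)), dtype=np.int64, count=N)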
A = np.array([[0,0],[1,1],[2,2]]) # (3,2)
B = np.array([[0,1],[1,2]]) # (2,2)
diff = A[:, None, :] - B[None, :, :] # (3,1,2) - (1,2,2) -> (3,2,2)
D = np.sqrt((diff**2).sum(axis=-1)) # (3,2)
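If SciPy is available, cdist computes the same pairwise distances and is often the more readable choice; a sketch assuming SciPy is installed:
# Same (3, 2) distance matrix via SciPy (optional dependency)
from scipy.spatial.distance import cdist
D2 = cdist(A, B)   # defaults to Euclidean (L2) distance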
Use with care; this returns a view by manipulating strides. Prefer high-level APIs when available.
from numpy.lib.stride_tricks import as_strided
x = np.arange(10)
w = 4
shape = (x.size - w + 1, w)
strides = (x.strides[0], x.strides[0])
windows = as_strided(x, shape=shape, strides=strides)
moving_sum = windows.sum(axis=1)
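One such high-level API (NumPy 1.20+) is sliding_window_view, which builds the same windows without manual stride arithmetic:
# Safer, read-only windows without hand-written strides (NumPy >= 1.20)
from numpy.lib.stride_tricks import sliding_window_view
windows_hl = sliding_window_view(x, w)
moving_sum_hl = windows_hl.sum(axis=1)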
X = np.arange(12).reshape(3,4)
w = np.arange(4)
# Dot using broadcasting sum
y1 = (X * w).sum(axis=1)
# Equivalent with einsum (often clearer)
y2 = np.einsum('ij,j->i', X, w)
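Both are just a matrix-vector product, so a quick check that all three formulations agree:
# All three give the same result vector
y3 = X @ w
assert np.allclose(y1, y2) and np.allclose(y1, y3)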
import timeit, numpy as np
setup = """import numpy as np
x = np.random.rand(1_000_00)
def loop_square(x):
out = np.empty_like(x)
for i, v in enumerate(x):
out[i] = v*v
return out
"""
t_loop = timeit.timeit('loop_square(x)', setup=setup, number=10)
t_vec = timeit.timeit('x*x', setup=setup, number=10)
print('loop:', t_loop, 'vectorized:', t_vec)
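To make the comparison easier to read, you can also report the ratio (the exact number depends on your machine):
# Relative speedup of the vectorized version over the loop
print(f'speedup: {t_loop / t_vec:.1f}x')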
- Check arr.flags['C_CONTIGUOUS'] / arr.flags['F_CONTIGUOUS'] before heavy ops; use np.ascontiguousarray if needed.
- np.vectorize ≠ speed: it is a convenience wrapper, not a real vectorizer; prefer ufuncs and broadcasting.
- Watch the dtype for large sums (e.g., use float64).
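The contiguity point in a minimal sketch: a transpose is only a view, so code that expects C order may need an explicit copy:
# A transposed view is F-contiguous, not C-contiguous
T = np.arange(12).reshape(3, 4).T
print(T.flags['C_CONTIGUOUS'])        # False
Tc = np.ascontiguousarray(T)          # contiguous copy in C order
print(Tc.flags['C_CONTIGUOUS'])       # True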
# 1) Replace negatives with 0 using a vectorized approach
x = np.array([3, -1, 5, -2, 7])
print(np.where(x < 0, 0, x))
# 2) Standardize columns of a (100, 3) array (z-score)
X = np.random.randn(100, 3)
Xz = (X - X.mean(axis=0)) / X.std(axis=0)
# 3) Compute pairwise L2 distances between (n,2) and (m,2) sets without loops
# 4) Using timeit, compare Python loop sum of squares vs (x**2).sum()