這節實作課將帶你親手操作三種核心重抽樣技術:驗證集法(Validation Set)、交叉驗證(Cross-Validation)與 Bootstrap。我們的目標是用 Auto 資料集預測 mpg(每加侖英里數),並比較不同模型複雜度的表現。
statsmodels 擬合模型、sklearn 做交叉驗證,兩者透過 ISLP.models.sklearn_sm 包裝器橋接。
驗證集法就像把披薩切成兩半,一半試吃(訓練)、一半評分(驗證)——簡單直接,但結果高度依賴「怎麼切」。我們將 Auto 的 392 筆資料對半分,用 horsepower 預測 mpg。
# === Google Colab + 本機相容資料路徑 ===
try:
from google.colab import drive
drive.mount('/content/drive')
DATA_PATH = '/content/drive/MyDrive/ISLP_data/'
except ImportError:
DATA_PATH = '/tmp/'
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import ModelSpec as MS, summarize, poly
from sklearn.model_selection import train_test_split
# 載入 Auto 資料集
Auto = load_data('Auto')
print(f"Auto shape: {Auto.shape}") # (392, 9)
print(Auto.head(3))
try:
from google.colab import drive
drive.mount('/content/drive')
DATA_PATH = '/content/drive/MyDrive/ISLP_data/'
except ImportError:
DATA_PATH = '/tmp/'
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import ModelSpec as MS
from sklearn.model_selection import train_test_split
Auto = load_data('Auto')
# 392 筆 → 196 訓練 + 196 驗證
Auto_train, Auto_valid = train_test_split(
Auto, test_size=196, random_state=0
)
print(f"訓練: {Auto_train.shape[0]} 筆, 驗證: {Auto_valid.shape[0]} 筆")
# 僅用 horsepower 預測 mpg
hp_mm = MS(['horsepower'])
X_train = hp_mm.fit_transform(Auto_train)
y_train = Auto_train['mpg']
model = sm.OLS(y_train, X_train).fit()
# 驗證集預測
X_valid = hp_mm.transform(Auto_valid)
y_valid = Auto_valid['mpg']
valid_pred = model.predict(X_valid)
mse_linear = np.mean((y_valid - valid_pred) ** 2)
print(f"線性模型驗證 MSE: {mse_linear:.4f}")
try:
from google.colab import drive
drive.mount('/content/drive')
DATA_PATH = '/content/drive/MyDrive/ISLP_data/'
except ImportError:
DATA_PATH = '/tmp/'
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import ModelSpec as MS, poly
from sklearn.model_selection import train_test_split
Auto = load_data('Auto')
Auto_train, Auto_valid = train_test_split(
Auto, test_size=196, random_state=0
)
def evalMSE(terms, response, train, test):
"""通用 MSE 評估:擬合 train → 預測 test → 回傳 MSE"""
mm = MS(terms)
X_train = mm.fit_transform(train)
y_train = train[response]
X_test = mm.transform(test)
y_test = test[response]
results = sm.OLS(y_train, X_train).fit()
test_pred = results.predict(X_test)
return np.mean((y_test - test_pred) ** 2)
# 比較線性、二次、三次多項式(random_state=0)
MSE = np.zeros(3)
for idx, degree in enumerate(range(1, 4)):
MSE[idx] = evalMSE(
[poly('horsepower', degree)], 'mpg',
Auto_train, Auto_valid
)
print("random_state=0:")
for d, mse in zip(range(1, 4), MSE):
print(f" 次數={d}: MSE={mse:.4f}")
try:
from google.colab import drive
drive.mount('/content/drive')
DATA_PATH = '/content/drive/MyDrive/ISLP_data/'
except ImportError:
DATA_PATH = '/tmp/'
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import ModelSpec as MS, poly
from sklearn.model_selection import train_test_split
Auto = load_data('Auto')
def evalMSE(terms, response, train, test):
mm = MS(terms)
X_train = mm.fit_transform(train)
y_train = train[response]
X_test = mm.transform(test)
y_test = test[response]
results = sm.OLS(y_train, X_train).fit()
return np.mean((y_test - results.predict(X_test)) ** 2)
# random_state=3 的分割
Auto_train2, Auto_valid2 = train_test_split(
Auto, test_size=196, random_state=3
)
MSE2 = np.zeros(3)
for idx, degree in enumerate(range(1, 4)):
MSE2[idx] = evalMSE(
[poly('horsepower', degree)], 'mpg',
Auto_train2, Auto_valid2
)
print("random_state=3:")
for d, mse in zip(range(1, 4), MSE2):
print(f" 次數={d}: MSE={mse:.4f}")
print("兩個 random_state 給出不同 MSE → 驗證集法的弱點!")
mpg 與 horsepower 呈非線性關係,二次曲線已足夠。
汽車製造商需根據引擎馬力預估油耗。只用線性模型(MSE ≈ 23.6),誤差可能導致油耗標示不準。改用二次模型(MSE ≈ 18.8)可大幅降低誤差。但換一組訓練/驗證分割,MSE 就從 18.8 變成 17.0——需要更穩健的評估方法!
交叉驗證的核心精神:不要只切一次,輪流切很多次再平均。想像找了 10 個朋友試吃新菜,每個人吃不同一口,綜合評分——比只問一個人客觀多了。
sklearn_sm() 包裝器,使 statsmodels 擬合的模型能被 sklearn 的 cross_validate() 使用。核心觀念:sklearn 期望模型有 fit()、predict()、score() 方法。
try:
from google.colab import drive
drive.mount('/content/drive')
DATA_PATH = '/content/drive/MyDrive/ISLP_data/'
except ImportError:
DATA_PATH = '/tmp/'
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import ModelSpec as MS, sklearn_sm
from sklearn.model_selection import cross_validate
Auto = load_data('Auto')
hp_model = sklearn_sm(sm.OLS, MS(['horsepower']))
X = Auto.drop(columns=['mpg'])
y = Auto['mpg']
# LOOCV: cv = 樣本數
cv_results = cross_validate(hp_model, X, y, cv=Auto.shape[0])
cv_err = np.mean(cv_results['test_score'])
print(f"LOOCV 估計 MSE: {cv_err:.4f}")
try:
from google.colab import drive
drive.mount('/content/drive')
DATA_PATH = '/content/drive/MyDrive/ISLP_data/'
except ImportError:
DATA_PATH = '/tmp/'
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import sklearn_sm
from sklearn.model_selection import cross_validate
Auto = load_data('Auto')
y = Auto['mpg']
H = np.array(Auto['horsepower'])
M = sklearn_sm(sm.OLS)
cv_error = np.zeros(5)
for i, d in enumerate(range(1, 6)):
X_poly = np.power.outer(H, np.arange(d + 1))
M_CV = cross_validate(M, X_poly, y, cv=Auto.shape[0])
cv_error[i] = np.mean(M_CV['test_score'])
print("LOOCV 多項式比較:")
for d, err in zip(range(1, 6), cv_error):
print(f" 次數={d}: CV MSE = {err:.4f}")
plt.figure(figsize=(8, 5))
plt.plot(range(1, 6), cv_error, 'o-', color='#58a6ff', linewidth=2)
plt.xlabel('Polynomial Degree')
plt.ylabel('LOOCV Estimated Test MSE')
plt.title('LOOCV: Polynomial Degree vs Test Error')
plt.grid(alpha=0.3)
plt.savefig('/tmp/islp_loocv_poly.png', dpi=100, bbox_inches='tight')
plt.show()
print("Chart saved to /tmp/islp_loocv_poly.png")
try:
from google.colab import drive
drive.mount('/content/drive')
DATA_PATH = '/content/drive/MyDrive/ISLP_data/'
except ImportError:
DATA_PATH = '/tmp/'
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import sklearn_sm
from sklearn.model_selection import cross_validate, KFold
Auto = load_data('Auto')
y = Auto['mpg']
H = np.array(Auto['horsepower'])
M = sklearn_sm(sm.OLS)
cv_10f = KFold(n_splits=10, shuffle=True, random_state=0)
cv_err_10f = np.zeros(5)
for i, d in enumerate(range(1, 6)):
X_poly = np.power.outer(H, np.arange(d + 1))
MCV = cross_validate(M, X_poly, y, cv=cv_10f)
cv_err_10f[i] = np.mean(MCV['test_score'])
print("10-Fold CV:")
for d, err in zip(range(1, 6), cv_err_10f):
print(f" 次數={d}: CV MSE = {err:.4f}")
print("與 LOOCV 結果幾乎相同,但只需擬合 10 次!")
try:
from google.colab import drive
drive.mount('/content/drive')
DATA_PATH = '/content/drive/MyDrive/ISLP_data/'
except ImportError:
DATA_PATH = '/tmp/'
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import ModelSpec as MS, sklearn_sm
from sklearn.model_selection import cross_validate, ShuffleSplit
Auto = load_data('Auto')
hp_model = sklearn_sm(sm.OLS, MS(['horsepower']))
X = Auto.drop(columns=['mpg'])
y = Auto['mpg']
# 10 次隨機分割取平均
val10 = ShuffleSplit(n_splits=10, test_size=196, random_state=0)
res10 = cross_validate(hp_model, X, y, cv=val10)
print(f"10-split 平均 MSE: {res10['test_score'].mean():.4f}")
print(f"10-split 標準差: {res10['test_score'].std():.4f}")
print("(標準差 ≠ 抽樣變異性,因訓練集有大量重疊)")
Bootstrap 的核心精神:從你的資料中反覆抽樣(有放回),用這些假資料的統計量分布來估計真實的不確定性。就像把一份食譜重寫 1000 次,每次隨機抽掉/重複一些步驟,看成品差異有多大。
try:
from google.colab import drive
drive.mount('/content/drive')
DATA_PATH = '/content/drive/MyDrive/ISLP_data/'
except ImportError:
DATA_PATH = '/tmp/'
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
def boot_SE(func, D, n=None, B=1000, seed=0):
"""Bootstrap 標準誤估計: SE = sqrt(E[X^2] - E[X]^2)"""
rng = np.random.default_rng(seed)
first_, second_ = 0.0, 0.0
n = n or D.shape[0]
for _ in range(B):
idx = rng.choice(D.index, n, replace=True)
value = func(D, idx)
first_ += value
second_ += value ** 2
return np.sqrt(second_ / B - (first_ / B) ** 2)
print("boot_SE() 已定義 — 用於估計任意統計量的標準誤")
try:
from google.colab import drive
drive.mount('/content/drive')
DATA_PATH = '/content/drive/MyDrive/ISLP_data/'
except ImportError:
DATA_PATH = '/tmp/'
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from ISLP import load_data
def boot_SE(func, D, n=None, B=1000, seed=0):
rng = np.random.default_rng(seed)
first_, second_ = 0.0, 0.0
n = n or D.shape[0]
for _ in range(B):
idx = rng.choice(D.index, n, replace=True)
v = func(D, idx)
first_ += v
second_ += v ** 2
return np.sqrt(second_ / B - (first_ / B) ** 2)
Portfolio = load_data('Portfolio')
def alpha_func(D, idx):
cov_ = np.cov(D[['X', 'Y']].loc[idx], rowvar=False)
return (cov_[1, 1] - cov_[0, 1]) / (cov_[0, 0] + cov_[1, 1] - 2 * cov_[0, 1])
alpha_SE = boot_SE(alpha_func, Portfolio, B=1000, seed=0)
alpha_est = alpha_func(Portfolio, Portfolio.index)
print(f"α 點估計 = {alpha_est:.5f}")
print(f"Bootstrap SE(α) = {alpha_SE:.5f}")
print(f"近似 95% CI: [{alpha_est - 2*alpha_SE:.4f}, {alpha_est + 2*alpha_SE:.4f}]")
try:
from google.colab import drive
drive.mount('/content/drive')
DATA_PATH = '/content/drive/MyDrive/ISLP_data/'
except ImportError:
DATA_PATH = '/tmp/'
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import ModelSpec as MS, summarize
from sklearn.base import clone
from functools import partial
def boot_SE(func, D, n=None, B=1000, seed=0):
rng = np.random.default_rng(seed)
first_, second_ = 0.0, 0.0
n = n or D.shape[0]
for _ in range(B):
idx = rng.choice(D.index, n, replace=True)
v = func(D, idx)
first_ += v
second_ += v ** 2
return np.sqrt(second_ / B - (first_ / B) ** 2)
def boot_OLS(model_matrix, response, D, idx):
D_ = D.loc[idx]
Y_ = D_[response]
X_ = clone(model_matrix).fit_transform(D_)
return sm.OLS(Y_, X_).fit().params
Auto = load_data('Auto')
hp_func = partial(boot_OLS, MS(['horsepower']), 'mpg')
# Bootstrap SE
hp_se = boot_SE(hp_func, Auto, B=1000, seed=10)
print("Bootstrap SE:")
print(hp_se)
# 經典公式 SE
hp_model = sm.OLS(Auto['mpg'], MS(['horsepower']).fit_transform(Auto))
model_se = summarize(hp_model.fit())['std err']
print("\nClassical SE:")
print(model_se)
print("\nBootstrap SE 較大 → 更誠實,不依賴線性/固定 x 假設")
try:
from google.colab import drive
drive.mount('/content/drive')
DATA_PATH = '/content/drive/MyDrive/ISLP_data/'
except ImportError:
DATA_PATH = '/tmp/'
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import ModelSpec as MS, summarize, poly
from sklearn.base import clone
from functools import partial
def boot_SE(func, D, n=None, B=1000, seed=0):
rng = np.random.default_rng(seed)
first_, second_ = 0.0, 0.0
n = n or D.shape[0]
for _ in range(B):
idx = rng.choice(D.index, n, replace=True)
v = func(D, idx)
first_ += v
second_ += v ** 2
return np.sqrt(second_ / B - (first_ / B) ** 2)
def boot_OLS(model_matrix, response, D, idx):
D_ = D.loc[idx]
Y_ = D_[response]
X_ = clone(model_matrix).fit_transform(D_)
return sm.OLS(Y_, X_).fit().params
Auto = load_data('Auto')
quad_model = MS([poly('horsepower', 2, raw=True)])
quad_func = partial(boot_OLS, quad_model, 'mpg')
# Bootstrap SE
quad_se = boot_SE(quad_func, Auto, B=1000)
print("Bootstrap SE (quadratic):")
print(quad_se)
# Classical SE
M_q = sm.OLS(Auto['mpg'], quad_model.fit_transform(Auto))
quad_ce = summarize(M_q.fit())['std err']
print("\nClassical SE (quadratic):")
print(quad_ce)
print("\n二次模型假設合理 → Bootstrap 與古典 SE 接近")
mpg ~ horsepower 有非線性關係 → 殘差膨脹 → σ² 低估 → SE 低估。| 特性 | 驗證集法 | K-Fold CV | Bootstrap |
|---|---|---|---|
| 核心概念 | 單次隨機分割 | 輪流用不同子集驗證 | 有放回抽樣模擬分布 |
| 計算成本 | 極低(1 次擬合) | 中等(K 次擬合) | 高(B≈1000 次擬合) |
| 偏誤 | 可能高估 | 較低(K-fold 平衡) | 低(用完整資料) |
| 變異性估計 | 無 | 可提供(折疊間變異) | 精確(完整抽樣分布) |
| 適用場景 | 超大資料集初步篩選 | 模型選擇、超參數調優 | SE/CI 估計、不確定性量化 |
| sklearn API | train_test_split | cross_validate + KFold | 手寫(無內建) |
| 典型 n | 任意 | n ≥ 100 | n ≥ 30 |
Bootstrap 的核心哲理——「用多個略有不同的資料集重複估計,匯總出穩健結論」——完美對應到 subagent-driven-development 架構:
multi-model-code-review 的 4 模型交叉審查,≥2 共識才標記在 multi-model-code-review 中,≥2 模型共識才標記問題——正是「交叉驗證式」的品質保證。未來可擴展為「Bootstrap 式」:對同一段碼用不同提示詞/溫度重複審查,統計問題出現頻率與一致性。