這堂 Lab 課是第 4 章的實戰精華。我們會在同一個資料集(Smarket 股市資料)上依序用五種分類器做預測,然後切換到 Caravan 保險資料和 Bikeshare 共享單車資料,體驗不同資料特性如何影響演算法選擇。
sklearn 實作 Logistic Regression、LDA、QDA、Naive Bayes、KNN想像你是一個股票交易員,每天早上打開 Bloomberg 終端機,想用前五天的報酬率預測今天大盤是漲還是跌。這就是 Smarket 資料集的場景——它是 S&P 500 指數從 2001 到 2005 的每日資料,共 1,250 個交易日。
# Colab / 本機通用資料讀取
try:
from google.colab import drive
drive.mount('/content/drive')
DATA_PATH = '/content/drive/MyDrive/ISLP_data/'
except ImportError:
DATA_PATH = '/tmp/'
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ISLP import load_data
from ISLP.models import summarize
# 載入 Smarket 股市資料
Smarket = load_data('Smarket')
print(f"資料維度: {Smarket.shape}")
print(f"欄位: {list(Smarket.columns)}")
print(Smarket.head(5))
# 相關係數矩陣(不含 Direction 類別變數)
corr = Smarket.drop(columns=['Direction']).corr()
print("\n相關係數矩陣:")
print(corr.round(3))
Logistic Regression(羅吉斯迴歸)像是一個「機率翻譯機」:把線性組合 \( \beta_0 + \beta_1 X_1 + \cdots \) 透過 sigmoid 函數壓縮到 (0,1) 之間,變成「上漲機率」。
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ISLP import load_data
from ISLP.models import summarize
from ISLP import confusion_table
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
Smarket = load_data('Smarket')
# 用 statsmodels 做 Logistic Regression(含 p-value)
X = sm.add_constant(Smarket[['Lag1','Lag2','Lag3','Lag4','Lag5','Volume']])
y = (Smarket['Direction'] == 'Up').astype(int)
model = sm.Logit(y, X).fit(disp=False)
print(summarize(model))
# 預測機率與分類
probs = model.predict(X)
pred_labels = np.where(probs > 0.5, 'Up', 'Down')
# 混淆矩陣
print("\n訓練集混淆矩陣:")
print(confusion_table(pred_labels, Smarket['Direction']))
# 訓練集準確率
accuracy = np.mean(pred_labels == Smarket['Direction'])
print(f"\n訓練集準確率: {accuracy:.1%}")
import matplotlib
matplotlib.use('Agg')
import numpy as np
import pandas as pd
from ISLP import load_data
from ISLP import confusion_table
import statsmodels.api as sm
Smarket = load_data('Smarket')
# 用布林向量分割:2005 之前 = 訓練,2005 = 測試
train = Smarket['Year'] < 2005
X_train = sm.add_constant(Smarket.loc[train, ['Lag1','Lag2']])
y_train = (Smarket.loc[train, 'Direction'] == 'Up').astype(int)
X_test = sm.add_constant(Smarket.loc[~train, ['Lag1','Lag2']])
y_test = (Smarket.loc[~train, 'Direction'] == 'Up').astype(int)
# 只用 Lag1, Lag2 重新擬合
model2 = sm.Logit(y_train, X_train).fit(disp=False)
probs_test = model2.predict(X_test)
pred_test = np.where(probs_test > 0.5, 'Up', 'Down')
print("測試集混淆矩陣(2005 年,252 個交易日):")
print(confusion_table(pred_test, Smarket.loc[~train, 'Direction']))
test_acc = np.mean(pred_test == Smarket.loc[~train, 'Direction'])
print(f"\n測試集準確率: {test_acc:.1%}")
print(f"(隨機猜測基準: {max(np.mean(y_test), 1-np.mean(y_test)):.1%})")
Logistic Regression 是銀行業偵測詐欺交易的經典首選。每天數百萬筆交易中,只有 0.1% 是詐欺(極度不平衡類別)。Logistic Regression 的優勢在於:它直接輸出機率,風控團隊可以動態調整閾值——平常設 0.5,可疑時段降到 0.3,平衡抓詐率與誤報率。
如果 Logistic Regression 是「直接用機率建模」,那 LDA 就是「先畫出每個類別的資料分布,再決定邊界」。LDA 假設每個類別的資料來自同一個共變異數矩陣(即不同類別的「形狀」相同,只是「位置」不同)的常態分佈。
import matplotlib
matplotlib.use('Agg')
import numpy as np
from ISLP import load_data
from ISLP import confusion_table
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
Smarket = load_data('Smarket')
train = Smarket['Year'] < 2005
X_train = Smarket.loc[train, ['Lag1','Lag2']]
y_train = Smarket.loc[train, 'Direction']
X_test = Smarket.loc[~train, ['Lag1','Lag2']]
y_test = Smarket.loc[~train, 'Direction']
# LDA 擬合(store_covariance=True 才能看群組平均數)
lda = LDA(store_covariance=True)
lda.fit(X_train, y_train)
# 先驗機率
print(f"先驗機率 π_Down = {lda.priors_[0]:.3f}, π_Up = {lda.priors_[1]:.3f}")
# 類別平均數
print(f"\n類別平均數:\nDown:\n{lda.means_[0]}\nUp:\n{lda.means_[1]}")
# 線性判別向量
print(f"\n判別向量 (scalings_): {lda.scalings_.ravel()}")
# 預測與評估
lda_pred = lda.predict(X_test)
print(f"\n測試集混淆矩陣:")
print(confusion_table(lda_pred, y_test))
print(f"測試準確率: {np.mean(lda_pred == y_test):.3f}")
# 後驗機率:檢查是否有高信心度的「下跌」預測
lda_prob = lda.predict_proba(X_test)
print(f"\n最大下跌機率: {lda_prob[:,0].max():.3f}")
print(f"下跌機率 > 0.9 的天數: {np.sum(lda_prob[:,0] > 0.9)}")
LDA 假設所有類別共用同一個共變異數矩陣——就像假設所有學生的鉛筆盒形狀都一樣。QDA 放寬這個假設:每個類別有自己的共變異數矩陣。代價是參數量暴增,但換來的是更靈活的非線性決策邊界。
import matplotlib
matplotlib.use('Agg')
import numpy as np
from ISLP import load_data
from ISLP import confusion_table
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
Smarket = load_data('Smarket')
train = Smarket['Year'] < 2005
X_train = Smarket.loc[train, ['Lag1','Lag2']]
y_train = Smarket.loc[train, 'Direction']
X_test = Smarket.loc[~train, ['Lag1','Lag2']]
y_test = Smarket.loc[~train, 'Direction']
# QDA 擬合
qda = QDA(store_covariance=True)
qda.fit(X_train, y_train)
qda_pred = qda.predict(X_test)
print("QDA 測試集混淆矩陣:")
print(confusion_table(qda_pred, y_test))
print(f"\nQDA 測試準確率: {np.mean(qda_pred == y_test):.3f}")
print(f"(對比 LDA: 訓練集 2005 年前資料,但 QDA 預測 2005 年有 ~60% 準確率)")
Naive Bayes 的「Naive」來自一個大膽的假設:給定類別後,所有特徵彼此獨立。這就像假設「今天 Lag1 的漲跌」和「昨天 Lag2 的漲跌」完全無關——在金融資料上顯然不成立。但神奇的是,即使假設錯了,Naive Bayes 常常還是能做出不錯的預測。
import matplotlib
matplotlib.use('Agg')
import numpy as np
from ISLP import load_data
from ISLP import confusion_table
from sklearn.naive_bayes import GaussianNB
Smarket = load_data('Smarket')
train = Smarket['Year'] < 2005
X_train = Smarket.loc[train, ['Lag1','Lag2']]
y_train = Smarket.loc[train, 'Direction']
X_test = Smarket.loc[~train, ['Lag1','Lag2']]
y_test = Smarket.loc[~train, 'Direction']
# Naive Bayes(高斯型)
nb = GaussianNB()
nb.fit(X_train, y_train)
# 檢查參數
print("各類別每個特徵的平均數 (theta_):")
print(f"Down: {nb.theta_[0]}")
print(f"Up: {nb.theta_[1]}")
print(f"\n各類別每個特徵的變異數 (var_):")
print(f"Down: {nb.var_[0]}")
print(f"Up: {nb.var_[1]}")
# 手動驗證平均數計算
print(f"\n手動計算 Down 類 Lag1 平均: {X_train[y_train=='Down']['Lag1'].mean():.4f}")
print(f"手動計算 Down 類 Lag1 變異: {X_train[y_train=='Down']['Lag1'].var(ddof=0):.4f}")
# 預測
nb_pred = nb.predict(X_test)
print(f"\nNaive Bayes 測試準確率: {np.mean(nb_pred == y_test):.3f}")
KNN 是最直覺的分類器——「近朱者赤,近墨者黑」。給定一個新資料點,找出訓練集中離它最近的 K 個鄰居,然後投票決定類別。KNN 沒有「訓練」階段(它只是把資料記下來),所有計算都發生在預測時。
import matplotlib
matplotlib.use('Agg')
import numpy as np
from ISLP import load_data
from ISLP import confusion_table
from sklearn.neighbors import KNeighborsClassifier
Smarket = load_data('Smarket')
train = Smarket['Year'] < 2005
X_train = Smarket.loc[train, ['Lag1','Lag2']]
y_train = Smarket.loc[train, 'Direction']
X_test = Smarket.loc[~train, ['Lag1','Lag2']]
y_test = Smarket.loc[~train, 'Direction']
# KNN with K=1
knn1 = KNeighborsClassifier(n_neighbors=1)
knn1.fit(X_train, y_train)
knn1_pred = knn1.predict(X_test)
print("K=1 測試集混淆矩陣:")
print(confusion_table(knn1_pred, y_test))
print(f"K=1 測試準確率: {np.mean(knn1_pred == y_test):.3f}")
# KNN with K=3
knn3 = KNeighborsClassifier(n_neighbors=3)
knn3.fit(X_train, y_train)
knn3_pred = knn3.predict(X_test)
print(f"\nK=3 測試準確率: {np.mean(knn3_pred == y_test):.3f}")
現在換一個完全不同風格的資料集:Caravan 保險資料。這是荷蘭一家保險公司的客戶資料,共 5,822 筆,85 個人口統計變數。目標是預測誰會買 caravan(露營拖車)保險——這是一個極度不平衡的問題:只有 6% 的人會買。
import matplotlib
matplotlib.use('Agg')
import numpy as np
import pandas as pd
from ISLP import load_data
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
Caravan = load_data('Caravan')
print(f"Caravan 維度: {Caravan.shape}")
# Purchase 是目標變數
print(f"\n購買比例: {Caravan['Purchase'].value_counts(normalize=True).round(4)}")
# 特徵與目標
X = Caravan.drop(columns=['Purchase'])
y = (Caravan['Purchase'] == 'Yes').astype(int)
# 標準化(KNN 的關鍵一步!)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(f"標準化後各特徵標準差範圍: {X_scaled.std(axis=0, ddof=0).min():.3f} ~ {X_scaled.std(axis=0, ddof=0).max():.3f}")
# 訓練/測試分割
X_train, X_test, y_train, y_test = train_test_split(
X_scaled, y, test_size=1000, random_state=42
)
# KNN with K=1
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(f"\nK=1 混淆矩陣:\n 預測No 預測Yes")
print(f"實際No {cm[0,0]} {cm[0,1]}")
print(f"實際Yes {cm[1,0]} {cm[1,1]}")
# 關鍵指標:在預測為 Yes 的人中,有多少真的買了?
precision_yes = cm[1,1] / (cm[0,1] + cm[1,1])
print(f"\n預測為「會買」的人中,真正買了的比例: {precision_yes:.1%}")
print(f"(隨機猜測基準: {y_test.mean():.1%})")
import matplotlib
matplotlib.use('Agg')
import numpy as np
import matplotlib.pyplot as plt
from ISLP import load_data
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
Caravan = load_data('Caravan')
X = StandardScaler().fit_transform(Caravan.drop(columns=['Purchase']))
y = (Caravan['Purchase'] == 'Yes').astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1000, random_state=42)
# 嘗試不同 K 值
Ks = [1, 3, 5, 7, 10, 15, 20]
results = []
for k in Ks:
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
acc = np.mean(y_pred == y_test)
cm = np.zeros((2,2), dtype=int)
for i in range(len(y_test)):
cm[y_test.iloc[i], y_pred[i]] += 1
prec = cm[1,1] / (cm[0,1] + cm[1,1]) if (cm[0,1] + cm[1,1]) > 0 else 0
results.append({'K': k, 'Accuracy': acc, 'Precision(Yes)': prec})
print("K 值比較表:")
print(f"{'K':>5} {'整體準確率':>10} {'Yes精確度':>10}")
for r in results:
print(f"{r['K']:>5} {r['Accuracy']:>10.3f} {r['Precision(Yes)']:>10.3f}")
# 視覺化
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
ax1.plot(Ks, [r['Accuracy'] for r in results], 'o-', color='#58a6ff')
ax1.set_xlabel('K'); ax1.set_ylabel('Accuracy'); ax1.set_title('整體準確率 vs K')
ax2.plot(Ks, [r['Precision(Yes)'] for r in results], 'o-', color='#3fb950')
ax2.set_xlabel('K'); ax2.set_ylabel('Precision (Yes)'); ax2.set_title('Yes 精確度 vs K')
fig.tight_layout()
plt.show()
最後一個 Lab 實例是 Bikeshare 資料集,記錄了華盛頓特區共享單車每小時的租借次數。這裡的反應變數是「次數」(非負整數),不適合用線性迴歸(可能預測出負的租借量),但非常適合用 Poisson Regression——第 4.6 節介紹的廣義線性模型。
import matplotlib
matplotlib.use('Agg')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ISLP import load_data
from ISLP.models import summarize, ModelSpec as MS
import statsmodels.api as sm
Bike = load_data('Bikeshare')
print(f"Bikeshare 維度: {Bike.shape}")
print(f"欄位: {list(Bike.columns)}")
# === 線性迴歸(基線比較) ===
X = MS(['mnth', 'hr', 'workingday', 'temp', 'weathersit']).fit_transform(Bike)
Y = Bike['bikers']
M_lm = sm.OLS(Y, X).fit()
print("\n=== 線性迴歸 ===")
print(summarize(M_lm))
# === Poisson Regression ===
M_pois = sm.GLM(Y, X, family=sm.families.Poisson()).fit()
print("\n=== Poisson Regression ===")
print(summarize(M_pois))
# === 視覺化:月份效應 ===
# 從更豐富的模型提取係數
X2 = MS(['mnth', 'hr', 'workingday', 'temp', 'weathersit']).fit_transform(Bike)
M2_pois = sm.GLM(Y, X2, family=sm.families.Poisson()).fit()
S2 = summarize(M2_pois)
# 提取並繪製月份係數
coef_month = S2[S2.index.str.contains('mnth')]['coef']
# 補上 December(基準類別,係數 = 負的其他月份總和)
months = Bike['mnth'].dtype.categories
coef_month = pd.concat([coef_month, pd.Series([-coef_month.sum()], index=['mnth[Dec]'])])
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(range(12), coef_month.values, color='#58a6ff', alpha=0.8)
ax.set_xticks(range(12))
ax.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax.set_xlabel('Month'); ax.set_ylabel('Coefficient (log scale)')
ax.set_title('Bikeshare: 月份對租借量的影響(Poisson)')
ax.axhline(y=0, color='#f85149', linestyle='--', linewidth=0.8)
plt.show()
電商平台需要預測每小時的訂單量來調度外送員。Poisson Regression 天然適合這個場景:反應變數是計數型(訂單數)、非負整數、變異數隨期望值增大而增大。搭配天氣、時段、促銷活動等預測變數,可以比線性迴歸更準確地預測尖峰需求。
| 方法 | 決策邊界 | 參數量 | 機率輸出 | 適用場景 |
|---|---|---|---|---|
| Logistic Regression | 線性 | p+1 | ✅ 天然 | 需要係數解釋(p-value)時首選 |
| LDA | 線性 | K·p + p(p+1)/2 | ✅ 後驗機率 | 各類別近似常態且共變異數相同 |
| QDA | 二次曲線 | K·(p + p(p+1)/2) | ✅ 後驗機率 | 各類別共變異數不同、樣本夠大 |
| Naive Bayes | 非線性 | K·2p(高斯型) | ✅ 後驗機率 | 高維度資料(p ≫ n)、文字分類 |
| KNN (K=1) | 極度不規則 | 0(無參數) | ❌ 需手算 | 決策邊界高度非線性、樣本量中等 |
subagent-performance-management skill 的模型汰換機制。