Appearance
机器学习基础面试题
1. 机器学习概述
问题:什么是机器学习?机器学习有哪些主要类型?
答案:
机器学习是人工智能的一个分支,通过算法让计算机从数据中学习规律,而无需显式编程。
主要类型:
机器学习
├── 监督学习(Supervised Learning)
│ ├── 分类(Classification)
│ └── 回归(Regression)
├── 无监督学习(Unsupervised Learning)
│ ├── 聚类(Clustering)
│ └── 降维(Dimensionality Reduction)
├── 半监督学习(Semi-supervised Learning)
└── 强化学习(Reinforcement Learning)
监督学习:使用带标签的数据训练模型
- 分类:预测离散类别(如垃圾邮件检测)
- 回归:预测连续值(如房价预测)
无监督学习:使用无标签数据发现隐藏模式
- 聚类:将数据分组(如客户分群)
- 降维:减少数据维度(如 PCA)
强化学习:通过与环境交互学习最优策略
2. 数据预处理
问题:机器学习中的数据预处理有哪些步骤?
答案:
python
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
# 1. 处理缺失值
# 删除缺失值
df.dropna(inplace=True)
# 填充缺失值
df['age'].fillna(df['age'].mean(), inplace=True) # 均值填充
df['category'].fillna(df['category'].mode()[0], inplace=True) # 众数填充
# 2. 处理异常值
def remove_outliers(df, column, threshold=3):
    """Remove rows whose Z-score in `column` is >= `threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; not modified in place.
    column : str
        Name of the numeric column to screen for outliers.
    threshold : float, default 3
        Rows with |z| < threshold are kept.

    Returns
    -------
    pandas.DataFrame
        The subset of `df` whose `column` values lie within the threshold.
    """
    std = df[column].std()
    # Bug fix: a constant column has std == 0 (and a single-row column has
    # std == NaN with the default ddof=1), which made every z-score NaN, so
    # the comparison below dropped ALL rows. A constant column has no
    # outliers — return the frame unchanged instead.
    if not std or np.isnan(std):
        return df
    z_scores = np.abs((df[column] - df[column].mean()) / std)
    return df[z_scores < threshold]
# 使用 IQR
def remove_outliers_iqr(df, column):
    """Drop rows whose `column` value falls outside the 1.5*IQR fences.

    Keeps rows with values in [Q1 - 1.5*IQR, Q3 + 1.5*IQR], inclusive,
    where Q1/Q3 are the 25th/75th percentiles of `column`.
    """
    q_low, q_high = df[column].quantile([0.25, 0.75])
    spread = q_high - q_low
    lower = q_low - 1.5 * spread
    upper = q_high + 1.5 * spread
    # Series.between is inclusive on both ends, matching >= / <= fences.
    return df[df[column].between(lower, upper)]
# 3. 特征缩放
# 标准化(Z-score)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# 归一化(Min-Max)
normalizer = MinMaxScaler()
X_normalized = normalizer.fit_transform(X)
# 4. 编码分类变量
# 标签编码(有序类别)
le = LabelEncoder()
df['category_encoded'] = le.fit_transform(df['category'])
# One-Hot 编码(无序类别)
df_encoded = pd.get_dummies(df, columns=['category'])
# 5. 特征工程
# 创建新特征
df['age_group'] = pd.cut(df['age'], bins=[0, 18, 35, 50, 100],
labels=['少年', '青年', '中年', '老年'])
# 多项式特征
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
# 6. 数据集划分
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
3. 模型评估指标
问题:分类和回归模型常用的评估指标有哪些?
答案:
python
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report,
mean_squared_error, mean_absolute_error, r2_score,
roc_auc_score, roc_curve
)
import numpy as np
# 1. 分类指标
y_true = [1, 0, 1, 1, 0, 1, 0, 0]
y_pred = [1, 0, 1, 0, 0, 1, 1, 0]
# 准确率
accuracy = accuracy_score(y_true, y_pred)
# 精确率
precision = precision_score(y_true, y_pred)
# 召回率
recall = recall_score(y_true, y_pred)
# F1 分数
f1 = f1_score(y_true, y_pred)
# 混淆矩阵
cm = confusion_matrix(y_true, y_pred)
print(cm)
# [[TN, FP]
# [FN, TP]]
# 综合报告
print(classification_report(y_true, y_pred))
# ROC-AUC
y_scores = [0.9, 0.1, 0.8, 0.3, 0.2, 0.7, 0.6, 0.1]
auc = roc_auc_score(y_true, y_scores)
# 2. 回归指标
y_true_reg = [3, -0.5, 2, 7]
y_pred_reg = [2.5, 0.0, 2, 8]
# MSE
mse = mean_squared_error(y_true_reg, y_pred_reg)
# RMSE
rmse = np.sqrt(mse)
# MAE
mae = mean_absolute_error(y_true_reg, y_pred_reg)
# R² 分数
r2 = r2_score(y_true_reg, y_pred_reg)
# 3. 交叉验证
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
# K 折交叉验证
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"Cross-validation scores: {scores}")
print(f"Mean CV score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
# 分层 K 折(保持类别比例)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in skf.split(X, y):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
4. 线性回归
问题:什么是线性回归?如何实现?
答案:
python
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
# 1. 简单线性回归
# 生成数据
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)
# 训练模型
lin_reg = LinearRegression()
lin_reg.fit(X, y)
# 预测
X_new = np.array([[0], [2]])
y_pred = lin_reg.predict(X_new)
# 系数
print(f"截距: {lin_reg.intercept_[0]:.2f}")
print(f"斜率: {lin_reg.coef_[0][0]:.2f}")
# 2. 多元线性回归
X_multi = np.random.rand(100, 3)
y_multi = 2 + 3*X_multi[:, 0] + 5*X_multi[:, 1] - 2*X_multi[:, 2] + np.random.randn(100)
lin_reg_multi = LinearRegression()
lin_reg_multi.fit(X_multi, y_multi)
# 3. 多项式回归
poly_reg = Pipeline([
('poly', PolynomialFeatures(degree=2)),
('lin_reg', LinearRegression())
])
poly_reg.fit(X, y)
# 4. 正则化
# Ridge 回归(L2 正则化)
ridge = Ridge(alpha=1.0)
ridge.fit(X, y)
# Lasso 回归(L1 正则化)
lasso = Lasso(alpha=0.1)
lasso.fit(X, y)
# 5. 梯度下降实现
class GradientDescent:
    """Linear regression trained with batch gradient descent on the MSE loss."""
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        # Step size applied to each parameter update.
        self.lr = learning_rate
        # Number of full-batch gradient steps to run in fit().
        self.n_iter = n_iterations
        # Learned parameters; populated by fit().
        self.weights = None
        self.bias = None
    def fit(self, X, y):
        # X: 2-D array (n_samples, n_features).
        # NOTE(review): assumes y has shape (n_samples,); a column vector
        # (n_samples, 1) would broadcast (y_pred - y) into a matrix and
        # corrupt the gradients — confirm at the call site.
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        for _ in range(self.n_iter):
            # Predictions under the current linear model.
            y_pred = np.dot(X, self.weights) + self.bias
            # Gradients of the mean-squared-error loss w.r.t. weights/bias.
            dw = (1 / n_samples) * np.dot(X.T, (y_pred - y))
            db = (1 / n_samples) * np.sum(y_pred - y)
            # Gradient-descent update.
            self.weights -= self.lr * dw
            self.bias -= self.lr * db
def predict(self, X):
return np.dot(X, self.weights) + self.bias
5. 逻辑回归
问题:什么是逻辑回归?与线性回归有什么区别?
答案:
python
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
import numpy as np
# 1. 生成数据
X, y = make_classification(n_samples=1000, n_features=2, n_redundant=0,
n_informative=2, random_state=42)
# 2. 训练逻辑回归模型
log_reg = LogisticRegression()
log_reg.fit(X, y)
# 3. 预测
y_pred = log_reg.predict(X)
y_prob = log_reg.predict_proba(X)[:, 1] # 预测概率
# 4. 系数解释
print(f"系数: {log_reg.coef_}")
print(f"截距: {log_reg.intercept_}")
# 5. 手动实现 Sigmoid 函数
def sigmoid(z):
return 1 / (1 + np.exp(-z))
# 6. 逻辑回归原理
"""
线性回归: y = w·x + b
逻辑回归: p = sigmoid(w·x + b) = 1 / (1 + exp(-(w·x + b)))
区别:
1. 线性回归用于回归问题(连续值)
2. 逻辑回归用于分类问题(概率值)
3. 逻辑回归使用 Sigmoid 函数将输出映射到 (0, 1)
4. 使用对数似然损失而非 MSE
"""
# 7. 多分类逻辑回归
# One-vs-Rest (OvR)
log_reg_ovr = LogisticRegression(multi_class='ovr')
log_reg_ovr.fit(X_multi, y_multi)
# Softmax (Multinomial)
log_reg_softmax = LogisticRegression(multi_class='multinomial', solver='lbfgs')
log_reg_softmax.fit(X_multi, y_multi)
6. 决策树
问题:决策树是如何工作的?如何防止过拟合?
答案:
python
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_text, plot_tree
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
# 1. 加载数据
iris = load_iris()
X, y = iris.data, iris.target
# 2. 训练决策树
# 分类树
dt_clf = DecisionTreeClassifier(
max_depth=3, # 最大深度
min_samples_split=2, # 分裂所需最小样本数
min_samples_leaf=1, # 叶节点最小样本数
criterion='gini' # 分裂标准: 'gini' 或 'entropy'
)
dt_clf.fit(X, y)
# 回归树
dt_reg = DecisionTreeRegressor(max_depth=3)
dt_reg.fit(X, y)
# 3. 可视化
tree_rules = export_text(dt_clf, feature_names=iris.feature_names)
print(tree_rules)
# 绘制树
plt.figure(figsize=(20, 10))
plot_tree(dt_clf, feature_names=iris.feature_names,
class_names=iris.target_names, filled=True)
plt.show()
# 4. 特征重要性
importance = dt_clf.feature_importances_
for name, imp in zip(iris.feature_names, importance):
print(f"{name}: {imp:.3f}")
# 5. 防止过拟合的方法
"""
1. 限制树的深度 (max_depth)
2. 限制叶节点最小样本数 (min_samples_leaf)
3. 限制分裂所需最小样本数 (min_samples_split)
4. 限制叶节点最大数量 (max_leaf_nodes)
5. 剪枝
6. 使用随机森林或梯度提升等集成方法
"""
# 6. 分裂标准
"""
Gini 不纯度: 1 - Σ(p_i²)
- 衡量随机选择样本被错误分类的概率
- 计算更快,适合大数据集
信息熵: -Σ(p_i * log(p_i))
- 衡量系统的不确定性
- 计算较慢,但可能产生更平衡的树
"""7. 集成学习
问题:什么是集成学习?Bagging 和 Boosting 有什么区别?
答案:
python
from sklearn.ensemble import (
RandomForestClassifier, RandomForestRegressor,
GradientBoostingClassifier, GradientBoostingRegressor,
AdaBoostClassifier, VotingClassifier
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# 1. 随机森林(Bagging)
rf_clf = RandomForestClassifier(
n_estimators=100, # 树的数量
max_depth=10,
min_samples_split=2,
min_samples_leaf=1,
max_features='sqrt', # 每次分裂考虑的特征数
bootstrap=True, # 是否自助采样
oob_score=True, # 是否使用袋外样本评估
random_state=42
)
rf_clf.fit(X_train, y_train)
# 袋外分数
print(f"OOB Score: {rf_clf.oob_score_:.3f}")
# 特征重要性
importances = rf_clf.feature_importances_
# 2. AdaBoost(Boosting)
ada_clf = AdaBoostClassifier(
estimator=DecisionTreeClassifier(max_depth=1),
n_estimators=200,
learning_rate=0.5,
random_state=42
)
ada_clf.fit(X_train, y_train)
# 3. 梯度提升(Gradient Boosting)
gb_clf = GradientBoostingClassifier(
n_estimators=100,
learning_rate=0.1,
max_depth=3,
random_state=42
)
gb_clf.fit(X_train, y_train)
# 4. XGBoost(需要安装 xgboost)
"""
import xgboost as xgb
xgb_clf = xgb.XGBClassifier(
n_estimators=100,
max_depth=3,
learning_rate=0.1,
random_state=42
)
xgb_clf.fit(X_train, y_train)
"""
# 5. 投票分类器
voting_clf = VotingClassifier(
estimators=[
('rf', RandomForestClassifier(n_estimators=100)),
('svc', SVC(probability=True)),
('lr', LogisticRegression())
],
voting='soft' # 'hard' 或 'soft'
)
voting_clf.fit(X_train, y_train)
# 6. Bagging vs Boosting
"""
Bagging(如随机森林):
- 并行训练多个基学习器
- 每个学习器使用不同的数据子集
- 降低方差,减少过拟合
- 基学习器通常较强
Boosting(如 AdaBoost、Gradient Boosting):
- 串行训练基学习器
- 每个新学习器关注之前分错的样本
- 降低偏差,提高准确率
- 基学习器通常较弱
- 容易过拟合,需要仔细调参
"""8. 支持向量机
问题:什么是 SVM?核函数有什么作用?
答案:
python
from sklearn.svm import SVC, SVR
from sklearn.datasets import make_classification, make_circles
import numpy as np
# 1. 线性 SVM
X, y = make_classification(n_samples=1000, n_features=2, n_redundant=0,
n_informative=2, n_clusters_per_class=1, random_state=42)
svm_linear = SVC(kernel='linear', C=1.0)
svm_linear.fit(X, y)
# 2. 非线性 SVM(RBF 核)
svm_rbf = SVC(kernel='rbf', C=1.0, gamma='scale')
svm_rbf.fit(X, y)
# 3. 多项式核
svm_poly = SVC(kernel='poly', degree=3, C=1.0)
svm_poly.fit(X, y)
# 4. 处理非线性可分数据
X_circle, y_circle = make_circles(n_samples=100, noise=0.1, factor=0.3, random_state=42)
svm_circle = SVC(kernel='rbf', gamma='auto')
svm_circle.fit(X_circle, y_circle)
# 5. 参数说明
"""
C: 正则化参数
- C 越大,对误分类惩罚越大,模型越复杂,可能过拟合
- C 越小,允许更多误分类,模型更简单,可能欠拟合
gamma: 核函数系数
- gamma 越大,单个样本影响范围越小,模型越复杂
- gamma 越小,单个样本影响范围越大,模型越简单
kernel: 核函数类型
- 'linear': 线性核
- 'rbf': 径向基函数(高斯核)
- 'poly': 多项式核
- 'sigmoid': Sigmoid 核
"""
# 6. 核函数的作用
"""
核函数将数据映射到高维空间,使得在低维空间线性不可分的数据,
在高维空间变得线性可分。
数学上,核函数 K(x, y) = φ(x)·φ(y),避免了直接计算高维映射 φ。
常用核函数:
1. 线性核: K(x, y) = x·y
2. 多项式核: K(x, y) = (γx·y + r)^d
3. RBF 核: K(x, y) = exp(-γ||x-y||²)
4. Sigmoid 核: K(x, y) = tanh(γx·y + r)
"""
# 7. 支持向量
support_vectors = svm_rbf.support_vectors_
support_indices = svm_rbf.support_
n_support = svm_rbf.n_support_
9. 聚类算法
问题:常见的聚类算法有哪些?K-Means 是如何工作的?
答案:
python
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler
import numpy as np
# 1. K-Means
# 生成数据
np.random.seed(42)
X = np.random.rand(300, 2)
# 标准化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# K-Means 聚类
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
labels = kmeans.fit_predict(X_scaled)
# 聚类中心
centroids = kmeans.cluster_centers_
# 评估
silhouette = silhouette_score(X_scaled, labels)
print(f"Silhouette Score: {silhouette:.3f}")
# 2. 选择 K 值
inertias = []
silhouettes = []
K_range = range(2, 10)
for k in K_range:
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
labels = kmeans.fit_predict(X_scaled)
inertias.append(kmeans.inertia_)
silhouettes.append(silhouette_score(X_scaled, labels))
# 肘部法则选择 K
# 绘制 inertias 随 K 变化的曲线,寻找"肘部"
# 3. DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5)
labels_dbscan = dbscan.fit_predict(X_scaled)
# 噪声点数量
n_noise = list(labels_dbscan).count(-1)
# 4. 层次聚类
agg_clustering = AgglomerativeClustering(n_clusters=3)
labels_agg = agg_clustering.fit_predict(X_scaled)
# 5. K-Means 算法步骤
"""
1. 随机选择 K 个点作为初始聚类中心
2. 将每个点分配到最近的聚类中心
3. 重新计算每个簇的中心(均值)
4. 重复步骤 2-3,直到收敛(中心不再变化或变化很小)
优点:
- 简单高效
- 可扩展性好
缺点:
- 需要预先指定 K
- 对初始值敏感
- 对异常值敏感
- 假设簇是球形且大小相近
"""
# 6. K-Means++ 初始化
kmeans_pp = KMeans(n_clusters=3, init='k-means++', random_state=42, n_init=10)
kmeans_pp.fit(X_scaled)
10. 降维
问题:什么是 PCA?有什么作用?
答案:
python
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import numpy as np
# 1. 加载数据
iris = load_iris()
X, y = iris.data, iris.target
# 2. PCA 降维
# 降到 2 维
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
# 可视化
plt.figure(figsize=(8, 6))
for i, target_name in enumerate(iris.target_names):
plt.scatter(X_pca[y == i, 0], X_pca[y == i, 1], label=target_name)
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.legend()
plt.show()
# 3. PCA 属性
print(f"主成分: \n{pca.components_}")
print(f"解释方差比: {pca.explained_variance_ratio_}")
print(f"累计解释方差比: {np.cumsum(pca.explained_variance_ratio_)}")
# 4. 选择主成分数量
pca_full = PCA()
pca_full.fit(X)
# 找到解释 95% 方差所需的主成分数
cumsum = np.cumsum(pca_full.explained_variance_ratio_)
K = np.argmax(cumsum >= 0.95) + 1
print(f"需要 {K} 个主成分解释 95% 方差")
# 5. PCA 原理
"""
PCA(主成分分析)通过线性变换将数据投影到新的坐标系,
使得投影后的数据方差最大。
步骤:
1. 标准化数据(均值为0,方差为1)
2. 计算协方差矩阵
3. 计算协方差矩阵的特征值和特征向量
4. 选择前 K 个最大特征值对应的特征向量
5. 将数据投影到选定的特征向量上
作用:
1. 降维:减少数据维度,降低计算复杂度
2. 去噪:去除数据中的噪声
3. 可视化:将高维数据降到 2D/3D 进行可视化
4. 去除相关性:主成分之间不相关
"""
# 6. 其他降维方法
"""
- t-SNE: 非线性降维,适合可视化
- UMAP: 非线性降维,保留局部和全局结构
- LDA: 线性判别分析,有监督降维
- Autoencoder: 神经网络降维
"""
# t-SNE 示例
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)
plt.figure(figsize=(8, 6))
for i, target_name in enumerate(iris.target_names):
plt.scatter(X_tsne[y == i, 0], X_tsne[y == i, 1], label=target_name)
plt.legend()
plt.show()
11. 模型调优
问题:如何进行超参数调优?
答案:
python
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint, uniform
# 1. 网格搜索
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [3, 5, 7, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(
RandomForestClassifier(random_state=42),
param_grid,
cv=5,
scoring='accuracy',
n_jobs=-1,
verbose=1
)
grid_search.fit(X_train, y_train)
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳分数: {grid_search.best_score_:.3f}")
best_model = grid_search.best_estimator_
# 2. 随机搜索
param_distributions = {
'n_estimators': randint(50, 500),
'max_depth': [3, 5, 7, 10, None],
'min_samples_split': randint(2, 20),
'min_samples_leaf': randint(1, 10),
'max_features': ['sqrt', 'log2', None]
}
random_search = RandomizedSearchCV(
RandomForestClassifier(random_state=42),
param_distributions,
n_iter=100,
cv=5,
scoring='accuracy',
n_jobs=-1,
random_state=42,
verbose=1
)
random_search.fit(X_train, y_train)
# 3. 贝叶斯优化(使用 optuna 或 hyperopt)
"""
import optuna
def objective(trial):
n_estimators = trial.suggest_int('n_estimators', 50, 500)
max_depth = trial.suggest_int('max_depth', 3, 10)
min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
model = RandomForestClassifier(
n_estimators=n_estimators,
max_depth=max_depth,
min_samples_split=min_samples_split,
random_state=42
)
score = cross_val_score(model, X_train, y_train, cv=5).mean()
return score
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print(f"最佳参数: {study.best_params}")
print(f"最佳分数: {study.best_value:.3f}")
"""
# 4. 学习曲线
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
train_sizes, train_scores, test_scores = learning_curve(
best_model, X_train, y_train, cv=5,
train_sizes=np.linspace(0.1, 1.0, 10),
scoring='accuracy'
)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes, train_mean, 'o-', color='blue', label='Training')
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')
plt.plot(train_sizes, test_mean, 'o-', color='green', label='Cross-validation')
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color='green')
plt.xlabel('Training Examples')
plt.ylabel('Accuracy')
plt.legend()
plt.show()