利用简单的线性模型,训练kaggle 房屋数据集:
import os
import random
import tarfile
import time
import zipfile
import pandas as pd
import requests
import torch
from torch import nn
from torch.utils import data
from matplotlib import pyplot as plt
from matplotlib_inline import backend_inlinedef plot(X, Y=None, xlabel=None, ylabel=None, legend=None, xlim=None,ylim=None, xscale='linear', yscale='linear',fmts=('-', 'm--', 'g-.', 'r:'), figsize=(3.5, 2.5), axes=None):if legend is None:legend = []set_figsize(figsize)axes = axes if axes else plt.gca()# 如果X有一个轴,输出Truedef has_one_axis(X):return (hasattr(X, "ndim") and X.ndim == 1 or isinstance(X, list)and not hasattr(X[0], "__len__"))if has_one_axis(X):X = [X]if Y is None:X, Y = [[]] * len(X), Xelif has_one_axis(Y):Y = [Y]if len(X) != len(Y):X = X * len(Y)axes.cla()for x, y, fmt in zip(X, Y, fmts):if len(x):axes.plot(x, y, fmt)else:axes.plot(y, fmt)set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend)def use_svg_display():backend_inline.set_matplotlib_formats('svg')def set_figsize(figsize=(3.5, 2.5)):use_svg_display()plt.rcParams['figure.figsize'] = figsizedef set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):axes.set_xlabel(xlabel)axes.set_ylabel(ylabel)axes.set_xscale(xscale)axes.set_yscale(yscale)axes.set_xlim(xlim)axes.set_ylim(ylim)if legend:axes.legend(legend)axes.grid()def download(name, cache_dir=os.path.join('..', 'data')):assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"url, sha1_hash = DATA_HUB[name]os.makedirs(cache_dir, exist_ok=True)fname = os.path.join(cache_dir, url.split('/')[-1])if os.path.exists(fname):sha1 = hashlib.sha1()with open(fname, 'rb') as f:while True:data = f.read(1048576)if not data:breaksha1.update(data)if sha1.hexdigest() == sha1_hash:return fname # 命中缓存print(f'正在从{url}下载{fname}...')r = requests.get(url, stream=True, verify=True)with open(fname, 'wb') as f:f.write(r.content)return fnamedef download_extract(name, folder=None):fname = download(name)base_dir = os.path.dirname(fname)data_dir, ext = os.path.splitext(fname)if ext == '.zip':fp = zipfile.ZipFile(fname, 'r')elif ext in ('.tar', '.gz'):fp = tarfile.open(fname, 'r')else:assert False, '只有zip/tar文件可以被解压缩'fp.extractall(base_dir)return os.path.join(base_dir, folder) if folder else data_dirdef download_all():for name in DATA_HUB:download(name)DATA_HUB = dict()
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'DATA_HUB['kaggle_house_train'] = (DATA_URL + 'kaggle_house_pred_train.csv','585e9cc93e70b39160e7921475f9bcd7d31219ce')DATA_HUB['kaggle_house_test'] = (DATA_URL + 'kaggle_house_pred_test.csv','fa19780a7b011d9b009e8bff8e99922a8ee2eb90')train_data = pd.read_csv(download('kaggle_house_train'))
test_data = pd.read_csv(download('kaggle_house_test'))all_features = pd.concat((train_data.iloc[:,1:-1], test_data.iloc[:,1:])) #测试数据集没有标签numeric_features_index_name = all_features.dtypes[all_features.dtypes != 'object'].indexall_features[numeric_features_index_name] = all_features[numeric_features_index_name].apply(lambda x : (x-x.mean())/(x.std()))all_features[numeric_features_index_name] = all_features[numeric_features_index_name].fillna(0)all_features = pd.get_dummies(all_features, dummy_na=True)
bool_columns = all_features.select_dtypes(include=['bool']).columns
all_features[bool_columns] = all_features[bool_columns].astype(int)sample_count_train = train_data.shape[0]train_features = torch.tensor(all_features[:sample_count_train].values, dtype=torch.float32)
test_features = torch.tensor(all_features[sample_count_train:].values, dtype=torch.float32)
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)loss = nn.MSELoss()
in_features = train_features.shape[1]def get_net():net = nn.Sequential(nn.Linear(in_features,1))return netdef log_rmse(net, features, labels):clipped_preds = torch.clamp(net(features), 1, float('inf'))rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))return rmse.item()def load_array(data_arrays, batch_size, is_train=True):dataset = data.TensorDataset(*data_arrays)return data.DataLoader(dataset, batch_size, shuffle=is_train)def train(net, train_features, train_labels, test_features, test_labels,num_epochs, learning_rate, weight_decay, batch_size):train_ls, test_ls = [], []train_iter = load_array((train_features, train_labels), batch_size)optimizer = torch.optim.Adam(net.parameters(), lr = learning_rate, weight_decay = weight_decay)for epoch in range(num_epochs):for X, y in train_iter:optimizer.zero_grad()l = loss(net(X), y)l.backward()optimizer.step()train_ls.append(log_rmse(net, train_features, train_labels))if test_labels is not None:test_ls.append(log_rmse(net, test_features, test_labels))return train_ls, test_lsdef get_k_fold_data(k, i, X, y):assert k > 1fold_size = X.shape[0] // kX_train, y_train = None, Nonefor j in range(k):idx = slice(j * fold_size, (j + 1) * fold_size)X_part, y_part = X[idx, :], y[idx]if j == i:X_valid, y_valid = X_part, y_partelif X_train is None:X_train, y_train = X_part, y_partelse:X_train = torch.cat([X_train, X_part], 0)y_train = torch.cat([y_train, y_part], 0)return X_train, y_train, X_valid, y_validdef k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):train_l_sum, valid_l_sum = 0, 0for i in range(k):data = get_k_fold_data(k, i, X_train, y_train)# 检查训练集是否为空if data[0].shape[0] == 0:raise ValueError(f"折 {i+1} 的训练集为空!")net = get_net()train_ls, valid_ls = train(net, *data, num_epochs, learning_rate, weight_decay, batch_size)train_l_sum += train_ls[-1]valid_l_sum += valid_ls[-1]if i == 0:plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls], xlabel='epoch', ylabel='rmse',xlim=[1, num_epochs], legend=['train','valid'], yscale='log')print(f'折{i + 1},训练log rmse{float(train_ls[-1]):f}, 'f'验证log rmse{float(valid_ls[-1]):f}')return train_l_sum / k, valid_l_sum / kk, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
print(f'{k}-折验证:平均训练 log rmse: {float(train_l):f}, 平均验证log rmse: {float(valid_l):f}')
结果如下:
折1,训练log rmse0.170301, 验证log rmse0.156731
折2,训练log rmse0.162383, 验证log rmse0.191084
折3,训练log rmse0.164671, 验证log rmse0.168909
折4,训练log rmse0.167769, 验证log rmse0.154724
折5,训练log rmse0.162871, 验证log rmse0.182890
5-折验证:平均训练 log rmse: 0.165599, 平均验证log rmse: 0.17086
可以看到,随着训练,损失越来越小。