  1. 使用机器学习的目的是什么,需要完成哪些任务?
  2. 需要分析的数据是什么?


  1. 考虑算法的目的
    1. 若需要做目标值的预测可以选择监督学习算法,否则考虑无监督学习算法
    2. 确定为监督学习算法前提下,若目标值数据类型是离散型,可以考虑分类算法,否则考虑回归算法


  1. 考虑数据类型
    1. 需要充分理解数据的含义
    2. 充分考虑缺失值处理问题






对于图1的单变量线性回归,房子的价格与房子的大小有关,所以通过模型学习可以得出:y = w*x1 + bias 的公式,其中 w 表示权重、x1 表示房子大小、bias 表示偏置(即截距)。



同理,对于图2的多变量线性回归,可以得出:y = w1*x1 + w2*x2 + bias 的公式,其他以此类推。





# 导包
from sklearn.linear_model import LinearRegression

# 参数

def __init__(self, fit_intercept=True, normalize=False, copy_X=True, n_jobs=None):

# fit_intercept=True:是否使用截距,即上述公式中的bias

# normalize=False:
This parameter is ignored when ``fit_intercept`` is set to False.
    If True, the regressors X will be normalized before regression by
    subtracting the mean and dividing by the l2-norm.
    If you wish to standardize, please use
    :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on
    an estimator with ``normalize=False``.
n_jobs : int or None, optional (default=None)
    The number of jobs to use for the computation. This will only provide
    speedup for n_targets > 1 and sufficient large problems.
    ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
    ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
    for more details.

# 属性
coef_ : array of shape (n_features, ) or (n_targets, n_features)
    Estimated coefficients for the linear regression problem.
    If multiple targets are passed during the fit (y 2D), this
    is a 2D array of shape (n_targets, n_features), while if only
    one target is passed, this is a 1D array of length n_features.

# 方法
def fit(self, X, y, sample_weight=None):
# X : {array-like, sparse matrix} of shape (n_samples, n_features)Training data

# y:array-like of shape (n_samples,) or (n_samples, n_targets)Target values. Will be cast to X's dtype if necessary

# sample_weight : array-like of shape (n_samples,), default=None Individual weights for each sample

# Returns
    self : returns an instance of self.

# def predict(self, X):
# X : array_like or sparse matrix, shape (n_samples, n_features) Samples.

# Returns
    C : array, shape (n_samples,)
        Returns predicted values.




from sklearn.model_selection import train_test_split

def train_test_split(*arrays, **options):

# 参数
*arrays : sequence of indexables with same length / shape[0]
    Allowed inputs are lists, numpy arrays, scipy-sparse
    matrices or pandas dataframes.

test_size : float, int or None, optional (default=None)
    If float, should be between 0.0 and 1.0 and represent the proportion
    of the dataset to include in the test split. If int, represents the
    absolute number of test samples. If None, the value is set to the
    complement of the train size. If ``train_size`` is also None, it will
    be set to 0.25.
	规定测试集的大小;当 test_size 与 train_size 均为 None 时,默认划分 0.25

train_size : float, int, or None, (default=None)
    If float, should be between 0.0 and 1.0 and represent the
    proportion of the dataset to include in the train split. If
    int, represents the absolute number of train samples. If None,
    the value is automatically set to the complement of the test size.
	规定训练集的大小;为 None 时自动取测试集大小的补集(测试集为默认的 0.25 时,训练集即为 0.75)

random_state : int, RandomState instance or None, optional (default=None)
    If int, random_state is the seed used by the random number generator;
    If RandomState instance, random_state is the random number generator;
    If None, the random number generator is the RandomState instance used
    by `np.random`.

shuffle : boolean, optional (default=True)
    Whether or not to shuffle the data before splitting. If shuffle=False
    then stratify must be None.

# 使用方法

# x_train训练集的特征 x_test测试集的特征 y_train训练集的目标值 y_test测试集的目标值
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.25)
from sklearn.metrics import explained_variance_score

score = explained_variance_score(y_true, y_pred)
	# y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth (correct) target values.
	# y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.

	# sample_weight : array-like of shape (n_samples,), optional
        Sample weights.

	# multioutput : string in ['raw_values', 'uniform_average', \
                'variance_weighted'] or array-like of shape (n_outputs)
        Defines aggregating of multiple output scores.
        Array-like value defines weights used to average scores.

        'raw_values' :
            Returns a full set of scores in case of multioutput input.

        'uniform_average' :
            Scores of all outputs are averaged with uniform weight.

        'variance_weighted' :
            Scores of all outputs are averaged, weighted by the variances
            of each individual output.

	# Returns
    score : float or ndarray of floats
        The explained variance or ndarray if 'multioutput' is 'raw_values'.
from sklearn.metrics import mean_squared_error

def mean_squared_error(y_true, y_pred, sample_weight=None,
                   multioutput='uniform_average', squared=True):

	# y_true,y_pred,sample_weight同上
	# multioutput : string in ['raw_values', 'uniform_average']
        or array-like of shape (n_outputs)
        Defines aggregating of multiple output values.
        Array-like value defines weights used to average errors.

        'raw_values' :
            Returns a full set of errors in case of multioutput input.

        'uniform_average' :
            Errors of all outputs are averaged with uniform weight.
	# squared : boolean value, optional (default = True)
	    If True returns MSE value, if False returns RMSE value.

from sklearn.preprocessing import StandardScaler

class StandardScaler的__init__函数:
def __init__(self, copy=True, with_mean=True, with_std=True):

The standard score of a sample `x` is calculated as: z = (x - u) / s,其中 u 为训练样本的均值(with_mean=False 时取 0),s 为训练样本的标准差(with_std=False 时取 1)

def fit(self, X, y=None):
# 计算出均值和标准差,为后续的计算转换做准备
# X : 作为标准化蓝本的数据

def transform(self, X, copy=None):
# 对数据进行标准化操作
# X:同上

def inverse_transform(self, X, copy=None):
# 逆标准化操作,将标准化过的数据传入X,就可以逆标准化

def fit_transform(self, X, y=None, **fit_params):
# 先执行fit , 然后transform




# 数据集在datasets下的load_boston
from sklearn.datasets import load_boston

# Load the dataset once; the returned object exposes its structure
# (data, target, feature_names, DESCR, ...) as attributes.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this snippet requires an older scikit-learn release.
boston = load_boston()

print(boston.data)  # feature matrix
print(boston.target)  # target values
print(boston.feature_names)  # feature names
'feature_names': array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
   'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7'), 'DESCR': ".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000\n        - PTRATIO  pupil-teacher ratio by town\n        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n        - LSTAT    % lower status of the population\n        - MEDV     Median value of owner-occupied homes in $1000's\n\n    :Missing Attribute Values: None\n\n    :Creator: Harrison, D. and Rubinfeld, D.L.\n\nThis is a copy of UCI ML housing dataset.\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n\n\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\nprices and the demand for clean air', J. Environ. Economics & Management,\nvol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n...', Wiley, 1980.   N.B. 
Various transformations are used in the table on\npages 244-261 of the latter.\n\nThe Boston house-price data has been used in many machine learning papers that address regression\nproblems.   \n     \n.. topic:: References\n\n   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n", 'filename': 'D:\\My_IDE\\anaconda3\\unzip\\envs\\ML\\lib\\site-packages\\sklearn\\datasets\\data\\boston_house_prices.csv'}


# Load the Boston housing dataset.
boston = load_boston()

# Print, in order: the feature matrix, the target values and the feature names.
for attribute in ("data", "target", "feature_names"):
    print(getattr(boston, attribute))


from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, mean_squared_error

# Load the features and the target with a single call instead of loading the
# whole dataset twice.
data, target = load_boston(return_X_y=True)

# Hold out 25% of the samples as the test set.
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=.25)

# Fit a linear regression model on the training features and targets.
lr = LinearRegression()
lr.fit(X=x_train, y=y_train)

# Predict the targets of the held-out test features.
y_predict = lr.predict(x_test)
print('测试集预测房价结果:', y_predict)

# Score the predictions. The value is the explained variance score, even
# though the printed label calls it "accuracy".
score = explained_variance_score(y_true=y_test, y_pred=y_predict)
print('准确率:', score)

# Mean squared error between the true and the predicted targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_predict)
print('MSE:', mse)


测试集预测房价结果: [21.46883434 25.12932984 25.46229753 24.29381014 10.77338021 23.6629044
 23.37791703 22.49010597 12.27856207 29.37523746 27.40418595 10.76141686
 17.18846688 34.25705244 29.39977567 21.07789847 13.28080843 26.40665946
 25.0711256  14.36977619 27.72165633 24.97684893  7.14863525 31.9399898
 34.1872509  18.28335265 13.78038185 30.71205105 25.05699424 31.3071684
 23.73084061 24.50247385 23.80750114 22.13219131 24.19862954 18.62644109
 20.33034999 23.75066724 17.86150927 24.76372503 36.45888055 15.58456341
 25.14583162 35.39194339 27.4397019  18.53702861 36.67282595 18.06890836
 33.40175423 19.29671574  8.36035202  4.91058191 43.74501362 23.70449026
 21.18441041 32.86471409 14.54004421 37.14938937 17.11641782 28.49420956
 17.12114592 14.17674722 23.88892771 36.87368112 15.58474291 26.22316745
 13.31399778 18.21362041 11.89413312 23.86939921 16.86764505 13.61438561
 17.6953322  20.14680644 18.69063686 19.41916909 16.0092952  10.97551792
 16.31726352 22.77547615 17.75585647 18.08693647 19.85811416 17.91815524
 34.72957214 25.13865124 43.09269266 17.41278109 30.01004371  6.27039642
 19.21093827 23.687117   31.91416595 30.9285005  23.04447041 19.39716369
  7.93354099 16.63689471 21.48214823 30.77142739 34.55218054 21.27732932
 28.55869948 25.82598709 19.18739213 15.04329649 26.88848523 16.32920073
 16.71884264  6.66521178  3.2633921  11.61679565 27.638216   19.34970675
 18.09687596 25.19032192 24.94721181 31.79454369 19.09566954 22.19418141
 15.75264749 17.70929177 21.25536948 35.56393675 16.53093603 21.88363038
准确率: 0.6852667652008373
MSE: 24.41094186688598
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

# Load the features and the target with a single call instead of loading the
# whole dataset twice.
data, target = load_boston(return_X_y=True)

# Hold out 25% of the samples as the test set.
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=.25)

# ------------------------------
# Standardize the data.
# Features and target each get their own scaler: refitting one shared scaler
# on the target would overwrite the feature statistics, so the feature scaler
# could no longer be used to transform (or inverse-transform) feature data.
std = StandardScaler()  # feature scaler
x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)  # already fitted on the training split; transform only

std_y = StandardScaler()  # target scaler
# The scaler expects 2D input, hence the reshape to a single column.
y_train = std_y.fit_transform(y_train.reshape(-1, 1))
y_test = std_y.transform(y_test.reshape(-1, 1))
# ------------------------------

# Fit a linear regression model on the standardized training data.
lr = LinearRegression()
lr.fit(X=x_train, y=y_train)

# Predict the targets of the test features. The predictions are in
# standardized target units; std_y.inverse_transform(y_predict) would convert
# them back to the original scale.
y_predict = lr.predict(x_test)
print('测试集预测房价结果:', y_predict)

# Score the predictions. The value is the explained variance score, even
# though the printed label calls it "accuracy".
score = explained_variance_score(y_true=y_test, y_pred=y_predict)
print('准确率:', score)

# Mean squared error, computed on the standardized targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_predict)
print('MSE:', mse)


测试集预测房价结果: [[-1.40879126]
 [ 0.86696205]
 [ 0.53045858]
 [ 0.01976869]
 [ 0.13888412]
 [ 0.0853247 ]
 [ 0.1180086 ]
 [-1.1975903 ]
 [ 0.58981398]
 [ 0.23569153]
 [ 0.80432651]
 [ 1.39147711]
准确率: 0.6719360149536837
MSE: 0.34810740050393685









import pandas as pd
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

# Load the features and the target with a single call instead of loading the
# whole dataset twice.
data, target = load_boston(return_X_y=True)

# Feature selection: wrap the features in a DataFrame (columns are labelled
# 0..12) and drop columns 4 (NOX), 10 (PTRATIO) and 11 (B) in one call.
data = pd.DataFrame(data).drop(columns=[4, 10, 11])

# Hold out 25% of the samples as the test set.
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=.25)

# ------------------------------
# Standardize the data.
# Features and target each get their own scaler: refitting one shared scaler
# on the target would overwrite the feature statistics, so the feature scaler
# could no longer be used to transform (or inverse-transform) feature data.
std = StandardScaler()  # feature scaler
x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)  # already fitted on the training split; transform only

std_y = StandardScaler()  # target scaler
# The scaler expects 2D input, hence the reshape to a single column.
y_train = std_y.fit_transform(y_train.reshape(-1, 1))
y_test = std_y.transform(y_test.reshape(-1, 1))

# Fit a linear regression model on the standardized training data.
lr = LinearRegression()
lr.fit(X=x_train, y=y_train)

# Predict the targets of the test features (in standardized target units).
y_predict = lr.predict(x_test)
print('测试集预测房价结果:', y_predict)

# Score the predictions. The value is the explained variance score, even
# though the printed label calls it "accuracy".
score = explained_variance_score(y_true=y_test, y_pred=y_predict)
print('准确率:', score)

# Mean squared error, computed on the standardized targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_predict)
print('MSE:', mse)


准确率: 0.756400957032327
MSE: 0.20941409554630147


import joblib

# 保存模型 
def dump(value, filename, compress=0, protocol=None, cache_size=None):
# value:需要保存的python对象,可以是任何python的object
# filename:保存的路径
# compress:压缩等级(0~9)9最高

# 加载模型
def load(filename, mmap_mode=None):
# filename:路径


import pandas as pd
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
import joblib
# Load the features and the target with a single call instead of loading the
# whole dataset twice.
data, target = load_boston(return_X_y=True)

# Feature selection: wrap the features in a DataFrame (columns are labelled
# 0..12) and drop columns 4 (NOX), 10 (PTRATIO) and 11 (B) in one call.
data = pd.DataFrame(data).drop(columns=[4, 10, 11])

# Hold out 25% of the samples as the test set.
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=.25)

# ------------------------------
# Standardize the data.
# Features and target each get their own scaler: refitting one shared scaler
# on the target would overwrite the feature statistics, so the feature scaler
# could no longer be used to transform (or inverse-transform) feature data.
std = StandardScaler()  # feature scaler
x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)  # already fitted on the training split; transform only

std_y = StandardScaler()  # target scaler
# The scaler expects 2D input, hence the reshape to a single column.
y_train = std_y.fit_transform(y_train.reshape(-1, 1))
y_test = std_y.transform(y_test.reshape(-1, 1))

# To reuse a previously saved model instead of training a new one:
# lr = joblib.load('linearModel.pkl')

# Fit a linear regression model on the standardized training data.
lr = LinearRegression()
lr.fit(X=x_train, y=y_train)

# *******************************
# Persist the fitted model to disk.
# NOTE: only the regression model is saved; the fitted scalers are not, so a
# model loaded later needs input that was standardized the same way.
joblib.dump(value=lr, filename='linearModel.pkl')

# *******************************

# Predict the targets of the test features (in standardized target units).
y_predict = lr.predict(x_test)
print('测试集预测房价结果:', y_predict)

# Score the predictions. The value is the explained variance score, even
# though the printed label calls it "accuracy".
score = explained_variance_score(y_true=y_test, y_pred=y_predict)
print('准确率:', score)

# Mean squared error, computed on the standardized targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_predict)
print('MSE:', mse)





