import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
import joblib  # sklearn.externals.joblib was removed in newer scikit-learn; use the standalone joblib package
# Load the PUBG training data
data = pd.read_csv('/Users/lujiawen/Desktop/PUBG_to_stu/data/train_V2.csv')
data.head()
data.shape
np.any(pd.isnull(data))  # check for missing values
data = data.dropna()     # drop rows with missing values
data.shape
# Keep only matches with more than 70 players
group_count = data.groupby('matchId').count()
group_count = group_count[group_count['Id'] > 70]
group_count
data_true_ = data[data["matchId"].isin(group_count.index)]
data_true_
data_true_.shape
data_ = data_true_[::20]  # subsample every 20th row to shrink the dataset
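# A seeded random sample is another way to thin the data (a sketch, not what this
# script does; frac and random_state here are arbitrary illustrative values):
# data_ = data_true_.sample(frac=0.05, random_state=22)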
data_.shape
data_.head()
# 'waigua' (外挂, "cheat") names the columns used to spot suspicious players
waigua = 'kills'
waigua1 = 'longestKill'
data_.sort_values(by=waigua, ascending=False)[waigua].head()
plt.figure(figsize=(20, 8), dpi=80)
x= data_.index
y= data_[waigua]
plt.plot(x,y)
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()
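# The same kind of check could be run on longestKill (waigua1 above), on the
# assumption that extreme kill distances can also flag cheaters; a commented-out sketch:
# data_.sort_values(by=waigua1, ascending=False)[waigua1].head()
# plt.figure(figsize=(20, 8), dpi=80)
# plt.plot(data_.index, data_[waigua1])
# plt.grid(True, linestyle='--', alpha=0.5)
# plt.show()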
# Drop players with more than 20 kills (likely cheaters)
data_true = data_[data_[waigua] < 20].copy()  # copy() avoids SettingWithCopyWarning on the assignments below
data_true
data_true.shape
# Encode the six main match types as integers
data_true.loc[data_true['matchType'] == 'solo', 'matchType'] = 1
data_true.loc[data_true['matchType'] == 'duo', 'matchType'] = 2
data_true.loc[data_true['matchType'] == 'squad', 'matchType'] = 3
data_true.loc[data_true['matchType'] == 'solo-fpp', 'matchType'] = 4
data_true.loc[data_true['matchType'] == 'duo-fpp', 'matchType'] = 5
data_true.loc[data_true['matchType'] == 'squad-fpp', 'matchType'] = 6
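# An equivalent one-liner (a sketch): map the six strings to codes with a dict.
# Note Series.map turns any other match type into NaN instead of leaving the
# original string, so the filtering step below would need to change accordingly.
# type_codes = {'solo': 1, 'duo': 2, 'squad': 3, 'solo-fpp': 4, 'duo-fpp': 5, 'squad-fpp': 6}
# data_true['matchType'] = data_true['matchType'].map(type_codes)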
data_true.head()
# Group by matchType and keep only rows from the first six groups (the encoded types)
match_type = data_true.groupby('matchType').count()
m = match_type.iloc[0:6]
m
m.shape
data_new = data_true[data_true["matchType"].isin(m.index)]
data_new.head()
data_new["matchType"].head()
# data_new.shape
# Drop the identifier columns and the target to form the feature matrix
features = data_new.drop(['Id', 'groupId', 'matchId', 'winPlacePerc'], axis=1)
# features
# features=('assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints').split(',')
# features
# Define the features and the target
x = features  # features
y = data_new["winPlacePerc"]  # target
# Split the dataset into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y)  # random_state=22
x_train.shape
# x_test.shape
# Instantiate a transformer (standardization)
transfer = StandardScaler()
# Call fit_transform on the training set, then apply the same scaling to the test set
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
# # Low-variance feature filtering
# transfer = VarianceThreshold(threshold=1)
# # 2. Call fit_transform
# dataa = transfer.fit_transform(x_train.iloc[:, :])
# print("Result after removing low-variance features:\n", dataa)
# # print("Shape:\n", data.shape)
# transfer = PCA(n_components=0.9)
# # 2. Call fit_transform
# data1 = transfer.fit_transform(x_train)
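# A sketch of how the commented-out VarianceThreshold/PCA ideas could be wired up
# with a Pipeline instead of being applied by hand (assumes the raw, unscaled split,
# since x_train above has already been standardized; alpha/n_components as above):
# from sklearn.pipeline import Pipeline
# pipe = Pipeline([
#     ("scaler", StandardScaler()),
#     ("pca", PCA(n_components=0.9)),
#     ("ridge", Ridge(alpha=1.0)),
# ])
# pipe.fit(x_train, y_train)
# print(pipe.score(x_test, y_test))  # R^2 on the test set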
# Fit a ridge regression model
estimator = Ridge(alpha=1.0)
estimator.fit(x_train, y_train)
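# Alternative estimator (a sketch): SGDRegressor is already imported above and fits
# the same kind of linear model with stochastic gradient descent, which scales
# better to very large datasets; alpha/max_iter here are illustrative values.
# estimator = SGDRegressor(penalty="l2", alpha=0.001, max_iter=1000)
# estimator.fit(x_train, y_train)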
joblib.dump(estimator, "/Users/lujiawen/Desktop/第八组/绝地求生.pkl")
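# Usage sketch for a later session: reload the persisted model with joblib.load
# (same path as the dump above) and call predict on freshly scaled features.
# estimator = joblib.load("/Users/lujiawen/Desktop/第八组/绝地求生.pkl")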
y_predict = estimator.predict(x_test)
print("预测值为:\n", y_predict)
print("模型中的系数为:\n", estimator.coef_)
print("模型中的偏置为:\n", estimator.intercept_)
# 5.2 Evaluation
# Mean squared error
error = mean_squared_error(y_test, y_predict)
print("Mean squared error:\n", error)