Python Model Evaluation
by 吳俊逸, 2018-05-29 14:47:44

Training and Testing
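The snippets below assume df is a pandas DataFrame of the automobile dataset with a numeric 'price' column. A minimal loading step might look like this (the file name is an assumption, not part of the original post):

import pandas as pd
df = pd.read_csv('clean_auto_df.csv')  # hypothetical file name for the cleaned automobile dataset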

y_data = df['price']
x_data = df.drop('price', axis=1)  # drop the price column so x_data contains only the predictor features



from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.15, random_state=1)

print("number of test samples :", x_test.shape[0])
print("number of training samples:",x_train.shape[0])

Let's import LinearRegression from the module linear_model

from sklearn.linear_model import LinearRegression
lre=LinearRegression()
lre.fit(x_train[['horsepower']],y_train)
lre.score(x_train[['horsepower']],y_train) >> 0.6377940995166673
lre.score(x_test[['horsepower']],y_test) >> 0.707688374146705
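score here returns the coefficient of determination R^2; the same value can be computed explicitly with r2_score (a quick check, assuming the fitted lre above):

from sklearn.metrics import r2_score
r2_score(y_test, lre.predict(x_test[['horsepower']]))  # matches lre.score on the test set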

Cross-validation Score

from sklearn.model_selection import cross_val_score

Rcross=cross_val_score(lre,x_data[['horsepower']], y_data,cv=4)



print("The mean of the folds are", Rcross.mean(),"and the standard deviation is" ,Rcross.std())
The mean of the folds are 0.522009915042119 and the standard deviation is 0.2911839444756029
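cross_val_score can report other metrics through the scoring argument, and cross_val_predict returns the out-of-fold predictions themselves; a short sketch on the same data:

from sklearn.model_selection import cross_val_predict
mse_folds = -1 * cross_val_score(lre, x_data[['horsepower']], y_data, cv=4, scoring='neg_mean_squared_error')  # sign flipped because sklearn returns negative MSE
yhat_cv = cross_val_predict(lre, x_data[['horsepower']], y_data, cv=4)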

Overfitting, Underfitting and Model Selection

lr=LinearRegression()
lr.fit(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']],y_train)

yhat_train=lr.predict(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']])
yhat_test=lr.predict(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']])

import matplotlib.pyplot as plt
import seaborn as sns
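DistributionPlot is not a library function; it is a small plotting helper defined in the course notebook. A minimal sketch, assuming it simply overlays the actual and predicted price distributions, might be:

def DistributionPlot(actual, predicted, actual_name, predicted_name, title):
    plt.figure(figsize=(12, 10))
    ax = sns.kdeplot(actual, color='r', label=actual_name)
    sns.kdeplot(predicted, color='b', label=predicted_name, ax=ax)
    plt.title(title)
    plt.xlabel('Price (in dollars)')
    plt.ylabel('Proportion of Cars')
    plt.legend()
    plt.show()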

DistributionPlot(y_train, yhat_train, "Actual Values (Train)", "Predicted Values (Train)", "Distribution of Training Data vs Predicted Values")



DistributionPlot(y_test, yhat_test, "Actual Values (Test)", "Predicted Values (Test)", "Distribution of Test Data vs Predicted Values")




from sklearn.preprocessing import PolynomialFeatures

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.45, random_state=0)  # re-split with a larger test share so the degree-5 fit below has less training data

pr=PolynomialFeatures(degree=5)
x_train_pr=pr.fit_transform(x_train[['horsepower']])
x_test_pr=pr.transform(x_test[['horsepower']])  # transform the test set with the transformer fitted on the training set

poly=LinearRegression()
poly.fit(x_train_pr,y_train)

yhat=poly.predict(x_test_pr)
PollyPlot(x_train[['horsepower']],x_test[['horsepower']],y_train,y_test,poly,pr)
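PollyPlot is likewise a helper defined in the course notebook, not a library call. A rough sketch of what it does (plot the training points, the test points, and the fitted polynomial curve) might be:

import numpy as np

def PollyPlot(xtrain, xtest, y_train, y_test, model, poly_transform):
    plt.figure(figsize=(12, 10))
    xmin = min(xtrain.values.min(), xtest.values.min())
    xmax = max(xtrain.values.max(), xtest.values.max())
    x = np.arange(xmin, xmax, 0.1)
    plt.plot(xtrain, y_train, 'ro', label='Training Data')
    plt.plot(xtest, y_test, 'go', label='Test Data')
    plt.plot(x, model.predict(poly_transform.transform(x.reshape(-1, 1))), label='Predicted Function')
    plt.ylabel('Price')
    plt.legend()
    plt.show()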




poly.score(x_train_pr, y_train) >> 0.5567716899817778
poly.score(x_test_pr, y_test) >> -29.871838229908324 << a negative R^2 on the test set signals severe overfitting

Let's see how the R^2 changes on the test data for different order polynomials and plot the results:

Rsqu_test=[]

order=[1,2,3,4]
for n in order:
    pr=PolynomialFeatures(degree=n)
    
    x_train_pr=pr.fit_transform(x_train[['horsepower']])
    
    x_test_pr=pr.transform(x_test[['horsepower']])
    
    lr.fit(x_train_pr,y_train)
    
    Rsqu_test.append(lr.score(x_test_pr,y_test))

plt.plot(order,Rsqu_test)
plt.xlabel('order')
plt.ylabel('R^2')
plt.title('R^2 Using Test Data')
plt.text(3, 0.75, 'Maximum R^2 ')  



def f(order,test_data):
    pr=PolynomialFeatures(degree=order)
    x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=test_data, random_state=0)
    
    x_train_pr=pr.fit_transform(x_train[['horsepower']])
    x_test_pr=pr.transform(x_test[['horsepower']])
    
    poly=LinearRegression()
    poly.fit(x_train_pr,y_train)
    PollyPlot(x_train[['horsepower']],x_test[['horsepower']],y_train,y_test,poly,pr)

from ipywidgets import interact  # interactive widget; requires a Jupyter notebook environment
interact(f, order=(0,6,1), test_data=(0.05,0.95,0.05))



Ridge Regression

from sklearn.linear_model import Ridge
RidgeModel = Ridge(alpha=0.01)
RidgeModel.fit(x_train_pr, y_train)
yhat = RidgeModel.predict(x_test_pr)
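One way to pick alpha, as a rough sketch before the grid search below, is to loop over candidate values and compare the train and test R^2 on the polynomial features:

Rsqu_train = []
Rsqu_test = []
alphas = [0.001, 0.1, 1, 10, 100, 1000]
for alpha in alphas:
    RidgeModel = Ridge(alpha=alpha)
    RidgeModel.fit(x_train_pr, y_train)
    Rsqu_train.append(RidgeModel.score(x_train_pr, y_train))
    Rsqu_test.append(RidgeModel.score(x_test_pr, y_test))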
from sklearn.model_selection import GridSearchCV
parameters1 = [{'alpha': [0.001, 0.1, 1, 10, 100, 1000, 10000, 100000]}]

RR=Ridge()
Grid1 = GridSearchCV(RR, parameters1,cv=4) 
Grid1.fit(x_data[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']],y_data)
BestRR=Grid1.best_estimator_
BestRR.score(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']],y_test)
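The selected alpha and the cross-validated score for each candidate can be read off the fitted grid-search object:

print(Grid1.best_params_)                     # the alpha that gave the best mean CV score
print(Grid1.cv_results_['mean_test_score'])   # mean R^2 across the 4 folds for every alpha tried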

Example:
parameters2 = [{'alpha': [0.001, 0.1, 1, 10, 100, 1000, 10000, 100000], 'normalize': [True, False]}]  # note: 'normalize' was removed in scikit-learn 1.2; scale the features beforehand on newer versions
Grid2 = GridSearchCV(Ridge(), parameters2,cv=4)
Grid2.fit(x_data[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']],y_data)
BestRR2=Grid2.best_estimator_
BestRR2.score(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']],y_test)