Deepak Dubey on Cloud, Big Data Analytics, Data Science, Machine Learning, DevOps, Full Stack, Java: Data Analysis with Python

Data Analysis with Python - Model Evaluation and Refinement

import pandas as pd
import numpy as np

# Import clean data
path = 'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/module_5_auto.csv'
df = pd.read_csv(path)

df.to_csv('module_5_auto.csv')

First lets only use numeric data

df=df._get_numeric_data()
df.head()

Libraries for plotting

%%capture
! pip install ipywidgets

from IPython.display import display
from IPython.html import widgets
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual

Functions for plotting

def DistributionPlot(RedFunction, BlueFunction, RedName, BlueName, Title):
width = 12
height = 10
plt.figure(figsize=(width, height))

ax1 = sns.distplot(RedFunction, hist=False, color="r", label=RedName)
ax2 = sns.distplot(BlueFunction, hist=False, color="b", label=BlueName, ax=ax1)

plt.title(Title)
plt.xlabel('Price (in dollars)')
plt.ylabel('Proportion of Cars')

plt.show()
plt.close()

def PollyPlot(xtrain, xtest, y_train, y_test, lr,poly_transform):
width = 12
height = 10
plt.figure(figsize=(width, height))

#training data
#testing data
# lr: linear regression object
#poly_transform: polynomial transformation object

xmax=max([xtrain.values.max(), xtest.values.max()])

xmin=min([xtrain.values.min(), xtest.values.min()])

x=np.arange(xmin, xmax, 0.1)

plt.plot(xtrain, y_train, 'ro', label='Training Data')
plt.plot(xtest, y_test, 'go', label='Test Data')
plt.plot(x, lr.predict(poly_transform.fit_transform(x.reshape(-1, 1))), label='Predicted Function')
plt.ylim([-10000, 60000])
plt.ylabel('Price')
plt.legend()

Part 1: Training and Testing

y_data = df['price']

x_data=df.drop('price',axis=1)

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.15, random_state=1)

print("number of test samples :", x_test.shape[0])
print("number of training samples:",x_train.shape[0])

number of test samples : 31
number of training samples: 170

x_train1, x_test1, y_train1, y_test1 = train_test_split(x_data, y_data, test_size=0.4, random_state=0)
print("number of test samples :", x_test1.shape[0])
print("number of training samples:",x_train1.shape[0])

number of test samples : 81
number of training samples: 120

Let's import LinearRegression from the module linear_model.
from sklearn.linear_model import LinearRegression

We create a Linear Regression object:
lre=LinearRegression()

we fit the model using the feature horsepower
lre.fit(x_train[['horsepower']], y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
normalize=False)

Let's Calculate the R^2 on the test data:

lre.score(x_test[['horsepower']], y_test)
0.707688374146705

we can see the R^2 is much smaller using the test data.
lre.score(x_train[['horsepower']], y_train)

Find the R^2 on the test data using 90% of the data for training data

x_train1, x_test1, y_train1, y_test1 = train_test_split(x_data, y_data, test_size=0.1, random_state=0)
lre.fit(x_train1[['horsepower']],y_train1)
lre.score(x_test1[['horsepower']],y_test1)

0.7340722810055448

Wednesday, October 16, 2019

Data Analysis with Python - Model Evaluation and Refinement