%matplotlib inline
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Kaggle digit-recognizer training data: first column is the label, the remaining 784 columns are pixel values
data = pd.read_csv("digit-recognizor.csv")
x = data.iloc[:, 1:]   # pixel features
y = data.iloc[:, 0]    # digit labels
print(x.shape)
print(y.shape)
(42000, 784)
(42000,)
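Each row of x is a flattened 28×28 grayscale image. A purely optional sanity-check sketch that displays the first sample:

# Show the first sample as a 28x28 image together with its label.
plt.figure()
plt.imshow(x.iloc[0].values.reshape(28, 28), cmap="gray")
plt.title(f"label: {y.iloc[0]}")
plt.show()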
# Cumulative explained variance ratio as a function of the number of components
pca_line = PCA().fit(x)
plt.figure()
plt.plot(np.cumsum(pca_line.explained_variance_ratio_))
plt.xlabel("number of components after pca")
plt.ylabel("cumulative explained variance ratio")
plt.show()
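Instead of only reading the elbow off the curve, you can ask directly for the smallest component count that reaches a chosen variance threshold. A minimal sketch, with 0.97 as an arbitrary example threshold:

# Smallest number of components whose cumulative explained variance reaches 97%
cumsum = np.cumsum(pca_line.explained_variance_ratio_)
print(np.argmax(cumsum >= 0.97) + 1)

# Equivalently, a float n_components lets PCA pick that count itself
pca_97 = PCA(0.97).fit(x)
print(pca_97.n_components_)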
# Coarse search: random-forest CV score for 1, 11, 21, ..., 91 components
score = []
for i in range(1, 101, 10):
    x_dr = PCA(i).fit_transform(x)
    once = cross_val_score(RFC(n_estimators=100, random_state=0), x_dr, y, cv=5).mean()
    score.append(once)
plt.figure()
plt.plot(range(1, 101, 10), score)
plt.show()
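Each pass of the loop above refits PCA on all 42,000 samples and then runs a 5-fold forest cross-validation, so the coarse curve takes a while to draw. If runtime matters, cross_val_score (and the forest itself) accept an n_jobs parameter; a minimal sketch of one iteration, with the component count 21 chosen arbitrarily for illustration:

# One iteration of the search, with the 5 CV folds evaluated in parallel
x_dr_21 = PCA(21).fit_transform(x)
once = cross_val_score(RFC(n_estimators=100, random_state=0),
                       x_dr_21, y, cv=5, n_jobs=-1).mean()
print(once)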
# Fine search: zoom in on 10-24 components
score = []
for i in range(10, 25):
    x_dr = PCA(i).fit_transform(x)
    once = cross_val_score(RFC(n_estimators=100, random_state=0), x_dr, y, cv=5).mean()
    score.append(once)
plt.figure()
plt.plot(range(10, 25), score)
plt.show()
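To read the peak of the fine-grained curve off the data rather than eyeballing the plot:

# Component count (out of 10-24) with the highest cross-validated score
best_n = list(range(10, 25))[int(np.argmax(score))]
print(best_n, max(score))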
# Reduce to 26 components and score the random forest with 10-fold cross-validation
x_dr = PCA(26).fit_transform(x)
cross_val_score(RFC(n_estimators=100, random_state=0), x_dr, y, cv=10)
array([0.945 , 0.94666667, 0.95261905, 0.94547619, 0.95095238, 0.9397619 , 0.94428571, 0.95214286, 0.95285714, 0.95095238])
# Try KNN on the same 26-component data
from sklearn.neighbors import KNeighborsClassifier as kNN
cross_val_score(kNN(), x_dr, y, cv=5).mean()
0.9709285714285715
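On the same 26-component data, plain KNN already edges out the random forest above (about 0.971 versus roughly 0.948 across the ten folds). As a side check on how much image detail 26 components keep, PCA's inverse_transform maps the reduced data back to 784 pixels; a sketch that refits PCA so the fitted object is available:

# Reconstruct the images from 26 components and compare the first digit visually
pca_26 = PCA(26).fit(x)
x_rec = pca_26.inverse_transform(pca_26.transform(x))

fig, axes = plt.subplots(1, 2)
axes[0].imshow(x.iloc[0].values.reshape(28, 28), cmap="gray")
axes[0].set_title("original")
axes[1].imshow(x_rec[0].reshape(28, 28), cmap="gray")
axes[1].set_title("26-component reconstruction")
plt.show()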
# Learning curve over n_neighbors: k = 1, 2, ..., 10
score = []
for i in range(1, 11):
    once = cross_val_score(kNN(n_neighbors=i), x_dr, y, cv=5).mean()
    score.append(once)
plt.figure()
plt.plot(range(1, 11), score)
plt.show()
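Once the curve above settles on a value of k, the final comparison is one more cross-validation with that k; a short sketch that reads the best k straight from the scores:

# k with the highest CV score in the loop above (k ran from 1 to 10)
best_k = int(np.argmax(score)) + 1
print(best_k, cross_val_score(kNN(n_neighbors=best_k), x_dr, y, cv=5).mean())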