[Dacon] 와인 품질 분류 경진대회

데이콘 실무역량 강화교육 중에 진행한 와인 품질 분류 프로젝트입니다.

성능 향상을 위해 앙상블 기법을 활용해 보았습니다.

또한 하드 보팅과 소프트 보팅에 대해 배웠습니다.

0.준비

1) 데이터 로드

# Colab only: mount Google Drive so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

import matplotlib.pyplot as plt
import pandas as pd
plt.rc('font',family = 'NanumBarunGothic')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import numpy as np

# Read the training set, the test set and the submission template from Drive.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/와인1/train.csv',
        '/content/drive/MyDrive/와인1/test.csv',
        '/content/drive/MyDrive/와인1/sample_submission.csv',
    )
)

train

	index	quality	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	type
0	0	5	5.6	0.695	0.06	6.8	0.042	9.0	84.0	0.99432	3.44	0.44	10.2	white
1	1	5	8.8	0.610	0.14	2.4	0.067	10.0	42.0	0.99690	3.19	0.59	9.5	red
2	2	5	7.9	0.210	0.39	2.0	0.057	21.0	138.0	0.99176	3.05	0.52	10.9	white
3	3	6	7.0	0.210	0.31	6.0	0.046	29.0	108.0	0.99390	3.26	0.50	10.8	white
4	4	6	7.8	0.400	0.26	9.5	0.059	32.0	178.0	0.99550	3.04	0.43	10.9	white
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
5492	5492	5	7.7	0.150	0.29	1.3	0.029	10.0	64.0	0.99320	3.35	0.39	10.1	white
5493	5493	6	6.3	0.180	0.36	1.2	0.034	26.0	111.0	0.99074	3.16	0.51	11.0	white
5494	5494	7	7.8	0.150	0.34	1.1	0.035	31.0	93.0	0.99096	3.07	0.72	11.3	white
5495	5495	5	6.6	0.410	0.31	1.6	0.042	18.0	101.0	0.99195	3.13	0.41	10.5	white
5496	5496	6	7.0	0.350	0.17	1.1	0.049	7.0	119.0	0.99297	3.13	0.36	9.7	white

5497 rows × 14 columns

data_preprocessing()

    26
   372
  1788
  2416
  3696
   304
     5
Name: quality, dtype: int64

	index	quality	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	type	red	white	new_col	new_col2	new_col3
0	0	5	5.6	0.695	0.06	6.8	0.042	9.0	84.0	0.99432	3.44	0.44	10.2	white	0	1	0.124107	0.107143	6.295
1	1	5	8.8	0.610	0.14	2.4	0.067	10.0	42.0	0.99690	3.19	0.59	9.5	red	1	0	0.069318	0.238095	9.410
2	2	5	7.9	0.210	0.39	2.0	0.057	21.0	138.0	0.99176	3.05	0.52	10.9	white	0	1	0.026582	0.152174	8.110
3	3	6	7.0	0.210	0.31	6.0	0.046	29.0	108.0	0.99390	3.26	0.50	10.8	white	0	1	0.030000	0.268519	7.210
4	4	6	7.8	0.400	0.26	9.5	0.059	32.0	178.0	0.99550	3.04	0.43	10.9	white	0	1	0.051282	0.179775	8.200

2) 전처리

import numpy as np

def ACC(true,pred):
  """Return the accuracy: the fraction of positions where the labels match.

  Both inputs are converted to NumPy arrays before comparing, so the
  comparison is always element-wise and positional. Comparing the raw
  objects with ``==`` yields a single bool for plain lists and raises for
  pandas Series whose indexes differ.

  Args:
    true: Ground-truth labels (list, ndarray or Series).
    pred: Predicted labels, same length as ``true``.

  Returns:
    Mean of the element-wise equality.
  """
  score = np.mean(np.asarray(true) == np.asarray(pred))
  return score

def make_plot():
  """Plot true vs. predicted class counts for one validation fold.

  Takes no arguments: it reads the module-level ``y_valid`` (true labels)
  and ``y_pred`` (predicted labels) set by the calling cell, and shows the
  accuracy of that fold in the title.
  """
  acc = ACC(y_valid,y_pred)

  # Count each side on plain arrays so the two columns are matched by class
  # label rather than by the row index carried by y_valid.
  true_counts = pd.Series(np.asarray(y_valid)).value_counts()
  pred_counts = pd.Series(np.asarray(y_pred)).value_counts()

  # Building the frame from two Series takes the union of their class labels,
  # so a class that is predicted but absent from y_valid (or the reverse)
  # becomes a zero-height bar instead of raising KeyError. Naming the columns
  # here also avoids depending on the name value_counts() gives its result.
  validation_count = pd.DataFrame({'y_valid':true_counts,'y_pred':pred_counts})
  validation_count = validation_count.fillna(0).sort_index()

  x = validation_count.index
  y_valid_count = validation_count['y_valid']
  y_pred_count = validation_count['y_pred']

  width = 0.35

  plt.figure(dpi=150)

  # Only the first six characters of the accuracy are shown.
  plt.title('ACC: '+str(acc)[:6])
  plt.xlabel('quality')
  plt.ylabel('count')

  # Shift the two bar series half a bar width left/right so they sit side by side.
  p1 = plt.bar([idx-width/2 for idx in x],y_valid_count,width,label = 'real')
  p2 = plt.bar([idx+width/2 for idx in x],y_pred_count,width,label = 'pred')

  plt.legend()
  plt.show()

원핫인코딩

# Oversample quality grades 4, 7 and 8 by appending one extra copy of their rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
# and, like append, keeps the original index labels.
train = pd.concat([train,train[train['quality'].isin([4,7,8])]])

# One-hot encode the wine type into indicator columns appended on the right.
train = pd.concat([train,pd.get_dummies(train['type'])],axis=1)
test = pd.concat([test,pd.get_dummies(test['type'])],axis=1)
train

	index	quality	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	type	red	white
0	0	5	5.6	0.695	0.06	6.8	0.042	9.0	84.0	0.99432	3.44	0.44	10.2	white	0	1
1	1	5	8.8	0.610	0.14	2.4	0.067	10.0	42.0	0.99690	3.19	0.59	9.5	red	1	0
2	2	5	7.9	0.210	0.39	2.0	0.057	21.0	138.0	0.99176	3.05	0.52	10.9	white	0	1
3	3	6	7.0	0.210	0.31	6.0	0.046	29.0	108.0	0.99390	3.26	0.50	10.8	white	0	1
4	4	6	7.8	0.400	0.26	9.5	0.059	32.0	178.0	0.99550	3.04	0.43	10.9	white	0	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
5481	5481	7	5.4	0.835	0.08	1.2	0.046	13.0	93.0	0.99240	3.57	0.85	13.0	red	1	0
5482	5482	4	6.2	0.430	0.49	6.4	0.045	12.0	115.0	0.99630	3.27	0.57	9.0	white	0	1
5485	5485	4	6.3	0.280	0.22	9.5	0.040	30.0	111.0	0.99338	3.05	0.31	10.8	white	0	1
5491	5491	7	6.9	0.340	0.30	4.7	0.029	34.0	148.0	0.99165	3.36	0.49	12.3	white	0	1
5494	5494	7	7.8	0.150	0.34	1.1	0.035	31.0	93.0	0.99096	3.07	0.72	11.3	white	0	1

6759 rows × 16 columns

feature engineering

def make_plots(text):
  """Bar-plot the per-quality mean of one column of the global ``train`` frame.

  Args:
    text: Name of the ``train`` column to average within each quality grade.
  """
  plt.title(text + ' vs quality')

  # Average only the requested column, once. Averaging the whole frame repeats
  # the work and fails on the string 'type' column in pandas >= 2.0.
  means = train.groupby('quality')[text].mean()

  plt.bar(means.index,means.values)
  plt.show()

# Engineered features on the training set:
#   new_col  - ratio of volatile to fixed acidity
#   new_col2 - share of total sulfur dioxide that is free
#   new_col3 - sum of volatile and fixed acidity
train['new_col'] = train['volatile acidity'] / train['fixed acidity']
train['new_col2'] = train['free sulfur dioxide'] / train['total sulfur dioxide']
train['new_col3'] = train['volatile acidity'] + train['fixed acidity']
train

	index	quality	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	type	red	white	new_col	new_col2	new_col3
0	0	5	5.6	0.695	0.06	6.8	0.042	9.0	84.0	0.99432	3.44	0.44	10.2	white	0	1	0.124107	0.107143	6.295
1	1	5	8.8	0.610	0.14	2.4	0.067	10.0	42.0	0.99690	3.19	0.59	9.5	red	1	0	0.069318	0.238095	9.410
2	2	5	7.9	0.210	0.39	2.0	0.057	21.0	138.0	0.99176	3.05	0.52	10.9	white	0	1	0.026582	0.152174	8.110
3	3	6	7.0	0.210	0.31	6.0	0.046	29.0	108.0	0.99390	3.26	0.50	10.8	white	0	1	0.030000	0.268519	7.210
4	4	6	7.8	0.400	0.26	9.5	0.059	32.0	178.0	0.99550	3.04	0.43	10.9	white	0	1	0.051282	0.179775	8.200
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
5481	5481	7	5.4	0.835	0.08	1.2	0.046	13.0	93.0	0.99240	3.57	0.85	13.0	red	1	0	0.154630	0.139785	6.235
5482	5482	4	6.2	0.430	0.49	6.4	0.045	12.0	115.0	0.99630	3.27	0.57	9.0	white	0	1	0.069355	0.104348	6.630
5485	5485	4	6.3	0.280	0.22	9.5	0.040	30.0	111.0	0.99338	3.05	0.31	10.8	white	0	1	0.044444	0.270270	6.580
5491	5491	7	6.9	0.340	0.30	4.7	0.029	34.0	148.0	0.99165	3.36	0.49	12.3	white	0	1	0.049275	0.229730	7.240
5494	5494	7	7.8	0.150	0.34	1.1	0.035	31.0	93.0	0.99096	3.07	0.72	11.3	white	0	1	0.019231	0.333333	7.950

6759 rows × 19 columns

# Correlation of every non-string column with the target. The stray leading
# '|' was a syntax error. The string 'type' column is dropped explicitly
# because DataFrame.corr() raises on it in pandas >= 2.0.
train.drop(columns=['type']).corr()['quality'].to_frame()

	quality
index	-0.007835
quality	1.000000
fixed acidity	-0.079555
volatile acidity	-0.282066
citric acid	0.091287
residual sugar	-0.028293
chlorides	-0.215320
free sulfur dioxide	0.082963
total sulfur dioxide	-0.034012
density	-0.316107
pH	0.016610
sulphates	0.048132
alcohol	0.460041
red	-0.120606
white	0.120606
new_col	-0.244637
new_col2	0.135119
new_col3	-0.111243

스케일링

# Same three engineered features as on the training set, computed for the test set.
test['new_col'] = test['volatile acidity'] / test['fixed acidity']
test['new_col2'] = test['free sulfur dioxide'] / test['total sulfur dioxide']
test['new_col3'] = test['volatile acidity'] + test['fixed acidity']

test

	index	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	type	red	white	new_col	new_col2	new_col3
0	0	9.0	0.31	0.48	6.60	0.043	11.0	73.0	0.99380	2.90	0.38	11.6	white	0	1	0.034444	0.150685	9.31
1	1	13.3	0.43	0.58	1.90	0.070	15.0	40.0	1.00040	3.06	0.49	9.0	red	1	0	0.032331	0.375000	13.73
2	2	6.5	0.28	0.27	5.20	0.040	44.0	179.0	0.99480	3.19	0.69	9.4	white	0	1	0.043077	0.245810	6.78
3	3	7.2	0.15	0.39	1.80	0.043	21.0	159.0	0.99480	3.52	0.47	10.0	white	0	1	0.020833	0.132075	7.35
4	4	6.8	0.26	0.26	2.00	0.019	23.5	72.0	0.99041	3.16	0.47	11.8	white	0	1	0.038235	0.326389	7.06
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
995	995	7.1	0.59	0.02	2.30	0.082	24.0	94.0	0.99744	3.55	0.53	9.7	red	1	0	0.083099	0.255319	7.69
996	996	8.7	0.15	0.30	1.60	0.046	29.0	130.0	0.99420	3.22	0.38	9.8	white	0	1	0.017241	0.223077	8.85
997	997	8.8	0.66	0.26	1.70	0.074	4.0	23.0	0.99710	3.15	0.74	9.2	red	1	0	0.075000	0.173913	9.46
998	998	7.0	0.42	0.19	2.30	0.071	18.0	36.0	0.99476	3.39	0.56	10.9	red	1	0	0.060000	0.500000	7.42
999	999	8.5	0.21	0.26	9.25	0.034	73.0	142.0	0.99450	3.05	0.37	11.4	white	0	1	0.024706	0.514085	8.71

1000 rows × 18 columns

# scaler = StandardScaler()

# train[train.columns[2:-6]] = scaler.fit_transform(train[train.columns[2:-6]])
# train[train.columns[-3:]] = scaler.fit_transform(train[train.columns[-3:]])

# Min-max scale using statistics learned from the training set only.
scaler = MinMaxScaler()

# First group: train columns 2..-6 and test columns 1..-6. Test starts one
# column earlier because it has no 'quality' column.
train[train.columns[2:-6]] = scaler.fit_transform(train[train.columns[2:-6]])
test[test.columns[1:-6]] = scaler.transform(test[test.columns[1:-6]])
# Second group: the last three (engineered) columns. The scaler is refit on
# train and the same mapping is applied to test. Calling fit_transform on test
# would rescale it with its own min/max and put it on a different scale.
train[train.columns[-3:]] = scaler.fit_transform(train[train.columns[-3:]])
test[test.columns[-3:]] = scaler.transform(test[test.columns[-3:]])

3) 성능 검증

def data_preprocessing():
  """Load the raw CSVs, oversample some grades and add encoded/engineered columns.

  Steps:
    1. Read train, test and the submission template from Google Drive.
    2. Append one extra copy of the rows with quality 4, 7 or 8, then another
       copy of every quality-7 row (which at that point includes the copies
       from the first step).
    3. One-hot encode 'type' for both train and test.
    4. Add new_col / new_col2 / new_col3 to train only; the test set gets them
       from test_preprocessing().

  Prints the per-grade row counts of the resulting training set.

  Returns:
    Tuple ``(train, test, submission)`` of DataFrames.
  """
  train = pd.read_csv('/content/drive/MyDrive/와인1/train.csv')
  test = pd.read_csv('/content/drive/MyDrive/와인1/test.csv')
  submission = pd.read_csv('/content/drive/MyDrive/와인1/sample_submission.csv')

  # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
  # call and keeps the original index labels, as append did.
  train = pd.concat([train,train[train['quality'].isin([4,7,8])]])
  train = pd.concat([train,train[train['quality'].isin([7])]])

  train = pd.concat([train, pd.get_dummies(train['type'])],axis=1)
  test = pd.concat([test, pd.get_dummies(test['type'])],axis=1)

  train['new_col'] = train['volatile acidity'] / train['fixed acidity']
  train['new_col2'] = train['free sulfur dioxide'] / train['total sulfur dioxide']
  train['new_col3'] = train['volatile acidity'] + train['fixed acidity']

  print(train['quality'].value_counts().sort_index())

  return train,test,submission

def test_preprocessing(test):
  test['new_col'] = test['volatile acidity'] / test['fixed acidity']
  test['new_col2'] = test['free sulfur dioxide'] / test['total sulfur dioxide']
  test['new_col3'] = test['volatile acidity'] + test['fixed acidity']
  return test

def scaling(train,test):
  """Min-max scale the feature columns of ``train`` and ``test`` in place.

  The scaler is always fitted on ``train`` and only applied to ``test``, so
  both frames share one scale. Two column groups are scaled separately:
  train columns 2..-6 (test columns 1..-6, since test has no 'quality'
  column) and the last three columns of each frame.

  Args:
    train: Training frame; assumed to have the column layout produced by
      data_preprocessing() — TODO confirm for other callers.
    test: Test frame after test_preprocessing().

  Returns:
    Tuple ``(train, test)`` — the same objects, modified.
  """
  scaler = MinMaxScaler()

  train[train.columns[2:-6]] = scaler.fit_transform(train[train.columns[2:-6]])
  test[test.columns[1:-6]] = scaler.transform(test[test.columns[1:-6]])

  # Refit on the last three train columns, then reuse that fit for test.
  # fit_transform on test would rescale it with its own min/max.
  train[train.columns[-3:]] = scaler.fit_transform(train[train.columns[-3:]])
  test[test.columns[-3:]] = scaler.transform(test[test.columns[-3:]])
  return train,test

# Trial run: the returned (train, test, submission) tuple is not assigned here,
# so the existing global frames are left untouched.
data_preprocessing()

    26
   372
  1788
  2416
  3696
   304
     5
Name: quality, dtype: int64





(      index  quality  fixed acidity  volatile acidity  citric acid  \
       0        5            5.6             0.695         0.06   
       1        5            8.8             0.610         0.14   
       2        5            7.9             0.210         0.39   
       3        6            7.0             0.210         0.31   
       4        6            7.8             0.400         0.26   
 ...     ...      ...            ...               ...          ...   
 5456        7            5.9             0.170         0.29   
 5466        7            6.0             0.290         0.41   
 5481        7            5.4             0.835         0.08   
 5491        7            6.9             0.340         0.30   
 5494        7            7.8             0.150         0.34   
 
       residual sugar  chlorides  free sulfur dioxide  total sulfur dioxide  \
              6.8      0.042                  9.0                  84.0   
              2.4      0.067                 10.0                  42.0   
              2.0      0.057                 21.0                 138.0   
              6.0      0.046                 29.0                 108.0   
              9.5      0.059                 32.0                 178.0   
 ...              ...        ...                  ...                   ...   
           3.1      0.030                 32.0                 123.0   
          10.8      0.048                 55.0                 149.0   
           1.2      0.046                 13.0                  93.0   
           4.7      0.029                 34.0                 148.0   
           1.1      0.035                 31.0                  93.0   
 
       density    pH  sulphates  alcohol   type  red  white   new_col  \
   0.99432  3.44       0.44     10.2  white    0      1  0.124107   
   0.99690  3.19       0.59      9.5    red    1      0  0.069318   
   0.99176  3.05       0.52     10.9  white    0      1  0.026582   
   0.99390  3.26       0.50     10.8  white    0      1  0.030000   
   0.99550  3.04       0.43     10.9  white    0      1  0.051282   
 ...       ...   ...        ...      ...    ...  ...    ...       ...   
0.98913  3.41       0.33     13.7  white    0      1  0.028814   
0.99370  3.09       0.59     11.0  white    0      1  0.048333   
0.99240  3.57       0.85     13.0    red    1      0  0.154630   
0.99165  3.36       0.49     12.3  white    0      1  0.049275   
0.99096  3.07       0.72     11.3  white    0      1  0.019231   
 
       new_col2  new_col3  
   0.107143     6.295  
   0.238095     9.410  
   0.152174     8.110  
   0.268519     7.210  
   0.179775     8.200  
 ...        ...       ...  
0.260163     6.070  
0.369128     6.290  
0.139785     6.235  
0.229730     7.240  
0.333333     7.950  
 
 [8607 rows x 19 columns],
      index  fixed acidity  volatile acidity  citric acid  residual sugar  \
      0            9.0              0.31         0.48            6.60   
      1           13.3              0.43         0.58            1.90   
      2            6.5              0.28         0.27            5.20   
      3            7.2              0.15         0.39            1.80   
      4            6.8              0.26         0.26            2.00   
 ..     ...            ...               ...          ...             ...   
  995            7.1              0.59         0.02            2.30   
  996            8.7              0.15         0.30            1.60   
  997            8.8              0.66         0.26            1.70   
  998            7.0              0.42         0.19            2.30   
  999            8.5              0.21         0.26            9.25   
 
      chlorides  free sulfur dioxide  total sulfur dioxide  density    pH  \
      0.043                 11.0                  73.0  0.99380  2.90   
      0.070                 15.0                  40.0  1.00040  3.06   
      0.040                 44.0                 179.0  0.99480  3.19   
      0.043                 21.0                 159.0  0.99480  3.52   
      0.019                 23.5                  72.0  0.99041  3.16   
 ..         ...                  ...                   ...      ...   ...   
    0.082                 24.0                  94.0  0.99744  3.55   
    0.046                 29.0                 130.0  0.99420  3.22   
    0.074                  4.0                  23.0  0.99710  3.15   
    0.071                 18.0                  36.0  0.99476  3.39   
    0.034                 73.0                 142.0  0.99450  3.05   
 
      sulphates  alcohol   type  red  white  
       0.38     11.6  white    0      1  
       0.49      9.0    red    1      0  
       0.69      9.4  white    0      1  
       0.47     10.0  white    0      1  
       0.47     11.8  white    0      1  
 ..         ...      ...    ...  ...    ...  
     0.53      9.7    red    1      0  
     0.38      9.8  white    0      1  
     0.74      9.2    red    1      0  
     0.56     10.9    red    1      0  
     0.37     11.4  white    0      1  
 
 [1000 rows x 15 columns],
      index  quality
      0        0
      1        0
      2        0
      3        0
      4        0
 ..     ...      ...
  995        0
  996        0
  997        0
  998        0
  999        0
 
 [1000 rows x 2 columns])

train

	index	quality	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	type	red	white	new_col	new_col2	new_col3
0	0	5	0.148760	0.410000	0.036145	0.095092	0.054908	0.027778	0.179724	0.139001	0.551181	0.123596	0.318841	white	0	1	0.574215	0.100202	0.179466
1	1	5	0.413223	0.353333	0.084337	0.027607	0.096506	0.031250	0.082949	0.188741	0.354331	0.207865	0.217391	red	1	0	0.295792	0.257310	0.435318
2	2	5	0.338843	0.086667	0.234940	0.021472	0.079867	0.069444	0.304147	0.089647	0.244094	0.168539	0.420290	white	0	1	0.078620	0.154228	0.328542
3	3	6	0.264463	0.086667	0.186747	0.082822	0.061564	0.097222	0.235023	0.130904	0.409449	0.157303	0.405797	white	0	1	0.095988	0.293810	0.254620
4	4	6	0.330579	0.213333	0.156627	0.136503	0.083195	0.107639	0.396313	0.161751	0.236220	0.117978	0.420290	white	0	1	0.204138	0.187342	0.335934
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
5481	5481	7	0.132231	0.503333	0.048193	0.009202	0.061564	0.041667	0.200461	0.101986	0.653543	0.353933	0.724638	red	1	0	0.729321	0.139364	0.174538
5482	5482	4	0.198347	0.233333	0.295181	0.088957	0.059900	0.038194	0.251152	0.177174	0.417323	0.196629	0.144928	white	0	1	0.295979	0.096849	0.206982
5485	5485	4	0.206612	0.133333	0.132530	0.136503	0.051581	0.100694	0.241935	0.120879	0.244094	0.050562	0.405797	white	0	1	0.169391	0.295911	0.202875
5491	5491	7	0.256198	0.173333	0.180723	0.062883	0.033278	0.114583	0.327189	0.087527	0.488189	0.151685	0.623188	white	0	1	0.193940	0.247274	0.257084
5494	5494	7	0.330579	0.046667	0.204819	0.007669	0.043261	0.104167	0.200461	0.074224	0.259843	0.280899	0.478261	white	0	1	0.041262	0.371570	0.315400

6759 rows × 19 columns

test

	index	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	type	red	white	new_col	new_col2	new_col3
0	0	0.429752	0.153333	0.289157	0.092025	0.056572	0.034722	0.154378	0.128976	0.125984	0.089888	0.521739	white	0	1	0.113120	0.172815	0.416000
1	1	0.785124	0.233333	0.349398	0.019939	0.101498	0.048611	0.078341	0.256217	0.251969	0.151685	0.144928	red	1	0	0.102319	0.475767	0.788211
2	2	0.223140	0.133333	0.162651	0.070552	0.051581	0.149306	0.398618	0.148255	0.354331	0.264045	0.202899	white	0	1	0.157233	0.301288	0.202947
3	3	0.280992	0.046667	0.234940	0.018405	0.056572	0.069444	0.352535	0.148255	0.614173	0.140449	0.289855	white	0	1	0.043567	0.147682	0.250947
4	4	0.247934	0.120000	0.156627	0.021472	0.016639	0.078125	0.152074	0.063621	0.330709	0.140449	0.550725	white	0	1	0.132492	0.410115	0.226526
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
995	995	0.272727	0.340000	0.012048	0.026074	0.121464	0.079861	0.202765	0.199152	0.637795	0.174157	0.246377	red	1	0	0.361746	0.314131	0.279579
996	996	0.404959	0.046667	0.180723	0.015337	0.061564	0.097222	0.285714	0.136688	0.377953	0.089888	0.260870	white	0	1	0.025211	0.270585	0.377263
997	997	0.413223	0.386667	0.156627	0.016871	0.108153	0.010417	0.039171	0.192597	0.322835	0.292135	0.173913	red	1	0	0.320362	0.204186	0.428632
998	998	0.264463	0.226667	0.114458	0.026074	0.103161	0.059028	0.069124	0.147484	0.511811	0.191011	0.420290	red	1	0	0.243711	0.644588	0.256842
999	999	0.388430	0.086667	0.156627	0.132669	0.041597	0.250000	0.313364	0.142472	0.244094	0.084270	0.492754	white	0	1	0.063356	0.663610	0.365474

1000 rows × 18 columns

# Rebuild the data through the three helper steps. test_preprocessing() and
# scaling() take required arguments and every helper returns its result, so
# calling them bare raises TypeError and discards the work.
train,test,submission = data_preprocessing()
test = test_preprocessing(test)
train,test = scaling(train,test)

    26
   372
  1788
  2416
  3696
   304
     5
Name: quality, dtype: int64

# Model inputs: every column after 'index' and 'quality', minus the string 'type' column.
features = train.columns[2:].drop(['type'])
# Alternative feature subsets that were tried (kept for reference):
# features = train.columns[2:].drop(['type','sulphates','total sulfur dioxide','residual sugar','free sulfur dioxide','citric acid','pH','fixed acidity'])
# features = train.columns[2:].drop(['type','total sulfur dioxide','pH'])
# features = train.columns[2:].drop(['type','new_col',	'new_col2'	,'new_col3'])
# features = train.columns[2:].drop(['type','white','red'])
features

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'red', 'white', 'new_col', 'new_col2',
       'new_col3'],
      dtype='object')

X = train[features]
y = train['quality']

from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

# NOTE(review): train already contains the duplicated (oversampled) rows, so a
# row and its copy can fall on both sides of a fold split. Treat the fold
# accuracies below as optimistic.
models = []
accs = []
for train_idx, valid_idx in kfold.split(X,y):
  X_train,X_valid = X.iloc[train_idx],X.iloc[valid_idx]
  y_train,y_valid = y.iloc[train_idx],y.iloc[valid_idx]

  # Fixed seed so the fold scores are reproducible between runs.
  model = RandomForestClassifier(random_state=42)

  model.fit(X_train,y_train)

  # Keep the fitted model of every fold.
  models.append(model)

  y_pred = model.predict(X_valid)
  accs.append(ACC(y_valid,y_pred))
  # make_plot() reads the y_valid / y_pred globals assigned just above.
  make_plot()

  # Misclassification table: count of each (true grade, wrong prediction) pair.
  pred_ = pd.DataFrame({'valid':y_valid.to_numpy(),'pred':y_pred})
  pred_['T/F'] = pred_['valid']==pred_['pred']
  print(pred_[pred_['T/F']==False].value_counts().sort_index())

# Summarize the collected per-fold accuracies.
print('mean ACC over folds:',np.mean(accs))

png

valid  pred  T/F  
3      5     False      3
       6     False      2
4      5     False     10
       6     False      6
5      4     False      5
       6     False    105
       7     False     10
6      4     False      3
       5     False     64
       7     False     55
7      5     False      4
       6     False     26
8      6     False      6
9      7     False      1
dtype: int64

png

valid  pred  T/F  
3      5     False      1
       6     False      3
       7     False      1
4      5     False      4
       6     False      8
5      4     False      3
       6     False    104
       7     False      5
6      4     False      1
       5     False     73
       7     False     61
       8     False      2
7      5     False      2
       6     False     26
8      7     False      2
9      6     False      1
dtype: int64

png

valid  pred  T/F  
3      5     False     3
       6     False     2
4      5     False    10
       6     False     4
       7     False     2
5      4     False     5
       6     False    85
       7     False     8
6      4     False     2
       5     False    96
       7     False    48
7      5     False     2
       6     False    16
8      7     False     2
9      7     False     1
dtype: int64

png

valid  pred  T/F  
3      5     False     5
       6     False     1
4      5     False    10
       6     False     6
5      4     False     4
       6     False    96
       7     False     7
6      4     False     2
       5     False    99
       7     False    44
7      5     False     2
       6     False    22
8      7     False    10
9      7     False     1
dtype: int64

png

valid  pred  T/F  
3      5     False     3
       6     False     2
4      5     False     4
       6     False     4
5      4     False     3
       6     False    88
       7     False     5
6      4     False     3
       5     False    78
       7     False    61
       8     False     1
7      6     False    24
8      6     False     2
       7     False     6
9      7     False     1
dtype: int64

1.모델링

# Rebuild train / test / submission from the raw CSVs through the three helper steps:
# load + oversample + encode, add the engineered test columns, then scale.
train,test,submission = data_preprocessing()
test = test_preprocessing(test)
train,test = scaling(train,test)

    26
   372
  1788
  2416
  3696
   304
     5
Name: quality, dtype: int64

# Model inputs: every column after 'index' and 'quality', minus the string 'type' column.
features = train.columns[2:].drop(['type'])
# Alternative feature subsets that were tried (kept for reference):
# features = train.columns[2:].drop(['type','sulphates','total sulfur dioxide','residual sugar','free sulfur dioxide','citric acid','pH','fixed acidity'])
# features = train.columns[2:].drop(['type','total sulfur dioxide','pH'])
# features = train.columns[2:].drop(['type','new_col',	'new_col2'	,'new_col3'])
# features = train.columns[2:].drop(['type','white','red'])
features

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'red', 'white', 'new_col', 'new_col2',
       'new_col3'],
      dtype='object')

test

	index	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	type	red	white	new_col	new_col2	new_col3
0	0	0.429752	0.153333	0.289157	0.092025	0.056572	0.034722	0.154378	0.128976	0.125984	0.089888	0.521739	white	0	1	0.113120	0.172815	0.416000
1	1	0.785124	0.233333	0.349398	0.019939	0.101498	0.048611	0.078341	0.256217	0.251969	0.151685	0.144928	red	1	0	0.102319	0.475767	0.788211
2	2	0.223140	0.133333	0.162651	0.070552	0.051581	0.149306	0.398618	0.148255	0.354331	0.264045	0.202899	white	0	1	0.157233	0.301288	0.202947
3	3	0.280992	0.046667	0.234940	0.018405	0.056572	0.069444	0.352535	0.148255	0.614173	0.140449	0.289855	white	0	1	0.043567	0.147682	0.250947
4	4	0.247934	0.120000	0.156627	0.021472	0.016639	0.078125	0.152074	0.063621	0.330709	0.140449	0.550725	white	0	1	0.132492	0.410115	0.226526
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
995	995	0.272727	0.340000	0.012048	0.026074	0.121464	0.079861	0.202765	0.199152	0.637795	0.174157	0.246377	red	1	0	0.361746	0.314131	0.279579
996	996	0.404959	0.046667	0.180723	0.015337	0.061564	0.097222	0.285714	0.136688	0.377953	0.089888	0.260870	white	0	1	0.025211	0.270585	0.377263
997	997	0.413223	0.386667	0.156627	0.016871	0.108153	0.010417	0.039171	0.192597	0.322835	0.292135	0.173913	red	1	0	0.320362	0.204186	0.428632
998	998	0.264463	0.226667	0.114458	0.026074	0.103161	0.059028	0.069124	0.147484	0.511811	0.191011	0.420290	red	1	0	0.243711	0.644588	0.256842
999	999	0.388430	0.086667	0.156627	0.132669	0.041597	0.250000	0.313364	0.142472	0.244094	0.084270	0.492754	white	0	1	0.063356	0.663610	0.365474

1000 rows × 18 columns

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Feature matrix and target used for tuning.
X, y = train[features], train['quality']

# Three untuned tree ensembles. Their order in `models` is the order the
# later cells index them by.
rfc, gbc, etc = RandomForestClassifier(), GradientBoostingClassifier(), ExtraTreesClassifier()
models = [rfc, gbc, etc]

models

[RandomForestClassifier(),
 GradientBoostingClassifier(),
 ExtraTreesClassifier()]

# Candidate value lists for the grid search. Only grid_n_estimator,
# grid_criterion, grid_max_depth and grid_seed are referenced by `params`
# below; the remaining lists are not used in this cell.
grid_n_estimator = [100,150, 300]
grid_ratio = [.1, .25, .5, .75, 1.0]
grid_learn = [.01, .03, .05, .1, .25]
grid_max_depth = [2,3, 4, 6, 8, None]
grid_min_samples = [5, 10, .03, .05, .10]
grid_criterion = ['gini', 'entropy']
grid_bool = [True, False]
grid_seed = [0]

# best_models maps a model's position in `models` to its tuned estimator.
best_models = {}
# params[i] is the search grid for models[i], so the order of the dicts must
# match the order in which `models` was built.
params=[
    {
        # Grid for models[0].
        # Reference: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
        'n_estimators': grid_n_estimator, # number of trees
        'criterion': grid_criterion, # split quality measure
        'max_depth': grid_max_depth, # None means no depth limit is passed
        'oob_score': [True,False], # NOTE(review): doubles the number of fits; presumably only adds an out-of-bag score — confirm
        'random_state': grid_seed
    },
    {   
        # Grid for models[1]; kept small (two values per parameter).
        # Reference: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier
        # Not searched: 'loss': ['deviance', 'exponential']
        'learning_rate': [0.1,0.05],
        'n_estimators': [100,300],
        # Not searched: 'criterion': ['friedman_mse', 'mse', 'mae']
        'max_depth': [2,3],
        'random_state': grid_seed
    },
    {
      # Grid for models[2].
      'n_estimators': grid_n_estimator, # number of trees
      'criterion': grid_criterion, # split quality measure
      'max_depth': [2,3, 4, None], # None means no depth limit is passed
      'random_state': grid_seed  
    }
    ]

for i,model in enumerate(models):
  # Search params[i] with cv=7; `model` is rebound to the search object here.
  model = GridSearchCV(model,param_grid = params[i], cv=7, return_train_score = True, verbose=2)

  model.fit(X,y)

  # Keep the best estimator found for this model type.
  best_models[i] = model.best_estimator_

best_models

{0: RandomForestClassifier(n_estimators=300, oob_score=True, random_state=0),
 1: GradientBoostingClassifier(n_estimators=300, random_state=0),
 2: ExtraTreesClassifier(criterion='entropy', n_estimators=300, random_state=0)}

# Label predictions of the three tuned models on the test set, one column per model.
pred0, pred1, pred2 = (best_models[k].predict(test[features]) for k in range(3))
pred = pd.DataFrame({'pred0': pred0, 'pred1': pred1, 'pred2': pred2})
pred

	pred0	pred1	pred2
0	6	7	6
1	6	6	6
2	6	5	6
3	5	6	5
4	6	7	6
...	...	...	...
995	6	5	6
996	6	6	6
997	5	5	5
998	6	6	6
999	6	6	6

1000 rows × 3 columns

# Hard voting: the most frequent label among the three base predictions.
# The columns are selected explicitly so that re-running this cell after
# 'pred_hard' / 'pred_soft' exist does not let those columns vote too.
# When all three models disagree, mode() returns the labels sorted and [0]
# picks the smallest one.
pred['pred_hard'] = pred[['pred0','pred1','pred2']].mode(axis=1)[0].astype(int)
pred

	pred0	pred1	pred2	pred_hard
0	6	7	6	6
1	6	6	6	6
2	6	5	6	6
3	5	6	5	5
4	6	7	6	6
...	...	...	...	...
995	6	5	6	6
996	6	6	6	6
997	5	5	5	5
998	6	6	6	6
999	6	6	6	6

1000 rows × 4 columns

# Soft voting: average the class-probability matrices of the three models.
# Note that pred0..pred2 are rebound from label arrays to probability matrices.
pred0 = best_models[0].predict_proba(test[features])
pred1 = best_models[1].predict_proba(test[features])
pred2 = best_models[2].predict_proba(test[features])
# The sum must be parenthesized: `pred0+pred1+pred2/3` divides only pred2, which
# gives the third model one third of the weight of the other two.
pred_soft = pd.DataFrame((pred0+pred1+pred2)/3)
pred_soft.head()

	0	1	2	3	4	5	6
0	1.556208e-02	0.233188	0.584622	0.831922	0.615731	0.052308	1.165414e-08
1	1.111334e-02	0.309182	0.622020	1.293987	0.093276	0.003755	1.073159e-09
2	3.333911e-03	0.061468	0.947784	1.156639	0.146282	0.017827	8.611661e-11
3	1.035068e-06	0.101571	1.034493	0.979610	0.207639	0.010019	2.414811e-08
4	2.141515e-22	0.047925	0.109071	0.922461	1.193087	0.048566	1.222222e-02

# Turn the arg-max column position back into a class label through classes_
# instead of a hard-coded +3 offset, which is only right while the labels are
# exactly 3, 4, 5, ... with no gaps. All three models were fitted on the same
# y, so the first model's classes_ is used for the lookup.
pred['pred_soft'] = best_models[0].classes_[np.argmax(np.array(pred_soft), axis=1)]
pred

	pred0	pred1	pred2	pred_hard	pred_soft
0	6	7	6	6	6
1	6	6	6	6	6
2	6	5	6	6	6
3	5	6	5	5	5
4	6	7	6	6	7
...	...	...	...	...	...
995	6	5	6	6	5
996	6	6	6	6	6
997	5	5	5	5	5
998	6	6	6	6	6
999	6	6	6	6	6

1000 rows × 5 columns

# Rows on which hard voting and soft voting disagree.
pred.loc[pred['pred_hard'].ne(pred['pred_soft'])]

	pred0	pred1	pred2	pred_hard	pred_soft
11	5	6	5	5	6
28	6	5	6	6	5
37	6	8	5	5	8
53	4	5	4	4	5
80	6	7	6	6	7
...	...	...	...	...	...
938	4	5	4	4	5
939	6	7	6	6	7
961	6	3	6	6	3
967	6	5	6	6	5
980	6	3	6	6	3

90 rows × 5 columns

# Write one submission file per voting scheme. The same `submission` frame is
# reused, so its 'quality' column is overwritten between the two writes.
submission['quality'] = pred['pred_hard']
submission.to_csv('submission_hard.csv',index=False)

submission['quality'] = pred['pred_soft']
submission.to_csv('submission_soft.csv',index=False)

이전[Dacon] 구내식당 식수 예측

다음[Dacon] 유전체 정보 분류 경진대회

	pred0	pred1	pred2
0	6	7	6
1	6	6	6
2	6	5	6
3	5	6	5
4	6	7	6
...	...	...	...
995	6	5	6
996	6	6	6
997	5	5	5
998	6	6	6
999	6	6	6

	pred0	pred1	pred2	pred_hard
0	6	7	6	6
1	6	6	6	6
2	6	5	6	6
3	5	6	5	5
4	6	7	6	6
...	...	...	...	...
995	6	5	6	6
996	6	6	6	6
997	5	5	5	5
998	6	6	6	6
999	6	6	6	6

	pred0	pred1	pred2	pred_hard	pred_soft
0	6	7	6	6	6
1	6	6	6	6	6
2	6	5	6	6	6
3	5	6	5	5	5
4	6	7	6	6	7
...	...	...	...	...	...
995	6	5	6	6	5
996	6	6	6	6	6
997	5	5	5	5	5
998	6	6	6	6	6
999	6	6	6	6	6

	pred0	pred1	pred2	pred_hard	pred_soft
11	5	6	5	5	6
28	6	5	6	6	5
37	6	8	5	5	8
53	4	5	4	4	5
80	6	7	6	6	7
...	...	...	...	...	...
938	4	5	4	4	5
939	6	7	6	6	7
961	6	3	6	6	3
967	6	5	6	6	5
980	6	3	6	6	3

	pred0	pred1	pred2
0	6	7	6
1	6	6	6
2	6	5	6
3	5	6	5
4	6	7	6
...	...	...	...
995	6	5	6
996	6	6	6
997	5	5	5
998	6	6	6
999	6	6	6

	pred0	pred1	pred2	pred_hard
0	6	7	6	6
1	6	6	6	6
2	6	5	6	6
3	5	6	5	5
4	6	7	6	6
...	...	...	...	...
995	6	5	6	6
996	6	6	6	6
997	5	5	5	5
998	6	6	6	6
999	6	6	6	6

	pred0	pred1	pred2	pred_hard	pred_soft
0	6	7	6	6	6
1	6	6	6	6	6
2	6	5	6	6	6
3	5	6	5	5	5
4	6	7	6	6	7
...	...	...	...	...	...
995	6	5	6	6	5
996	6	6	6	6	6
997	5	5	5	5	5
998	6	6	6	6	6
999	6	6	6	6	6

	pred0	pred1	pred2	pred_hard	pred_soft
11	5	6	5	5	6
28	6	5	6	6	5
37	6	8	5	5	8
53	4	5	4	4	5
80	6	7	6	6	7
...	...	...	...	...	...
938	4	5	4	4	5
939	6	7	6	6	7
961	6	3	6	6	3
967	6	5	6	6	5
980	6	3	6	6	3

	pred0	pred1	pred2
0	6	7	6
1	6	6	6
2	6	5	6
3	5	6	5
4	6	7	6
...	...	...	...
995	6	5	6
996	6	6	6
997	5	5	5
998	6	6	6
999	6	6	6

	pred0	pred1	pred2	pred_hard
0	6	7	6	6
1	6	6	6	6
2	6	5	6	6
3	5	6	5	5
4	6	7	6	6
...	...	...	...	...
995	6	5	6	6
996	6	6	6	6
997	5	5	5	5
998	6	6	6	6
999	6	6	6	6

	pred0	pred1	pred2	pred_hard	pred_soft
0	6	7	6	6	6
1	6	6	6	6	6
2	6	5	6	6	6
3	5	6	5	5	5
4	6	7	6	6	7
...	...	...	...	...	...
995	6	5	6	6	5
996	6	6	6	6	6
997	5	5	5	5	5
998	6	6	6	6	6
999	6	6	6	6	6

	pred0	pred1	pred2	pred_hard	pred_soft
11	5	6	5	5	6
28	6	5	6	6	5
37	6	8	5	5	8
53	4	5	4	4	5
80	6	7	6	6	7
...	...	...	...	...	...
938	4	5	4	4	5
939	6	7	6	6	7
961	6	3	6	6	3
967	6	5	6	6	5
980	6	3	6	6	3