[Dacon] 와인 품질 분류 경진대회

데이콘 실무역량 강화교육 과정에서 진행한 와인 품질 분류 프로젝트입니다.

성능 향상을 위해 앙상블기법을 활용해 보았습니다.

또한 하드 보팅과 소프트 보팅에 대해 배웠습니다.

0.준비

1) 데이터 로드

from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
import matplotlib.pyplot as plt
import pandas as pd
plt.rc('font',family = 'NanumBarunGothic')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import numpy as np
# Load the competition files from the mounted Google Drive folder:
# the labelled training set, the unlabelled test set and the submission template.
train = pd.read_csv('/content/drive/MyDrive/와인1/train.csv')
test = pd.read_csv('/content/drive/MyDrive/와인1/test.csv')
submission = pd.read_csv('/content/drive/MyDrive/와인1/sample_submission.csv')
# Display the raw training frame as the cell output.
train
index quality fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol type
0 0 5 5.6 0.695 0.06 6.8 0.042 9.0 84.0 0.99432 3.44 0.44 10.2 white
1 1 5 8.8 0.610 0.14 2.4 0.067 10.0 42.0 0.99690 3.19 0.59 9.5 red
2 2 5 7.9 0.210 0.39 2.0 0.057 21.0 138.0 0.99176 3.05 0.52 10.9 white
3 3 6 7.0 0.210 0.31 6.0 0.046 29.0 108.0 0.99390 3.26 0.50 10.8 white
4 4 6 7.8 0.400 0.26 9.5 0.059 32.0 178.0 0.99550 3.04 0.43 10.9 white
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5492 5492 5 7.7 0.150 0.29 1.3 0.029 10.0 64.0 0.99320 3.35 0.39 10.1 white
5493 5493 6 6.3 0.180 0.36 1.2 0.034 26.0 111.0 0.99074 3.16 0.51 11.0 white
5494 5494 7 7.8 0.150 0.34 1.1 0.035 31.0 93.0 0.99096 3.07 0.72 11.3 white
5495 5495 5 6.6 0.410 0.31 1.6 0.042 18.0 101.0 0.99195 3.13 0.41 10.5 white
5496 5496 6 7.0 0.350 0.17 1.1 0.049 7.0 119.0 0.99297 3.13 0.36 9.7 white

5497 rows × 14 columns

data_preprocessing()
3      26
4     372
5    1788
6    2416
7    3696
8     304
9       5
Name: quality, dtype: int64
index quality fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol type red white new_col new_col2 new_col3
0 0 5 5.6 0.695 0.06 6.8 0.042 9.0 84.0 0.99432 3.44 0.44 10.2 white 0 1 0.124107 0.107143 6.295
1 1 5 8.8 0.610 0.14 2.4 0.067 10.0 42.0 0.99690 3.19 0.59 9.5 red 1 0 0.069318 0.238095 9.410
2 2 5 7.9 0.210 0.39 2.0 0.057 21.0 138.0 0.99176 3.05 0.52 10.9 white 0 1 0.026582 0.152174 8.110
3 3 6 7.0 0.210 0.31 6.0 0.046 29.0 108.0 0.99390 3.26 0.50 10.8 white 0 1 0.030000 0.268519 7.210
4 4 6 7.8 0.400 0.26 9.5 0.059 32.0 178.0 0.99550 3.04 0.43 10.9 white 0 1 0.051282 0.179775 8.200

2) 전처리

import numpy as np

def ACC(true,pred):
  """Return the fraction of positions at which ``true`` and ``pred`` agree.

  Both inputs are converted to NumPy arrays before comparing, so the
  comparison is always element-wise and positional. This makes plain lists
  work (``[1, 2] == [1, 3]`` on lists is a single ``False``, not a mask) and
  lets a pandas Series be compared against an array or another Series
  regardless of its index labels.

  Args:
    true: array-like of ground-truth labels.
    pred: array-like of predicted labels, same length as ``true``.

  Returns:
    The mean of the element-wise equality mask, i.e. the accuracy.
  """
  true_values = np.asarray(true)
  pred_values = np.asarray(pred)
  score = np.mean(true_values == pred_values)
  return score
def make_plot():
  """Plot real vs. predicted class counts for the current validation fold.

  Reads the module-level ``y_valid`` (true labels) and ``y_pred``
  (predictions) that the cross-validation loop assigns before calling this
  function, and draws a grouped bar chart of how often each quality grade
  occurs in each. The title shows the fold accuracy truncated to six
  characters.
  """
  acc = ACC(y_valid,y_pred)

  validation = pd.DataFrame({'y_valid':y_valid,'y_pred':y_pred})

  # Build the count table from a dict of the two value_counts Series: the
  # column names are then fixed by the dict keys (pandas >= 2.0 names a
  # value_counts result 'count', which broke the old column lookup) and the
  # rows are the union of the classes seen in either column, so a class that
  # is only predicted, or only present in the labels, still gets a row.
  validation_count = pd.DataFrame({
      'y_valid': validation['y_valid'].value_counts(),
      'y_pred': validation['y_pred'].value_counts(),
  }).sort_index()
  # A class missing from one side has no count there; plot it as zero.
  validation_count = validation_count.fillna(0)

  x = validation_count.index
  y_valid_count = validation_count['y_valid']
  y_pred_count = validation_count['y_pred']

  # Width of one bar; the two bars of a class sit side by side around its tick.
  width = 0.35

  plt.figure(dpi=150)

  plt.title('ACC: '+str(acc)[:6])
  plt.xlabel('quality')
  plt.ylabel('count')

  plt.bar([idx-width/2 for idx in x],y_valid_count,width,label = 'real')
  plt.bar([idx+width/2 for idx in x],y_pred_count,width,label = 'pred')

  plt.legend()
  plt.show()

원핫인코딩

# Oversample: duplicate every training row whose quality is 4, 7 or 8.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
train = pd.concat([train, train[train['quality'].isin([4,7,8])]])

# One-hot encode the wine type; the dummy columns are appended to the right
# and the original 'type' column is kept.
train = pd.concat([train,pd.get_dummies(train['type'])],axis=1)
test = pd.concat([test,pd.get_dummies(test['type'])],axis=1)
train
index quality fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol type red white
0 0 5 5.6 0.695 0.06 6.8 0.042 9.0 84.0 0.99432 3.44 0.44 10.2 white 0 1
1 1 5 8.8 0.610 0.14 2.4 0.067 10.0 42.0 0.99690 3.19 0.59 9.5 red 1 0
2 2 5 7.9 0.210 0.39 2.0 0.057 21.0 138.0 0.99176 3.05 0.52 10.9 white 0 1
3 3 6 7.0 0.210 0.31 6.0 0.046 29.0 108.0 0.99390 3.26 0.50 10.8 white 0 1
4 4 6 7.8 0.400 0.26 9.5 0.059 32.0 178.0 0.99550 3.04 0.43 10.9 white 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5481 5481 7 5.4 0.835 0.08 1.2 0.046 13.0 93.0 0.99240 3.57 0.85 13.0 red 1 0
5482 5482 4 6.2 0.430 0.49 6.4 0.045 12.0 115.0 0.99630 3.27 0.57 9.0 white 0 1
5485 5485 4 6.3 0.280 0.22 9.5 0.040 30.0 111.0 0.99338 3.05 0.31 10.8 white 0 1
5491 5491 7 6.9 0.340 0.30 4.7 0.029 34.0 148.0 0.99165 3.36 0.49 12.3 white 0 1
5494 5494 7 7.8 0.150 0.34 1.1 0.035 31.0 93.0 0.99096 3.07 0.72 11.3 white 0 1

6759 rows × 16 columns

feature engineering

def make_plots(text):
  """Draw a bar chart of the mean of column ``text`` per quality grade.

  Reads the module-level ``train`` frame.

  Args:
    text: name of the ``train`` column to average within each quality grade.
  """
  plt.title(text + ' vs quality')

  # Aggregate only the requested column, once. Averaging the whole frame
  # also hits the string column 'type', which raises on pandas >= 2.0, and
  # the original computed that full aggregation twice.
  mean_by_quality = train.groupby('quality')[text].mean()
  x = mean_by_quality.index
  y = mean_by_quality.values

  plt.bar(x,y)
  plt.show()
# Engineered features on the training frame:
#   new_col  - volatile acidity divided by fixed acidity
#   new_col2 - free sulfur dioxide divided by total sulfur dioxide
#   new_col3 - volatile acidity plus fixed acidity
train['new_col'] = train['volatile acidity'].div(train['fixed acidity'])
train['new_col2'] = train['free sulfur dioxide'].div(train['total sulfur dioxide'])
train['new_col3'] = train['volatile acidity'].add(train['fixed acidity'])
train
index quality fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol type red white new_col new_col2 new_col3
0 0 5 5.6 0.695 0.06 6.8 0.042 9.0 84.0 0.99432 3.44 0.44 10.2 white 0 1 0.124107 0.107143 6.295
1 1 5 8.8 0.610 0.14 2.4 0.067 10.0 42.0 0.99690 3.19 0.59 9.5 red 1 0 0.069318 0.238095 9.410
2 2 5 7.9 0.210 0.39 2.0 0.057 21.0 138.0 0.99176 3.05 0.52 10.9 white 0 1 0.026582 0.152174 8.110
3 3 6 7.0 0.210 0.31 6.0 0.046 29.0 108.0 0.99390 3.26 0.50 10.8 white 0 1 0.030000 0.268519 7.210
4 4 6 7.8 0.400 0.26 9.5 0.059 32.0 178.0 0.99550 3.04 0.43 10.9 white 0 1 0.051282 0.179775 8.200
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5481 5481 7 5.4 0.835 0.08 1.2 0.046 13.0 93.0 0.99240 3.57 0.85 13.0 red 1 0 0.154630 0.139785 6.235
5482 5482 4 6.2 0.430 0.49 6.4 0.045 12.0 115.0 0.99630 3.27 0.57 9.0 white 0 1 0.069355 0.104348 6.630
5485 5485 4 6.3 0.280 0.22 9.5 0.040 30.0 111.0 0.99338 3.05 0.31 10.8 white 0 1 0.044444 0.270270 6.580
5491 5491 7 6.9 0.340 0.30 4.7 0.029 34.0 148.0 0.99165 3.36 0.49 12.3 white 0 1 0.049275 0.229730 7.240
5494 5494 7 7.8 0.150 0.34 1.1 0.035 31.0 93.0 0.99096 3.07 0.72 11.3 white 0 1 0.019231 0.333333 7.950

6759 rows × 19 columns

# Correlation of every numeric column with the target. The stray leading '|'
# was a syntax error. The frame is restricted to numeric/bool columns first
# because correlating the string column 'type' raises on pandas >= 2.0
# (bool is included so the one-hot 'red'/'white' columns are kept there too).
train.select_dtypes(include=['number','bool']).corr()['quality'].to_frame()
quality
index -0.007835
quality 1.000000
fixed acidity -0.079555
volatile acidity -0.282066
citric acid 0.091287
residual sugar -0.028293
chlorides -0.215320
free sulfur dioxide 0.082963
total sulfur dioxide -0.034012
density -0.316107
pH 0.016610
sulphates 0.048132
alcohol 0.460041
red -0.120606
white 0.120606
new_col -0.244637
new_col2 0.135119
new_col3 -0.111243

스케일링

# Same three engineered features as on the training frame, computed for test:
# an acidity ratio, a sulfur-dioxide ratio and an acidity sum.
test['new_col'] = test['volatile acidity'].div(test['fixed acidity'])
test['new_col2'] = test['free sulfur dioxide'].div(test['total sulfur dioxide'])
test['new_col3'] = test['volatile acidity'].add(test['fixed acidity'])
test
index fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol type red white new_col new_col2 new_col3
0 0 9.0 0.31 0.48 6.60 0.043 11.0 73.0 0.99380 2.90 0.38 11.6 white 0 1 0.034444 0.150685 9.31
1 1 13.3 0.43 0.58 1.90 0.070 15.0 40.0 1.00040 3.06 0.49 9.0 red 1 0 0.032331 0.375000 13.73
2 2 6.5 0.28 0.27 5.20 0.040 44.0 179.0 0.99480 3.19 0.69 9.4 white 0 1 0.043077 0.245810 6.78
3 3 7.2 0.15 0.39 1.80 0.043 21.0 159.0 0.99480 3.52 0.47 10.0 white 0 1 0.020833 0.132075 7.35
4 4 6.8 0.26 0.26 2.00 0.019 23.5 72.0 0.99041 3.16 0.47 11.8 white 0 1 0.038235 0.326389 7.06
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 995 7.1 0.59 0.02 2.30 0.082 24.0 94.0 0.99744 3.55 0.53 9.7 red 1 0 0.083099 0.255319 7.69
996 996 8.7 0.15 0.30 1.60 0.046 29.0 130.0 0.99420 3.22 0.38 9.8 white 0 1 0.017241 0.223077 8.85
997 997 8.8 0.66 0.26 1.70 0.074 4.0 23.0 0.99710 3.15 0.74 9.2 red 1 0 0.075000 0.173913 9.46
998 998 7.0 0.42 0.19 2.30 0.071 18.0 36.0 0.99476 3.39 0.56 10.9 red 1 0 0.060000 0.500000 7.42
999 999 8.5 0.21 0.26 9.25 0.034 73.0 142.0 0.99450 3.05 0.37 11.4 white 0 1 0.024706 0.514085 8.71

1000 rows × 18 columns

# scaler = StandardScaler()

# train[train.columns[2:-6]] = scaler.fit_transform(train[train.columns[2:-6]])
# train[train.columns[-3:]] = scaler.fit_transform(train[train.columns[-3:]])

# Min-max scale the raw measurement columns. The scaler is fitted on train
# only and then applied unchanged to test.
scaler = MinMaxScaler()

train[train.columns[2:-6]] = scaler.fit_transform(train[train.columns[2:-6]])
test[test.columns[1:-6]] = scaler.transform(test[test.columns[1:-6]])

# The three engineered columns get their own scaler, again fitted on train.
# The original re-fitted on test (fit_transform), which put the test features
# on a different scale than the one the models are trained on.
new_col_scaler = MinMaxScaler()
train[train.columns[-3:]] = new_col_scaler.fit_transform(train[train.columns[-3:]])
test[test.columns[-3:]] = new_col_scaler.transform(test[test.columns[-3:]])

3) 성능 검증

def data_preprocessing(data_dir='/content/drive/MyDrive/와인1'):
  """Load the competition files and prepare the training frame.

  Steps applied:
    1. Read ``train.csv``, ``test.csv`` and ``sample_submission.csv`` from
       ``data_dir``.
    2. Oversample train by row duplication: rows with quality 4, 7 or 8 are
       appended once, then all rows with quality 7 (already doubled) are
       appended again, so grades 4 and 8 end up doubled and grade 7
       quadrupled.
    3. One-hot encode ``type`` on both train and test (the original column
       is kept).
    4. Add the engineered columns ``new_col``, ``new_col2`` and ``new_col3``
       to train only; test gets them from ``test_preprocessing``.

  The class distribution after oversampling is printed.

  Args:
    data_dir: folder holding the three CSV files. Defaults to the Google
      Drive folder used by this notebook.

  Returns:
    Tuple ``(train, test, submission)`` of DataFrames.
  """
  train = pd.read_csv(data_dir + '/train.csv')
  test = pd.read_csv(data_dir + '/test.csv')
  submission = pd.read_csv(data_dir + '/sample_submission.csv')

  # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
  train = pd.concat([train, train[train['quality'].isin([4,7,8])]])
  train = pd.concat([train, train[train['quality'].isin([7])]])

  train = pd.concat([train, pd.get_dummies(train['type'])],axis=1)
  test = pd.concat([test, pd.get_dummies(test['type'])],axis=1)

  train['new_col'] = train['volatile acidity'] / train['fixed acidity']
  train['new_col2'] = train['free sulfur dioxide'] / train['total sulfur dioxide']
  train['new_col3'] = train['volatile acidity'] + train['fixed acidity']

  print(train['quality'].value_counts().sort_index())

  return train,test,submission
def test_preprocessing(test):
  test['new_col'] = test['volatile acidity'] / test['fixed acidity']
  test['new_col2'] = test['free sulfur dioxide'] / test['total sulfur dioxide']
  test['new_col3'] = test['volatile acidity'] + test['fixed acidity']
  return test
def scaling(train,test):
  """Min-max scale the feature columns of ``train`` and ``test`` in place.

  Two groups are scaled, each with a scaler fitted on train only and then
  applied to test: the raw measurement columns (``train.columns[2:-6]``) and
  the three engineered columns (``train.columns[-3:]``). The original
  re-fitted the engineered-column scaler on test, which put the test features
  on a different scale than the one the models are trained on.

  The columns are selected by position in ``train`` and then looked up by
  name in ``test``, so ``test`` must contain columns with the same names.

  Args:
    train: training DataFrame laid out as two leading non-feature columns,
      the raw features, three columns that are left unscaled, then the three
      engineered columns.
    test: test DataFrame containing the same feature columns.

  Returns:
    Tuple ``(train, test)`` — the same objects, modified.
  """
  raw_cols = train.columns[2:-6]
  engineered_cols = train.columns[-3:]

  raw_scaler = MinMaxScaler()
  train[raw_cols] = raw_scaler.fit_transform(train[raw_cols])
  test[raw_cols] = raw_scaler.transform(test[raw_cols])

  engineered_scaler = MinMaxScaler()
  train[engineered_cols] = engineered_scaler.fit_transform(train[engineered_cols])
  # transform, not fit_transform: test must reuse the statistics learned on train.
  test[engineered_cols] = engineered_scaler.transform(test[engineered_cols])
  return train,test
data_preprocessing()
3      26
4     372
5    1788
6    2416
7    3696
8     304
9       5
Name: quality, dtype: int64





(      index  quality  fixed acidity  volatile acidity  citric acid  \
 0         0        5            5.6             0.695         0.06   
 1         1        5            8.8             0.610         0.14   
 2         2        5            7.9             0.210         0.39   
 3         3        6            7.0             0.210         0.31   
 4         4        6            7.8             0.400         0.26   
 ...     ...      ...            ...               ...          ...   
 5456   5456        7            5.9             0.170         0.29   
 5466   5466        7            6.0             0.290         0.41   
 5481   5481        7            5.4             0.835         0.08   
 5491   5491        7            6.9             0.340         0.30   
 5494   5494        7            7.8             0.150         0.34   
 
       residual sugar  chlorides  free sulfur dioxide  total sulfur dioxide  \
 0                6.8      0.042                  9.0                  84.0   
 1                2.4      0.067                 10.0                  42.0   
 2                2.0      0.057                 21.0                 138.0   
 3                6.0      0.046                 29.0                 108.0   
 4                9.5      0.059                 32.0                 178.0   
 ...              ...        ...                  ...                   ...   
 5456             3.1      0.030                 32.0                 123.0   
 5466            10.8      0.048                 55.0                 149.0   
 5481             1.2      0.046                 13.0                  93.0   
 5491             4.7      0.029                 34.0                 148.0   
 5494             1.1      0.035                 31.0                  93.0   
 
       density    pH  sulphates  alcohol   type  red  white   new_col  \
 0     0.99432  3.44       0.44     10.2  white    0      1  0.124107   
 1     0.99690  3.19       0.59      9.5    red    1      0  0.069318   
 2     0.99176  3.05       0.52     10.9  white    0      1  0.026582   
 3     0.99390  3.26       0.50     10.8  white    0      1  0.030000   
 4     0.99550  3.04       0.43     10.9  white    0      1  0.051282   
 ...       ...   ...        ...      ...    ...  ...    ...       ...   
 5456  0.98913  3.41       0.33     13.7  white    0      1  0.028814   
 5466  0.99370  3.09       0.59     11.0  white    0      1  0.048333   
 5481  0.99240  3.57       0.85     13.0    red    1      0  0.154630   
 5491  0.99165  3.36       0.49     12.3  white    0      1  0.049275   
 5494  0.99096  3.07       0.72     11.3  white    0      1  0.019231   
 
       new_col2  new_col3  
 0     0.107143     6.295  
 1     0.238095     9.410  
 2     0.152174     8.110  
 3     0.268519     7.210  
 4     0.179775     8.200  
 ...        ...       ...  
 5456  0.260163     6.070  
 5466  0.369128     6.290  
 5481  0.139785     6.235  
 5491  0.229730     7.240  
 5494  0.333333     7.950  
 
 [8607 rows x 19 columns],
      index  fixed acidity  volatile acidity  citric acid  residual sugar  \
 0        0            9.0              0.31         0.48            6.60   
 1        1           13.3              0.43         0.58            1.90   
 2        2            6.5              0.28         0.27            5.20   
 3        3            7.2              0.15         0.39            1.80   
 4        4            6.8              0.26         0.26            2.00   
 ..     ...            ...               ...          ...             ...   
 995    995            7.1              0.59         0.02            2.30   
 996    996            8.7              0.15         0.30            1.60   
 997    997            8.8              0.66         0.26            1.70   
 998    998            7.0              0.42         0.19            2.30   
 999    999            8.5              0.21         0.26            9.25   
 
      chlorides  free sulfur dioxide  total sulfur dioxide  density    pH  \
 0        0.043                 11.0                  73.0  0.99380  2.90   
 1        0.070                 15.0                  40.0  1.00040  3.06   
 2        0.040                 44.0                 179.0  0.99480  3.19   
 3        0.043                 21.0                 159.0  0.99480  3.52   
 4        0.019                 23.5                  72.0  0.99041  3.16   
 ..         ...                  ...                   ...      ...   ...   
 995      0.082                 24.0                  94.0  0.99744  3.55   
 996      0.046                 29.0                 130.0  0.99420  3.22   
 997      0.074                  4.0                  23.0  0.99710  3.15   
 998      0.071                 18.0                  36.0  0.99476  3.39   
 999      0.034                 73.0                 142.0  0.99450  3.05   
 
      sulphates  alcohol   type  red  white  
 0         0.38     11.6  white    0      1  
 1         0.49      9.0    red    1      0  
 2         0.69      9.4  white    0      1  
 3         0.47     10.0  white    0      1  
 4         0.47     11.8  white    0      1  
 ..         ...      ...    ...  ...    ...  
 995       0.53      9.7    red    1      0  
 996       0.38      9.8  white    0      1  
 997       0.74      9.2    red    1      0  
 998       0.56     10.9    red    1      0  
 999       0.37     11.4  white    0      1  
 
 [1000 rows x 15 columns],
      index  quality
 0        0        0
 1        1        0
 2        2        0
 3        3        0
 4        4        0
 ..     ...      ...
 995    995        0
 996    996        0
 997    997        0
 998    998        0
 999    999        0
 
 [1000 rows x 2 columns])
train
index quality fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol type red white new_col new_col2 new_col3
0 0 5 0.148760 0.410000 0.036145 0.095092 0.054908 0.027778 0.179724 0.139001 0.551181 0.123596 0.318841 white 0 1 0.574215 0.100202 0.179466
1 1 5 0.413223 0.353333 0.084337 0.027607 0.096506 0.031250 0.082949 0.188741 0.354331 0.207865 0.217391 red 1 0 0.295792 0.257310 0.435318
2 2 5 0.338843 0.086667 0.234940 0.021472 0.079867 0.069444 0.304147 0.089647 0.244094 0.168539 0.420290 white 0 1 0.078620 0.154228 0.328542
3 3 6 0.264463 0.086667 0.186747 0.082822 0.061564 0.097222 0.235023 0.130904 0.409449 0.157303 0.405797 white 0 1 0.095988 0.293810 0.254620
4 4 6 0.330579 0.213333 0.156627 0.136503 0.083195 0.107639 0.396313 0.161751 0.236220 0.117978 0.420290 white 0 1 0.204138 0.187342 0.335934
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5481 5481 7 0.132231 0.503333 0.048193 0.009202 0.061564 0.041667 0.200461 0.101986 0.653543 0.353933 0.724638 red 1 0 0.729321 0.139364 0.174538
5482 5482 4 0.198347 0.233333 0.295181 0.088957 0.059900 0.038194 0.251152 0.177174 0.417323 0.196629 0.144928 white 0 1 0.295979 0.096849 0.206982
5485 5485 4 0.206612 0.133333 0.132530 0.136503 0.051581 0.100694 0.241935 0.120879 0.244094 0.050562 0.405797 white 0 1 0.169391 0.295911 0.202875
5491 5491 7 0.256198 0.173333 0.180723 0.062883 0.033278 0.114583 0.327189 0.087527 0.488189 0.151685 0.623188 white 0 1 0.193940 0.247274 0.257084
5494 5494 7 0.330579 0.046667 0.204819 0.007669 0.043261 0.104167 0.200461 0.074224 0.259843 0.280899 0.478261 white 0 1 0.041262 0.371570 0.315400

6759 rows × 19 columns

test
index fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol type red white new_col new_col2 new_col3
0 0 0.429752 0.153333 0.289157 0.092025 0.056572 0.034722 0.154378 0.128976 0.125984 0.089888 0.521739 white 0 1 0.113120 0.172815 0.416000
1 1 0.785124 0.233333 0.349398 0.019939 0.101498 0.048611 0.078341 0.256217 0.251969 0.151685 0.144928 red 1 0 0.102319 0.475767 0.788211
2 2 0.223140 0.133333 0.162651 0.070552 0.051581 0.149306 0.398618 0.148255 0.354331 0.264045 0.202899 white 0 1 0.157233 0.301288 0.202947
3 3 0.280992 0.046667 0.234940 0.018405 0.056572 0.069444 0.352535 0.148255 0.614173 0.140449 0.289855 white 0 1 0.043567 0.147682 0.250947
4 4 0.247934 0.120000 0.156627 0.021472 0.016639 0.078125 0.152074 0.063621 0.330709 0.140449 0.550725 white 0 1 0.132492 0.410115 0.226526
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 995 0.272727 0.340000 0.012048 0.026074 0.121464 0.079861 0.202765 0.199152 0.637795 0.174157 0.246377 red 1 0 0.361746 0.314131 0.279579
996 996 0.404959 0.046667 0.180723 0.015337 0.061564 0.097222 0.285714 0.136688 0.377953 0.089888 0.260870 white 0 1 0.025211 0.270585 0.377263
997 997 0.413223 0.386667 0.156627 0.016871 0.108153 0.010417 0.039171 0.192597 0.322835 0.292135 0.173913 red 1 0 0.320362 0.204186 0.428632
998 998 0.264463 0.226667 0.114458 0.026074 0.103161 0.059028 0.069124 0.147484 0.511811 0.191011 0.420290 red 1 0 0.243711 0.644588 0.256842
999 999 0.388430 0.086667 0.156627 0.132669 0.041597 0.250000 0.313364 0.142472 0.244094 0.084270 0.492754 white 0 1 0.063356 0.663610 0.365474

1000 rows × 18 columns

# Rebuild the frames from disk and run the full preprocessing chain.
# test_preprocessing and scaling take the frames as arguments and return the
# processed frames, so the results have to be passed along and re-bound;
# calling them with no arguments raises TypeError.
train,test,submission = data_preprocessing()
test = test_preprocessing(test)
train,test = scaling(train,test)
3      26
4     372
5    1788
6    2416
7    3696
8     304
9       5
Name: quality, dtype: int64
# Predictors: every column after the first two ('index', 'quality'),
# minus the raw string column 'type' (its one-hot columns are kept).
candidate_columns = train.columns[2:]
features = candidate_columns.drop(labels=['type'])
# Alternative feature subsets that were tried:
# features = train.columns[2:].drop(['type','sulphates','total sulfur dioxide','residual sugar','free sulfur dioxide','citric acid','pH','fixed acidity'])
# features = train.columns[2:].drop(['type','total sulfur dioxide','pH'])
# features = train.columns[2:].drop(['type','new_col',	'new_col2'	,'new_col3'])
# features = train.columns[2:].drop(['type','white','red'])
features
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'red', 'white', 'new_col', 'new_col2',
       'new_col3'],
      dtype='object')
X = train[features]
y = train['quality']
from sklearn.model_selection import StratifiedKFold
# 5-fold stratified split with a fixed seed so the folds are reproducible.
# NOTE(review): train contains duplicated (oversampled) rows, so copies of the
# same wine can land in both the training and validation part of a fold and
# inflate these scores - confirm whether that is acceptable.
kfold = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

models = []
accs = []
for train_idx, valid_idx in kfold.split(X,y):
  X_train,X_valid = X.iloc[train_idx],X.iloc[valid_idx]
  y_train,y_valid = y.iloc[train_idx],y.iloc[valid_idx]

  model = RandomForestClassifier()

  model.fit(X_train,y_train)

  # Added: keep each fold's fitted model.
  models.append(model)

  y_pred = model.predict(X_valid)
  accs.append(ACC(y_valid,y_pred))
  # make_plot reads the y_valid / y_pred just assigned above.
  make_plot()

  # Table of (true label, predicted label) pairs, built by column name rather
  # than through the positional DataFrame(data, index) form.
  pred_ = pd.DataFrame({'valid': np.asarray(y_valid), 'pred': y_pred})
  pred_['T/F'] = pred_['valid']==pred_['pred']
  # Count each kind of misclassification in this fold.
  print(pred_[~pred_['T/F']].value_counts().sort_index())

# Report the fold accuracies that were collected but never shown.
print('fold ACC:', accs)
print('mean ACC:', np.mean(accs))

png

valid  pred  T/F  
3      5     False      3
       6     False      2
4      5     False     10
       6     False      6
5      4     False      5
       6     False    105
       7     False     10
6      4     False      3
       5     False     64
       7     False     55
7      5     False      4
       6     False     26
8      6     False      6
9      7     False      1
dtype: int64

png

valid  pred  T/F  
3      5     False      1
       6     False      3
       7     False      1
4      5     False      4
       6     False      8
5      4     False      3
       6     False    104
       7     False      5
6      4     False      1
       5     False     73
       7     False     61
       8     False      2
7      5     False      2
       6     False     26
8      7     False      2
9      6     False      1
dtype: int64

png

valid  pred  T/F  
3      5     False     3
       6     False     2
4      5     False    10
       6     False     4
       7     False     2
5      4     False     5
       6     False    85
       7     False     8
6      4     False     2
       5     False    96
       7     False    48
7      5     False     2
       6     False    16
8      7     False     2
9      7     False     1
dtype: int64

png

valid  pred  T/F  
3      5     False     5
       6     False     1
4      5     False    10
       6     False     6
5      4     False     4
       6     False    96
       7     False     7
6      4     False     2
       5     False    99
       7     False    44
7      5     False     2
       6     False    22
8      7     False    10
9      7     False     1
dtype: int64

png

valid  pred  T/F  
3      5     False     3
       6     False     2
4      5     False     4
       6     False     4
5      4     False     3
       6     False    88
       7     False     5
6      4     False     3
       5     False    78
       7     False    61
       8     False     1
7      6     False    24
8      6     False     2
       7     False     6
9      7     False     1
dtype: int64

1.모델링

# Rebuild everything from disk for the final models:
# load + oversample + one-hot encode + engineer train features ...
train,test,submission = data_preprocessing()
# ... add the same engineered features to test ...
test = test_preprocessing(test)
# ... and min-max scale the feature columns of both frames.
train,test = scaling(train,test)
3      26
4     372
5    1788
6    2416
7    3696
8     304
9       5
Name: quality, dtype: int64
# Predictors: every column after the first two ('index', 'quality'),
# minus the raw string column 'type' (its one-hot columns are kept).
candidate_columns = train.columns[2:]
features = candidate_columns.drop(labels=['type'])
# Alternative feature subsets that were tried:
# features = train.columns[2:].drop(['type','sulphates','total sulfur dioxide','residual sugar','free sulfur dioxide','citric acid','pH','fixed acidity'])
# features = train.columns[2:].drop(['type','total sulfur dioxide','pH'])
# features = train.columns[2:].drop(['type','new_col',	'new_col2'	,'new_col3'])
# features = train.columns[2:].drop(['type','white','red'])
features
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'red', 'white', 'new_col', 'new_col2',
       'new_col3'],
      dtype='object')
test
index fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol type red white new_col new_col2 new_col3
0 0 0.429752 0.153333 0.289157 0.092025 0.056572 0.034722 0.154378 0.128976 0.125984 0.089888 0.521739 white 0 1 0.113120 0.172815 0.416000
1 1 0.785124 0.233333 0.349398 0.019939 0.101498 0.048611 0.078341 0.256217 0.251969 0.151685 0.144928 red 1 0 0.102319 0.475767 0.788211
2 2 0.223140 0.133333 0.162651 0.070552 0.051581 0.149306 0.398618 0.148255 0.354331 0.264045 0.202899 white 0 1 0.157233 0.301288 0.202947
3 3 0.280992 0.046667 0.234940 0.018405 0.056572 0.069444 0.352535 0.148255 0.614173 0.140449 0.289855 white 0 1 0.043567 0.147682 0.250947
4 4 0.247934 0.120000 0.156627 0.021472 0.016639 0.078125 0.152074 0.063621 0.330709 0.140449 0.550725 white 0 1 0.132492 0.410115 0.226526
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 995 0.272727 0.340000 0.012048 0.026074 0.121464 0.079861 0.202765 0.199152 0.637795 0.174157 0.246377 red 1 0 0.361746 0.314131 0.279579
996 996 0.404959 0.046667 0.180723 0.015337 0.061564 0.097222 0.285714 0.136688 0.377953 0.089888 0.260870 white 0 1 0.025211 0.270585 0.377263
997 997 0.413223 0.386667 0.156627 0.016871 0.108153 0.010417 0.039171 0.192597 0.322835 0.292135 0.173913 red 1 0 0.320362 0.204186 0.428632
998 998 0.264463 0.226667 0.114458 0.026074 0.103161 0.059028 0.069124 0.147484 0.511811 0.191011 0.420290 red 1 0 0.243711 0.644588 0.256842
999 999 0.388430 0.086667 0.156627 0.132669 0.041597 0.250000 0.313364 0.142472 0.244094 0.084270 0.492754 white 0 1 0.063356 0.663610 0.365474

1000 rows × 18 columns

# Design matrix and target used for the hyper-parameter search.
y = train['quality']
X = train[features]
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

# One untuned estimator per ensemble family; the list order must match the
# order of the parameter grids used in the grid search.
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()
etc = ExtraTreesClassifier()
models = [rfc, gbc, etc]
models
[RandomForestClassifier(),
 GradientBoostingClassifier(),
 ExtraTreesClassifier()]
# Candidate hyper-parameter values. Not every list is used by the grids below.
grid_n_estimator = [100,150, 300]
grid_ratio = [.1, .25, .5, .75, 1.0]
grid_learn = [.01, .03, .05, .1, .25]
grid_max_depth = [2,3, 4, 6, 8, None]
grid_min_samples = [5, 10, .03, .05, .10]
grid_criterion = ['gini', 'entropy']
grid_bool = [True, False]
grid_seed = [0]
best_models = {}

# RandomForestClassifier grid.
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
rf_grid = {
    'n_estimators': grid_n_estimator,
    'criterion': grid_criterion,
    'max_depth': grid_max_depth,
    'oob_score': [True, False],
    'random_state': grid_seed,
}

# GradientBoostingClassifier grid, kept small to limit runtime;
# 'loss' and 'criterion' are left at their defaults.
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
gb_grid = {
    'learning_rate': [0.1, 0.05],
    'n_estimators': [100, 300],
    'max_depth': [2, 3],
    'random_state': grid_seed,
}

# ExtraTreesClassifier grid.
et_grid = {
    'n_estimators': grid_n_estimator,
    'criterion': grid_criterion,
    'max_depth': [2, 3, 4, None],
    'random_state': grid_seed,
}

# Same order as `models`: random forest, gradient boosting, extra trees.
params = [rf_grid, gb_grid, et_grid]

# Exhaustive 7-fold grid search per model; keep the refitted best estimator
# under the model's position in `models`.
for i, (estimator, grid) in enumerate(zip(models, params)):
  search = GridSearchCV(estimator, param_grid=grid, cv=7, return_train_score=True, verbose=2)
  search.fit(X, y)
  best_models[i] = search.best_estimator_

best_models
{0: RandomForestClassifier(n_estimators=300, oob_score=True, random_state=0),
 1: GradientBoostingClassifier(n_estimators=300, random_state=0),
 2: ExtraTreesClassifier(criterion='entropy', n_estimators=300, random_state=0)}
# Hard class predictions of the three tuned models on the test features,
# collected side by side (one column per model).
test_features = test[features]
pred0, pred1, pred2 = (best_models[k].predict(test_features) for k in range(3))
pred = pd.DataFrame({'pred0':pred0,'pred1':pred1,'pred2':pred2})
pred
pred0 pred1 pred2
0 6 7 6
1 6 6 6
2 6 5 6
3 5 6 5
4 6 7 6
... ... ... ...
995 6 5 6
996 6 6 6
997 5 5 5
998 6 6 6
999 6 6 6

1000 rows × 3 columns

# Hard voting: row-wise majority label across the three model columns only.
# Selecting the columns explicitly keeps the vote correct if this cell is
# re-run after 'pred_hard' / 'pred_soft' have been added to `pred`.
# When all three models disagree, mode() lists every label in ascending
# order, so column 0 is the smallest of the three labels.
pred['pred_hard'] = pred[['pred0','pred1','pred2']].mode(axis=1)[0].astype(int)
pred
pred0 pred1 pred2 pred_hard
0 6 7 6 6
1 6 6 6 6
2 6 5 6 6
3 5 6 5 5
4 6 7 6 6
... ... ... ... ...
995 6 5 6 6
996 6 6 6 6
997 5 5 5 5
998 6 6 6 6
999 6 6 6 6

1000 rows × 4 columns

# Class-probability predictions of the three tuned models on the test features.
pred0 = best_models[0].predict_proba(test[features])
pred1 = best_models[1].predict_proba(test[features])
pred2 = best_models[2].predict_proba(test[features])
# Soft voting: average the three probability matrices. The sum needs the
# parentheses - without them only pred2 was divided by 3, so the third model
# carried a third of the weight of the other two and rows did not sum to 1.
pred_soft = pd.DataFrame((pred0+pred1+pred2)/3)
pred_soft.head()
0 1 2 3 4 5 6
0 1.556208e-02 0.233188 0.584622 0.831922 0.615731 0.052308 1.165414e-08
1 1.111334e-02 0.309182 0.622020 1.293987 0.093276 0.003755 1.073159e-09
2 3.333911e-03 0.061468 0.947784 1.156639 0.146282 0.017827 8.611661e-11
3 1.035068e-06 0.101571 1.034493 0.979610 0.207639 0.010019 2.414811e-08
4 2.141515e-22 0.047925 0.109071 0.922461 1.193087 0.048566 1.222222e-02
# Pick the most probable class per row and map the column position back to
# its label through the fitted estimator's classes_, instead of assuming the
# labels start at 3 and are contiguous. All three models were fitted on the
# same target, so they share the same class order.
pred['pred_soft'] = best_models[0].classes_[np.argmax(pred_soft.to_numpy(), axis=1)]
pred
pred0 pred1 pred2 pred_hard pred_soft
0 6 7 6 6 6
1 6 6 6 6 6
2 6 5 6 6 6
3 5 6 5 5 5
4 6 7 6 6 7
... ... ... ... ... ...
995 6 5 6 6 5
996 6 6 6 6 6
997 5 5 5 5 5
998 6 6 6 6 6
999 6 6 6 6 6

1000 rows × 5 columns

pred[pred['pred_hard']!=pred['pred_soft']]
pred0 pred1 pred2 pred_hard pred_soft
11 5 6 5 5 6
28 6 5 6 6 5
37 6 8 5 5 8
53 4 5 4 4 5
80 6 7 6 6 7
... ... ... ... ... ...
938 4 5 4 4 5
939 6 7 6 6 7
961 6 3 6 6 3
967 6 5 6 6 5
980 6 3 6 6 3

90 rows × 5 columns

# Write one submission file per voting scheme. The 'quality' column of the
# submission template is overwritten before each write.
submission['quality'] = pred['pred_hard']
submission.to_csv('submission_hard.csv',index=False)
submission['quality'] = pred['pred_soft']
submission.to_csv('submission_soft.csv',index=False)