[Dacon] Genomic Information Classification Competition

Dacon Genomic Information Breed Classification AI Competition

Import

import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
class CFG:
    SEED = 42
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG.SEED) # fix the seed for reproducibility

Data Load

from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
train = pd.read_csv('/content/drive/MyDrive/data/유전체 정보 품종 분류/train.csv')
test = pd.read_csv('/content/drive/MyDrive/data/유전체 정보 품종 분류/test.csv')
train.head()
id father mother gender trait SNP_01 SNP_02 SNP_03 SNP_04 SNP_05 ... SNP_07 SNP_08 SNP_09 SNP_10 SNP_11 SNP_12 SNP_13 SNP_14 SNP_15 class
0 TRAIN_000 0 0 0 2 G G A G A A G A C A ... A A G G A A G G A G A A A A A A A A B
1 TRAIN_001 0 0 0 2 A G A G C A A A A A ... A A G A A A A G A A G A G G A A A A C
2 TRAIN_002 0 0 0 2 G G G G A A G A C C ... A A G A G A A G A A A A A A A A A A B
3 TRAIN_003 0 0 0 1 A A G G A A G A A A ... G G A A G G A G G G G G G G A A G G A
4 TRAIN_004 0 0 0 2 G G G G C C A A C C ... A A A A A A G G A A A A A G A A G A C

5 rows × 21 columns
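Before preprocessing, a quick look at the class balance is worthwhile, since the data set is small (a minimal sketch, not part of the original notebook):

# Target distribution; with so few rows, imbalance would matter for the split and the metric.
print(train['class'].value_counts())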

def get_x_y(df):
    if 'class' in df.columns:
        df_x = df.drop(columns=['id', 'class'])
        df_y = df['class']
        return df_x, df_y
    else:
        df_x = df.drop(columns=['id'])
        return df_x
train_x, train_y = get_x_y(train)
test_x = get_x_y(test)
train_x.head()
father mother gender trait SNP_01 SNP_02 SNP_03 SNP_04 SNP_05 SNP_06 SNP_07 SNP_08 SNP_09 SNP_10 SNP_11 SNP_12 SNP_13 SNP_14 SNP_15
0 0 0 0 2 G G A G A A G A C A A A A A G G A A G G A G A A A A A A A A
1 0 0 0 2 A G A G C A A A A A A G A A G A A A A G A A G A G G A A A A
2 0 0 0 2 G G G G A A G A C C G G A A G A G A A G A A A A A A A A A A
3 0 0 0 1 A A G G A A G A A A G G G G A A G G A G G G G G G G A A G G
4 0 0 0 2 G G G G C C A A C C A A A A A A A A G G A A A A A G A A G A

Data Pre-processing

Label-Encoding

class_le = preprocessing.LabelEncoder()
snp_le = preprocessing.LabelEncoder()
snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]
snp_col
['SNP_01',
 'SNP_02',
 'SNP_03',
 'SNP_04',
 'SNP_05',
 'SNP_06',
 'SNP_07',
 'SNP_08',
 'SNP_09',
 'SNP_10',
 'SNP_11',
 'SNP_12',
 'SNP_13',
 'SNP_14',
 'SNP_15']
snp_data = []
for col in snp_col:
    snp_data += list(train_x[col].values)
train_y = class_le.fit_transform(train_y)
snp_le.fit(snp_data)
LabelEncoder()
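Note that snp_le is fitted only on genotypes that appear in the training set, so transform would raise a ValueError on any unseen test genotype. A quick sanity check before transforming (a sketch, assuming the raw test_x loaded above):

# Confirm every SNP genotype in the test set was seen by the fitted encoder.
unseen = set()
for col in snp_col:
    unseen |= set(test_x[col]) - set(snp_le.classes_)
assert not unseen, f'unseen genotypes in test set: {unseen}'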
for col in train_x.columns:
    if col in snp_col:
        train_x[col] = snp_le.transform(train_x[col])
        test_x[col] = snp_le.transform(test_x[col])
train_x.head()
father mother gender trait SNP_01 SNP_02 SNP_03 SNP_04 SNP_05 SNP_06 SNP_07 SNP_08 SNP_09 SNP_10 SNP_11 SNP_12 SNP_13 SNP_14 SNP_15
0 0 0 0 2 5 1 0 4 2 0 0 5 0 5 1 0 0 0 0
1 0 0 0 2 1 1 2 0 0 1 0 4 0 1 0 4 5 0 0
2 0 0 0 2 5 5 0 4 3 5 0 4 4 1 0 0 0 0 0
3 0 0 0 1 0 5 0 4 0 5 5 0 5 1 5 5 5 0 5
4 0 0 0 2 5 5 3 0 3 0 0 0 0 5 0 0 1 0 4
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=42)
x_train.head()
father mother gender trait SNP_01 SNP_02 SNP_03 SNP_04 SNP_05 SNP_06 SNP_07 SNP_08 SNP_09 SNP_10 SNP_11 SNP_12 SNP_13 SNP_14 SNP_15
179 0 0 0 1 1 5 0 4 2 1 4 0 4 1 1 5 5 3 5
115 0 0 0 2 5 1 2 0 0 1 0 4 0 5 1 4 1 0 0
96 0 0 0 2 1 1 3 4 0 0 0 5 0 5 0 4 5 2 4
233 0 0 0 2 5 5 0 0 2 0 0 0 0 1 0 0 5 0 4
60 0 0 0 2 5 1 3 0 3 5 0 5 0 5 1 0 5 0 0
print(train_x.shape)
print(x_train.shape)
(262, 19)
(209, 19)
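With only 262 training rows, a stratified split would keep the class ratios identical across the two partitions; the notebook uses a plain random split, but a variant like the following could be tried:

# Optional variant (not used below): preserve class proportions in the split.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_x, train_y, test_size=0.2, random_state=42, stratify=train_y)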
def data_preprocessing():
  train = pd.read_csv('/content/drive/MyDrive/data/유전체 정보 품종 분류/train.csv')
  test = pd.read_csv('/content/drive/MyDrive/data/유전체 정보 품종 분류/test.csv')

  train_x, train_y = get_x_y(train)
  test_x = get_x_y(test)
  
  class_le = preprocessing.LabelEncoder()  # local to this function; submit() below reuses the global class_le fitted earlier
  snp_le = preprocessing.LabelEncoder()
  snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]

  snp_data = []
  for col in snp_col:
      snp_data += list(train_x[col].values)

  train_y = class_le.fit_transform(train_y)
  snp_le.fit(snp_data)

  for col in train_x.columns:
      if col in snp_col:
          train_x[col] = snp_le.transform(train_x[col])
          test_x[col] = snp_le.transform(test_x[col])

  x_train,x_valid,y_train,y_valid = train_test_split(train_x,train_y,test_size = 0.2,random_state=42)

  return x_train,x_valid,y_train,y_valid,test_x

Data Engineering

Feature Selection

Feature Importance

model = RandomForestClassifier(random_state=CFG.SEED)
model.fit(x_train, y_train)
RandomForestClassifier(random_state=42)
from sklearn.metrics import f1_score

y_pred = model.predict(x_valid)
# f1_score(y_valid, y_pred, average='macro')

def make_plot():
  # Uses the globals y_valid and y_pred set by the most recent model.predict call.
  f1_sco = f1_score(y_valid, y_pred, average='macro')

  validation = pd.DataFrame({'y_valid': y_valid, 'y_pred': y_pred})

  # Count how often each class appears among the true and the predicted labels.
  validation_count = pd.DataFrame(validation['y_valid'].value_counts().sort_index())
  pred_count = validation['y_pred'].value_counts().sort_index()
  validation_count.loc[pred_count.index, 'y_pred'] = pred_count.values
  validation_count = validation_count.fillna(0)

  x = validation_count.index
  y_valid_count = validation_count['y_valid']
  y_pred_count = validation_count['y_pred']

  width = 0.35

  plt.figure(dpi=100)

  plt.title('F1-score: '+str(f1_sco)[:6])
  plt.xlabel('class')
  plt.ylabel('count')

  p1 = plt.bar([idx-width/2 for idx in x],y_valid_count,width,label = 'real')
  p2 = plt.bar([idx+width/2 for idx in x],y_pred_count,width,label = 'pred')

  plt.legend()
  plt.show()
make_plot()

[Figure: bar chart of true vs. predicted class counts on the validation set, titled with the macro F1-score]

The plot shows the model confusing classes 1 and 2.
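A confusion matrix shows this directly (a minimal sketch, reusing y_valid and y_pred from above):

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Rows are true classes, columns are predictions; off-diagonal cells are the confusions.
cm = confusion_matrix(y_valid, y_pred)
ConfusionMatrixDisplay(cm, display_labels=class_le.classes_).plot()
plt.show()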

feature_imp = pd.DataFrame(model.feature_importances_,index = train_x.columns,columns = ['importance'])
feature_imp.sort_values(by='importance',ascending=False)
importance
trait 0.149997
SNP_05 0.110934
SNP_10 0.098379
SNP_07 0.087005
SNP_08 0.083492
SNP_04 0.071618
SNP_12 0.056224
SNP_14 0.047945
SNP_03 0.047389
SNP_02 0.046950
SNP_01 0.046350
SNP_15 0.038371
SNP_11 0.035263
SNP_13 0.034316
SNP_09 0.028967
SNP_06 0.016798
mother 0.000000
gender 0.000000
father 0.000000

Many features turn out to have very low importance.

Let's exclude those features, retrain, and check the performance. (The importances are also easier to scan as the bar chart sketched below.)
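A sketch of that bar chart, using the feature_imp DataFrame above:

# Horizontal bar chart of feature importances, largest at the top.
feature_imp.sort_values(by='importance').plot.barh(legend=False)
plt.xlabel('importance')
plt.tight_layout()
plt.show()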

# top 5 features by importance
feature_5 = feature_imp.sort_values(by='importance',ascending=False).index[:5]
feature_5
Index(['trait', 'SNP_05', 'SNP_10', 'SNP_07', 'SNP_08'], dtype='object')
x_train,x_valid,y_train,y_valid,test_x = data_preprocessing()

x_train = x_train[feature_5]
x_valid = x_valid[feature_5]

# use the same random forest model for a fair comparison
model = RandomForestClassifier(random_state=CFG.SEED)
model.fit(x_train, y_train)
y_pred = model.predict(x_valid)
make_plot()

[Figure: true vs. predicted class counts using only the top-5 features, titled with the macro F1-score]

Let's find how many features to include in training.

score_list = []
for i in range(len(feature_imp)-1, 0, -1):
  # top-i features by importance
  feature_tmp = feature_imp.sort_values(by='importance', ascending=False).index[:i]
  x_train, x_valid, y_train, y_valid, test_x = data_preprocessing()

  x_train = x_train[feature_tmp]
  x_valid = x_valid[feature_tmp]

  # use the same random forest model for a fair comparison
  model = RandomForestClassifier(random_state=CFG.SEED)
  model.fit(x_train, y_train)

  y_pred = model.predict(x_valid)
  score_list.append((i, f1_score(y_valid, y_pred, average='macro')))
score_list
[(18, 0.9821551132463967),
 (17, 0.9821551132463967),
 (16, 1.0),
 (15, 0.9638888888888889),
 (14, 0.9451090781140042),
 (13, 0.9821551132463967),
 (12, 0.9638888888888889),
 (11, 0.9638888888888889),
 (10, 0.9638888888888889),
 (9, 0.9451090781140042),
 (8, 0.9464653397391901),
 (7, 0.9464653397391901),
 (6, 0.9464653397391901),
 (5, 0.9257142857142857),
 (4, 0.9646739130434782),
 (3, 0.9277777777777777),
 (2, 0.7947368421052632),
 (1, 0.5806451612903226)]

Using the top 16 features gives the best performance.
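Plotting the scores against the feature count makes the comparison easier to read (a sketch using score_list from above):

# Validation macro F1 versus the number of top features kept.
ns, scores = zip(*score_list)
plt.figure(dpi=100)
plt.plot(ns, scores, marker='o')
plt.xlabel('number of features')
plt.ylabel('macro F1')
plt.show()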

feature_tmp = feature_imp.sort_values(by='importance', ascending=False).index[:16]  # top 16 features
x_train,x_valid,y_train,y_valid,test_x = data_preprocessing()

x_train = x_train[feature_tmp]
x_valid = x_valid[feature_tmp]


model = RandomForestClassifier(random_state=CFG.SEED)
model.fit(x_train, y_train)

y_pred = model.predict(x_valid)
f1_score(y_valid, y_pred, average='macro')
1.0

Validation performance looks good, so train on the full data set and submit.

# submission helper; reuses the global class_le fitted on the training labels
def submit(file_name, pred):
  submit = pd.read_csv('/content/drive/MyDrive/data/유전체 정보 품종 분류/sample_submission.csv')
  submit['class'] = class_le.inverse_transform(pred)
  submit.to_csv(f'/content/drive/MyDrive/Colab Notebooks/dacon/{file_name}.csv', index=False)
  print('Done')
model = RandomForestClassifier(random_state=CFG.SEED)
model.fit(train_x[feature_tmp], train_y)

pred = model.predict(test_x[feature_tmp])
submit('rf_16_feature', pred)
Done

Checked the result, but it scored the same as the baseline code's submission.

Applying the same idea to a Ridge model

from sklearn.linear_model import RidgeClassifier

x_train,x_valid,y_train,y_valid,test_x = data_preprocessing()

model = RidgeClassifier(random_state=CFG.SEED)
model.fit(x_train, y_train)

pd.DataFrame(abs(model.coef_))  # absolute coefficients: one row per class, one column per feature
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
0 0.0 0.0 0.0 1.515546 0.008958 0.010171 0.011026 0.011176 0.006956 0.002418 0.023188 0.007393 0.013038 0.007025 0.013207 0.012130 0.006696 0.019875 0.005807
1 0.0 0.0 0.0 0.352045 0.040708 0.055363 0.086292 0.110782 0.237609 0.005510 0.124689 0.114634 0.083659 0.115097 0.043387 0.019054 0.052472 0.110547 0.042401
2 0.0 0.0 0.0 1.867591 0.031750 0.065533 0.075266 0.121959 0.230653 0.003092 0.101501 0.107241 0.096697 0.108072 0.056594 0.006924 0.045776 0.090672 0.036594
# per-class feature effect: |coefficient| scaled by the feature's mean value
average_feature_effects = abs(model.coef_) * np.asarray(x_train.mean(axis=0)).ravel()
feature_list = []
for i in range(3):
  # add each class's top 10 features by effect size to the training feature set
  tmp = np.argsort(average_feature_effects[i])[-10:][::-1]
  for j in tmp:
    feature_list.append(j)
feature_list = list(set(feature_list))
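# Optional check (not in the original): map the selected indices back to column names.
print(x_train.columns[feature_list].tolist())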

x_train,x_valid,y_train,y_valid,test_x = data_preprocessing()

x_train = x_train[x_train.columns[feature_list]]
x_valid = x_valid[x_valid.columns[feature_list]]

model = RidgeClassifier(random_state=CFG.SEED)
model.fit(x_train, y_train)

y_pred = model.predict(x_valid)
f1_score(y_valid, y_pred, average='macro')
0.9821551132463967
make_plot()

[Figure: true vs. predicted class counts for the Ridge model, titled with the macro F1-score]

Validation performance is not as strong, but submit anyway to check the leaderboard result.

model = RidgeClassifier(random_state=CFG.SEED)
model.fit(train_x[train_x.columns[feature_list]], train_y)

pred = model.predict(test_x[test_x.columns[feature_list]])
submit('ridge_10_feature',pred)
Done

The submission scored 98, about a 2-point improvement!

It looks worth pursuing further performance gains through feature selection with the Ridge model.
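One way to pursue that is scikit-learn's SelectFromModel wrapped around the RidgeClassifier (a minimal sketch; the 'median' threshold is an assumption to tune, not something tried above):

from sklearn.feature_selection import SelectFromModel

x_train, x_valid, y_train, y_valid, test_x = data_preprocessing()

# Keep features whose aggregated |coefficient| across classes exceeds the median.
selector = SelectFromModel(RidgeClassifier(random_state=CFG.SEED), threshold='median')
selector.fit(x_train, y_train)

model = RidgeClassifier(random_state=CFG.SEED)
model.fit(selector.transform(x_train), y_train)

y_pred = model.predict(selector.transform(x_valid))
print(f1_score(y_valid, y_pred, average='macro'))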