Dacon Genome Information Breed Classification AI Competition
Import
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
class CFG:
    SEED = 42

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(CFG.SEED)  # fix the random seed for reproducibility
Data Load
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
train = pd.read_csv('/content/drive/MyDrive/data/유전체 정보 품종 분류/train.csv')
test = pd.read_csv('/content/drive/MyDrive/data/유전체 정보 품종 분류/test.csv')
train.head()
 | id | father | mother | gender | trait | SNP_01 | SNP_02 | SNP_03 | SNP_04 | SNP_05 | ... | SNP_07 | SNP_08 | SNP_09 | SNP_10 | SNP_11 | SNP_12 | SNP_13 | SNP_14 | SNP_15 | class
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | TRAIN_000 | 0 | 0 | 0 | 2 | G G | A G | A A | G A | C A | ... | A A | G G | A A | G G | A G | A A | A A | A A | A A | B |
1 | TRAIN_001 | 0 | 0 | 0 | 2 | A G | A G | C A | A A | A A | ... | A A | G A | A A | A G | A A | G A | G G | A A | A A | C |
2 | TRAIN_002 | 0 | 0 | 0 | 2 | G G | G G | A A | G A | C C | ... | A A | G A | G A | A G | A A | A A | A A | A A | A A | B |
3 | TRAIN_003 | 0 | 0 | 0 | 1 | A A | G G | A A | G A | A A | ... | G G | A A | G G | A G | G G | G G | G G | A A | G G | A |
4 | TRAIN_004 | 0 | 0 | 0 | 2 | G G | G G | C C | A A | C C | ... | A A | A A | A A | G G | A A | A A | A G | A A | G A | C |
5 rows × 21 columns
def get_x_y(df):
    # Split a dataframe into features and target; the test set has no 'class' column
    if 'class' in df.columns:
        df_x = df.drop(columns=['id', 'class'])
        df_y = df['class']
        return df_x, df_y
    else:
        df_x = df.drop(columns=['id'])
        return df_x
train_x, train_y = get_x_y(train)
test_x = get_x_y(test)
train_x.head()
 | father | mother | gender | trait | SNP_01 | SNP_02 | SNP_03 | SNP_04 | SNP_05 | SNP_06 | SNP_07 | SNP_08 | SNP_09 | SNP_10 | SNP_11 | SNP_12 | SNP_13 | SNP_14 | SNP_15
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 0 | 0 | 2 | G G | A G | A A | G A | C A | A A | A A | G G | A A | G G | A G | A A | A A | A A | A A |
1 | 0 | 0 | 0 | 2 | A G | A G | C A | A A | A A | A G | A A | G A | A A | A G | A A | G A | G G | A A | A A |
2 | 0 | 0 | 0 | 2 | G G | G G | A A | G A | C C | G G | A A | G A | G A | A G | A A | A A | A A | A A | A A |
3 | 0 | 0 | 0 | 1 | A A | G G | A A | G A | A A | G G | G G | A A | G G | A G | G G | G G | G G | A A | G G |
4 | 0 | 0 | 0 | 2 | G G | G G | C C | A A | C C | A A | A A | A A | A A | G G | A A | A A | A G | A A | G A |
Data Pre-processing
Label-Encoding
class_le = preprocessing.LabelEncoder()
snp_le = preprocessing.LabelEncoder()
snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]
snp_col
['SNP_01',
'SNP_02',
'SNP_03',
'SNP_04',
'SNP_05',
'SNP_06',
'SNP_07',
'SNP_08',
'SNP_09',
'SNP_10',
'SNP_11',
'SNP_12',
'SNP_13',
'SNP_14',
'SNP_15']
# collect every genotype string across all SNP columns to fit the encoder
snp_data = []
for col in snp_col:
    snp_data += list(train_x[col].values)
train_y = class_le.fit_transform(train_y)
snp_le.fit(snp_data)
LabelEncoder()
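As a sanity check (a small inspection snippet, not in the original notebook), we can print the fitted encoder's genotype-to-integer mapping:

# Show the learned genotype -> integer mapping
print(dict(zip(snp_le.classes_, snp_le.transform(snp_le.classes_))))
# consistent with the encoded tables below:
# {'A A': 0, 'A G': 1, 'C A': 2, 'C C': 3, 'G A': 4, 'G G': 5}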
for col in train_x.columns:
    if col in snp_col:
        train_x[col] = snp_le.transform(train_x[col])
        # transform raises on genotypes unseen during fit; this assumes test has none
        test_x[col] = snp_le.transform(test_x[col])
train_x.head()
 | father | mother | gender | trait | SNP_01 | SNP_02 | SNP_03 | SNP_04 | SNP_05 | SNP_06 | SNP_07 | SNP_08 | SNP_09 | SNP_10 | SNP_11 | SNP_12 | SNP_13 | SNP_14 | SNP_15
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 0 | 0 | 2 | 5 | 1 | 0 | 4 | 2 | 0 | 0 | 5 | 0 | 5 | 1 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 2 | 1 | 1 | 2 | 0 | 0 | 1 | 0 | 4 | 0 | 1 | 0 | 4 | 5 | 0 | 0 |
2 | 0 | 0 | 0 | 2 | 5 | 5 | 0 | 4 | 3 | 5 | 0 | 4 | 4 | 1 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 1 | 0 | 5 | 0 | 4 | 0 | 5 | 5 | 0 | 5 | 1 | 5 | 5 | 5 | 0 | 5 |
4 | 0 | 0 | 0 | 2 | 5 | 5 | 3 | 0 | 3 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 1 | 0 | 4 |
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=42)
x_train.head()
 | father | mother | gender | trait | SNP_01 | SNP_02 | SNP_03 | SNP_04 | SNP_05 | SNP_06 | SNP_07 | SNP_08 | SNP_09 | SNP_10 | SNP_11 | SNP_12 | SNP_13 | SNP_14 | SNP_15
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
179 | 0 | 0 | 0 | 1 | 1 | 5 | 0 | 4 | 2 | 1 | 4 | 0 | 4 | 1 | 1 | 5 | 5 | 3 | 5 |
115 | 0 | 0 | 0 | 2 | 5 | 1 | 2 | 0 | 0 | 1 | 0 | 4 | 0 | 5 | 1 | 4 | 1 | 0 | 0 |
96 | 0 | 0 | 0 | 2 | 1 | 1 | 3 | 4 | 0 | 0 | 0 | 5 | 0 | 5 | 0 | 4 | 5 | 2 | 4 |
233 | 0 | 0 | 0 | 2 | 5 | 5 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 5 | 0 | 4 |
60 | 0 | 0 | 0 | 2 | 5 | 1 | 3 | 0 | 3 | 5 | 0 | 5 | 0 | 5 | 1 | 0 | 5 | 0 | 0 |
print(train_x.shape)
print(x_train.shape)
(262, 19)
(209, 19)
def data_preprocessing():
    # Reload the raw data and repeat the encoding steps above so that
    # every experiment starts from an identical train/validation split.
    train = pd.read_csv('/content/drive/MyDrive/data/유전체 정보 품종 분류/train.csv')
    test = pd.read_csv('/content/drive/MyDrive/data/유전체 정보 품종 분류/test.csv')
    train_x, train_y = get_x_y(train)
    test_x = get_x_y(test)
    class_le = preprocessing.LabelEncoder()
    snp_le = preprocessing.LabelEncoder()
    snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1, 16)]
    snp_data = []
    for col in snp_col:
        snp_data += list(train_x[col].values)
    train_y = class_le.fit_transform(train_y)
    snp_le.fit(snp_data)
    for col in train_x.columns:
        if col in snp_col:
            train_x[col] = snp_le.transform(train_x[col])
            test_x[col] = snp_le.transform(test_x[col])
    x_train, x_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=42)
    return x_train, x_valid, y_train, y_valid, test_x
Data Engineering
Feature Selection
Feature Importance
model = RandomForestClassifier(random_state=CFG.SEED)
model.fit(x_train, y_train)
RandomForestClassifier(random_state=42)
from sklearn.metrics import f1_score
y_pred = model.predict(x_valid)
# f1_score(y_valid, y_pred, average='macro')
def make_plot():
    # Compare per-class counts of true vs. predicted labels on the validation set
    # (reads the module-level y_valid and y_pred)
    f1_sco = f1_score(y_valid, y_pred, average='macro')
    validation = pd.DataFrame({'y_valid': y_valid, 'y_pred': y_pred})
    validation_count = pd.DataFrame(validation['y_valid'].value_counts().sort_index())
    validation_count.loc[validation['y_pred'].value_counts().sort_index().index, 'y_pred'] = validation['y_pred'].value_counts().sort_index().values
    validation_count = validation_count.fillna(0)
    x = validation_count.index
    y_valid_count = validation_count['y_valid']
    y_pred_count = validation_count['y_pred']
    width = 0.35
    plt.figure(dpi=100)
    plt.title('F1-score: ' + str(f1_sco)[:6])
    plt.xlabel('class')
    plt.ylabel('count')
    plt.bar([idx - width/2 for idx in x], y_valid_count, width, label='real')
    plt.bar([idx + width/2 for idx in x], y_pred_count, width, label='pred')
    plt.legend()
    plt.show()
make_plot()
The model confuses classes 1 and 2 (the encoded labels for B and C).
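To quantify this, a confusion matrix over the validation predictions (a minimal sketch using the y_valid and y_pred already in scope; not part of the original notebook):

from sklearn.metrics import confusion_matrix

# rows = true class, columns = predicted class; labels 0/1/2 are the encoded A/B/C
print(confusion_matrix(y_valid, y_pred))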
feature_imp = pd.DataFrame(model.feature_importances_, index=train_x.columns, columns=['importance'])
feature_imp.sort_values(by='importance',ascending=False)
 | importance
---|---
trait | 0.149997 |
SNP_05 | 0.110934 |
SNP_10 | 0.098379 |
SNP_07 | 0.087005 |
SNP_08 | 0.083492 |
SNP_04 | 0.071618 |
SNP_12 | 0.056224 |
SNP_14 | 0.047945 |
SNP_03 | 0.047389 |
SNP_02 | 0.046950 |
SNP_01 | 0.046350 |
SNP_15 | 0.038371 |
SNP_11 | 0.035263 |
SNP_13 | 0.034316 |
SNP_09 | 0.028967 |
SNP_06 | 0.016798 |
mother | 0.000000 |
gender | 0.000000 |
father | 0.000000 |
Many features turn out to have very low importance.
Let's exclude those features, retrain, and check the performance.
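To eyeball the cutoff, a quick bar chart of the importance table above (a minimal sketch, not in the original notebook):

# Bar chart of feature importances, largest first
feature_imp.sort_values(by='importance', ascending=False).plot.bar(legend=False, figsize=(8, 3))
plt.ylabel('importance')
plt.tight_layout()
plt.show()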
# top 5 features by importance
feature_5 = feature_imp.sort_values(by='importance',ascending=False).index[:5]
feature_5
Index(['trait', 'SNP_05', 'SNP_10', 'SNP_07', 'SNP_08'], dtype='object')
x_train,x_valid,y_train,y_valid,test_x = data_preprocessing()
x_train = x_train[feature_5]
x_valid = x_valid[feature_5]
# use the same random forest model for a fair comparison
model = RandomForestClassifier(random_state=CFG.SEED)
model.fit(x_train, y_train)
y_pred = model.predict(x_valid)
make_plot()
Let's find how many of the top features to include in training.
score_list = []
for i in range(len(feature_imp) - 1, 0, -1):
    # keep only the top-i features by importance
    feature_tmp = feature_imp.sort_values(by='importance', ascending=False).index[:i]
    x_train, x_valid, y_train, y_valid, test_x = data_preprocessing()
    x_train = x_train[feature_tmp]
    x_valid = x_valid[feature_tmp]
    # use the same random forest model for a fair comparison
    model = RandomForestClassifier(random_state=CFG.SEED)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_valid)
    score_list.append((i, f1_score(y_valid, y_pred, average='macro')))  # (feature count, macro F1)
score_list
[(18, 0.9821551132463967),
(17, 0.9821551132463967),
(16, 1.0),
(15, 0.9638888888888889),
(14, 0.9451090781140042),
(13, 0.9821551132463967),
(12, 0.9638888888888889),
(11, 0.9638888888888889),
(10, 0.9638888888888889),
(9, 0.9451090781140042),
(8, 0.9464653397391901),
(7, 0.9464653397391901),
(6, 0.9464653397391901),
(5, 0.9257142857142857),
(4, 0.9646739130434782),
(3, 0.9277777777777777),
(2, 0.7947368421052632),
(1, 0.5806451612903226)]
Using the top 16 features gives the best validation score (macro F1 = 1.0).
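Plotting the curve makes the sweet spot easier to see (a minimal sketch over the score_list above; not in the original notebook):

# Validation macro F1 vs. number of top-importance features kept
counts, scores = zip(*score_list)
plt.figure(dpi=100)
plt.plot(counts, scores, marker='o')
plt.xlabel('number of features')
plt.ylabel('macro F1 (validation)')
plt.show()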
# retrain with only the top 16 features
feature_tmp = feature_imp.sort_values(by='importance', ascending=False).index[:16]
x_train, x_valid, y_train, y_valid, test_x = data_preprocessing()
x_train = x_train[feature_tmp]
x_valid = x_valid[feature_tmp]
model = RandomForestClassifier(random_state=CFG.SEED)
model.fit(x_train, y_train)
y_pred = model.predict(x_valid)
f1_score(y_valid, y_pred, average='macro')
1.0
Since the validation score is strong, retrain on the full training data and submit.
# submission helper
def submit(file_name, pred):
    submit = pd.read_csv('/content/drive/MyDrive/data/유전체 정보 품종 분류/sample_submission.csv')
    # map encoded predictions back to the original class labels (A/B/C)
    submit['class'] = class_le.inverse_transform(pred)
    submit.to_csv(f'/content/drive/MyDrive/Colab Notebooks/dacon/{file_name}.csv', index=False)
    print('Done')
# retrain on the full training set with the selected features and predict on test
model = RandomForestClassifier(random_state=CFG.SEED)
model.fit(train_x[feature_tmp], train_y)
pred = model.predict(test_x[feature_tmp])
submit('rf_16_feature', pred)
Done
Checked the submission result, but it scored the same as the baseline code's submission.
Applying the same idea to a Ridge model
from sklearn.linear_model import RidgeClassifier
x_train,x_valid,y_train,y_valid,test_x = data_preprocessing()
model = RidgeClassifier(random_state=CFG.SEED)
model.fit(x_train, y_train)
pd.DataFrame(abs(model.coef_))
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.0 | 0.0 | 0.0 | 1.515546 | 0.008958 | 0.010171 | 0.011026 | 0.011176 | 0.006956 | 0.002418 | 0.023188 | 0.007393 | 0.013038 | 0.007025 | 0.013207 | 0.012130 | 0.006696 | 0.019875 | 0.005807 |
1 | 0.0 | 0.0 | 0.0 | 0.352045 | 0.040708 | 0.055363 | 0.086292 | 0.110782 | 0.237609 | 0.005510 | 0.124689 | 0.114634 | 0.083659 | 0.115097 | 0.043387 | 0.019054 | 0.052472 | 0.110547 | 0.042401 |
2 | 0.0 | 0.0 | 0.0 | 1.867591 | 0.031750 | 0.065533 | 0.075266 | 0.121959 | 0.230653 | 0.003092 | 0.101501 | 0.107241 | 0.096697 | 0.108072 | 0.056594 | 0.006924 | 0.045776 | 0.090672 | 0.036594 |
# per-class average effect of each feature: |coefficient| x mean feature value
average_feature_effects = abs(model.coef_) * np.asarray(x_train.mean(axis=0)).ravel()
feature_list = []
for i in range(3):
    # for each class, add the 10 features with the largest average effect
    tmp = np.argsort(average_feature_effects[i])[-10:][::-1]
    for j in tmp:
        feature_list.append(j)
feature_list = list(set(feature_list))
x_train,x_valid,y_train,y_valid,test_x = data_preprocessing()
x_train = x_train[x_train.columns[feature_list]]
x_valid = x_valid[x_valid.columns[feature_list]]
model = RidgeClassifier(random_state=CFG.SEED)
model.fit(x_train, y_train)
y_pred = model.predict(x_valid)
f1_score(y_valid, y_pred, average='macro')
0.9821551132463967
make_plot()
The validation score is not as strong, but submit anyway to check the leaderboard result.
# retrain the Ridge model on the full training set with the selected features
model = RidgeClassifier(random_state=CFG.SEED)
model.fit(train_x[train_x.columns[feature_list]], train_y)
pred = model.predict(test_x[test_x.columns[feature_list]])
submit('ridge_10_feature', pred)
Done
Scored 98 points, an improvement of about 2 points!
Feature selection via the Ridge model looks worth pursuing for further performance gains.
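As one possible follow-up, scikit-learn's SelectFromModel can wrap RidgeClassifier to pick the top-k features by coefficient magnitude. The sketch below is one way to try it, not code from the original notebook; max_features=10 is an arbitrary illustrative choice, not a tuned value:

from sklearn.feature_selection import SelectFromModel

x_train, x_valid, y_train, y_valid, test_x = data_preprocessing()
# threshold=-np.inf disables the threshold so exactly max_features are kept
selector = SelectFromModel(RidgeClassifier(random_state=CFG.SEED),
                           max_features=10, threshold=-np.inf)
selector.fit(x_train, y_train)
selected = x_train.columns[selector.get_support()]
print(selected)

# evaluate a Ridge model restricted to the selected columns
model = RidgeClassifier(random_state=CFG.SEED)
model.fit(x_train[selected], y_train)
print(f1_score(y_valid, model.predict(x_valid[selected]), average='macro'))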