Data Science

[빅분기] 실기 작업형2 - 분류 모델 (2회기출) 본문

자격증

[빅분기] 실기 작업형2 - 분류 모델 (2회기출)

shinho0902 2021. 12. 3. 14:48

참고 데이터

https://www.kaggle.com/kukuroo3/ecommerce-shipping-data-competition-form

 

Commerce Shipping Data (competition form)

Classification problem

www.kaggle.com

 

 

 

In [1]:
import pandas as pd

x_train = pd.read_csv('../input/ecommerce-shipping-data-competition-form/X_train.csv')
x_test = pd.read_csv('../input/ecommerce-shipping-data-competition-form/X_test.csv')
y_train = pd.read_csv('../input/ecommerce-shipping-data-competition-form/y_train.csv')

# ID 분석 제외 # 
x_test_ID = x_test['ID']
x_test = x_test.drop(columns='ID')
x_train = x_train.drop(columns='ID')
y_train = y_train.drop(columns='ID')


# 결측치 # 
# print(x_train.info()) # 없음



# 범주형 #
# print(x_train['Warehouse_block'].unique()) # 인코딩
# print(x_train['Mode_of_Shipment'].unique()) # 인코딩
# print(x_train['Customer_care_calls'].unique()) # $7 -> 7 # 인코딩 혹은 astype
# print(x_train['Product_importance'].unique()) # 인코딩
# print(x_train['Gender'].unique()) # 인코딩

# Customer_care_calls
x_train['Customer_care_calls'] = x_train['Customer_care_calls'].replace('$7','7')
x_test['Customer_care_calls'] = x_test['Customer_care_calls'].replace('$7','7')
# print(x_train['Customer_care_calls'].unique())

# 라벨 인코딩
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
x_train['Warehouse_block'] = encoder.fit_transform(x_train['Warehouse_block'])
x_train['Mode_of_Shipment'] = encoder.fit_transform(x_train['Mode_of_Shipment'])
x_train['Customer_care_calls'] = encoder.fit_transform(x_train['Customer_care_calls'])
x_train['Product_importance'] = encoder.fit_transform(x_train['Product_importance'])
x_train['Gender'] = encoder.fit_transform(x_train['Gender'])

x_test['Warehouse_block'] = encoder.fit_transform(x_test['Warehouse_block'])
x_test['Mode_of_Shipment'] = encoder.fit_transform(x_test['Mode_of_Shipment'])
x_test['Customer_care_calls'] = encoder.fit_transform(x_test['Customer_care_calls'])
x_test['Product_importance'] = encoder.fit_transform(x_test['Product_importance'])
x_test['Gender'] = encoder.fit_transform(x_test['Gender'])


# 상관분석 # 
# print(x_train.corr()) # 없음


# 이상치 # 
pd.options.display.max_columns = None
# print(x_train.describe())
x_train_describe = x_train.describe()
Q1 = x_train_describe.loc['25%']
Q3 = x_train_describe.loc['75%']
IQR = Q3 - Q1
min_lim = Q1 - IQR * 1.5
max_lim = Q3 + IQR * 1.5
max_real = x_train_describe.loc['max']
min_real = x_train_describe.loc['min']
# print(max_lim < max_real) # Prior_purchases, Discount_offered
# print(min_lim > min_real) # 없음

# Prior_purchases 초과
condition = (x_train['Prior_purchases'] > max_lim['Prior_purchases'])
# print((x_train['Prior_purchases'] > max_lim['Prior_purchases']).sum()) # 599 개
x_train.loc[condition,'Prior_purchases'] = max_lim['Prior_purchases']
# print((x_train['Prior_purchases'] > max_lim['Prior_purchases']).sum()) # 0개
x_test.loc[condition,'Prior_purchases'] = max_lim['Prior_purchases']

# Discount_offered 초과
condition = (x_train['Discount_offered'] > max_lim['Discount_offered'])
# print((x_train['Discount_offered'] > max_lim['Discount_offered']).sum()) # 1314 개
x_train.loc[condition,'Discount_offered'] = max_lim['Discount_offered']
# print((x_train['Discount_offered'] > max_lim['Discount_offered']).sum()) # 0개
x_test.loc[condition,'Discount_offered'] = max_lim['Discount_offered']




# # 스케일링 # 
# from sklearn.preprocessing import RobustScaler
# scaler = RobustScaler()
# x_train = pd.DataFrame(scaler.fit_transform(x_train),columns=x_train.columns)
# # print(x_train)





# 모델링 : 제출용 #
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier

# model = BaggingClassifier(random_state=10,n_estimators=50)
model = RandomForestClassifier(random_state=10) # 0.733

model.fit(x_train,y_train.values.ravel())
y_test_proba = pd.DataFrame(model.predict_proba(x_test))
y_test_proba = y_test_proba.drop(columns=0)
y_test_proba = y_test_proba.rename(columns={1:'Reached.on.Time_Y.N'})
# print(y_test_proba)
result = pd.concat([x_test_ID,y_test_proba],axis=1)
# print(result)
result.to_csv('asdf.csv',index=False)
print(pd.read_csv('asdf.csv'))


# 추가) predicted 할 경우
# y_test_predicted = pd.DataFrame(model.predict(x_test),columns=y_train.columns)
# result = pd.concat([x_test_ID,y_test_predicted],axis=1)





# 모델링 : 평가용 #
# 데이터 분리
from sklearn.model_selection import train_test_split
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(x_train,y_train,test_size=0.2,random_state=10,stratify = y_train)

# 모델1
model = RandomForestClassifier(random_state=10) # 0.733

# 모델2
# model = BaggingClassifier(random_state=10,n_estimators=10) # default = 0.735
# model = BaggingClassifier(random_state=10,n_estimators=50) # 0.7455 # 처음에 선택했으나 랜포가 실제 스코어는 더 높음

# 모델3
# model = XGBClassifier(random_state=10,use_label_encoder=False) # 0.733


# 학습
model.fit(X_TRAIN,Y_TRAIN.values.ravel())
Y_TEST_PROBA = pd.DataFrame(model.predict_proba(X_TEST))
Y_TEST_PROBA = Y_TEST_PROBA.drop(columns=0)
Y_TEST_PROBA = Y_TEST_PROBA.rename(columns={1:'Reached.on.Time_Y.N'})

# 점수1
from sklearn.metrics import roc_auc_score
score_1 = roc_auc_score(Y_TEST,Y_TEST_PROBA)
print(score_1)

# 점수2
from sklearn.model_selection import cross_val_score
score_2 = cross_val_score(model,x_train,y_train.values.ravel(),scoring='roc_auc',cv=3)
print(score_2.mean())
 
         ID  Reached.on.Time_Y.N
0      6811                 0.48
1      4320                 0.75
2      5732                 0.37
3      7429                 0.71
4      2191                 1.00
...     ...                  ...
4396   2610                 1.00
4397   3406                 0.35
4398  10395                 0.42
4399   3646                 0.26
4400    573                 1.00

[4401 rows x 2 columns]
0.7207549330178238
0.7411551991636095
In [2]:
# Check test label AUC score

def check_test_label(submission):
    testlabel = pd.read_csv('../input/ecommerce-shipping-data-competition-form/test_label/y_test.csv')
    Truelabel = testlabel['Reached.on.Time_Y.N']
    guesslabel = result['Reached.on.Time_Y.N']
    
    print('AUC Score : ', roc_auc_score(Truelabel, guesslabel))
    
check_test_label(result)
 
AUC Score :  0.7331581262134881
 

 

Comments