Data Science

[빅분기] 실기 작업형2- 분류 모델 (dataq 공식예제) 본문

자격증

[빅분기] 실기 작업형2- 분류 모델 (dataq 공식예제)

shinho0902 2021. 12. 2. 15:39

아래는 백화점 고객의 1년 간 구매 데이터이다. 고객 3500명에 대한 학습용 데이터(y_train.csv, X_train.csv)를 이용하여 성별예측 모형을 만든 후, 이를 평가용 데이터(X_test.csv)에 적용하여 얻은 2482명 고객의 성별 예측값(남자일 확률)을 다음과 같은 형식(custid, gender)의 CSV 파일로 생성하시오. (제출한 모델의 성능은 ROC-AUC 평가지표에 따라 채점)

 

# Load the training features, the test features and the training labels.
# All three CSV files are read with the cp949 encoding.
import pandas as pd

x_train = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/빅분기 실기/프리렉/data/x_train.csv',
    encoding='cp949',
)
x_test = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/빅분기 실기/프리렉/data/x_test.csv',
    encoding='cp949',
)
y_train = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/빅분기 실기/프리렉/data/y_train.csv',
    encoding='cp949',
)

# # Alternative loader using relative paths (disabled)
# import pandas as pd
# x_test = pd.read_csv("data/X_test.csv")
# x_train = pd.read_csv("data/X_train.csv")
# y_train = pd.read_csv("data/y_train.csv")

# Remove the primary key (cust_id) from every frame.
# The test-set ids are saved first because the submission file is built from them.
x_test_cust_id = x_test['cust_id']
x_train = x_train.drop('cust_id', axis=1)
x_test = x_test.drop('cust_id', axis=1)
y_train = y_train.drop('cust_id', axis=1)

# Categorical-column check #################
# print(x_train.describe())
# Categorical columns noted by the author: 주구매상품, 주구매지점


# # Alternative (disabled): drop the categorical columns instead of encoding them
# x_train = x_train.drop(columns=['주구매상품','주구매지점'],axis=1)
# x_test = x_test.drop(columns=['주구매상품','주구매지점'],axis=1)


# Label-encode the categorical columns (주구매상품, 주구매지점).
# The mapping is learned from the training data only and then reused for the
# test data, so a given category receives the same integer code in both
# frames. Fitting a second encoder on x_test would number the categories
# independently, and the test codes would no longer mean what the model
# learned from the training codes.
from sklearn.preprocessing import LabelEncoder

for col in ['주구매상품', '주구매지점']:
    encoder = LabelEncoder()
    x_train[col] = encoder.fit_transform(x_train[col])
    # classes_ is ordered by code, so enumerate() rebuilds the label -> code table.
    code_of = {label: code for code, label in enumerate(encoder.classes_)}
    # A category that never occurred in training has no code; mark it with -1
    # rather than letting encoder.transform() raise on an unseen label.
    x_test[col] = x_test[col].map(code_of).fillna(-1).astype(int)


# Missing-value check ###############
# print(x_train.isnull().sum())
# Author's finding: 환불금액 has missing values; they are replaced with 0
# in both the training and the test frame.
for frame in (x_train, x_test):
    frame['환불금액'] = frame['환불금액'].fillna(0)


# Correlation analysis #############
# Features and label side by side so that corr() can include the target.
data = pd.concat([x_train, y_train], axis='columns')
# pd.options.display.max_columns = None
# print(data.corr())

# Correlations noted by the author:
#   총구매액 vs 최대구매액 : 0.7
#   총구매액 vs 내점일수   : 0.66
# 총구매액 is removed from both feature sets (presumably because of these
# correlations).
x_train = x_train.drop('총구매액', axis=1)
x_test = x_test.drop('총구매액', axis=1)


# Outlier check ###########
# pd.options.display.max_columns = None
# print(x_train.describe().T)
x_train_describe = x_train.describe()
Q1, Q3 = x_train_describe.loc['25%'], x_train_describe.loc['75%']
IQR = Q3 - Q1
# Lower / upper fences at 1.5 * IQR beyond the quartiles, per column.
min_lim, max_lim = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
# Observed extremes of the training data, for comparison with the fences.
min_real, max_real = x_train.min(), x_train.max()
min_df = pd.concat(
    [pd.DataFrame(min_lim), pd.DataFrame(min_real)],
    axis=1,
    keys=['min_lim', 'min_real'],
)
max_df = pd.concat(
    [pd.DataFrame(max_lim), pd.DataFrame(max_real)],
    axis=1,
    keys=['max_lim', 'max_real'],
)

# print(min_df) # below the lower fence: none
# print(max_df) # above the upper fence: 내점일수

# print((x_train['내점일수'] > max_lim['내점일수']).sum()) # 284 outliers found
# Cap 내점일수 at the upper fence computed from the training data; the same
# training-derived limit is applied to the test frame.
for frame in (x_train, x_test):
    condition = frame['내점일수'] > max_lim['내점일수']
    frame.loc[condition, '내점일수'] = max_lim['내점일수']


# Scaling - RobustScaler ##########
# The scaler's statistics are learned from the training data only; the test
# data is then transformed with those same statistics. Calling fit_transform
# on x_test would re-fit the scaler on the test set and put the two frames on
# different scales, so the model would see shifted feature values at
# prediction time.
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train),columns=x_train.columns)
x_test = pd.DataFrame(scaler.transform(x_test),columns=x_test.columns)


# Sanity checks #########
# pd.options.display.max_columns = None
# print(x_train.corr())
# print(x_train.info())
# print(x_train.describe())

###### End of preprocessing #######


# Modelling: the model used for the submission
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=210, max_depth=None, random_state=10)
model.fit(x_train, y_train.values.ravel())

# Class probabilities for the test set. Column 0 is discarded and column 1 is
# kept under the name 'gender' (per the author's note, the probability of
# male (1)).
proba = model.predict_proba(x_test)
y_test_proba = pd.DataFrame(proba).drop(columns=0).rename(columns={1: 'gender'})
# print(y_test_proba)

"""
# 남자(1) 인지 분류할때
y_test_predicted = pd.DataFrame(model.predict(x_test),columns = y_train.columns)
# print(y_test_predicted)
"""

# Submission: pair each test customer id with its predicted probability,
# write the CSV without the index, then read it back to show what was saved.
result = pd.concat(objs=[x_test_cust_id, y_test_proba], axis='columns')
result.to_csv('12345.csv', index=False)
print(pd.read_csv('12345.csv'))


###### Evaluation #######

# Split the training data: 20% is held out for validation.
from sklearn.model_selection import train_test_split
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(x_train,y_train,test_size=0.2,random_state=10)

# The numbers in the comments below are the author's recorded scores. They
# were presumably measured with hard 0/1 predictions, so they are not directly
# comparable with the probability-based ROC-AUC computed at the end of this
# section.

# Model 1 : 0.570
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=210, max_depth=None,random_state=10) # 0.5801 **

# Model 2 : 0.544
# from xgboost import XGBClassifier
# model = XGBClassifier(use_label_encoder=False)

# Model 3 : 0.558
# from sklearn.ensemble import BaggingClassifier
# model = BaggingClassifier(n_estimators=250,random_state=10) # 0.5932 ***

# Model 4 : 0.536
# from sklearn.svm import SVC
# model = SVC(C=8) # 0.554
# NOTE: SVC only offers predict_proba when constructed with probability=True.

model.fit(X_TRAIN,Y_TRAIN.values.ravel())
# Hard class predictions, kept under the original name for inspection.
Y_TEST_PREDICTED = pd.DataFrame(model.predict(X_TEST),columns = y_train.columns)
# Probability of the positive class (column 1). ROC-AUC ranks samples by a
# continuous score; feeding it 0/1 labels collapses the curve to one point.
Y_TEST_PROBA = model.predict_proba(X_TEST)[:, 1]

# Score
from sklearn.metrics import roc_auc_score
score = roc_auc_score(Y_TEST.values.ravel(), Y_TEST_PROBA)
print('점수 : ',round(score,4))


###############

# Cross-validated ROC-AUC over 10 folds on the full (preprocessed) training set
from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    estimator=model,
    X=x_train,
    y=y_train.values.ravel(),
    scoring='roc_auc',
    cv=10,
)
print('교차 검증 : ', scores)
print('교차 검증 평균 : ', round(scores.mean(),4))

# Mean scores recorded by the author:
#   Bagging       : 0.6282
#   Random forest : 0.6397 **

##############

 

output

      cust_id    gender
0        3500  0.571429
1        3501  0.214286
2        3502  0.195238
3        3503  0.471429
4        3504  0.461905
...       ...       ...
2477     5977  0.533333
2478     5978  0.519048
2479     5979  0.671429
2480     5980  0.419048
2481     5981  0.447619

[2482 rows x 2 columns]
점수 :  0.5842
교차 검증 :  [0.61269128 0.60838649 0.6459793  0.60779393 0.63135947 0.69325132
 0.59561788 0.68654434 0.6706283  0.64470392]
교차 검증 평균 :  0.6397
Comments