TIP¶

경고 메시지가 뜰 경우¶

분류 모델 학습 중 경고가 뜰 경우

# Flatten the label frame to 1-D with .values.ravel() before fitting.
# NOTE(review): presumably the warning comes from passing a single-column
# DataFrame as y -- confirm against the actual warning text.
model.fit(X_TRAIN, Y_TRAIN.values.ravel() )

처음부터 사용하지 말고, 문제 해결이 불가능할 때만 사용

# Suppress every warning. Last resort only: use it when the underlying
# cause cannot be fixed, not as the first step.
import warnings
warnings.filterwarnings(action='ignore')

데이터프레임의 모든 열을 다 보고 싶을 때¶

# Remove the cap on displayed columns so wide DataFrames print in full.
import pandas as pd
pd.set_option('display.max_columns', None)

탐색¶

# Exploration reports, printed in this order:
#   1. data.info()     - summary information
#   2. data.describe() - basic descriptive statistics
#   3. data.corr()     - correlation analysis
for report in (data.info, data.describe, data.corr):
    print(report())

전처리¶

불필요한 열 삭제
결측값 처리하기
이상값 처리하기
데이터 스케일링
데이터 타입변경
범주형 인코딩
파생변수 만들기

1. 불필요한 열 삭제¶

기본키, 범주형, 무의미(주관적,상식적), 강한상관관계(0.6이상) ...

# Inspect which distinct values a categorical column contains.
gender_values = x_train['성별'].unique()
print(gender_values)

# Dropping columns (shown as independent examples).
data = data.drop('id', axis=1)  # a single column
data = data.drop(['id', 'name'], axis=1)  # two or more columns

# Primary key: keep the test ids aside (they are concatenated back onto the
# predictions at submission time), then remove the key from every frame.
X_test_cust_id = X_test['cust_id']
X_train = X_train.drop('cust_id', axis=1)
y_train = y_train.drop('cust_id', axis=1)
X_test = X_test.drop('cust_id', axis=1)

# Strong correlation - whole data set, target column included.
data = pd.concat([x_train, y_train], axis='columns')
print(data.corr())

# Strong correlation - only the selected columns.
selected_columns = ['총구매액', '최대구매액']
corr_ = x_train[selected_columns].corr()
print(corr_)

2. 결측값 처리하기¶

평균값, 중위값, 0 ...

# Count the missing values in each column.
print(x_train.isnull().sum())

# Mean imputation for a single column.
cyl_mean = x_train['cyl'].mean()
x_train['cyl'] = x_train['cyl'].fillna(cyl_mean)

# Mean imputation for every column at once: each column is filled with its
# own mean. numeric_only=True restricts the means to numeric columns so that
# text columns do not break the computation (they are left unfilled).
data_mean = data.fillna(data.mean(numeric_only=True))

# Median imputation for a single column.
qsec_median = x_train['qsec'].median()
x_train['qsec'] = x_train['qsec'].fillna(qsec_median)

# Constant imputation: treat a missing refund amount as 0.
x_train['환불금액'] = x_train['환불금액'].fillna(0)

# Malformed category labels: look at the distinct values first.
print(X['gear'].unique())
## output -> ['4' '3' '*3' '5' '*5']

# Normalise the gear column: '*3' -> '3', '*5' -> '5' (exact-value match).
X['gear'] = X['gear'].replace({'*3': '3', '*5': '5'})

# Drop every row that has a missing value in any column.
data = data.dropna()

# Drop only the rows whose 'RM' value is missing.
# Assigning data['RM'].dropna() back to the column re-aligns on the index and
# puts the NaN rows straight back, so nothing is removed; filter the whole
# frame with subset= instead.
data = data.dropna(subset=['RM'])

3. 이상값 처리하기¶

사분범위(IQR) , 평균+표준편차

# Eyeball the summary statistics for suspicious min / max values.
print(x_train.describe())

# --- Interquartile range (IQR) rule -------------------------------------
pd.options.display.max_columns = None
x_train_describe = x_train.describe()
Q1 = x_train_describe.loc['25%']
Q3 = x_train_describe.loc['75%']
IQR = Q3 - Q1
# Per-column fences: anything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] is an outlier.
min_lim = Q1 - IQR * 1.5
max_lim = Q3 + IQR * 1.5
max_real = x_train_describe.loc['max']
min_real = x_train_describe.loc['min']
print(max_lim < max_real)  # True for columns with values above the upper fence
print(min_lim > min_real)  # True for columns with values below the lower fence

# Count the rows above the upper fence for one column
# (the author's data had 284 such rows).
print((x_train['최대구매액'] > max_lim['최대구매액']).sum())

# --- Mean +/- 1.5 standard deviations rule ------------------------------
# From here on max_lim / min_lim are plain scalars for the single column
# '총구매액' (they replace the per-column Series of the IQR rule), so they
# are used directly instead of being indexed by column name. The limits are
# computed from x_train, the same frame that gets capped.
mean_ = x_train['총구매액'].mean()
std_ = x_train['총구매액'].std()
max_lim = mean_ + 1.5 * std_
min_lim = mean_ - 1.5 * std_

# Cap values above the normal range at max_lim.
condition = x_train['총구매액'] > max_lim
x_train.loc[condition, '총구매액'] = max_lim

# Raise values below the normal range to min_lim.
condition = x_train['총구매액'] < min_lim
x_train.loc[condition, '총구매액'] = min_lim

4. 데이터 스케일링¶

sklearn.preprocessing 모듈에서 아래 스케일러 중 필요한 것을 import 해서 사용한다 (from sklearn.preprocessing import ...)

표준 크기 변환 - import StandardScaler

최대 최소 크기 변환 - import MinMaxScaler

로버스트 크기 변환 - import RobustScaler

# Standard scaling - StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# fit_transform returns a NumPy array, so remember x_train's own column
# labels before x_train is overwritten and rebuild the DataFrame with them.
x_train_columns = x_train.columns
x_train = scaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train, columns=x_train_columns)  # back to a DataFrame

# Min-max scaling - MinMaxScaler (scale and rebuild in one statement)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train),columns=x_train.columns)

# Robust scaling - RobustScaler
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train),columns=x_train.columns)

# Scaling a single column
from sklearn.preprocessing import StandardScaler
x_train_qsec = x_train[['qsec']]  # double brackets keep it a one-column DataFrame
scaler = StandardScaler()
x_train_qsec_scal = pd.DataFrame(scaler.fit_transform(x_train_qsec))
print(x_train_qsec_scal.describe())

5. 데이터 타입 변경¶

# Convert the 'gear' column to the int64 dtype.
X['gear'] = X['gear'].astype('int64')

6. 범주형 인코딩¶

원 핫 인코딩 : pd.get_dummies(df['컬럼명'])
라벨 인코딩 : from sklearn.preprocessing import LabelEncoder
수동 인코딩 : df['컬럼명'].replace('값1', 0).replace('값2', 1)

# One-hot encoding: one indicator column per distinct value of '성별'.
columns_gender = pd.get_dummies(X_train['성별'])

# Label encoding: learn the classes from the column and encode it in one step.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
X_train['주구매상품'] = encoder.fit_transform(X_train['주구매상품'])

# Label encoding 2: fit on an explicit list of classes, then transform.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit([1,2,3,4,5,6,7,8])  # one entry per unique value of '주구매상품'
X_train['주구매상품'] = encoder.transform(X_train['주구매상품'])

# Manual encoding: map each category to a chosen number.
X_train['성별_new'] = X_train['성별'].replace('여자',0).replace('남자',1)

7. 파생변수 만들기¶

# New column from a condition: wt_class is 0 where wt < 4 and 1 otherwise.
condition = X['wt'] < 4
X.loc[condition, 'wt_class'] = 0
X.loc[~condition, 'wt_class'] = 1

# New column from a calculation: qsec multiplied by 4.
X['qsec_4'] = X['qsec'] * 4

날짜 변환¶

# Date conversion: turn the 'datetime' column into real datetime values.
x_train['datetime'] = pd.to_datetime(x_train['datetime'])

# Split out year, month, day, hour and day-of-week as separate columns,
# then discard the original timestamp column.
for part in ('year', 'month', 'day', 'hour', 'dayofweek'):
    x_train[part] = getattr(x_train['datetime'].dt, part)
x_train = x_train.drop(columns=['datetime'])

모델링¶

from sklearn.모듈 import 모델함수

model = 모델함수()

model.fit(x_train,y_train)

y_train의 예측값 = model.predict(x_train)

일단 '분류모형' OR '예측(회귀)모형' 만 구분해서

RandomForest, XGB (Classifier/Regressor) 우선시 사용하고,

모델 평가하고 돌아와서 가장 우수한 것 선택하기

# Example 1 (predicted values: predict)
from sklearn.ensemble import RandomForestClassifier # RandomForestRegressor
model = RandomForestClassifier()
model.fit(x_train,y_train)
y_test_predicted = pd.DataFrame(model.predict(X_test),columns=y_train.columns)

# Example 2 (predicted probabilities: predict_proba)
from xgboost import XGBClassifier # XGBRegressor
model = XGBClassifier(n_estimators=3000,max_depth=5)
model.fit(x_train,y_train)
# predict_proba returns a NumPy array with one column per class; an array has
# no .drop(), so wrap it in a DataFrame first, then discard column 0 and
# label the remaining column 1 as 'gender'.
y_test_proba = pd.DataFrame(model.predict_proba(X_test))
y_test_proba = y_test_proba.drop(columns=0)
y_test_proba = y_test_proba.rename(columns = {1:'gender'})

결과 제출¶

# Submission: put the saved test ids next to the predicted probabilities.
# The id Series is stored as X_test_cust_id (capital X) when the key column
# is dropped during preprocessing, so that exact name is used here.
result = pd.concat([X_test_cust_id, y_test_proba],axis=1)
result.to_csv('12345.csv',index=False)
print(pd.read_csv('12345.csv'))  # read the file back to check what was written

모델 평가¶

자체평가를 위해

먼저 train 셋을 다시 한번 나눠줘야함

# Split the training set once more: 20% is held out for local evaluation,
# with a fixed random_state so the split is repeatable.
from sklearn.model_selection import train_test_split
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=10,
)

각종 모델을 넣어보면서

앙상블을 우선시한다.
옵션은 default 로 먼저 실행
가장 점수 높은거 골라서 다시 하이퍼 파라미터 조정
시간은 20초 내외 실행가능하게 하기

분류 모델¶

# Classification
# Each snippet below rebinds `model`, so keep only the one being tried.
# The number after each name is presumably the score the author recorded
# for that model -- the metric is not stated here.

# *** Random forest : 0.577
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

# *** Extreme gradient boosting (XGB) : 0.559
from xgboost import XGBClassifier
model = XGBClassifier(n_estimators=3000,max_depth=5) # 0.5932 ***

# Decision tree : 0.554
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()

# Logistic regression : 0.5
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

# Support vector machine (SVM) : 0.5
from sklearn.svm import SVC
model = SVC()

# Bagging : 0.568
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier()

# K-nearest neighbours (KNN) : 0.522
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

# Multi-layer perceptron classifier (MLP) : 0.5
from sklearn.neural_network import MLPClassifier
model = MLPClassifier()

예측 모델¶

# Prediction (regression)
# Each snippet below rebinds `model`, so keep only the one being tried.

# *** Random forest regression
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

# *** Extreme gradient boosting (XGB)
from xgboost import XGBRegressor
model = XGBRegressor()

# Linear regression
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# Gradient boosting regression
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor()

스코어 확인¶

from sklearn.metrics import 평가함수

print(평가함수(Y_TEST, Y_TEST의 예측값))

# Fit on the inner training split; labels are flattened to 1-D.
model.fit(X_TRAIN,Y_TRAIN.values.ravel())

# Predict on the held-out split.
Y_TEST_PREDICTED = pd.DataFrame(model.predict(X_TEST))

# Check the model score.
# NOTE(review): roc_auc_score is given the output of model.predict (hard
# class labels), not probabilities -- consider scoring with predict_proba
# output where the model provides it.
from sklearn.metrics import roc_auc_score
print(roc_auc_score(Y_TEST,Y_TEST_PREDICTED))

# Cross-validation: 3 folds, ROC AUC scoring, mean of the fold scores.
# NOTE(review): this uses x_train / y_train (lower case) while the split
# above was made from X_train / y_train -- confirm which frame is intended.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, x_train, y_train.values.ravel(), scoring='roc_auc', cv=3)
print(scores.mean())

기타¶

검정 통계¶

# Independent two-sample t-test on the '총콜레스테롤' column of data_40 and
# data_50, run with equal_var=True.
import scipy.stats
result = scipy.stats.ttest_ind(data_40['총콜레스테롤'], data_50['총콜레스테롤'], equal_var=True)

그룹 / 카운트¶

# Group by the CHAS column and then the RAD column, and count the RAD
# entries in each group.
data_g = data.groupby(['CHAS','RAD'])['RAD'].count()


# General pattern: group by column A and count the entries of column B.
data_g = data.groupby(['A'])['B'].count()

일	월	화	수	목	금	토
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30

[SQLD] SQLD 43회 합격 (0)	2021.12.17
[빅분기] 빅분기 3회 필기 합격 (0)	2021.12.17
[빅분기] 실기 작업형2 - 분류 모델 (2회기출) (0)	2021.12.03
[빅분기] 실기 작업형1 - 각종 메모 (0)	2021.12.03
[빅분기] 실기 작업형2- 분류 모델 (dataq 공식예제) (0)	2021.12.02

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

Data Science