Data Science

[빅분기] 실기 작업형2- 예측 모델 (프리렉 p445) 본문

자격증

[빅분기] 실기 작업형2- 예측 모델 (프리렉 p445)

shinho0902 2021. 12. 2. 14:55

* 프리렉 교재를 참고하였으며, 교재와 다른 내용이 많이 포함되어 있습니다.

 

고객 10,866건에 대한 학습용데이터(x_train,y_train)를 이용하여 자전거 대여량 예측 모형을 만든다.

생성한 예측 모형으로 평가용데이터(x_test)에 해당하는 6,493건의 자전거 대여량 예측값을 csv 파일로 생성하시오.

(제출한 모델의 성능은 R^2 score 평가지표에 따라 채점)

import pandas as pd
# Load the training features, training target and evaluation features.
# All three CSV files live in the same Google Drive folder and are decoded
# as cp949.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/빅분기 실기/프리렉/data/'
CSV_ENCODING = 'cp949'

x_train = pd.read_csv(DATA_DIR + 'bike_x_train.csv', encoding=CSV_ENCODING)
y_train = pd.read_csv(DATA_DIR + 'bike_y_train.csv', encoding=CSV_ENCODING)
x_test = pd.read_csv(DATA_DIR + 'bike_x_test.csv', encoding=CSV_ENCODING)

# The target file's first header is expected to arrive as '癤풼atetime'
# (presumably a UTF-8 byte-order mark decoded as cp949 -- TODO confirm);
# restore the intended column name.
y_train = y_train.rename(columns={'癤풼atetime': 'datetime'})

# pd.options.display.max_columns = None
# print(x_train.head())
# print(y_train.head())

#print(x_train.info())
#print(x_train.describe())

# Keep the evaluation set's raw datetime column for the submission file and
# remove datetime from the target frame so only the label column(s) remain.
x_test_datetime = x_test['datetime']
y_train = y_train.drop(columns='datetime')


def _expand_datetime(frame):
    """Return *frame* with 'datetime' replaced by calendar feature columns.

    The 'datetime' column is parsed with ``pd.to_datetime``; the columns
    year, month, day, hour and dayofweek are appended in that order and the
    'datetime' column itself is removed.
    """
    stamps = pd.to_datetime(frame['datetime'])
    calendar_parts = {
        part: getattr(stamps.dt, part)
        for part in ('year', 'month', 'day', 'hour', 'dayofweek')
    }
    return frame.drop(columns='datetime').assign(**calendar_parts)


# Date conversion: apply the same expansion to both feature sets.
x_train = _expand_datetime(x_train)
x_test = _expand_datetime(x_test)

# 결측치 검사 #
# print(x_train.isnull().sum())  
# 결측치 결과 : 없음

# Outlier check (IQR rule) #
# For every numeric column, the fences are Q1 - 1.5 * IQR and
# Q3 + 1.5 * IQR, computed from the training data only.
# print(x_train.describe().T)
x_train_describe = x_train.describe()
Q1 = x_train_describe.loc['25%']
Q3 = x_train_describe.loc['75%']
IQR = Q3 - Q1
min_lim = Q1 - 1.5 * IQR
max_lim = Q3 + 1.5 * IQR
min_real = x_train.min()
max_real = x_train.max()

# Side-by-side tables (fence vs. observed extreme) for manual inspection.
min_df = pd.concat([pd.DataFrame(min_lim), pd.DataFrame(min_real)], axis=1, keys=['min_lim', 'min_real'])
max_df = pd.concat([pd.DataFrame(max_lim), pd.DataFrame(max_real)], axis=1, keys=['max_lim', 'max_real'])

# print(min_df)  # author's note: nothing below the lower fence
# print(max_df)  # author's note: only 풍속 exceeds the upper fence (31, 56)

# Outlier handling: cap 풍속 (wind speed) at the training upper fence.
wind_cap = max_lim['풍속']
condition = x_train['풍속'] > wind_cap
x_train.loc[condition, '풍속'] = wind_cap

# The evaluation mask must be built from x_test itself. Reusing the training
# mask would be aligned by index label and cap whichever test rows happen to
# share a label with a training outlier, leaving real test outliers untouched.
test_condition = x_test['풍속'] > wind_cap
x_test.loc[test_condition, '풍속'] = wind_cap
# print(x_train['풍속'].max())

# Correlation analysis #
# `data` places features and target side by side so that data.corr() can be
# inspected; it is built before any column is dropped.
# pd.options.display.max_columns = None
data = pd.concat([x_train, y_train], axis=1)
# print(data.corr())

# One column of each strongly correlated pair is removed (coefficients as
# recorded by the author):
#   계절   ~ month     : 0.97 -> drop 계절   (month is kept)
#   온도   ~ 체감온도   : 0.98 -> drop 온도   (체감온도 is kept)
#   근무일 ~ dayofweek : 0.70 -> drop 근무일 (dayofweek is kept)
redundant_columns = ['계절', '온도', '근무일']
x_train = x_train.drop(columns=redundant_columns)
x_test = x_test.drop(columns=redundant_columns)

# Scaling #
# RobustScaler statistics are learned from the training features only and
# then reused unchanged for the evaluation features, so both sets are
# expressed on the same scale the model is trained on.
# print(x_train.describe())
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)
# transform (not fit_transform): re-fitting on x_test would scale it with
# different centers/ranges than the ones the model sees during training.
x_test = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns)
# print(x_train.describe())


####### 전처리 끝 #########


# Modeling: submission #
# Train a random forest on the full training set and predict the
# evaluation set.
# import sklearn.ensemble
# print(dir(sklearn.ensemble))
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=151, max_depth=19, random_state=10)
train_target = y_train.values.ravel()  # flatten the target frame to 1-D
model.fit(x_train, train_target)

test_predictions = model.predict(x_test)
y_test_predicted = pd.DataFrame(test_predictions, columns=y_train.columns)

# Guard: any row with a negative predicted count is reset to zero.
negative_rows = y_test_predicted['count'] < 0
y_test_predicted.loc[negative_rows, :] = 0

# Pair every prediction with the evaluation row's original datetime value.
result = pd.concat([x_test_datetime, y_test_predicted], axis=1)
# print(result)


# Submission #
# Write the file, then read it back and print it as a sanity check.
SUBMISSION_FILE = '12345.csv'
result.to_csv(SUBMISSION_FILE, index=False)
print(pd.read_csv(SUBMISSION_FILE))



# Modeling: evaluation #
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, train_test_split

# Hold out 20% of the training data for validation.
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(
    x_train, y_train, test_size=0.2, random_state=10
)

# Scores recorded by the author for the candidates tried (presumably R^2 on
# this hold-out split -- TODO confirm):
#   RandomForestRegressor(n_estimators=100, max_depth=None, random_state=10) : 0.951681065886236
#   RandomForestRegressor(n_estimators=151, max_depth=19, random_state=10)   : 0.9519134589969741
#   AdaBoostRegressor()  : 0.67
#   BaggingRegressor()   : 0.94
#   XGBRegressor()       : 0.86
model = RandomForestRegressor(n_estimators=151, max_depth=19, random_state=10)

# Fit / predict
hold_out_target = Y_TRAIN.values.ravel()
model.fit(X_TRAIN, hold_out_target)
hold_out_predictions = model.predict(X_TEST)
Y_TEST_PREDICTED = pd.DataFrame(hold_out_predictions, columns=y_train.columns)

# Guard: any row with a negative predicted count is reset to zero.
negative_rows = Y_TEST_PREDICTED['count'] < 0
Y_TEST_PREDICTED.loc[negative_rows, :] = 0

# Score
print(r2_score(Y_TEST, Y_TEST_PREDICTED))

####

# Cross-validation: 3-fold R^2 on the full training set.
scores = cross_val_score(model, x_train, y_train.values.ravel(), scoring='r2', cv=3)
print('교차 검증별 정확도\n', scores)
print('평균 검증 정확도\n', round(scores.mean(), 4))
###
###

output

             datetime       count
0      2011-01-20 0:00    8.781457
1      2011-01-20 1:00    4.808683
2      2011-01-20 2:00    2.890728
3      2011-01-20 3:00    3.156220
4      2011-01-20 4:00    2.718070
...                ...         ...
6488  2012-12-31 19:00  418.323400
6489  2012-12-31 20:00  281.234437
6490  2012-12-31 21:00  184.145695
6491  2012-12-31 22:00  118.735099
6492  2012-12-31 23:00   78.907285

[6493 rows x 2 columns]
0.9519134589969741
교차 검증별 정확도
 [0.6824081  0.55642467 0.85700763]
평균 검증 정확도
 0.6986

          

Comments