기존 방문객 조회, 전년도 데이터 비교, 예상 방문객 계산
This commit is contained in:
278
lib/weekly_visitor_forecast_prophet.py
Normal file
278
lib/weekly_visitor_forecast_prophet.py
Normal file
@ -0,0 +1,278 @@
|
||||
#weekly_visitor_forecast_prophet.py
|
||||
import os, sys
|
||||
import re, requests
|
||||
from sqlalchemy import select, and_, func
|
||||
from sqlalchemy.orm import Session
|
||||
from prophet import Prophet
|
||||
from statsmodels.tsa.arima.model import ARIMA
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from datetime import date, datetime, timedelta
|
||||
|
||||
# 경로 설정: 프로젝트 루트 conf 폴더 내 db 및 스키마 모듈 임포트
|
||||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||
from conf import db, db_schema
|
||||
from weather_forecast import get_weekly_precip # 변경된 날씨 예보 함수 임포트
|
||||
from lib.holiday import is_korean_holiday # holiday.py의 DB 기반 휴일 판단 함수
|
||||
from lib.common import load_config
|
||||
|
||||
# DB 테이블 객체 초기화
|
||||
pos = db_schema.pos
|
||||
ga4 = db_schema.ga4_by_date
|
||||
weather = db_schema.weather
|
||||
air = db_schema.air
|
||||
|
||||
config = load_config()
|
||||
serviceKey = config['DATA_API']['serviceKey']
|
||||
VISITOR_CA = tuple(config['POS']['VISITOR_CA'])
|
||||
|
||||
|
||||
|
||||
# --- 데이터 로딩 및 전처리 ---
|
||||
|
||||
def get_date_range(start_date, end_date):
|
||||
return pd.date_range(start_date, end_date).to_pydatetime().tolist()
|
||||
|
||||
def add_korean_holiday_feature(df):
|
||||
df['is_holiday'] = df['date'].apply(lambda d: 1 if is_korean_holiday(d.date()) else 0)
|
||||
return df
|
||||
|
||||
def fix_zero_visitors_weighted(df):
|
||||
df = df.copy()
|
||||
if 'date' not in df.columns and 'ds' in df.columns:
|
||||
df['date'] = df['ds']
|
||||
if 'pos_qty' not in df.columns and 'y' in df.columns:
|
||||
df['pos_qty'] = df['y']
|
||||
if 'is_holiday' not in df.columns:
|
||||
raise ValueError("DataFrame에 'is_holiday' 컬럼이 필요합니다.")
|
||||
df['year_month'] = df['date'].dt.strftime('%Y-%m')
|
||||
monthly_means = df[df['pos_qty'] > 0].groupby(['year_month', 'is_holiday'])['pos_qty'].mean()
|
||||
arr = df['pos_qty'].values.copy()
|
||||
for i in range(len(arr)):
|
||||
if arr[i] == 0:
|
||||
ym = df.iloc[i]['year_month']
|
||||
holiday_flag = df.iloc[i]['is_holiday']
|
||||
mean_val = monthly_means.get((ym, holiday_flag), np.nan)
|
||||
arr[i] = 0 if np.isnan(mean_val) else mean_val
|
||||
df['pos_qty'] = arr
|
||||
if 'y' in df.columns:
|
||||
df['y'] = df['pos_qty']
|
||||
df.drop(columns=['year_month'], inplace=True)
|
||||
return df
|
||||
|
||||
def load_data(session, start_date, end_date):
|
||||
dates = get_date_range(start_date, end_date)
|
||||
stmt_pos = select(
|
||||
pos.c.date,
|
||||
func.sum(pos.c.qty).label('pos_qty')
|
||||
).where(
|
||||
and_(
|
||||
pos.c.date >= start_date,
|
||||
pos.c.date <= end_date,
|
||||
pos.c.ca01 == '매표소',
|
||||
pos.c.ca03.in_(VISITOR_CA)
|
||||
)
|
||||
).group_by(pos.c.date)
|
||||
pos_data = {row.date: row.pos_qty for row in session.execute(stmt_pos).fetchall()}
|
||||
|
||||
stmt_ga4 = select(ga4.c.date, ga4.c.activeUsers).where(
|
||||
and_(ga4.c.date >= start_date, ga4.c.date <= end_date)
|
||||
)
|
||||
ga4_data = {row.date: row.activeUsers for row in session.execute(stmt_ga4).fetchall()}
|
||||
|
||||
stmt_weather = select(
|
||||
weather.c.date,
|
||||
weather.c.minTa,
|
||||
weather.c.maxTa,
|
||||
weather.c.sumRn,
|
||||
weather.c.avgRhm
|
||||
).where(
|
||||
and_(
|
||||
weather.c.date >= start_date,
|
||||
weather.c.date <= end_date,
|
||||
weather.c.stnId == 99
|
||||
)
|
||||
)
|
||||
weather_data = {row.date: row for row in session.execute(stmt_weather).fetchall()}
|
||||
|
||||
stmt_air = select(air.c.date, air.c.pm25).where(
|
||||
and_(
|
||||
air.c.date >= start_date,
|
||||
air.c.date <= end_date,
|
||||
air.c.station == '운정'
|
||||
)
|
||||
)
|
||||
air_data = {row.date: row.pm25 for row in session.execute(stmt_air).fetchall()}
|
||||
|
||||
records = []
|
||||
for d in dates:
|
||||
key = d.date() if isinstance(d, datetime) else d
|
||||
record = {
|
||||
'date': d,
|
||||
'pos_qty': pos_data.get(key, 0),
|
||||
'activeUsers': ga4_data.get(key, 0),
|
||||
'minTa': getattr(weather_data.get(key), 'minTa', 0) if weather_data.get(key) else 0,
|
||||
'maxTa': getattr(weather_data.get(key), 'maxTa', 0) if weather_data.get(key) else 0,
|
||||
'sumRn': getattr(weather_data.get(key), 'sumRn', 0) if weather_data.get(key) else 0,
|
||||
'avgRhm': getattr(weather_data.get(key), 'avgRhm', 0) if weather_data.get(key) else 0,
|
||||
'pm25': air_data.get(key, 0)
|
||||
}
|
||||
records.append(record)
|
||||
|
||||
df = pd.DataFrame(records)
|
||||
df = add_korean_holiday_feature(df)
|
||||
df = fix_zero_visitors_weighted(df)
|
||||
df['weekday'] = df['date'].dt.weekday
|
||||
return df
|
||||
|
||||
def prepare_prophet_df(df):
|
||||
prophet_df = pd.DataFrame({
|
||||
'ds': df['date'],
|
||||
'y': df['pos_qty'].astype(float),
|
||||
'minTa': df['minTa'].astype(float),
|
||||
'maxTa': df['maxTa'].astype(float),
|
||||
'sumRn': df['sumRn'].astype(float),
|
||||
'avgRhm': df['avgRhm'].astype(float),
|
||||
'pm25': df['pm25'].astype(float),
|
||||
'is_holiday': df['is_holiday'].astype(int)
|
||||
})
|
||||
return prophet_df
|
||||
|
||||
def train_and_predict_prophet(prophet_df, forecast_days=7):
|
||||
prophet_df = fix_zero_visitors_weighted(prophet_df)
|
||||
prophet_df.fillna({
|
||||
'minTa': 0,
|
||||
'maxTa': 0,
|
||||
'sumRn': 0,
|
||||
'avgRhm': 0,
|
||||
'pm25': 0,
|
||||
'is_holiday': 0
|
||||
}, inplace=True)
|
||||
|
||||
m = Prophet(weekly_seasonality=True, yearly_seasonality=True, daily_seasonality=False)
|
||||
m.add_regressor('minTa')
|
||||
m.add_regressor('maxTa')
|
||||
m.add_regressor('sumRn')
|
||||
m.add_regressor('avgRhm')
|
||||
m.add_regressor('pm25')
|
||||
m.add_regressor('is_holiday')
|
||||
|
||||
m.fit(prophet_df)
|
||||
future = m.make_future_dataframe(periods=forecast_days)
|
||||
future_dates = future['ds'].dt.strftime('%Y%m%d').tolist()
|
||||
|
||||
weekly_precip = get_weekly_precip(serviceKey) # {'YYYYMMDD': {'sumRn': x, 'minTa': y, 'maxTa': z, 'avgRhm': w}, ...}
|
||||
|
||||
# 미래 데이터에 강수량 및 기온/습도 반영
|
||||
sumRn_list = []
|
||||
minTa_list = []
|
||||
maxTa_list = []
|
||||
avgRhm_list = []
|
||||
for dt_str in future_dates:
|
||||
day_forecast = weekly_precip.get(dt_str, None)
|
||||
if day_forecast:
|
||||
sumRn_list.append(float(day_forecast.get('sumRn', 0)))
|
||||
minTa_list.append(float(day_forecast.get('minTa', 0)))
|
||||
maxTa_list.append(float(day_forecast.get('maxTa', 0)))
|
||||
avgRhm_list.append(float(day_forecast.get('avgRhm', 0)))
|
||||
else:
|
||||
sumRn_list.append(0)
|
||||
minTa_list.append(0)
|
||||
maxTa_list.append(0)
|
||||
avgRhm_list.append(0)
|
||||
|
||||
future['sumRn'] = sumRn_list
|
||||
future['minTa'] = minTa_list
|
||||
future['maxTa'] = maxTa_list
|
||||
future['avgRhm'] = avgRhm_list
|
||||
|
||||
# pm25는 과거 마지막 데이터 복사
|
||||
last_known = prophet_df.iloc[-1]
|
||||
future['pm25'] = last_known['pm25']
|
||||
|
||||
# is_holiday 계산
|
||||
future['is_holiday'] = future['ds'].apply(lambda d: 1 if is_korean_holiday(d.date()) else 0)
|
||||
|
||||
forecast = m.predict(future)
|
||||
|
||||
# 예측 결과 저장
|
||||
output_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data', 'prophet_result.csv'))
|
||||
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
||||
|
||||
df_to_save = forecast[['ds', 'yhat']].copy()
|
||||
df_to_save.columns = ['date', 'visitor_forecast']
|
||||
df_to_save['date'] = df_to_save['date'].dt.strftime("%Y-%m-%d")
|
||||
|
||||
# 오늘 날짜 이후 데이터만 필터링
|
||||
today_str = date.today().strftime("%Y-%m-%d")
|
||||
df_to_save = df_to_save[df_to_save['date'] >= today_str]
|
||||
|
||||
# visitor_forecast를 정수로 변환
|
||||
df_to_save['visitor_forecast'] = df_to_save['visitor_forecast'].round().astype(int)
|
||||
|
||||
df_to_save.to_csv(output_path, index=False)
|
||||
|
||||
return forecast
|
||||
|
||||
def train_and_predict_arima(ts, forecast_days=7):
|
||||
model = ARIMA(ts, order=(5,1,0))
|
||||
model_fit = model.fit()
|
||||
forecast = model_fit.forecast(steps=forecast_days)
|
||||
return forecast
|
||||
|
||||
def train_and_predict_rf(df, forecast_days=7):
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
df = df.copy()
|
||||
df['weekday'] = df['date'].dt.weekday
|
||||
X = df[['weekday', 'minTa', 'maxTa', 'sumRn', 'avgRhm', 'pm25']]
|
||||
y = df['pos_qty']
|
||||
model = RandomForestRegressor(n_estimators=100, random_state=42)
|
||||
model.fit(X, y)
|
||||
future_dates = pd.date_range(df['date'].max() + timedelta(days=1), periods=forecast_days)
|
||||
future_df = pd.DataFrame({
|
||||
'date': future_dates,
|
||||
'weekday': future_dates.weekday,
|
||||
'minTa': 0,
|
||||
'maxTa': 0,
|
||||
'sumRn': 0,
|
||||
'avgRhm': 0,
|
||||
'pm25': 0
|
||||
})
|
||||
future_df['pos_qty'] = model.predict(future_df[['weekday', 'minTa', 'maxTa', 'sumRn', 'avgRhm', 'pm25']])
|
||||
return future_df
|
||||
|
||||
def main():
|
||||
today = datetime.today().date()
|
||||
start_date = today - timedelta(days=365)
|
||||
end_date = today
|
||||
|
||||
with Session(db.engine) as session:
|
||||
df = load_data(session, start_date, end_date)
|
||||
|
||||
prophet_df = prepare_prophet_df(df)
|
||||
forecast_days = 7
|
||||
|
||||
forecast = train_and_predict_prophet(prophet_df, forecast_days)
|
||||
|
||||
# 예측 후 정수 변환
|
||||
forecast['yhat'] = forecast['yhat'].round().astype(int)
|
||||
forecast['yhat_lower'] = forecast['yhat_lower'].round().astype(int)
|
||||
forecast['yhat_upper'] = forecast['yhat_upper'].round().astype(int)
|
||||
|
||||
# 강수량 정보 포함 출력 (오늘 이후는 날씨 예보 데이터로 덮음)
|
||||
weekly_precip = get_weekly_precip(serviceKey)
|
||||
|
||||
# 최근 10일 예측 결과 출력
|
||||
output_df = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(10).copy()
|
||||
output_df.columns = ['날짜', '예상 방문객', '하한', '상한']
|
||||
|
||||
print("이번 주 강수 예보:")
|
||||
for dt_str, val in weekly_precip.items():
|
||||
print(f"{dt_str}: 강수량={val['sumRn']:.1f}mm, 최저기온={val['minTa']}, 최고기온={val['maxTa']}, 습도={val['avgRhm']:.1f}%")
|
||||
|
||||
print("\n예측 방문객:")
|
||||
print(output_df.to_string(index=False))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user