# static/lib/weekly_visitor_forecast_prophet.py

#weekly_visitor_forecast_prophet.py
import os, sys
import re, requests
from sqlalchemy import select, and_, func
from sqlalchemy.orm import Session
from prophet import Prophet
from statsmodels.tsa.arima.model import ARIMA
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd
from datetime import date, datetime, timedelta

# 경로 설정: 프로젝트 루트 conf 폴더 내 db 및 스키마 모듈 임포트
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from conf import db, db_schema
from weather_forecast import get_weekly_precip  # 변경된 날씨 예보 함수 임포트
from lib.holiday import is_korean_holiday  # holiday.py의 DB 기반 휴일 판단 함수
from lib.common import load_config

# Initialize DB table objects
pos = db_schema.pos
ga4 = db_schema.ga4_by_date
weather = db_schema.weather
air = db_schema.air

# Configuration is loaded once, at import time.
config = load_config()
# Service key that this module passes to get_weekly_precip().
serviceKey = config['DATA_API']['serviceKey']
# Values matched against pos.ca03 when load_data() sums ticket quantities.
VISITOR_CA = tuple(config['POS']['VISITOR_CA'])


# --- 데이터 로딩 및 전처리 ---

def get_date_range(start_date, end_date):
    """Return every day from ``start_date`` through ``end_date`` (inclusive).

    The days are produced by ``pd.date_range`` and returned as a plain list
    of ``datetime.datetime`` objects.
    """
    day_index = pd.date_range(start_date, end_date)
    as_datetimes = day_index.to_pydatetime()
    return list(as_datetimes)

def add_korean_holiday_feature(df):
    """Add a 0/1 ``is_holiday`` column to ``df`` and return the same frame.

    Each value of ``df['date']`` is converted with ``.date()`` and passed to
    ``is_korean_holiday``; a truthy answer is stored as 1, anything else as 0.
    The column is written onto the frame that was passed in (no copy is made).
    """
    def _holiday_flag(stamp):
        if is_korean_holiday(stamp.date()):
            return 1
        return 0

    df['is_holiday'] = df['date'].apply(_holiday_flag)
    return df

def fix_zero_visitors_weighted(df):
    """Replace zero visitor counts with the mean of comparable days.

    A day whose ``pos_qty`` is exactly 0 is replaced by the mean of the
    strictly positive ``pos_qty`` values that share its calendar month
    (``YYYY-MM`` of ``date``) and its ``is_holiday`` flag. When no such
    positive day exists the zero is kept.

    Prophet-style frames are accepted: if ``date`` / ``pos_qty`` are missing
    they are created from ``ds`` / ``y``, and ``y`` (when present) is kept in
    sync with the corrected ``pos_qty``.

    Args:
        df: DataFrame with a datetime ``date`` (or ``ds``) column, a numeric
            ``pos_qty`` (or ``y``) column and an ``is_holiday`` column.

    Returns:
        A corrected copy of ``df``; the input frame is not modified. When at
        least one zero is replaced, ``pos_qty`` is returned as float so the
        fractional mean is preserved.

    Raises:
        ValueError: if ``df`` has no ``is_holiday`` column.
    """
    df = df.copy()
    if 'date' not in df.columns and 'ds' in df.columns:
        df['date'] = df['ds']
    if 'pos_qty' not in df.columns and 'y' in df.columns:
        df['pos_qty'] = df['y']
    if 'is_holiday' not in df.columns:
        raise ValueError("DataFrame에 'is_holiday' 컬럼이 필요합니다.")

    # Kept as a local Series (not a temporary column) so that a caller-owned
    # 'year_month' column is never overwritten or dropped.
    year_month = df['date'].dt.strftime('%Y-%m')
    qty = df['pos_qty']
    zero_mask = (qty == 0).to_numpy()

    if zero_mask.any():
        # Mean of the positive days per (month, holiday flag), broadcast back
        # to every row; groups without any positive day yield NaN.
        positive = qty.where(qty > 0).astype(float)
        group_keys = [year_month.to_numpy(), df['is_holiday'].to_numpy()]
        group_mean = positive.groupby(group_keys).transform('mean')

        fillable = zero_mask & group_mean.notna().to_numpy()
        if fillable.any():
            # Work in float: writing the mean into an integer array would
            # silently truncate it (e.g. 12.5 -> 12).
            df['pos_qty'] = np.where(
                fillable,
                group_mean.to_numpy(),
                qty.astype(float).to_numpy(),
            )

    if 'y' in df.columns:
        df['y'] = df['pos_qty']
    return df

def load_data(session, start_date, end_date):
    """Build one row per day between ``start_date`` and ``end_date``.

    Four queries are run through ``session``, each restricted to the date
    range:

    * ``pos``: sum of ``qty`` per date where ``ca01 == '매표소'`` and ``ca03``
      is one of ``VISITOR_CA``.
    * ``ga4``: ``activeUsers`` per date.
    * ``weather``: ``minTa``, ``maxTa``, ``sumRn``, ``avgRhm`` for
      ``stnId == 99``.
    * ``air``: ``pm25`` for station ``'운정'``.

    Days with no matching row get 0 for the corresponding values. The frame
    is then passed through ``add_korean_holiday_feature`` and
    ``fix_zero_visitors_weighted``, and a ``weekday`` column is added.

    Returns:
        DataFrame with columns ``date``, ``pos_qty``, ``activeUsers``,
        ``minTa``, ``maxTa``, ``sumRn``, ``avgRhm``, ``pm25``,
        ``is_holiday`` and ``weekday``.
    """
    days = get_date_range(start_date, end_date)

    def _run(statement):
        return session.execute(statement).fetchall()

    visitor_query = (
        select(pos.c.date, func.sum(pos.c.qty).label('pos_qty'))
        .where(and_(
            pos.c.date >= start_date,
            pos.c.date <= end_date,
            pos.c.ca01 == '매표소',
            pos.c.ca03.in_(VISITOR_CA),
        ))
        .group_by(pos.c.date)
    )
    visitors_by_day = {row.date: row.pos_qty for row in _run(visitor_query)}

    ga4_query = select(ga4.c.date, ga4.c.activeUsers).where(
        and_(ga4.c.date >= start_date, ga4.c.date <= end_date)
    )
    users_by_day = {row.date: row.activeUsers for row in _run(ga4_query)}

    weather_query = select(
        weather.c.date,
        weather.c.minTa,
        weather.c.maxTa,
        weather.c.sumRn,
        weather.c.avgRhm,
    ).where(and_(
        weather.c.date >= start_date,
        weather.c.date <= end_date,
        weather.c.stnId == 99,
    ))
    weather_by_day = {row.date: row for row in _run(weather_query)}

    air_query = select(air.c.date, air.c.pm25).where(and_(
        air.c.date >= start_date,
        air.c.date <= end_date,
        air.c.station == '운정',
    ))
    pm25_by_day = {row.date: row.pm25 for row in _run(air_query)}

    def _weather_field(day_key, field):
        # 0 when there is no weather row for the day or it lacks the field.
        weather_row = weather_by_day.get(day_key)
        return getattr(weather_row, field, 0) if weather_row else 0

    def _record(day):
        # The lookup dicts are indexed with the date part of each datetime.
        day_key = day.date() if isinstance(day, datetime) else day
        return {
            'date': day,
            'pos_qty': visitors_by_day.get(day_key, 0),
            'activeUsers': users_by_day.get(day_key, 0),
            'minTa': _weather_field(day_key, 'minTa'),
            'maxTa': _weather_field(day_key, 'maxTa'),
            'sumRn': _weather_field(day_key, 'sumRn'),
            'avgRhm': _weather_field(day_key, 'avgRhm'),
            'pm25': pm25_by_day.get(day_key, 0),
        }

    df = pd.DataFrame([_record(day) for day in days])
    df = fix_zero_visitors_weighted(add_korean_holiday_feature(df))
    df['weekday'] = df['date'].dt.weekday
    return df

def prepare_prophet_df(df):
    """Convert the daily frame into the column layout used for Prophet.

    ``date`` becomes ``ds`` and ``pos_qty`` becomes ``y`` (as float); the
    weather and air columns are cast to float and ``is_holiday`` to int.

    Returns:
        A new DataFrame with columns ``ds``, ``y``, ``minTa``, ``maxTa``,
        ``sumRn``, ``avgRhm``, ``pm25``, ``is_holiday``.
    """
    float_regressors = ('minTa', 'maxTa', 'sumRn', 'avgRhm', 'pm25')

    columns = {
        'ds': df['date'],
        'y': df['pos_qty'].astype(float),
    }
    for name in float_regressors:
        columns[name] = df[name].astype(float)
    columns['is_holiday'] = df['is_holiday'].astype(int)

    return pd.DataFrame(columns)

# Weather regressors for which get_weekly_precip() can supply forecast values.
_FORECAST_WEATHER_COLS = ('minTa', 'maxTa', 'sumRn', 'avgRhm')


def _forecast_value(weekly_precip, date_key, field):
    """Return the forecast value for one day/field as float, or NaN if absent."""
    day_forecast = weekly_precip.get(date_key)
    if not day_forecast:
        return np.nan
    value = day_forecast.get(field)
    return np.nan if value is None else float(value)


def _build_future_frame(model, history, weekly_precip, forecast_days):
    """Build the prediction frame: observed regressors for past days, forecast for upcoming ones."""
    # Prophet's make_future_dataframe returns the training dates followed by
    # `forecast_days` new dates, so every regressor must be filled for both.
    future = model.make_future_dataframe(periods=forecast_days)

    # Start from the values the model was trained on so historical rows are
    # predicted with their real inputs rather than zeros.
    known = history.assign(ds=pd.to_datetime(history['ds']))
    known = known.drop_duplicates(subset='ds', keep='last').set_index('ds')
    for col in _FORECAST_WEATHER_COLS + ('pm25',):
        future[col] = future['ds'].map(known[col])

    # Overlay the weather forecast wherever it has a value for the day; days
    # with neither an observation nor a forecast fall back to 0.
    date_keys = future['ds'].dt.strftime('%Y%m%d')
    for col in _FORECAST_WEATHER_COLS:
        predicted = date_keys.map(
            lambda key, col=col: _forecast_value(weekly_precip, key, col)
        )
        future[col] = predicted.fillna(future[col]).fillna(0).astype(float)

    # No pm25 forecast is available: upcoming days reuse the last history row.
    future['pm25'] = future['pm25'].fillna(history.iloc[-1]['pm25'])

    # Compute is_holiday
    future['is_holiday'] = future['ds'].apply(
        lambda d: 1 if is_korean_holiday(d.date()) else 0
    )
    return future


def _save_forecast_csv(forecast):
    """Write rows dated today or later to ../data/prophet_result.csv as integer counts."""
    output_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..', 'data', 'prophet_result.csv')
    )
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    df_to_save = forecast[['ds', 'yhat']].copy()
    df_to_save.columns = ['date', 'visitor_forecast']
    df_to_save['date'] = df_to_save['date'].dt.strftime("%Y-%m-%d")

    # ISO-formatted dates compare correctly as strings. Copy the filtered
    # frame so the assignment below does not write into a slice.
    today_str = date.today().strftime("%Y-%m-%d")
    df_to_save = df_to_save[df_to_save['date'] >= today_str].copy()

    df_to_save['visitor_forecast'] = df_to_save['visitor_forecast'].round().astype(int)
    df_to_save.to_csv(output_path, index=False)


def train_and_predict_prophet(prophet_df, forecast_days=7):
    """Fit Prophet with weather/air/holiday regressors and forecast visitors.

    Zero visitor counts are first corrected with
    ``fix_zero_visitors_weighted`` and missing regressor values are filled
    with 0. The model uses weekly and yearly seasonality plus the regressors
    ``minTa``, ``maxTa``, ``sumRn``, ``avgRhm``, ``pm25`` and ``is_holiday``.

    For prediction, historical rows keep the regressor values they were
    trained on. Days present in the ``get_weekly_precip`` result (keyed by
    ``'YYYYMMDD'``) take their weather values from that forecast. Upcoming
    days without a forecast use 0 for weather. ``pm25`` for upcoming days is
    copied from the last history row.

    Side effect: rows dated today or later are written to
    ``../data/prophet_result.csv`` (relative to this file) with columns
    ``date`` and ``visitor_forecast``.

    Args:
        prophet_df: frame with ``ds``, ``y`` and the regressor columns, as
            produced by ``prepare_prophet_df``.
        forecast_days: number of days to forecast past the last history date.

    Returns:
        The DataFrame returned by ``Prophet.predict`` for history plus the
        forecast horizon.
    """
    prophet_df = fix_zero_visitors_weighted(prophet_df)
    prophet_df.fillna({
        'minTa': 0,
        'maxTa': 0,
        'sumRn': 0,
        'avgRhm': 0,
        'pm25': 0,
        'is_holiday': 0
    }, inplace=True)

    m = Prophet(weekly_seasonality=True, yearly_seasonality=True, daily_seasonality=False)
    for regressor in ('minTa', 'maxTa', 'sumRn', 'avgRhm', 'pm25', 'is_holiday'):
        m.add_regressor(regressor)
    m.fit(prophet_df)

    # {'YYYYMMDD': {'sumRn': x, 'minTa': y, 'maxTa': z, 'avgRhm': w}, ...}
    # Treat an empty/None result as "no forecast available".
    weekly_precip = get_weekly_precip(serviceKey) or {}

    future = _build_future_frame(m, prophet_df, weekly_precip, forecast_days)
    forecast = m.predict(future)

    _save_forecast_csv(forecast)
    return forecast

def train_and_predict_arima(ts, forecast_days=7):
    """Fit an ARIMA model of order (5, 1, 0) on ``ts`` and forecast ahead.

    Returns:
        The result of the fitted model's ``forecast`` for ``forecast_days``
        steps.
    """
    arima_order = (5, 1, 0)
    fitted = ARIMA(ts, order=arima_order).fit()
    return fitted.forecast(steps=forecast_days)

def train_and_predict_rf(df, forecast_days=7):
    """Fit a random forest on weekday/weather/air features and predict ahead.

    The model (100 trees, ``random_state=42``) is trained on ``weekday``,
    ``minTa``, ``maxTa``, ``sumRn``, ``avgRhm`` and ``pm25`` against
    ``pos_qty``. The prediction frame covers the ``forecast_days`` days after
    the latest ``date`` in ``df``. Only ``weekday`` varies there; the weather
    and ``pm25`` features are all set to 0.

    Returns:
        DataFrame with ``date``, the feature columns and the predicted
        ``pos_qty``.
    """
    from sklearn.ensemble import RandomForestRegressor

    feature_cols = ['weekday', 'minTa', 'maxTa', 'sumRn', 'avgRhm', 'pm25']

    train = df.copy()
    train['weekday'] = train['date'].dt.weekday

    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    forest.fit(train[feature_cols], train['pos_qty'])

    first_day = train['date'].max() + timedelta(days=1)
    horizon = pd.date_range(first_day, periods=forecast_days)

    # Every non-weekday feature is a constant 0 in the prediction frame.
    zero_features = {name: 0 for name in feature_cols[1:]}
    future_df = pd.DataFrame({
        'date': horizon,
        'weekday': horizon.weekday,
        **zero_features,
    })
    future_df['pos_qty'] = forest.predict(future_df[feature_cols])
    return future_df

def main():
    """Run the forecast for the past 365 days of data and print the result.

    Loads the daily data up to today, fits the Prophet model for a 7-day
    horizon, rounds the predictions to integers, then prints the weather
    forecast returned by ``get_weekly_precip`` followed by the last 10 rows
    of the visitor forecast.
    """
    end_date = date.today()
    start_date = end_date - timedelta(days=365)

    with Session(db.engine) as session:
        df = load_data(session, start_date, end_date)

    forecast_days = 7
    forecast = train_and_predict_prophet(prepare_prophet_df(df), forecast_days)

    # Convert the point estimate and its bounds to integers after prediction.
    for column in ('yhat', 'yhat_lower', 'yhat_upper'):
        forecast[column] = forecast[column].round().astype(int)

    # Weather forecast, fetched for display alongside the visitor forecast.
    weekly_precip = get_weekly_precip(serviceKey)

    # Last 10 rows of the prediction, with display column names.
    output_df = (
        forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
        .tail(10)
        .rename(columns={
            'ds': '날짜',
            'yhat': '예상 방문객',
            'yhat_lower': '하한',
            'yhat_upper': '상한',
        })
    )

    print("이번 주 강수 예보:")
    for day_key, day in weekly_precip.items():
        print(f"{day_key}: 강수량={day['sumRn']:.1f}mm, 최저기온={day['minTa']}, 최고기온={day['maxTa']}, 습도={day['avgRhm']:.1f}%")

    print("\n예측 방문객:")
    print(output_df.to_string(index=False))

# Run the forecast only when this file is executed as a script.
if __name__ == '__main__':
    main()