Files
naver-review-crawler/lib/lib.py

112 lines
4.7 KiB
Python

# lib/lib.py
import os, sys
import time
import pickle
import re
import undetected_chromedriver as uc
from datetime import datetime, timedelta
from selenium.webdriver.common.by import By
# 공통 설정 경로 추가 (필요 시)
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
# ─────────────────────────────────────────────
# ✅ 드라이버 생성 함수
# ─────────────────────────────────────────────
def create_mobile_driver(headless=True):
    """Start an undetected-chromedriver Chrome session with an iPhone user agent.

    Args:
        headless: When truthy, Chrome is launched with ``--headless=new``.

    Returns:
        The ``uc.Chrome`` driver, with its window set to 375x812.
    """
    # NOTE(review): the text this block was restored from had lost its
    # indentation, so it is unclear whether ``--disable-gpu`` was meant to be
    # limited to headless mode. It is applied unconditionally here -- confirm.
    mobile_user_agent = (
        "--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15A372 Safari/604.1"
    )

    # Collect the switches first, in the order they are handed to Chrome.
    chrome_args = ['--window-size=375,812']
    if headless:
        chrome_args.append('--headless=new')
    chrome_args.extend(['--disable-gpu', mobile_user_agent])

    opts = uc.ChromeOptions()
    for arg in chrome_args:
        opts.add_argument(arg)

    browser = uc.Chrome(options=opts)
    # The window is sized again after launch, in addition to --window-size.
    browser.set_window_size(375, 812)
    return browser
# ─────────────────────────────────────────────
# ✅ 쿠키 저장 및 로드
# ─────────────────────────────────────────────
def save_cookies(driver, cookie_file):
    """Pickle the driver's current cookies into ``cookie_file``.

    Before writing, every cookie dict loses its ``sameSite`` key, and an
    ``expiry`` key (when present) is renamed to ``expires``. The dicts
    returned by ``driver.get_cookies()`` are modified in place.

    Args:
        driver: Object exposing ``get_cookies()`` that returns a list of dicts.
        cookie_file: Path of the pickle file to (over)write.
    """
    jar = driver.get_cookies()
    for entry in jar:
        entry.pop("sameSite", None)
        if "expiry" not in entry:
            continue
        entry["expires"] = entry.pop("expiry")

    with open(cookie_file, "wb") as out:
        pickle.dump(jar, out)
def load_cookies(driver, cookie_file):
    """Read pickled cookies from ``cookie_file`` and add each one to ``driver``.

    Args:
        driver: Object exposing ``add_cookie(cookie)``.
        cookie_file: Path of a pickle file holding an iterable of cookies.
    """
    # NOTE(review): unpickling can execute arbitrary code embedded in the
    # file, so this should only be pointed at cookie files this tool wrote.
    with open(cookie_file, "rb") as src:
        stored = pickle.load(src)

    for item in stored:
        driver.add_cookie(item)
# ─────────────────────────────────────────────
# ✅ 날짜 계산 유틸리티
# ─────────────────────────────────────────────
def get_start_end_dates(debug, cfg_start, cfg_end):
    """Return the ``(start, end)`` date pair to collect.

    Args:
        debug: When truthy, the range comes from ``cfg_start`` / ``cfg_end``.
        cfg_start: Start date as ``YYYY-MM-DD``; only read when ``debug``.
        cfg_end: End date as ``YYYY-MM-DD``; only read when ``debug``.

    Returns:
        A tuple of two ``datetime.date`` objects. Outside debug mode the
        range is yesterday only, except on Mondays, when it runs from three
        days ago through yesterday.
    """
    if debug:
        fmt = "%Y-%m-%d"
        first, last = (
            datetime.strptime(raw, fmt).date() for raw in (cfg_start, cfg_end)
        )
        return first, last

    now = datetime.today()
    yesterday = now - timedelta(days=1)
    # weekday() == 0 is Monday: go back three days instead of one.
    range_start = now - timedelta(days=3) if now.weekday() == 0 else yesterday
    return range_start.date(), yesterday.date()
# ─────────────────────────────────────────────
# ✅ 메시지 전송 실패 시 기본 메시지 전송
# ─────────────────────────────────────────────
def send_failure_message(sender, platforms):
    """Send the fixed review-collection failure notice once per platform.

    Args:
        sender: Object exposing ``send(message, platform=..., use_webhook=...)``.
        platforms: Iterable of platform identifiers passed through to ``send``.
    """
    notice = "# ❌ 리뷰 수집 실패: 플레이스 접근 또는 파싱 오류"
    for target in platforms:
        sender.send(notice, platform=target, use_webhook=False)
# ─────────────────────────────────────────────
# ✅ HTML 본문 정리 (리뷰 등)
# ─────────────────────────────────────────────
def clean_html_text(html):
    """Reduce an HTML fragment (e.g. a review body) to plain text.

    Steps, in order:
      1. ``<br>`` / ``<br/>`` (any letter case) become newlines.
      2. ``<span class="pui__blind">...</span>`` elements are removed together
         with their content.
      3. Every remaining tag is removed, including tags whose attributes
         span several lines.
      4. Leading and trailing whitespace is trimmed.

    Args:
        html: The HTML fragment. ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned text.
    """
    if not html:
        return ""

    text = re.sub(r'<br\s*/?>', '\n', html, flags=re.IGNORECASE)
    text = re.sub(r'<span class="pui__blind">.*?<\/span>', '', text, flags=re.DOTALL)
    # [^>]* (unlike .*?) also crosses newlines, so multi-line tags are stripped.
    text = re.sub(r'<[^>]*>', '', text)
    return text.strip()
def parse_korean_date(date_str):
    """Parse a Korean date such as ``"2024년 5월 3일 금요일"`` into a date.

    The year, month and day numbers are extracted with a regular expression,
    so the ``일`` suffix after the day and any trailing weekday text are
    ignored. The previous ``strptime`` format ``"%Y년 %m월 %d"`` rejected
    inputs containing the ``일`` suffix ("unconverted data remains").

    Args:
        date_str: Text containing ``YYYY년 M월 D`` (optionally followed by
            ``일`` and more text).

    Returns:
        A ``datetime.date``, or ``None`` (after printing a warning) when the
        input is not a string, has no such pattern, or names an invalid date.
    """
    try:
        found = re.search(r'(\d{4})년\s*(\d{1,2})월\s*(\d{1,2})', date_str)
        if found is None:
            raise ValueError("no 'YYYY년 M월 D' pattern found")
        year, month, day = (int(part) for part in found.groups())
        # datetime() validates the calendar values (e.g. month 13 -> ValueError).
        return datetime(year, month, day).date()
    except (TypeError, ValueError) as e:
        print(f"[WARN] 날짜 파싱 실패: {date_str} ({e})")
        return None
def click_more(driver):
    """Click the link in the element right after the section content.

    Finds the element with class ``place_section_content``, takes its first
    following-sibling ``div``, and clicks the first ``a`` inside it through
    JavaScript, then waits two seconds.

    Args:
        driver: Selenium-style driver exposing ``find_element`` and
            ``execute_script``.

    Returns:
        ``True`` when the click was issued, ``False`` if any step raised.
    """
    try:
        section = driver.find_element(By.CLASS_NAME, "place_section_content")
        link = (
            section
            .find_element(By.XPATH, "./following-sibling::div[1]")
            .find_element(By.TAG_NAME, "a")
        )
        driver.execute_script("arguments[0].click();", link)
        time.sleep(2)  # wait after the click
    except Exception:
        # Best effort: any failure is reported to the caller as False.
        return False
    return True
def extract_shop_name(driver):
    """Read the shop name from the page's title element.

    Looks up ``div[role="main"]``, then the element with id ``_title`` inside
    it, and returns the stripped text of that element's first ``span``.

    Args:
        driver: Selenium-style driver exposing ``find_element``.

    Returns:
        The shop name, or the placeholder ``"업체명 없음"`` (after printing a
        warning) if any lookup raised.
    """
    try:
        heading = (
            driver
            .find_element(By.CSS_SELECTOR, 'div[role="main"]')
            .find_element(By.ID, "_title")
        )
        return heading.find_element(By.TAG_NAME, "span").text.strip()
    except Exception as err:
        print(f"[WARN] 업체명 추출 실패: {err}")
        return "업체명 없음"