diff --git a/conf/__init__.py b/conf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/__init__.py b/lib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/biz_crawler.py b/lib/biz_crawler.py index 345564b..a4b9a2f 100644 --- a/lib/biz_crawler.py +++ b/lib/biz_crawler.py @@ -1,9 +1,6 @@ import os, sys -import re -import pickle import time -from datetime import datetime, timedelta -import undetected_chromedriver as uc +from datetime import datetime from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC @@ -15,59 +12,26 @@ from conf.config import ( COOKIE_FILE, DEBUG, MESSAGE_PLATFORMS, MATTERMOST_URL, MATTERMOST_BOT_TOKEN, MATTERMOST_CHANNEL_ID ) - from lib.send_message import MessageSender - -def get_start_end_dates(): - if DEBUG: - # 문자열 → datetime.date 변환 - start = datetime.strptime(CFG_START, "%Y-%m-%d").date() - end = datetime.strptime(CFG_END, "%Y-%m-%d").date() - return start, end - - today = datetime.today() - weekday = today.weekday() # 0 = Monday - if weekday == 0: # 월요일 → 금~일 - start = today - timedelta(days=3) - end = today - timedelta(days=1) - else: # 그 외 요일 → 어제 - start = end = today - timedelta(days=1) - return start.date(), end.date() +from lib.lib import ( + create_mobile_driver, + save_cookies, + load_cookies, + get_start_end_dates, + send_failure_message, + clean_html_text +) class NaverReviewCollector: def __init__(self, headless=HEADLESS): self.headless = headless self.driver = None self.total_reviews = 0 - self.start_date, self.end_date = get_start_end_dates() - self.reviews_by_place = {} + self.start_date, self.end_date = get_start_end_dates(DEBUG, CFG_START, CFG_END) + self.reviews_by_place = {} def create_driver(self): - options = uc.ChromeOptions() - options.add_argument('--window-size=375,812') - if self.headless: - options.add_argument("--headless=new") - options.add_argument("--disable-gpu") - - options.add_argument("--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) " - "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15A372 Safari/604.1") - self.driver = uc.Chrome(options=options) - self.driver.set_window_size(375, 812) - - def save_cookies(self): - cookies = self.driver.get_cookies() - for c in cookies: - c.pop("sameSite", None) - if "expiry" in c: - c["expires"] = c.pop("expiry") - with open(COOKIE_FILE, "wb") as f: - pickle.dump(cookies, f) - - def load_cookies(self): - with open(COOKIE_FILE, "rb") as f: - cookies = pickle.load(f) - for cookie in cookies: - self.driver.add_cookie(cookie) + self.driver = create_mobile_driver(self.headless) def perform_login(self): wait = WebDriverWait(self.driver, 20) @@ -92,7 +56,7 @@ class NaverReviewCollector: if "captcha" in self.driver.page_source.lower(): input("CAPTCHA 수동 입력 후 Enter: ") - self.save_cookies() + save_cookies(self.driver, COOKIE_FILE) return True def is_login_required(self): @@ -158,15 +122,13 @@ class NaverReviewCollector: continue html = content_el.get_attribute("innerHTML") - html = re.sub(r'', '\n', html) - html = re.sub(r'.*?<\/span>', '', html, flags=re.DOTALL) - html = re.sub(r'<.*?>', '', html) + text = clean_html_text(html) reviews.append({ "작성자": author, "방문일": visit_date, "작성일": written_date, - "내용": html.strip() + "내용": text }) except: @@ -174,44 +136,35 @@ class NaverReviewCollector: except: pass return reviews - - # 메시지 보내기 + def send_to_message(self): today_str = datetime.today().strftime("%Y년 %m월 %d일") - now_str = datetime.now().strftime("%H:%M:%S") # 현재 시각 + now_str = datetime.now().strftime("%H:%M:%S") lines = [f"##### {today_str} 네이버 리뷰 크롤링 결과", ""] - lines.append(f"**수집 시간 :** {now_str}") # 수집 시간 추가 + lines.append(f"**수집 시간 :** {now_str}") lines.append(f"**총 리뷰 수 :** {self.total_reviews}") lines.append("") for place_name, reviews in self.reviews_by_place.items(): lines.append(f"- {place_name}: {len(reviews)}건 ") - lines.append("") - lines.append("---") - lines.append("") + lines.append("\n---\n") for idx, (place_name, reviews) in enumerate(self.reviews_by_place.items(), start=1): lines.append(f"**{idx}. {place_name}** ") lines.append("") - if not reviews: - lines.append("- 리뷰 없음") - lines.append("") + lines.append("- 리뷰 없음\n") else: for r in reviews: lines.append(f"- **작성일** : {r['작성일']} ") lines.append(f" **방문일** : {r['방문일']} ") lines.append(f" **작성자** : {r['작성자']} ") - lines.append(f" **내용** : {r['내용']}") - lines.append("") - + lines.append(f" **내용** : {r['내용']}\n") lines.append("---") - lines.append("") message = "\n".join(lines) - if not MESSAGE_PLATFORMS: - print("[WARN] 메시지 전송 플랫폼이 지정되지 않아 메시지를 발송하지 않습니다.") + print("[WARN] 메시지 전송 플랫폼이 지정되지 않음. 미전송") print(f"[DEBUG] {message}") return @@ -221,13 +174,12 @@ class NaverReviewCollector: mattermost_channel_id=MATTERMOST_CHANNEL_ID, ) - if not DEBUG: - sender.send(message, platforms=MESSAGE_PLATFORMS, use_webhook=False) - else: + if DEBUG: print(f"[DEBUG] message platform : {MESSAGE_PLATFORMS}") - print("[DEBUG] 디버그 모드에서는 메시지를 발송하지 않습니다.") + print("[DEBUG] 디버그 모드 메시지 미전송") print(f"[DEBUG] {message}") - + else: + sender.send(message, platforms=MESSAGE_PLATFORMS, use_webhook=False) def run(self): self.create_driver() @@ -236,7 +188,7 @@ class NaverReviewCollector: if os.path.exists(COOKIE_FILE): try: - self.load_cookies() + load_cookies(self.driver, COOKIE_FILE) self.driver.get("https://naver.com") time.sleep(1) except: @@ -259,7 +211,6 @@ class NaverReviewCollector: for biz_id in BIZ_ID: place_name = self.access_review_page(biz_id) print(f"\n=== [{place_name}({biz_id})] 리뷰 수집 시작 ===") - if self.is_login_required(): print("[WARN] 세션 만료 또는 쿠키 무효. 로그인 재진행") os.remove(COOKIE_FILE) @@ -271,33 +222,24 @@ class NaverReviewCollector: reviews = self.extract_reviews() print(f"[RESULT] 리뷰 {len(reviews)}개 수집됨") self.total_reviews += len(reviews) - - # 플레이스별 리뷰 저장 self.reviews_by_place[place_name] = reviews - except Exception as e: print(f"[ERROR] {biz_id} 처리 중 오류:", e) self.reviews_by_place[place_name] = [] self.driver.quit() - # 리뷰 수집 시도 자체가 실패했을 경우 (빈 딕셔너리) if not self.reviews_by_place: - from lib.send_message import MessageSender - from conf.config import MESSAGE_PLATFORMS, MATTERMOST_URL, MATTERMOST_BOT_TOKEN, MATTERMOST_CHANNEL_ID - sender = MessageSender( mattermost_url=MATTERMOST_URL, mattermost_bot_token=MATTERMOST_BOT_TOKEN, mattermost_channel_id=MATTERMOST_CHANNEL_ID, ) - - for platform in MESSAGE_PLATFORMS: - sender.send("# ❌ 리뷰 수집 실패: 플레이스 접근 또는 파싱 오류", platform=platform, use_webhook=False) - + send_failure_message(sender, MESSAGE_PLATFORMS) else: self.send_to_message() + if __name__ == "__main__": collector = NaverReviewCollector(headless=HEADLESS) collector.run() diff --git a/lib/lib.py b/lib/lib.py new file mode 100644 index 0000000..a12e0b3 --- /dev/null +++ b/lib/lib.py @@ -0,0 +1,112 @@ +# lib/lib.py + +import os +import sys +import time +import pickle +import re +from datetime import datetime, timedelta +import undetected_chromedriver as uc +from selenium.webdriver.common.by import By + +# 공통 설정 경로 추가 (필요 시) +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +# ───────────────────────────────────────────── +# ✅ 드라이버 생성 함수 +# ───────────────────────────────────────────── +def create_mobile_driver(headless=True): + options = uc.ChromeOptions() + options.add_argument('--window-size=375,812') + if headless: + options.add_argument('--headless=new') + options.add_argument('--disable-gpu') + + options.add_argument("--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15A372 Safari/604.1") + driver = uc.Chrome(options=options) + driver.set_window_size(375, 812) + return driver + +# ───────────────────────────────────────────── +# ✅ 쿠키 저장 및 로드 +# ───────────────────────────────────────────── +def save_cookies(driver, cookie_file): + cookies = driver.get_cookies() + for c in cookies: + c.pop("sameSite", None) + if "expiry" in c: + c["expires"] = c.pop("expiry") + with open(cookie_file, "wb") as f: + pickle.dump(cookies, f) + +def load_cookies(driver, cookie_file): + with open(cookie_file, "rb") as f: + cookies = pickle.load(f) + for cookie in cookies: + driver.add_cookie(cookie) + +# ───────────────────────────────────────────── +# ✅ 날짜 계산 유틸리티 +# ───────────────────────────────────────────── +def get_start_end_dates(debug, cfg_start, cfg_end): + if debug: + return ( + datetime.strptime(cfg_start, "%Y-%m-%d").date(), + datetime.strptime(cfg_end, "%Y-%m-%d").date(), + ) + + today = datetime.today() + weekday = today.weekday() + if weekday == 0: + start = today - timedelta(days=3) + end = today - timedelta(days=1) + else: + start = end = today - timedelta(days=1) + return start.date(), end.date() + +# ───────────────────────────────────────────── +# ✅ 메시지 전송 실패 시 기본 메시지 전송 +# ───────────────────────────────────────────── +def send_failure_message(sender, platforms): + for platform in platforms: + sender.send("# ❌ 리뷰 수집 실패: 플레이스 접근 또는 파싱 오류", platform=platform, use_webhook=False) + +# ───────────────────────────────────────────── +# ✅ HTML 본문 정리 (리뷰 등) +# ───────────────────────────────────────────── +def clean_html_text(html): + html = re.sub(r'', '\n', html) + html = re.sub(r'.*?<\/span>', '', html, flags=re.DOTALL) + html = re.sub(r'<.*?>', '', html) + return html.strip() + +def parse_korean_date(date_str): + try: + date_clean = " ".join(date_str.strip().split(" ")[:3]) # 요일 제거 + return datetime.strptime(date_clean, "%Y년 %m월 %d일").date() + except Exception as e: + print(f"[WARN] 날짜 파싱 실패: {date_str} ({e})") + return None + + +def click_more(driver): + try: + container = driver.find_element(By.CLASS_NAME, "place_section_content") + more_div = container.find_element(By.XPATH, "./following-sibling::div[1]") + more_btn = more_div.find_element(By.TAG_NAME, "a") + driver.execute_script("arguments[0].click();", more_btn) + time.sleep(2) # 클릭 후 대기 + return True + except Exception: + return False + +def extract_shop_name(driver): + try: + main = driver.find_element(By.CSS_SELECTOR, 'div[role="main"]') + title = main.find_element(By.ID, "_title") + name = title.find_element(By.TAG_NAME, "span").text.strip() + return name + except Exception as e: + print(f"[WARN] 업체명 추출 실패: {e}") + return "업체명 없음" diff --git a/lib/naver_review_crawler.py b/lib/naver_review_crawler.py new file mode 100644 index 0000000..2ce6b36 --- /dev/null +++ b/lib/naver_review_crawler.py @@ -0,0 +1,159 @@ +import os, sys +from datetime import datetime +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from conf.config import ( + PLACE_IDS, START_DATE, END_DATE, DEBUG, + MESSAGE_PLATFORMS, MATTERMOST_URL, MATTERMOST_BOT_TOKEN, MATTERMOST_CHANNEL_ID + ) +from lib.send_message import MessageSender +from lib.lib import ( + create_mobile_driver, + get_start_end_dates, + parse_korean_date, + click_more, + extract_shop_name, + clean_html_text, + send_failure_message +) + +class NaverReviewMapCollector: + def __init__(self): + self.driver = None + self.total_reviews = 0 + self.start_date, self.end_date = get_start_end_dates(DEBUG, START_DATE, END_DATE) + self.reviews_by_place = {} + + def extract_reviews(self): + reviews = [] + WebDriverWait(self.driver, 10).until( + EC.presence_of_element_located((By.ID, "_review_list")) + ) + ul = self.driver.find_element(By.ID, "_review_list") + items = ul.find_elements(By.XPATH, './/li[contains(@class, "place_apply_pui")]') + for item in items: + try: + writer = "익명" + try: + writer = item.find_element(By.XPATH, "./div[1]/a[2]/div/span/span").text.strip() + except: + pass + + date_obj = None + try: + date_text = item.find_element(By.XPATH, "./div[7]/div[2]/div/span[1]/span[2]").text.strip() + date_obj = parse_korean_date(date_text) + except: + continue + + text = "" + try: + text = item.find_element(By.XPATH, "./div[5]/a").get_attribute("innerHTML") + except: + continue + + if date_obj and (self.start_date <= date_obj <= self.end_date): + reviews.append({ + "작성자": writer, + "작성일": date_obj, + "내용": clean_html_text(text) + }) + except Exception as e: + print(f"[WARN] 리뷰 추출 실패: {e}") + return reviews + + def send_to_message(self): + today_str = datetime.today().strftime("%Y년 %m월 %d일") + now_str = datetime.now().strftime("%H:%M:%S") + lines = [f"##### {today_str} 네이버 지도 리뷰 크롤링 결과", ""] + lines.append(f"**수집 시간 :** {now_str}") + lines.append(f"**총 리뷰 수 :** {self.total_reviews}") + lines.append("") + + for place_name, reviews in self.reviews_by_place.items(): + lines.append(f"- {place_name}: {len(reviews)}건 ") + lines.append("\n---\n") + + for idx, (place_name, reviews) in enumerate(self.reviews_by_place.items(), start=1): + lines.append(f"**{idx}. {place_name}** ") + lines.append("") + if not reviews: + lines.append("- 리뷰 없음\n") + else: + for r in reviews: + lines.append(f"- **작성일** : {r['작성일']} ") + lines.append(f" **작성자** : {r['작성자']} ") + lines.append(f" **내용** : {r['내용']}\n") + lines.append("---") + + message = "\n".join(lines) + if not MESSAGE_PLATFORMS: + print("[WARN] 메시지 전송 플랫폼 없음") + print(f"[DEBUG] {message}") + return + + sender = MessageSender( + mattermost_url=MATTERMOST_URL, + mattermost_token=MATTERMOST_BOT_TOKEN, + mattermost_channel_id=MATTERMOST_CHANNEL_ID, + ) + + if DEBUG: + print("[DEBUG] 디버그 모드로 메시지 미전송") + print(message) + else: + sender.send(message, platforms=MESSAGE_PLATFORMS, use_webhook=False) + + def run(self): + self.driver = create_mobile_driver() + + for place_id in PLACE_IDS: + url = f"https://m.place.naver.com/place/{place_id}/review/visitor?reviewSort=recent" + print(f"[INFO] 접근: {url}") + self.driver.get(url) + shop_name = extract_shop_name(self.driver) + all_reviews = [] + seen = set() + + while True: + new_reviews = self.extract_reviews() + if not new_reviews: + break + + filtered = [] + for r in new_reviews: + key = (r["작성자"], r["작성일"], r["내용"]) + if key not in seen: + seen.add(key) + filtered.append(r) + + if not filtered: + break + + all_reviews.extend(filtered) + if not click_more(self.driver): + break + + print(f"[DONE] {shop_name}: {len(all_reviews)}건 수집") + self.total_reviews += len(all_reviews) + self.reviews_by_place[shop_name] = all_reviews + + self.driver.quit() + + if not self.reviews_by_place: + sender = MessageSender( + mattermost_url=MATTERMOST_URL, + mattermost_token=MATTERMOST_BOT_TOKEN, + mattermost_channel_id=MATTERMOST_CHANNEL_ID, + ) + send_failure_message(sender, MESSAGE_PLATFORMS) + else: + self.send_to_message() + +if __name__ == "__main__": + collector = NaverReviewMapCollector() + collector.run() diff --git a/run.py b/run.py new file mode 100644 index 0000000..3c59c2c --- /dev/null +++ b/run.py @@ -0,0 +1,28 @@ +import os +import sys +from dotenv import load_dotenv + + +# 환경 변수 로드 +load_dotenv() + +# 프로젝트 루트 기준 경로 추가 +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'lib'))) + +# 실행 모드 확인 +mode = os.getenv("MODE", "").strip().lower() + +if mode == "biz": + from lib.biz_crawler import NaverReviewCollector + print("[INFO] 비즈니스 리뷰 수집기 실행") + collector = NaverReviewCollector() + collector.run() + +elif mode == "map": + from lib.naver_review_crawler import NaverMapReviewCollector + print("[INFO] 지도 리뷰 수집기 실행") + collector = NaverMapReviewCollector() + collector.run() + +else: + print("[ERROR] .env 파일에서 MODE 값을 설정해주세요. (biz 또는 map)")