From 8dd3bca62ceee57499507c901f30e5ca12037643 Mon Sep 17 00:00:00 2001 From: KWON Date: Thu, 10 Jul 2025 15:05:12 +0900 Subject: [PATCH] first-commit --- .env_sample | 47 +++++++ .gitignore | 2 + README.md | 17 +++ conf/config.py | 38 ++++++ lib/biz_crawler.py | 303 ++++++++++++++++++++++++++++++++++++++++++++ lib/send_message.py | 114 +++++++++++++++++ 6 files changed, 521 insertions(+) create mode 100644 .env_sample create mode 100644 .gitignore create mode 100644 README.md create mode 100644 conf/config.py create mode 100644 lib/biz_crawler.py create mode 100644 lib/send_message.py diff --git a/.env_sample b/.env_sample new file mode 100644 index 0000000..81d972a --- /dev/null +++ b/.env_sample @@ -0,0 +1,47 @@ +# 네이버 플레이스 ID (콤마로 구분하여 여러 개 입력 가능) +PLACE_IDS=12345678 + +# 네이버 비즈니스 페이지에서 보는 작성일 기준 리뷰 (콤마 구분) +BIZ_ID=1234567 + +# 각 플레이스당 최대 수집 수 +MAX_REVIEWS=100 + +# 네이버 로그인 계정 +NAVER_ID=Login_ID +NAVER_PW=Login_Password + +# 메시지 전송 플랫폼 선택 +# mattermost, synology_chat, telegram 중 선택(콤마로 구분) 또는 빈 값(발송 안함) +MESSAGE_PLATFORM=mattermost,telegram + +# Mattermost 설정 +# MATTERMOST_URL은 마지막에 '/' 없이 입력 +MATTERMOST_URL=https://example.com +# MATTERMOST_WEBHOOK_URL은 도메인을 제외한 경로, '/' 포함 +MATTERMOST_WEBHOOK_URL=/WEBHOOK_URL +MATTERMOST_CHANNEL_ID=CHANNEL_ID +MATTERMOST_BOT_TOKEN=BOT_TOKEN + +# Synology Chat Webhook URL (사용 시 설정) +SYNology_CHAT_WEBHOOK_URL=https://synology.chat/webhook/your_webhook_url + +# Telegram 설정 (사용 시 설정) +TELEGRAM_BOT_TOKEN=your_bot_token +TELEGRAM_CHAT_ID=your_chat_id + +# 설정 변경 지점 +# 헤드리스 모드 실행 여부 (True / False) +HEADLESS=True + +# 디버그 모드 여부 (True / False) +DEBUG=False + +# 필터링 시작일 (YYYY-MM-DD 형식) +START_DATE=2025-07-01 + +# 필터링 종료일 (YYYY-MM-DD 형식) +END_DATE=2025-07-10 + +# 쿠키 경로 수동지정 +#COOKIE_FILE=/path/to/naver_cookies.pkl diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c515782 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.env +**__pycache__** diff --git a/README.md b/README.md new file mode 100644 index 0000000..5db6692 
"""Runtime configuration for the Naver review crawler.

Every setting is read from the process environment at import time.

NOTE(review): nothing in this module loads a ``.env`` file; the variables
listed in ``.env_sample`` are presumably exported by whatever launches the
crawler (shell, container, service manager) -- confirm.
"""
import os


def parse_bool(value):
    """Interpret *value* as a boolean flag.

    Real ``bool`` objects are returned unchanged. Anything else is converted
    to text and is true only when it equals ``true``, ``1`` or ``yes``
    (case-insensitive, surrounding whitespace ignored).
    """
    if isinstance(value, bool):
        return value
    # strip() so that a value such as "True " still counts as true.
    return str(value).strip().lower() in ('true', '1', 'yes')


def parse_list(value):
    """Split a comma-separated string into a list of non-empty, trimmed items.

    ``None`` and the empty string give an empty list. A list or tuple is
    accepted as well and is cleaned up the same way.
    """
    if not value:
        return []
    items = value if isinstance(value, (list, tuple)) else str(value).split(',')
    return [str(item).strip() for item in items if str(item).strip()]


def parse_int(value, default):
    """Return *value* as an ``int``, or *default* when it is missing or not numeric."""
    try:
        return int(str(value).strip())
    except (TypeError, ValueError):
        return default


# Crawl targets.
PLACE_IDS = parse_list(os.getenv("PLACE_IDS", ""))
BIZ_ID = parse_list(os.getenv("BIZ_ID", ""))
# A blank or malformed MAX_REVIEWS falls back to 100 instead of failing the import.
MAX_REVIEWS = parse_int(os.getenv("MAX_REVIEWS"), 100)

# Naver login account.
NAVER_ID = os.getenv("NAVER_ID", "")
NAVER_PW = os.getenv("NAVER_PW", "")

# The environment variable is singular (MESSAGE_PLATFORM) but holds a list.
MESSAGE_PLATFORMS = parse_list(os.getenv("MESSAGE_PLATFORM", ""))

MATTERMOST_URL = os.getenv("MATTERMOST_URL", "")
MATTERMOST_WEBHOOK_URL = os.getenv("MATTERMOST_WEBHOOK_URL", "")
MATTERMOST_CHANNEL_ID = os.getenv("MATTERMOST_CHANNEL_ID", "")
MATTERMOST_BOT_TOKEN = os.getenv("MATTERMOST_BOT_TOKEN", "")

# The mixed-case name is kept on purpose: it matches the key in .env_sample.
SYNology_CHAT_WEBHOOK_URL = os.getenv("SYNology_CHAT_WEBHOOK_URL", "")

TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "")
TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "")

HEADLESS = parse_bool(os.getenv("HEADLESS", "True"))
DEBUG = parse_bool(os.getenv("DEBUG", "False"))

# Date filter bounds, kept as "YYYY-MM-DD" strings.
START_DATE = os.getenv("START_DATE", "2025-07-01")
END_DATE = os.getenv("END_DATE", "2025-07-10")

# `or` rather than a getenv default, so an empty COOKIE_FILE= also falls back
# to <project>/data/naver_cookies.pkl instead of yielding the path "".
COOKIE_FILE = os.getenv("COOKIE_FILE") or os.path.join(
    os.path.dirname(__file__), '..', 'data', 'naver_cookies.pkl'
)
"""Collect Naver Smartplace (business page) reviews by written date and report them."""
import os
import pickle
import re
import sys
import time
from datetime import datetime, timedelta
from html import unescape

import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Make the project root importable when this file is run directly as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from conf.config import (
    HEADLESS, BIZ_ID, NAVER_ID, NAVER_PW,
    START_DATE as CFG_START, END_DATE as CFG_END,
    COOKIE_FILE, DEBUG,
    MESSAGE_PLATFORMS, MATTERMOST_URL, MATTERMOST_BOT_TOKEN, MATTERMOST_CHANNEL_ID,
    SYNology_CHAT_WEBHOOK_URL, TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID,
)

from lib.send_message import MessageSender

REVIEW_URL = "https://new.smartplace.naver.com/bizes/place/{biz_id}/reviews"
REVIEW_LIST_XPATH = "//ul[starts-with(@class, 'Review_columns_list')]"

# NOTE(review): the committed patterns were r'' and r'.*?<\/span>'. The tag
# portion appears to have been lost; r'' matches the empty string, so the
# first substitution put a newline between every character. They are
# reconstructed here as "<br> becomes a newline" and "drop <span>...</span>
# blocks" -- confirm against the live review markup.
_BR_TAG_RE = re.compile(r'<br\s*/?>', re.IGNORECASE)
_SPAN_BLOCK_RE = re.compile(r'<span\b[^>]*>.*?</span>', re.DOTALL)
_ANY_TAG_RE = re.compile(r'<.*?>')


def get_start_end_dates(today=None):
    """Return the inclusive ``(start, end)`` written-date window as ``date`` objects.

    In DEBUG mode the window is taken from the configured START_DATE and
    END_DATE. Otherwise it is yesterday, except on Mondays, when it covers
    Friday through Sunday.

    :param today: optional ``date``/``datetime`` used instead of the current
        day, so the weekday rule can be exercised deterministically.
    :raises ValueError: in DEBUG mode, when a configured date is not YYYY-MM-DD.
    """
    if DEBUG:
        start = datetime.strptime(CFG_START, "%Y-%m-%d").date()
        end = datetime.strptime(CFG_END, "%Y-%m-%d").date()
        return start, end

    if today is None:
        today = datetime.today()
    if isinstance(today, datetime):
        today = today.date()

    yesterday = today - timedelta(days=1)
    if today.weekday() == 0:  # Monday: cover Friday..Sunday
        return today - timedelta(days=3), yesterday
    return yesterday, yesterday


def _parse_naver_date(text):
    """Convert text such as ``2025. 7. 3(목)`` to a ``date``; raise ValueError otherwise."""
    normalized = text.split("(")[0].replace(". ", "-").replace(".", "").strip()
    return datetime.strptime(normalized, "%Y-%m-%d").date()


def _clean_review_html(inner_html):
    """Reduce a review body's innerHTML to plain text with line breaks preserved."""
    text = _BR_TAG_RE.sub('\n', inner_html)
    text = _SPAN_BLOCK_RE.sub('', text)
    text = _ANY_TAG_RE.sub('', text)
    # innerHTML carries entities such as &amp;; decode them for the chat message.
    return unescape(text).strip()


class NaverReviewCollector:
    """Log in to Naver Smartplace, collect reviews per business and send a summary.

    ``run()`` is the entry point. When a login is required, the current browser
    is closed and a fresh collector is started (non-headless, so a CAPTCHA can
    be solved by hand).
    """

    # Upper bound on chained restarts, so a bad password or an endlessly
    # rejected session cannot trigger an unbounded login loop.
    MAX_RELAUNCHES = 5

    def __init__(self, headless=HEADLESS, _relaunches=0):
        """
        :param headless: run Chrome without a visible window.
        :param _relaunches: internal restart counter; callers leave the default.
        """
        self.headless = headless
        self.driver = None
        self.total_reviews = 0
        self.start_date, self.end_date = get_start_end_dates()
        self.reviews_by_place = {}
        self._relaunches = _relaunches

    def create_driver(self):
        """Start Chrome with a phone-sized window and an iPhone user agent."""
        options = uc.ChromeOptions()
        options.add_argument('--window-size=375,812')
        if self.headless:
            options.add_argument("--headless=new")
            options.add_argument("--disable-gpu")

        options.add_argument("--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) "
                             "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15A372 Safari/604.1")
        self.driver = uc.Chrome(options=options)
        self.driver.set_window_size(375, 812)

    def _quit_driver(self):
        """Close the browser if it is open; safe to call more than once."""
        if self.driver is None:
            return
        try:
            self.driver.quit()
        except Exception as exc:
            print(f"[WARN] 드라이버 종료 중 오류: {exc}")
        finally:
            self.driver = None

    def save_cookies(self):
        """Pickle the current browser cookies to COOKIE_FILE."""
        cookies = self.driver.get_cookies()
        for c in cookies:
            c.pop("sameSite", None)
            # NOTE(review): Selenium's add_cookie documents the key "expiry";
            # after this rename the expiry is presumably ignored on reload and
            # the cookie becomes a session cookie. Kept as committed -- confirm intent.
            if "expiry" in c:
                c["expires"] = c.pop("expiry")

        # The data/ directory is not part of the repository; create it on demand.
        cookie_dir = os.path.dirname(COOKIE_FILE)
        if cookie_dir:
            os.makedirs(cookie_dir, exist_ok=True)
        with open(COOKIE_FILE, "wb") as f:
            pickle.dump(cookies, f)

    def load_cookies(self):
        """Add every cookie stored in COOKIE_FILE to the current browser session.

        Errors (unreadable file, cookie rejected by the browser) propagate to
        the caller, which treats them as "log in again".
        """
        # SECURITY: pickle.load executes arbitrary code from the file. This is
        # only acceptable because the file is written by save_cookies() above;
        # never point COOKIE_FILE at untrusted data.
        with open(COOKIE_FILE, "rb") as f:
            cookies = pickle.load(f)
        for cookie in cookies:
            self.driver.add_cookie(cookie)

    def _discard_cookie_file(self):
        """Delete the stored cookies; a missing file is not an error."""
        try:
            os.remove(COOKIE_FILE)
        except FileNotFoundError:
            pass

    def perform_login(self):
        """Log in through the Smartplace review page and store the cookies.

        :return: True when the credentials were submitted and cookies saved;
            False when the login form could not be used (the browser is
            closed in that case).
        """
        if not BIZ_ID:
            # The login is reached via the first business page; without one
            # BIZ_ID[0] would raise IndexError.
            print("[ERROR] BIZ_ID가 설정되지 않았습니다.")
            self._quit_driver()
            return False

        wait = WebDriverWait(self.driver, 20)
        self.driver.get(REVIEW_URL.format(biz_id=BIZ_ID[0]))
        time.sleep(2)

        try:
            # Best effort: dismiss the modal if one is shown.
            modal = wait.until(EC.presence_of_element_located((By.ID, "modal-root")))
            modal.find_element(By.XPATH, './/button').click()
        except Exception:
            pass

        try:
            wait.until(EC.presence_of_element_located((By.ID, 'id'))).send_keys(NAVER_ID)
            self.driver.find_element(By.ID, 'pw').send_keys(NAVER_PW)
            self.driver.find_element(By.XPATH, '//button[@type="submit"]').click()
        except Exception as exc:
            print(f"[ERROR] 로그인 폼 입력 실패: {exc}")
            self._quit_driver()
            return False

        time.sleep(3)
        if "captcha" in self.driver.page_source.lower():
            input("CAPTCHA 수동 입력 후 Enter: ")

        self.save_cookies()
        return True

    def is_login_required(self):
        """Return True when the current page shows the login-required notice."""
        return "로그인이 필요한 기능" in self.driver.page_source

    def access_review_page(self, biz_id):
        """Open the review page of *biz_id* and return the place name shown in the header.

        Returns ``"알수없음"`` when the header element does not appear within 10 seconds.
        """
        self.driver.get(REVIEW_URL.format(biz_id=biz_id))
        time.sleep(2)
        try:
            el = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[starts-with(@class, "Header_btn_select_")]'))
            )
            return el.text.strip()
        except Exception:
            return "알수없음"

    def extract_reviews(self):
        """Return the reviews on the current page whose written date is inside the window.

        Each review is a dict with the keys ``작성자`` (author), ``방문일`` (visit
        date as a YYYY-MM-DD string), ``작성일`` (written date as a ``date``) and
        ``내용`` (plain-text body). Items that cannot be parsed are skipped; an
        empty list is returned when the review list never appears.
        """
        try:
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, REVIEW_LIST_XPATH))
            )
            items = self.driver.find_elements(By.XPATH, f"{REVIEW_LIST_XPATH}/li")
        except Exception as exc:
            print(f"[WARN] 리뷰 목록을 찾지 못했습니다: {type(exc).__name__}")
            return []

        reviews = []
        for li in items:
            try:
                review = self._parse_review_item(li)
            except Exception:
                # Layout differences or stale elements: skip this item only.
                continue
            if review is not None:
                reviews.append(review)
        return reviews

    def _parse_review_item(self, li):
        """Build the review dict for one ``<li>``, or None when it must be skipped."""
        if "Review_banner__" in (li.get_attribute("class") or ""):
            return None

        author = li.find_element(By.XPATH, ".//div[1]/a[2]/div/span/span").text.strip()
        visit_text = li.find_element(By.XPATH, ".//div[2]/div[1]/span[2]/time").text.strip()
        visit_date = _parse_naver_date(visit_text).strftime("%Y-%m-%d")

        spans = li.find_elements(By.XPATH, ".//div[2]/div[2]/span")
        labels = [s.text.strip() for s in spans]
        written_text = None
        if "작성일" in labels:
            # The <time> element sits in the span that follows the label span.
            idx = labels.index("작성일")
            written_text = spans[idx + 1].find_element(By.TAG_NAME, "time").text.strip()
        elif "예약자" in labels:
            written_text = li.find_element(By.XPATH, ".//div[3]/div[1]/span[2]/time").text.strip()
        if not written_text:
            return None

        written_date = _parse_naver_date(written_text)
        if not (self.start_date <= written_date <= self.end_date):
            return None

        # The body link is found in one of div[4]..div[6]; take the first with text.
        content_el = None
        for i in range(4, 7):
            try:
                el = li.find_element(By.XPATH, f"./div[{i}]/a")
                if el and el.text.strip():
                    content_el = el
                    break
            except Exception:
                continue
        if content_el is None:
            return None

        return {
            "작성자": author,
            "방문일": visit_date,
            "작성일": written_date,
            "내용": _clean_review_html(content_el.get_attribute("innerHTML")),
        }

    @staticmethod
    def _build_sender():
        """Create a MessageSender carrying the credentials of every supported platform."""
        return MessageSender(
            mattermost_url=MATTERMOST_URL,
            mattermost_token=MATTERMOST_BOT_TOKEN,
            mattermost_channel_id=MATTERMOST_CHANNEL_ID,
            synology_webhook_url=SYNology_CHAT_WEBHOOK_URL,
            telegram_bot_token=TELEGRAM_BOT_TOKEN,
            telegram_chat_id=TELEGRAM_CHAT_ID,
        )

    def _build_message(self):
        """Render the collected reviews as the Markdown summary text."""
        today_str = datetime.today().strftime("%Y년 %m월 %d일")
        now_str = datetime.now().strftime("%H:%M:%S")
        lines = [f"##### {today_str} 네이버 리뷰 크롤링 결과", ""]
        lines.append(f"**수집 시간 :** {now_str}")
        lines.append(f"**총 리뷰 수 :** {self.total_reviews}")
        lines.append("")

        # Per-place counts first, then the details.
        for place_name, reviews in self.reviews_by_place.items():
            lines.append(f"- {place_name}: {len(reviews)}건 ")
        lines.append("")
        lines.append("---")
        lines.append("")

        for idx, (place_name, reviews) in enumerate(self.reviews_by_place.items(), start=1):
            lines.append(f"**{idx}. {place_name}** ")
            lines.append("")

            if not reviews:
                lines.append("- 리뷰 없음")
                lines.append("")
            else:
                for r in reviews:
                    lines.append(f"- **작성일** : {r['작성일']} ")
                    lines.append(f" **방문일** : {r['방문일']} ")
                    lines.append(f" **작성자** : {r['작성자']} ")
                    lines.append(f" **내용** : {r['내용']}")
                    lines.append("")

            lines.append("---")
            lines.append("")

        return "\n".join(lines)

    def send_to_message(self):
        """Send the review summary to every configured platform.

        Nothing is sent when no platform is configured or when DEBUG is on;
        the message is printed instead.
        """
        message = self._build_message()

        if not MESSAGE_PLATFORMS:
            print("[WARN] 메시지 전송 플랫폼이 지정되지 않아 메시지를 발송하지 않습니다.")
            print(f"[DEBUG] {message}")
            return

        if DEBUG:
            print(f"[DEBUG] message platform : {MESSAGE_PLATFORMS}")
            print("[DEBUG] 디버그 모드에서는 메시지를 발송하지 않습니다.")
            print(f"[DEBUG] {message}")
            return

        self._build_sender().send(message, platforms=MESSAGE_PLATFORMS, use_webhook=False)

    def _send_failure_notice(self):
        """Report that no place could be processed at all."""
        message = "# ❌ 리뷰 수집 실패: 플레이스 접근 또는 파싱 오류"
        if DEBUG or not MESSAGE_PLATFORMS:
            print(f"[DEBUG] {message}")
            return
        # MessageSender.send takes `platforms` (a list); the committed call
        # used the non-existent keywords `platform=` and `mattermost_bot_token=`.
        self._build_sender().send(message, platforms=MESSAGE_PLATFORMS, use_webhook=False)

    def _relaunch(self, headless):
        """Close this browser and continue the job in a fresh collector."""
        self._quit_driver()
        if self._relaunches >= self.MAX_RELAUNCHES:
            print("[ERROR] 재시작 횟수 초과로 수집을 중단합니다.")
            return
        NaverReviewCollector(headless=headless, _relaunches=self._relaunches + 1).run()

    def _collect_all(self):
        """Restore the session and collect reviews for every configured business.

        :return: True when this instance finished the collection loop; False
            when the work was handed to a relaunched collector or login failed.
        """
        self.driver.get("https://naver.com")
        time.sleep(1)

        if not os.path.exists(COOKIE_FILE):
            if self.headless:
                # A first login may need a manual CAPTCHA, so it needs a window.
                self._relaunch(headless=False)
                return False
            if self.perform_login():
                self._relaunch(headless=self.headless)
            return False

        try:
            self.load_cookies()
            self.driver.get("https://naver.com")
            time.sleep(1)
        except Exception as exc:
            print(f"[WARN] 쿠키 로드 실패. 로그인 재진행: {exc}")
            self._discard_cookie_file()
            self._relaunch(headless=False)
            return False

        for biz_id in BIZ_ID:
            place_name = self.access_review_page(biz_id)
            print(f"\n=== [{place_name}({biz_id})] 리뷰 수집 시작 ===")

            if self.is_login_required():
                print("[WARN] 세션 만료 또는 쿠키 무효. 로그인 재진행")
                self._discard_cookie_file()
                self._relaunch(headless=False)
                return False

            # Two places can resolve to the same name (for example both
            # "알수없음"); keep both instead of overwriting the first result.
            key = place_name
            if key in self.reviews_by_place:
                key = f"{place_name}({biz_id})"

            try:
                reviews = self.extract_reviews()
                print(f"[RESULT] 리뷰 {len(reviews)}개 수집됨")
                self.total_reviews += len(reviews)
                self.reviews_by_place[key] = reviews
            except Exception as e:
                print(f"[ERROR] {biz_id} 처리 중 오류:", e)
                self.reviews_by_place[key] = []

        return True

    def run(self):
        """Collect reviews for every BIZ_ID and send the report.

        The browser is always closed, also when an unexpected error escapes.
        """
        try:
            self.create_driver()
            finished = self._collect_all()
        finally:
            self._quit_driver()

        if not finished:
            return

        # An empty dict means no place was processed at all.
        if not self.reviews_by_place:
            self._send_failure_notice()
        else:
            self.send_to_message()


if __name__ == "__main__":
    collector = NaverReviewCollector(headless=HEADLESS)
    collector.run()
class MessageSender:
    """Deliver a text message to Mattermost, Synology Chat and/or Telegram.

    Every ``_send_to_*`` method reports problems on stdout and returns a
    boolean instead of raising, so one failing platform does not stop the
    others.
    """

    # Seconds before an HTTP request is abandoned; without a timeout
    # requests.post can block forever on an unresponsive server.
    REQUEST_TIMEOUT = 10
    # Telegram's sendMessage accepts at most 4096 characters per message.
    TELEGRAM_MAX_LENGTH = 4096
    # .env_sample documents the platform as "synology_chat"; accept both spellings.
    _PLATFORM_ALIASES = {"synology_chat": "synology"}

    def __init__(self,
                 mattermost_url: str = "", mattermost_token: str = "", mattermost_channel_id: str = "",
                 synology_webhook_url: str = "",
                 telegram_bot_token: str = "", telegram_chat_id: str = "",
                 mattermost_webhook_url: str = ""):
        """
        :param mattermost_url: Mattermost base URL (a trailing '/' is removed)
        :param mattermost_token: bot token used for the posts API
        :param mattermost_channel_id: channel that receives API posts
        :param synology_webhook_url: Synology Chat incoming-webhook URL
        :param telegram_bot_token: Telegram bot token
        :param telegram_chat_id: Telegram chat that receives the message
        :param mattermost_webhook_url: incoming-webhook URL, or a path that is
            appended to ``mattermost_url``; when empty, webhook mode posts to
            ``mattermost_url`` itself
        """
        self.mattermost_url = mattermost_url.rstrip('/')
        self.mattermost_token = mattermost_token
        self.mattermost_channel_id = mattermost_channel_id
        self.mattermost_webhook_url = mattermost_webhook_url
        self.synology_webhook_url = synology_webhook_url
        self.telegram_bot_token = telegram_bot_token
        self.telegram_chat_id = telegram_chat_id

    @classmethod
    def _normalize_platform(cls, platform) -> str:
        """Lower-case and trim a platform name and resolve documented aliases."""
        name = str(platform).strip().lower()
        return cls._PLATFORM_ALIASES.get(name, name)

    @staticmethod
    def _split_message(text: str, limit: int):
        """Split *text* into chunks of at most *limit* characters.

        Breaks are made at line boundaries where possible; a single line that
        is longer than *limit* is cut hard. Joining the chunks gives *text* back.
        """
        if len(text) <= limit:
            return [text]

        chunks = []
        current = ""
        for line in text.splitlines(keepends=True):
            while len(line) > limit:
                if current:
                    chunks.append(current)
                    current = ""
                chunks.append(line[:limit])
                line = line[limit:]
            if len(current) + len(line) > limit:
                chunks.append(current)
                current = ""
            current += line
        if current:
            chunks.append(current)
        return chunks

    def _mattermost_webhook_target(self) -> str:
        """Return the URL used when posting through a Mattermost incoming webhook."""
        hook = self.mattermost_webhook_url
        if not hook:
            return self.mattermost_url
        if hook.startswith(("http://", "https://")):
            return hook
        return f"{self.mattermost_url}/{hook.lstrip('/')}"

    def _post(self, url: str, **kwargs):
        """POST with the class-wide timeout unless the caller supplies one."""
        kwargs.setdefault("timeout", self.REQUEST_TIMEOUT)
        return requests.post(url, **kwargs)

    def send(self, message: str, platforms=None, use_webhook: bool = False):
        """Send *message* to every platform in *platforms*.

        :param message: text to send
        :param platforms: platform name or list of names
            (``mattermost``, ``synology`` / ``synology_chat``, ``telegram``)
        :param use_webhook: post through the Mattermost incoming webhook
            instead of the posts API (Mattermost only)
        :return: True only when every requested platform accepted the message;
            False when no platform is given, a name is unknown, or a send fails
        """
        if not platforms:
            print("[WARN] 전송할 플랫폼이 지정되지 않았습니다. 메시지를 보내지 않습니다.")
            return False

        if isinstance(platforms, str):
            platforms = [platforms]

        handlers = {
            "mattermost": lambda: self._send_to_mattermost(message, use_webhook),
            "synology": lambda: self._send_to_synology_chat(message),
            "telegram": lambda: self._send_to_telegram(message),
        }

        success = True
        for platform in platforms:
            p = self._normalize_platform(platform)
            handler = handlers.get(p)
            if handler is None:
                print(f"[ERROR] 지원하지 않는 플랫폼입니다: {p}")
                success = False
            elif not handler():
                success = False
        return success

    def _send_to_mattermost(self, message: str, use_webhook: bool):
        """Post to Mattermost via webhook or the v4 posts API; return True on HTTP 200/201."""
        try:
            if use_webhook:
                response = self._post(
                    self._mattermost_webhook_target(),
                    json={"text": message},
                    headers={"Content-Type": "application/json"}
                )
            else:
                url = f"{self.mattermost_url}/api/v4/posts"
                headers = {
                    "Authorization": f"Bearer {self.mattermost_token}",
                    "Content-Type": "application/json"
                }
                payload = {
                    "channel_id": self.mattermost_channel_id,
                    "message": message
                }
                response = self._post(url, json=payload, headers=headers)

            if response.status_code not in [200, 201]:
                print(f"[ERROR] Mattermost 전송 실패: {response.status_code} {response.text}")
                return False

            print("[INFO] Mattermost 메시지 전송 완료")
            return True

        except Exception as e:
            print(f"[ERROR] Mattermost 전송 예외: {e}")
            return False

    def _send_to_synology_chat(self, message: str):
        """Post to a Synology Chat incoming webhook; return True when it is accepted."""
        import json  # local import: the file's top-level imports are outside this block

        try:
            # Synology Chat incoming webhooks take a form field named
            # "payload" that holds the JSON document, not a raw JSON body.
            response = self._post(
                self.synology_webhook_url,
                data={"payload": json.dumps({"text": message})},
            )

            if response.status_code != 200:
                print(f"[ERROR] Synology Chat 전송 실패: {response.status_code} {response.text}")
                return False

            # A rejected payload is presumably still answered with HTTP 200
            # and {"success": false}, so inspect the body when it is JSON.
            try:
                body = response.json()
            except ValueError:
                body = None
            if isinstance(body, dict) and body.get("success") is False:
                print(f"[ERROR] Synology Chat 전송 실패: {response.status_code} {response.text}")
                return False

            print("[INFO] Synology Chat 메시지 전송 완료")
            return True

        except Exception as e:
            print(f"[ERROR] Synology Chat 전송 예외: {e}")
            return False

    def _send_to_telegram(self, message: str):
        """Send to Telegram in chunks of at most 4096 characters; return True when all are accepted."""
        if not self.telegram_bot_token or not self.telegram_chat_id:
            print("[ERROR] Telegram 설정(TELEGRAM_BOT_TOKEN/TELEGRAM_CHAT_ID)이 비어 있습니다.")
            return False
        if not message.strip():
            print("[ERROR] Telegram 전송 실패: 빈 메시지")
            return False

        try:
            url = f"https://api.telegram.org/bot{self.telegram_bot_token}/sendMessage"
            for chunk in self._split_message(message, self.TELEGRAM_MAX_LENGTH):
                if not chunk.strip():
                    # Telegram rejects whitespace-only text.
                    continue
                payload = {
                    "chat_id": self.telegram_chat_id,
                    "text": chunk,
                    "parse_mode": "Markdown"
                }
                response = self._post(url, data=payload)

                if response.status_code == 400:
                    # Review text with an unbalanced '*' or '_' is not valid
                    # Markdown for Telegram; retry the chunk as plain text.
                    payload.pop("parse_mode")
                    response = self._post(url, data=payload)

                if response.status_code != 200:
                    print(f"[ERROR] Telegram 전송 실패: {response.status_code} {response.text}")
                    return False

            print("[INFO] Telegram 메시지 전송 완료")
            return True

        except Exception as e:
            print(f"[ERROR] Telegram 전송 예외: {e}")
            return False