# Listing metadata: 181 lines, 6.1 KiB, Python
# naver_review_crawler.py
|
|
import os, sys
|
|
from datetime import datetime
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
|
|
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
|
|
|
from conf.config import (
|
|
PLACE_IDS, START_DATE, END_DATE, DEBUG,
|
|
MESSAGE_PLATFORMS, MATTERMOST_URL, MATTERMOST_BOT_TOKEN, MATTERMOST_CHANNEL_ID
|
|
)
|
|
from lib.send_message import MessageSender
|
|
from lib.common import (
|
|
create_mobile_driver,
|
|
get_start_end_dates,
|
|
parse_korean_date,
|
|
click_more,
|
|
extract_shop_name,
|
|
clean_html_text,
|
|
send_failure_message
|
|
)
|
|
|
|
|
|
def debug(msg):
    """Print ``msg`` prefixed with ``[DEBUG]`` when the DEBUG flag is truthy.

    Does nothing (and returns ``None``) when DEBUG is falsy.
    """
    if not DEBUG:
        return
    print(f"[DEBUG] {msg}")
|
|
|
|
|
|
class NaverMapReviewCollector:
    """Crawl Naver Map visitor reviews for the configured places and report them.

    For every id in ``PLACE_IDS`` the collector opens the mobile review page,
    gathers the reviews whose date lies inside ``[start_date, end_date]`` and
    finally posts a summary through ``MessageSender`` (or a failure notice
    when nothing could be collected).
    """

    def __init__(self):
        # Selenium driver; created lazily in run().
        self.driver = None
        # Running count of reviews collected across all places.
        self.total_reviews = 0
        # Inclusive date window; presumably widened/overridden in DEBUG mode
        # by get_start_end_dates -- confirm against lib.common.
        self.start_date, self.end_date = get_start_end_dates(DEBUG, START_DATE, END_DATE)
        # Maps a place label (shop name) to its list of review dicts.
        self.reviews_by_place = {}

    def _parse_review_item(self, item):
        """Turn one review ``<li>`` element into a review dict.

        Returns ``None`` when the review has no readable date or content, or
        when its date is outside the configured window.  The XPaths are
        positional, so a page-layout change shows up here as skipped items;
        the debug messages make that visible.
        """
        writer = "익명"
        try:
            writer = item.find_element(By.XPATH, "./div[1]/a[2]/div/span/span").text.strip()
        except Exception:
            # Best effort: a missing writer element keeps the anonymous default.
            pass

        try:
            date_text = item.find_element(By.XPATH, "./div[7]/div[2]/div/span[1]/span[2]").text.strip()
            date_obj = parse_korean_date(date_text)
        except Exception as e:
            debug(f"[WARN] 작성일 추출 실패: {e}")
            return None

        if not (self.start_date <= date_obj <= self.end_date):
            return None

        try:
            text_html = item.find_element(By.XPATH, "./div[5]/a").get_attribute("innerHTML")
            content = clean_html_text(text_html)
        except Exception as e:
            debug(f"[WARN] 리뷰 내용 추출 실패: {e}")
            return None

        return {
            "작성자": writer,
            "작성일": date_obj,
            "내용": content,
        }

    def extract_reviews(self) -> list:
        """Return the in-window reviews currently rendered on the open page.

        Waits up to 10 seconds for the element with id ``_review_list``, then
        parses each ``li`` whose class contains ``place_apply_pui``.  Each
        result is a dict with the keys ``"작성자"``, ``"작성일"`` and ``"내용"``.
        Errors are reported through ``debug`` and never propagate; an
        unreachable list yields an empty list.
        """
        reviews = []
        try:
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "_review_list"))
            )
            ul = self.driver.find_element(By.ID, "_review_list")
            items = ul.find_elements(By.XPATH, './/li[contains(@class, "place_apply_pui")]')
        except Exception as e:
            debug(f"[ERROR] 리뷰 리스트 접근 실패: {e}")
            return reviews

        for item in items:
            try:
                review = self._parse_review_item(item)
            except Exception as e:
                # e.g. a stale element or an uncomparable date value.
                debug(f"[WARN] 리뷰 항목 처리 중 오류: {e}")
                continue
            if review is not None:
                reviews.append(review)
        return reviews

    def _build_message(self) -> str:
        """Render the collected reviews as the Markdown report text."""
        # One timestamp for both fields so date and time cannot disagree
        # when the report is built around midnight.
        now = datetime.now()
        today_str = now.strftime("%Y년 %m월 %d일")
        now_str = now.strftime("%H:%M:%S")

        lines = [
            f"##### {today_str} 네이버 지도 리뷰 크롤링 결과",
            "",
            f"**수집 시간 :** {now_str}",
            f"**총 리뷰 수 :** {self.total_reviews}",
            "",
        ]

        # Per-place summary counts.
        lines.extend(
            f"- {place_name}: {len(reviews)}건 "
            for place_name, reviews in self.reviews_by_place.items()
        )
        lines.append("\n---\n")

        # Per-place review details, each section closed by a rule.
        for idx, (place_name, reviews) in enumerate(self.reviews_by_place.items(), start=1):
            lines.append(f"**{idx}. {place_name}** ")
            lines.append("")
            if not reviews:
                lines.append("- 리뷰 없음\n")
            else:
                for r in reviews:
                    lines.append(f"- **작성일** : {r['작성일']} ")
                    lines.append(f" **작성자** : {r['작성자']} ")
                    lines.append(f" **내용** : {r['내용']}\n")
            lines.append("---")

        return "\n".join(lines)

    @staticmethod
    def _make_sender():
        """Create a ``MessageSender`` from the configured Mattermost settings."""
        return MessageSender(
            mattermost_url=MATTERMOST_URL,
            mattermost_token=MATTERMOST_BOT_TOKEN,
            mattermost_channel_id=MATTERMOST_CHANNEL_ID,
        )

    def send_to_message(self):
        """Send the report to ``MESSAGE_PLATFORMS``.

        With no platform configured a warning is printed and the report is
        only echoed through ``debug``.  In DEBUG mode the report is echoed
        instead of being sent.
        """
        message = self._build_message()

        if not MESSAGE_PLATFORMS:
            print("[WARN] 메시지 전송 플랫폼이 지정되지 않음")
            debug(message)
            return

        sender = self._make_sender()

        if DEBUG:
            debug("디버그 모드로 메시지 전송 생략")
            debug(message)
        else:
            sender.send(message, platforms=MESSAGE_PLATFORMS, use_webhook=False)

    def _collect_place(self, place_id):
        """Collect all in-window reviews of one place into ``reviews_by_place``."""
        url = f"https://m.place.naver.com/place/{place_id}/review/visitor?reviewSort=recent"
        print(f"[INFO] 접근: {url}")
        try:
            self.driver.get(url)
            shop_name = extract_shop_name(self.driver)
        except Exception as e:
            print(f"[ERROR] {place_id} 매장 접근 오류: {e}")
            return

        all_reviews = []
        seen = set()  # de-duplicate on (writer, date, content)

        while True:
            new_reviews = self.extract_reviews()
            if not new_reviews:
                break

            filtered = []
            for r in new_reviews:
                key = (r["작성자"], r["작성일"], r["내용"])
                if key not in seen:
                    seen.add(key)
                    filtered.append(r)

            # Nothing new means every rendered review was already collected.
            if not filtered:
                break

            all_reviews.extend(filtered)

            if not click_more(self.driver):
                break

        print(f"[DONE] {shop_name}: {len(all_reviews)}건 수집")
        self.total_reviews += len(all_reviews)

        # Two places can resolve to the same shop name; keep both entries so
        # the listing stays consistent with total_reviews instead of the
        # later place silently overwriting the earlier one.
        place_key = shop_name
        if place_key in self.reviews_by_place:
            place_key = f"{shop_name} ({place_id})"
        self.reviews_by_place[place_key] = all_reviews

    def run(self):
        """Crawl every configured place, then send the report or a failure notice."""
        self.driver = create_mobile_driver()
        try:
            for place_id in PLACE_IDS:
                self._collect_place(place_id)
        finally:
            # Always release the browser, even when crawling raised.
            self.driver.quit()

        if not self.reviews_by_place:
            send_failure_message(self._make_sender(), MESSAGE_PLATFORMS)
        else:
            self.send_to_message()
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build a collector and run one full crawl/report cycle.
    NaverMapReviewCollector().run()
|