From 57ad944ccbd06e3636ef0c0c7267f5319d9ba30a Mon Sep 17 00:00:00 2001 From: KWON Date: Thu, 3 Jul 2025 11:42:21 +0900 Subject: [PATCH] relese --- naver_review/main.py | 166 ++++++++++++++++++++++++++++--------------- 1 file changed, 109 insertions(+), 57 deletions(-) diff --git a/naver_review/main.py b/naver_review/main.py index d10b0b1..cf8ceec 100644 --- a/naver_review/main.py +++ b/naver_review/main.py @@ -4,115 +4,167 @@ from selenium.webdriver.chrome.options import Options from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from datetime import datetime -import config +import time +import config # 사용자 설정 파일 +# ─────────────────────────────────────────────────────── +# ✅ WebDriver 설정 (모바일 User-Agent 포함, 헤드리스 옵션 가능) +# ─────────────────────────────────────────────────────── def setup_driver(): chrome_options = Options() - chrome_options.add_argument("--headless=new") + chrome_options.add_argument("--headless=new") # 필요 시 주석 해제 chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-dev-shm-usage") chrome_options.add_argument( "--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 13_5 like Mac OS X) " "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Mobile/15E148 Safari/604.1" ) + print("[INFO] Chrome WebDriver 실행 중...") return webdriver.Chrome(options=chrome_options) -def click_more(driver): +# ─────────────────────────────────────────────────────── +# ✅ 날짜 문자열 파싱 함수 (예: '2025년 6월 8일 일요일') +# ─────────────────────────────────────────────────────── +def parse_korean_date(date_str): try: - wait = WebDriverWait(driver, 5) - more_btn = wait.until(EC.element_to_be_clickable(( - By.XPATH, '//div[contains(@class, "place_section_content")]/following-sibling::div//a[.//span[text()="더보기"]]'))) - driver.execute_script("arguments[0].click();", more_btn) - print("[INFO] '더보기' 버튼 클릭됨") - except Exception: - print("[INFO] '더보기' 버튼 없음 또는 클릭 실패") - -def parse_date(date_str): - try: - return datetime.strptime(date_str.replace(".", "-"), "%Y-%m-%d") - except Exception: + # '2025년 6월 8일 일요일' → '2025년 6월 8일' + date_clean = " ".join(date_str.strip().split(" ")[:3]) + return datetime.strptime(date_clean, "%Y년 %m월 %d일").date() + except Exception as e: + print(f"[WARN] 날짜 파싱 실패: {date_str} ({e})") return None -def is_within_range(date_str, start, end): - d = parse_date(date_str) - if not d: - return False - return start <= d <= end - -def get_place_name(driver): +# ─────────────────────────────────────────────────────── +# ✅ "더보기" 버튼 클릭 함수 (한 번만 클릭) +# ─────────────────────────────────────────────────────── +def click_more(driver): try: - title_div = driver.find_element(By.ID, "_title") - name = title_div.find_element(By.XPATH, './div[1]/span').text.strip() + container = driver.find_element(By.CLASS_NAME, "place_section_content") + more_div = container.find_element(By.XPATH, "./following-sibling::div[1]") + more_btn = more_div.find_element(By.TAG_NAME, "a") + driver.execute_script("arguments[0].click();", more_btn) + print("[INFO] 더보기 클릭") + time.sleep(2) + return True + except: + print("[INFO] 더보기 없음") + return False + +# ─────────────────────────────────────────────────────── +# ✅ 업체명 추출 함수 (페이지 상단의 "_title" ID 활용) +# ─────────────────────────────────────────────────────── +def extract_shop_name(driver): + try: + main = driver.find_element(By.CSS_SELECTOR, 'div[role="main"]') + title = main.find_element(By.ID, "_title") + name = title.find_element(By.TAG_NAME, "span").text.strip() return name except Exception as e: print(f"[WARN] 업체명 추출 실패: {e}") return "업체명 없음" -def crawl_naver_place_reviews(place_id, max_reviews, start_date, end_date): - url = f"https://m.place.naver.com/place/{place_id}/review/visitor?reviewSort=recent" - driver = setup_driver() - driver.get(url) - +# ─────────────────────────────────────────────────────── +# ✅ 리뷰 추출 함수: 작성자 / 날짜 / 본문 +# ─────────────────────────────────────────────────────── +def extract_reviews(driver): wait = WebDriverWait(driver, 10) wait.until(EC.presence_of_element_located((By.ID, "_review_list"))) - place_name = get_place_name(driver) - click_more(driver) - ul = driver.find_element(By.ID, "_review_list") items = ul.find_elements(By.XPATH, './/li[contains(@class, "place_apply_pui")]') - reviews = [] + for item in items: try: - # 작성자 + # ① 작성자: ./div[1]/a[2]/div/span/span writer = "익명" try: - writer = item.find_element(By.XPATH, './div[1]/a[2]/div/span/span').text.strip() + writer = item.find_element(By.XPATH, "./div[1]/a[2]/div/span/span").text.strip() except: pass - # 작성일 - date = "" + # ② 날짜: ./div[7]/div[2]/div/span[1]/span[2] + date = "날짜 없음" + date_obj = None try: - date_elem = item.find_element(By.XPATH, './div[7]/div[2]/div/span[1]/span[2]') - date = date_elem.text.strip().replace(" ", "").replace("년", "-").replace("월", "-").replace("일", "") + date_text = item.find_element(By.XPATH, "./div[7]/div[2]/div/span[1]/span[2]").text.strip() + date = date_text + date_obj = parse_korean_date(date_text) except: - continue + pass - if not is_within_range(date, start_date, end_date): - continue - - # 리뷰 본문 + # ③ 본문: ./div[5]/a text = "" try: - text = item.find_element(By.XPATH, './div[5]/a').text.strip() + text = item.find_element(By.XPATH, "./div[5]/a").text.strip() except: - continue + pass if text: reviews.append({ "writer": writer, "date": date, + "date_obj": date_obj, "text": text }) - - if len(reviews) >= max_reviews: - break except Exception as e: - print(f"[WARN] 리뷰 파싱 실패: {e}") + print(f"[WARN] 리뷰 추출 실패: {e}") + + return reviews + +# ─────────────────────────────────────────────────────── +# ✅ 특정 기간 내 리뷰 수집 함수 (날짜 필터 + 더보기 반복) +# ─────────────────────────────────────────────────────── +def crawl_reviews_within_range(place_id, start_date, end_date): + url = f"https://m.place.naver.com/place/{place_id}/review/visitor?reviewSort=recent" + driver = setup_driver() + print(f"[INFO] 리뷰 페이지 접속: {url}") + driver.get(url) + + shop_name = extract_shop_name(driver) + all_reviews = [] + seen = set() + + while True: + new_reviews = extract_reviews(driver) + if not new_reviews: + break + + filtered = [] + for r in new_reviews: + if r["date_obj"] is None: + continue + if start_date <= r["date_obj"] <= end_date: + key = (r["writer"], r["date_obj"], r["text"]) + if key not in seen: + seen.add(key) + filtered.append(r) + + if not filtered: + print("[INFO] 범위 내 리뷰 없음 → 수집 종료") + break + + all_reviews.extend(filtered) + + # 더 클릭할 필요가 없으면 종료 + if not click_more(driver): + break driver.quit() - print(f"[DONE] {place_name} ({place_id}) → 총 {len(reviews)}개 리뷰 수집") - return place_name, reviews + print(f"[DONE] [{shop_name}] {len(all_reviews)}개 리뷰 수집 완료") + return shop_name, all_reviews +# ─────────────────────────────────────────────────────── +# ✅ 메인 실행부 +# ─────────────────────────────────────────────────────── if __name__ == "__main__": - start_dt = datetime.strptime(config.START_DATE, "%Y-%m-%d") - end_dt = datetime.strptime(config.END_DATE, "%Y-%m-%d") + start_date = datetime.strptime(config.START_DATE, "%Y-%m-%d").date() + end_date = datetime.strptime(config.END_DATE, "%Y-%m-%d").date() for place_id in config.PLACE_IDS: - place_name, result = crawl_naver_place_reviews(place_id, config.MAX_REVIEWS, start_dt, end_dt) - print(f"\n==== [업체명: {place_name}] ====") - for i, r in enumerate(result, 1): + shop, reviews = crawl_reviews_within_range(place_id, start_date, end_date) + + print(f"\n==== {shop} ({place_id}) 리뷰 목록 ====") + for i, r in enumerate(reviews, 1): print(f"{i}. 작성자: {r['writer']}, 날짜: {r['date']}") print(f" 내용: {r['text']}\n")