From d10e7443eea3b05b5d0bde577be61c6162ca001a Mon Sep 17 00:00:00 2001 From: KWON Date: Fri, 4 Jul 2025 15:21:54 +0900 Subject: [PATCH 1/2] =?UTF-8?q?=EB=84=A4=EC=9D=B4=EB=B2=84=20=EB=A6=AC?= =?UTF-8?q?=EB=B7=B0=20=ED=81=AC=EB=A1=A4=EB=A7=81=20=EA=B8=B0=EB=8A=A5?= =?UTF-8?q?=EC=B6=94=EA=B0=80=20=EC=9E=91=EC=97=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- naver_cookies.pkl | Bin 0 -> 1227 bytes naver_review/biz_crawler.py | 255 +++++++++++++++++++++++++++++++++ naver_review/biz_run.py | 110 ++++++++++++++ naver_review/config.sample.py | 25 +++- naver_review/naver_cookies.pkl | Bin 0 -> 883 bytes 5 files changed, 388 insertions(+), 2 deletions(-) create mode 100644 naver_cookies.pkl create mode 100644 naver_review/biz_crawler.py create mode 100644 naver_review/biz_run.py create mode 100644 naver_review/naver_cookies.pkl diff --git a/naver_cookies.pkl b/naver_cookies.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5fe8ffb30beeadd023d07884944a6e8dc53d5ff0 GIT binary patch literal 1227 zcma)*+iK%h6oxYsCrv}=JqFywwvIM*pj#))ijJ0KS%#A8e6S=-mSo9>LhqV@dZ)fm z->WzEq|=tR6X@O-3)X`FTMPdG$H6bZ4lZ8acm4Yx`uE+nyGyMo>$}_g%sP1cduNyS z-L==|(f-(T-~Z-?-A!0kxs)Zx{<#}uR_gWLKwWC+x#U(A_T8IN-`$kwff`jGhvF_f zm+R7V>b=)Lbyo)~sn4C;364(C35=nWzCP2#xBtL~1MZtQei&Xohu7WJnE;Rb@Fx5+ zy!|8@e)SZ-`yVRsvW~;6r?bi>p_8A}_!`{BOIPNs0*bX{;^NGHfq3giI&Ngqr6jt3 z}gqr4+fgvD1I4d+xtc}3>_r=LHa{Drun9#vZXQ|gkp z+WzI{HT(AB0suf17@Jb=+T4Mm68Yw0M?;i4n=V86v{Lg89SBt994dvL8ylh^5VT05 zrRCH7$)S~W!MP(E=Lu+eaze;Cxi+de4x^wg?Ufvj*!X&r#mBh|Ri_kHOcwb2y-1PG z!a2@0ZEx$9o}6UFu=S=P0)oPGFB>Wt#$&7jFV$)8kLjpN8D~|eQoSZL(I6IBF%BF} zFI6X{JyMHIUB+i3f#G9`&D04=&SV#kBzeZ~?O9;x9tbmx3}n<4!j(Ndz(gzAYn5-& zBw&+WjzeT%W&$etk}gG5Y1$)2c-T%@2CnE3E=x+9wk%LU6CWBsj|~nK6HBWBQq?-m z&CRW2I?Z~{4+SctvCk?b$I!TEb;&z`F6I6_+^{ zZ*3dV@rc>f2#}gIWk>oS4mS-O zLuF$fGPulECx7cFJ4BJ}MVhl}QX|3;FK|6B9UEtue7BS4&cyIBv-$9vo$vk%&5I9# zuKm}C-M1zmY*vz27en>|IHnuNSQ7}=AuRwkX3Mdh!1S@L$2zIQ4|tTM_;eef{4p*$ jG3JW;l@Q$h9fB-!AEV6w7yS7g{F^%*I)4Q2Yw!MdU-g-- literal 0 HcmV?d00001 diff --git a/naver_review/biz_crawler.py b/naver_review/biz_crawler.py new file mode 100644 index 0000000..0617311 --- /dev/null +++ b/naver_review/biz_crawler.py @@ -0,0 +1,255 @@ +import undetected_chromedriver as uc +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.common.exceptions import NoSuchElementException, TimeoutException +import pickle +import os +import time +from datetime import datetime +import sys +from config import HEADLESS, BIZ_ID, NAVER_ID, NAVER_PW, START_DATE, END_DATE, COOKIE_FILE + + +def create_driver(headless=False): + options = uc.ChromeOptions() + options.add_argument('--window-size=375,812') + if headless: + options.add_argument("--headless=new") + options.add_argument("--disable-gpu") + + options.add_argument("--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15A372 Safari/604.1") + + driver = uc.Chrome(options=options) + + if not headless: + # 명시적으로 모바일 해상도로 강제 설정 + driver.set_window_size(375, 812) + + return driver + + +def save_cookies(driver): + cookies = driver.get_cookies() + for c in cookies: + c.pop("sameSite", None) + if "expiry" in c: + c["expires"] = c.pop("expiry") + with open(COOKIE_FILE, "wb") as f: + pickle.dump(cookies, f) + print(f"[INFO] 쿠키 저장 완료 ({len(cookies)}개)") + + +def load_cookies(driver): + with open(COOKIE_FILE, "rb") as f: + cookies = pickle.load(f) + for cookie in cookies: + driver.add_cookie(cookie) + print(f"[INFO] 쿠키 로드 완료 ({len(cookies)}개)") + + +def perform_login(): + driver = create_driver(headless=False) + wait = WebDriverWait(driver, 20) + print("[INFO] 로그인 시도 중...") + + driver.get(f"https://new.smartplace.naver.com/bizes/place/{BIZ_ID[0]}/reviews") + time.sleep(2) + + try: + modal = wait.until(EC.presence_of_element_located((By.ID, "modal-root"))) + login_btn = modal.find_element(By.XPATH, './/button') + login_btn.click() + print("[INFO] 로그인 버튼 클릭됨") + except: + print("[WARN] 로그인 모달이 감지되지 않음") + + try: + wait.until(EC.presence_of_element_located((By.ID, 'id'))).send_keys(NAVER_ID) + driver.find_element(By.ID, 'pw').send_keys(NAVER_PW) + driver.find_element(By.XPATH, '//button[@type="submit"]').click() + print("[INFO] 로그인 폼 제출 완료") + except Exception as e: + print("[ERROR] 로그인 페이지 구성 실패:", e) + driver.quit() + return False + + time.sleep(3) + if "captcha" in driver.page_source.lower() or "자동입력 방지문자" in driver.page_source: + print("\n⚠️ CAPTCHA 감지됨. 브라우저에서 수동 입력 후 Enter 키를 누르세요...") + input("✅ 완료되었으면 Enter를 누르세요.") + + time.sleep(3) + save_cookies(driver) + driver.quit() + return True + + +def is_login_required(driver): + return "로그인이 필요한 기능" in driver.page_source + + +def access_review_page(driver, biz_id): + driver.get(f"https://new.smartplace.naver.com/bizes/place/{biz_id}/reviews") + time.sleep(2) + +import re + +def extract_reviews(driver): + reviews = [] + try: + WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.XPATH, "//ul[starts-with(@class, 'Review_columns_list')]")) + ) + lis = driver.find_elements(By.XPATH, "//ul[starts-with(@class, 'Review_columns_list')]/li") + + for li in lis: + if "Review_banner__" in li.get_attribute("class"): + continue + + try: + # 작성자 + author = li.find_element(By.XPATH, ".//div[1]/a[2]/div/span/span").text.strip() + + # 방문일 + visit_date_text = li.find_element(By.XPATH, ".//div[2]/div[1]/span[2]/time").text.strip() + visit_date = datetime.strptime( + visit_date_text.split("(")[0].replace(". ", "-").replace(".", ""), + "%Y-%m-%d" + ).strftime("%Y-%m-%d") + + # 작성일, 예약자 구분 후 날짜 텍스트 추출 예시 + + # li > div[2] > div[2] 영역 + date_container = li.find_element(By.XPATH, ".//div[2]/div[2]") + + # span 태그들 텍스트 확인 + spans = date_container.find_elements(By.TAG_NAME, "span") + labels = [span.text.strip() for span in spans] + + # 기본값: 작성일 날짜 위치 + written_date_text = None + + if "작성일" in labels: + # 작성일이 span[2] 위치 + # 예: labels = ['작성일', '2025-07-03'] + idx = labels.index("작성일") + # 날짜는 그 다음 span 위치(예시가 span[2]이면 index=1, +1) + if idx + 1 < len(spans): + written_date_text = spans[idx + 1].find_element(By.TAG_NAME, "time").text.strip() + elif "예약자" in labels: + # 예약자인 경우 날짜가 div[3] 에 있을 수 있음 + try: + written_date_text = li.find_element(By.XPATH, ".//div[3]/div[1]/span[2]/time").text.strip() + except NoSuchElementException: + written_date_text = None + + if not written_date_text: + # 날짜 못 찾으면 건너뛰기 + continue + + written_date = datetime.strptime( + written_date_text.split("(")[0].replace(". ", "-").replace(".", ""), + "%Y-%m-%d" + ).strftime("%Y-%m-%d") + + + # 날짜 필터링 + if not (START_DATE <= written_date <= END_DATE): + continue + + # 이미지 유무 체크 + has_image = False + try: + li.find_element(By.XPATH, ".//div[starts-with(@class, 'Review_img_slide_wrap_')]") + has_image = True + except NoSuchElementException: + pass + + # 내용 추출 + if has_image: + content_el = li.find_element(By.XPATH, "./div[5]/a") + else: + content_el = li.find_element(By.XPATH, "./div[4]/a") + + # innerHTML 얻기 + html_content = content_el.get_attribute("innerHTML") + + #
태그를 줄바꿈으로 변환 + for br_tag in ["
", "
", "
", "
", "
", "
"]: + html_content = html_content.replace(br_tag, "\n") + + # span.pui__blind 제거 (html 태그 형태) + html_content = re.sub(r'.*?<\/span>', '', html_content, flags=re.DOTALL) + + # 남은 html 태그가 있을 수 있으니 간단히 제거 (예: 태그 포함될 경우) + html_content = re.sub(r'<.*?>', '', html_content) + + # 공백 양쪽 제거 + content = html_content.strip() + + # 내용 없는 경우 6번째 div가 Review_btn_group_ 포함 확인 (생략 가능, 상황에 따라 if문으로 처리 가능) + + reviews.append({ + "작성자": author, + "방문일": visit_date, + "작성일": written_date, + "내용": content + }) + + except Exception as e: + print("[WARN] 리뷰 항목 처리 중 오류:", e) + + except TimeoutException: + print("[ERROR] 리뷰 리스트 로딩 실패: 페이지에 리스트가 없음") + except Exception as e: + print("[ERROR] 리뷰 전체 추출 실패:", e) + + return reviews + + +def run(headless=True): + if not os.path.exists(COOKIE_FILE): + if headless: + print("[WARN] HEADLESS 모드에서는 로그인 불가 → HEADLESS=False로 재시작 중...") + run(headless=False) + return + if not perform_login(): + print("[ERROR] 로그인 실패 → 종료") + return + + driver = create_driver(headless=headless) + driver.get("https://naver.com") + time.sleep(1) + load_cookies(driver) + + total_reviews = 0 + + for biz_id in BIZ_ID: + print(f"\n=== [{biz_id}] 리뷰 수집 시작 ===") + try: + access_review_page(driver, biz_id) + + if is_login_required(driver): + print("[WARN] 세션 만료 또는 쿠키 무효. 재로그인 필요") + driver.quit() + os.remove(COOKIE_FILE) + run(headless=False) + return + + reviews = extract_reviews(driver) + print(f"[RESULT] 리뷰 {len(reviews)}개 수집됨") + total_reviews += len(reviews) + + for r in reviews: + print(" -", r["방문일"], r["작성일"], r["작성자"], r["내용"]) + except Exception as e: + print(f"[ERROR] {biz_id} 처리 중 오류:", e) + + driver.quit() + print("\n[SUMMARY] 총 수집 리뷰 수:", total_reviews) + + +if __name__ == "__main__": + run(headless=HEADLESS) diff --git a/naver_review/biz_run.py b/naver_review/biz_run.py new file mode 100644 index 0000000..fac0447 --- /dev/null +++ b/naver_review/biz_run.py @@ -0,0 +1,110 @@ +import tkinter as tk +from tkinter import messagebox +from datetime import datetime, timedelta +import threading +import sys +import subprocess +import os + +# biz_crawler.py 에서 run_crawler 함수 임포트 +from biz_crawler import run_crawler + +def get_default_dates(): + today = datetime.today() + if today.weekday() == 0: # 월요일 + start = today - timedelta(days=3) + end = today - timedelta(days=1) + else: + start = today - timedelta(days=1) + end = today - timedelta(days=1) + return start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d") + +class BizCrawlerGUI(tk.Tk): + def __init__(self): + super().__init__() + self.title("네이버 스마트플레이스 리뷰 크롤러") + self.geometry("600x450") + + frame = tk.Frame(self) + frame.pack(pady=10) + + tk.Label(frame, text="시작일 (YYYY-MM-DD):").grid(row=0, column=0, sticky="e") + tk.Label(frame, text="종료일 (YYYY-MM-DD):").grid(row=1, column=0, sticky="e") + + self.start_entry = tk.Entry(frame, width=15) + self.end_entry = tk.Entry(frame, width=15) + self.start_entry.grid(row=0, column=1, padx=5) + self.end_entry.grid(row=1, column=1, padx=5) + + start_default, end_default = get_default_dates() + self.start_entry.insert(0, start_default) + self.end_entry.insert(0, end_default) + + self.run_btn = tk.Button(self, text="크롤링 실행", command=self.start_crawling) + self.run_btn.pack(pady=10) + + self.log_text = tk.Text(self, height=20, state='disabled') + self.log_text.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) + + def log(self, msg): + self.log_text.config(state='normal') + self.log_text.insert(tk.END, msg + "\n") + self.log_text.see(tk.END) + self.log_text.config(state='disabled') + + def start_crawling(self): + start_date = self.start_entry.get().strip() + end_date = self.end_entry.get().strip() + + try: + sd = datetime.strptime(start_date, "%Y-%m-%d") + ed = datetime.strptime(end_date, "%Y-%m-%d") + if sd > ed: + raise ValueError("시작일이 종료일보다 이후일 수 없습니다.") + except Exception as e: + messagebox.showerror("입력 오류", f"날짜 형식 오류 또는 범위 오류: {e}") + return + + self.run_btn.config(state='disabled') + self.log_text.config(state='normal') + self.log_text.delete(1.0, tk.END) + self.log_text.config(state='disabled') + + threading.Thread(target=self.run_crawler_thread, args=(start_date, end_date), daemon=True).start() + + def run_crawler_thread(self, start_date, end_date): + import sys + + class StdoutRedirector: + def __init__(self, func): + self.func = func + def write(self, text): + if text.strip(): + self.func(text.strip()) + def flush(self): + pass + + sys.stdout = StdoutRedirector(self.log) + + try: + success = run_crawler(start_date, end_date) + if success is False: + self.log("[INFO] CAPTCHA 감지됨: 헤드리스 모드를 해제하고 프로그램을 재실행합니다.") + + python_executable = sys.executable + script_path = os.path.abspath(sys.argv[0]) + + # 재실행 명령 (기존 프로세스 종료 전) + subprocess.Popen([python_executable, script_path]) + os._exit(0) + + except Exception as e: + self.log(f"[ERROR] 크롤링 중 오류 발생: {e}") + + finally: + sys.stdout = sys.__stdout__ + self.after(0, lambda: self.run_btn.config(state='normal')) + +if __name__ == "__main__": + app = BizCrawlerGUI() + app.mainloop() diff --git a/naver_review/config.sample.py b/naver_review/config.sample.py index df2ea42..5958d88 100644 --- a/naver_review/config.sample.py +++ b/naver_review/config.sample.py @@ -1,5 +1,26 @@ # config.py -PLACE_IDS = ["플레이스 ID 1", "플레이스 ID 2"] # 여러 플레이스 ID 가능 +PLACE_IDS = ["12345678"] # 여러 플레이스 ID 가능 MAX_REVIEWS = 100 # 각 플레이스당 최대 수집 수 -START_DATE = "2025-07-01" # 필터링 시작일 +START_DATE = "2025-07-03" # 필터링 시작일 END_DATE = "2025-07-03" # 필터링 종료일 + + +NAVER_ID = 'ID' +NAVER_PW = 'PW' + + +# 네이버 비즈니스 페이지에서 보는 작성일 기준 리뷰 +BIZ_ID = [ + "BIZ_ID", + ] + +COOKIE_FILE = "naver_cookies.pkl" + +# 리뷰 작성일 필터 (YYYY-MM-DD 형식) +START_DATE = "2025-06-01" +END_DATE = "2025-07-03" + +# ========================== +# 설정 변경 지점 +HEADLESS = True # True로 하면 헤드리스 모드 실행 +# ========================== diff --git a/naver_review/naver_cookies.pkl b/naver_review/naver_cookies.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2307ef398a9806641dddc5ee7f7c0dd42ae70cec GIT binary patch literal 883 zcmZvaOOKmY0EMUNB$-70A&F3i0UOLLB9({3!x(I^F-EE)%8m9m_X=#KuN z{dgEWA8|U-%{yulxpuG34aGvf!T_Ziu zhWUBA&0QGG(_2jzzM5-uo^Fv_gL(Q1pQkrf;5B71KTWS&H*12~mR{Z5uq?|JwW#`o zajIq6;hV||-TmcEBUwjRDG|eq-C(JMb$y`f=#Wyb3e1Q2%ptxSor_2u2r~52;|_M~ zhtOAwHGM@a&y2aKnbb0;bMiz4?cVbb0z#P#u`}71BHzT>3iDCZFS7=eVRFK-FN0LC z+l$e=sbfi!3RbL0bv8zA&kBDw0)2ythqh)A;8e*ZFvq~EhSc!{`ap!T$JK)G<82@x zH|WUQmv~Q$S8;_3)Dyeb(k_F}vlVj6~|ShD(aEc_iF5H zi(|2g_CT@aG^nB;9M+k^85>h(qp}h;WehyX*|?d&Jz~})e5tnWT*Yyjy0|Z@;el*n z2M^9d$uC{eQv$Ue`lE;s`WXpXXZvtq$gQ#wI(geiyL{`hY(?w_&cxb9u15{};_6JJ ztwdo|i||~o42n>AXe@j^^(y-WA5jDO-2rpQHYrb9EYK~b<&J@yFkv;Hexx9;B>Cv= zmK|3WWAE^#Y*o^hNgH!{iu-FjBw;SMXlZa_3@w%LsMg_e>4=vhyQmkN%8Yq0IN_8v zMn+K8&;;v}I7cRU)Cz6Eu$oHQ-RI9Q(gi(=%b=Q-kAM9EnipZgtN)k^ZlA#Gf81n` zhZ(#Dci`OLJ^0V2M}T&3+l+Rv|J1KAt5U yb4IJZrsQCa)eL@7K2A;YztHFJ&^-3P#(DTJ^!WvP_i07YBtt*^7=M2p-2V;EEH1?W literal 0 HcmV?d00001 From 04f9e06e5e100fa70ebec1167a6c20d6920ce927 Mon Sep 17 00:00:00 2001 From: KWON Date: Thu, 10 Jul 2025 15:17:17 +0900 Subject: [PATCH 2/2] =?UTF-8?q?=EB=84=A4=EC=9D=B4=EB=B2=84=20=EB=A6=AC?= =?UTF-8?q?=EB=B7=B0=20=ED=81=AC=EB=A1=A4=EB=9F=AC=20=EB=B6=84=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- naver_cookies.pkl | Bin 1227 -> 0 bytes naver_review/biz_crawler.py | 255 --------------------------------- naver_review/biz_run.py | 110 -------------- naver_review/config.sample.py | 26 ---- naver_review/main.py | 170 ---------------------- naver_review/naver_cookies.pkl | Bin 883 -> 0 bytes naver_review/run.py | 89 ------------ 7 files changed, 650 deletions(-) delete mode 100644 naver_cookies.pkl delete mode 100644 naver_review/biz_crawler.py delete mode 100644 naver_review/biz_run.py delete mode 100644 naver_review/config.sample.py delete mode 100644 naver_review/main.py delete mode 100644 naver_review/naver_cookies.pkl delete mode 100644 naver_review/run.py diff --git a/naver_cookies.pkl b/naver_cookies.pkl deleted file mode 100644 index 5fe8ffb30beeadd023d07884944a6e8dc53d5ff0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1227 zcma)*+iK%h6oxYsCrv}=JqFywwvIM*pj#))ijJ0KS%#A8e6S=-mSo9>LhqV@dZ)fm z->WzEq|=tR6X@O-3)X`FTMPdG$H6bZ4lZ8acm4Yx`uE+nyGyMo>$}_g%sP1cduNyS z-L==|(f-(T-~Z-?-A!0kxs)Zx{<#}uR_gWLKwWC+x#U(A_T8IN-`$kwff`jGhvF_f zm+R7V>b=)Lbyo)~sn4C;364(C35=nWzCP2#xBtL~1MZtQei&Xohu7WJnE;Rb@Fx5+ zy!|8@e)SZ-`yVRsvW~;6r?bi>p_8A}_!`{BOIPNs0*bX{;^NGHfq3giI&Ngqr6jt3 z}gqr4+fgvD1I4d+xtc}3>_r=LHa{Drun9#vZXQ|gkp z+WzI{HT(AB0suf17@Jb=+T4Mm68Yw0M?;i4n=V86v{Lg89SBt994dvL8ylh^5VT05 zrRCH7$)S~W!MP(E=Lu+eaze;Cxi+de4x^wg?Ufvj*!X&r#mBh|Ri_kHOcwb2y-1PG z!a2@0ZEx$9o}6UFu=S=P0)oPGFB>Wt#$&7jFV$)8kLjpN8D~|eQoSZL(I6IBF%BF} zFI6X{JyMHIUB+i3f#G9`&D04=&SV#kBzeZ~?O9;x9tbmx3}n<4!j(Ndz(gzAYn5-& zBw&+WjzeT%W&$etk}gG5Y1$)2c-T%@2CnE3E=x+9wk%LU6CWBsj|~nK6HBWBQq?-m z&CRW2I?Z~{4+SctvCk?b$I!TEb;&z`F6I6_+^{ zZ*3dV@rc>f2#}gIWk>oS4mS-O zLuF$fGPulECx7cFJ4BJ}MVhl}QX|3;FK|6B9UEtue7BS4&cyIBv-$9vo$vk%&5I9# zuKm}C-M1zmY*vz27en>|IHnuNSQ7}=AuRwkX3Mdh!1S@L$2zIQ4|tTM_;eef{4p*$ jG3JW;l@Q$h9fB-!AEV6w7yS7g{F^%*I)4Q2Yw!MdU-g-- diff --git a/naver_review/biz_crawler.py b/naver_review/biz_crawler.py deleted file mode 100644 index 0617311..0000000 --- a/naver_review/biz_crawler.py +++ /dev/null @@ -1,255 +0,0 @@ -import undetected_chromedriver as uc -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import NoSuchElementException, TimeoutException -import pickle -import os -import time -from datetime import datetime -import sys -from config import HEADLESS, BIZ_ID, NAVER_ID, NAVER_PW, START_DATE, END_DATE, COOKIE_FILE - - -def create_driver(headless=False): - options = uc.ChromeOptions() - options.add_argument('--window-size=375,812') - if headless: - options.add_argument("--headless=new") - options.add_argument("--disable-gpu") - - options.add_argument("--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) " - "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15A372 Safari/604.1") - - driver = uc.Chrome(options=options) - - if not headless: - # 명시적으로 모바일 해상도로 강제 설정 - driver.set_window_size(375, 812) - - return driver - - -def save_cookies(driver): - cookies = driver.get_cookies() - for c in cookies: - c.pop("sameSite", None) - if "expiry" in c: - c["expires"] = c.pop("expiry") - with open(COOKIE_FILE, "wb") as f: - pickle.dump(cookies, f) - print(f"[INFO] 쿠키 저장 완료 ({len(cookies)}개)") - - -def load_cookies(driver): - with open(COOKIE_FILE, "rb") as f: - cookies = pickle.load(f) - for cookie in cookies: - driver.add_cookie(cookie) - print(f"[INFO] 쿠키 로드 완료 ({len(cookies)}개)") - - -def perform_login(): - driver = create_driver(headless=False) - wait = WebDriverWait(driver, 20) - print("[INFO] 로그인 시도 중...") - - driver.get(f"https://new.smartplace.naver.com/bizes/place/{BIZ_ID[0]}/reviews") - time.sleep(2) - - try: - modal = wait.until(EC.presence_of_element_located((By.ID, "modal-root"))) - login_btn = modal.find_element(By.XPATH, './/button') - login_btn.click() - print("[INFO] 로그인 버튼 클릭됨") - except: - print("[WARN] 로그인 모달이 감지되지 않음") - - try: - wait.until(EC.presence_of_element_located((By.ID, 'id'))).send_keys(NAVER_ID) - driver.find_element(By.ID, 'pw').send_keys(NAVER_PW) - driver.find_element(By.XPATH, '//button[@type="submit"]').click() - print("[INFO] 로그인 폼 제출 완료") - except Exception as e: - print("[ERROR] 로그인 페이지 구성 실패:", e) - driver.quit() - return False - - time.sleep(3) - if "captcha" in driver.page_source.lower() or "자동입력 방지문자" in driver.page_source: - print("\n⚠️ CAPTCHA 감지됨. 브라우저에서 수동 입력 후 Enter 키를 누르세요...") - input("✅ 완료되었으면 Enter를 누르세요.") - - time.sleep(3) - save_cookies(driver) - driver.quit() - return True - - -def is_login_required(driver): - return "로그인이 필요한 기능" in driver.page_source - - -def access_review_page(driver, biz_id): - driver.get(f"https://new.smartplace.naver.com/bizes/place/{biz_id}/reviews") - time.sleep(2) - -import re - -def extract_reviews(driver): - reviews = [] - try: - WebDriverWait(driver, 10).until( - EC.presence_of_element_located((By.XPATH, "//ul[starts-with(@class, 'Review_columns_list')]")) - ) - lis = driver.find_elements(By.XPATH, "//ul[starts-with(@class, 'Review_columns_list')]/li") - - for li in lis: - if "Review_banner__" in li.get_attribute("class"): - continue - - try: - # 작성자 - author = li.find_element(By.XPATH, ".//div[1]/a[2]/div/span/span").text.strip() - - # 방문일 - visit_date_text = li.find_element(By.XPATH, ".//div[2]/div[1]/span[2]/time").text.strip() - visit_date = datetime.strptime( - visit_date_text.split("(")[0].replace(". ", "-").replace(".", ""), - "%Y-%m-%d" - ).strftime("%Y-%m-%d") - - # 작성일, 예약자 구분 후 날짜 텍스트 추출 예시 - - # li > div[2] > div[2] 영역 - date_container = li.find_element(By.XPATH, ".//div[2]/div[2]") - - # span 태그들 텍스트 확인 - spans = date_container.find_elements(By.TAG_NAME, "span") - labels = [span.text.strip() for span in spans] - - # 기본값: 작성일 날짜 위치 - written_date_text = None - - if "작성일" in labels: - # 작성일이 span[2] 위치 - # 예: labels = ['작성일', '2025-07-03'] - idx = labels.index("작성일") - # 날짜는 그 다음 span 위치(예시가 span[2]이면 index=1, +1) - if idx + 1 < len(spans): - written_date_text = spans[idx + 1].find_element(By.TAG_NAME, "time").text.strip() - elif "예약자" in labels: - # 예약자인 경우 날짜가 div[3] 에 있을 수 있음 - try: - written_date_text = li.find_element(By.XPATH, ".//div[3]/div[1]/span[2]/time").text.strip() - except NoSuchElementException: - written_date_text = None - - if not written_date_text: - # 날짜 못 찾으면 건너뛰기 - continue - - written_date = datetime.strptime( - written_date_text.split("(")[0].replace(". ", "-").replace(".", ""), - "%Y-%m-%d" - ).strftime("%Y-%m-%d") - - - # 날짜 필터링 - if not (START_DATE <= written_date <= END_DATE): - continue - - # 이미지 유무 체크 - has_image = False - try: - li.find_element(By.XPATH, ".//div[starts-with(@class, 'Review_img_slide_wrap_')]") - has_image = True - except NoSuchElementException: - pass - - # 내용 추출 - if has_image: - content_el = li.find_element(By.XPATH, "./div[5]/a") - else: - content_el = li.find_element(By.XPATH, "./div[4]/a") - - # innerHTML 얻기 - html_content = content_el.get_attribute("innerHTML") - - #
태그를 줄바꿈으로 변환 - for br_tag in ["
", "
", "
", "
", "
", "
"]: - html_content = html_content.replace(br_tag, "\n") - - # span.pui__blind 제거 (html 태그 형태) - html_content = re.sub(r'.*?<\/span>', '', html_content, flags=re.DOTALL) - - # 남은 html 태그가 있을 수 있으니 간단히 제거 (예:
태그 포함될 경우) - html_content = re.sub(r'<.*?>', '', html_content) - - # 공백 양쪽 제거 - content = html_content.strip() - - # 내용 없는 경우 6번째 div가 Review_btn_group_ 포함 확인 (생략 가능, 상황에 따라 if문으로 처리 가능) - - reviews.append({ - "작성자": author, - "방문일": visit_date, - "작성일": written_date, - "내용": content - }) - - except Exception as e: - print("[WARN] 리뷰 항목 처리 중 오류:", e) - - except TimeoutException: - print("[ERROR] 리뷰 리스트 로딩 실패: 페이지에 리스트가 없음") - except Exception as e: - print("[ERROR] 리뷰 전체 추출 실패:", e) - - return reviews - - -def run(headless=True): - if not os.path.exists(COOKIE_FILE): - if headless: - print("[WARN] HEADLESS 모드에서는 로그인 불가 → HEADLESS=False로 재시작 중...") - run(headless=False) - return - if not perform_login(): - print("[ERROR] 로그인 실패 → 종료") - return - - driver = create_driver(headless=headless) - driver.get("https://naver.com") - time.sleep(1) - load_cookies(driver) - - total_reviews = 0 - - for biz_id in BIZ_ID: - print(f"\n=== [{biz_id}] 리뷰 수집 시작 ===") - try: - access_review_page(driver, biz_id) - - if is_login_required(driver): - print("[WARN] 세션 만료 또는 쿠키 무효. 재로그인 필요") - driver.quit() - os.remove(COOKIE_FILE) - run(headless=False) - return - - reviews = extract_reviews(driver) - print(f"[RESULT] 리뷰 {len(reviews)}개 수집됨") - total_reviews += len(reviews) - - for r in reviews: - print(" -", r["방문일"], r["작성일"], r["작성자"], r["내용"]) - except Exception as e: - print(f"[ERROR] {biz_id} 처리 중 오류:", e) - - driver.quit() - print("\n[SUMMARY] 총 수집 리뷰 수:", total_reviews) - - -if __name__ == "__main__": - run(headless=HEADLESS) diff --git a/naver_review/biz_run.py b/naver_review/biz_run.py deleted file mode 100644 index fac0447..0000000 --- a/naver_review/biz_run.py +++ /dev/null @@ -1,110 +0,0 @@ -import tkinter as tk -from tkinter import messagebox -from datetime import datetime, timedelta -import threading -import sys -import subprocess -import os - -# biz_crawler.py 에서 run_crawler 함수 임포트 -from biz_crawler import run_crawler - -def get_default_dates(): - today = datetime.today() - if today.weekday() == 0: # 월요일 - start = today - timedelta(days=3) - end = today - timedelta(days=1) - else: - start = today - timedelta(days=1) - end = today - timedelta(days=1) - return start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d") - -class BizCrawlerGUI(tk.Tk): - def __init__(self): - super().__init__() - self.title("네이버 스마트플레이스 리뷰 크롤러") - self.geometry("600x450") - - frame = tk.Frame(self) - frame.pack(pady=10) - - tk.Label(frame, text="시작일 (YYYY-MM-DD):").grid(row=0, column=0, sticky="e") - tk.Label(frame, text="종료일 (YYYY-MM-DD):").grid(row=1, column=0, sticky="e") - - self.start_entry = tk.Entry(frame, width=15) - self.end_entry = tk.Entry(frame, width=15) - self.start_entry.grid(row=0, column=1, padx=5) - self.end_entry.grid(row=1, column=1, padx=5) - - start_default, end_default = get_default_dates() - self.start_entry.insert(0, start_default) - self.end_entry.insert(0, end_default) - - self.run_btn = tk.Button(self, text="크롤링 실행", command=self.start_crawling) - self.run_btn.pack(pady=10) - - self.log_text = tk.Text(self, height=20, state='disabled') - self.log_text.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) - - def log(self, msg): - self.log_text.config(state='normal') - self.log_text.insert(tk.END, msg + "\n") - self.log_text.see(tk.END) - self.log_text.config(state='disabled') - - def start_crawling(self): - start_date = self.start_entry.get().strip() - end_date = self.end_entry.get().strip() - - try: - sd = datetime.strptime(start_date, "%Y-%m-%d") - ed = datetime.strptime(end_date, "%Y-%m-%d") - if sd > ed: - raise ValueError("시작일이 종료일보다 이후일 수 없습니다.") - except Exception as e: - messagebox.showerror("입력 오류", f"날짜 형식 오류 또는 범위 오류: {e}") - return - - self.run_btn.config(state='disabled') - self.log_text.config(state='normal') - self.log_text.delete(1.0, tk.END) - self.log_text.config(state='disabled') - - threading.Thread(target=self.run_crawler_thread, args=(start_date, end_date), daemon=True).start() - - def run_crawler_thread(self, start_date, end_date): - import sys - - class StdoutRedirector: - def __init__(self, func): - self.func = func - def write(self, text): - if text.strip(): - self.func(text.strip()) - def flush(self): - pass - - sys.stdout = StdoutRedirector(self.log) - - try: - success = run_crawler(start_date, end_date) - if success is False: - self.log("[INFO] CAPTCHA 감지됨: 헤드리스 모드를 해제하고 프로그램을 재실행합니다.") - - python_executable = sys.executable - script_path = os.path.abspath(sys.argv[0]) - - # 재실행 명령 (기존 프로세스 종료 전) - subprocess.Popen([python_executable, script_path]) - os._exit(0) - - except Exception as e: - self.log(f"[ERROR] 크롤링 중 오류 발생: {e}") - - finally: - sys.stdout = sys.__stdout__ - self.after(0, lambda: self.run_btn.config(state='normal')) - -if __name__ == "__main__": - app = BizCrawlerGUI() - app.mainloop() diff --git a/naver_review/config.sample.py b/naver_review/config.sample.py deleted file mode 100644 index 5958d88..0000000 --- a/naver_review/config.sample.py +++ /dev/null @@ -1,26 +0,0 @@ -# config.py -PLACE_IDS = ["12345678"] # 여러 플레이스 ID 가능 -MAX_REVIEWS = 100 # 각 플레이스당 최대 수집 수 -START_DATE = "2025-07-03" # 필터링 시작일 -END_DATE = "2025-07-03" # 필터링 종료일 - - -NAVER_ID = 'ID' -NAVER_PW = 'PW' - - -# 네이버 비즈니스 페이지에서 보는 작성일 기준 리뷰 -BIZ_ID = [ - "BIZ_ID", - ] - -COOKIE_FILE = "naver_cookies.pkl" - -# 리뷰 작성일 필터 (YYYY-MM-DD 형식) -START_DATE = "2025-06-01" -END_DATE = "2025-07-03" - -# ========================== -# 설정 변경 지점 -HEADLESS = True # True로 하면 헤드리스 모드 실행 -# ========================== diff --git a/naver_review/main.py b/naver_review/main.py deleted file mode 100644 index cf8ceec..0000000 --- a/naver_review/main.py +++ /dev/null @@ -1,170 +0,0 @@ -from selenium import webdriver -from selenium.webdriver.common.by import By -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from datetime import datetime -import time -import config # 사용자 설정 파일 - -# ─────────────────────────────────────────────────────── -# ✅ WebDriver 설정 (모바일 User-Agent 포함, 헤드리스 옵션 가능) -# ─────────────────────────────────────────────────────── -def setup_driver(): - chrome_options = Options() - chrome_options.add_argument("--headless=new") # 필요 시 주석 해제 - chrome_options.add_argument("--no-sandbox") - chrome_options.add_argument("--disable-dev-shm-usage") - chrome_options.add_argument( - "--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 13_5 like Mac OS X) " - "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Mobile/15E148 Safari/604.1" - ) - print("[INFO] Chrome WebDriver 실행 중...") - return webdriver.Chrome(options=chrome_options) - -# ─────────────────────────────────────────────────────── -# ✅ 날짜 문자열 파싱 함수 (예: '2025년 6월 8일 일요일') -# ─────────────────────────────────────────────────────── -def parse_korean_date(date_str): - try: - # '2025년 6월 8일 일요일' → '2025년 6월 8일' - date_clean = " ".join(date_str.strip().split(" ")[:3]) - return datetime.strptime(date_clean, "%Y년 %m월 %d일").date() - except Exception as e: - print(f"[WARN] 날짜 파싱 실패: {date_str} ({e})") - return None - -# ─────────────────────────────────────────────────────── -# ✅ "더보기" 버튼 클릭 함수 (한 번만 클릭) -# ─────────────────────────────────────────────────────── -def click_more(driver): - try: - container = driver.find_element(By.CLASS_NAME, "place_section_content") - more_div = container.find_element(By.XPATH, "./following-sibling::div[1]") - more_btn = more_div.find_element(By.TAG_NAME, "a") - driver.execute_script("arguments[0].click();", more_btn) - print("[INFO] 더보기 클릭") - time.sleep(2) - return True - except: - print("[INFO] 더보기 없음") - return False - -# ─────────────────────────────────────────────────────── -# ✅ 업체명 추출 함수 (페이지 상단의 "_title" ID 활용) -# ─────────────────────────────────────────────────────── -def extract_shop_name(driver): - try: - main = driver.find_element(By.CSS_SELECTOR, 'div[role="main"]') - title = main.find_element(By.ID, "_title") - name = title.find_element(By.TAG_NAME, "span").text.strip() - return name - except Exception as e: - print(f"[WARN] 업체명 추출 실패: {e}") - return "업체명 없음" - -# ─────────────────────────────────────────────────────── -# ✅ 리뷰 추출 함수: 작성자 / 날짜 / 본문 -# ─────────────────────────────────────────────────────── -def extract_reviews(driver): - wait = WebDriverWait(driver, 10) - wait.until(EC.presence_of_element_located((By.ID, "_review_list"))) - - ul = driver.find_element(By.ID, "_review_list") - items = ul.find_elements(By.XPATH, './/li[contains(@class, "place_apply_pui")]') - reviews = [] - - for item in items: - try: - # ① 작성자: ./div[1]/a[2]/div/span/span - writer = "익명" - try: - writer = item.find_element(By.XPATH, "./div[1]/a[2]/div/span/span").text.strip() - except: - pass - - # ② 날짜: ./div[7]/div[2]/div/span[1]/span[2] - date = "날짜 없음" - date_obj = None - try: - date_text = item.find_element(By.XPATH, "./div[7]/div[2]/div/span[1]/span[2]").text.strip() - date = date_text - date_obj = parse_korean_date(date_text) - except: - pass - - # ③ 본문: ./div[5]/a - text = "" - try: - text = item.find_element(By.XPATH, "./div[5]/a").text.strip() - except: - pass - - if text: - reviews.append({ - "writer": writer, - "date": date, - "date_obj": date_obj, - "text": text - }) - except Exception as e: - print(f"[WARN] 리뷰 추출 실패: {e}") - - return reviews - -# ─────────────────────────────────────────────────────── -# ✅ 특정 기간 내 리뷰 수집 함수 (날짜 필터 + 더보기 반복) -# ─────────────────────────────────────────────────────── -def crawl_reviews_within_range(place_id, start_date, end_date): - url = f"https://m.place.naver.com/place/{place_id}/review/visitor?reviewSort=recent" - driver = setup_driver() - print(f"[INFO] 리뷰 페이지 접속: {url}") - driver.get(url) - - shop_name = extract_shop_name(driver) - all_reviews = [] - seen = set() - - while True: - new_reviews = extract_reviews(driver) - if not new_reviews: - break - - filtered = [] - for r in new_reviews: - if r["date_obj"] is None: - continue - if start_date <= r["date_obj"] <= end_date: - key = (r["writer"], r["date_obj"], r["text"]) - if key not in seen: - seen.add(key) - filtered.append(r) - - if not filtered: - print("[INFO] 범위 내 리뷰 없음 → 수집 종료") - break - - all_reviews.extend(filtered) - - # 더 클릭할 필요가 없으면 종료 - if not click_more(driver): - break - - driver.quit() - print(f"[DONE] [{shop_name}] {len(all_reviews)}개 리뷰 수집 완료") - return shop_name, all_reviews - -# ─────────────────────────────────────────────────────── -# ✅ 메인 실행부 -# ─────────────────────────────────────────────────────── -if __name__ == "__main__": - start_date = datetime.strptime(config.START_DATE, "%Y-%m-%d").date() - end_date = datetime.strptime(config.END_DATE, "%Y-%m-%d").date() - - for place_id in config.PLACE_IDS: - shop, reviews = crawl_reviews_within_range(place_id, start_date, end_date) - - print(f"\n==== {shop} ({place_id}) 리뷰 목록 ====") - for i, r in enumerate(reviews, 1): - print(f"{i}. 작성자: {r['writer']}, 날짜: {r['date']}") - print(f" 내용: {r['text']}\n") diff --git a/naver_review/naver_cookies.pkl b/naver_review/naver_cookies.pkl deleted file mode 100644 index 2307ef398a9806641dddc5ee7f7c0dd42ae70cec..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 883 zcmZvaOOKmY0EMUNB$-70A&F3i0UOLLB9({3!x(I^F-EE)%8m9m_X=#KuN z{dgEWA8|U-%{yulxpuG34aGvf!T_Ziu zhWUBA&0QGG(_2jzzM5-uo^Fv_gL(Q1pQkrf;5B71KTWS&H*12~mR{Z5uq?|JwW#`o zajIq6;hV||-TmcEBUwjRDG|eq-C(JMb$y`f=#Wyb3e1Q2%ptxSor_2u2r~52;|_M~ zhtOAwHGM@a&y2aKnbb0;bMiz4?cVbb0z#P#u`}71BHzT>3iDCZFS7=eVRFK-FN0LC z+l$e=sbfi!3RbL0bv8zA&kBDw0)2ythqh)A;8e*ZFvq~EhSc!{`ap!T$JK)G<82@x zH|WUQmv~Q$S8;_3)Dyeb(k_F}vlVj6~|ShD(aEc_iF5H zi(|2g_CT@aG^nB;9M+k^85>h(qp}h;WehyX*|?d&Jz~})e5tnWT*Yyjy0|Z@;el*n z2M^9d$uC{eQv$Ue`lE;s`WXpXXZvtq$gQ#wI(geiyL{`hY(?w_&cxb9u15{};_6JJ ztwdo|i||~o42n>AXe@j^^(y-WA5jDO-2rpQHYrb9EYK~b<&J@yFkv;Hexx9;B>Cv= zmK|3WWAE^#Y*o^hNgH!{iu-FjBw;SMXlZa_3@w%LsMg_e>4=vhyQmkN%8Yq0IN_8v zMn+K8&;;v}I7cRU)Cz6Eu$oHQ-RI9Q(gi(=%b=Q-kAM9EnipZgtN)k^ZlA#Gf81n` zhZ(#Dci`OLJ^0V2M}T&3+l+Rv|J1KAt5U yb4IJZrsQCa)eL@7K2A;YztHFJ&^-3P#(DTJ^!WvP_i07YBtt*^7=M2p-2V;EEH1?W diff --git a/naver_review/run.py b/naver_review/run.py deleted file mode 100644 index ed90828..0000000 --- a/naver_review/run.py +++ /dev/null @@ -1,89 +0,0 @@ -import tkinter as tk -from tkinter import ttk, scrolledtext, messagebox -from datetime import datetime, timedelta -import threading -import main # main.py가 같은 폴더에 있어야 함 - -def get_yesterday_str(): - return (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d") - -def run_crawler_thread(place_ids, start_date, end_date, output_widget, run_button): - def task(): - try: - output_widget.config(state='normal') - output_widget.delete("1.0", tk.END) - output_widget.insert(tk.END, f"크롤링 시작: {start_date} ~ {end_date}\n\n") - - start_date_obj = datetime.strptime(start_date, "%Y-%m-%d").date() - end_date_obj = datetime.strptime(end_date, "%Y-%m-%d").date() - - for place_id in place_ids: - output_widget.insert(tk.END, f"[업체 ID: {place_id}] 리뷰 수집 중...\n") - shop_name, reviews = main.crawl_reviews_within_range(place_id, start_date_obj, end_date_obj) - - output_widget.insert(tk.END, f"업체명: {shop_name}\n") - output_widget.insert(tk.END, f"수집 리뷰 수: {len(reviews)}\n") - - for i, r in enumerate(reviews, 1): - output_widget.insert(tk.END, f"{i}. 작성자: {r['writer']}, 날짜: {r['date']}\n") - output_widget.insert(tk.END, f" 내용: {r['text']}\n") - - output_widget.insert(tk.END, "\n") - - output_widget.insert(tk.END, "크롤링 완료!\n") - output_widget.config(state='disabled') - except Exception as e: - messagebox.showerror("오류", f"크롤링 중 오류 발생:\n{e}") - output_widget.config(state='disabled') - finally: - run_button.config(state='normal') - - run_button.config(state='disabled') - threading.Thread(target=task, daemon=True).start() - -def create_gui(): - root = tk.Tk() - root.title("네이버 플레이스 리뷰 크롤러") - - frm = ttk.Frame(root, padding=10) - frm.grid() - - ttk.Label(frm, text="시작일 (YYYY-MM-DD):").grid(column=0, row=0, sticky='w') - start_entry = ttk.Entry(frm, width=15) - start_entry.grid(column=1, row=0) - start_entry.insert(0, get_yesterday_str()) - - ttk.Label(frm, text="종료일 (YYYY-MM-DD):").grid(column=0, row=1, sticky='w') - end_entry = ttk.Entry(frm, width=15) - end_entry.grid(column=1, row=1) - end_entry.insert(0, get_yesterday_str()) - - output = scrolledtext.ScrolledText(frm, width=80, height=25, state='disabled') - output.grid(column=0, row=3, columnspan=3, pady=10) - - def on_run(): - start_date = start_entry.get() - end_date = end_entry.get() - - # 날짜 형식 검증 - try: - datetime.strptime(start_date, "%Y-%m-%d") - datetime.strptime(end_date, "%Y-%m-%d") - except ValueError: - messagebox.showerror("입력 오류", "날짜 형식을 YYYY-MM-DD 로 입력하세요.") - return - - place_ids = getattr(main.config, "PLACE_IDS", []) - if not place_ids: - messagebox.showerror("설정 오류", "config.py에 PLACE_IDS 리스트가 비어있습니다.") - return - - run_crawler_thread(place_ids, start_date, end_date, output, run_button) - - run_button = ttk.Button(frm, text="실행", command=on_run) - run_button.grid(column=0, row=2, pady=5) - - root.mainloop() - -if __name__ == "__main__": - create_gui()