
import os
import random
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import zyt_validation_utils

import utils

USED_VIDEO_URLS = set()
USED_IMAGE_URLS = set()
VIDEO_IDX = 1
IMAGE_IDX = 1
PAGE_IDX = 1

type_name = 'animal'

# 初始化
try:
    with open(f'history/example/{type_name}/example_video_urls.txt', 'r') as url_file:
        USED_VIDEO_URLS = set(url.strip() for url in url_file.readlines())
except FileNotFoundError:
    # 如果文件不存在，创建一个空的 USED_VIDEO_URLS
    pass

try:
    with open(f'history/xhs/{type_name}/example_image_urls.txt', 'r') as url_file:
        USED_IMAGE_URLS = set(url.strip() for url in url_file.readlines())
except FileNotFoundError:
    # 如果文件不存在，创建一个空的 USED_IMAGE_URLS
    pass


def add_video_idx():
    global VIDEO_IDX
    VIDEO_IDX = VIDEO_IDX + 1

def add_image_idx():
    global IMAGE_IDX
    IMAGE_IDX = IMAGE_IDX + 1


def add_page_idx():
    global PAGE_IDX
    PAGE_IDX = PAGE_IDX + 1

def create_option():
    options = Options()
    options.add_argument('--ignore-certificate-errors')  # 忽略证书错误
    options.add_argument('--no-first-run')  # 跳过首次运行向导
    options.add_argument('--disable-blink-features=AutomationControlled')  # 反爬虫检测
    options.add_argument('--disable-background-networking')  # 禁用后台网络交互
    options.add_argument('--disable-sync')  # 关闭账号同步
    options.add_argument('--disable-extensions')  # 禁用扩展
    options.add_argument('--disable-default-apps')  # 禁用默认应用
    options.add_argument('--disable-translate')  # 禁用翻译
    options.add_argument('--disable-web-resources')  # 禁用网络资源加载
    options.add_argument('--disable-component-update')  # 禁用组件更新
    options.add_argument('--disable-domain-reliability')  # 禁用域名可靠性检测
    options.add_argument('--disable-notifications')  # 禁用通知
    options.add_argument('--disable-features=TranslateUI')  # 关闭翻译 UI
    options.add_argument('--no-sandbox')  # 以 root 运行
    options.add_argument('--disable-dev-shm-usage')  # 共享内存不足时避免崩溃
    options.add_argument('--log-level=3')  # 只显示严重错误，减少日志
    options.add_argument(f'--proxy-server={utils.get_random_proxy()}')
    options.add_argument(f'--user-agent={utils.get_random_user_agent()}')

    options.debugger_address = "127.0.0.1:9222"  # 连接已打开网页

    return options


options = create_option()

driver = webdriver.Chrome(options=options)


def reload_web():
    global driver
    if driver:
        try:
            driver.quit()
        except Exception as e:
            print(f"关闭旧会话失败:{e}")

    new_options = create_option()

    driver = webdriver.Chrome(options=new_options)


def try_web():
    try:
        driver.get('https://www.example.com')
        print(f"页面加载成功，开始等待")
        time.sleep(60)
        return True
    except Exception as e:
        print(f"页面加载失败：{str(e)}")
        return False


def change_web(try_time=5):
    print('begin change web')
    cnt = 0
    while cnt < try_time:
        cnt = cnt + 1
        print(f'try {cnt} times')
        reload_web()

        if try_web():
            break
        else:
            time.sleep(10)


def get_page_urls(keyword, max_page=10):
    all_urls = []

    search_url = f'https://www.example.com/'
    driver.get(search_url)

    for page in range(max_pages):
        print(f"begin page: [{page}]")
        # 加载搜索页面
        time.sleep(5)
        retries = 0
        max_retries = 3

        while retries < max_retries:
            try:
                page_links = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'example')))

                print(f"get {len(page_links)} urls")
                for page_link in page_links:
                    page_url = page_link.get_attribute('href')

                    if page_url:
                        all_urls.append(page_url)

                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((
                        By.CSS_SELECTOR, 'example')))

                next_button.click()
                break
            except Exception as e:
                retries += 1  # 失败后增加重试次数
                print(f"失败，进行重试，重试次数：{retries}/{max_retries}，错误信息：{e}")
                driver.refresh()
                time.sleep(10)
                if retries == max_retries:
                    print("重试次数达到最大值，查找失败:{e}。")

        if retries == max_retries:
            print(f"提早退出：{page}")
            break

    all_urls = list(set(all_urls))
    print(f"获取到 {len(all_urls)} 条 URL。")
    utils.save_list_to_txt(f'history/example/{type_name}/example_get_page_urls.txt', all_urls)

    return all_urls


def get_info_urls(all_urls):
    video_urls = []
    image_urls = []

    for url in urls:
        time.sleep(round(random.uniform(1, 1.5), 2))
        print(f"begin: {url}")

        retries = 0
        max_retries = 3
        response = None
        while retries < max_retries:
            try:
                headers = {
                    'User-Agent': utils.get_random_user_agent(),
                    'Referer': 'https://www.example.com/',  # 设置Referer
                }

                response = requests.get(url, headers=headers, proxies={'https': utils.get_random_proxy()}, timeout=10)

                response.raise_for_status()

                break
            except Exception as e:
                print(f'{e}, begin retry')
                retries += 1  # 失败后增加重试次数

        if not response:
            continue

        html = response.text
        # 解析 HTML
        soup = BeautifulSoup(html, 'html.parser')

        image_tags = soup.find_all('img', class_=" ")  # example
        video_tags = soup.find_all('video', class_=" ")  # example

        for img in image_tags:
            src = img.get('src')
            if src:  # 确保 src 不为空
                image_urls.append(src)
        
        for video in video_tags:
            src = video.get('src')
            if src:  # 确保 src 不为空
                video_urls.append(src)

        file_path = f'history/example/{type_name}/used_page_urls.txt'
        mode = "a" if os.path.exists(file_path) else "w"

        with open(file_path, mode) as f:
            f.write(url + '\n')
    
    video_urls = list(set(video_urls))
    image_urls = list(set(image_urls))

    print(f'共获取{len(video_urls)}个视频, {len(image_urls)}个图片')

    utils.save_list_to_txt(f'history/example/{type_name}/get_video_urls.txt', list(video_urls))      
    utils.save_list_to_txt(f'history/example/{type_name}/get_image_urls.txt', list(image_urls))

    return video_urls, image_urls


def download_urls(url_infos, save_dir, save_way=0):  # save_way=0表示下载视频和图片，=1表示只下载视频，=2表示只下载图片
    image_dir = os.path.join(save_dir, 'image')
    video_dir = os.path.join(save_dir, 'video')
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(video_dir, exist_ok=True)

    if not url_infos:
        print('urls empty')
        return

    video_urls = url_infos[0]
    image_urls = url_infos[1]

    old_video_idx = VIDEO_IDX
    old_image_idx = IMAGE_IDX

    if save_way == 0:
        for video_url in video_urls:
            download_video(video_url, video_dir)
        for image_url in image_urls:
            download_image(image_url, image_dir)
    elif save_way == 1:
        for video_url in video_urls:
            download_video(video_url, video_dir)
    else:
        for image_url in image_urls:
            download_image(image_url, image_dir)

    print(f"下载全部完成，本次共下载{VIDEO_IDX-old_video_idx}个视频，{IMAGE_IDX-old_image_idx}个图片（新计数值，VIDEO_IDX={VIDEO_IDX},IMAGE_IDX={IMAGE_IDX})")


def download_video(video_url, save_dir):
    if not video_url:
        print('empty！')
        return

    if video_url in USED_VIDEO_URLS:
        print(f"视频链接 {video_url} 已经处理过，跳过下载。")
        return

    # 发送 GET 请求获取视频内容
    header = {
        'User-Agent': utils.get_random_user_agent(),
        'Referer': 'https://www.example.com/',  # 设置Referer
    }

    for _ in range(5):
        proxy = utils.get_random_proxy()
        try:
            response = requests.get(video_url, headers=header, proxies={'https': proxy}, stream=True, timeout=10)
            response.raise_for_status()  # 如果请求失败，抛出异常

            current_time = utils.get_formatted_timestamp()

            filename = f'C_examplev{VIDEO_IDX:05d}_{current_time}.mp4'

            save_path = os.path.join(save_dir, filename)

            # 将视频内容写入文件
            with open(save_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        file.write(chunk)

            with open(f'history/example/{type_name}/example_video_urls.txt', 'a') as url:
                url.write(video_url + '\n')

            USED_VIDEO_URLS.add(video_url)

            add_video_idx()

            print(f"视频已成功下载到 {save_path}")
            break
        except requests.exceptions.RequestException as e:
            print(f"{proxy}_下载失败: {e}")
            continue


def download_image(image_url, save_dir):
    if not image_url:
        print('empty！')
        return

    if image_url in USED_IMAGE_URLS:
        print(f"图片链接 {image_url} 已经处理过，跳过下载。")
        return

    # 发送 GET 请求获取视频内容
    header = {
        'User-Agent': utils.get_random_user_agent(),
    }

    for _ in range(5):
        proxy = utils.get_random_proxy()
        try:
            response = requests.get(image_url, headers=header, proxies={'https': proxy}, stream=True, timeout=10)
            response.raise_for_status()  # 如果请求失败，抛出异常
            image_data = response.content

            current_time = utils.get_formatted_timestamp()

            filename = f'C_examplei{IMAGE_IDX:05d}_{current_time}.jpg'

            save_path = os.path.join(save_dir, filename)

            with open(save_path, 'wb') as file:
                file.write(image_data)

            with open(f'history/example/{type_name}/example_image_urls.txt', 'a') as url:
                url.write(image_url + '\n')

            USED_IMAGE_URLS.add(image_url)

            add_image_idx()

            print(f"图片已成功下载到 {save_path}")
            break
        except requests.exceptions.RequestException as e:
            print(f"{proxy}_下载失败: {e}")
            continue
