TikTok 标签页：采集发布内容的账号（用户名）

需要 Python 3.10 及以上版本的环境

代码

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
import random
import csv
from datetime import datetime
import re

# 依赖安装
# pip3 install selenium beautifulsoup4
# pip3 install webdriver-manager
# pip3 install requests lxml html5lib


def init_driver(headless=True):
    """Start a Chrome WebDriver configured to look less like an automated browser.

    Args:
        headless: When true, Chrome is launched with ``--headless``.

    Returns:
        The started ``webdriver.Chrome`` instance with the stealth script
        registered for every new document.

    Raises:
        Exception: Any error raised while starting Chrome or registering the
            stealth script. The error is printed and re-raised; a browser
            that had already been started is quit first.
    """
    options = webdriver.ChromeOptions()

    if headless:
        options.add_argument('--headless')

    chrome_flags = (
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-blink-features=AutomationControlled',
        '--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        '--window-size=1920,1080',
        # Additional switches intended to reduce automation fingerprints.
        '--disable-infobars',
        '--disable-extensions',
        '--disable-popup-blocking',
        '--disable-notifications',
        # NOTE(review): this switch turns off the same-origin policy for every
        # page the browser loads; confirm it is really needed for scraping.
        '--disable-web-security',
        '--disable-translate',
        '--disable-logging',
        '--disable-default-apps',
        '--disable-sync',
        '--disable-background-networking',
        '--disable-client-side-phishing-detection',
        '--disable-component-update',
        '--disable-hang-monitor',
        '--disable-prompt-on-repost',
        '--disable-renderer-backgrounding',
        '--disable-session-crashed-bubble',
        '--disable-web-resources',
        '--metrics-recording-only',
        '--no-first-run',
        '--safebrowsing-disable-auto-update',
        '--password-store=basic',
    )
    for flag in chrome_flags:
        options.add_argument(flag)

    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    # Registered through CDP so it runs before page scripts on EVERY new
    # document. A one-off execute_script() would only patch the document that
    # is loaded at that moment and be lost on the first navigation.
    stealth_script = '''
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined,
        });
        Object.defineProperty(navigator, 'plugins', {
            get: () => [1, 2, 3, 4, 5],
        });
        Object.defineProperty(navigator, 'languages', {
            get: () => ['en-US', 'en'],
        });
        const permissions = window.navigator.permissions;
        // Keep the receiver bound: calling the detached native method
        // directly throws "Illegal invocation".
        const originalQuery = permissions.query.bind(permissions);
        permissions.query = (parameters) => (
            parameters.name === 'notifications' ?
                Promise.resolve({ state: Notification.permission }) :
                originalQuery(parameters)
        );
    '''

    driver = None
    try:
        driver = webdriver.Chrome(options=options)
        driver.execute_cdp_cmd(
            'Page.addScriptToEvaluateOnNewDocument',
            {'source': stealth_script},
        )
        return driver
    except Exception as e:
        print(f"初始化浏览器失败: {e}")
        if driver is not None:
            # Do not leave a Chrome process behind when setup fails half-way.
            try:
                driver.quit()
            except Exception as quit_error:
                print(f"关闭浏览器失败: {quit_error}")
        raise

def print_page_info(driver, step_name):
    """Print a short diagnostic summary of the current page.

    Args:
        driver: Object exposing ``title``, ``current_url`` and ``page_source``
            (a Selenium WebDriver in this script).
        step_name: Label shown in the section header.

    Returns:
        The page source converted to lower case.
    """
    print(f"\n=== {step_name} ===")
    print(f"页面标题: {driver.title}")
    print(f"页面URL: {driver.current_url}")

    # Snapshot the source once so the printed length and the feature checks
    # below describe the same document.
    page_source = driver.page_source
    print(f"页面源代码长度: {len(page_source)} 字符")

    page_text = page_source.lower()

    # Substring probes against the lower-cased source.
    checks = {
        "包含TikTok": "tiktok" in page_text,
        "包含用户名元素": 'data-e2e="challenge-item-username"' in page_text,
        "包含登录": "login" in page_text,
        "包含验证码": "captcha" in page_text,
    }

    for check_name, result in checks.items():
        status = "✓" if result else "✗"
        print(f"{status} {check_name}")

    return page_text

def get_total_videos_count(driver):
    """Return the post count displayed on the tag page, or 0 if it is unknown.

    Several candidate CSS selectors are tried first; if none yields a
    matching text, the raw page source is searched. Only text of the form
    ``<digits with optional commas> 个作品`` is recognised.

    Args:
        driver: Object exposing ``find_element`` and ``page_source``
            (a Selenium WebDriver in this script).

    Returns:
        The parsed count as an ``int``; 0 when nothing matched or an error
        occurred (the error is printed).
    """
    count_pattern = re.compile(r'(\d+[\d,]*)\s*个作品')
    selectors = (
        'h2[data-e2e="challenge-vvcount"]',
        'h2[data-e2e*="vvcount"]',
        'h2[class*="ShareSubTitle"]',
        'div[data-e2e*="vvcount"]',
        'span[data-e2e*="vvcount"]',
    )

    try:
        for selector in selectors:
            try:
                text = driver.find_element(By.CSS_SELECTOR, selector).text.strip()
            except Exception:
                # This selector did not produce an element; try the next one.
                # (``Exception`` rather than a bare ``except`` so Ctrl-C and
                # interpreter shutdown are not swallowed.)
                continue

            match = count_pattern.search(text)
            if match:
                return int(match.group(1).replace(',', ''))
            print(f"找到作品总数元素，但格式不匹配: {text}")

        # Fall back to scanning the raw page source.
        match = count_pattern.search(driver.page_source)
        if match:
            return int(match.group(1).replace(',', ''))

        print("未找到作品总数元素")
        return 0
    except Exception as e:
        print(f"获取作品总数失败: {e}")
        return 0

def scrape_tiktok_usernames(url, max_items=50, scroll_pause=2, headless=True,
                            max_scroll_attempts=15, max_stale_scrolls=5):
    """Collect unique usernames from a TikTok tag page by scrolling it.

    The page is opened in a fresh browser, the function waits for the item
    list (falling back to video elements, then to a few blind scrolls), and
    then repeatedly scrolls to the bottom and re-parses the page source.

    Args:
        url: Tag page URL to open.
        max_items: Stop once this many unique usernames were collected.
        scroll_pause: Base pause in seconds after each scroll; a random
            1-3 seconds is added on top.
        headless: Passed through to ``init_driver``.
        max_scroll_attempts: Upper bound on the number of scrolls.
        max_stale_scrolls: Stop after this many consecutive scrolls that
            produced no new username.

    Returns:
        Usernames in the order they were first seen. An empty list is
        returned when the browser ends up on an error/login URL, when the
        page contains no username element, or when any exception occurs
        (the traceback is printed).
    """
    driver = None
    try:
        driver = init_driver(headless)

        driver.set_page_load_timeout(60)
        # NOTE(review): Selenium's documentation advises against combining an
        # implicit wait with the explicit WebDriverWait calls below. It is
        # kept because it also governs the find_element lookups performed by
        # get_total_videos_count.
        driver.implicitly_wait(15)

        print(f"正在访问页面: {url}")
        driver.get(url)

        print("等待页面加载...")
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        print_page_info(driver, "初始页面")

        # NOTE(review): this is a plain substring test on the whole URL, so a
        # tag whose name contains "error" or "login" is also rejected.
        current_url = driver.current_url.lower()
        if "error" in current_url or "login" in current_url:
            print("检测到可能的错误页面或登录页面")
            return []

        total_videos = get_total_videos_count(driver)
        print(f"标签下共有 {total_videos} 个作品")

        print("等待内容加载...")

        try:
            # Strategy 1: wait for the item list container.
            print("尝试等待特定元素...")
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-e2e="challenge-item-list"]'))
            )
            print("找到目标项目列表元素")
        except TimeoutException:
            print("未找到目标项目列表元素，尝试其他策略")

            try:
                # Strategy 2: wait for any video element.
                print("尝试等待视频元素...")
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-e2e*="video"]'))
                )
                print("找到视频元素")
            except TimeoutException:
                print("未找到视频元素，尝试滚动加载")

                # Strategy 3: scroll a few times to trigger lazy loading.
                for i in range(3):
                    print(f"滚动页面 {i+1}/3")
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(2)

        page_text = print_page_info(driver, "等待后页面")

        if 'data-e2e="challenge-item-username"' not in page_text:
            print("页面中未找到目标用户名元素")
            return []

        collected_usernames = []
        seen = set()  # O(1) duplicate check; the list keeps first-seen order
        scroll_attempts = 0
        stale_scrolls = 0  # consecutive scrolls that added nothing

        print("开始滚动采集用户名...")

        while len(collected_usernames) < max_items and scroll_attempts < max_scroll_attempts:
            print(f"\n=== 第 {scroll_attempts + 1} 次滚动 ===")

            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            scroll_attempts += 1

            # Randomised pause so newly requested items have time to render.
            wait_time = scroll_pause + random.uniform(1.0, 3.0)
            print(f"等待 {wait_time:.1f} 秒让内容加载...")
            time.sleep(wait_time)

            print_page_info(driver, f"滚动后页面 {scroll_attempts}")

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            usernames = extract_usernames(soup)
            print(f"本次提取到 {len(usernames)} 个用户名")

            added = 0
            for username in usernames:
                if len(collected_usernames) >= max_items:
                    break
                if username not in seen:
                    seen.add(username)
                    collected_usernames.append(username)
                    added += 1

            print(f"新增 {added} 个用户名，总计 {len(collected_usernames)} 个")

            if total_videos > 0:
                progress = min(len(collected_usernames) / total_videos * 100, 100)
                print(f"处理进度: {len(collected_usernames)}/{total_videos} ({progress:.1f}%)")

            # Give up only after several scrolls IN A ROW brought nothing new;
            # a single empty scroll can simply mean the content was slow.
            if added:
                stale_scrolls = 0
            else:
                stale_scrolls += 1
                if stale_scrolls >= max_stale_scrolls:
                    print("连续多次滚动没有新用户名，停止采集")
                    break

        print(f"采集完成！共获取 {len(collected_usernames)} 个用户名")
        return collected_usernames

    except Exception as e:
        print(f"采集过程中出现错误: {str(e)}")
        import traceback
        traceback.print_exc()
        return []

    finally:
        if driver:
            driver.quit()
            print("浏览器已关闭")

def extract_usernames(soup):
    """Extract unique usernames from a parsed tag page.

    Lookups are attempted in order, each one only if the previous lookup
    found no elements: the exact ``p[data-e2e="challenge-item-username"]``
    CSS selector, the equivalent ``find_all`` query, and finally a list of
    looser selectors (all of which are tried).

    Args:
        soup: Object exposing ``select`` and ``find_all`` (a BeautifulSoup
            document in this script).

    Returns:
        Non-empty element texts, whitespace-stripped, de-duplicated, in the
        order they were first encountered.
    """
    usernames = []
    seen = set()  # O(1) duplicate check; ``usernames`` keeps the order

    def collect(elements):
        """Append each element's non-empty, not-yet-seen text to ``usernames``."""
        for element in elements:
            try:
                username = element.get_text(strip=True)
                if username and username not in seen:
                    seen.add(username)
                    usernames.append(username)
                    print(f"提取到用户名: {username}")
            except Exception as e:
                print(f"提取用户名失败: {e}")

    print("开始提取用户名...")

    # Method 1: exact CSS selector.
    username_elements = soup.select('p[data-e2e="challenge-item-username"]')
    print(f"使用CSS选择器找到 {len(username_elements)} 个用户名元素")
    collect(username_elements)

    # Method 2: the same query expressed through find_all.
    if len(username_elements) == 0:
        print("尝试使用find_all方法查找用户名元素...")
        username_elements = soup.find_all('p', attrs={'data-e2e': 'challenge-item-username'})
        print(f"使用find_all找到 {len(username_elements)} 个用户名元素")
        collect(username_elements)

    # Method 3: looser selectors, used only when nothing matched so far.
    if len(username_elements) == 0:
        print("尝试其他选择器查找用户名...")
        alternative_selectors = [
            'p[data-e2e*="username"]',
            'p[class*="user-name"]',
            'div[data-e2e*="username"]',
            'span[data-e2e*="username"]',
            'a[data-e2e*="username"]',
            'div[class*="user-name"]',
            'span[class*="user-name"]',
            'a[class*="user-name"]',
        ]

        for selector in alternative_selectors:
            try:
                elements = soup.select(selector)
                if elements:
                    print(f"使用选择器 '{selector}' 找到 {len(elements)} 个元素")
                    collect(elements)
            except Exception as e:
                print(f"使用选择器 {selector} 失败: {e}")

    print(f"最终提取到 {len(usernames)} 个用户名")
    return usernames

def save_usernames_to_csv(usernames, filename=None):
    """Write the usernames to a CSV file and return the file name used.

    The file has three columns (index starting at 1, username, profile URL)
    and is encoded as UTF-8 with a BOM.

    Args:
        usernames: Sequence of usernames to write.
        filename: Target path; when falsy, a timestamped
            ``tiktok_usernames_<YYYYmmdd_HHMMSS>.csv`` name is generated.

    Returns:
        The file name written to, or ``None`` when ``usernames`` is empty.
    """
    if not usernames:
        print("没有用户名数据可保存")
        return None

    if not filename:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f'tiktok_usernames_{stamp}.csv'

    # Column headers, in output order.
    columns = ['序号', '用户名', '主页地址']
    records = (
        dict(zip(columns, (position, name, f"https://www.tiktok.com/@{name}")))
        for position, name in enumerate(usernames, 1)
    )

    with open(filename, 'w', newline='', encoding='utf-8-sig') as out_file:
        table = csv.DictWriter(out_file, fieldnames=columns)
        table.writeheader()
        table.writerows(records)

    print(f"用户名数据已保存到CSV文件: {filename}")
    return filename

def display_usernames_preview(usernames, count=10):
    """Print the first ``count`` usernames with their profile URLs.

    Args:
        usernames: Sequence of collected usernames.
        count: Maximum number of entries to show.
    """
    if usernames:
        shown = min(count, len(usernames))
        print(f"\n=== 采集到的用户名预览（前{shown}个）===")

        preview = usernames[:count]
        for position, name in enumerate(preview, start=1):
            print(f"{position}. {name} (主页: https://www.tiktok.com/@{name})")

        print(f"\n总共采集到 {len(usernames)} 个用户名")
    else:
        print("没有用户名数据可显示")

if __name__ == "__main__":
    # Run configuration: the tag to scrape and the scraping limits.
    TagName = "xxx"
    TARGET_URL = f"https://www.tiktok.com/tag/{TagName}"
    MAX_ITEMS = 10
    SCROLL_PAUSE = 3
    HEADLESS_MODE = False  # keep the browser window visible while debugging

    print("开始采集 TikTok 用户名数据...")

    try:
        usernames = scrape_tiktok_usernames(
            url=TARGET_URL,
            max_items=MAX_ITEMS,
            scroll_pause=SCROLL_PAUSE,
            headless=HEADLESS_MODE,
        )

        if not usernames:
            print("未采集到任何用户名数据")
        else:
            # Persist the result to "<tag>.csv", then show a short preview.
            csv_file = save_usernames_to_csv(usernames, f"{TagName}.csv")
            display_usernames_preview(usernames)

            print(f"\n采集完成！共获取 {len(usernames)} 个用户名")
            print(f"数据已保存到: {csv_file}")

    except Exception as exc:
        print(f"程序执行出错: {exc}")
        import traceback
        traceback.print_exc()

代码2：去掉了滚动次数上限（max_scroll_attempts）

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
import random
import csv
from datetime import datetime
import re

# 依赖安装
# pip3 install selenium beautifulsoup4
# pip3 install webdriver-manager
# pip3 install requests lxml html5lib


def init_driver(headless=True):
    """Start a Chrome WebDriver configured to look less like an automated browser.

    Args:
        headless: When true, Chrome is launched with ``--headless``.

    Returns:
        The started ``webdriver.Chrome`` instance with the stealth script
        registered for every new document.

    Raises:
        Exception: Any error raised while starting Chrome or registering the
            stealth script. The error is printed and re-raised; a browser
            that had already been started is quit first.
    """
    options = webdriver.ChromeOptions()

    if headless:
        options.add_argument('--headless')

    chrome_flags = (
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-blink-features=AutomationControlled',
        '--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        '--window-size=1920,1080',
        # Additional switches intended to reduce automation fingerprints.
        '--disable-infobars',
        '--disable-extensions',
        '--disable-popup-blocking',
        '--disable-notifications',
        # NOTE(review): this switch turns off the same-origin policy for every
        # page the browser loads; confirm it is really needed for scraping.
        '--disable-web-security',
        '--disable-translate',
        '--disable-logging',
        '--disable-default-apps',
        '--disable-sync',
        '--disable-background-networking',
        '--disable-client-side-phishing-detection',
        '--disable-component-update',
        '--disable-hang-monitor',
        '--disable-prompt-on-repost',
        '--disable-renderer-backgrounding',
        '--disable-session-crashed-bubble',
        '--disable-web-resources',
        '--metrics-recording-only',
        '--no-first-run',
        '--safebrowsing-disable-auto-update',
        '--password-store=basic',
    )
    for flag in chrome_flags:
        options.add_argument(flag)

    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    # Registered through CDP so it runs before page scripts on EVERY new
    # document. A one-off execute_script() would only patch the document that
    # is loaded at that moment and be lost on the first navigation.
    stealth_script = '''
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined,
        });
        Object.defineProperty(navigator, 'plugins', {
            get: () => [1, 2, 3, 4, 5],
        });
        Object.defineProperty(navigator, 'languages', {
            get: () => ['en-US', 'en'],
        });
        const permissions = window.navigator.permissions;
        // Keep the receiver bound: calling the detached native method
        // directly throws "Illegal invocation".
        const originalQuery = permissions.query.bind(permissions);
        permissions.query = (parameters) => (
            parameters.name === 'notifications' ?
                Promise.resolve({ state: Notification.permission }) :
                originalQuery(parameters)
        );
    '''

    driver = None
    try:
        driver = webdriver.Chrome(options=options)
        driver.execute_cdp_cmd(
            'Page.addScriptToEvaluateOnNewDocument',
            {'source': stealth_script},
        )
        return driver
    except Exception as e:
        print(f"初始化浏览器失败: {e}")
        if driver is not None:
            # Do not leave a Chrome process behind when setup fails half-way.
            try:
                driver.quit()
            except Exception as quit_error:
                print(f"关闭浏览器失败: {quit_error}")
        raise

def print_page_info(driver, step_name):
    """Print a short diagnostic summary of the current page.

    Args:
        driver: Object exposing ``title``, ``current_url`` and ``page_source``
            (a Selenium WebDriver in this script).
        step_name: Label shown in the section header.

    Returns:
        The page source converted to lower case.
    """
    print(f"\n=== {step_name} ===")
    print(f"页面标题: {driver.title}")
    print(f"页面URL: {driver.current_url}")

    # Snapshot the source once so the printed length and the feature checks
    # below describe the same document.
    page_source = driver.page_source
    print(f"页面源代码长度: {len(page_source)} 字符")

    page_text = page_source.lower()

    # Substring probes against the lower-cased source.
    checks = {
        "包含TikTok": "tiktok" in page_text,
        "包含用户名元素": 'data-e2e="challenge-item-username"' in page_text,
        "包含登录": "login" in page_text,
        "包含验证码": "captcha" in page_text,
    }

    for check_name, result in checks.items():
        status = "✓" if result else "✗"
        print(f"{status} {check_name}")

    return page_text

def get_total_videos_count(driver):
    """Return the post count displayed on the tag page, or 0 if it is unknown.

    Several candidate CSS selectors are tried first; if none yields a
    matching text, the raw page source is searched. Only text of the form
    ``<digits with optional commas> 个作品`` is recognised.

    Args:
        driver: Object exposing ``find_element`` and ``page_source``
            (a Selenium WebDriver in this script).

    Returns:
        The parsed count as an ``int``; 0 when nothing matched or an error
        occurred (the error is printed).
    """
    count_pattern = re.compile(r'(\d+[\d,]*)\s*个作品')
    selectors = (
        'h2[data-e2e="challenge-vvcount"]',
        'h2[data-e2e*="vvcount"]',
        'h2[class*="ShareSubTitle"]',
        'div[data-e2e*="vvcount"]',
        'span[data-e2e*="vvcount"]',
    )

    try:
        for selector in selectors:
            try:
                text = driver.find_element(By.CSS_SELECTOR, selector).text.strip()
            except Exception:
                # This selector did not produce an element; try the next one.
                # (``Exception`` rather than a bare ``except`` so Ctrl-C and
                # interpreter shutdown are not swallowed.)
                continue

            match = count_pattern.search(text)
            if match:
                return int(match.group(1).replace(',', ''))
            print(f"找到作品总数元素，但格式不匹配: {text}")

        # Fall back to scanning the raw page source.
        match = count_pattern.search(driver.page_source)
        if match:
            return int(match.group(1).replace(',', ''))

        print("未找到作品总数元素")
        return 0
    except Exception as e:
        print(f"获取作品总数失败: {e}")
        return 0

def scrape_tiktok_usernames(url, max_items=50, scroll_pause=2, headless=True,
                            max_stale_scrolls=10):
    """Collect unique usernames from a TikTok tag page by scrolling it.

    The page is opened in a fresh browser, the function waits for the item
    list (falling back to video elements, then to a few blind scrolls), and
    then repeatedly scrolls to the bottom and re-parses the page source.
    This variant has no cap on the number of scrolls; it stops when enough
    usernames were collected or when scrolling stops producing new ones.

    Args:
        url: Tag page URL to open.
        max_items: Stop once this many unique usernames were collected.
        scroll_pause: Base pause in seconds after each scroll; a random
            1-3 seconds is added on top.
        headless: Passed through to ``init_driver``.
        max_stale_scrolls: Stop after this many consecutive scrolls that
            produced no new username.

    Returns:
        Usernames in the order they were first seen. An empty list is
        returned when the browser ends up on an error/login URL, when the
        page contains no username element, or when any exception occurs
        (the traceback is printed).
    """
    driver = None
    try:
        driver = init_driver(headless)

        driver.set_page_load_timeout(60)
        # NOTE(review): Selenium's documentation advises against combining an
        # implicit wait with the explicit WebDriverWait calls below. It is
        # kept because it also governs the find_element lookups performed by
        # get_total_videos_count.
        driver.implicitly_wait(15)

        print(f"正在访问页面: {url}")
        driver.get(url)

        print("等待页面加载...")
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        print_page_info(driver, "初始页面")

        # NOTE(review): this is a plain substring test on the whole URL, so a
        # tag whose name contains "error" or "login" is also rejected.
        current_url = driver.current_url.lower()
        if "error" in current_url or "login" in current_url:
            print("检测到可能的错误页面或登录页面")
            return []

        total_videos = get_total_videos_count(driver)
        print(f"标签下共有 {total_videos} 个作品")

        print("等待内容加载...")

        try:
            # Strategy 1: wait for the item list container.
            print("尝试等待特定元素...")
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-e2e="challenge-item-list"]'))
            )
            print("找到目标项目列表元素")
        except TimeoutException:
            print("未找到目标项目列表元素，尝试其他策略")

            try:
                # Strategy 2: wait for any video element.
                print("尝试等待视频元素...")
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-e2e*="video"]'))
                )
                print("找到视频元素")
            except TimeoutException:
                print("未找到视频元素，尝试滚动加载")

                # Strategy 3: scroll a few times to trigger lazy loading.
                for i in range(3):
                    print(f"滚动页面 {i+1}/3")
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(2)

        page_text = print_page_info(driver, "等待后页面")

        if 'data-e2e="challenge-item-username"' not in page_text:
            print("页面中未找到目标用户名元素")
            return []

        collected_usernames = []
        seen = set()  # O(1) duplicate check; the list keeps first-seen order
        scroll_attempts = 0
        stale_scrolls = 0  # consecutive scrolls that added nothing

        print("开始滚动采集用户名...")

        while len(collected_usernames) < max_items:
            print(f"\n=== 第 {scroll_attempts + 1} 次滚动 ===")

            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            scroll_attempts += 1

            # Randomised pause so newly requested items have time to render.
            wait_time = scroll_pause + random.uniform(1.0, 3.0)
            print(f"等待 {wait_time:.1f} 秒让内容加载...")
            time.sleep(wait_time)

            print_page_info(driver, f"滚动后页面 {scroll_attempts}")

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            usernames = extract_usernames(soup)
            print(f"本次提取到 {len(usernames)} 个用户名")

            added = 0
            for username in usernames:
                if len(collected_usernames) >= max_items:
                    break
                if username not in seen:
                    seen.add(username)
                    collected_usernames.append(username)
                    added += 1

            print(f"新增 {added} 个用户名，总计 {len(collected_usernames)} 个")

            if total_videos > 0:
                progress = min(len(collected_usernames) / total_videos * 100, 100)
                print(f"处理进度: {len(collected_usernames)}/{total_videos} ({progress:.1f}%)")

            # With no scroll cap this is the only exit besides reaching
            # max_items: give up after several scrolls IN A ROW brought
            # nothing new, since one empty scroll may just be slow loading.
            if added:
                stale_scrolls = 0
            else:
                stale_scrolls += 1
                if stale_scrolls >= max_stale_scrolls:
                    print("连续多次滚动没有新用户名，停止采集")
                    break

        print(f"采集完成！共获取 {len(collected_usernames)} 个用户名")
        return collected_usernames

    except Exception as e:
        print(f"采集过程中出现错误: {str(e)}")
        import traceback
        traceback.print_exc()
        return []

    finally:
        if driver:
            driver.quit()
            print("浏览器已关闭")

def extract_usernames(soup):
    """Extract unique usernames from a parsed tag page.

    Lookups are attempted in order, each one only if the previous lookup
    found no elements: the exact ``p[data-e2e="challenge-item-username"]``
    CSS selector, the equivalent ``find_all`` query, and finally a list of
    looser selectors (all of which are tried).

    Args:
        soup: Object exposing ``select`` and ``find_all`` (a BeautifulSoup
            document in this script).

    Returns:
        Non-empty element texts, whitespace-stripped, de-duplicated, in the
        order they were first encountered.
    """
    usernames = []
    seen = set()  # O(1) duplicate check; ``usernames`` keeps the order

    def collect(elements):
        """Append each element's non-empty, not-yet-seen text to ``usernames``."""
        for element in elements:
            try:
                username = element.get_text(strip=True)
                if username and username not in seen:
                    seen.add(username)
                    usernames.append(username)
                    print(f"提取到用户名: {username}")
            except Exception as e:
                print(f"提取用户名失败: {e}")

    print("开始提取用户名...")

    # Method 1: exact CSS selector.
    username_elements = soup.select('p[data-e2e="challenge-item-username"]')
    print(f"使用CSS选择器找到 {len(username_elements)} 个用户名元素")
    collect(username_elements)

    # Method 2: the same query expressed through find_all.
    if len(username_elements) == 0:
        print("尝试使用find_all方法查找用户名元素...")
        username_elements = soup.find_all('p', attrs={'data-e2e': 'challenge-item-username'})
        print(f"使用find_all找到 {len(username_elements)} 个用户名元素")
        collect(username_elements)

    # Method 3: looser selectors, used only when nothing matched so far.
    if len(username_elements) == 0:
        print("尝试其他选择器查找用户名...")
        alternative_selectors = [
            'p[data-e2e*="username"]',
            'p[class*="user-name"]',
            'div[data-e2e*="username"]',
            'span[data-e2e*="username"]',
            'a[data-e2e*="username"]',
            'div[class*="user-name"]',
            'span[class*="user-name"]',
            'a[class*="user-name"]',
        ]

        for selector in alternative_selectors:
            try:
                elements = soup.select(selector)
                if elements:
                    print(f"使用选择器 '{selector}' 找到 {len(elements)} 个元素")
                    collect(elements)
            except Exception as e:
                print(f"使用选择器 {selector} 失败: {e}")

    print(f"最终提取到 {len(usernames)} 个用户名")
    return usernames

def save_usernames_to_csv(usernames, filename=None):
    """Write the usernames to a CSV file and return the file name used.

    The file has three columns (index starting at 1, username, profile URL)
    and is encoded as UTF-8 with a BOM.

    Args:
        usernames: Sequence of usernames to write.
        filename: Target path; when falsy, a timestamped
            ``tiktok_usernames_<YYYYmmdd_HHMMSS>.csv`` name is generated.

    Returns:
        The file name written to, or ``None`` when ``usernames`` is empty.
    """
    if not usernames:
        print("没有用户名数据可保存")
        return None

    if not filename:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f'tiktok_usernames_{stamp}.csv'

    # Column headers, in output order.
    columns = ['序号', '用户名', '主页地址']
    records = (
        dict(zip(columns, (position, name, f"https://www.tiktok.com/@{name}")))
        for position, name in enumerate(usernames, 1)
    )

    with open(filename, 'w', newline='', encoding='utf-8-sig') as out_file:
        table = csv.DictWriter(out_file, fieldnames=columns)
        table.writeheader()
        table.writerows(records)

    print(f"用户名数据已保存到CSV文件: {filename}")
    return filename

def display_usernames_preview(usernames, count=10):
    """Print the first ``count`` usernames with their profile URLs.

    Args:
        usernames: Sequence of collected usernames.
        count: Maximum number of entries to show.
    """
    if usernames:
        shown = min(count, len(usernames))
        print(f"\n=== 采集到的用户名预览（前{shown}个）===")

        preview = usernames[:count]
        for position, name in enumerate(preview, start=1):
            print(f"{position}. {name} (主页: https://www.tiktok.com/@{name})")

        print(f"\n总共采集到 {len(usernames)} 个用户名")
    else:
        print("没有用户名数据可显示")

if __name__ == "__main__":
    # Run configuration: the tag to scrape and the scraping limits.
    TagName = "xxx"
    TARGET_URL = f"https://www.tiktok.com/tag/{TagName}"
    MAX_ITEMS = 1800
    SCROLL_PAUSE = 3
    HEADLESS_MODE = False  # keep the browser window visible while debugging

    print("开始采集 TikTok 用户名数据...")

    try:
        usernames = scrape_tiktok_usernames(
            url=TARGET_URL,
            max_items=MAX_ITEMS,
            scroll_pause=SCROLL_PAUSE,
            headless=HEADLESS_MODE,
        )

        if not usernames:
            print("未采集到任何用户名数据")
        else:
            # Persist the result to "<tag>.csv". The console preview is not
            # printed in this variant; call display_usernames_preview(usernames)
            # here to re-enable it.
            csv_file = save_usernames_to_csv(usernames, f"{TagName}.csv")

            print(f"\n采集完成！共获取 {len(usernames)} 个用户名")
            print(f"数据已保存到: {csv_file}")

    except Exception as exc:
        print(f"程序执行出错: {exc}")
        import traceback
        traceback.print_exc()