基于 Python 爬取 TikTok 搜索数据 Tiktok爬虫（2025.3.17）

1. 前言

在数据分析和网络爬虫的应用场景中，我们经常需要获取社交媒体平台的数据，例如 TikTok。本篇文章介绍如何使用 Python 按关键词爬取 TikTok 的搜索结果（视频及其作者信息），并解析返回的数据、保存为 CSV 文件。

结果截图

2. 项目环境准备

在正式运行代码之前，我们需要安装相关的 Python 库：

pip install requests pandas PyExecJS loguru

此外，我们需要一个 JavaScript 运行环境（如 Node.js），用于执行加密签名代码。签名脚本 encrypt.js 需要放在运行脚本时的当前目录下（代码通过 ./encrypt.js 读取它）。

3. 代码解析

3.1 初始化爬虫类

我们创建 TiktokUserSearch 类，在初始化方法 __init__ 中设置请求头信息，并确定输出文件的路径（未指定时按当前时间自动生成文件名）。

class TiktokUserSearch:
    """Crawler for TikTok keyword search results that stores rows in a CSV file."""

    def __init__(self, output_file=None):
        """Set up request headers, the cookie slot and the CSV output path.

        Args:
            output_file: Path of the CSV file to write. When falsy, a name of
                the form ``tiktok_videos_<YYYYmmdd_HHMMSS>.csv`` is generated
                from the current local time.
        """
        if not output_file:
            # No explicit path given: derive a timestamped default file name.
            stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = f'tiktok_videos_{stamp}.csv'
        self.output_file = output_file

        # Filled in by get() from the caller-supplied cookie string.
        self.cookies = None

        # Base request headers shared by every request.
        base_headers = {}
        base_headers["user-agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ..."
        base_headers["referer"] = "https://www.tiktok.com/"
        self.headers = base_headers

3.2 处理 Cookie

我们需要将 TikTok 的 cookie 从字符串转换成字典格式，以便后续请求使用。

    def cookie_str_to_dict(self, cookie_str) -> dict:
        """Convert a ``Cookie`` header string into a ``{name: value}`` dict.

        Pairs are separated by ``;`` (with or without surrounding spaces).
        Only the first ``=`` in a pair separates name from value, so values
        that themselves contain ``=`` are kept intact. Fragments without any
        ``=`` are skipped instead of raising. When a name occurs more than
        once, the last occurrence wins.

        Args:
            cookie_str: Raw cookie string, e.g. ``"a=1; b=2"``. ``None`` or an
                empty string yields an empty dict.

        Returns:
            dict mapping cookie names to their values.
        """
        cookie_dict = {}
        if not cookie_str:
            return cookie_dict
        for fragment in cookie_str.split(';'):
            name, sep, value = fragment.strip().partition('=')
            if not sep:
                # Empty fragment (trailing ';') or a flag without a value.
                continue
            cookie_dict[name.strip()] = value.strip()
        return cookie_dict

3.3 发送请求

TikTok 需要使用 X-Bogus 进行签名，我们需要执行 JavaScript 代码来获取该参数。

为了应对网络不稳定，请求出现网络错误时最多尝试三次，每次失败后等待 2 秒再重试。

可根据自己的需求，通过 requests.get 的 proxies 参数设置代理（示例代码中该参数为 None，即未显式指定代理）。

    def get(self, keyword, cursor, search_id, cookie_str):
        self.cookies = self.cookie_str_to_dict(cookie_str)
        url = "https://www.tiktok.com/api/search/general/full/"
        if cursor == "0":
            focus_state = "true"
        else:
            focus_state = "false"
        params = {
            "WebIdLastTime": f"{int(time.time())}",
            "aid": "1988",
            "app_language": "zh-Hans",
            "app_name": "tiktok_web",
            "browser_language": "zh-CN",
            "browser_name": "Mozilla",
            "browser_online": "true",
            "browser_platform": "Win32",
            "browser_version": self.headers['user-agent'].replace('Mozilla/', ''),
            "channel": "tiktok_web",
            "cookie_enabled": "true",
            "cursor": cursor,
            "device_id": "7339506347602019870",
            "device_platform": "web_pc",
            "focus_state": focus_state,
            "from_page": "search",
            "history_len": "7",
            "is_fullscreen": "false",
            "is_page_visible": "true",
            "keyword": keyword,
            "os": "windows",
            "priority_region": "",
            "referer": "",
            "region": "KR",
            "screen_height": "1080",
            "screen_width": "1920",
            "tz_name": "Asia/Shanghai",
            "web_search_code": "{\"tiktok\":{\"client_params_x\":{\"search_engine\":{\"ies_mt_user_live_video_card_use_libra\":1,\"mt_search_general_user_live_card\":1}},\"search_server\":{}}}",
            "webcast_language": "zh-Hans",
            "msToken": self.cookies["msToken"],
        }
        if cursor != "0":
            params.update({"search_id": search_id})
        x_b = execjs.compile(open('./encrypt.js', encoding='utf-8').read()).call("sign", urlencode(params), self.headers["user-agent"])
        params.update({"X-Bogus": x_b})

        headers = self.headers.copy()
        headers.update({"referer": "https://www.tiktok.com/search?q=" + keyword})

        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    cookies=self.cookies,
                    params=params,
                    timeout=(3, 10),
                    proxies=None
                )
                return response.json()
            except (ex1, ex2, ex3) as e:
                print(f"尝试 {attempt + 1}/{max_retries} 发生网络错误：{e}")
                if attempt < max_retries - 1:
                    time.sleep(2)
                else:
                    return {"error": f"Network error after {max_retries} attempts: {str(e)}"}
            except Exception as e:
                print(f"发生其他错误：{e}")
                return {"error": str(e)}

3.4 解析数据并存储

解析 TikTok 返回的视频数据，并保存到 CSV 文件。

    def parse_data(self, data_list):
        resultList = []
        video_data = []
        
        for u in data_list:
            try:
                item = u['item']
                author = item['author']
                stats = item['stats']
                author_stats = item['authorStats']  # 添加作者统计信息
                
                # 提取需要的数据
                video_info = {
                    'video_id': item['id'],
                    'desc': item['desc'],
                    'create_time': datetime.fromtimestamp(item['createTime']).strftime('%Y-%m-%d %H:%M:%S'),
                    'duration': item['video']['duration'],
                    # 作者基本信息
                    'author_id': author['id'],
                    'author_name': author['uniqueId'],
                    'author_nickname': author['nickname'],
                    'author_signature': author['signature'],
                    'author_verified': author['verified'],
                    # 作者统计信息
                    'author_following_count': author_stats['followingCount'],  # 关注数
                    'author_follower_count': author_stats['followerCount'],    # 粉丝数
                    'author_heart_count': author_stats['heartCount'],          # 获赞总数
                    'author_video_count': author_stats['videoCount'],          # 视频总数
                    'author_digg_count': author_stats['diggCount'],           # 点赞数
                    # 视频统计信息
                    'digg_count': stats['diggCount'],
                    'share_count': stats['shareCount'],
                    'comment_count': stats['commentCount'],
                    'play_count': stats['playCount'],
                    'collect_count': stats.get('collectCount', 0),
                    'video_url': item['video']['playAddr']
                }
                
                # 添加标签信息
                if 'challenges' in item:
                    video_info['hashtags'] = ','.join([tag['title'] for tag in item['challenges']])
                else:
                    video_info['hashtags'] = ''
                
                # 添加音乐信息
                if 'music' in item:
                    music = item['music']
                    video_info.update({
                        'music_id': music['id'],
                        'music_title': music['title'],
                        'music_author': music['authorName'],
                        'music_original': music['original']
                    })
                
                video_data.append(video_info)
                resultList.append(f"https://www.tiktok.com/@{author['uniqueId']}")
            except Exception as e:
                logger.error(f"解析视频数据时出错: {str(e)}")
                continue
        
        # 将数据保存到CSV文件
        try:
            df = pd.DataFrame(video_data)
            
            # 检查文件是否存在
            file_exists = os.path.exists(self.output_file)
            
            # 如果文件不存在，创建新文件并写入表头
            # 如果文件存在，追加数据不写入表头
            df.to_csv(self.output_file, 
                     mode='a', 
                     header=not file_exists,
                     index=False, 
                     encoding='utf-8-sig')
            
            logger.info(f"数据已{'追加' if file_exists else '保存'}到文件: {self.output_file}")
        except Exception as e:
            logger.error(f"保存CSV文件时出错: {str(e)}")

        return resultList

3.5 运行爬虫

我们定义 main 方法，负责调用 get 方法获取数据并解析。

    def main(self, keyword, cookie_str, cursor="0", search_id=None):
        dataJson = self.get(keyword, cursor, search_id, cookie_str)
        if dataJson:
            if "error" in dataJson:
                return {"cursor": cursor, "search_id": search_id, "data": [], "status": "-2", "error": dataJson["error"]}
            elif "verify_event" in str(dataJson):
                return {"cursor": cursor, "search_id": search_id, "data": [], "status": "-1"}
            else:
                # 解析数据并保存到CSV
                if 'data' in dataJson:
                    self.parse_data(dataJson['data'])
                return dataJson

3.6 运行入口

最后，我们编写 if __name__ == '__main__' 逻辑，定义要爬取的关键词，并进行循环爬取。

if __name__ == '__main__':
    # Crawl every topic page by page; one CSV per topic under results1/.
    os.makedirs('results1', exist_ok=True)
    topics = [
        "Chen Duxiu",
        "Li Dazhao",
    ]

    # NOTE(security): this is a logged-in session cookie. Prefer supplying
    # your own through the TIKTOK_COOKIE environment variable; the embedded
    # value is only the fallback sample. The literal is written as adjacent
    # string pieces so that it is a single valid Python expression.
    cookie_str = os.environ.get("TIKTOK_COOKIE") or (
        '_ttp=2ZzUB37CLclhWsrgyW56Erox1XM; tiktok_webapp_theme_auto_dark_ab=1; delay_guest_mode_vid=5; '
        'passport_csrf_token=d8e4d28ec7abdf12a7829d524dca64de; '
        'passport_csrf_token_default=d8e4d28ec7abdf12a7829d524dca64de; '
        'tt_chain_token=SSmpjX/0in/IP8BYwawD+Q==; '
        'multi_sids=7361707798058615814%3A53b730c284c4eaaa9bb2157eef01d70d; '
        'cmpl_token=AgQQAPNoF-RO0rYU5JqLsx0__dmghl8Nv5IhYNkWMA; '
        'passport_auth_status=a8a7a1e1c4b96a994a45acb38dc83509%2C; '
        'passport_auth_status_ss=a8a7a1e1c4b96a994a45acb38dc83509%2C; '
        'uid_tt=7857882a3366539dc1d9ca226b3fdc91f76b1b072c7da11dd4120368d88bf861; '
        'uid_tt_ss=7857882a3366539dc1d9ca226b3fdc91f76b1b072c7da11dd4120368d88bf861; '
        'sid_tt=53b730c284c4eaaa9bb2157eef01d70d; '
        'sessionid=53b730c284c4eaaa9bb2157eef01d70d; '
        'sessionid_ss=53b730c284c4eaaa9bb2157eef01d70d; '
        'store-idc=maliva; store-country-code=ca; store-country-code-src=uid; tt-target-idc=useast1a; '
        'tt-target-idc-sign=t3pz21FprSb2qc1ucJWFQbxzCKwgoBX9PKUWEbPHh7_4mpPThOuO0EN9pm2ORzFqk0bLFt6MtI9-gofvcVtQFoGSTOI_JvUWIAAUSHz1mM1A9jP1kRk_qucQnxEMOLvir3s4ffm0hJSh62RyKNO5LBTlT-fsqbi2tQVUwrgIGF-2HFT04S52ciyRnKAXr_0NyD3Aa0lM4J4hUGplo46wKRfId1DwwajXudUfjqJ3rvAuA8qURTsSHCKuDjLbcdfhcC0WKqemrmHFBJ11hGFJxiL4VEOClIoJGrF1_S9jvlx0H0Nph9BHlHNA-wzwi3NF6hPK17WL3TSvsqfEiKclZ5ScpHMv7ATYfOK4BVOzKXrq6fCxzNBT5kCNc4-ImuvjBNqpY8yL2s2KusWxslveOyIq3gwU3Dhxl084w5Tsp13xzuFOGNVHK5ZPeS5ERmykYFB6uTIHty9W_Z6pwN1tT9yQ-34qyZRZB7WONZn_NAFsywU6Hj4wcHLQkJ-tIiAO; '
        'last_login_method=google; tiktok_webapp_theme_source=auto; tiktok_webapp_theme=dark; '
        'sid_guard=53b730c284c4eaaa9bb2157eef01d70d%7C1740207607%7C15551996%7CThu%2C+21-Aug-2025+07%3A00%3A03+GMT; '
        'sid_ucp_v1=1.0.0-KDc2NDMxMTQzMTkwNTY2NTJiOWZhMmZhM2ZlMDg3ZDE0YzNiOGU5NTUKGQiGiIec0MeClWYQ9-vlvQYYsws4CEASSAQQAxoGbWFsaXZhIiA1M2I3MzBjMjg0YzRlYWFhOWJiMjE1N2VlZjAxZDcwZA; '
        'ssid_ucp_v1=1.0.0-KDc2NDMxMTQzMTkwNTY2NTJiOWZhMmZhM2ZlMDg3ZDE0YzNiOGU5NTUKGQiGiIec0MeClWYQ9-vlvQYYsws4CEASSAQQAxoGbWFsaXZhIiA1M2I3MzBjMjg0YzRlYWFhOWJiMjE1N2VlZjAxZDcwZA; '
        'odin_tt=a7027d0b8a102be6dd20600ca35291f4aee8c003895d8913d5c2f8276f16d6b974345357a22ed0bdcb57930d326afded3e5bd3105e061197876c80a92bbdbbfc29402634b8e7439ba178a1c7ed9ca552; '
        'tt_csrf_token=qs6ncqIZ-SbUZVzUbkUZ2SViJyY7VzYRki0M; '
        'perf_feed_cache={%22expireTimestamp%22:1742302800000%2C%22itemIds%22:[%227481325874978000134%22%2C%227463463290065161505%22%2C%227466633420098112799%22]}; '
        'msToken=BnkIjkPpJEc1i9jiiwT_paC5FW-NL62UVF7-lzpHYki9WIA_KpLrplpY-qlZfuG7V12rbCDHiyQYNrZcOnTzZLk1cvnH3_E_89nfOpqpVquKbSR-Nqr6bGDmL220vjBHdutm4R-gfVnYIG7fvWOJUkZ7yg==; '
        'ttwid=1%7Cv5j4n07c_G3ZtA91KIuree-ptnDLwgTwFuM8BnZINnQ%7C1742131441%7Ce88a85fcc36fd7e79815fddb10d16ef553b5e2e4a51c65e2c40098ade19023e2; '
        'msToken=iYDKRqCM8rSqc_9ZDzQnWcQiv_iJqPk15-6Y-iFBUmk4uIzb61dM13b9fWHcg4hxGkl9L3n56glok05TllvGurkwpgBYEF8N76ZRIii7OvNEkrk004dagNuoqQVeV9Bzd0_9naXjFXtEiMRi330G5Jdakw==; '
        'passport_fe_beating_status=false'
    )

    for index, keyword in enumerate(topics):
        logger.info(f"开始爬取 {keyword} 的视频")
        output_file = f'results1/{keyword}_videos.csv'  # customise the file name here
        tiktok = TiktokUserSearch(output_file=output_file)
        has_more = 1
        cursor = '0'
        search_id = None
        while has_more:
            data = tiktok.main(keyword, cookie_str, cursor, search_id)
            logger.info(data)
            if not (data and isinstance(data, dict)):
                logger.error("Invalid response format")
                break
            if data.get('status') in ('-1', '-2'):
                # main() reports captcha ("-1") and request ("-2") failures
                # this way; say so instead of finishing silently.
                logger.error(
                    f"Request failed for {keyword!r} "
                    f"(status {data['status']}): {data.get('error', 'verification required')}"
                )
                break
            has_more = data.get('has_more', 0)
            logger.info(has_more)
            # Normalise to str: get() compares the cursor against "0".
            next_cursor = str(data.get('cursor', '0'))
            search_id = (data.get('log_pb') or {}).get('impr_id')
            if 'data' not in data:
                logger.error("No data found in response")
                break
            if has_more and next_cursor == cursor:
                # A cursor that does not advance would re-fetch the same page
                # forever and keep appending duplicates.
                logger.error(f"Cursor did not advance past {cursor}; stopping pagination")
                break
            cursor = next_cursor
            time.sleep(1)  # throttle between pages
        logger.info(f"爬取 {keyword} 的视频完成")
        if index < len(topics) - 1:
            # Announce the pause before sleeping, and skip it after the last topic.
            logger.info("等待30秒后继续爬取下一个主题")
            time.sleep(30)
    logger.info("所有主题的视频爬取完成")