Preface
For learning and exchange purposes only:
1. Packet Capture Analysis
Through packet capture analysis, we can see that a friends request carries exactly the information we want:
Each response contains only 20 records, so flip to the next page and capture the next request:
Comparing the two requests, it is clear that page is the page number, uid is the user ID (the last segment of the user's profile URL), and type can simply be fixed to fans.
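To make the comparison concrete, here is a minimal sketch of the two captured requests' query parameters; xxxxx.com is the placeholder host used throughout this post, and the uid value is hypothetical:

    # Parameters observed in the first capture (page 1) and the second (page 2).
    # Only 'page' changes between them; uid is the last segment of the profile URL.
    base = 'https://xxxxx.com/ajax/friendships/friends'
    page_1 = {'relate': 'fans', 'page': 1, 'uid': '1234567890', 'type': 'fans', 'newFollowerCount': 0}
    page_2 = {**page_1, 'page': 2}  # next page: everything identical except 'page'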
2. Testing the Request
Open PyCharm and build the request params and headers. Note that the cookie must come from a logged-in session:
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36',
    'cookie': cookie
}


def getFriendsList(uid, uname):
    url = 'https://xxxxx.com/ajax/friendships/friends'
    f = open(out_path + '/' + uname + '.txt', 'a+', encoding='utf-8')
    for page in range(1, 101):
        print(f'{uid} {uname} page {page}')
        time.sleep(0.5)
        try:
            params = {
                'relate': 'fans',
                'page': page,
                'uid': uid,
                'type': 'fans',
                'newFollowerCount': 0,
            }
            res = requests.get(url, headers=headers, params=params)
            if res.status_code == 414:
                return '414'
            json_data = res.json()
            users = json_data.get('users')
            if users:
                for user in users:
                    user_id = user.get('idstr')                        # user ID
                    user_name = user.get('name')                       # username
                    followers_count = user.get('followers_count_str')  # follower (fan) count
                    friends_count = user.get('friends_count')          # following count
                    description = user.get('description')              # bio
                    location = user.get('location')                    # IP location
                    statuses_count = user.get('statuses_count')        # total post count
                    profile_url = 'https://xxxxx.com/' + user.get('profile_url')  # profile URL
                    # print(user_name, followers_count, friends_count, description, location, statuses_count, profile_url)
                    f.write(f'{user_name},{followers_count},{friends_count},{description},{location},{statuses_count},{profile_url}\n')
        except Exception as e:
            print(e)
            return None
    f.close()
    return 'ok'
The test data looks fine, but each user's fan page only exposes at most 100 pages, 20 records per page, i.e. 2,000 records per user. So how do we collect 1,000,000 records? Simple: prepare 500 such user profiles and collect 2,000 from each, which adds up to 1,000,000:
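A quick sanity check on the arithmetic behind that plan:

    from math import ceil

    pages_per_user = 100                             # fan-list pages visible per user
    records_per_page = 20                            # records returned per page
    per_user = pages_per_user * records_per_page     # 2,000 records per user
    users_needed = ceil(1_000_000 / per_user)        # 500 profiles to prepare
    print(per_user, users_needed)                    # 2000 500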
3. Reading the Spreadsheet
With the input file prepared, we first need to read it into a list:
def getExcelData(input_file):
    # Read the spreadsheet into a list of row-value lists
    wb = openpyxl.load_workbook(input_file)
    sh_names = wb.sheetnames
    s1 = wb[sh_names[0]]
    try:
        data_list = []
        for index, item in enumerate(list(s1.rows)[1:]):  # read each data row (skip the header row)
            values = []
            for val in item:
                values.append(val.value)
            data_list.append(values)  # append this row's values to the list
        return data_list
    except:
        return None
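The function returns each row as a plain list of cell values. The crawl worker later in the post only relies on item[0] being the username and item[2] being the profile URL, so a hypothetical input_data.xlsx laid out that way (the column order is an assumption, not stated in the original) would be consumed like this:

    # Hypothetical column layout: A = username, B = note, C = profile URL.
    data_list = getExcelData('input_data.xlsx')
    for item in data_list:
        user_name = item[0]                   # column A
        uid = str(item[2]).split('/')[-1]     # last segment of the profile URL in column C
        print(user_name, uid)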
4. Following Users
Testing shows that some users' fan lists are not visible until you follow them, so we need to handle that:
Take this account as an example: before following, the fan list is not visible:
Then follow the user. In the capture you can see a create request being sent as a POST. So we add a check: if the fan-list response comes back empty, call this follow function and then run the fan-list fetch again:
def createPost(uid):
    url = 'https://xxxxx.com/ajax/friendships/create'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36',
        'cookie': cookie,
        'x-xsrf-token': 'mBqiNo6lC0vAzPfp-uUX4Z3q',
        'XMLHttpRequest': 'XMLHttpRequest',
        'origin': 'https://xxxxx.com',
        'content-type': 'application/json;charset=UTF-8',
    }
    data = {
        "friend_uid": str(uid),
        "lpage": "profileRecom",
        "page": "profile"
    }
    try:
        res = requests.post(url, headers=headers, data=json.dumps(data)).json()
        ok = res.get('ok')
        if ok == 1:
            print('Followed successfully')
        return
    except Exception as e:
        return
5. Multithreaded Crawling
At this point most of the code is done. The data volume is large, so we can't run it single-threaded; we need to make it multithreaded:
def matching(i, data_list, st, ed, ans_file):
    for item in data_list[st:ed]:
        time.sleep(1)
        user_name = item[0]
        uid = str(item[2]).split('/')[-1]
        result = getFriendsList(uid, user_name)
        if result == '414':
            time.sleep(120)
            print('Rate limited, resting for a while...')
        if result is None:
            time.sleep(1)
            createPost(uid)
            time.sleep(1)
            getFriendsList(uid, user_name)


st = 0
diff = ceil(len(data_list) / THREAD_NUM)
thread_pool = []
for i in range(THREAD_NUM):
    ans_file = os.path.join(out_path, str(i) + ".txt")
    thread = Thread(target=matching,
                    args=(i, data_list, diff * i + st, min(diff * (i + 1) + st, len(data_list)), ans_file))
    thread_pool.append(thread)
for i in range(THREAD_NUM):
    thread_pool[i].start()
for i in range(THREAD_NUM):
    thread_pool[i].join()
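The same chunked dispatch can be written more compactly with the standard-library ThreadPoolExecutor. This is only an alternative sketch of the splitting logic above (matching, out_path and THREAD_NUM are the names from the script), not part of the original code:

    import os
    from math import ceil
    from concurrent.futures import ThreadPoolExecutor

    def run_workers(data_list, thread_num=THREAD_NUM):
        diff = ceil(len(data_list) / thread_num)      # same chunk size as the manual Thread loop
        with ThreadPoolExecutor(max_workers=thread_num) as pool:
            for i in range(thread_num):
                st, ed = diff * i, min(diff * (i + 1), len(data_list))
                ans_file = os.path.join(out_path, f'{i}.txt')
                pool.submit(matching, i, data_list, st, ed, ans_file)
        # leaving the with-block waits for every worker, like the join() calls above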
Finally, write a function that merges the per-thread output files into a single file. Here is the complete code:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Time   : 2022/6/17 14:31
# Author : JACK
# VX     : lingjie2014
import json
import openpyxl
import requests
import csv
import os
from math import ceil
from threading import Thread
import time

THREAD_NUM = 8


def mkdir(path):
    path = path.strip()
    isExists = os.path.exists(path)
    if not isExists:
        os.makedirs(path)


def getExcelData(input_file):
    # Read the spreadsheet into a list of row-value lists
    wb = openpyxl.load_workbook(input_file)
    sh_names = wb.sheetnames
    s1 = wb[sh_names[0]]
    try:
        data_list = []
        for index, item in enumerate(list(s1.rows)[1:]):  # read each data row (skip the header row)
            values = []
            for val in item:
                values.append(val.value)
            data_list.append(values)  # append this row's values to the list
        return data_list
    except:
        return None


def createPost(uid):
    url = 'https://xxxxx.com/ajax/friendships/create'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36',
        'cookie': cookie,
        'x-xsrf-token': 'mBqiNo6lC0vAzPfp-uUX4Z3q',
        'XMLHttpRequest': 'XMLHttpRequest',
        'origin': 'https://xxxxx.com',
        'content-type': 'application/json;charset=UTF-8',
    }
    data = {
        "friend_uid": str(uid),
        "lpage": "profileRecom",
        "page": "profile"
    }
    try:
        res = requests.post(url, headers=headers, data=json.dumps(data)).json()
        ok = res.get('ok')
        if ok == 1:
            print('Followed successfully')
        return
    except Exception as e:
        return


def getFriendsList(uid, uname):
    url = 'https://xxxxx.com/ajax/friendships/friends'
    f = open(out_path + '/' + uname + '.txt', 'a+', encoding='utf-8')
    for page in range(1, 101):
        print(f'{uid} {uname} page {page}')
        time.sleep(0.5)
        try:
            params = {
                'relate': 'fans',
                'page': page,
                'uid': uid,
                'type': 'fans',
                'newFollowerCount': 0,
            }
            res = requests.get(url, headers=headers, params=params)
            if res.status_code == 414:
                return '414'
            json_data = res.json()
            users = json_data.get('users')
            if users:
                for user in users:
                    user_id = user.get('idstr')                        # user ID
                    user_name = user.get('name')                       # username
                    followers_count = user.get('followers_count_str')  # follower (fan) count
                    friends_count = user.get('friends_count')          # following count
                    description = user.get('description')              # bio
                    location = user.get('location')                    # IP location
                    statuses_count = user.get('statuses_count')        # total post count
                    profile_url = 'https://xxxxx.com/' + user.get('profile_url')  # profile URL
                    print(user_name, followers_count, friends_count, description, location, statuses_count, profile_url)
                    f.write(f'{user_name},{followers_count},{friends_count},{description},{location},{statuses_count},{profile_url}\n')
        except Exception as e:
            print(e)
            return None
    f.close()
    return 'ok'


def matching(i, data_list, st, ed, ans_file):
    for item in data_list[st:ed]:
        time.sleep(1)
        user_name = item[0]
        uid = str(item[2]).split('/')[-1]
        result = getFriendsList(uid, user_name)
        if result == '414':
            time.sleep(120)
            print('Rate limited, resting for a while...')
        if result is None:
            time.sleep(1)
            createPost(uid)
            time.sleep(1)
            getFriendsList(uid, user_name)


def merge_result_files(out_path):
    """
    Merge the output files written by the worker threads
    :return:
    """
    print("Merging output files")
    file_list = os.listdir(out_path)
    ans = open('result.txt', 'a+', encoding='utf-8')
    cnt = 0
    with open('all_data.csv', 'w', newline='', encoding='utf-8-sig') as fp:
        w = csv.writer(fp)
        w.writerow(['Username', 'Fans', 'Following', 'Bio', 'IP location', 'Posts', 'Profile URL'])
        for i in file_list:
            temp = os.path.join(out_path, i)
            with open(temp, 'r', encoding='utf-8') as f:
                for line in f.readlines():
                    cnt += 1
                    ans.write(line)
                    w.writerow(line.strip().split(','))
    ans.close()
    print("Merged the per-thread output files")
    if cnt == 0:
        return 0
    return cnt


if __name__ == "__main__":
    cookie = ''
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36',
        'cookie': cookie
    }
    data_list = getExcelData('input_data.xlsx')
    out_path = 'out_data'
    mkdir(out_path)
    st = 0
    diff = ceil(len(data_list) / THREAD_NUM)
    thread_pool = []
    for i in range(THREAD_NUM):
        ans_file = os.path.join(out_path, str(i) + ".txt")
        thread = Thread(target=matching,
                        args=(i, data_list, diff * i + st, min(diff * (i + 1) + st, len(data_list)), ans_file))
        thread_pool.append(thread)
    for i in range(THREAD_NUM):
        thread_pool[i].start()
    for i in range(THREAD_NUM):
        thread_pool[i].join()
    cnt = merge_result_files(out_path)  # merge the saved files
    print(f'Total: {cnt}')
Summary
I ran it with 8 threads for about 3 hours and ended up with a bit over 980,000 records, since a small number of fetches failed. The platform rate-limits requests, so don't go too fast; otherwise you have to wait around 2 minutes before the fan-list pages become accessible again. As the code shows, when the response status code is 414 the script waits 120 seconds before continuing.
That's all. Feedback and discussion are welcome!
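Building on that 414 handling, a hypothetical helper (not in the original code) that re-fetches the same user after the pause, instead of moving on, could look like this:

    import time

    def getFriendsListWithRetry(uid, uname, max_retries=3):
        # Hypothetical helper: re-fetch the same user's fan list after a 414 rate-limit pause.
        for attempt in range(max_retries):
            result = getFriendsList(uid, uname)
            if result != '414':
                return result          # 'ok', None, or another outcome
            print('Rate limited, waiting 120 seconds before retrying...')
            time.sleep(120)
        return None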