需要环境:Python 3.x、requests、pandas、lxml(re 为 Python 标准库,无需另行安装)
这里爬取了中关村在线(ZOL)上手机的以下信息:手机型号、手机图片、参考价格、京东价格、CPU型号、后置摄像头像素、前置摄像头像素、电池容量以及屏幕尺寸。
#!/usr/bin/python3
# -*- coding:utf-8 -*-
# Author:ChenYuan
import requests
from lxml import etree
import re
import pandas as pd
# Desktop-Chrome User-Agent string; `headers` is the dict handed to the
# requests.get calls in this script.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
headers = {'User-Agent': _USER_AGENT}
def get_params_url(p_url):
    """Return the URL of a phone's parameter page.

    Downloads the product detail page at ``p_url``, decodes it as GBK and
    reads the link held by the second entry of the ``_j_tag_nav`` navigation
    list.

    :param p_url: absolute URL of the product detail page.
    :return: ``'http://detail.zol.com.cn'`` joined with the href found.
    :raises IndexError: if the page contains no such navigation link.
    """
    # `headers` has to be passed by keyword: the second positional argument
    # of requests.get is `params` (the query string), so passing the dict
    # positionally never sends the User-Agent as an HTTP header.
    response = requests.get(p_url, headers=headers)
    p_html = etree.HTML(response.content.decode('gbk'))
    params = p_html.xpath('.//div[@id="_j_tag_nav"]/ul/li[2]/a/@href')
    return 'http://detail.zol.com.cn' + params[0]
def get_phone_price(p_url):
    """Scrape the reference price and the JD price from a product page.

    The two prices are looked up independently, so a page that lacks one of
    them still yields the other.

    :param p_url: absolute URL of the product detail page.
    :return: tuple ``(reference price, JD price)`` of strings; a price that
        is not present on the page is returned as ``'0'``.
    """
    # `headers` has to be passed by keyword: the second positional argument
    # of requests.get is `params` (the query string), not the HTTP headers.
    response = requests.get(p_url, headers=headers)
    p_html = etree.HTML(response.content.decode('gbk'))
    if p_html is None:
        # No parse tree to query (e.g. an empty document): keep the
        # original best-effort fallback instead of failing on `.xpath`.
        return '0', '0'

    normal_hits = p_html.xpath(
        './/div[@class="price price-normal"]/span/b[@class="price-type"]/text()')
    jd_hits = p_html.xpath(
        './/ul[@class="price_commerce price__b2c"]/li/div/a/span/span/text()')

    n_price = normal_hits[0] if normal_hits else '0'
    j_price = jd_hits[0] if jd_hits else '0'
    return n_price, j_price
def get_params(p_url):
    """Scrape the headline specifications from a phone's parameter page.

    Each value is read from one ``<li>`` of the ``info-list-fr`` list, with
    its Chinese label prefix removed.

    :param p_url: absolute URL of the parameter page.
    :return: tuple ``(cpu, rear_camera, front_camera, battery, screen)`` of
        strings; any entry that is missing from the page is ``'0'``.
    """
    # `headers` has to be passed by keyword: the second positional argument
    # of requests.get is `params` (the query string), not the HTTP headers.
    response = requests.get(p_url, headers=headers)
    html_ = etree.HTML(response.content.decode('gbk'))

    def spec_text(position):
        """Return the full text of the ``position``-th entry, or None."""
        if html_ is None:
            return None
        nodes = html_.xpath(
            './/div[@class="info-list-fr"]/ul/li[{}]/div'.format(position))
        if not nodes:
            return None
        return nodes[0].xpath('string(.)')

    def labelled_value(position, *noise):
        """Return entry text with every ``noise`` substring removed, or '0'."""
        text = spec_text(position)
        if text is None:
            return '0'
        for token in noise:
            text = text.replace(token, '')
        return text

    # CPU model: drop spaces and the label, then cut at the first of the
    # trailing markers before trimming whitespace.
    cpu = spec_text(1)
    if cpu is None:
        cpu = '0'
    else:
        cpu = cpu.replace(' ', '').replace('CPU:', '')
        for marker in ('游戏', '\n', '行业'):
            cpu = cpu.split(marker)[0]
        cpu = cpu.strip()

    rear_camera = labelled_value(2, '后置:')            # rear camera resolution
    front_camera = labelled_value(3, '前置:')           # front camera resolution
    battery = labelled_value(5, '电池:')                # battery capacity
    screen = labelled_value(6, '屏幕:', '需双手打字')   # screen size

    return cpu, rear_camera, front_camera, battery, screen
# Walk listing pages 1..30. For every product that has a detail link, fetch
# its prices and specifications and append one row to ZOL.csv.
for page_no in range(1, 31):
    url = 'http://detail.zol.com.cn/cell_phone_index/subcate57_0_list_1_0_1_2_0_{}.html'.format(page_no)
    response = requests.get(url, headers=headers)
    page_text = response.content.decode('gbk')
    html = etree.HTML(page_text)
    # Thumbnails are taken from the raw markup and paired with the product
    # entries purely by position.
    phone_pics = re.findall('<img width="220" height="165" .src="(.*?)" alt=', page_text)
    product_list = html.xpath('//ul[@id="J_PicMode"]/li')

    for product, picture in zip(product_list, phone_pics):
        detail_links = product.xpath('a/@href')
        if not detail_links:
            # Entry without a detail link: nothing to scrape.
            continue

        phone_url = 'http://detail.zol.com.cn' + detail_links[0]
        print(phone_url)
        phone_name = product.xpath('a/img/@alt')[0]  # phone model
        normal_price, jd_price = get_phone_price(phone_url)
        params_url = get_params_url(phone_url)
        cpu, rear_camera, front_camera, battery, screen = get_params(params_url)

        # One-row table: every column holds a single-element list.
        phone_dict = {
            '手机型号': [phone_name],
            '手机图片': [picture],
            '参考价格': [normal_price],
            '京东价格': [jd_price.replace('¥', '')],
            'CPU': [cpu],
            '后置摄像头像素': [rear_camera],
            '前置摄像头像素': [front_camera],
            '电池容量': [battery],
            '屏幕尺寸': [screen],
        }
        row = pd.DataFrame(phone_dict)
        # Append without header or index so repeated writes build one file.
        row.to_csv('ZOL.csv', sep=',', header=None, mode='a', index=None)
        print(phone_dict)
今天的文章《python爬取手机app的数据_手机上怎么做python爬虫》分享到此就结束了,感谢您的阅读。
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
如需转载请保留出处:https://bianchenghao.cn/85813.html