Scraper target URL
http://www.cfcpn.com/plist/caigou?pageNo=1&kflag=0&keyword=&keywordType=&province=&city=&typeOne=&ptpTwo=,,
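The list URL is driven by two things the script below also relies on: the path segment (caigou for procurement announcements, jieguo for result announcements) and the pageNo query parameter; the other query parameters stay empty. A minimal sketch of building page URLs from that pattern (only the URL shown above is assumed; the helper name list_page_urls is mine):

# Minimal sketch: build list-page URLs for the two announcement types.
# The path segment and pageNo follow the URL shown above; the other
# query parameters are left empty exactly as in that URL.
BASE = 'http://www.cfcpn.com/plist/{type}?pageNo={page}&kflag=0&keyword=&keywordType=&province=&city=&typeOne=&ptpTwo='

def list_page_urls(announcement_type, first_page, last_page):
    # announcement_type is 'caigou' or 'jieguo'; the page range is inclusive.
    return [BASE.format(type=announcement_type, page=p)
            for p in range(first_page, last_page + 1)]

# Example: the first three pages of procurement announcements.
# print(list_page_urls('caigou', 1, 3))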
Environment
- Python 3.6.5
- third-party packages used by the script's imports: requests, lxml, xlrd, xlwt, xlutils
Scraper code
# -*- coding: utf-8 -*-
import os
import re
import time

import requests
import xlrd
import xlwt
from lxml import etree
from xlutils.copy import copy
def get_page(url):
    # Fetch a list page; return its HTML text, or None if the request fails.
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # response.encoding = 'utf-8'
            html = response.content.decode('utf-8', 'ignore').replace(u'\xa9', u'')
            # print(html)
            return html
    except requests.ConnectionError:
        return None
def parse_page(html):
    # Collect the detail-page links (href="…/<id>") from the list page
    # and convert them to absolute URLs.
    pattern1 = r'<.*?(href=".*?/\d+").*?'
    href_url = re.findall(pattern1, html, re.I)
    url_list = []
    for url in href_url:
        url1 = url.replace('href=', 'http://www.cfcpn.com').replace('"', '')
        url_list.append(url1)
    return url_list
def get_detail_page(url):
    # Fetch one announcement page and return the parsed lxml tree,
    # or None if the request fails.
    try:
        response = requests.get(url)
        if response.status_code == 200:
            html = response.content.decode('utf-8', 'ignore').replace(u'\xa9', u'').replace(u'\xa0', u'')
            mytree = etree.HTML(html)
            return mytree
    except requests.ConnectionError:
        return None
def parse_detail_page(detail_html):
    # Extract project name, announcement type, purchase method, winning
    # company, amount and date from one announcement's detail page.
    item_name = ""
    purchase = ""
    item_type = ""
    money = ""
    date = ""
    company = ""
    p_list = detail_html.xpath('//*[@id="news_content"]/p')
    item_name_list = detail_html.xpath('//*[@id="news_head"]/p[1]//text()')
    for item_name_src in item_name_list:
        # The title normally ends the project name with '目' (as in 项目) or '案';
        # whatever follows that character is treated as the announcement type.
        index = item_name_src.find('目')
        if index == -1:
            index = item_name_src.find('案')
        item_type = item_name_src[index + 1:]
        if index < 1:
            item_name = item_name_src
        else:
            item_name = item_name_src[:index + 1]
    date_list = detail_html.xpath('//*[@id="news_head"]/p[2]//text()')
    for dt in date_list:
        date = dt[5:16]
    for p in p_list:
        # Join all text fragments of the paragraph into one string.
        p_content = p.xpath('.//text()')
        context = ""
        for text in p_content:
            context = context + text.strip().replace(u'\xa0', u'').replace(u'\xa5', u'').replace('\r\n', '')
        # Paragraphs that mention an amount.
        if '元' in context or '金额' in context:
            money = context + money
        elif '价格' in context:
            money = context + money
        # Paragraphs that mention the winning supplier.
        if '中标' in context:
            company = company + context
        elif '供应商' in context:
            company = company + context
        elif '中选单位' in context:
            company = company + context
        elif '公司' in context:
            company = company + context
        # Purchase method line, e.g. "采购方式: 公开招标".
        if '采购方式' in context:
            try:
                purchase = context.split(":")[1]
            except IndexError:
                purchase = context
    info_list = [item_name, item_type, purchase, company, money, date]
    return info_list
def write_data(sheet, row, lst):
    for data_infos in lst:
        j = 0
        for data in data_infos:
            sheet.write(row, j, data)
            j += 1
        row += 1
def save(file_name, data):
    if os.path.exists(file_name):
        # Open the existing workbook.
        rb = xlrd.open_workbook(file_name, formatting_info=True)
        # Number of rows already present, via xlrd.
        rn = rb.sheets()[0].nrows
        # Copy the workbook so it becomes writable.
        wb = copy(rb)
        # First sheet of the copy.
        sheet = wb.get_sheet(0)
        # Append the new rows after the existing ones.
        write_data(sheet, rn, data)
        # Replace the old file with the updated copy.
        os.remove(file_name)
        wb.save(file_name)
    else:
        # Header matching the columns produced by parse_detail_page plus the source URL.
        header = ['item_name', 'item_type', 'purchase', 'company', 'money', 'date', 'url']
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('金采网')
        # Write the header row.
        for h in range(len(header)):
            sheet.write(0, h, header[h])
        # Write the data rows below it.
        write_data(sheet, 1, data)
        book.save(file_name)
def main():
    print('*' * 80)
    print('\t\t\t\t金采网数据下载')
    print('作者:谢华东 2018.11.8')
    print('--------------')
    table = int(float(input('请输入公告类型(1-采购公告,2-结果公告):\n')))
    while table < 1 or table > 2:
        table = int(float(input('公告类型输入错误,请重新输入:\n')))
    path = input('请输入要保存的地址(例如:C:\\Users\\xhdong1\\Desktop\\),不输入直接按回车表示默认当前位置:\n')
    dict_tables = {1: 'caigou', 2: 'jieguo'}
    file_name = path + '金采网' + dict_tables[table] + '.xls'
    # Page range to crawl; the last page is included.
    minfrom = int(input('请输入你需要从哪一页开始爬:\n'))
    maxto = int(input('请输入你需要截止到哪一页:\n'))
    for i in range(minfrom, maxto + 1):
        print('正在爬取' + dict_tables[table] + '下的第' + str(i) + '页数据')
        base = 'http://www.cfcpn.com/plist/{type}?pageNo={page_num}&kflag=0&keyword=&keywordType=&province=&city=&typeOne=&ptpTwo='
        url = base.format(type=dict_tables[table], page_num=i)
        time.sleep(1)
        all_info_list = []
        html = get_page(url)
        if html is None:
            print('该页没有数据')
            continue
        url_list = parse_page(html)
        for url in url_list:
            detail_html = get_detail_page(url)
            if detail_html is None:
                continue
            context_list = parse_detail_page(detail_html)
            context_list.append(url)
            all_info_list.append(context_list)
        save(file_name, all_info_list)
if __name__ == '__main__':
    main()
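Before a long run it can help to exercise the pipeline on a single page. A small, hedged example that reuses only the functions defined above (the page number and the choice to test the first detail link are arbitrary):

# Rough single-page check of the pipeline: fetch one list page, collect
# its detail links, and parse the first announcement found.
test_url = ('http://www.cfcpn.com/plist/caigou?pageNo=1&kflag=0&keyword='
            '&keywordType=&province=&city=&typeOne=&ptpTwo=')
html = get_page(test_url)
if html is not None:
    detail_urls = parse_page(html)
    print(len(detail_urls), 'detail links found')
    if detail_urls:
        tree = get_detail_page(detail_urls[0])
        if tree is not None:
            print(parse_detail_page(tree))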
Acknowledgements
Thanks to myself.