Web scraper: 金采网 data (2018-11-19)

Contents: Target URL · Environment · Scraper Code · Acknowledgments

Target URL

http://www.cfcpn.com/plist/caigou?pageNo=1&kflag=0&keyword=&keywordType=&province=&city=&typeOne=&ptpTwo=
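The query string carries everything the listing endpoint needs: pageNo selects the page, while kflag, keyword and the region/type filters are simply left empty. As a quick way to probe the endpoint before running the full script, here is a minimal sketch (assuming only the requests package is installed):

import requests

# Parameters mirror the listing URL above; only pageNo changes from page to page.
params = {
    'pageNo': 1,
    'kflag': 0,
    'keyword': '',
    'keywordType': '',
    'province': '',
    'city': '',
    'typeOne': '',
    'ptpTwo': '',
}
resp = requests.get('http://www.cfcpn.com/plist/caigou', params=params)
print(resp.status_code, len(resp.text))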

Environment

  1. Python 3.6.5 (the third-party packages used by the script are listed below)
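Besides the interpreter, the script relies on a few third-party packages that appear in its imports; assuming pip is available, they can be installed with:

pip install requests lxml xlrd xlwt xlutils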

Scraper Code

# -*- coding: utf-8 -*-
import os
import re
import time

import requests
import xlrd
import xlwt
from lxml import etree
from xlutils.copy import copy


def get_page(url):
    """Fetch a listing page and return its HTML text, or None on failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # Decode manually and drop the copyright sign, which trips up xlwt later.
            return response.content.decode('utf-8', 'ignore').replace(u'\xa9', u'')
        return None
    except requests.ConnectionError:
        return None


def parse_page(html):
    """Extract the detail-page links from a listing page."""
    # Announcement links are the hrefs that end in a numeric id.
    pattern = r'<.*?(href=".*?/\d+").*?'
    href_urls = re.findall(pattern, html, re.I)
    url_list = []
    for href in href_urls:
        # Turn the relative href into an absolute URL on cfcpn.com.
        url_list.append(href.replace('href=', 'http://www.cfcpn.com').replace('"', ''))
    return url_list


def get_detail_page(url):
    """Fetch a detail page and return it as a parsed lxml tree, or None on failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # Strip the copyright sign and non-breaking spaces before parsing.
            html = response.content.decode('utf-8', 'ignore').replace(u'\xa9', u'').replace(u'\xa0', u'')
            return etree.HTML(html)
        return None
    except requests.ConnectionError:
        return None


def parse_detail_page(detail_html):
    """Pull the project name, notice type, procurement method, company, price and date out of a detail page."""
    item_name = ""
    purchase = ""
    item_type = ""
    money = ""
    date = ""
    company = ""
    p_list = detail_html.xpath('//*[@id="news_content"]/p')
    item_name_list = detail_html.xpath('//*[@id="news_head"]/p[1]//text()')
    for item_name_src in item_name_list:
        # Split the headline on 目 (as in 项目) or 案 (as in 方案) to separate the
        # project name from the notice type that follows it.
        index = item_name_src.find('目')
        if index == -1:
            index = item_name_src.find('案')
        item_type = item_name_src[index + 1:]
        if index < 1:
            item_name = item_name_src
        else:
            item_name = item_name_src[:index + 1]
    date_list = detail_html.xpath('//*[@id="news_head"]/p[2]//text()')

    for dt in date_list:
        # The publication date sits at a fixed offset inside the header line.
        date = dt[5:16]

    for p in p_list:
        p_content = p.xpath('.//text()')
        context = ""
        for text in p_content:
            context = context + text.strip().replace(u'\xa0', u'').replace(u'\xa5', u'').replace('\r\n', '')
        # Paragraphs mentioning 元/金额/价格 (yuan/amount/price) carry the price.
        if '元' in context or '金额' in context:
            money = context + money
        elif '价格' in context:
            money = context + money
        # Paragraphs mentioning 中标/供应商/中选单位/公司 (winning bid/supplier/
        # selected unit/company) carry the winning company.
        if '中标' in context:
            company = company + context
        elif '供应商' in context:
            company = company + context
        elif '中选单位' in context:
            company = company + context
        elif '公司' in context:
            company = company + context
        # 采购方式 (procurement method) is written as "采购方式:<value>".
        if '采购方式' in context:
            try:
                purchase = context.split(":")[1]
            except IndexError:
                purchase = context

    info_list = [item_name, item_type, purchase, company, money, date]
    return info_list


def write_data(sheet, row, lst):
    """Write one row per record to the sheet, starting at the given row index."""
    for data_infos in lst:
        for j, data in enumerate(data_infos):
            sheet.write(row, j, data)
        row += 1


def save(file_name, data):
    if os.path.exists(file_name):
        # Open the existing workbook (formatting_info keeps the xls styles).
        rb = xlrd.open_workbook(file_name, formatting_info=True)
        # Number of rows already written.
        rn = rb.sheets()[0].nrows
        # xlrd workbooks are read-only, so copy into a writable xlwt workbook.
        wb = copy(rb)
        sheet = wb.get_sheet(0)
        # Append the new rows after the existing ones.
        write_data(sheet, rn, data)
        # Remove the old file, then save the updated copy under the same name.
        os.remove(file_name)
        wb.save(file_name)
    else:
        # Column order matches the list built in parse_detail_page plus the URL appended in main.
        header = ['item_name', 'item_type', 'purchase', 'company', 'money', 'date', 'url']
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('金采网')
        # Write the header row.
        for h in range(len(header)):
            sheet.write(0, h, header[h])
        # Write the data rows below the header.
        write_data(sheet, 1, data)
        book.save(file_name)


def main():
    print('*' * 80)
    print('\t\t\t\t金采网 data downloader')
    print('Author: 谢华东  2018.11.8')
    print('--------------')
    table = int(float(input('Enter the announcement type (1 = procurement notices, 2 = result notices):\n')))
    while table < 1 or table > 2:
        table = int(float(input('Invalid announcement type, please try again:\n')))
    path = input('Enter the directory to save to (e.g. C:\\Users\\xhdong1\\Desktop\\); press Enter to use the current directory:\n')
    dict_tables = {1: 'caigou', 2: 'jieguo'}
    file_name = path + '金采网' + dict_tables[table] + '.xls'

    minfrom = int(input('Enter the first page to crawl:\n'))
    maxto = int(input('Enter the last page to crawl:\n'))
    # range() excludes its upper bound, so add 1 to include the last page.
    for i in range(minfrom, maxto + 1):
        print('Crawling page ' + str(i) + ' of ' + dict_tables[table])
        base = 'http://www.cfcpn.com/plist/{type}?pageNo={page_num}&kflag=0&keyword=&keywordType=&province=&city=&typeOne=&ptpTwo='
        url = base.format(type=dict_tables[table], page_num=i)
        time.sleep(1)
        all_info_list = []
        html = get_page(url)
        if html is None:
            print('Could not fetch this page, skipping')
            continue
        url_list = parse_page(html)
        for url in url_list:
            detail_html = get_detail_page(url)
            if detail_html is None:
                continue
            context_list = parse_detail_page(detail_html)
            context_list.append(url)
            all_info_list.append(context_list)
        save(file_name, all_info_list)


if __name__ == '__main__':
    main()
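After a run, the resulting .xls file can be read back with xlrd as a quick sanity check. A minimal sketch, assuming the procurement notices were saved with the defaults (empty path and announcement type 1, so the file is 金采网caigou.xls in the current directory):

import xlrd

# File name assumed from the defaults above; adjust to whatever was actually saved.
wb = xlrd.open_workbook('金采网caigou.xls')
sheet = wb.sheet_by_index(0)
print(sheet.nrows, 'rows written')
# Print the header plus the first few records.
for r in range(min(sheet.nrows, 5)):
    print(sheet.row_values(r))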

Acknowledgments

Thanks to myself.
