items:
class GiteeItem(scrapy.Item):
    """Container for one scraped Gitee project: its link and description."""
    # Absolute URL of the project page (domain + relative href, built in the spider).
    link = scrapy.Field()
    # Short text description of the project, as shown on the explore page.
    desc = scrapy.Field()
    # (removed redundant `pass` — the class body already has statements)
db:
import emoji
import pymysql
# Module-level MySQL connection and cursor, shared by insertGitee() below.
# charset='utf8mb4' so 4-byte characters (emoji) can be stored without error.
connect = pymysql.connect(host='localhost', user='root', password='root', db='mindsa', charset='utf8mb4')
cursor = connect.cursor()
def insertGitee(item):
    """Insert one scraped project into the `gitee` table and commit.

    Args:
        item: mapping with 'link' and 'desc' string keys (a GiteeItem).
    """
    # Parameterized query instead of string .format() with hand-added
    # quotes: the original was open to SQL injection and broke whenever a
    # value contained a single quote. The driver handles all escaping.
    sql = """INSERT INTO gitee(link,`desc`) VALUES (%s,%s)"""
    cursor.execute(sql, (emoji.demojize(item['link']),
                         emoji.demojize(item['desc'])))
    connect.commit()
pipelines:
class GiteePipeline:
    """Scrapy item pipeline that persists each scraped item to MySQL."""

    def process_item(self, item, spider):
        insertGitee(item)
        # Scrapy's pipeline contract requires process_item to return the
        # item (or raise DropItem); without this, any later pipeline in
        # ITEM_PIPELINES would receive None instead of the item.
        return item
settings:
# Enable the MySQL pipeline. The value (0-1000) is the execution order:
# lower-numbered pipelines run first.
ITEM_PIPELINES = {
'myscrapy.pipelines.GiteePipeline': 300,
}
GiteeSprider:
import scrapy
from myscrapy.items import GiteeItem
class GiteeSprider(scrapy.Spider):
    """Spider that crawls https://gitee.com/explore/all, yielding one
    GiteeItem (link + description) per project and following the
    "next page" link until pagination ends.

    NOTE(review): several XPath predicates below read ``[@]`` — the
    attribute name/value appears to have been lost when this code was
    published; restore the original ``@class="..."`` conditions before
    running, as ``[@]`` is not valid XPath.
    NOTE(review): Scrapy's conventional attribute is ``allowed_domains``
    (a list); ``allow_domains`` here is a plain string that parse() also
    reuses to build absolute links, so renaming it would break parse().
    """
    name = 'gitee'
    allow_domains = 'gitee.com'
    start_urls = ['https://gitee.com/explore/all']

    def parse(self, response, **kwargs):
        """Extract each project's link and description from one explore
        page, then schedule the next page if a rel="next" link exists."""
        # Locate the per-project containers with an absolute XPath.
        elements = response.xpath('//div[@]//div[@]')
        for element in elements:
            # Note: a nested xpath() call must be relative — prefix it
            # with '.' (i.e. .// rather than //), otherwise it searches
            # the whole document again instead of this element.
            link = self.allow_domains + element.xpath('.//h3/a/@href').get()
            desc = element.xpath('.//div[@]/text()').get()
            item = GiteeItem()
            item['link'] = link
            item['desc'] = desc
            yield item
        # Note: to match on several attributes in one XPath predicate,
        # join the conditions with 'and'.
        next_href__get = response.xpath(
            '//div[@]//a[@ and @rel="next"]/@href'
        ).get()
        if next_href__get is not None:
            # A next page exists — request it with this same callback.
            yield scrapy.Request("https://gitee.com"+next_href__get, self.parse)
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
如需转载请保留出处:https://bianchenghao.cn/hz/141459.html