The code is detailed and largely self-explanatory. If you have questions, feel free to leave a comment — every question gets an answer.
Spider module:
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from anjuke2.items import Anjuke2Item


class Anju2Spider(CrawlSpider):
    name = 'anju2'
    allowed_domains = ['sh.zu.anjuke.com']
    start_urls = ['http://sh.zu.anjuke.com/']

    # Follow pagination links and parse every listing page.
    page_link = LinkExtractor(restrict_xpaths='//div[@class="page-content"]/div/a')
    rules = (
        Rule(page_link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        infolist = response.xpath('//div[@class="list-content"]/div')
        # The first two divs and the last one are not listings, so skip them.
        infolist = infolist[2:-1]
        for house in infolist:
            try:
                name = house.xpath('.//div[@class="zu-info"]/h3/a/text()')[0].extract()
                huxing = house.xpath('.//div[@class="zu-info"]/p/text()')[0].extract().split(' ')[-1]  # layout
                louceng = house.xpath('.//div[@class="zu-info"]/p/text()')[2].extract()  # floor
                mianji = house.xpath('.//div[@class="zu-info"]/p/text()')[1].extract()   # area
                addrss = house.xpath('.//div[@class="zu-info"]/address/a/text()')[0].extract()
                chuzufangshi = house.xpath('.//div[@class="zu-info"]/p[2]/span[1]/text()')[0].extract()  # rental type
                rent = house.xpath('.//div[@class="zu-side"]/p/strong/text()')[0].extract()
                city = response.xpath('//div[@class="cityselect"]/div[1]/text()')[0].extract().split(' ')[-1]

                item = Anjuke2Item()
                item['city'] = city
                item['name'] = name
                item['huxing'] = huxing
                item['louceng'] = louceng
                item['mianji'] = mianji
                item['addrss'] = addrss
                item['chuzufangshi'] = chuzufangshi
                item['rent'] = rent
                yield item
            except Exception as e:
                print(e)
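The spider can be started with `scrapy crawl anju2` from the project directory. If you prefer running it from an IDE, a minimal runner script works as a sketch — `run.py` is a hypothetical filename placed next to scrapy.cfg:

# run.py — hypothetical helper; equivalent to `scrapy crawl anju2` on the command line
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'anju2'])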
Pipeline module:
# Write to a local file (one JSON object per line)
import json


class Anjuke2Pipeline(object):
    def open_spider(self, spider):
        self.fp = open('上海.txt', 'w', encoding='utf8')

    def close_spider(self, spider):
        self.fp.close()

    def process_item(self, item, spider):
        dic = dict(item)
        string = json.dumps(dic, ensure_ascii=False)
        self.fp.write(string + '\n')
        return item
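Because each line is an independent JSON object, the output file can be read back line by line. A small sketch, assuming the file was produced by the pipeline above:

import json

with open('上海.txt', encoding='utf8') as fp:
    houses = [json.loads(line) for line in fp]
print(len(houses), 'listings loaded')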
# Write to MongoDB
import pymongo


class mongodbPipeline(object):
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(host='localhost', port=27017)

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        db = self.client.anjuke   # database: anjuke
        clo = db.zufang           # collection: zufang
        # insert() is deprecated and removed in pymongo 4.x; use insert_one().
        clo.insert_one(dict(item))
        return item
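Once items are stored, they can be inspected with pymongo using the same database and collection names as the pipeline:

import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
zufang = client.anjuke.zufang
print(zufang.count_documents({}))   # total listings stored
print(zufang.find_one())            # one sample document
client.close()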
# Write to MySQL via pymysql
import pymysql


class mysqlPipeline(object):
    def open_spider(self, spider):
        # pymysql takes the keyword `password`, not `pwd`.
        self.connect = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                                       password='123456', database='anjuke', charset='utf8')

    def close_spider(self, spider):
        self.connect.close()

    def process_item(self, item, spider):
        self.save_mysql(item)
        return item

    def save_mysql(self, item):
        cursor = self.connect.cursor()
        # Parameterized query: safer than string interpolation (no SQL injection,
        # and quotes in listing titles no longer break the statement).
        sql = ('insert into zufang(city,title,huxing,louceng,mianji,addrss,chuzufangshi,rent) '
               'values(%s,%s,%s,%s,%s,%s,%s,%s)')
        try:
            cursor.execute(sql, (item['city'], item['name'], item['huxing'], item['louceng'],
                                 item['mianji'], item['addrss'], item['chuzufangshi'], item['rent']))
            self.connect.commit()
        except Exception as e:
            print(e)
            self.connect.rollback()
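Note that mysqlPipeline does not create its table, so `anjuke.zufang` must exist before the first run. A one-off setup sketch with the column names the insert statement uses — the column types are an assumption, since the source does not give the MySQL schema:

import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='123456', charset='utf8')
cur = conn.cursor()
cur.execute('create database if not exists anjuke character set utf8')
cur.execute('''create table if not exists anjuke.zufang(
    city varchar(50), title varchar(50), huxing varchar(50), louceng varchar(50),
    mianji varchar(50), addrss varchar(50), chuzufangshi varchar(50), rent varchar(50))''')
conn.commit()
conn.close()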
# Write to SQLite
import sqlite3


class sqllitPipeline(object):
    def open_spider(self, spider):
        self.db = sqlite3.connect('sql.db')
        self.cur = self.db.cursor()
        # `if not exists` keeps the pipeline from crashing on a second run.
        sql1 = '''create table if not exists zufang(
            city char(50) not null,
            title char(50) not null,
            huxing char(50) not null,
            louceng char(50) not null,
            mianji char(50) not null,
            addrss char(50) not null,
            chuzufangshi char(50) not null,
            rent char(50) not null)'''
        self.cur.execute(sql1)

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        self.save_to_sqlite(item)
        return item

    def save_to_sqlite(self, item):
        # sqlite3 uses `?` placeholders for parameterized queries.
        sql = ('insert into zufang(city,title,huxing,louceng,mianji,addrss,chuzufangshi,rent) '
               'values(?,?,?,?,?,?,?,?)')
        try:
            self.cur.execute(sql, (item['city'], item['name'], item['huxing'], item['louceng'],
                                   item['mianji'], item['addrss'], item['chuzufangshi'], item['rent']))
            self.db.commit()
        except Exception as e:
            print(e)
            self.db.rollback()
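Reading the results back out of sql.db takes only a few lines of standard sqlite3, assuming the table created by the pipeline above:

import sqlite3

db = sqlite3.connect('sql.db')
for city, title, rent in db.execute('select city, title, rent from zufang limit 5'):
    print(city, title, rent)
db.close()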
Items module:
import scrapy


class Anjuke2Item(scrapy.Item):
    city = scrapy.Field()           # city name
    name = scrapy.Field()           # listing title
    huxing = scrapy.Field()         # layout
    louceng = scrapy.Field()        # floor
    mianji = scrapy.Field()         # area
    addrss = scrapy.Field()         # address
    chuzufangshi = scrapy.Field()   # rental type (whole flat / shared)
    rent = scrapy.Field()           # monthly rent
Settings module:
BOT_NAME = 'anjuke2'
SPIDER_MODULES = ['anjuke2.spiders']
NEWSPIDER_MODULE = 'anjuke2.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3   # throttle requests to reduce the chance of being blocked
ITEM_PIPELINES = {
    'anjuke2.pipelines.Anjuke2Pipeline': 300,
    # 'anjuke2.pipelines.mongodbPipeline': 301,
    # 'anjuke2.pipelines.mysqlPipeline': 302,
    # 'anjuke2.pipelines.sqllitPipeline': 303,
}
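The numbers in ITEM_PIPELINES are priorities: lower runs first, and every enabled pipeline receives each item in that order, so several backends can be written in a single run. For example, to write both the local file and MongoDB at once (just a configuration sketch):

ITEM_PIPELINES = {
    'anjuke2.pipelines.Anjuke2Pipeline': 300,   # runs first (lower number)
    'anjuke2.pipelines.mongodbPipeline': 301,   # then MongoDB
}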