python爬虫进阶-大众点评店铺信息（字体反爬-静态映射）

目的

获取大众点评店铺信息

详细需求

http://www.dianping.com/shenzhen/ch10

思路解析

一、通过F12查找目标信息位置，进行分析



同理进行其他信息的解析，分析汇总

店铺名称：源网页获取
商家评分：源网页获取
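评价：字体反爬，需通过字体映射还原（shopNum 字体）
人均：字体反爬，需通过字体映射还原（shopNum 字体）
口味：字体反爬，需通过字体映射还原（shopNum 字体）
环境：字体反爬，需通过字体映射还原（shopNum 字体）
服务：字体反爬，需通过字体映射还原（shopNum 字体）
标签：字体反爬，需通过字体映射还原（tagName 字体）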
地址：源网页获取
推荐菜:源网页获取

二、字体反爬解析


三、根据 CSS 文件中 @font-face 里 url 后面的链接，将字体文件下载并保存为本地 ttf 文件，查看 TTF 字体映射

字体工具：FontCreatorPortable，自行百度下载安装
在线工具：http://fontstore.baidu.com/static/editor/index.html

经过比对四个字体映射（地址，标签，店铺，评论）后发现，字体映射的内容是相同的

pip install fontTools

from fontTools.ttLib import TTFont
 
# Load the locally saved font file 01.ttf.
ttf_font = TTFont('01.ttf')
# Dump the font as XML next to it so the internal tables are easy to inspect.
ttf_font.saveXML('01.xml')

读取TTF文件，将其另存为xml文件，进行查看

每个字符都静态映射为一个字，我们需要做的就是将网页中的字符编码映射为文字并提取出来。字体文件中的字形名称（如 uniXXXX）与网页中的字符实体（如 &#xXXXX;）的最后四位十六进制编码是相同的，利用这一点即可实现一一映射对应

获取店铺信息，这里只需要用到两个字体依赖文件（shopNum与tagName）

四、整体思路

一.获取目标网页
二.获取字体相关css文件，提取需要的字体TTF链接
三.下载TTF，构建本地字体映射
四.正则提取目标信息
五.打印信息，查看是否正确

源码实现

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author : jia666
# Time : 2021/6/15 15:41

import re
import requests
from fontTools.ttLib import TTFont

class Public_Spider():
    """Scrape shop listings from a Dianping category page.

    The page hides digits and many Chinese characters behind custom web
    fonts: the HTML contains entities such as ``&#xe123;`` whose glyphs live
    in downloadable TTF files.  The glyph order of those fonts is static, so
    the n-th glyph (after the first two) corresponds to ``wordlist[n]``.

    Typical use is ``Public_Spider().run()``, which downloads the page,
    builds the entity tables from the fonts and prints every shop found.
    """

    # Browser-like user agent sent with every request.
    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"

    # Seconds to wait for each HTTP request; without it a stalled connection
    # would block the scraper forever.
    timeout = 10

    # Characters in font glyph order (after the two leading glyphs are
    # dropped); the first ten entries are the digits.
    wordlist = '1234567890店中美家馆小车大市公酒行国品发电金心业商司超生装园场食有新限天面工服海华水房饰城乐汽香部利子老艺花专东肉菜学福饭人百餐茶务通味所山区门药银农龙停尚安广鑫一容动南具源兴鲜记时机烤文康信果阳理锅宝达地儿衣特产西批坊州牛佳化五米修爱北养卖建材三会鸡室红站德王光名丽油院堂烧江社合星货型村自科快便日民营和活童明器烟育宾精屋经居庄石顺林尔县手厅销用好客火雅盛体旅之鞋辣作粉包楼校鱼平彩上吧保永万物教吃设医正造丰健点汤网庆技斯洗料配汇木缘加麻联卫川泰色世方寓风幼羊烫来高厂兰阿贝皮全女拉成云维贸道术运都口博河瑞宏京际路祥青镇厨培力惠连马鸿钢训影甲助窗布富牌头四多妆吉苑沙恒隆春干饼氏里二管诚制售嘉长轩杂副清计黄讯太鸭号街交与叉附近层旁对巷栋环省桥湖段乡厦府铺内侧元购前幢滨处向座下澩凤港开关景泉塘放昌线湾政步宁解白田町溪十八古双胜本单同九迎第台玉锦底后七斜期武岭松角纪朝峰六振珠局岗洲横边济井办汉代临弄团外塔杨铁浦字年岛陵原梅进荣友虹央桂沿事津凯莲丁秀柳集紫旗张谷的是不了很还个也这我就在以可到错没去过感次要比觉看得说常真们但最喜哈么别位能较境非为欢然他挺着价那意种想出员两推做排实分间甜度起满给热完格荐喝等其再几只现朋候样直而买于般豆量选奶打每评少算又因情找些份置适什蛋师气你姐棒试总定啊足级整带虾如态且尝主话强当更板知己无酸让入啦式笑赞片酱差像提队走嫩才刚午接重串回晚微周值费性桌拍跟块调糕'

    def __init__(self):
        self.url = "http://www.dianping.com/shenzhen/ch10"  # target URL
        self.head = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Connection": "keep-alive",
            "DNT": "1",
            "Host": "www.dianping.com",
            "Upgrade-Insecure-Requests": "1",
            "user-agent": self.USER_AGENT,
        }
        self.html = ''        # whitespace-stripped page HTML, set by Get_html
        self.utf8List = []    # entity prefixes of the shopNum font, set by Get_TTF
        self.utf8List2 = []   # entity prefixes of the tagName font, set by Get_TTF

    def Get_TTF(self):
        """Download the obfuscation fonts and build the entity lookup lists.

        Reads ``self.html`` (so ``Get_html`` must run first), fetches the
        second stylesheet linked from the page, extracts the ``shopNum`` and
        ``tagName`` font URLs from it, saves both fonts next to the script
        and stores their entity prefixes in ``self.utf8List`` and
        ``self.utf8List2``.

        Raises:
            IndexError: the page does not contain the font stylesheet link.
            KeyError: the stylesheet does not declare a required font.
        """
        css_links = re.findall(
            r'<linkrel="stylesheet"type="text/css"href="(.*?)">', self.html, re.S)
        if len(css_links) < 2:
            raise IndexError('font stylesheet link not found in the page HTML')

        # A reduced header set: self.head pins "Host" to www.dianping.com,
        # which must not be sent along with these requests.
        header = {"user-agent": self.USER_AGENT}
        css_text = requests.get(
            'http:' + css_links[1], headers=header, timeout=self.timeout).text

        TTF_dict = {}  # font family suffix -> protocol-relative TTF URL
        for face in re.findall(r'@font-face\{(.*?)\}', css_text, re.S):
            names = re.findall(r'font-family: "PingFangSC-Regular-(.*?)"', face, re.S)
            links = re.findall(r',url\("(.*?)"\);', face, re.S)
            # Skip declarations that are not one of the obfuscation fonts.
            if names and links:
                TTF_dict[names[0]] = links[0]

        self.utf8List = self._load_font_codes(TTF_dict, 'shopNum', header)
        self.utf8List2 = self._load_font_codes(TTF_dict, 'tagName', header)

    def _load_font_codes(self, font_links, family, header):
        """Download one font, save it locally and return its entity prefixes."""
        link = font_links.get(family)
        if link is None:
            raise KeyError('font %r not declared in the stylesheet' % family)

        path = '大众点评' + family + '.ttf'
        content = requests.get(
            'http:' + link, headers=header, timeout=self.timeout).content
        with open(path, 'wb') as f:
            f.write(content)

        font = TTFont(path)
        try:
            glyph_names = font.getGlyphOrder()[2:]  # drop the first two glyphs
        finally:
            font.close()  # release the handle on the font file
        # Glyph names look like "uniXXXX"; the page references them as "&#xXXXX;".
        return ['&#x' + name[3:] for name in glyph_names]

    def Get_html(self):
        """Fetch the target page and store it, whitespace removed, in ``self.html``."""
        response = requests.get(
            self.url, headers=self.head, timeout=self.timeout).text
        # All later patterns are written against whitespace-free HTML.
        self.html = re.sub(r'\s', '', response)

    def RE_html(self):
        """Parse every shop entry in ``self.html`` and print its details.

        Fields that are missing from an entry are printed as empty strings
        instead of aborting the whole run.
        """
        for shop_html in re.findall(r'<liclass="">(.*?)</li>', self.html, re.S):
            print(self._format_shop(self._parse_shop(shop_html)))

    @staticmethod
    def _decode(text, codes, chars):
        """Replace each entity prefix in *codes* with the matching character."""
        # zip() stops at the shorter sequence, so a font with more glyphs
        # than known characters cannot cause an index error.
        for code, char in zip(codes, chars):
            if code in text:
                text = text.replace(code, char)
        return text

    def _digits(self, raw):
        """Decode a shopNum-obfuscated fragment and keep only its digits."""
        decoded = self._decode(raw, self.utf8List[:10], self.wordlist[:10])
        return ''.join(re.findall(r'\d', decoded))

    def _score(self, raw):
        """Decode a rating fragment into ``d.dd`` form ('' when absent)."""
        digits = self._digits(raw)
        if not digits:
            return ''
        return digits[0] + '.' + digits[1:]

    def _parse_shop(self, shop_html):
        """Extract and decode all fields of a single shop entry into a dict."""
        def first(pattern):
            found = re.findall(pattern, shop_html, re.S)
            return found[0] if found else ''

        raw_tags = re.findall(r'<spanclass="tag">(.*?)</span>', shop_html, re.S)
        decoded_tags = self._decode('|'.join(raw_tags), self.utf8List2, self.wordlist)
        # Only Chinese characters are kept; this drops leftover ';' and markup.
        tags = [''.join(re.findall(r'[\u4e00-\u9fa5]', piece))
                for piece in decoded_tags.split('|')]

        return {
            'name': first(r'data-name="(.*?)"'),
            'score': first(r'score_sml">(.*?)</div>'),
            'address': first(r'data-address="(.*?)"'),
            'goods': re.findall(r'target="_blank">(.*?)</a>', shop_html, re.S),
            'comments': self._digits(first(r'<b>(.*?)</b>条评价')),
            'per_capita': self._digits(first(r'人均<b>(.*?)</b')),
            'taste': self._score(first(r'口味<b>(.*?)</b>')),
            'surroundings': self._score(first(r'环境<b>(.*?)</b>')),
            'service': self._score(first(r'服务<b>(.*?)</b>')),
            'tags': ' | '.join(tags),
        }

    @staticmethod
    def _format_shop(info):
        """Render one parsed shop as the multi-line text block that is printed."""
        lines = [
            '*' * 50,
            info['name'],
            '商家评分：' + info['score'] + '\t' + info['comments'] + '条评价'
            + '\t' + '人均￥' + info['per_capita'],
            '口味：' + info['taste'] + '\t' + '环境：' + info['surroundings']
            + '服务：' + info['service'],
            info['tags'] + '\t' + info['address'],
            '推荐菜:' + ','.join(info['goods']),
        ]
        return '\n'.join(lines)

    def run(self):
        """Run the full pipeline: fetch page, build font tables, print shops."""
        self.Get_html()
        self.Get_TTF()
        self.RE_html()
if __name__ == '__main__':
    # Script entry point: scrape the listing page and print every shop found.
    spider = Public_Spider()
    spider.run()
    # example.Get_TTF()