Using a Python crawler to scrape Xunlei download links (hehe, you know)

Without further ado, here is the code.

#!python3
import urllib.request
from lxml import etree
import threading
from multiprocessing.dummy import Pool as ThreadPool


# Original author: 龙图腾

# Request headers
# user-agent strings for three browsers; the uncommented one is used
#headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36'}
#headers  = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
headers  = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'}
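
# A possible hardening step (a sketch of mine, not in the original post):
# keep the user-agent strings above in a list and pick one at random per
# request, so repeated requests look less uniform to the server:
#   import random
#   user_agents = [ua_chrome51, ua_chrome23, ua_sogou]  # the three strings above
#   headers = {'User-Agent': random.choice(user_agents)}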


max_try_down = 20       # maximum fetch attempts per URL
max_timeout_down = 1    # per-request timeout in seconds
allresult = ""          # aggregated text of everything collected
allresult_count = 0     # number of unique results collected


all_page_item = []      # detail-page URLs gathered from every search page


# Aggregate one download result into allresult (thread-safe)
lock = threading.Lock()  # guards allresult/allresult_count across worker threads
def add_inresult(weburl, name, link, size):
    global allresult_count, allresult

    with lock:  # serialize access so concurrent workers don't interleave writes
        if link not in allresult:  # crude dedup: skip links already recorded
            allresult_count += 1
            allresult += "%s\r\n" % weburl
            allresult += "name=%s\r\n" % name
            allresult += "size=%s\r\n" % size
            allresult += "link=%s\r\n" % link
            allresult += "\r\n"
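
# An alternative shape for the same aggregation (my sketch, not the author's
# code): collect structured records under the lock and format them once at
# the end, instead of growing one big string:
#   results = []  # list of dicts shared by the worker threads
#   with lock:
#       if all(r['link'] != link for r in results):
#           results.append({'url': weburl, 'name': name,
#                           'size': size, 'link': link})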






# Fetch one item detail page and extract name / size / download link
def download_this(weburl):

    # print("down %s++\n"%(weburl),end='')  # with a thread pool a bare print can
    # interleave with other threads, so the newline is bundled into the string

    file_name = file_size = ""  # defaults in case an XPath lookup fails below
    # retry loop: up to max_try_down attempts
    for i in range(max_try_down):
        try:
            sub_req = urllib.request.Request(url=weburl, headers=headers)
            sub_response = urllib.request.urlopen(sub_req, timeout=max_timeout_down)
            sub_data = sub_response.read()
            sub_data = sub_data.decode('utf-8')
            tree = etree.HTML(sub_data)

            file_name = " ".join(tree.xpath(u"//h3/text()"))  # title text of the <h3> node
            # find the <b> whose text is '文件大小' (file size), then read the next <span> sibling
            file_size = tree.xpath(u"//b[text()='文件大小']/following-sibling::span")[0].text
            # take the first <a> that actually has an href attribute and extract that href
            file_link = tree.xpath(u"//a[@href]/@href")[0]
            add_inresult(weburl, file_name, file_link, file_size)
            break
        except IndexError:
            # the page loaded but the expected nodes are missing: record a dead link
            add_inresult(weburl, file_name, "link expired: " + weburl, file_size)
            break
        except Exception as e:
            print("%s [download_this] retry %d <%s>:%s\n" % (weburl, i, type(e), e), end='')

    print("down %s ok--\n" % (weburl), end='')



# List the download items on one search-result page
def list_this(weburl):
    global all_page_item
    for i in range(max_try_down):
        try:
            # print("+++search+++%s\n"%weburl,end='')

            request = urllib.request.Request(url=weburl, headers=headers)
            # fetch the page
            response = urllib.request.urlopen(request, timeout=max_timeout_down)

            data = response.read()

            # decode the response body
            data = data.decode('utf-8')

            tree = etree.HTML(data)

            # find every <h5 class="item-title">, then the href of the <a> inside it
            nodes = tree.xpath(u"//h5[@class='item-title']/a/@href")

            if len(nodes) <= 0:
                return False

            # accumulate the item pages found so far; extending a list is
            # effectively atomic under CPython's GIL, so no lock is taken here
            all_page_item += nodes

            print("all_page_item count=%d" % len(all_page_item))
            print("---search ok---%s \n" % weburl, end='')
            return True
        except Exception as e:
            print("%s [list_this] retry %d <%s>:%s\n" % (weburl, i, type(e), e), end='')




def search_this(name):

    while True:
        try:
            url = "https://www.soxunlei.com/search/%s/page/%d.html" % (urllib.request.quote(name), 1)
            request = urllib.request.Request(url=url, headers=headers)
            # fetch the first result page
            response = urllib.request.urlopen(request, timeout=max_timeout_down)

            data = response.read()

            # decode the response body
            data = data.decode('utf-8')

            tree = etree.HTML(data)

            # total number of hits, read from the breadcrumb bar
            count = int(tree.xpath(u"//ol[@class='breadcrumb']/li[4]/font[2]")[0].text)
            page_count = (count + 14) // 15  # 15 items per result page, rounded up
            print("search for %s found %d results across %d pages" % (name, count, page_count))




            # build the URL of every result page and collect them in page_nodes
            page_nodes = []
            for n in range(1, page_count + 1):
                url = "https://www.soxunlei.com/search/%s/page/%d.html" % (urllib.request.quote(name), n)
                page_nodes.append(url)
                print("url=%s" % url)

            break
        except Exception as e:
            print("%s:%s" % (type(e), e))
    # thread-pool pass 1: scan every result page for item detail links
    pool = ThreadPool(32)
    pool.map(list_this, page_nodes)
    pool.close()
    pool.join()

    # thread-pool pass 2: fetch every item page and extract its download link
    pool = ThreadPool(32)
    pool.map(download_this, all_page_item)
    pool.close()
    pool.join()
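
    # multiprocessing.dummy.Pool exposes the multiprocessing API backed by
    # threads, which suits this I/O-bound work. A stdlib alternative (a
    # sketch, not part of the original) would be:
    #   from concurrent.futures import ThreadPoolExecutor
    #   with ThreadPoolExecutor(max_workers=32) as ex:
    #       list(ex.map(download_this, all_page_item))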


    
    print(allresult)
    print("crawl finished: %d items reported, %d results collected" % (count, allresult_count))
if __name__ == "__main__":
    search_this("上原空");
