java的爬虫(java爬虫步骤)

java的爬虫(java爬虫步骤):package com.chao.crawler; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import com.chao.util.ListUtil; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft……

package com.chao.crawler;

import java.util.ArrayList;

import java.util.HashSet;

import java.util.List;

import java.util.Set;

import com.chao.util.ListUtil;

import us.codecraft.webmagic.Page;

import us.codecraft.webmagic.Site;

import us.codecraft.webmagic.Spider;

import us.codecraft.webmagic.pipeline.ConsolePipeline;

import us.codecraft.webmagic.processor.PageProcessor;

public class PageProcesserProduct implements PageProcessor {

// private Site site = null;

//

// public PageProcesser(String domain, String startUrl) {

//

// site = Site.me().setDomain(domain).addStartUrl(startUrl);

//

// }

//调试用

private Site site = Site.me().setDomain("http://www.babysittersnow.com")

.addStartUrl("http://www.babysittersnow.com.au/babysitters/search");

@Override

public void process(Page page) {

//System.out.println(page.getUrl());

String Title= page.getHtml().xpath("//div[@class='profile-panel-main']/h1").toString().replaceAll("<[^>]*>", "");;

page.putField("Title",Title);

String Info=page.getHtml().xpath("//div[@class='profile-panel-details']").toString().replaceAll("<[^>]*>", "");;

page.putField("Info",Info);

String Review=page.getHtml().xpath("//div[@class='review']/p").toString();

page.putField("Review",Review);

String Introduction=page.getHtml().xpath("//div[@id='profile-tab-introduction']").toString().replaceAll("<[^>]*>", "");;

page.putField("Introduction",Introduction);

String Details=page.getHtml().xpath("//div[@id='profile-tab-details']").toString().replaceAll("<[^>]*>", "");;

page.putField("Details",Details);

String Insights=page.getHtml().xpath("//div[@id='profile-tab-insights']").toString().replaceAll("<[^>]*>", "");;

page.putField("Insights",Insights);

System.out.println("商品筛选完毕,准备执行存储");

// page.putField("author", page.getHtml().$("div.Resume").toString());

// page.putField("info", page.getHtml().xpath("//p[@class='profile-panel-details']/p/label/text()").toString());

Product product = new Product();

product.setTitle(Title);

product.setInfo(Info);

product.setReview(Review);

product.setIntroduction(Introduction);

product.setDetails(Details);

product.setInsights(Insights);

page.putField("product", product);

System.out.println("----------------------------------------------------");

}

@Override

public Site getSite() {

return site;

}

public static void main(String[] args) {

Spider.create(new PageProcesserProduct())

.pipeline(new ConsolePipeline()).thread(10).run();

}

编程小号
上一篇 2025-08-20 19:30
下一篇 2025-12-05 20:11

相关推荐

版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容,请发送邮件举报,一经查实,本站将立刻删除。
如需转载请保留出处:https://bianchenghao.cn/bian-cheng-ri-ji/27111.html