package com.chao.crawler;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.chao.util.ListUtil;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
public class PageProcesserProduct implements PageProcessor {
// private Site site = null;
//
// public PageProcesser(String domain, String startUrl) {
//
// site = Site.me().setDomain(domain).addStartUrl(startUrl);
//
// }
// Hard-coded crawl settings, for debugging.
// The domain is the bare host name of the start URL (no scheme), so both refer to the same site.
private final Site site = Site.me().setDomain("www.babysittersnow.com.au")
        .addStartUrl("http://www.babysittersnow.com.au/babysitters/search");
/**
 * Extracts the profile sections of a page and publishes them as result fields.
 *
 * <p>Each section is selected with an XPath expression. Every section except
 * {@code Review} has its HTML tags stripped. The values are stored under the keys
 * {@code Title}, {@code Info}, {@code Review}, {@code Introduction}, {@code Details}
 * and {@code Insights}, and are also bundled into a {@link Product} stored under
 * the key {@code product}.
 *
 * <p>If a selector yields no text (its {@code toString()} is {@code null}), the
 * corresponding value stays {@code null} instead of triggering a
 * {@link NullPointerException} in the tag-stripping step.
 *
 * @param page the page to extract the fields from
 */
@Override
public void process(Page page) {
    String title = stripTags(extract(page, "//div[@class='profile-panel-main']/h1"));
    page.putField("Title", title);

    String info = stripTags(extract(page, "//div[@class='profile-panel-details']"));
    page.putField("Info", info);

    // The review is intentionally kept as raw markup; tags are not stripped here.
    String review = extract(page, "//div[@class='review']/p");
    page.putField("Review", review);

    String introduction = stripTags(extract(page, "//div[@id='profile-tab-introduction']"));
    page.putField("Introduction", introduction);

    String details = stripTags(extract(page, "//div[@id='profile-tab-details']"));
    page.putField("Details", details);

    String insights = stripTags(extract(page, "//div[@id='profile-tab-insights']"));
    page.putField("Insights", insights);

    System.out.println("商品筛选完毕,准备执行存储");

    // Bundle the same values into one object so a pipeline can handle them together.
    Product product = new Product();
    product.setTitle(title);
    product.setInfo(info);
    product.setReview(review);
    product.setIntroduction(introduction);
    product.setDetails(details);
    product.setInsights(insights);
    page.putField("product", product);

    System.out.println("----------------------------------------------------");
}

/** Returns the string form of the XPath selection on the page's HTML; may be {@code null}. */
private static String extract(Page page, String xpath) {
    return page.getHtml().xpath(xpath).toString();
}

/** Removes everything that looks like an HTML tag; passes {@code null} through unchanged. */
private static String stripTags(String html) {
    if (html == null) {
        return null;
    }
    return html.replaceAll("<[^>]*>", "");
}
/**
 * Returns the crawl configuration held by this processor.
 *
 * @return the {@link Site} stored in the {@code site} field
 */
@Override
public Site getSite() {
    return this.site;
}
/**
 * Launches a spider that uses this processor, prints results through a
 * {@link ConsolePipeline}, and runs with ten threads.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final int threadCount = 10;
    PageProcessor processor = new PageProcesserProduct();
    ConsolePipeline consoleOutput = new ConsolePipeline();

    Spider.create(processor)
            .pipeline(consoleOutput)
            .thread(threadCount)
            .run();
}
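// Copyright notice: this content was contributed voluntarily by internet users, and the views expressed are solely those of the author. This site only provides information storage space; it holds no ownership and assumes no related legal liability. If you find content on this site suspected of infringement or of violating laws or regulations, please report it by email; once verified, it will be removed immediately.
// When reposting, please keep the source: https://bianchenghao.cn/bian-cheng-ri-ji/27111.html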