基于Jsoup爬虫Demo

基于Jsoup爬虫Demo:今天写了一个爬虫跟大家分享一下,该爬虫为简单爬虫,后续会跟大家分享难一些的爬虫,话不多说,直接上代码。如果有疑问,可以直接评论。package com.analysis; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; …

 今天写了一个爬虫跟大家分享一下。该爬虫为简单爬虫,后续会跟大家分享难一些的爬虫。话不多说,直接上代码。如果有疑问,可以直接评论。

package com.analysis;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.dao.FriendLinkDao;


/**
 * Simple jsoup-based crawler for the navigation pages of www.shujuju.cn.
 *
 * <p>It collects the "more" links from the navigation index page, visits each
 * of them, extracts the channel name and the links listed under it, and
 * stores the result through {@link FriendLinkDao}.
 */
public class SnatchSHUJUJU {

	/**
	 * Downloads and parses the page at the given URL.
	 *
	 * @param url address of the page to fetch
	 * @return the parsed document, or {@code null} if the request failed
	 *         (the exception is printed to stderr)
	 */
	public static Document getDocument (String url){
		try {
			// 5000 is the connection timeout in milliseconds.
			return Jsoup.connect(url).timeout(5000).get();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return null;
	}

	/**
	 * Collects the URLs behind the "more" links of the navigation index page.
	 *
	 * @return the collected URLs; empty if the index page could not be fetched
	 */
	public static List<String> getEveryOtherUrl(){
		List<String> urlList = new ArrayList<>();
		String host = "http://www.shujuju.cn";
		String url = "http://www.shujuju.cn/navigation/navigationPage";
		Document document = getDocument(url);
		if (document == null) {
			// The fetch failed and was already reported by getDocument.
			return urlList;
		}
		Elements moreBlocks = document.select("[class=more fr]");
		Elements anchors = moreBlocks.select("a[href]");
		for (Element anchor : anchors) {
			String href = anchor.attr("href");
			if (href.isEmpty()) {
				continue;
			}
			// Only site-relative hrefs need the host prefix; prefixing an
			// already absolute href would produce an unusable URL.
			boolean absolute = href.startsWith("http://") || href.startsWith("https://");
			urlList.add(absolute ? href : host + href);
		}
		return urlList;
	}

	/**
	 * Visits every given page and extracts the links listed on it.
	 *
	 * <p>Each returned map holds the keys {@code channelName}, {@code linkUrl}
	 * and {@code name}. Pages that cannot be fetched, or that contain no
	 * {@code nav-sort-info} section, are skipped.
	 *
	 * @param list URLs of the pages to visit
	 * @return one map per link found, in page order
	 */
	public static List<Map> getDetailUrl(List<String> list){
		List<Map> mapList = new ArrayList<>();
		if (list == null) {
			return mapList;
		}
		for (String url : list) {
			Document document = getDocument(url);
			if (document == null) {
				continue;
			}
			Elements sections = document.select("[class=nav-sort-info]");
			if (sections.isEmpty()) {
				// Without this guard, get(0) below would throw on an unexpected page layout.
				continue;
			}
			// The channel name is taken from the first section's <h4>.
			String channelName = sections.get(0).select("h4").text();
			System.out.println("channelName:"+channelName);
			Elements links = sections.select("[class=nav-sort-body clearfix]").select("a");
			for (Element link : links) {
				String linkUrl = link.attr("href");
				String name = link.text();
				System.out.println("linkUrl:"+linkUrl);
				System.out.println("name:"+name);
				Map<String,String> map = new HashMap<>();
				map.put("channelName", channelName);
				map.put("linkUrl", linkUrl);
				map.put("name", name);
				mapList.add(map);
			}
		}
		return mapList;
	}

	/**
	 * Crawls the site and stores every link found, creating the channel row
	 * first when it does not exist yet.
	 *
	 * @param args unused
	 */
	// Raw Map is kept because getDetailUrl and FriendLinkDao.insertFriendLink expose raw types.
	@SuppressWarnings({"rawtypes", "unchecked"})
	public static void main(String[] args) {
		List<Map> list = getDetailUrl(getEveryOtherUrl());
		FriendLinkDao friendDao = new FriendLinkDao();
		for (Map map : list) {
			String channelName = map.get("channelName").toString();
			Integer channelId = friendDao.getChannelId(channelName);
			if (channelId == -1) {
				// Unknown channel: create it, then read back its id.
				friendDao.insertChannelName(channelName, 1);
				channelId = friendDao.getChannelId(channelName);
			}
			if (channelId == -1) {
				// The channel could not be created or looked up; do not store a
				// link that points at a non-existent channel.
				System.err.println("skip link, no channel id for: " + channelName);
				continue;
			}
			System.out.println("channelId: " + channelId);
			map.put("channelId", channelId);
			map.put("stat", "1");
			friendDao.insertFriendLink(map);
		}
	}
}
package com.dao;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.HashSet;
import java.util.Date;
import java.util.Map;
import java.util.Set;

import com.util.ConnectUtil;

/**
 * Data-access helper for the {@code t_zsff_friend_link_channel} and
 * {@code t_zsff_friend_link} tables.
 *
 * <p>Every statement is closed as soon as it has run; the connection itself
 * is left open for the lifetime of this object.
 */
public class FriendLinkDao {

	/** Connection used by every statement of this DAO, obtained from {@link ConnectUtil#getConn()}. */
	public Connection conn = ConnectUtil.getConn();

	/**
	 * Looks up the id of the channel with the given name.
	 *
	 * @param channelName channel name to search for
	 * @return the channel id (the last one if several rows match), or
	 *         {@code -1} if no row matches or the query fails
	 */
	public Integer getChannelId(String channelName) {
		Integer id = -1;
		String sql = "SELECT id FROM t_zsff_friend_link_channel WHERE channel_name = ?";
		// try-with-resources closes the statement and result set even when the query fails.
		try (PreparedStatement ptmt = conn.prepareStatement(sql)) {
			ptmt.setString(1, channelName);
			try (ResultSet rs = ptmt.executeQuery()) {
				while (rs.next()) {
					id = rs.getInt("id");
				}
			}
		} catch (SQLException e) {
			// Query failed: report it and fall through with the current id (-1 if no row was read).
			e.printStackTrace();
		}
		return id;
	}

	/**
	 * Inserts a new channel row. Failures are printed to stderr and otherwise ignored.
	 *
	 * @param channelName name of the channel
	 * @param pid value stored in the {@code pid} column
	 */
	public void insertChannelName(String channelName,Integer pid) {
		String sql = "INSERT INTO t_zsff_friend_link_channel (channel_name, pid) VALUES (?, ?)";
		try (PreparedStatement ptmt = conn.prepareStatement(sql)) {
			ptmt.setObject(1, channelName);
			ptmt.setObject(2, pid);
			ptmt.executeUpdate();
		} catch (Exception e) {
			// Best-effort insert: the caller is not interrupted by a failure.
			e.printStackTrace();
		}
	}

	/**
	 * Inserts a link row. Failures are printed to stderr and otherwise ignored.
	 *
	 * @param map values to store, read from the keys {@code name},
	 *            {@code channelId}, {@code linkUrl} and {@code stat}
	 */
	public void insertFriendLink(Map map) {
		String sql = "INSERT INTO t_zsff_friend_link (name, channel_id, link_url, stat) VALUES (?, ?, ?, ?)";
		try (PreparedStatement ptmt = conn.prepareStatement(sql)) {
			ptmt.setObject(1, map.get("name"));
			ptmt.setObject(2, map.get("channelId"));
			ptmt.setObject(3, map.get("linkUrl"));
			ptmt.setObject(4, map.get("stat"));
			ptmt.executeUpdate();
		} catch (Exception e) {
			// Best-effort insert: the caller is not interrupted by a failure.
			e.printStackTrace();
		}
	}

}

package com.util;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

/**
 * Holds the single JDBC connection shared by the application.
 *
 * <p>The connection is opened on first use and reused afterwards, so that
 * repeated calls to {@link #getConn()} do not leak connections and a
 * connection supplied through {@link #setConn(Connection)} is honoured.
 */
public class ConnectUtil {
    /** Shared connection: opened lazily by getConn() or supplied through setConn(). */
    private static Connection conn;

    /**
     * Returns the shared connection, opening it if there is none yet or the
     * current one has been closed.
     *
     * @return the shared connection; if connecting fails the error is printed
     *         and the previously held value is returned, which is
     *         {@code null} when no connection was ever established
     */
    public static Connection getConn() {
        try {
            // Reuse the current connection while it is still open.
            if (conn != null && !conn.isClosed()) {
                return conn;
            }
            // 1. Load the MySQL JDBC driver.
            Class.forName("com.mysql.jdbc.Driver");
            // 2. Location and name of the database.
            // NOTE(review): host 127.0.0.7 looks like a typo for 127.0.0.1 - confirm before changing.
            String url = "jdbc:mysql://127.0.0.7:3306/test?characterEncoding=UTF-8";
            // 3. Database user name and password.
            String username = "root";
            String password = "root";
            // 4. Open the connection through the driver manager.
            conn = DriverManager.getConnection(url,username,password);
        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
        }
        return conn;
    }

    /**
     * Replaces the shared connection, e.g. with one created elsewhere.
     *
     * @param conn1 connection to use from now on
     */
    public void setConn(Connection conn1) {
    	conn = conn1;
    }
	
}
<dependency>
  <!-- jsoup HTML parser library @ https://jsoup.org/ -->
  <groupId>org.jsoup</groupId>
  <artifactId>jsoup</artifactId>
  <version>1.13.1</version>
</dependency>

版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
如需转载请保留出处:https://bianchenghao.cn/38646.html

(0)
编程小号编程小号

相关推荐

发表回复

您的电子邮箱地址不会被公开。 必填项已用*标注