jsoup爬虫代码

摘要:jsoup 爬虫代码示例。1、添加依赖(jsoup、commons-io、commons-lang);2、具体实现(读取关键词文件,逐行抓取百度搜索结果数量并写回文件)。

1、添加依赖

<!-- Maven dependencies for the crawler example; paste inside the <dependencies> element of pom.xml. -->
<!-- jsoup: HTML fetching and parsing; the only one of these three artifacts imported by the Java code in this article. -->
<dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>
        <!-- commons-io: not imported by the Java code shown in this article; presumably optional (confirm before removing). -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>
        <!-- commons-lang: not imported by the Java code shown in this article; presumably optional (confirm before removing). -->
        <dependency>
            <groupId>commons-lang</groupId>
            <artifactId>commons-lang</artifactId>
            <version>2.6</version>
        </dependency>

2、具体实现

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.jupiter.api.Test;
import org.springframework.boot.test.context.SpringBootTest;

import java.io.*;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Ad-hoc crawler packaged as a Spring Boot test.
 *
 * <p>Reads search keywords (one per line) from a text file, queries Baidu for each
 * keyword, extracts the result-count figure from the response page, and finally
 * overwrites the same file with the extracted figures (one per line).
 */
@SpringBootTest
class SpiderTests {

    /** Part of the Baidu search URL that precedes the search keyword. */
    private static final String SEARCH_URL_PREFIX =
            "https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=62095104_41_oem_dg&wd=";

    /** Fixed query parameters appended after the search keyword. */
    private static final String SEARCH_URL_SUFFIX =
            "&oq=%25E6%259E%2597%25E6%25BD%2598%25E6%25AD%25A6&rsv_pq=c04cd2e0002906e5&rsv_t=0a47IpZ7ERpQPcIxllylZwrPOhgeiYs2DLK5Zm%2BB%2Ffe7BOM5ioHXJqKjvKdDPm8KSHdlWt6w%2BkJ9&rqlang=cn&rsv_dl=tb&rsv_enter=0&rsv_btype=t&inputT=2336&rsv_sug3=1582&rsv_n=2&rsv_sug1=1407&rsv_sug7=100&rsv_sug4=2680";

    /** Value of the {@code class} attribute on the element holding the result-count text. */
    private static final String RESULT_COUNT_CLASS = "nums_text";

    /**
     * Number of leading characters dropped from the result-count text.
     * Presumably the fixed label in front of the number - confirm against the live page.
     */
    private static final int COUNT_PREFIX_LENGTH = 11;

    /**
     * Number of trailing characters dropped from the result-count text.
     * Presumably a unit suffix after the number - confirm against the live page.
     */
    private static final int COUNT_SUFFIX_LENGTH = 1;

    /**
     * Entry point: runs the crawl against a hard-coded keyword file.
     *
     * @throws IOException if the file cannot be read or written, or a page cannot be fetched or parsed
     */
    @Test
    void contextLoads() throws IOException {
        File folder = new File("C:\\Users\\86139\\Desktop\\姓名 - 副本.txt");
        getData(folder);
    }

    /**
     * Replaces every non-blank line of {@code txtFile} with the Baidu result count for that line.
     *
     * <p>All keywords are read and fetched first; the file is only reopened for writing once
     * every lookup has succeeded, so a failed lookup leaves the input file untouched. Failures
     * propagate to the caller instead of being printed and ignored, so the test fails visibly.
     *
     * @param txtFile file containing one search keyword per line; it is overwritten with the results
     * @throws IOException if reading, fetching, parsing or writing fails
     */
    private void getData(File txtFile) throws IOException {
        Map<String, String> headers = browserHeaders();
        List<String> resultCounts = new ArrayList<>();

        // Phase 1: read the keywords and look each one up. try-with-resources closes the reader
        // (even on failure) before the same file is reopened for writing.
        // The file is decoded with the platform default charset, exactly as before.
        try (BufferedReader reader =
                     new BufferedReader(new InputStreamReader(new FileInputStream(txtFile)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String keyword = line.trim();
                if (keyword.isEmpty()) {
                    // Blank or whitespace-only lines carry no keyword to search for.
                    continue;
                }
                resultCounts.add(fetchResultCount(keyword, headers));
            }
        }

        // Phase 2: overwrite the file with the results, separated by the platform line
        // separator and without a trailing newline. close() flushes the writer.
        try (BufferedWriter writer =
                     new BufferedWriter(new OutputStreamWriter(new FileOutputStream(txtFile)))) {
            writer.write(String.join(System.lineSeparator(), resultCounts));
        }
    }

    /**
     * Fetches the Baidu result page for {@code keyword} and returns its result-count figure.
     *
     * @param keyword search term; URL-encoded here so reserved characters cannot break the query
     * @param headers HTTP request headers to send
     * @return the result count with commas removed
     * @throws IOException if the request fails or the page lacks a usable result-count element
     */
    private static String fetchResultCount(String keyword, Map<String, String> headers)
            throws IOException {
        // The URL declares ie=utf-8, so the keyword is percent-encoded as UTF-8.
        String url = SEARCH_URL_PREFIX
                + URLEncoder.encode(keyword, StandardCharsets.UTF_8.name())
                + SEARCH_URL_SUFFIX;

        Document doc = Jsoup.connect(url).headers(headers).get();

        Element counter = doc.getElementsByAttributeValue("class", RESULT_COUNT_CLASS).first();
        if (counter == null) {
            // Without this check a missing element surfaced as a bare NullPointerException.
            throw new IOException("No element with class \"" + RESULT_COUNT_CLASS
                    + "\" in the response for keyword: " + keyword);
        }
        return extractCount(counter.text(), keyword);
    }

    /**
     * Cuts the number out of the result-count text by dropping the fixed-length prefix and
     * suffix and removing commas.
     *
     * @param text    text of the result-count element
     * @param keyword keyword being processed; used only in the error message
     * @return the digits between prefix and suffix, commas removed
     * @throws IOException if {@code text} is too short to contain prefix and suffix
     */
    private static String extractCount(String text, String keyword) throws IOException {
        int end = text.length() - COUNT_SUFFIX_LENGTH;
        if (end < COUNT_PREFIX_LENGTH) {
            throw new IOException("Unexpected result-count text \"" + text
                    + "\" for keyword: " + keyword);
        }
        return text.substring(COUNT_PREFIX_LENGTH, end).replace(",", "");
    }

    /**
     * Builds the browser-like request headers sent with every search request.
     *
     * @return a new map of header names to values
     */
    private static Map<String, String> browserHeaders() {
        Map<String, String> headers = new HashMap<>();
        headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        // Only gzip is advertised. The original also offered "deflate, br"; jsoup 1.10.2 is not
        // known to decode those, and an undecoded body cannot be parsed - revisit if jsoup is upgraded.
        headers.put("Accept-Encoding", "gzip");
        headers.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
        headers.put("Cache-Control", "max-age=0");
        headers.put("Connection", "keep-alive");
        headers.put("sec-ch-ua", "\" Not;A Brand\";v=\"99\", \"Google Chrome\";v=\"91\", \"Chromium\";v=\"91\"");
        headers.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
        return headers;
    }
}

版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
如需转载请保留出处:https://bianchenghao.cn/38606.html

(0)
编程小号

相关推荐

发表回复

您的电子邮箱地址不会被公开。 必填项已用*标注