【爬虫】Java爬虫爬取某招聘网站招聘信息

前言

一、爬虫程序的基本架构

二、如何获取目标网站的页面内容

三、解析HTML页面，提取所需信息

四、代理IP的使用

五、完整代码

总结

前言

随着互联网的普及，越来越多的人开始关注网络上的招聘信息，而传统的求职方式愈发显得不够快捷、高效。爬虫技术，则能够帮助我们快速地获取互联网上的招聘信息，从而提高求职的效率。

本文介绍如何使用Java编写爬虫程序，以爬取某招聘网站的招聘信息为例，并采用代理IP提高爬取效率。文章包含以下几个部分：

1. 爬虫程序的基本架构
2. 如何获取目标网站的页面内容
3. 解析HTML页面，提取所需信息
4. 代理IP的使用
5. 完整代码

一、爬虫程序的基本架构

一个基本的爬虫程序通常由三个模块组成：获取页面、解析页面、存储数据。具体实现可以使用各种语言和库，这里我们使用Java，配合HttpClient（获取页面）和Jsoup（解析页面）两个库来实现爬虫程序。

二、如何获取目标网站的页面内容

获取页面的方法主要有两种：使用JDK自带的HttpURLConnection，或使用第三方库HttpClient。此处我们使用HttpClient。HttpClient是Apache HttpComponents项目（最初是Apache Jakarta Commons下的子项目）提供的开源Java HTTP客户端库。它不仅支持HTTP协议，还支持HTTPS协议，并且提供了一些扩展功能，例如自动重定向、SSL连接等。

下面是使用HttpClient获取网页内容的示例代码：

public String getHtml(String url) {
    String html = null;
    try {
        // 创建HttpClient对象
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // 创建HttpGet请求
        HttpGet httpGet = new HttpGet(url);
        // 执行HttpGet请求，获取HttpResponse响应
        CloseableHttpResponse httpResponse = httpClient.execute(httpGet);
        // 获取HttpEntity实例
        HttpEntity entity = httpResponse.getEntity();
        // 使用EntityUtils工具类将HttpEntity转换成字符串
        html = EntityUtils.toString(entity, "utf-8");
        // 关闭资源
        httpResponse.close();
        httpClient.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return html;
}

三、解析HTML页面，提取所需信息

解析HTML页面的方法通常有两种：使用正则表达式或使用HTML解析器。使用正则表达式可能会更加灵活，但是容易出错。所以，此处我们使用HTML解析器Jsoup。

Jsoup是Java的一个HTML解析器，它可以直接解析某个URL地址或一段HTML文本内容。它提供了类似于jQuery的选择器语法，再加上一些API操作，可以很灵活地进行HTML解析。

下面是使用Jsoup解析HTML页面并提取信息的示例代码：

public List<Map<String, Object>> parse(String html) {
    List<Map<String, Object>> dataList = new ArrayList<>();
    // 使用Jsoup解析HTML页面
    Document document = Jsoup.parse(html);
    // 获取招聘信息列表
    Elements jobElements = document.select(".newlist .jobList");
    for (Element job : jobElements) {
        Map<String, Object> data = new HashMap<>();
        // 获取招聘信息
        String jobTitle = job.select(".zwmc div a").first().text();
        String jobUrl = job.select(".zwmc div a").first().attr("href");
        String companyName = job.select(".gsmc a").first().text();
        String companyUrl = job.select(".gsmc a").first().attr("href");
        String jobCity = job.select(".gzdd").first().text();
        String jobSalary = job.select(".zwyx").first().text();
        String jobDate = job.select(".gxsj").first().text();
        // 将信息保存到Map里
        data.put("jobTitle", jobTitle);
        data.put("jobUrl", jobUrl);
        data.put("companyName", companyName);
        data.put("companyUrl", companyUrl);
        data.put("jobCity", jobCity);
        data.put("jobSalary", jobSalary);
        data.put("jobDate", jobDate);
        // 将Map添加到列表里
        dataList.add(data);
    }
    return dataList;
}

以上代码使用了CSS选择器来定位目标元素，大大简化了解析过程。

四、代理IP的使用

在爬虫过程中，我们需要频繁地向目标网站发送请求。如果每次请求都使用同一个IP地址，就很容易被目标网站识别并封锁，影响爬取效率。此时，代理IP是一个好的选择。

代理IP是指代理服务器上的IP地址，用来代替客户端发送请求和接收响应。代理IP可以隐藏客户端真实的IP地址，同时可以在一定程度上保护用户的隐私。

使用代理IP的方法也很简单。我们只需要在每次发送请求时，指定使用的代理IP即可。

下面是使用代理IP的示例代码：

public String getHtmlWithProxy(String url, String host, int port) {
    String html = null;
    try {
        // 创建HttpClient对象
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // 创建HttpGet请求
        HttpGet httpGet = new HttpGet(url);
        // 设置代理IP和端口
        HttpHost proxy = new HttpHost(host, port);
        RequestConfig config = RequestConfig.custom().setProxy(proxy).build();
        httpGet.setConfig(config);
        // 执行HttpGet请求，获取HttpResponse响应
        CloseableHttpResponse httpResponse = httpClient.execute(httpGet);
        // 获取HttpEntity实例
        HttpEntity entity = httpResponse.getEntity();
        // 使用EntityUtils工具类将HttpEntity转换成字符串
        html = EntityUtils.toString(entity, "utf-8");
        // 关闭资源
        httpResponse.close();
        httpClient.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return html;
}

五、完整代码

以下是完整的爬虫程序代码：

import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class JobSpider {
    private static final String ZHAOPIN_URL = "https://sou.zhaopin.com/jobs/searchresult.ashx";

    public static void main(String[] args) {
        String keyword = "Java";
        String city = "北京";
        int start = 0;
        int count = 60;

        JobSpider spider = new JobSpider();
        List<Map<String, Object>> dataList = spider.spiderJobInfo(keyword, city, start, count);
        System.out.println(dataList);
    }

    public List<Map<String, Object>> spiderJobInfo(String keyword, String city, int start, int count) {
        List<Map<String, Object>> dataList = new ArrayList<>();
        try {
            // 爬取数据
            for (int i = start; i < start + count; i += 60) {
                // 构造请求参数
                String url = String.format("%s?jl=%s&kw=%s&start=%d", ZHAOPIN_URL, city, keyword, i);
                // 获取页面HTML
                String html = getHtmlWithProxy(url, "127.0.0.1", 1080);
                // 解析HTML页面，提取信息
                List<Map<String, Object>> jobList = parse(html);
                // 将解析结果添加到结果集中
                dataList.addAll(jobList);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return dataList;
    }

    public String getHtml(String url) {
        String html = null;
        try {
            // 创建HttpClient对象
            CloseableHttpClient httpClient = HttpClients.createDefault();
            // 创建HttpGet请求
            HttpGet httpGet = new HttpGet(url);
            // 执行HttpGet请求，获取HttpResponse响应
            CloseableHttpResponse httpResponse = httpClient.execute(httpGet);
            // 获取HttpEntity实例
            HttpEntity entity = httpResponse.getEntity();
            // 使用EntityUtils工具类将HttpEntity转换

成字符串
            html = EntityUtils.toString(entity, "utf-8");
            // 关闭资源
            httpResponse.close();
            httpClient.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return html;
    }

    public String getHtmlWithProxy(String url, String host, int port) {
        String html = null;
        try {
            // 创建HttpClient对象
            CloseableHttpClient httpClient = HttpClients.createDefault();
            // 创建HttpGet请求
            HttpGet httpGet = new HttpGet(url);
            // 设置代理IP和端口
            HttpHost proxy = new HttpHost(host, port);
            RequestConfig config = RequestConfig.custom().setProxy(proxy).build();
            httpGet.setConfig(config);
            // 执行HttpGet请求，获取HttpResponse响应
            CloseableHttpResponse httpResponse = httpClient.execute(httpGet);
            // 获取HttpEntity实例
            HttpEntity entity = httpResponse.getEntity();
            // 使用EntityUtils工具类将HttpEntity转换成字符串
            html = EntityUtils.toString(entity, "utf-8");
            // 关闭资源
            httpResponse.close();
            httpClient.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return html;
    }

    public List<Map<String, Object>> parse(String html) {
        List<Map<String, Object>> dataList = new ArrayList<>();
        // 使用Jsoup解析HTML页面
        Document document = Jsoup.parse(html);
        // 获取招聘信息列表
        Elements jobElements = document.select(".newlist .jobList");
        for (Element job : jobElements) {
            Map<String, Object> data = new HashMap<>();
            // 获取招聘信息
            String jobTitle = job.select(".zwmc div a").first().text();
            String jobUrl = job.select(".zwmc div a").first().attr("href");
            String companyName = job.select(".gsmc a").first().text();
            String companyUrl = job.select(".gsmc a").first().attr("href");
            String jobCity = job.select(".gzdd").first().text();
            String jobSalary = job.select(".zwyx").first().text();
            String jobDate = job.select(".gxsj").first().text();
            // 将信息保存到Map里
            data.put("jobTitle", jobTitle);
            data.put("jobUrl", jobUrl);
            data.put("companyName", companyName);
            data.put("companyUrl", companyUrl);
            data.put("jobCity", jobCity);
            data.put("jobSalary", jobSalary);
            data.put("jobDate", jobDate);
            // 将Map添加到列表里
            dataList.add(data);
        }
        return dataList;
    }
}