使用libcurl的简单爬虫

Aki 发布于 2023-11-03 328 次阅读


#include <atomic>
#include <fstream>
#include <iostream>
#include <mutex>
#include <random>
#include <string>
#include <thread>
#include <vector>

#include <curl/curl.h>
using namespace std;

// Address that every download request is sent to.
const string url{"https://www.dmoe.cc/random.php"};

// Number of downloader threads.
const int _n1{2};

// Number of downloads performed by each thread.
const int _n2{10};

// Atomic counter of downloads started so far.
atomic<int> _n3{0};

// Apply the request options shared by every transfer to an easy handle.
//   curl - easy handle to configure
//   url  - address the handle will request
void set_basic_curl(CURL* curl, const string& url) noexcept
{
	// Root-certificate bundle. libcurl uses it during the TLS handshake to
	// check that the server certificate chains back to a trusted CA.
	const char* const ca_bundle = "cacert.pem";

	// Browser-style User-Agent. Servers often reject requests they recognise
	// as coming from a crawler, so the request presents itself as a browser.
	const char* const user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36";

	// Value sent in the Referer header.
	const char* const referer = "https://www.baidu.com";

	// Target address.
	curl_easy_setopt(curl, CURLOPT_URL, url.c_str());

	// Server identity verification for https connections.
	curl_easy_setopt(curl, CURLOPT_CAINFO, ca_bundle);

	// Request headers.
	curl_easy_setopt(curl, CURLOPT_USERAGENT, user_agent);
	curl_easy_setopt(curl, CURLOPT_REFERER, referer);

	// Speak HTTP/1.1.
	curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);

	// Follow redirects: when the server answers with a redirect status
	// (such as 301 or 302), issue a new request to the address it names.
	curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
}

// libcurl write callback that streams received bytes into a std::fstream.
//   ptr   - start of the received data
//   size  - size of one element
//   nmemb - number of elements
//   des   - the fstream* registered with CURLOPT_WRITEDATA (may be null)
// Returns the number of bytes consumed. Returning a different count than
// size * nmemb tells libcurl to abort the transfer with a write error.
size_t write_file(char* ptr, size_t size, size_t nmemb, void* des) noexcept
{
	const size_t total = size * nmemb;
	fstream* file = static_cast<fstream*>(des);
	if (!file)
	{
		// No destination registered: the data is discarded and reported as consumed.
		return total;
	}

	file->write(ptr, static_cast<streamsize>(total));

	// Report a failed write (disk full, file not open, ...) instead of
	// pretending the chunk was stored.
	return file->good() ? total : 0;
}

//保存图片线程入口函数
void save_img() noexcept
{
	//循环爬取
	for (int j = 0; j < _n2; ++j)
	{
		//开始爬取图片
		CURL* curl = curl_easy_init();
		if (curl)
		{
			set_basic_curl(curl, url);
			string path = "C:\\Users\\laijian\\Desktop\\img\\" + to_string(_n3++) + ".jpg";
			fstream file(path, ios::binary | ios::out);
			if (file.is_open())
			{
				curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_file);
				curl_easy_setopt(curl, CURLOPT_WRITEDATA, &file);
				CURLcode res = curl_easy_perform(curl);
				if (res == CURLE_OK)
				{
					cout << url << " > " << path << " \033[1;32msuccessfully !!!\033[0m" << endl;
				}
			}
			curl_easy_cleanup(curl);
		}
	}
}

int main()
{

	//初始化curl库
	curl_global_init(CURL_GLOBAL_DEFAULT);
	cout << "\033[31mStart !!!\033[0m" << endl;

	vector<thread> threads;
	for (int i = 0; i < _n1; ++i)
	{
		threads.push_back(thread(save_img));
	}
	for (auto& i : threads)
	{
		i.join();
	}

	// 清理libcurl全局环境
	curl_global_cleanup();
	cout << "\033[31mEnd !!!\033[0m" << endl;

	return 0;
}
#include <iostream>
#include <string>
#include <thread>
#include <mutex>
#include <vector>
#include <condition_variable>
#include <cstring>
#include <sys/types.h>
#include <sys/stat.h>
#include <fstream>
#include <random>
#include <curl/curl.h>
#include <atomic>
#include <regex>
using namespace std;

// Running index used to number the saved files.
int i{0};

// libcurl callback that appends the received bytes to a std::string.
//   ptr  - start of the received data
//   size - size of one element
//   nmeb - number of elements
//   des  - the string* registered as callback data (data is dropped when null)
// Always returns size * nmeb.
size_t write_str(char* ptr, size_t size, size_t nmeb, void* des)
{
	const size_t byte_count = size * nmeb;
	if (des != nullptr)
	{
		static_cast<string*>(des)->append(ptr, byte_count);
	}
	return byte_count;
}

// libcurl write callback that streams received bytes into a std::fstream.
//   ptr  - start of the received data
//   size - size of one element
//   nmeb - number of elements
//   des  - the fstream* registered with CURLOPT_WRITEDATA (may be null)
// Returns the number of bytes consumed. Returning a different count than
// size * nmeb tells libcurl to abort the transfer with a write error.
size_t write_file(char* ptr, size_t size, size_t nmeb, void* des)
{
	const size_t total = size * nmeb;
	fstream* out = static_cast<fstream*>(des);
	if (!out)
	{
		// No destination registered: the data is discarded and reported as consumed.
		return total;
	}

	out->write(ptr, static_cast<streamsize>(total));

	// Report a failed write (disk full, file not open, ...) instead of
	// pretending the chunk was stored.
	return out->good() ? total : 0;
}

// Apply the request options shared by every transfer to an easy handle.
//   curl - easy handle to configure
//   url  - address the handle will request
void set_basic_curl(CURL* curl, const string& url) noexcept
{
	const char* const ca_bundle = "cacert.pem";
	const char* const user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36";
	const char* const referer = "https://www.baidu.com";

	// Target address.
	curl_easy_setopt(curl, CURLOPT_URL, url.c_str());

	// CA bundle used to verify the server certificate on https connections.
	curl_easy_setopt(curl, CURLOPT_CAINFO, ca_bundle);

	// Browser-style identification headers.
	curl_easy_setopt(curl, CURLOPT_USERAGENT, user_agent);
	curl_easy_setopt(curl, CURLOPT_REFERER, referer);

	// Follow HTTP redirects.
	curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
}

int main()
{
	std::cout << "\033[32m";
	//瑟图网站url
	string url = "https://sex.nyan.xyz/api/v2/?num=10&tag=fate";

	curl_global_init(CURL_GLOBAL_DEFAULT);

	regex rule("https://sex.nyan.xyz/.*?\.((png)|(jpg))");
	smatch results;

	CURL* curl = curl_easy_init();

	if (curl)
	{
		string s;
		set_basic_curl(curl, url);
		curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, write_str);
		curl_easy_setopt(curl, CURLOPT_HEADERDATA, &s);
		curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_str);
		curl_easy_setopt(curl, CURLOPT_WRITEDATA, &s);
		CURLcode res = curl_easy_perform(curl);
		if (res == CURLE_OK)
		{
			for (sregex_iterator it(s.begin(), s.end(), rule), end; it != end; ++it)
			{
				string path = "D:\\setu\\" + to_string(i++);
				if (it->str()[it->str().size() - 3] == 'p')
				{
					path += ".png";
				}
				else
				{
					path += ".jpg";
				}
				
				fstream file(path,ios::binary | ios::out);
				if (file.is_open())
				{
					CURL* curl2 = curl_easy_init();
					if (curl2)
					{
						set_basic_curl(curl2, it->str());
						curl_easy_setopt(curl2, CURLOPT_WRITEFUNCTION, write_file);
						curl_easy_setopt(curl2, CURLOPT_WRITEDATA, &file);
						CURLcode res2 = curl_easy_perform(curl2);
						if (res2 == CURLE_OK)
						{
							cout << it->str() << " > " << path << endl;
						}
						curl_easy_cleanup(curl2);
					}
					file.close();
				}
			}
		}
		curl_easy_cleanup(curl);
	}

	curl_global_cleanup();
	cout << "\033[0m";

	return 0;
}
g++ main.cpp -o main -w -std=c++2a -O2 -lpthread -lssl -lcrypto -lcurl