目录
- 测试说明
- 第一种方法:按块读,一次读8kb
- 第二种方法:按行读,一次读一行
- 第三种方法:多线程并行读取
- 完整示例
测试说明
测试文件:共 100 万行,每行包含两个以逗号分隔的小数。读取时,每行的第一个数作为 x、第二个数作为 y,分别存入两个 vector 数组。
第一种方法:按块读,一次读8kb
// Chunked read: pull the file in fixed-size blocks and split "x,y" records by hand.
const size_t BUFFER_SIZE = 8 * 1024; // 8 KB read buffer
std::vector<double> x_large; // first column of every record read by readLargeFile
std::vector<double> y_large; // second column of every record read by readLargeFile

/// @brief Parses a text file of "x,y" lines by reading it in BUFFER_SIZE-byte blocks.
///
/// Every value followed by ',' is appended to x_large; the value that closes a
/// record (terminated by '\n' or by end of file) is appended to y_large.
/// The vectors are not cleared first, so repeated calls accumulate.
/// Lines that contain no comma (for example blank lines) are skipped.
///
/// @param filePath Path of the file to read; it is opened in binary mode.
/// @throws std::invalid_argument, std::out_of_range Propagated from std::stod
///         when a field is not a valid number.
void readLargeFile(const std::string& filePath)
{
    std::ifstream file(filePath, std::ios::binary);
    if (!file)
    {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }

    std::string token;      // characters of the number currently being assembled
    bool awaitingY = false; // true once the x of the current record has been stored
    std::vector<char> buffer(BUFFER_SIZE);

    // read() reports failure for the final, partially filled block, so the loop
    // also continues while gcount() says that some bytes were delivered.
    while (file.read(buffer.data(), static_cast<std::streamsize>(buffer.size())) || file.gcount() > 0)
    {
        const size_t received = static_cast<size_t>(file.gcount());
        for (size_t i = 0; i < received; ++i)
        {
            const char c = buffer[i];
            if (c == ',')
            {
                x_large.emplace_back(std::stod(token));
                token.clear();
                awaitingY = true;
            }
            else if (c == '\n')
            {
                // A newline with no preceding comma is a blank/incomplete line: drop it.
                if (awaitingY)
                    y_large.emplace_back(std::stod(token));
                token.clear();
                awaitingY = false;
            }
            else
                token += c;
        }
    }

    if (awaitingY && !token.empty())
    {
        // The last record has no trailing newline but is still a complete pair.
        y_large.emplace_back(std::stod(token));
    }
    else if (!token.empty())
    {
        // Trailing text that never formed a record: report it instead of guessing.
        std::cout << token << std::endl;
    }
}
测试:这种方法耗时约 1800 ms。
// Take the start time from a monotonic wall clock. clock() reports processor
// time, which is not the elapsed time a reader of the result expects.
const auto start = std::chrono::steady_clock::now();
readLargeFile("output.txt"); // author's measurements (ms): 1835 1839 1810 1765 1871
// Take the end time
const auto end = std::chrono::steady_clock::now();
// Express the elapsed interval in (fractional) milliseconds
const double elapsedTime = std::chrono::duration<double, std::milli>(end - start).count();
std::cout << "Function execution time: " << elapsedTime << " milliseconds" << std::endl;
第二种方法:按行读,一次读一行
// Line-by-line read: std::getline followed by a split at the comma.
std::vector<double> x_normal; // first column of every record read by ReadFileNormal
std::vector<double> y_normal; // second column of every record read by ReadFileNormal

/// @brief Parses a text file of "x,y" lines one line at a time.
///
/// For each line the text before the first ',' is appended to x_normal and the
/// text after it to y_normal. The vectors are not cleared first, so repeated
/// calls accumulate. Lines without a comma (for example blank lines) are skipped.
///
/// @param filePath Path of the file to read; it is opened in text mode.
/// @throws std::invalid_argument, std::out_of_range Propagated from std::stod
///         when a field is not a valid number.
void ReadFileNormal(const std::string& filePath)
{
    std::ifstream file(filePath);
    if (!file)
    {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }

    std::string line;
    while (std::getline(file, line))
    {
        // size_type rather than int, so that npos is detected reliably.
        const std::string::size_type comma = line.find(',');
        if (comma == std::string::npos)
            continue; // blank or malformed line: nothing to split

        // Parse both fields before storing either, so the vectors stay the same length.
        const double xValue = std::stod(line.substr(0, comma));
        const double yValue = std::stod(line.substr(comma + 1));
        x_normal.emplace_back(xValue);
        y_normal.emplace_back(yValue);
    }
}
测试:这种方法耗时约 4800 ms。
// Take the start time from a monotonic wall clock. clock() reports processor
// time, which is not the elapsed time a reader of the result expects.
const auto start = std::chrono::steady_clock::now();
ReadFileNormal("output.txt"); // author's measurements (ms): 4878 4772 4821
// Take the end time
const auto end = std::chrono::steady_clock::now();
// Express the elapsed interval in (fractional) milliseconds
const double elapsedTime = std::chrono::duration<double, std::milli>(end - start).count();
std::cout << "Function execution time: " << elapsedTime << " milliseconds" << std::endl;
第三种方法:多线程并行读取
// Multi-threaded read: the file is cut into NUM_THREADS byte ranges parsed in parallel.
const int NUM_THREADS = 4; // number of worker threads; tune to the file size
std::vector<std::vector<double>> x(NUM_THREADS); // x[i] = first column found by part i
std::vector<std::vector<double>> y(NUM_THREADS); // y[i] = second column found by part i
std::mutex mtx; // serialises publication of per-part results into x / y

/// @brief Parses the "x,y" lines that belong to one byte range of a file.
///
/// A line belongs to the part in whose range [start, start + size) its first
/// byte lies. The range may therefore begin or end in the middle of a line:
/// a partial line at the front is skipped (the preceding part owns it), and
/// the last owned line is read to its end even when that lies beyond the range.
/// Lines without a comma are skipped. The parsed columns replace x[_idx] and
/// y[_idx].
///
/// @param _idx     Slot in x / y to store the result in; must be a valid index.
/// @param filePath Path of the file to read; it is opened in binary mode.
/// @param start    Byte offset at which this part's range begins.
/// @param size     Length of the range in bytes; nothing is parsed when <= 0.
/// @throws std::invalid_argument, std::out_of_range Propagated from std::stod
///         when a field is not a valid number.
void readFilePart(const int& _idx, const std::string& filePath, long long start, long long size)
{
    std::ifstream file(filePath, std::ios::binary);
    if (!file)
    {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }

    std::vector<double> partX;
    std::vector<double> partY;

    if (start >= 0 && size > 0)
    {
        const long long end = start + size; // one past the last byte of the range
        // Begin one byte early (when possible): if that byte is '\n', a line
        // starts exactly at `start`; otherwise we are inside a foreign line.
        long long offset = (start > 0) ? start - 1 : 0;
        bool skipping = (start > 0); // discarding bytes up to the next line start
        bool awaitingY = false;      // x of the current record already stored
        bool finished = false;       // the next line start lies outside the range
        std::string token;
        std::vector<char> chunk(64 * 1024);

        file.seekg(offset);
        // read() fails on a short final chunk, so also honour gcount().
        while (!finished &&
               (file.read(chunk.data(), static_cast<std::streamsize>(chunk.size())) || file.gcount() > 0))
        {
            const std::streamsize received = file.gcount();
            for (std::streamsize i = 0; i < received && !finished; ++i, ++offset)
            {
                const char c = chunk[static_cast<size_t>(i)];
                if (c == '\n')
                {
                    if (awaitingY)
                        partY.emplace_back(std::stod(token));
                    token.clear();
                    awaitingY = false;
                    skipping = false;
                    // The following line starts at offset + 1; stop if it is not ours.
                    finished = (offset + 1 >= end);
                }
                else if (skipping)
                    continue;
                else if (c == ',')
                {
                    partX.emplace_back(std::stod(token));
                    token.clear();
                    awaitingY = true;
                }
                else
                    token += c;
            }
        }

        // End of file reached inside an owned record that has no trailing newline.
        if (awaitingY && !token.empty())
            partY.emplace_back(std::stod(token));
    }

    std::lock_guard<std::mutex> lock(mtx); // released automatically at scope exit
    // Store under the part's own index so results stay in file order.
    x[_idx] = std::move(partX);
    y[_idx] = std::move(partY);
}
/// @brief Reads a file of "x,y" lines with NUM_THREADS threads running readFilePart.
///
/// The file is divided into NUM_THREADS consecutive byte ranges of roughly equal
/// size. Every interior cut point is moved forward to the start of the next line,
/// so no range begins or ends in the middle of a record. Part i is handed to
/// readFilePart with index i, and the call returns after all threads have joined.
///
/// @param filePath Path of the file to read.
void ReadFileThread(const std::string& filePath)
{
    // Opening with std::ios::ate puts the read position at the end, so tellg() is the size.
    std::ifstream file(filePath, std::ios::ate | std::ios::binary);
    if (!file)
    {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }
    const long long fileSize = file.tellg();
    if (fileSize < 0)
    {
        std::cerr << "Failed to determine file size" << std::endl;
        return;
    }

    // cuts[i] .. cuts[i + 1] is the byte range of part i.
    const long long partSize = fileSize / NUM_THREADS;
    std::vector<long long> cuts(NUM_THREADS + 1, fileSize);
    cuts[0] = 0;
    for (int i = 1; i < NUM_THREADS; ++i)
    {
        long long cut = i * partSize;
        if (cut > 0 && cut < fileSize)
        {
            // Scan from the byte before the nominal cut to the next '\n';
            // the line start right after it becomes the real cut point.
            long long probe = cut - 1;
            cut = fileSize; // used when no further newline exists
            file.clear();
            file.seekg(probe);
            char c = '\0';
            while (file.get(c))
            {
                if (c == '\n')
                {
                    cut = probe + 1;
                    break;
                }
                ++probe;
            }
        }
        cuts[i] = cut;
    }
    file.close();

    std::vector<std::thread> threads;
    threads.reserve(NUM_THREADS);
    for (int i = 0; i < NUM_THREADS; ++i)
        threads.emplace_back(readFilePart, i, filePath, cuts[i], cuts[i + 1] - cuts[i]);

    for (auto& worker : threads)
        worker.join();
}
测试:开启 4 个线程并行读取,耗时约 580 ms。
// Take the start time from a monotonic wall clock. clock() reports processor
// time, which on POSIX systems adds up the CPU time of every thread and would
// therefore misreport the duration of a multi-threaded run.
const auto start = std::chrono::steady_clock::now();
ReadFileThread("output.txt"); // author's measurements (ms): 584 553
// Take the end time
const auto end = std::chrono::steady_clock::now();
// Express the elapsed interval in (fractional) milliseconds
const double elapsedTime = std::chrono::duration<double, std::milli>(end - start).count();
std::cout << "Function execution time: " << elapsedTime << " milliseconds" << std::endl;
完整示例
#include <chrono>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <mutex>
#include <random>
#include <string>
#include <thread>
#include <utility>
#include <vector>
/// @brief Returns a pseudo-random double drawn uniformly from the range 0 to 10.
///
/// The engine is seeded from std::random_device once per thread and then reused;
/// constructing and seeding a new engine on every call is slow.
double GenerateRandomDouble()
{
    thread_local std::mt19937 gen(std::random_device{}());
    thread_local std::uniform_real_distribution<double> dis(0.0, 10.0);
    return dis(gen);
}
/// @brief Writes 1,000,000 lines of test data to "output.txt".
///
/// Each line has the form "x,y", where both values come from
/// GenerateRandomDouble and are formatted with std::to_string. The file is
/// opened in append mode: it is created when missing, and an existing file
/// grows by another 1,000,000 lines on every call.
void WriteFile()
{
    const std::string filePath = "output.txt"; // destination of the test data

    std::ofstream outputFile(filePath, std::ios::app);
    if (!outputFile.is_open())
    {
        std::cerr << "无法打开文件进行写入。" << std::endl;
        return;
    }

    for (int i = 0; i < 1000000; i++)
    {
        const double xValue = GenerateRandomDouble();
        const double yValue = GenerateRandomDouble();
        // '\n' instead of std::endl: same bytes, without a flush per line.
        outputFile << std::to_string(xValue) << ',' << std::to_string(yValue) << '\n';
    }
    // The stream is flushed and closed by its destructor.
}
// Chunked read: pull the file in fixed-size blocks and split "x,y" records by hand.
const size_t BUFFER_SIZE = 8 * 1024; // 8 KB read buffer
std::vector<double> x_large; // first column of every record read by readLargeFile
std::vector<double> y_large; // second column of every record read by readLargeFile

/// @brief Parses a text file of "x,y" lines by reading it in BUFFER_SIZE-byte blocks.
///
/// Every value followed by ',' is appended to x_large; the value that closes a
/// record (terminated by '\n' or by end of file) is appended to y_large.
/// The vectors are not cleared first, so repeated calls accumulate.
/// Lines that contain no comma (for example blank lines) are skipped.
///
/// @param filePath Path of the file to read; it is opened in binary mode.
/// @throws std::invalid_argument, std::out_of_range Propagated from std::stod
///         when a field is not a valid number.
void readLargeFile(const std::string& filePath)
{
    std::ifstream file(filePath, std::ios::binary);
    if (!file)
    {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }

    std::string token;      // characters of the number currently being assembled
    bool awaitingY = false; // true once the x of the current record has been stored
    std::vector<char> buffer(BUFFER_SIZE);

    // read() reports failure for the final, partially filled block, so the loop
    // also continues while gcount() says that some bytes were delivered.
    while (file.read(buffer.data(), static_cast<std::streamsize>(buffer.size())) || file.gcount() > 0)
    {
        const size_t received = static_cast<size_t>(file.gcount());
        for (size_t i = 0; i < received; ++i)
        {
            const char c = buffer[i];
            if (c == ',')
            {
                x_large.emplace_back(std::stod(token));
                token.clear();
                awaitingY = true;
            }
            else if (c == '\n')
            {
                // A newline with no preceding comma is a blank/incomplete line: drop it.
                if (awaitingY)
                    y_large.emplace_back(std::stod(token));
                token.clear();
                awaitingY = false;
            }
            else
                token += c;
        }
    }

    if (awaitingY && !token.empty())
    {
        // The last record has no trailing newline but is still a complete pair.
        y_large.emplace_back(std::stod(token));
    }
    else if (!token.empty())
    {
        // Trailing text that never formed a record: report it instead of guessing.
        std::cout << token << std::endl;
    }
}
// Line-by-line read: std::getline followed by a split at the comma.
std::vector<double> x_normal; // first column of every record read by ReadFileNormal
std::vector<double> y_normal; // second column of every record read by ReadFileNormal

/// @brief Parses a text file of "x,y" lines one line at a time.
///
/// For each line the text before the first ',' is appended to x_normal and the
/// text after it to y_normal. The vectors are not cleared first, so repeated
/// calls accumulate. Lines without a comma (for example blank lines) are skipped.
///
/// @param filePath Path of the file to read; it is opened in text mode.
/// @throws std::invalid_argument, std::out_of_range Propagated from std::stod
///         when a field is not a valid number.
void ReadFileNormal(const std::string& filePath)
{
    std::ifstream file(filePath);
    if (!file)
    {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }

    std::string line;
    while (std::getline(file, line))
    {
        // size_type rather than int, so that npos is detected reliably.
        const std::string::size_type comma = line.find(',');
        if (comma == std::string::npos)
            continue; // blank or malformed line: nothing to split

        // Parse both fields before storing either, so the vectors stay the same length.
        const double xValue = std::stod(line.substr(0, comma));
        const double yValue = std::stod(line.substr(comma + 1));
        x_normal.emplace_back(xValue);
        y_normal.emplace_back(yValue);
    }
}
// Multi-threaded read: the file is cut into NUM_THREADS byte ranges parsed in parallel.
const int NUM_THREADS = 4; // number of worker threads; tune to the file size
std::vector<std::vector<double>> x(NUM_THREADS); // x[i] = first column found by part i
std::vector<std::vector<double>> y(NUM_THREADS); // y[i] = second column found by part i
std::mutex mtx; // serialises publication of per-part results into x / y

/// @brief Parses the "x,y" lines that belong to one byte range of a file.
///
/// A line belongs to the part in whose range [start, start + size) its first
/// byte lies. The range may therefore begin or end in the middle of a line:
/// a partial line at the front is skipped (the preceding part owns it), and
/// the last owned line is read to its end even when that lies beyond the range.
/// Lines without a comma are skipped. The parsed columns replace x[_idx] and
/// y[_idx].
///
/// @param _idx     Slot in x / y to store the result in; must be a valid index.
/// @param filePath Path of the file to read; it is opened in binary mode.
/// @param start    Byte offset at which this part's range begins.
/// @param size     Length of the range in bytes; nothing is parsed when <= 0.
/// @throws std::invalid_argument, std::out_of_range Propagated from std::stod
///         when a field is not a valid number.
void readFilePart(const int& _idx, const std::string& filePath, long long start, long long size)
{
    std::ifstream file(filePath, std::ios::binary);
    if (!file)
    {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }

    std::vector<double> partX;
    std::vector<double> partY;

    if (start >= 0 && size > 0)
    {
        const long long end = start + size; // one past the last byte of the range
        // Begin one byte early (when possible): if that byte is '\n', a line
        // starts exactly at `start`; otherwise we are inside a foreign line.
        long long offset = (start > 0) ? start - 1 : 0;
        bool skipping = (start > 0); // discarding bytes up to the next line start
        bool awaitingY = false;      // x of the current record already stored
        bool finished = false;       // the next line start lies outside the range
        std::string token;
        std::vector<char> chunk(64 * 1024);

        file.seekg(offset);
        // read() fails on a short final chunk, so also honour gcount().
        while (!finished &&
               (file.read(chunk.data(), static_cast<std::streamsize>(chunk.size())) || file.gcount() > 0))
        {
            const std::streamsize received = file.gcount();
            for (std::streamsize i = 0; i < received && !finished; ++i, ++offset)
            {
                const char c = chunk[static_cast<size_t>(i)];
                if (c == '\n')
                {
                    if (awaitingY)
                        partY.emplace_back(std::stod(token));
                    token.clear();
                    awaitingY = false;
                    skipping = false;
                    // The following line starts at offset + 1; stop if it is not ours.
                    finished = (offset + 1 >= end);
                }
                else if (skipping)
                    continue;
                else if (c == ',')
                {
                    partX.emplace_back(std::stod(token));
                    token.clear();
                    awaitingY = true;
                }
                else
                    token += c;
            }
        }

        // End of file reached inside an owned record that has no trailing newline.
        if (awaitingY && !token.empty())
            partY.emplace_back(std::stod(token));
    }

    std::lock_guard<std::mutex> lock(mtx); // released automatically at scope exit
    // Store under the part's own index so results stay in file order.
    x[_idx] = std::move(partX);
    y[_idx] = std::move(partY);
}
/// @brief Reads a file of "x,y" lines with NUM_THREADS threads running readFilePart.
///
/// The file is divided into NUM_THREADS consecutive byte ranges of roughly equal
/// size. Every interior cut point is moved forward to the start of the next line,
/// so no range begins or ends in the middle of a record. Part i is handed to
/// readFilePart with index i, and the call returns after all threads have joined.
///
/// @param filePath Path of the file to read.
void ReadFileThread(const std::string& filePath)
{
    // Opening with std::ios::ate puts the read position at the end, so tellg() is the size.
    std::ifstream file(filePath, std::ios::ate | std::ios::binary);
    if (!file)
    {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }
    const long long fileSize = file.tellg();
    if (fileSize < 0)
    {
        std::cerr << "Failed to determine file size" << std::endl;
        return;
    }

    // cuts[i] .. cuts[i + 1] is the byte range of part i.
    const long long partSize = fileSize / NUM_THREADS;
    std::vector<long long> cuts(NUM_THREADS + 1, fileSize);
    cuts[0] = 0;
    for (int i = 1; i < NUM_THREADS; ++i)
    {
        long long cut = i * partSize;
        if (cut > 0 && cut < fileSize)
        {
            // Scan from the byte before the nominal cut to the next '\n';
            // the line start right after it becomes the real cut point.
            long long probe = cut - 1;
            cut = fileSize; // used when no further newline exists
            file.clear();
            file.seekg(probe);
            char c = '\0';
            while (file.get(c))
            {
                if (c == '\n')
                {
                    cut = probe + 1;
                    break;
                }
                ++probe;
            }
        }
        cuts[i] = cut;
    }
    file.close();

    std::vector<std::thread> threads;
    threads.reserve(NUM_THREADS);
    for (int i = 0; i < NUM_THREADS; ++i)
        threads.emplace_back(readFilePart, i, filePath, cuts[i], cuts[i + 1] - cuts[i]);

    for (auto& worker : threads)
        worker.join();
}
/// @brief Benchmark driver: times one of the readers on "output.txt" and prints
///        the elapsed time in milliseconds.
///
/// Uncomment WriteFile() once to generate the test data, and switch the active
/// reader call to compare the three approaches.
int main()
{
    // steady_clock measures elapsed wall time. clock() reports processor time,
    // which on POSIX systems adds up the CPU time of every thread and would
    // misreport the duration of the multi-threaded reader.
    const auto start = std::chrono::steady_clock::now();
    //WriteFile();
    //readLargeFile("output.txt"); // author's measurements (ms): 1835 1839 1810 1765 1871
    //ReadFileNormal("output.txt"); // author's measurements (ms): 4878 4772 4821
    ReadFileThread("output.txt"); // author's measurements (ms): 584 553
    const auto end = std::chrono::steady_clock::now();

    // Express the elapsed interval in (fractional) milliseconds
    const double elapsedTime = std::chrono::duration<double, std::milli>(end - start).count();
    std::cout << "Function execution time: " << elapsedTime << " milliseconds" << std::endl;

#ifdef _WIN32
    // "pause" is a Windows shell command; it keeps the console window open.
    std::system("pause");
#endif
    return 0;
}