C++读取大文件三种方法速度比较

测试说明

测试文件：100万行，每一行是两个小数，中间用逗号隔开，读取时每一行的第一个数作为x，第二个数作为y组成两个vector数组。

（原文此处有一张插图，图片未能随文保留。）

第一种方法：按块读，一次读20KB（代码中 BUFFER_SIZE = 20480 字节）

// Chunked reader: pulls the file in fixed-size blocks and splits them into numbers.
const size_t BUFFER_SIZE = 20480;  // Bytes requested per read() call (20 KB).

// Parsed columns: x_large[i] and y_large[i] come from the same input line.
std::vector<double> x_large;
std::vector<double> y_large;

// Reads a text file of "x,y" lines in BUFFER_SIZE-byte blocks and appends the
// parsed values to the globals x_large and y_large.
//
// - The vectors are appended to, never cleared, so repeated calls accumulate.
// - A field may straddle two blocks; its characters are carried over in `token`.
// - Lines without a comma (for example blank lines) are skipped so that both
//   vectors stay index-aligned.
// - A last line that is not terminated by '\n' is still parsed.
//
// filePath: path of the file to read; it is opened in binary mode.
// Throws whatever std::stod throws when a field is not a valid number.
void readLargeFile(const std::string& filePath)
{
    std::ifstream file(filePath, std::ios::binary);

    if (!file)
    {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }

    std::string token;      // Characters of the field currently being assembled.
    bool pendingX = false;  // True once a line's x is stored and its y is still due.
    std::vector<char> buffer(BUFFER_SIZE);

    // read() reports failure on the short final block, so gcount() is consulted
    // as well: the tail is then handled by the same loop as every full block.
    while (file.read(buffer.data(), static_cast<std::streamsize>(buffer.size())) || file.gcount() > 0)
    {
        const std::streamsize received = file.gcount();
        for (std::streamsize i = 0; i < received; ++i)
        {
            const char c = buffer[static_cast<size_t>(i)];
            if (c == ',')
            {
                x_large.emplace_back(std::stod(token));
                token.clear();
                pendingX = true;
            }
            else if (c == '\n')
            {
                // Only a line that produced an x contributes a y.
                if (pendingX)
                {
                    y_large.emplace_back(std::stod(token));
                    pendingX = false;
                }
                token.clear();
            }
            else
                token += c;
        }
    }

    // The file may end without a newline; the buffered text is then the last y.
    if (pendingX)
        y_large.emplace_back(std::stod(token));
}

测试，这种方法大概需要1800ms左右时间

// Take the start timestamp. steady_clock measures elapsed (wall) time, whereas
// clock() is specified to report processor time, which leaves out time spent
// blocked on file I/O.
const auto start = std::chrono::steady_clock::now();

readLargeFile("output.txt");    // observed runs (ms): 1835 1839 1810 1765 1871

// Take the end timestamp.
const auto end = std::chrono::steady_clock::now();

// Convert the interval to fractional milliseconds.
const double elapsedTime = std::chrono::duration<double, std::milli>(end - start).count();

std::cout << "Function execution time: " << elapsedTime << " milliseconds" << std::endl;

第二种方法：按行读，一次读一行

// Line-by-line reader.
// Parsed columns: x_normal[i] and y_normal[i] come from the same input line.
std::vector<double> x_normal;
std::vector<double> y_normal;

// Reads a text file of "x,y" lines with std::getline and appends the parsed
// values to the globals x_normal and y_normal.
//
// - The vectors are appended to, never cleared, so repeated calls accumulate.
// - Lines without a comma (for example blank lines) are skipped so that both
//   vectors stay index-aligned.
//
// filePath: path of the file to read.
// Throws whatever std::stod throws when a field is not a valid number.
void ReadFileNormal(const std::string& filePath)
{
    std::ifstream file(filePath);

    if (!file)
    {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }

    std::string line;
    while (std::getline(file, line))
    {
        // find() returns std::string::npos when there is no comma; keeping the
        // result in its own unsigned type (not int) lets that case be detected.
        const std::string::size_type pos = line.find(',');
        if (pos == std::string::npos)
            continue;

        const double x = std::stod(line.substr(0, pos));
        const double y = std::stod(line.substr(pos + 1));  // Everything after the comma.

        x_normal.emplace_back(x);
        y_normal.emplace_back(y);
    }
}

测试，大概需要4800ms左右的时间

// Take the start timestamp. steady_clock measures elapsed (wall) time, whereas
// clock() is specified to report processor time, which leaves out time spent
// blocked on file I/O.
const auto start = std::chrono::steady_clock::now();

ReadFileNormal("output.txt"); // observed runs (ms): 4878 4772 4821

// Take the end timestamp.
const auto end = std::chrono::steady_clock::now();

// Convert the interval to fractional milliseconds.
const double elapsedTime = std::chrono::duration<double, std::milli>(end - start).count();

std::cout << "Function execution time: " << elapsedTime << " milliseconds" << std::endl;

第三种方法：多线程并行读取

// Multi-threaded reader.
const int NUM_THREADS = 4;  // Number of worker threads; may be tuned to the file size.

// Per-worker results: x[i] / y[i] hold the columns parsed by worker i.
std::vector<std::vector<double>> x(NUM_THREADS);
std::vector<std::vector<double>> y(NUM_THREADS);

std::mutex mtx;  // Held while a worker stores its results into x and y.

// Worker body: parses every "x,y" line that BEGINS inside the byte range
// [start, start + size) of the file and stores the columns in x[_idx] / y[_idx].
//
// Byte ranges rarely coincide with line breaks, so ownership is decided by
// where a line starts:
//   - if the range opens in the middle of a line, the rest of that line is
//     skipped (it belongs to the worker whose range contains its first byte);
//   - if the range closes in the middle of a line, reading continues past the
//     range until that line's '\n' (or end of file).
// With contiguous, non-overlapping ranges every line is thus parsed exactly once.
//
// Lines without a comma are skipped so x[_idx] and y[_idx] stay index-aligned.
//
// _idx:     slot of x / y that receives the result; must be a valid index.
// filePath: file to read; opened in binary mode by this worker.
// start:    byte offset of the first byte of the range.
// size:     length of the range in bytes.
// Throws whatever std::stod throws when a field is not a valid number.
void readFilePart(const int& _idx,const std::string& filePath, long long start, long long size)
{
    std::ifstream file(filePath, std::ios::binary);
    if (!file)
    {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }

    // Look at the byte just before the range: unless it is '\n', the range
    // opens inside a line that another worker owns. After this the stream is
    // positioned at `start`.
    bool skipping = false;
    if (start > 0)
    {
        file.seekg(start - 1);
        char previous = '\n';
        if (file.get(previous))
            skipping = (previous != '\n');
    }

    std::vector<double> x1;
    std::vector<double> y1;

    std::string token;      // Characters of the field currently being assembled.
    bool pendingX = false;  // True once a line's x is stored and its y is still due.
    bool midLine = false;   // True while the last consumed character was not '\n'.

    const auto feed = [&](char c)
    {
        if (c == ',')
        {
            x1.emplace_back(std::stod(token));
            token.clear();
            pendingX = true;
        }
        else if (c == '\n')
        {
            // Only a line that produced an x contributes a y.
            if (pendingX)
            {
                y1.emplace_back(std::stod(token));
                pendingX = false;
            }
            token.clear();
        }
        else
            token += c;

        midLine = (c != '\n');
    };

    if (size > 0)
    {
        std::vector<char> buffer(static_cast<size_t>(size));
        file.read(buffer.data(), static_cast<std::streamsize>(size));

        // Only gcount() bytes are valid; the read is short if the range runs past EOF.
        const std::streamsize received = file.gcount();
        for (std::streamsize i = 0; i < received; ++i)
        {
            const char c = buffer[static_cast<size_t>(i)];
            if (skipping)
            {
                skipping = (c != '\n');
                continue;
            }
            feed(c);
        }
    }

    // The range ended inside a line this worker owns: finish that line.
    if (midLine)
    {
        char c;
        while (midLine && file.get(c))
            feed(c);

        // End of file reached without a closing '\n': the buffered text is the last y.
        if (midLine && pendingX)
            y1.emplace_back(std::stod(token));
    }

    std::lock_guard<std::mutex> lock(mtx);  // Released automatically at scope exit.
    // Hand the buffers over without copying them.
    x[_idx].swap(x1);
    y[_idx].swap(y1);
}

// Splits the file into NUM_THREADS consecutive byte ranges and runs one
// readFilePart worker per range, returning after all workers have finished.
//
// Ranges are cut at fixed byte offsets, not at line breaks; dealing with a line
// that straddles two ranges is left to readFilePart.
//
// filePath: file to read; each worker opens it again on its own.
void ReadFileThread(const std::string& filePath)
{
    // Opening at the end makes tellg() report the file size.
    std::ifstream file(filePath, std::ios::ate | std::ios::binary);

    if (!file)
    {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }

    const long long fileSize = file.tellg();
    file.close();

    // tellg() signals failure with -1; a negative size must not reach the
    // workers, where it would be used as a buffer length.
    if (fileSize < 0)
    {
        std::cerr << "Failed to determine file size" << std::endl;
        return;
    }

    const long long partSize = fileSize / NUM_THREADS;
    std::vector<std::thread> threads;
    threads.reserve(NUM_THREADS);

    for (int i = 0; i < NUM_THREADS; ++i)
    {
        const long long start = i * partSize;
        // The last range also takes the remainder of the integer division.
        const long long size = (i == NUM_THREADS - 1) ? fileSize - start : partSize;
        threads.emplace_back(readFilePart, i, filePath, start, size);
    }

    for (auto& thread : threads)
        thread.join();
}

测试，开启4个线程并行读取，所需时间大概是580ms左右

// Take the start timestamp. steady_clock measures elapsed (wall) time; clock()
// is specified to report processor time, which on POSIX systems adds up the
// CPU time of all worker threads and so misstates a multi-threaded run.
const auto start = std::chrono::steady_clock::now();

ReadFileThread("output.txt"); // observed runs (ms): 584 553

// Take the end timestamp.
const auto end = std::chrono::steady_clock::now();

// Convert the interval to fractional milliseconds.
const double elapsedTime = std::chrono::duration<double, std::milli>(end - start).count();

std::cout << "Function execution time: " << elapsedTime << " milliseconds" << std::endl;

完整示例

#include <chrono>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <mutex>
#include <random>
#include <string>
#include <thread>
#include <vector>

// Returns a pseudo-random double drawn uniformly from the range 0 to 10.
double GenerateRandomDouble()
{
    // One engine per thread, seeded from std::random_device on first use.
    // Building and seeding a fresh engine on every call is slow and adds
    // nothing to the quality of the numbers.
    thread_local std::mt19937 gen(std::random_device{}());

    std::uniform_real_distribution<double> dis(0.0, 10.0);
    return dis(gen);
}

// Generates the benchmark input file: lineCount lines of the form "x,y", where
// x and y come from GenerateRandomDouble() and are formatted by std::to_string.
//
// filePath:  file to create; defaults to "output.txt".
// lineCount: number of lines to write; defaults to one million.
//
// An existing file is replaced, so running the generator again yields a file
// of exactly lineCount lines instead of one that keeps growing.
void WriteFile(const std::string& filePath = "output.txt", int lineCount = 1000000)
{
    std::ofstream outputFile(filePath, std::ios::out | std::ios::trunc);

    if (!outputFile.is_open())
    {
        std::cerr << "无法打开文件进行写入。" << std::endl;
        return;
    }

    for (int i = 0; i < lineCount; ++i)
    {
        const double x = GenerateRandomDouble();
        const double y = GenerateRandomDouble();

        // '\n' rather than std::endl: std::endl would flush the stream after
        // every single line.
        outputFile << std::to_string(x) << ',' << std::to_string(y) << '\n';
    }

    // The stream is flushed and closed when outputFile goes out of scope.
}

// Chunked reader: pulls the file in fixed-size blocks and splits them into numbers.
const size_t BUFFER_SIZE = 20480;  // Bytes requested per read() call (20 KB).

// Parsed columns: x_large[i] and y_large[i] come from the same input line.
std::vector<double> x_large;
std::vector<double> y_large;

// Reads a text file of "x,y" lines in BUFFER_SIZE-byte blocks and appends the
// parsed values to the globals x_large and y_large.
//
// - The vectors are appended to, never cleared, so repeated calls accumulate.
// - A field may straddle two blocks; its characters are carried over in `token`.
// - Lines without a comma (for example blank lines) are skipped so that both
//   vectors stay index-aligned.
// - A last line that is not terminated by '\n' is still parsed.
//
// filePath: path of the file to read; it is opened in binary mode.
// Throws whatever std::stod throws when a field is not a valid number.
void readLargeFile(const std::string& filePath)
{
    std::ifstream file(filePath, std::ios::binary);

    if (!file)
    {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }

    std::string token;      // Characters of the field currently being assembled.
    bool pendingX = false;  // True once a line's x is stored and its y is still due.
    std::vector<char> buffer(BUFFER_SIZE);

    // read() reports failure on the short final block, so gcount() is consulted
    // as well: the tail is then handled by the same loop as every full block.
    while (file.read(buffer.data(), static_cast<std::streamsize>(buffer.size())) || file.gcount() > 0)
    {
        const std::streamsize received = file.gcount();
        for (std::streamsize i = 0; i < received; ++i)
        {
            const char c = buffer[static_cast<size_t>(i)];
            if (c == ',')
            {
                x_large.emplace_back(std::stod(token));
                token.clear();
                pendingX = true;
            }
            else if (c == '\n')
            {
                // Only a line that produced an x contributes a y.
                if (pendingX)
                {
                    y_large.emplace_back(std::stod(token));
                    pendingX = false;
                }
                token.clear();
            }
            else
                token += c;
        }
    }

    // The file may end without a newline; the buffered text is then the last y.
    if (pendingX)
        y_large.emplace_back(std::stod(token));
}

// Line-by-line reader.
// Parsed columns: x_normal[i] and y_normal[i] come from the same input line.
std::vector<double> x_normal;
std::vector<double> y_normal;

// Reads a text file of "x,y" lines with std::getline and appends the parsed
// values to the globals x_normal and y_normal.
//
// - The vectors are appended to, never cleared, so repeated calls accumulate.
// - Lines without a comma (for example blank lines) are skipped so that both
//   vectors stay index-aligned.
//
// filePath: path of the file to read.
// Throws whatever std::stod throws when a field is not a valid number.
void ReadFileNormal(const std::string& filePath)
{
    std::ifstream file(filePath);

    if (!file)
    {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }

    std::string line;
    while (std::getline(file, line))
    {
        // find() returns std::string::npos when there is no comma; keeping the
        // result in its own unsigned type (not int) lets that case be detected.
        const std::string::size_type pos = line.find(',');
        if (pos == std::string::npos)
            continue;

        const double x = std::stod(line.substr(0, pos));
        const double y = std::stod(line.substr(pos + 1));  // Everything after the comma.

        x_normal.emplace_back(x);
        y_normal.emplace_back(y);
    }
}

// Multi-threaded reader.
const int NUM_THREADS = 4;  // Number of worker threads; may be tuned to the file size.

// Per-worker results: x[i] / y[i] hold the columns parsed by worker i.
std::vector<std::vector<double>> x(NUM_THREADS);
std::vector<std::vector<double>> y(NUM_THREADS);

std::mutex mtx;  // Held while a worker stores its results into x and y.

// Worker body: parses every "x,y" line that BEGINS inside the byte range
// [start, start + size) of the file and stores the columns in x[_idx] / y[_idx].
//
// Byte ranges rarely coincide with line breaks, so ownership is decided by
// where a line starts:
//   - if the range opens in the middle of a line, the rest of that line is
//     skipped (it belongs to the worker whose range contains its first byte);
//   - if the range closes in the middle of a line, reading continues past the
//     range until that line's '\n' (or end of file).
// With contiguous, non-overlapping ranges every line is thus parsed exactly once.
//
// Lines without a comma are skipped so x[_idx] and y[_idx] stay index-aligned.
//
// _idx:     slot of x / y that receives the result; must be a valid index.
// filePath: file to read; opened in binary mode by this worker.
// start:    byte offset of the first byte of the range.
// size:     length of the range in bytes.
// Throws whatever std::stod throws when a field is not a valid number.
void readFilePart(const int& _idx,const std::string& filePath, long long start, long long size)
{
    std::ifstream file(filePath, std::ios::binary);
    if (!file)
    {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }

    // Look at the byte just before the range: unless it is '\n', the range
    // opens inside a line that another worker owns. After this the stream is
    // positioned at `start`.
    bool skipping = false;
    if (start > 0)
    {
        file.seekg(start - 1);
        char previous = '\n';
        if (file.get(previous))
            skipping = (previous != '\n');
    }

    std::vector<double> x1;
    std::vector<double> y1;

    std::string token;      // Characters of the field currently being assembled.
    bool pendingX = false;  // True once a line's x is stored and its y is still due.
    bool midLine = false;   // True while the last consumed character was not '\n'.

    const auto feed = [&](char c)
    {
        if (c == ',')
        {
            x1.emplace_back(std::stod(token));
            token.clear();
            pendingX = true;
        }
        else if (c == '\n')
        {
            // Only a line that produced an x contributes a y.
            if (pendingX)
            {
                y1.emplace_back(std::stod(token));
                pendingX = false;
            }
            token.clear();
        }
        else
            token += c;

        midLine = (c != '\n');
    };

    if (size > 0)
    {
        std::vector<char> buffer(static_cast<size_t>(size));
        file.read(buffer.data(), static_cast<std::streamsize>(size));

        // Only gcount() bytes are valid; the read is short if the range runs past EOF.
        const std::streamsize received = file.gcount();
        for (std::streamsize i = 0; i < received; ++i)
        {
            const char c = buffer[static_cast<size_t>(i)];
            if (skipping)
            {
                skipping = (c != '\n');
                continue;
            }
            feed(c);
        }
    }

    // The range ended inside a line this worker owns: finish that line.
    if (midLine)
    {
        char c;
        while (midLine && file.get(c))
            feed(c);

        // End of file reached without a closing '\n': the buffered text is the last y.
        if (midLine && pendingX)
            y1.emplace_back(std::stod(token));
    }

    std::lock_guard<std::mutex> lock(mtx);  // Released automatically at scope exit.
    // Hand the buffers over without copying them.
    x[_idx].swap(x1);
    y[_idx].swap(y1);
}

// Splits the file into NUM_THREADS consecutive byte ranges and runs one
// readFilePart worker per range, returning after all workers have finished.
//
// Ranges are cut at fixed byte offsets, not at line breaks; dealing with a line
// that straddles two ranges is left to readFilePart.
//
// filePath: file to read; each worker opens it again on its own.
void ReadFileThread(const std::string& filePath)
{
    // Opening at the end makes tellg() report the file size.
    std::ifstream file(filePath, std::ios::ate | std::ios::binary);

    if (!file)
    {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }

    const long long fileSize = file.tellg();
    file.close();

    // tellg() signals failure with -1; a negative size must not reach the
    // workers, where it would be used as a buffer length.
    if (fileSize < 0)
    {
        std::cerr << "Failed to determine file size" << std::endl;
        return;
    }

    const long long partSize = fileSize / NUM_THREADS;
    std::vector<std::thread> threads;
    threads.reserve(NUM_THREADS);

    for (int i = 0; i < NUM_THREADS; ++i)
    {
        const long long start = i * partSize;
        // The last range also takes the remainder of the integer division.
        const long long size = (i == NUM_THREADS - 1) ? fileSize - start : partSize;
        threads.emplace_back(readFilePart, i, filePath, start, size);
    }

    for (auto& thread : threads)
        thread.join();
}

// Benchmark driver: times one of the readers on "output.txt" and prints the
// elapsed time. Uncomment the call to be measured; WriteFile() regenerates the
// input file.
int main()
{
    // steady_clock measures elapsed (wall) time. clock() is specified to report
    // processor time, which on POSIX systems adds up the CPU time of all worker
    // threads and leaves out time spent blocked on file I/O.
    const auto start = std::chrono::steady_clock::now();

    //WriteFile();

    //readLargeFile("output.txt");    // observed runs (ms): 1835 1839 1810 1765 1871

    //ReadFileNormal("output.txt"); // observed runs (ms): 4878 4772 4821

    ReadFileThread("output.txt"); // observed runs (ms): 584 553

    const auto end = std::chrono::steady_clock::now();

    // Convert the interval to fractional milliseconds.
    const double elapsedTime = std::chrono::duration<double, std::milli>(end - start).count();

    std::cout << "Function execution time: " << elapsedTime << " milliseconds" << std::endl;

    // Runs the shell's "pause" command (a Windows console built-in) so the
    // window stays open until a key is pressed.
    std::system("pause");
    return 0;
}