Background:
For our machine translation project, the monolingual corpus is roughly 20 GB of plain text. A file that size cannot be read into memory in one go when the DataLoader loads it, which leads to the problem this post solves: efficient random access into a very large corpus.
Solution:
data_dealing.py:
import os
import sys

root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_dir)

import pickle

from tqdm import tqdm

from CODES.CONFIG import *         # provides datas_dir
from CODES.UTILS import wordcount  # project helper that returns the line count


def data_dealing():
    """Scan the corpus once and record the byte offset of every line."""
    pos = 0
    file_pos_list = []
    file_path = datas_dir / "stc_weibo_train_post"
    file_length = int(wordcount(file_path))
    # Read in binary mode so the offsets match the raw bytes on disk; text
    # mode would translate "\r\n" to "\n" and silently skew every later offset.
    with open(file_path, "rb") as fr, tqdm(total=file_length) as pb:
        for line in fr:
            file_pos_list.append(pos)
            pos += len(line)
            pb.update(1)
    with open(datas_dir / "big_file_seek_list.pkl", "wb") as fw:
        pickle.dump(file_pos_list, fw)


if __name__ == "__main__":
    data_dealing()
The whole trick is to record the byte offset of every line, so that any line can later be located with a single seek! The standalone sketch below shows the idea in isolation.
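To make the offset idea concrete before wiring it into a Dataset, here is a minimal, self-contained sketch; it uses a throwaway temp file instead of the real corpus, and all names in it are illustrative:

import os
import tempfile

# Write a tiny UTF-8 file, index the byte offset of every line, then jump
# straight to the third line without reading the two lines before it.
with tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False) as fw:
    fw.write("第一行\n第二行\n第三行\n".encode("utf-8"))
    path = fw.name

offsets, pos = [], 0
with open(path, "rb") as fr:
    for line in fr:
        offsets.append(pos)
        pos += len(line)

with open(path, "r", encoding="utf-8") as fr:
    fr.seek(offsets[2])           # jump directly to line index 2
    print(fr.readline().strip())  # -> 第三行

os.remove(path)

Seeking a text-mode file to an arbitrary integer is only safe here because every recorded offset falls on the start of a line, i.e. on a valid UTF-8 character boundary.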
BigFileModuleDataset.py:
import pickle

from torch.utils.data import Dataset


class BigFileModuleDataset(Dataset):
    """Serves one line of a huge text file per __getitem__ via seek()."""

    def __init__(self, seek_file_path, source_file_path):
        # Byte offset of every line, produced by data_dealing.py.
        with open(seek_file_path, "rb") as f:
            self.big_file_seek_pos_list = pickle.load(f)
        self.source_file_path = source_file_path
        self.length = len(self.big_file_seek_pos_list)
        # Keep a single handle open instead of reopening the file per item.
        self.fr = open(self.source_file_path, "r", encoding="utf-8")

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        # Jump to the start of the requested line and read just that line.
        self.fr.seek(self.big_file_seek_pos_list[index])
        return self.fr.readline()
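One caveat the class above does not cover: the single shared file handle is not safe with DataLoader(num_workers > 0), because several worker processes would then seek() and readline() on the same underlying file and race each other. A common fix is to open the file lazily inside each worker. The variant below is my sketch, not part of the original post (the class name is made up), and it assumes the dataset is not indexed in the parent process before the workers start:

import pickle

from torch.utils.data import Dataset


class WorkerSafeBigFileDataset(Dataset):
    """Like BigFileModuleDataset, but each worker opens its own handle."""

    def __init__(self, seek_file_path, source_file_path):
        with open(seek_file_path, "rb") as f:
            self.offsets = pickle.load(f)
        self.source_file_path = source_file_path
        self.fr = None  # opened lazily, once per worker process

    def __len__(self):
        return len(self.offsets)

    def __getitem__(self, index):
        if self.fr is None:
            # First access in this process: open a private handle.
            self.fr = open(self.source_file_path, "r", encoding="utf-8")
        self.fr.seek(self.offsets[index])
        return self.fr.readline()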
__test_BigFileModuleDataset__.py:
import os
import sys

root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_dir)

from CODES.CONFIG import *  # provides datas_dir
from CODES.BigFileModuleDataset import BigFileModuleDataset


def __test_BigFileModuleDataset__():
    train_datasets = BigFileModuleDataset(
        seek_file_path=datas_dir / "big_file_seek_list.pkl",
        source_file_path=datas_dir / "stc_weibo_train_post",
    )
    # Random access: pull lines 10-19 out of the 20 GB corpus instantly.
    for idx in range(10, 20):
        print(train_datasets[idx].strip())


if __name__ == "__main__":
    __test_BigFileModuleDataset__()
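Since the whole point is to feed a DataLoader, here is how the dataset might be plugged in. This is a sketch, not code from the original post; batch_size, shuffle, and the list-style collate_fn are illustrative choices, and tokenization would normally replace the bare print:

from torch.utils.data import DataLoader

train_datasets = BigFileModuleDataset(
    seek_file_path=datas_dir / "big_file_seek_list.pkl",
    source_file_path=datas_dir / "stc_weibo_train_post",
)

# collate_fn=list keeps each batch as a list of raw strings instead of
# trying to stack them into tensors. num_workers=0 because this dataset
# shares one file handle (see the worker-safe variant above for more).
loader = DataLoader(train_datasets, batch_size=32, shuffle=True,
                    num_workers=0, collate_fn=list)

for batch in loader:
    print(batch[:2])  # first two of 32 raw corpus lines
    break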
Running the test prints the corpus lines at indices 10 through 19 (output not reproduced here).
Code download:
https://download.csdn.net/download/wtl1992/89533717?spm=1001.2014.3001.5503