AlphaFold3 的 generate_chain_data_cache 脚本位于源代码的 scripts 文件夹下。该脚本是一个从指定目录中批量解析 mmCIF/PDB 文件的工具,它将每条链的基本信息(序列、分辨率、聚类大小等)提取并写入 JSON 文件,主要用于后续蛋白质建模、数据过滤或训练数据准备。
源代码:
# Standard library.
import argparse
import json
import logging
import os
import string
import sys
from collections import defaultdict
from functools import partial
from multiprocessing import Pool

# Third-party.
from tqdm import tqdm

# NOTE: this path hack must run BEFORE the project-local imports below —
# in the original ordering it ran after them, so it could not help
# resolve `src` when launched from a directory where `src` is not on
# sys.path.
sys.path.append("../../../Downloads") # an innocent hack to get this to run from the top level

# Project-local.
from src.common import protein, residue_constants
from src.data.mmcif_parsing import parse
def parse_file(
f,
args,
chain_cluster_size_dict
):
file_id, ext = os.path.splitext(f)
if ext == ".cif":
with open(os.path.join(args.data_dir, f), "r") as fp:
mmcif_string = fp.read()
mmcif = parse(file_id=file_id, mmcif_string=mmcif_string)
if mmcif.mmcif_object is None:
logging.info(f"Could not parse {f}. Skipping...")
return {}
else:
mmcif = mmcif.mmcif_object
out = {}
for chain_id, seq in mmcif.chain_to_seqres.items():
full_name = "_".join([file_id, chain_id])
out[full_name] = {}
local_data = out[full_name]
local_data["release_date"] = mmcif.header["release_date"]
local_data["seq"] = seq
local_data["resolution"] = mmcif.header["resolution"]
if chain_cluster_size_dict is not None:
cluster_size = chain_cluster_size_dict.get(
full_name.upper(), -1
)
local_data["cluster_size"] = cluster_size
elif ext == ".pdb":
with open(os.path.join(args.data_dir, f), "r") as fp:
pdb_string = fp.read()
protein_object = protein.from_pdb_string(pdb_string, None)
aatype = pr