PSP - 计算蛋白质复合物链间接触的残基与面积

news2025/4/11 0:32:11

欢迎关注我的CSDN：https://spike.blog.csdn.net/
本文地址：https://spike.blog.csdn.net/article/details/134884889

在蛋白质复合物中，通过链间距离，可以计算出在接触面的残基与接触面的面积，使用 BioPython 库与 SASA (Solvent Accessible Surface Area，溶剂可及表面积) 库。SASA 计算主要以水分子是否通过为基础，水分子的半径是1.4 A，直径是 2.8 A，所以以 2.8 A 的距离，判断链间是否连接。

相关函数：

获取蛋白质复合物的链列表，即 get_chain_ids()
获取残基-原子字典，注意原子名称与 PDB 的名称可能不同，即 get_atom_list()
获取全原子列表，即 get_all_atoms()
判断是否原子之间是否相连，即 is_contact()
获取全部的接触面，原子计算残基维度，即 get_contacts()
打印 ChimeraX 的显示命令，即 get_chimerax_cmd()
端到端的计算接触残基，即 cal_chain_interface_residue()
计算 SASA 面积，即 cal_sasa()
获取复合物多链的子结构，即 get_sub_structure()
端到端的计算接触面积，即 cal_chain_interface_area()

以 PDB 8j18 为例，计算链 A 与链 R 的接触残基与面积：

源码如下：

#!/usr/bin/env python
# -- coding: utf-8 --
"""
Copyright (c) 2022. All rights reserved.
Created by C. L. Wang on 2023/12/8
"""

import os

from Bio.PDB import NeighborSearch
from Bio.PDB import PDBParser, Selection
from Bio.PDB.Chain import Chain
from Bio.PDB.Model import Model
from Bio.PDB.Structure import Structure

from root_dir import DATA_DIR


class MainComplexChainDist(object):
    """
    复合物，多链距离计算
    """
    def __init__(self):
        pass

    @staticmethod
    def get_chain_ids(structure):
        chain_ids = []
        for chain in structure.get_chains():
            chain_ids.append(chain.id)
        return chain_ids

    @staticmethod
    def get_atom_list(structure, chains):
        """
        获取 atom list 列表
        注意: 输出的 atom 名字与 PDB 文件中的 atom 名字略有差异，例如
        8jtl 第1个，PRO: CG 对应 OG1，CD 对应 CG2
        8jtl 第2个，VAL: CG1 对应 CD1，CG2 对应 CD2
        """
        output = dict()
        for chain in structure:
            if chain.id in chains:
                for residue in chain.get_residues():
                    het_flag, res_seq, i_code = residue.get_id()
                    the_id = (chain.id + "_" + str(res_seq) + "_" + i_code).strip()
                    for atom in residue.get_unpacked_list():
                        if het_flag == ' ':
                            if the_id in output:
                                output[the_id].append(atom)
                            else:
                                output[the_id] = [atom]
        return output

    # Filter out all the atoms from the chain,residue map given by residue_map
    @staticmethod
    def get_all_atoms(residue_map):
        all_atoms_out = []
        for residue in residue_map:
            for atom in residue_map[residue]:
                all_atoms_out.append(atom)
                # Set the b-factor to zero for coloring by contacts
                atom.set_bfactor(0.0)
        return all_atoms_out

    @staticmethod
    def is_contact(res_1, other_atoms, cutoff):
        for atom in res_1:
            ns = NeighborSearch(other_atoms)
            center = atom.get_coord()
            neighbors = ns.search(center, cutoff)  # 5.0 for distance in angstrom
            residue_list = Selection.unfold_entities(neighbors, 'R')  # R for residues
            if len(residue_list) > 0:
                return True
        return False

    @classmethod
    def get_contacts(cls, struc, all_atoms, verbose, cutoff):
        progress = 0
        contacts = []
        for residue in struc:
            progress += 1
            # if len(verbose) > 0:
            #     print(verbose, progress, "out of", len(struc))
            atom_list = struc[residue]
            outcome = cls.is_contact(atom_list, all_atoms, cutoff)
            if outcome:
                contacts.append(residue)
        return contacts

    @staticmethod
    def get_chimerax_cmd(contacts, chain_id, pdb_id="2"):
        """
        计算命令
        """
        tmp_list = []
        for item in contacts:
            req_id = int(item.split("_")[1])
            tmp_list.append(req_id)
        contacts_gap = []
        prev, post = -1, -1
        for i in tmp_list:
            if prev == -1:
                prev = post = i
                continue
            if i - post == 1:
                post = i
            else:
                contacts_gap.append(f"#{pdb_id}/{chain_id}:{prev}-{post}")
                prev = post = i
        cmd = "select " + " ".join(contacts_gap)
        return cmd

    @classmethod
    def cal_chain_interface_residue(cls, structure, chain1, chain2, cutoff=7.5):
        """
        计算复合物结构的链内距离
        """

        chain_ids = cls.get_chain_ids(structure)
        # ['A', 'B', 'G', 'R', 'S']
        print(f"[Info] chain_ids: {chain_ids}")
        assert chain1 in chain_ids and chain2 in chain_ids

        # C 表示 chain，即将 structure 展开 chain 的形式
        # 同时支持 A, R, C, M, S
        structure_c = Selection.unfold_entities(structure, target_level="C")
        atom_dict1 = cls.get_atom_list(structure_c, chains=[chain1])
        atom_dict2 = cls.get_atom_list(structure_c, chains=[chain2])
        print(f"[Info] res_num: {len(atom_dict1.keys())}")

        all_atoms_1 = cls.get_all_atoms(atom_dict1)
        all_atoms_2 = cls.get_all_atoms(atom_dict2)
        cutoff = cutoff
        contacts_1 = cls.get_contacts(atom_dict1, all_atoms_2, "First molecule, residue ", cutoff)
        contacts_2 = cls.get_contacts(atom_dict2, all_atoms_1, "Second molecule, residue ", cutoff)
        print(f"[Info] contacts_1: {len(contacts_1)}")
        print(f"[Info] contacts_2: {len(contacts_2)}")
        return contacts_1, contacts_2

    @staticmethod
    def cal_sasa(structure):
        """
        计算单体的表面积
        """
        import freesasa
        structure = freesasa.structureFromBioPDB(structure)
        result = freesasa.calc(structure)
        # classArea = freesasa.classifyResults(result, structure)
        return result.totalArea()

    @staticmethod
    def get_sub_structure(structure, chain_list):
        """
        获取 PDB 结构的子结构
        """
        new_structure = Structure(0)
        new_model = Model(0)
        # 遍历模型中的所有链
        for chain in structure[0]:
            # 获取链的ID
            tmp_chain_id = chain.get_id()
            if tmp_chain_id not in chain_list:
                continue
            # 创建一个新的结构对象，只包含当前链

            new_chain = Chain(tmp_chain_id)
            new_chain.child_list = chain.child_list
            new_model.add(new_chain)
        new_structure.add(new_model)
        return new_structure

    @classmethod
    def cal_chain_interface_area(cls, structure, chain1, chain2):
        sub_all_structure = cls.get_sub_structure(structure, [chain1, chain2])
        sub1_structure = cls.get_sub_structure(structure, [chain1])
        sub2_structure = cls.get_sub_structure(structure, [chain2])
        v1_area = cls.cal_sasa(sub_all_structure)
        v2_area = cls.cal_sasa(sub1_structure) + cls.cal_sasa(sub2_structure)
        inter_area = max(int((v2_area - v1_area) // 2), 0)
        return inter_area

    def process(self, input_path, chain1, chain2):
        """
        核心逻辑
        """
        print(f"[Info] input_path: {input_path}")
        print(f"[Info] chain1: {chain1}, chain2: {chain2}")
        chain1 = chain1.upper()
        chain2 = chain2.upper()
        parser = PDBParser(QUIET=True)
        structure = parser.get_structure("def", input_path)
        contacts_1, contacts_2 = self.cal_chain_interface_residue(structure, chain1, chain2, cutoff=7.5)
        cmd1 = self.get_chimerax_cmd(contacts_1, chain1)
        cmd2 = self.get_chimerax_cmd(contacts_2, chain2)
        print(f"[Info] chimerax: {cmd1}")
        print(f"[Info] chimerax: {cmd2}")
        inter_area = self.cal_chain_interface_area(structure, chain1, chain2)
        print(f"[Info] inter_area: {inter_area}")


def main():
    ccd = MainComplexChainDist()

    # input_path = os.path.join(DATA_DIR, "templates", "8p3l.pdb")
    # ccd.process(input_path, "A", "D")  # 2562
    # ccd.process(input_path, "A", "J")  # 490

    input_path = os.path.join(DATA_DIR, "templates", "8j18.pdb")
    # ccd.process(input_path, "R", "S")  # 0
    ccd.process(input_path, "A", "R")  # 1114


if __name__ == '__main__':
    main()