Source code for bioat.foldtools

"""TODO."""

from pathlib import Path

from Bio.PDB.Structure import Structure as BiopythonStructure
from Bio.Seq import Seq

from bioat.lib.libpdb import get_cut2ref_aln_info, pdb2fasta, show_ref_cut
from bioat.logger import LoggerManager

# Logger manager registered under this module's dotted name. It is not
# referenced again in the part of the module visible here.
lm = LoggerManager(mod_name="bioat.foldtools")



[docs]
class FoldTools:
    """Folding toolbox.

    Each public method forwards its arguments, by keyword, to the function of
    the same name imported from ``bioat.lib.libpdb``. None of the methods
    returns a value, and the instance holds no state.
    """

    def __init__(self) -> None:
        """Create the toolbox; there is nothing to initialise."""

    def show_ref_cut(
        self,
        ref_seq: str | Path | Seq,
        ref_pdb: str | Path | BiopythonStructure,
        cut_seq: list[str | Path | Seq] | str | Path | Seq | None = None,
        cut_pdb: list[str | Path | BiopythonStructure]
        | str
        | Path
        | BiopythonStructure
        | None = None,
        cut_labels: list[str] | str | None = None,
        ref_color: str = "red",
        ref_map_colors: tuple[str, str] | None = None,
        ref_map_values: dict | None = None,
        cut_color: str = "lightgray",
        gap_color: str = "purple",
        ref_style: str = "cartoon",
        cut_style: str = "cartoon",
        gap_style: str = "cartoon",
        ref_map_value_random: bool = False,
        output_fig: str | Path | None = None,
        col: int = 4,
        scale: float = 1.0,
        annotate: bool = True,
        text_interval: int = 5,
        log_level: str = "WARNING",
    ) -> None:
        """Visualize the alignment of sequences and highlight changes in PDB structures using py3Dmol.

        All arguments are passed unchanged to
        ``bioat.lib.libpdb.show_ref_cut``; nothing is returned.

        Args:
            ref_seq (str or Path or Seq): Amino acid sequence content for the
                ref protein.
            ref_pdb (str or Path or Bio.PDB.Structure.Structure): Path to the
                PDB file of the reference structure, or a loaded structure.
            cut_seq (list, str, Path, Seq or None, optional): Amino acid
                sequence content for the cut protein. A single value or a
                list of values may be given. Defaults to None.
            cut_pdb (list, str, Path, Bio.PDB.Structure.Structure or None, optional):
                Path to the PDB file of the cut structure, or a loaded
                structure. A single value or a list of values may be given.
                Defaults to None.
            cut_labels (list[str] or str or None, optional): Label for the cut
                proteins. If None, the label will be set to "cut".
            ref_color (str, optional): Color for reference residues.
                Defaults to "red".
            ref_map_colors (tuple[str, str] or None, optional): Used as a
                color bar from ref_map_colors[0] to ref_map_colors[1]. If
                None, do not apply color mapping. Defaults to None.
            ref_map_values (dict or None, optional): A dictionary of values
                for the ref color map; it will be normalized to the range of
                [0 - 1]. If None, all residues will be colored with the same
                color. e.g. ref_map_values = {'V_0': 0.4177215189873418,
                'S_1': 0.8185654008438819, 'K_2': 0.9915611814345991,
                'G_3': 0.42616033755274263, ...}
            cut_color (str, optional): Color for cut residues.
                Defaults to "lightgray".
            gap_color (str, optional): Color for gaps or removed residues.
                Defaults to "purple".
            ref_style (str, optional): "stick", "sphere", "cartoon", or
                "line". Defaults to "cartoon".
            cut_style (str, optional): "stick", "sphere", "cartoon", or
                "line". Defaults to "cartoon".
            gap_style (str, optional): "stick", "sphere", "cartoon", or
                "line". Defaults to "cartoon".
            ref_map_value_random (bool, optional): If True, ref_map_values
                will be randomly generated. Defaults to False.
            output_fig (str or Path or None, optional): Output figure file
                path. If None, the figure will not be saved in html format.
                Defaults to None.
            col (int, optional): Number of columns for the visualization.
                Defaults to 4.
            scale (float, optional): Scale factor for the visualization.
                Defaults to 1.0.
            annotate (bool, optional): Whether to annotate the visualization
                with labels. Defaults to True.
            text_interval (int, optional): The interval between text
                annotations. Defaults to 5.
            log_level (str, optional): Log level. Defaults to "WARNING".
        """
        show_ref_cut(
            ref_seq=ref_seq,
            cut_seq=cut_seq,
            ref_pdb=ref_pdb,
            cut_pdb=cut_pdb,
            cut_labels=cut_labels,
            ref_color=ref_color,
            ref_map_colors=ref_map_colors,
            ref_map_values=ref_map_values,
            cut_color=cut_color,
            gap_color=gap_color,
            ref_style=ref_style,
            cut_style=cut_style,
            gap_style=gap_style,
            ref_map_value_random=ref_map_value_random,
            output_fig=output_fig,
            col=col,
            scale=scale,
            annotate=annotate,
            text_interval=text_interval,
            log_level=log_level,
        )

    def pdb2fasta(
        self,
        input_pdb: str | Path,
        output_fasta: str | Path | None = None,
        log_level: str = "WARNING",
    ) -> None:
        """Convert a PDB file to a FASTA file.

        The three arguments are passed unchanged to
        ``bioat.lib.libpdb.pdb2fasta``. Whatever that function returns is
        discarded, so this method always returns None.

        Details:
            1. **Proteins**:
               The protein sequence for each chain will be extracted as "Chain X Protein".
            2. **DNA and RNA**:
               Bases for DNA (A, T, G, C) will be saved as "Chain X DNA", and bases for RNA (A, U, G, C) will be saved as "Chain X RNA".
            3. **Other molecules**:
               Any unrecognized molecules (e.g., ions, modified molecules) will be labeled as [residue] and stored as "Chain X Other molecules".
            4. **Multi-chain complexes**:
               The program supports multi-chain structures in complexes, and the content of each chain will be recorded separately.

        Args:
            input_pdb (str or Path):
                Path to the input PDB/CIF file.
            output_fasta (str or Path, optional):
                Output file path. If None, the output file will be named as the
                basename of the input file with a ".fa" extension. Defaults to None.
            log_level (str, optional):
                Logging level. Defaults to "WARNING".
        """
        pdb2fasta(input_pdb=input_pdb, output_fasta=output_fasta, log_level=log_level)

    def get_cut2ref_aln_info(
        self,
        ref: str | BiopythonStructure,
        cut: str | BiopythonStructure,
        cal_rmsd: bool = True,
        cal_tmscore: bool = False,
        label1: str = "ref",
        label2: str = "cut",
        usalign_bin: str = "usalign",
        log_level: str = "WARNING",
    ) -> None:
        """Align a cut pdb to the ref pdb using the CA atoms and print the result.

        Delegates to ``bioat.lib.libpdb.get_cut2ref_aln_info`` and prints
        every item of the mapping it returns, one ``key: value`` line per
        item. The mapping itself is not returned.

        As documented for the underlying routine, it aligns a truncated
        protein structure (cut) to its full-length reference structure (ref)
        using Ca atoms and Biopython's Superimposer:

        - Extracts all Ca atoms from `ref` and `cut`
        - Removes atoms from `ref` at the gap indices
        - Aligns the remaining atoms from `cut` to the corresponding positions in `ref`
        - Modifies the `cut` structure in-place to match the aligned orientation
        - Reports both structures and the RMSD value of the alignment

        It assumes:
        - One-to-one correspondence between residues after gap removal
        - Structures are predicted by AlphaFold2 / ESMFold (no missing atoms)

        Args:
            ref (str or Bio.PDB.Structure.Structure): Reference structure path or loaded Structure.
            cut (str or Bio.PDB.Structure.Structure): Truncated structure path or loaded Structure.
            cal_rmsd (bool, optional): Whether to calculate RMSD. Default is True.
            cal_tmscore (bool, optional): Whether to calculate TM-score using USalign. Default is False.
            label1 (str, optional): Name for the reference structure. Default is "ref".
            label2 (str, optional): Name for the cut structure. Default is "cut".
            usalign_bin (str, optional): Path to the USalign binary for TM-score calculation. Default is "usalign".
            log_level (str, optional): Logging level. Default is "WARNING".

        Returns:
            None. The items printed are those of the mapping produced by the
            underlying routine, documented as:

            {
                "{label1}": aln label1 structure,  # if cal_rmsd is True, unaltered label1 structure
                "{label2}": fixed label2 structure,  # if cal_rmsd is True, fix label2 coords in-place
                "RMSD": 0.123  # if cal_rmsd is True, the RMSD value between label1 and label2
                "{label1}_seq": ref_seq,  # if cal_rmsd is True, the sequence of label1 structure
                "{label2}_seq": cut_seq,  # if cal_rmsd is True, the sequence of label2 structure
                "alignment_dict": alignment_dict,  # if cal_rmsd is True, the alignment dict of label1 and label2
                "gap_indices": gap_indices,  # if cal_rmsd is True, the indices of gaps in label1 structure
                "TM-score:mean": 0.623,  # if cal_tmscore is True, the mean TM-score value
                "TM-score:TM1": 0.456,  # if cal_tmscore is True, use label1 as ref <L_N> in calculation
                "TM-score:TM2": 0.789,  # if cal_tmscore is True, use label2 as ref <L_N> in calculation
                ...
            }
        """
        aln_info = get_cut2ref_aln_info(
            ref=ref,
            cut=cut,
            cal_rmsd=cal_rmsd,
            cal_tmscore=cal_tmscore,
            label1=label1,
            label2=label2,
            usalign_bin=usalign_bin,
            log_level=log_level,
        )
        # Printing (rather than returning) keeps the original observable
        # behaviour of this wrapper.
        for key, value in aln_info.items():
            print(f"{key}: {value}")