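# Source code for bioat.foldtools
"""Protein structure (folding) helpers exposed through the ``FoldTools`` class.

``FoldTools`` is a thin facade over the PDB utilities imported from
``bioat.lib.libpdb`` (``show_ref_cut``, ``pdb2fasta`` and
``get_cut2ref_aln_info``).
"""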
from pathlib import Path
from Bio.PDB.Structure import Structure as BiopythonStructure
from Bio.Seq import Seq
from bioat.lib.libpdb import get_cut2ref_aln_info, pdb2fasta, show_ref_cut
from bioat.logger import LoggerManager
lm = LoggerManager(mod_name="bioat.foldtools")
[docs]
class FoldTools:
    """Folding toolbox.

    Every public method forwards its arguments, by keyword, to the function of
    the same name imported from ``bioat.lib.libpdb``. The methods are used for
    their side effects (rendering, writing files, printing); none of them
    returns the value produced by the underlying library function.
    """

    def __init__(self) -> None:
        # The toolbox keeps no instance state; methods only delegate.
        pass

    def show_ref_cut(
        self,
        ref_seq: str | Path | Seq,
        ref_pdb: str | Path | BiopythonStructure,
        cut_seq: list[str | Path | Seq] | str | Path | Seq | None = None,
        cut_pdb: list[str | Path | BiopythonStructure]
        | str
        | Path
        | BiopythonStructure
        | None = None,
        cut_labels: list[str] | str | None = None,
        ref_color: str = "red",
        ref_map_colors: tuple[str, str] | None = None,
        ref_map_values: dict | None = None,
        cut_color: str = "lightgray",
        gap_color: str = "purple",
        ref_style: str = "cartoon",
        cut_style: str = "cartoon",
        gap_style: str = "cartoon",
        ref_map_value_random: bool = False,
        output_fig: str | Path | None = None,
        col: int = 4,
        scale: float = 1.0,
        annotate: bool = True,
        text_interval: int = 5,
        log_level: str = "WARNING",
    ) -> None:
        """Visualize sequence alignment and structural changes using py3Dmol.

        Shows how one or more "cut" proteins relate to a reference protein and
        highlights the changes on the PDB structures.

        Args:
            ref_seq (str or Path or Seq): Amino acid sequence content for the
                ref protein.
            ref_pdb (str or Path or Bio.PDB.Structure.Structure): Path to the
                PDB file of the reference structure, or a loaded structure.
            cut_seq (str, Path, Seq, a list of those, or None, optional): Amino
                acid sequence content for the cut protein(s). Defaults to None.
            cut_pdb (str, Path, Bio.PDB.Structure.Structure, a list of those,
                or None, optional): Path to the PDB file of the cut
                structure(s), or loaded structure(s). Defaults to None.
            cut_labels (list[str] or str or None, optional): Label for the cut
                proteins. If None, the label will be set to "cut".
            ref_color (str, optional): Color for reference residues.
                Defaults to "red".
            ref_map_colors (tuple[str, str] or None, optional): Used as a color
                bar from ``ref_map_colors[0]`` to ``ref_map_colors[1]``. If
                None, no color mapping is applied. Defaults to None.
            ref_map_values (dict or None, optional): Values for the ref color
                map; they will be normalized to the range [0, 1]. If None, all
                residues are colored with the same color. Example:
                ``{'V_0': 0.4177, 'S_1': 0.8186, 'K_2': 0.9916, 'G_3': 0.4262}``.
            cut_color (str, optional): Color for cut residues.
                Defaults to "lightgray".
            gap_color (str, optional): Color for gaps or removed residues.
                Defaults to "purple".
            ref_style (str, optional): "stick", "sphere", "cartoon", or "line".
                Defaults to "cartoon".
            cut_style (str, optional): "stick", "sphere", "cartoon", or "line".
                Defaults to "cartoon".
            gap_style (str, optional): "stick", "sphere", "cartoon", or "line".
                Defaults to "cartoon".
            ref_map_value_random (bool, optional): If True, the ref map values
                will be randomly generated. Defaults to False.
            output_fig (str or Path or None, optional): Output figure file path
                (HTML format). If None, the figure will not be saved.
                Defaults to None.
            col (int, optional): Number of columns for the visualization.
                Defaults to 4.
            scale (float, optional): Scale factor for the visualization.
                Defaults to 1.0.
            annotate (bool, optional): Whether to annotate the visualization
                with labels. Defaults to True.
            text_interval (int, optional): The interval between text
                annotations. Defaults to 5.
            log_level (str, optional): Log level. Defaults to "WARNING".

        Returns:
            None. Whatever the library function returns is discarded.
        """
        show_ref_cut(
            ref_seq=ref_seq,
            cut_seq=cut_seq,
            ref_pdb=ref_pdb,
            cut_pdb=cut_pdb,
            cut_labels=cut_labels,
            ref_color=ref_color,
            ref_map_colors=ref_map_colors,
            ref_map_values=ref_map_values,
            cut_color=cut_color,
            gap_color=gap_color,
            ref_style=ref_style,
            cut_style=cut_style,
            gap_style=gap_style,
            ref_map_value_random=ref_map_value_random,
            output_fig=output_fig,
            col=col,
            scale=scale,
            annotate=annotate,
            text_interval=text_interval,
            log_level=log_level,
        )

    def pdb2fasta(
        self,
        input_pdb: str | Path,
        output_fasta: str | Path | None = None,
        log_level: str = "WARNING",
    ) -> None:
        """Convert a PDB file to a FASTA file.

        Details:
            1. **Proteins**:
                The protein sequence for each chain will be extracted as
                "Chain X Protein".
            2. **DNA and RNA**:
                Bases for DNA (A, T, G, C) will be saved as "Chain X DNA", and
                bases for RNA (A, U, G, C) will be saved as "Chain X RNA".
            3. **Other molecules**:
                Any unrecognized molecules (e.g., ions, modified molecules)
                will be labeled as [residue] and stored as
                "Chain X Other molecules".
            4. **Multi-chain complexes**:
                Multi-chain structures in complexes are supported, and the
                content of each chain will be recorded separately.

        Args:
            input_pdb (str or Path):
                Path to the input PDB/CIF file or Biopython Structure.
            output_fasta (str or Path, optional):
                Output file path. If None, the output file will be named as the
                basename of the input file with a ".fa" extension.
                Defaults to None.
            log_level (str, optional):
                Logging level. Defaults to "WARNING".

        Returns:
            None. This wrapper passes only the three arguments above to the
            library function and does not hand back its result.
        """
        pdb2fasta(input_pdb=input_pdb, output_fasta=output_fasta, log_level=log_level)

    def get_cut2ref_aln_info(
        self,
        ref: str | BiopythonStructure,
        cut: str | BiopythonStructure,
        cal_rmsd: bool = True,
        cal_tmscore: bool = False,
        label1: str = "ref",
        label2: str = "cut",
        usalign_bin: str = "usalign",
        log_level: str = "WARNING",
    ) -> None:
        """Align a cut PDB to its ref PDB using the CA atoms and print the result.

        Aligns a truncated protein structure (cut) to its full-length reference
        structure (ref) using Ca atoms and Biopython's Superimposer.

        The underlying alignment:
            - Extracts all Ca atoms from `ref` and `cut`
            - Removes atoms from `ref` at the indices listed in `gap_indices`
            - Aligns the remaining atoms from `cut` to the corresponding
              positions in `ref`
            - Modifies the `cut` structure in-place to match the aligned
              orientation
            - Reports both structures and the RMSD value of the alignment

        It assumes:
            - One-to-one correspondence between residues after gap removal
            - Structures are predicted by AlphaFold2 / ESMFold (no missing atoms)

        Args:
            ref (str or Bio.PDB.Structure.Structure): Reference structure path
                or loaded Structure.
            cut (str or Bio.PDB.Structure.Structure): Truncated structure path
                or loaded Structure.
            cal_rmsd (bool, optional): Whether to calculate RMSD.
                Default is True.
            cal_tmscore (bool, optional): Whether to calculate TM-score using
                USalign. Default is False.
            label1 (str, optional): Name for the reference structure.
                Default is "ref".
            label2 (str, optional): Name for the cut structure.
                Default is "cut".
            usalign_bin (str, optional): Path to the USalign binary for
                TM-score calculation. Default is "usalign".
            log_level (str, optional): Logging level. Default is "WARNING".

        Returns:
            None. The alignment info dict obtained from the library function is
            printed to stdout, one ``key: value`` line per entry, and is not
            returned. Its entries are described as:

                {
                    "{label1}": aln label1 structure,  # if cal_rmsd is True, unaltered label1 structure
                    "{label2}": fixed label2 structure,  # if cal_rmsd is True, label2 coords fixed in-place
                    "RMSD": 0.123,  # if cal_rmsd is True, the RMSD value between label1 and label2
                    "{label1}_seq": ref_seq,  # if cal_rmsd is True, the sequence of label1 structure
                    "{label2}_seq": cut_seq,  # if cal_rmsd is True, the sequence of label2 structure
                    "alignment_dict": alignment_dict,  # if cal_rmsd is True, the alignment dict of label1 and label2
                    "gap_indices": gap_indices,  # if cal_rmsd is True, the indices of gaps in label1 structure
                    "TM-score:mean": 0.623,  # if cal_tmscore is True, the mean TM-score value
                    "TM-score:TM1": 0.456,  # if cal_tmscore is True, use label1 as ref <L_N> in calculation
                    "TM-score:TM2": 0.789,  # if cal_tmscore is True, use label2 as ref <L_N> in calculation
                    ...
                }
        """
        aln_info = get_cut2ref_aln_info(
            ref=ref,
            cut=cut,
            cal_rmsd=cal_rmsd,
            cal_tmscore=cal_tmscore,
            label1=label1,
            label2=label2,
            usalign_bin=usalign_bin,
            log_level=log_level,
        )
        # Report every entry on its own line instead of returning the dict.
        for key, value in aln_info.items():
            print(f"{key}: {value}")