Source code for bioat.lib.libcrispr

import pandas as pd
from Bio.Align import PairwiseAligner
from Bio.Seq import Seq

from bioat.lib.libalignment import get_alignment_info
from bioat.logger import LoggerManager

lm = LoggerManager(mod_name="bioat.lib.libcrispr")

# TARGET_REGIONS_LIB
#     for target_seq alignment
#
# bioat target_seq region_heatmap test_sorted.mpileup.info.tsv test.pdf --get_built_in_target_region
#
# INFO:root:You can use <key> in built-in <target_regions> to represent your target_region:
#         <key>   <target_region>
#         EMX1    GAGTCCGAGCAGAAGAAGAA^NGG^
#         HEK3    GGCCCAGACTGAGCACGTGA^NGG^
#         HEK4    GGCACTGCGGCTGGAGGTGG^NGG^
#         ...

TARGET_SEQ_LIB = {
    "EMX1": "GAGTCCGAGCAGAAGAAGAA^NGG^",
    "HEK3": "GGCCCAGACTGAGCACGTGA^NGG^",
    "HEK4": "GGCACTGCGGCTGGAGGTGG^NGG^",
    "RNF2": "GTCATCTTAGTCATTACCTG^NGG^",
    "VEGFA": "GACCCCCTCCACCCCGCCTC^NGG^",
    "CDKN2A": "^TTTA^GCCCCAATAATCCCCACATGTCA",
    "DYRK1A": "^TTTA^GAAGCACATCAAGGACATTCTAA",
    "ND4": "TGCTAGTAACCACGTTCTCCTGATCAAATATCACTCTCCTACTTACAGGA",
    "ND5.1": "TAGCATTAGCAGGAATACCTTTCCTCACAGGTTTCTACTCCAAAGA",
    "ND6": "TGACCCCCATGCCTCAGGATACTCCTCAATAGCCATCG",
}



[docs]
def cmp_align_list(aln_a: dict, aln_b: dict):
    """Compare function for alignment lists.

    This function processes alignment data and performs comparisons based on the input structure.

    Args:
        input_data (list | dict):
            The alignment data to be processed. Can be provided as:
            - A list, e.g.: [17, 3, 0, 73.0, 1, 22, 'AAGAAGAAGACGAGTCTGCA', '||||||||||||||X|||XX', 'AAGAAGAAGACGAGCCTGAG']
            - A dictionary, e.g.: {'match_count': 26, 'mismatch_count': 3, 'gap_count': 4, 'aln_score': 96.0, 'ref_aln_start': 0, 'ref_aln_end': 32, 'alignment': {'reference_seq': 'GGCACTGCGGCTGGAAAAAAAAAAAAAAA--GT','aln_info': '--..|.|||||||||||||||||||||||--||', 'target_seq': '--GGCAGCGGCTGGAAAAAAAAAAAAAAAAGGT'}}

    Returns:
        None: The function does not return a value but performs comparisons.

    Example:
        >>> compare_alignments(
        ...     [
        ...         17,
        ...         3,
        ...         0,
        ...         73.0,
        ...         1,
        ...         22,
        ...         "AAGAAGAAGACGAGTCTGCA",
        ...         "||||||||||||||X|||XX",
        ...         "AAGAAGAAGACGAGCCTGAG",
        ...     ]
        ... )
    """

    # sort reason score -> gap -> mismatch -> match
    def sign_value(x):
        """HELP
        sign function.
        """
        if x > 0:
            return 1
        if x < 0:
            return -1
        return 0

    align_alphabet_score = {"-": 0, ".": 1, "|": 2}
    sort_keys = ["alignment_score", "gap_count", "mismatch_count", "match_count"]
    sort_rev_state_list = [True, False, False, True]

    for idx, key in enumerate(sort_keys):
        if aln_a[key] - aln_b[key] == 0:
            continue
        if sort_rev_state_list[idx]:
            return -1 * sign_value(aln_a[key] - aln_b[key])
        return sign_value(aln_a[key] - aln_b[key])

    aln_info_a = aln_a["alignment"]["aln_info"]
    aln_info_b = aln_b["alignment"]["aln_info"]

    for idx, char_a in enumerate(aln_info_a):
        if idx <= (len(aln_info_b) - 1):
            value_a = align_alphabet_score[char_a]
            value_b = align_alphabet_score[char_a]
            return sign_value(value_a - value_b)
        pass
    return None




[docs]
def run_target_seq_align(
    ref_seq: Seq,
    target_seq: Seq,
    aligner: PairwiseAligner,
    PAM: dict | None = None,
    log_level="WARNING",
) -> list:
    """Perform global alignment for the target sequence.

    This function performs global alignment between a reference sequence and a target sequence (usually the sgRNA sequence without PAM) using the specified aligner.

    Args:
        ref_seq (Seq):
            A Seq object from BioPython representing the reference sequence
            for the targeted deep sequencing.
        target_seq (Seq):
            A Seq object from BioPython representing the target region sequence
            for the editing window (usually the sgRNA sequence without PAM).
        aligner (object):
            A PairwiseAligner object from BioPython used for performing the alignment.
        PAM (dict, optional):
            A dictionary containing PAM information with the following keys:
            - 'PAM' (str): The PAM sequence (e.g., "AGG").
            - 'position' (int): The insertion site of `target_seq`.
            - 'weight' (float): The priority weight for PAM alignment. The alignment score will be multiplied by this weight.
            - Example: {'PAM': 'AGG', 'position': 20, 'weight': 1.0}.

    Returns:
        list[dict]:
            A list of dictionaries, each containing alignment results. Each dictionary has the following keys:
            - 'match_count' (int): The number of matching bases in the alignment.
            - 'mismatch_count' (int): The number of mismatched bases.
            - 'gap_count' (int): The number of gaps.
            - 'aln_score' (float): The alignment score.
            - 'ref_aln_start' (int): The starting position of the alignment on the reference sequence.
            - 'ref_aln_end' (int): The ending position of the alignment on the reference sequence.
            - 'alignment' (dict): The alignment details containing:'reference_seq' (str): The aligned reference sequence.'aln_info' (str): The alignment information (e.g., matches, mismatches, and gaps).'target_seq' (str): The aligned target sequence.

    Example:
        An example alignment result might look like this:[{'match_count': 26,'mismatch_count': 3,'gap_count': 4,'aln_score': 96.0,'ref_aln_start': 0,'ref_aln_end': 32,'alignment': {'reference_seq': 'GGCACTGCGGCTGGAAAAAAAAAAAAAAA--GT','aln_info': '--..|.|||||||||||||||||||||||--||','target_seq': '--GGCAGCGGCTGGAAAAAAAAAAAAAAAAGGT'}}]

    """
    lm.set_names(func_name="run_target_seq_align")
    lm.set_level(log_level)

    if PAM:
        # considering PAM
        # DNA info https://www.bioinformatics.org/sms/iupac.html
        alignments_pam = aligner.align(ref_seq, Seq(PAM["PAM"]))
        # use pam to find aim sequence on ref and then use target_seq to align them and give it a higher priority
        dt_PAMs = {}

        for _idx, alignment in enumerate(alignments_pam):
            alignment_analysis_result = get_alignment_info(alignment, reverse=False)
            lm.logger.debug(
                f"alignment_analysis_result for PAM:\n{alignment_analysis_result}"
            )
            ref_aln_start = alignment_analysis_result["ref_aln_start"]
            ref_aln_end = alignment_analysis_result["ref_aln_end"]
            aln_score = alignment_analysis_result["aln_score"] * PAM["weight"]
            dt_PAMs[f"{ref_aln_start}_{ref_aln_end}"] = {
                "aln_score": aln_score * PAM["weight"]
            }

        lm.logger.debug(f"find PAMs on reference: {dt_PAMs}")

        # add PAM left or right or any position
        target_seq = (
            target_seq[: PAM["position"]]
            + Seq(PAM["PAM"])
            + target_seq[PAM["position"] :]
        )
        lm.logger.info(
            f"find PAM on target_seq, target_seq is updated to: {target_seq!s}"
        )

    # alignment part
    alignments = aligner.align(ref_seq, target_seq)
    # parse alignment
    alignment_results = []

    for alignment in alignments:
        alignment_analysis_result = get_alignment_info(alignment, reverse=False)
        lm.logger.debug(f"alignment_analysis_result:\n{alignment_analysis_result}")
        alignment_results.append(alignment_analysis_result)

    if len(alignment_results) == 1:
        lm.logger.debug(f"find 1 alignment result:\n{alignment_results}")
        return alignment_results[0]
    lm.logger.debug(
        f"find {len(alignment_results)} alignment results:\n{alignment_results}"
    )
    df = pd.DataFrame(alignment_results)
    # if contain multiple alignment result with the same score, keep the best one;
    # sort reason score -> gap -> mismatch -> match
    df = df.sort_values(
        by=["aln_score", "gap_count", "mismatch_count", "match_count"],
        ascending=[False, True, True, False],
    )
    return df.iloc[0, :].to_dict()