Source code for bioat.lib.libcrispr


import pandas as pd
from Bio.Align import PairwiseAligner
from Bio.Seq import Seq

from bioat.lib.libalignment import get_alignment_info
from bioat.logger import LoggerManager

lm = LoggerManager(mod_name="bioat.lib.libcrispr")

# TARGET_REGIONS_LIB
#     for target_seq alignment
#
# bioat target_seq region_heatmap test_sorted.mpileup.info.tsv test.pdf --get_built_in_target_region
#
# INFO:root:You can use <key> in built-in <target_regions> to represent your target_region:
#         <key>   <target_region>
#         EMX1    GAGTCCGAGCAGAAGAAGAA^NGG^
#         HEK3    GGCCCAGACTGAGCACGTGA^NGG^
#         HEK4    GGCACTGCGGCTGGAGGTGG^NGG^
#         ...

TARGET_SEQ_LIB = {
    "EMX1": "GAGTCCGAGCAGAAGAAGAA^NGG^",
    "HEK3": "GGCCCAGACTGAGCACGTGA^NGG^",
    "HEK4": "GGCACTGCGGCTGGAGGTGG^NGG^",
    "RNF2": "GTCATCTTAGTCATTACCTG^NGG^",
    "VEGFA": "GACCCCCTCCACCCCGCCTC^NGG^",
    "CDKN2A": "^TTTA^GCCCCAATAATCCCCACATGTCA",
    "DYRK1A": "^TTTA^GAAGCACATCAAGGACATTCTAA",
    "ND4": "TGCTAGTAACCACGTTCTCCTGATCAAATATCACTCTCCTACTTACAGGA",
    "ND5.1": "TAGCATTAGCAGGAATACCTTTCCTCACAGGTTTCTACTCCAAAGA",
    "ND6": "TGACCCCCATGCCTCAGGATACTCCTCAATAGCCATCG",
}



[docs]
def cmp_align_list(aln_a: dict, aln_b: dict):
    """Compare function for alignment lists.

    This function processes alignment data and performs comparisons based on the input structure.

    Args:
        input_data (list | dict): 
            The alignment data to be processed. Can be provided as:
            - A list, e.g.: [17, 3, 0, 73.0, 1, 22, 'AAGAAGAAGACGAGTCTGCA', '||||||||||||||X|||XX', 'AAGAAGAAGACGAGCCTGAG']
            - A dictionary, e.g.: {'match_count': 26, 'mismatch_count': 3, 'gap_count': 4, 'aln_score': 96.0, 'ref_aln_start': 0, 'ref_aln_end': 32, 'alignment': {'reference_seq': 'GGCACTGCGGCTGGAAAAAAAAAAAAAAA--GT','aln_info': '--..|.|||||||||||||||||||||||--||', 'target_seq': '--GGCAGCGGCTGGAAAAAAAAAAAAAAAAGGT'}}

    Returns:
        None: The function does not return a value but performs comparisons.

    Example:
        >>> compare_alignments([
        ...     17, 3, 0, 73.0, 1, 22, 
        ...     'AAGAAGAAGACGAGTCTGCA', 
        ...     '||||||||||||||X|||XX', 
        ...     'AAGAAGAAGACGAGCCTGAG'
        ... ])
    """

    # sort reason score -> gap -> mismatch -> match
    def sign_value(x):
        """
        HELP
            sign function
        """
        if x > 0:
            return 1
        elif x < 0:
            return -1
        else:
            return 0

    align_alphabet_score = {"-": 0, ".": 1, "|": 2}
    sort_keys = ['alignment_score', 'gap_count', 'mismatch_count', 'match_count']
    sort_rev_state_list = [True, False, False, True]

    for idx, key in enumerate(sort_keys):
        if aln_a[key] - aln_b[key] == 0:
            continue
        else:
            if sort_rev_state_list[idx]:
                return -1 * sign_value(aln_a[key] - aln_b[key])
            else:
                return sign_value(aln_a[key] - aln_b[key])

    aln_info_a = aln_a['alignment']['aln_info']
    aln_info_b = aln_b['alignment']['aln_info']

    for idx, char_a in enumerate(aln_info_a):
        if idx <= (len(aln_info_b) - 1):
            value_a = align_alphabet_score[char_a]
            value_b = align_alphabet_score[aln_info_a[idx]]
            return sign_value(value_a - value_b)
        else:
            pass




[docs]
def run_target_seq_align(
        ref_seq: Seq,
        target_seq: Seq,
        aligner: PairwiseAligner,
        PAM: dict = None,
        log_level='WARNING'
) -> list:
    """Perform global alignment for the target sequence.

    This function performs global alignment between a reference sequence and a target sequence (usually the sgRNA sequence without PAM) using the specified aligner.

    Args:
        ref_seq (Seq): 
            A Seq object from BioPython representing the reference sequence 
            for the targeted deep sequencing.
        target_seq (Seq): 
            A Seq object from BioPython representing the target region sequence 
            for the editing window (usually the sgRNA sequence without PAM).
        aligner (object): 
            A PairwiseAligner object from BioPython used for performing the alignment.
        PAM (dict, optional): 
            A dictionary containing PAM information with the following keys:
            - 'PAM' (str): The PAM sequence (e.g., "AGG").
            - 'position' (int): The insertion site of `target_seq`.
            - 'weight' (float): The priority weight for PAM alignment. The alignment score will be multiplied by this weight.
            - Example: {'PAM': 'AGG', 'position': 20, 'weight': 1.0}.

    Returns:
        list[dict]: 
            A list of dictionaries, each containing alignment results. Each dictionary has the following keys:
            - 'match_count' (int): The number of matching bases in the alignment.
            - 'mismatch_count' (int): The number of mismatched bases.
            - 'gap_count' (int): The number of gaps.
            - 'aln_score' (float): The alignment score.
            - 'ref_aln_start' (int): The starting position of the alignment on the reference sequence.
            - 'ref_aln_end' (int): The ending position of the alignment on the reference sequence.
            - 'alignment' (dict): The alignment details containing:'reference_seq' (str): The aligned reference sequence.'aln_info' (str): The alignment information (e.g., matches, mismatches, and gaps).'target_seq' (str): The aligned target sequence.

    Example:
        An example alignment result might look like this:[{'match_count': 26,'mismatch_count': 3,'gap_count': 4,'aln_score': 96.0,'ref_aln_start': 0,'ref_aln_end': 32,'alignment': {'reference_seq': 'GGCACTGCGGCTGGAAAAAAAAAAAAAAA--GT','aln_info': '--..|.|||||||||||||||||||||||--||','target_seq': '--GGCAGCGGCTGGAAAAAAAAAAAAAAAAGGT'}}]

    """

    lm.set_names(func_name="run_target_seq_align")
    lm.set_level(log_level)


    if PAM:
        # considering PAM
        # DNA info https://www.bioinformatics.org/sms/iupac.html
        alignments_pam = aligner.align(ref_seq, Seq(PAM['PAM']))
        # use pam to find aim sequence on ref and then use target_seq to align them and give it a higher priority
        dt_PAMs = {}

        for idx, alignment in enumerate(alignments_pam):
            alignment_analysis_result = get_alignment_info(alignment, reverse=False)
            lm.logger.debug(f'alignment_analysis_result for PAM:\n{alignment_analysis_result}')
            ref_aln_start = alignment_analysis_result['ref_aln_start']
            ref_aln_end = alignment_analysis_result['ref_aln_end']
            aln_score = alignment_analysis_result['aln_score'] * PAM['weight']
            dt_PAMs[f'{ref_aln_start}_{ref_aln_end}'] = dict(aln_score=aln_score * PAM['weight'])

        lm.logger.debug(f'find PAMs on reference: {dt_PAMs}')

        # add PAM left or right or any position
        target_seq = target_seq[:PAM['position']] + Seq(PAM['PAM']) + target_seq[PAM['position']:]
        lm.logger.info(f'find PAM on target_seq, target_seq is updated to: {str(target_seq)}')

    # alignment part
    alignments = aligner.align(ref_seq, target_seq)
    # parse alignment
    alignment_results = []

    for alignment in alignments:
        alignment_analysis_result = get_alignment_info(alignment, reverse=False)
        lm.logger.debug(f'alignment_analysis_result:\n{alignment_analysis_result}')
        alignment_results.append(alignment_analysis_result)

    if len(alignment_results) == 1:
        lm.logger.debug(f'find 1 alignment result:\n{alignment_results}')
        return alignment_results[0]
    else:
        lm.logger.debug(f'find {len(alignment_results)} alignment results:\n{alignment_results}')
        df = pd.DataFrame(alignment_results)
        # if contain multiple alignment result with the same score, keep the best one;
        # sort reason score -> gap -> mismatch -> match
        df.sort_values(
            by=['aln_score', 'gap_count', 'mismatch_count', 'match_count'],
            ascending=[False, True, True, False],
            inplace=True
        )
        return df.iloc[0, :].to_dict()