Source code for bioat.lib.libcrispr

import pandas as pd
from Bio.Align import PairwiseAligner
from Bio.Seq import Seq

from bioat.lib.libalignment import get_alignment_info
from bioat.logger import LoggerManager

lm = LoggerManager(mod_name="bioat.lib.libcrispr")

# TARGET_REGIONS_LIB
#     for target_seq alignment
#
# bioat target_seq region_heatmap test_sorted.mpileup.info.tsv test.pdf --get_built_in_target_region
#
# INFO:root:You can use <key> in built-in <target_regions> to represent your target_region:
#         <key>   <target_region>
#         EMX1    GAGTCCGAGCAGAAGAAGAA^NGG^
#         HEK3    GGCCCAGACTGAGCACGTGA^NGG^
#         HEK4    GGCACTGCGGCTGGAGGTGG^NGG^
#         ...

TARGET_SEQ_LIB = {
    "EMX1": "GAGTCCGAGCAGAAGAAGAA^NGG^",
    "HEK3": "GGCCCAGACTGAGCACGTGA^NGG^",
    "HEK4": "GGCACTGCGGCTGGAGGTGG^NGG^",
    "RNF2": "GTCATCTTAGTCATTACCTG^NGG^",
    "VEGFA": "GACCCCCTCCACCCCGCCTC^NGG^",
    "CDKN2A": "^TTTA^GCCCCAATAATCCCCACATGTCA",
    "DYRK1A": "^TTTA^GAAGCACATCAAGGACATTCTAA",
    "ND4": "TGCTAGTAACCACGTTCTCCTGATCAAATATCACTCTCCTACTTACAGGA",
    "ND5.1": "TAGCATTAGCAGGAATACCTTTCCTCACAGGTTTCTACTCCAAAGA",
    "ND6": "TGACCCCCATGCCTCAGGATACTCCTCAATAGCCATCG",
}


[docs] def cmp_align_list(aln_a: dict, aln_b: dict): """Compare function for alignment lists. This function processes alignment data and performs comparisons based on the input structure. Args: input_data (list | dict): The alignment data to be processed. Can be provided as: - A list, e.g.: [17, 3, 0, 73.0, 1, 22, 'AAGAAGAAGACGAGTCTGCA', '||||||||||||||X|||XX', 'AAGAAGAAGACGAGCCTGAG'] - A dictionary, e.g.: {'match_count': 26, 'mismatch_count': 3, 'gap_count': 4, 'aln_score': 96.0, 'ref_aln_start': 0, 'ref_aln_end': 32, 'alignment': {'reference_seq': 'GGCACTGCGGCTGGAAAAAAAAAAAAAAA--GT','aln_info': '--..|.|||||||||||||||||||||||--||', 'target_seq': '--GGCAGCGGCTGGAAAAAAAAAAAAAAAAGGT'}} Returns: None: The function does not return a value but performs comparisons. Example: >>> compare_alignments( ... [ ... 17, ... 3, ... 0, ... 73.0, ... 1, ... 22, ... "AAGAAGAAGACGAGTCTGCA", ... "||||||||||||||X|||XX", ... "AAGAAGAAGACGAGCCTGAG", ... ] ... ) """ # sort reason score -> gap -> mismatch -> match def sign_value(x): """HELP sign function. """ if x > 0: return 1 if x < 0: return -1 return 0 align_alphabet_score = {"-": 0, ".": 1, "|": 2} sort_keys = ["alignment_score", "gap_count", "mismatch_count", "match_count"] sort_rev_state_list = [True, False, False, True] for idx, key in enumerate(sort_keys): if aln_a[key] - aln_b[key] == 0: continue if sort_rev_state_list[idx]: return -1 * sign_value(aln_a[key] - aln_b[key]) return sign_value(aln_a[key] - aln_b[key]) aln_info_a = aln_a["alignment"]["aln_info"] aln_info_b = aln_b["alignment"]["aln_info"] for idx, char_a in enumerate(aln_info_a): if idx <= (len(aln_info_b) - 1): value_a = align_alphabet_score[char_a] value_b = align_alphabet_score[char_a] return sign_value(value_a - value_b) pass return None
[docs] def run_target_seq_align( ref_seq: Seq, target_seq: Seq, aligner: PairwiseAligner, PAM: dict | None = None, log_level="WARNING", ) -> list: """Perform global alignment for the target sequence. This function performs global alignment between a reference sequence and a target sequence (usually the sgRNA sequence without PAM) using the specified aligner. Args: ref_seq (Seq): A Seq object from BioPython representing the reference sequence for the targeted deep sequencing. target_seq (Seq): A Seq object from BioPython representing the target region sequence for the editing window (usually the sgRNA sequence without PAM). aligner (object): A PairwiseAligner object from BioPython used for performing the alignment. PAM (dict, optional): A dictionary containing PAM information with the following keys: - 'PAM' (str): The PAM sequence (e.g., "AGG"). - 'position' (int): The insertion site of `target_seq`. - 'weight' (float): The priority weight for PAM alignment. The alignment score will be multiplied by this weight. - Example: {'PAM': 'AGG', 'position': 20, 'weight': 1.0}. Returns: list[dict]: A list of dictionaries, each containing alignment results. Each dictionary has the following keys: - 'match_count' (int): The number of matching bases in the alignment. - 'mismatch_count' (int): The number of mismatched bases. - 'gap_count' (int): The number of gaps. - 'aln_score' (float): The alignment score. - 'ref_aln_start' (int): The starting position of the alignment on the reference sequence. - 'ref_aln_end' (int): The ending position of the alignment on the reference sequence. - 'alignment' (dict): The alignment details containing:'reference_seq' (str): The aligned reference sequence.'aln_info' (str): The alignment information (e.g., matches, mismatches, and gaps).'target_seq' (str): The aligned target sequence. Example: An example alignment result might look like this:[{'match_count': 26,'mismatch_count': 3,'gap_count': 4,'aln_score': 96.0,'ref_aln_start': 0,'ref_aln_end': 32,'alignment': {'reference_seq': 'GGCACTGCGGCTGGAAAAAAAAAAAAAAA--GT','aln_info': '--..|.|||||||||||||||||||||||--||','target_seq': '--GGCAGCGGCTGGAAAAAAAAAAAAAAAAGGT'}}] """ lm.set_names(func_name="run_target_seq_align") lm.set_level(log_level) if PAM: # considering PAM # DNA info https://www.bioinformatics.org/sms/iupac.html alignments_pam = aligner.align(ref_seq, Seq(PAM["PAM"])) # use pam to find aim sequence on ref and then use target_seq to align them and give it a higher priority dt_PAMs = {} for _idx, alignment in enumerate(alignments_pam): alignment_analysis_result = get_alignment_info(alignment, reverse=False) lm.logger.debug( f"alignment_analysis_result for PAM:\n{alignment_analysis_result}" ) ref_aln_start = alignment_analysis_result["ref_aln_start"] ref_aln_end = alignment_analysis_result["ref_aln_end"] aln_score = alignment_analysis_result["aln_score"] * PAM["weight"] dt_PAMs[f"{ref_aln_start}_{ref_aln_end}"] = { "aln_score": aln_score * PAM["weight"] } lm.logger.debug(f"find PAMs on reference: {dt_PAMs}") # add PAM left or right or any position target_seq = ( target_seq[: PAM["position"]] + Seq(PAM["PAM"]) + target_seq[PAM["position"] :] ) lm.logger.info( f"find PAM on target_seq, target_seq is updated to: {target_seq!s}" ) # alignment part alignments = aligner.align(ref_seq, target_seq) # parse alignment alignment_results = [] for alignment in alignments: alignment_analysis_result = get_alignment_info(alignment, reverse=False) lm.logger.debug(f"alignment_analysis_result:\n{alignment_analysis_result}") alignment_results.append(alignment_analysis_result) if len(alignment_results) == 1: lm.logger.debug(f"find 1 alignment result:\n{alignment_results}") return alignment_results[0] lm.logger.debug( f"find {len(alignment_results)} alignment results:\n{alignment_results}" ) df = pd.DataFrame(alignment_results) # if contain multiple alignment result with the same score, keep the best one; # sort reason score -> gap -> mismatch -> match df = df.sort_values( by=["aln_score", "gap_count", "mismatch_count", "match_count"], ascending=[False, True, True, False], ) return df.iloc[0, :].to_dict()