Source code for fast_conformation.msa_generation.get_msa_jackhmmer

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
from urllib import request
from concurrent import futures
import pickle
import re

from tqdm import tqdm

from fast_conformation.msa_generation import jackhmmer
from fast_conformation.msa_generation import parsers
from fast_conformation.msa_generation import colabfold as cf
from fast_conformation.msa_generation import pairmsa

TQDM_BAR_FORMAT = '{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]'

#######################################################################################################################################
# prep_inputs
#######################################################################################################################################


[docs]
def prep_inputs(sequence, jobname="test", homooligomer="1", output_dir=None, clean=False, verbose=True):
    """
    Prepares the input sequence and parameters for MSA generation.

    Args:
        sequence (str): The protein sequence to be processed.
        jobname (str): The name of the job. Default is "test".
        homooligomer (str): A string specifying the number of homooligomers for each sequence segment.
                            Default is "1".
        output_dir (str): The directory where output files will be saved. If None, a default directory
                          is created based on the jobname and sequence hash.
        clean (bool): If True, cleans the output directory by removing existing files. Default is False.
        verbose (bool): If True, prints warnings and information during execution. Default is True.

    Returns:
        dict: A dictionary containing the processed inputs, including sequences, homooligomer information,
              and output directory.
    """
    # process inputs
    sequence = str(sequence)
    sequence = re.sub("[^A-Z:/]", "", sequence.upper())
    sequence = re.sub(":+", ":", sequence)
    sequence = re.sub("/+", "/", sequence)
    sequence = re.sub("^[:/]+", "", sequence)
    sequence = re.sub("[:/]+$", "", sequence)
    jobname = re.sub(r'\W+', '', jobname)
    homooligomer = str(homooligomer)
    homooligomer = re.sub("[:/]+", ":", homooligomer)
    homooligomer = re.sub("^[:/]+", "", homooligomer)
    homooligomer = re.sub("[:/]+$", "", homooligomer)

    if len(homooligomer) == 0: homooligomer = "1"
    homooligomer = re.sub("[^0-9:]", "", homooligomer)

    # define inputs
    I = {"ori_sequence": sequence,
         "sequence": sequence.replace("/", "").replace(":", ""),
         "seqs": sequence.replace("/", "").split(":"),
         "homooligomer": homooligomer,
         "homooligomers": [int(h) for h in homooligomer.split(":")],
         "msas": [], "deletion_matrices": []}

    # adjust homooligomer option
    if len(I["seqs"]) != len(I["homooligomers"]):
        if len(I["homooligomers"]) == 1:
            I["homooligomers"] = [I["homooligomers"][0]] * len(I["seqs"])
        else:
            if verbose:
                print("WARNING: Mismatch between number of breaks ':' in 'sequence' and 'homooligomer' definition")
            while len(I["seqs"]) > len(I["homooligomers"]):
                I["homooligomers"].append(1)
            I["homooligomers"] = I["homooligomers"][:len(I["seqs"])]
        I["homooligomer"] = ":".join([str(h) for h in I["homooligomers"]])

    # define full sequence being modelled
    I["full_sequence"] = ''.join([s * h for s, h in zip(I["seqs"], I["homooligomers"])])
    I["lengths"] = [len(seq) for seq in I["seqs"]]

    # prediction directory
    if output_dir is None:
        I["output_dir"] = 'prediction_' + jobname + '_' + cf.get_hash(I["full_sequence"])[:5]
    else:
        I["output_dir"] = output_dir
    os.makedirs(I["output_dir"], exist_ok=True)

    # delete existing files in working directory
    if clean:
        for f in os.listdir(I["output_dir"]):
            os.remove(os.path.join(I["output_dir"], f))

    if verbose and len(I["full_sequence"]) > 1400:
        print(
            f"WARNING: For a typical Google-Colab-GPU (16G) session, the max total length is ~1400 residues. You are at {len(I['full_sequence'])}!")
        print(
            f"Run Alphafold may crash, unless you trim to the protein(s) to a short length. (See trim options below).")

    if verbose:
        print(f"\nhomooligomer: {I['homooligomer']}")
        print(f"total_length: {len(I['full_sequence'])}")
        print(f"output_dir: {I['output_dir']}\n")

    return I



#######################################################################################################################################
# prep_msa
#######################################################################################################################################


[docs]
def run_jackhmmer(sequence, prefix, jackhmmer_binary_path='jackhmmer', verbose=True, use_ramdisk=False):
    """
    Runs the jackhmmer tool to search for homologous sequences in a protein sequence database.

    Args:
        sequence (str): The query protein sequence.
        prefix (str): The prefix for output files.
        jackhmmer_binary_path (str): Path to the jackhmmer binary executable. Default is 'jackhmmer'.
        verbose (bool): If True, prints progress and information during execution. Default is True.
        use_ramdisk (bool): If True, uses a RAM disk for temporary storage. Default is False.

    Returns:
        tuple: A tuple containing the MSAs, deletion matrices, and sequence names.
    """
    fasta_path = f"{prefix}.fasta"
    with open(fasta_path, 'wt') as f:
        f.write(f'>query\n{sequence}')

    pickled_msa_path = f"{prefix}.jackhmmer.pickle"
    if os.path.isfile(pickled_msa_path):
        msas_dict = pickle.load(open(pickled_msa_path, "rb"))
        msas, deletion_matrices, names = (msas_dict[k] for k in ['msas', 'deletion_matrices', 'names'])
        full_msa = []
        for msa in msas:
            full_msa += msa
    else:
        # --- Find the closest source ---
        test_url_pattern = 'https://storage.googleapis.com/alphafold-colab{:s}/latest/uniref90_2021_03.fasta.1'
        ex = futures.ThreadPoolExecutor(3)

        def fetch(source):
            request.urlretrieve(test_url_pattern.format(source))
            return source

        fs = [ex.submit(fetch, source) for source in ['', '-europe', '-asia']]
        source = None
        for f in futures.as_completed(fs):
            source = f.result()
            ex.shutdown()
            break

        dbs = []

        num_jackhmmer_chunks = {'uniref90': 59, 'smallbfd': 17, 'mgnify': 71}
        total_jackhmmer_chunks = sum(num_jackhmmer_chunks.values())
        disable_tqdm = not verbose
        with tqdm(total=total_jackhmmer_chunks, bar_format=TQDM_BAR_FORMAT, disable=disable_tqdm) as pbar:
            def jackhmmer_chunk_callback(i):
                pbar.update(n=1)

            pbar.set_description('Searching uniref90')
            jackhmmer_uniref90_runner = jackhmmer.Jackhmmer(
                binary_path=jackhmmer_binary_path,
                database_path=f'https://storage.googleapis.com/alphafold-colab{source}/latest/uniref90_2021_03.fasta',
                get_tblout=True,
                num_streamed_chunks=num_jackhmmer_chunks['uniref90'],
                streaming_callback=jackhmmer_chunk_callback,
                z_value=135301051,
                use_ramdisk=use_ramdisk)
            dbs.append(('uniref90', jackhmmer_uniref90_runner.query(fasta_path)))

            pbar.set_description('Searching smallbfd')
            jackhmmer_smallbfd_runner = jackhmmer.Jackhmmer(
                binary_path=jackhmmer_binary_path,
                database_path=f'https://storage.googleapis.com/alphafold-colab{source}/latest/bfd-first_non_consensus_sequences.fasta',
                get_tblout=True,
                num_streamed_chunks=num_jackhmmer_chunks['smallbfd'],
                streaming_callback=jackhmmer_chunk_callback,
                z_value=65984053,
                use_ramdisk=use_ramdisk)
            dbs.append(('smallbfd', jackhmmer_smallbfd_runner.query(fasta_path)))

            pbar.set_description('Searching mgnify')
            jackhmmer_mgnify_runner = jackhmmer.Jackhmmer(
                binary_path=jackhmmer_binary_path,
                database_path=f'https://storage.googleapis.com/alphafold-colab{source}/latest/mgy_clusters_2019_05.fasta',
                get_tblout=True,
                num_streamed_chunks=num_jackhmmer_chunks['mgnify'],
                streaming_callback=jackhmmer_chunk_callback,
                z_value=304820129,
                use_ramdisk=use_ramdisk)
            dbs.append(('mgnify', jackhmmer_mgnify_runner.query(fasta_path)))

        # --- Extract the MSAs and visualize ---
        # Extract the MSAs from the Stockholm files.
        # NB: deduplication happens later in pipeline.make_msa_features.

        mgnify_max_hits = 501
        msas = []
        deletion_matrices = []
        names = []
        for db_name, db_results in dbs:
            unsorted_results = []
            for i, result in enumerate(db_results):
                msa, deletion_matrix, target_names = parsers.parse_stockholm(result['sto'])
                e_values_dict = parsers.parse_e_values_from_tblout(result['tbl'])
                e_values = [e_values_dict[t.split('/')[0]] for t in target_names]
                zipped_results = zip(msa, deletion_matrix, target_names, e_values)
                if i != 0:
                    # Only take query from the first chunk
                    zipped_results = [x for x in zipped_results if x[2] != 'query']
                unsorted_results.extend(zipped_results)
            sorted_by_evalue = sorted(unsorted_results, key=lambda x: x[3])
            db_msas, db_deletion_matrices, db_names, _ = zip(*sorted_by_evalue)
            if db_msas:
                if db_name == 'mgnify':
                    db_msas = db_msas[:mgnify_max_hits]
                    db_deletion_matrices = db_deletion_matrices[:mgnify_max_hits]
                    db_names = db_names[:mgnify_max_hits]
                msas.append(db_msas)
                deletion_matrices.append(db_deletion_matrices)
                names.append(db_names)
                msa_size = len(set(db_msas))
                print(f'{msa_size} Sequences Found in {db_name}')

            pickle.dump({"msas": msas,
                         "deletion_matrices": deletion_matrices,
                         "names": names}, open(pickled_msa_path, "wb"))
    return msas, deletion_matrices, names




[docs]
def prep_msa(I, msa_method="mmseqs2", add_custom_msa=False, msa_format="fas",
             pair_mode="unpaired", pair_cov=50, pair_qid=20,
             hhfilter_loc="hhfilter", reformat_loc="reformat.pl", TMP_DIR="tmp",
             custom_msa=None, precomputed=None,
             mmseqs_host_url="https://a3m.mmseqs.com",
             verbose=True, use_ramdisk=False):
    """
    Prepares and processes MSAs for the given sequences using the specified MSA method.

    Args:
        I (dict): Dictionary containing input sequences and other parameters.
        msa_method (str): Method used to generate MSAs. Default is "mmseqs2".
        add_custom_msa (bool): Whether to add a custom MSA. Default is False.
        msa_format (str): The format of the MSA file. Default is "fas".
        pair_mode (str): Pairing mode for sequences. Can be "unpaired", "paired", or "unpaired+paired".
                         Default is "unpaired".
        pair_cov (int): Coverage threshold for pairing sequences. Default is 50.
        pair_qid (int): Identity threshold for pairing sequences. Default is 20.
        hhfilter_loc (str): Path to the hhfilter binary. Default is "hhfilter".
        reformat_loc (str): Path to the reformat.pl script. Default is "reformat.pl".
        TMP_DIR (str): Path to the temporary directory. Default is "tmp".
        custom_msa (str): Path to a custom MSA file (optional).
        precomputed (str): Path to a precomputed MSA file (optional).
        mmseqs_host_url (str): URL of the MMseqs2 server. Default is "https://a3m.mmseqs.com".
        verbose (bool): If True, prints progress and information during execution. Default is True.
        use_ramdisk (bool): If True, uses a RAM disk for temporary storage. Default is False.

    Returns:
        dict: The updated dictionary I containing the generated MSAs and deletion matrices.
    """
    # make temp directory
    os.makedirs(TMP_DIR, exist_ok=True)

    # clear previous inputs
    I["msas"] = []
    I["deletion_matrices"] = []

    _blank_seq = ["-" * L for L in I["lengths"]]
    _blank_mtx = [[0] * L for L in I["lengths"]]

    def _pad(ns, vals, mode):
        if mode == "seq": _blank = _blank_seq.copy()
        if mode == "mtx": _blank = _blank_mtx.copy()
        if isinstance(ns, list):
            for n, val in zip(ns, vals): _blank[n] = val
        else:
            _blank[ns] = vals
        if mode == "seq": return "".join(_blank)
        if mode == "mtx": return sum(_blank, [])

    if len(I["seqs"]) == 1 or "unpaired" in pair_mode:
        # gather msas
        for n, seq in enumerate(I["seqs"]):
            # tmp directory
            prefix = cf.get_hash(seq)
            prefix = os.path.join(TMP_DIR, prefix)

            print(f"Running jackhmmer")
            # run jackhmmer
            msas_, mtxs_, names_ = ([sum(x, ())] for x in run_jackhmmer(seq, prefix, use_ramdisk=use_ramdisk))

            # pad sequences
            for msa_, mtx_ in zip(msas_, mtxs_):
                msa, mtx = [I["sequence"]], [[0] * len(I["sequence"])]
                for s, m in zip(msa_, mtx_):
                    msa.append(_pad(n, s, "seq"))
                    mtx.append(_pad(n, m, "mtx"))

                I["msas"].append(msa)
                I["deletion_matrices"].append(mtx)

        # PAIR_MSA
        if len(I["seqs"]) > 1 and (pair_mode == "paired" or pair_mode == "unpaired+paired"):
            print("attempting to pair some sequences...")

            _data = []
            for a in range(len(I["seqs"])):
                print(f"prepping seq_{a}")
                _seq = I["seqs"][a]
                _prefix = os.path.join(TMP_DIR, cf.get_hash(_seq))

                _msas, _mtxs, _names = run_jackhmmer(_seq, _prefix, use_ramdisk=use_ramdisk)
                _msa, _mtx, _lab = pairmsa.get_uni_jackhmmer(_msas[0], _mtxs[0], _names[0],
                                                             filter_qid=pair_qid / 100,
                                                             filter_cov=pair_cov / 100)

                if len(_msa) > 1:
                    _data.append(pairmsa.hash_it(_msa, _lab, _mtx, call_uniprot=False))
                else:
                    _data.append(None)

            Ln = len(I["seqs"])
            O = [[None for _ in I["seqs"]] for _ in I["seqs"]]
            for a in range(Ln):
                if _data[a] is not None:
                    for b in range(a + 1, Ln):
                        if _data[b] is not None:
                            print(f"attempting pairwise stitch for {a} {b}")
                            O[a][b] = pairmsa._stitch(_data[a], _data[b])
                            _seq_a, _seq_b, _mtx_a, _mtx_b = (*O[a][b]["seq"], *O[a][b]["mtx"])

                            # filter to remove redundant sequences
                            ok = []
                            with open(f"{TMP_DIR}/tmp.fas", "w") as fas_file:
                                fas_file.writelines(
                                    [f">{n}\n{a + b}\n" for n, (a, b) in enumerate(zip(_seq_a, _seq_b))])
                            os.system(
                                f"{hhfilter_loc} -maxseq 1000000 -i {TMP_DIR}/tmp.fas -o {TMP_DIR}/tmp.id90.fas -id 90")
                            for line in open(f"{TMP_DIR}/tmp.id90.fas", "r"):
                                if line.startswith(">"): ok.append(int(line[1:]))

                            if verbose:
                                print(f"found {len(_seq_a)} pairs ({len(ok)} after filtering)")

                            if len(_seq_a) > 0:
                                msa, mtx = [I["sequence"]], [[0] * len(I["sequence"])]
                                for s_a, s_b, m_a, m_b in zip(_seq_a, _seq_b, _mtx_a, _mtx_b):
                                    msa.append(_pad([a, b], [s_a, s_b], "seq"))
                                    mtx.append(_pad([a, b], [m_a, m_b], "mtx"))
                                I["msas"].append(msa)
                                I["deletion_matrices"].append(mtx)

    # save MSA as pickle
    pickle.dump({"msas": I["msas"], "deletion_matrices": I["deletion_matrices"]},
                open(os.path.join(I["output_dir"], "msa.pickle"), "wb"))
    return I