Source code for fast_conformation.jackhmmer_msa

import os
import argparse
import json
import pickle
from fast_conformation.msa_generation import get_msa_jackhmmer
from fast_conformation.msa_generation.msa_utils import create_ram_disk, read_fasta, create_directory, save_dict_to_fasta
import time


[docs]
def load_config(file_path):
    """
    Load the configuration file in JSON format.

    Args:
        file_path (str): Path to the configuration file.

    Returns:
        dict: Configuration dictionary with default values if the file is not found or if there's an error in reading the file.
    """
    default_config = {
        'sequences_file': None,
        'output_path': './',
        'jobname': "prediction_run",
        'homooligomers': 1,
        'tmp_dir': "./tmp",
        'use_ramdisk': False
    }

    try:
        with open(file_path, 'r') as file:
            config = json.load(file)
        return config
    except FileNotFoundError:
        print(f"Configuration file {file_path} not found. Using default values.")
        return default_config
    except json.JSONDecodeError:
        print(f"Error decoding JSON from the configuration file {file_path}. Using default values.")
        return default_config




[docs]
def reformat_sequences(input_msa):
    """
    Reformat sequences from MSA for use in colabfold_batch.

    Args:
        input_msa (list): List of sequences from the MSA.

    Returns:
        list: Formatted sequences in FASTA format.
    """
    formatted_sequences = []
    for idx, sequence in enumerate(input_msa[0]):
        formatted_sequence = f">sequence_{idx + 1}\n{sequence}\n"
        formatted_sequences.append(formatted_sequence)
    return formatted_sequences




[docs]
def convert_msa(filename):
    """
    Convert the MSA pickle file to a formatted sequence list.

    Args:
        filename (str): Path to the MSA pickle file.

    Returns:
        list: Converted and formatted sequences from the MSA.
    """
    with open(filename, 'rb') as f:
        msa_data = pickle.load(f)

    msa_list = msa_data['msas']
    converted_msa = reformat_sequences(msa_list)
    return converted_msa




[docs]
def save_formatted_sequences_to_file(formatted_sequences, output_file):
    """
    Save the formatted sequences to a file.

    Args:
        formatted_sequences (list): List of formatted sequences.
        output_file (str): Path to save the formatted sequences.
    """
    with open(f'{output_file}', 'w') as f:
        for sequence in formatted_sequences:
            f.write(sequence)




[docs]
def prepare_os():
    """
    Prepare the operating system environment by creating a RAM disk.
    """
    create_ram_disk()




[docs]
def build_msa(sequence, jobname, complete_output_dir, homooligomer, tmp_dir, use_ramdisk):
    """
    Build the MSA using jackhmmer for the target sequence.

    Args:
        sequence (str): The target sequence for MSA.
        jobname (str): The job name.
        complete_output_dir (str): The output directory path.
        homooligomer (int): The number of copies of the protein.
        tmp_dir (str): Temporary directory path.
        use_ramdisk (bool): Whether to use a RAM disk for the process.
    """
    prepped_msa = get_msa_jackhmmer.prep_inputs(sequence, jobname, homooligomer,  output_dir=complete_output_dir)

    get_msa_jackhmmer.prep_msa(prepped_msa, msa_method='jackhmmer', add_custom_msa=False, msa_format="fas",
                               pair_mode="unpaired", pair_cov=50, pair_qid=20, TMP_DIR=tmp_dir, use_ramdisk=use_ramdisk)




[docs]
def main():
    """
    Main function that handles command-line arguments and initiates the MSA building process.
    """
    parser = argparse.ArgumentParser(description="Assemble MSA for target sequence with jackhmmer")
    parser.add_argument('--config_file', type=str, help="Path to the configuration file")
    parser.add_argument('--jobname', type=str, help="The job name")
    parser.add_argument('--sequence_path', type=str, help="Path to a .fasta file containing the target sequence for MSA building")
    parser.add_argument('--output_path', type=str, help="Path to save results to")
    parser.add_argument('--homooligomers', type=int, help="Number of copies of the protein")
    parser.add_argument('--use_ramdisk', type=bool, help="Mounts a ramdisk for dramatically speeding up jackhmmer search (requires root access)")

    args = parser.parse_args()

    # Load configuration from file if provided
    config_file = args.config_file if args.config_file else 'config.json'
    config = load_config(config_file)

    # Override config with command line arguments if provided
    config['jobname'] = args.jobname if args.jobname else config.get('jobname')
    config['sequence_path'] = args.sequence_path if args.sequence_path else config.get('sequence_path')
    config['output_path'] = args.output_path if args.output_path else config.get('output_path')
    config['homooligomers'] = args.homooligomers if args.homooligomers else config.get('homooligomers')
    config['use_ramdisk'] = args.use_ramdisk if args.use_ramdisk else config.get('use_ramdisk')
    config['tmp_dir'] = "tmp"

    if config['sequence_path'] is None:
        raise ValueError("Sequences file must be provided via --sequences_file or in the config file.")
    if not os.path.exists(config['sequence_path']):
        raise FileNotFoundError(f"Sequences file {config['sequence_path']} not found.")
    if not os.path.isdir(config['output_path']):
        raise NotADirectoryError(f"Output path {config['output_path']} is not a directory.")
    if not isinstance(config['homooligomers'], int) or config['homooligomers'] <= 0:
        raise ValueError("Homooligomers must be a positive integer.")

    build_jackhmmer_msa(config)




[docs]
def build_jackhmmer_msa(config):
    """
    Build the MSA using jackhmmer for the target sequence based on the provided configuration.

    Args:
        config (dict): Configuration dictionary containing all necessary parameters.
    """
    print("starting")
    create_directory(f'{config["output_path"]}/{config["jobname"]}/msas/jackhmmer')
    create_directory(f'{config["output_path"]}/{config["jobname"]}/target_seq/')
    print("directories created")
    sequence_dict = read_fasta(config['sequence_path'])
    save_dict_to_fasta(sequence_dict, config['output_path'], config['jobname'])

    sequence_name = list(sequence_dict.keys())[0]
    config['sequence_string'] = list(sequence_dict.values())[0]

    print(f"\nRunning jackhmmer MSA building for target sequence at {config['sequence_path']}\n")

    print(f"{sequence_name}: \n{config['sequence_string']}\n")

    print("Configurations:")
    print("***************************************************************")
    print(f"Job Name: {config['jobname']}")
    print(f"Sequence File Path: {config['sequence_path']}")
    print(f"Output Path: {config['output_path']}")
    print(f"Homooligomers: {config['homooligomers']}")
    print(f"Use ramdisk? {config['use_ramdisk']}")
    print("***************************************************************\n")

    # optionally create ramdisk
    if config["use_ramdisk"]:
        prepare_os()
    # sets output directory for MSA
    complete_output_dir = f"{config['output_path']}/{config['jobname']}/msas/jackhmmer/"
    time.sleep(3)
    # builds jackhmmer MSA from target sequence
    build_msa(config["sequence_string"], config["jobname"], complete_output_dir, config["homooligomers"], tmp_dir=config["tmp_dir"], use_ramdisk=config["use_ramdisk"])

    # reformats MSA to something colabfold_batch can use
    converted_msa = convert_msa(f"{complete_output_dir}/msa.pickle")
    save_formatted_sequences_to_file(converted_msa, f"{complete_output_dir}/{config['jobname']}.a3m")

    print(f'\nSaved {config["jobname"]} jackhmmer MSA to {complete_output_dir}/{config["jobname"]}.a3m\n')



if __name__ == "__main__":
    main()