Source code for fast_conformation.mmseqs2_msa

import subprocess
import os
import argparse
import json
import shutil

from fast_conformation.msa_generation.msa_utils import read_fasta, save_dict_to_fasta, create_directory

[docs] def validate_inputs(fasta_path, output_path, jobname): """ Validate the inputs to ensure they are correct. Args: fasta_path (str): Path to the FASTA file containing the target sequence. output_path (str): Directory path to save the output results. jobname (str): Name of the job. Raises: ValueError: If the FASTA path is not a valid file, the output path is not a valid directory, or the jobname is not a string. """ if not os.path.isfile(fasta_path): raise ValueError(f"MSA path '{fasta_path}' is not a valid file.") if not os.path.isdir(output_path): raise ValueError(f"Output path '{output_path}' is not a valid directory.") if not isinstance(jobname, str): raise ValueError(f"Jobname '{jobname}' is not a valid string.")
[docs] def get_mmseqs_msa(sequence_file_path, output_path, jobname, env): """ Run mmseqs2 to generate the MSA (Multiple Sequence Alignment) for the target sequence. Args: sequence_file_path (str): Path to the file containing the target sequence. output_path (str): Directory path to save the output results. jobname (str): Name of the job. env (dict): Environment variables for the subprocess. Raises: RuntimeError: If the subprocess fails to execute. """ complete_output_path = f'{output_path}/{jobname}/msas/mmseqs2' command = (f"colabfold_batch {sequence_file_path} {complete_output_path}/temp_msa --msa-only") process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, env=env) while True: output = process.stdout.readline() if output == '' and process.poll() is not None: break if output: print(output.strip()) rc = process.poll() shutil.copy(f"{complete_output_path}/temp_msa/{jobname}.pickle", complete_output_path) shutil.copy(f"{complete_output_path}/temp_msa/{jobname}_coverage.png", complete_output_path) copy_msa_and_clean(f"{complete_output_path}/temp_msa/{jobname}.a3m", complete_output_path)
[docs] def load_config(config_file): """ Load the configuration from a JSON file. Args: config_file (str): Path to the configuration file. Returns: dict: Configuration dictionary with default values if the file is not found or if there's an error in reading the file. """ default_config = { 'fasta_path': None, 'output_path': './', 'jobname': 'prediction_run', 'seq_pairs': [[8, 16], [64, 128], [512, 1024]], 'seeds': 10, 'save_all': False, 'platform': 'cpu' } try: with open(config_file, 'r') as file: config = json.load(file) return config except FileNotFoundError: print(f"Configuration file {config_file} not found. Using default values.") return default_config except json.JSONDecodeError: print(f"Error decoding JSON from the configuration file {config_file}. Using default values.") return default_config
[docs] def save_config(config, file_path): """ Save the configuration to a JSON file. Args: config (dict): Configuration dictionary to save. file_path (str): Path to save the configuration file. """ with open(file_path, 'w') as file: json.dump(config, file, indent=4)
[docs] def remove_first_line(file_path): """ Remove the first line from a file. Args: file_path (str): Path to the file to modify. """ with open(file_path, 'r') as file: lines = file.readlines() with open(file_path, 'w') as file: file.writelines(lines[1:])
[docs] def remove_duplicate_sequences(input_file): """ Remove duplicate sequences from a FASTA file. Args: input_file (str): Path to the FASTA file containing sequences. """ with open(input_file, 'r') as f: lines = f.readlines() seq_dict = {} current_name = None counter = 0 for line in lines: if line.startswith('>'): current_name = line.strip() else: seq = line.strip() if seq not in seq_dict: seq_dict[seq] = current_name else: counter += 1 with open(input_file, 'w') as f: for seq, name in seq_dict.items(): f.write(f"{name}\n{seq}\n")
[docs] def copy_msa_and_clean(src_file_path, dest_dir_path): """ Clean up and copy the MSA file to the destination directory. Args: src_file_path (str): Path to the source MSA file. dest_dir_path (str): Path to the destination directory. Raises: OSError: If the source directory cannot be deleted. """ remove_first_line(src_file_path) remove_duplicate_sequences(src_file_path) # Copy the file to the destination directory shutil.copy(src_file_path, dest_dir_path) # Get the directory of the source file src_dir_path = os.path.dirname(src_file_path) # Delete the source directory and all its contents shutil.rmtree(src_dir_path)
[docs] def run_mmseqs2_msa(config): """ Run the full MSA pipeline using mmseqs2. Args: config (dict): Configuration dictionary containing necessary parameters. """ jobname = config['jobname'] sequence_path = config['sequence_path'] output_path = config['output_path'] use_ramdisk = config['use_ramdisk'] platform = 'cpu' os.environ['JAX_PLATFORMS'] = platform env = os.environ.copy() env["PATH"] += os.pathsep + "localcolabfold/colabfold-conda/bin" create_directory(f'{output_path}/{jobname}/msas/mmseqs2') create_directory(f'{output_path}/{jobname}/target_seq/') sequence_dict = read_fasta(sequence_path) save_dict_to_fasta(sequence_dict, output_path, jobname) sequence_name = list(sequence_dict.keys())[0] sequence_string = list(sequence_dict.values())[0] print(f"\nRunning mmseqs2 MSA building for target sequence at {sequence_path}\n") print(f"{sequence_name}: \n{sequence_string}\n") print("Configurations:") print("***************************************************************") print(f"Job Name: {jobname}") print(f"Sequence File path: {sequence_path}") print(f"Output Path: {output_path}") print("***************************************************************") # Load target sequence single_sequence_path = f'{output_path}/{jobname}/target_seq/{jobname}.fasta' # Get mmseqs2 MSA get_mmseqs_msa(single_sequence_path, output_path, jobname, env) print(f'\nSaved {jobname} mmseqs2 MSA to {output_path}/{jobname}/msas/mmseqs2/{jobname}.a3m\n')
[docs] def main(): """ Main function to parse command-line arguments and execute the MSA pipeline. """ parser = argparse.ArgumentParser(description="Assemble MSA for target sequence with mmseqs2") parser.add_argument('--config_file', type=str, help="Path to the configuration file") parser.add_argument('--jobname', type=str, help="The job name") parser.add_argument('--sequence_path', type=str, help="Path to a .fasta file containing the target sequence for MSA building") parser.add_argument('--output_path', type=str, help="Path to save results to") args = parser.parse_args() # Load configuration from file if provided config_file = args.config_file if args.config_file else 'config.json' config = load_config(config_file) # Override config with command line arguments if provided config['jobname'] = args.jobname if args.jobname else config.get('jobname') config['sequence_path'] = args.sequence_path if args.sequence_path else config.get('sequence_path') config['output_path'] = args.output_path if args.output_path else config.get('output_path') config['use_ramdisk'] = True # or set this based on your needs or add as an argument run_mmseqs2_msa(config)
if __name__ == "__main__": main()