Source code for fast_conformation.ensemble_analysis.analysis_utils

import json
import os
import shutil
import subprocess
import pandas as pd
from glob import glob
import MDAnalysis as mda
from MDAnalysis.analysis import align

[docs] def parabola(x, a, b, c): """ Calculate the value of a parabola given the coefficients. Parameters: x (float or array-like): The independent variable. a (float): Coefficient for the quadratic term. b (float): Coefficient for the linear term. c (float): Constant term. Returns: float or array-like: The value of the parabola at x. """ return a * x**2 + b * x + c
[docs] def create_directory(path): """ Create a directory at the specified path. If the directory already exists, it is removed and recreated. Parameters: path (str): The path to the directory. Raises: OSError: If the directory cannot be created. """ if os.path.exists(path): shutil.rmtree(path) os.makedirs(path)
[docs] def load_config(config_file): """ Load a JSON configuration file. Parameters: config_file (str): The path to the configuration file. Returns: dict: The configuration as a dictionary. Raises: FileNotFoundError: If the file does not exist. json.JSONDecodeError: If the file is not a valid JSON. """ try: with open(config_file, 'r') as file: config = json.load(file) return config except FileNotFoundError: print(f"File {config_file} not found.") except json.JSONDecodeError: print(f"Error decoding JSON from the file {config_file}.")
[docs] def load_frames(file_list): """ Load a list of molecular dynamics files as MDAnalysis Universes. Parameters: file_list (list): A list of file paths to load. Returns: list: A list of MDAnalysis Universes. """ universes = [mda.Universe(f) for f in file_list] return universes
[docs] def load_pdb_files_as_universe(folder_path, reindex): """ Load all PDB files in the specified folder as a Universe, using the first PDB file as the topology. Parameters: folder_path (str): Path to the folder containing PDB files. reindex (int or None): If provided, reindex the PDB files so the first residue matches this index. Returns: MDAnalysis.Universe: The loaded Universe. Raises: FileNotFoundError: If no PDB files are found in the folder. """ pdb_files = sorted(glob(os.path.join(folder_path, '*.pdb'))) if not pdb_files: raise FileNotFoundError("No PDB files found in the specified folder") topology = pdb_files[0] if reindex: temp_pdb_path = os.path.join(folder_path, 'temp.pdb') command = f"pdb_reres -{reindex} \"{topology}\" > \"{temp_pdb_path}\"" subprocess.run(command, shell=True, check=True) topology = temp_pdb_path u = mda.Universe(topology, *pdb_files, dt=1) if reindex: os.remove(temp_pdb_path) print(f"Loaded {len(u.trajectory)} predictions from {folder_path}") return u
[docs] def load_predictions(predictions_path, seq_pairs, jobname, starting_residue): """ Load predictions from a set of PDB files as MDAnalysis Universes. Parameters: predictions_path (str): Path to the directory containing the predictions. seq_pairs (list): A list of sequence pairs for the predictions. jobname (str): The job name associated with the predictions. starting_residue (int or None): The starting residue index for reindexing. Returns: dict: A dictionary of Universes and associated metadata. """ predictions_dict = {} for seq_pair in seq_pairs: max_seq, extra_seq = seq_pair folder_pattern = os.path.join(predictions_path, f"*{max_seq}_{extra_seq}*") # Find all folders that match the pattern matching_folders = [f for f in glob(folder_pattern) if os.path.isdir(f)] # Check if any matching folders are found if not matching_folders: print(f"Warning: No folders found for sequence pair '{max_seq}_{extra_seq}'. Skipping...") continue for matching_folder in matching_folders: # Load the found PDB files as MDAnalysis Universes universe = load_pdb_files_as_universe(matching_folder, starting_residue) params = {'max_seq': max_seq, 'extra_seq': extra_seq, 'jobname': jobname, 'mda_universe': universe} predictions_dict[f'{jobname}_{max_seq}_{extra_seq}'] = params return predictions_dict
[docs] def load_predictions_json(predictions_path, seq_pairs, jobname): """ Load pLDDT scores from JSON files associated with predictions. Parameters: predictions_path (str): Path to the directory containing the predictions. seq_pairs (list): A list of sequence pairs for the predictions. jobname (str): The job name associated with the predictions. Returns: dict: A dictionary of pLDDT scores for each prediction. """ plddt_dict = {} for seq_pair in seq_pairs: max_seq, extra_seq = seq_pair folder_pattern = os.path.join(predictions_path, f"*{max_seq}_{extra_seq}*") # Find all folders that match the pattern matching_folders = [f for f in glob(folder_pattern) if os.path.isdir(f)] # Check if any matching folders are found if not matching_folders: print(f"Warning: No folders found for sequence pair '{max_seq}_{extra_seq}'. Skipping...") continue all_plddts = [] for folder_path in matching_folders: # Search for JSON files containing max_seq_extra_seq within the matching folder json_files = glob(f"{folder_path}/*.json") # Check if any matching files are found if not json_files: print(f"Warning: No JSON files containing '{max_seq}_{extra_seq}' found in folder '{folder_path}'. Skipping...") continue for json_file in json_files: # Load pLDDT data from the JSON file plddt_data = load_config(json_file) per_residue_plddt = plddt_data.get('plddt') if per_residue_plddt: all_plddts.append(per_residue_plddt) if all_plddts: # Only add to dictionary if there are valid pLDDT values partial_plddt_dict = { 'max_seq': max_seq, 'extra_seq': extra_seq, 'jobname': jobname, 'all_plddts': all_plddts } plddt_dict[f'{jobname}_{max_seq}_{extra_seq}'] = partial_plddt_dict else: print(f"Warning: No valid pLDDT data found for sequence pair '{max_seq}_{extra_seq}'. Skipping...") return plddt_dict
[docs] def reorder_frames_by(frames, values): """ Reorder trajectory frames based on associated values (e.g., RMSD). Parameters: frames (MDAnalysis.Universe): The Universe containing the trajectory frames to reorder. values (list): A list of values associated with each frame. Returns: MDAnalysis.Universe: A new Universe with frames reordered by the associated values. """ top = frames._topology frame_value_pairs = list(enumerate(values)) sorted_values = sorted(frame_value_pairs, key=lambda x: x[1]) sorted_indices = [idx for idx, _ in sorted_values] traj_temp = [] create_directory('tmp_frames') for idx, frame in enumerate(frames.trajectory[sorted_indices]): frame_name = f"tmp_frames/tmp_{idx}.pdb" ag = frames.select_atoms('all') ag.write(frame_name) traj_temp.append(frame_name) traj = mda.Universe(top, traj_temp, dt=1) align.AlignTraj(traj, traj, select='backbone', ref_Frame=0, in_memory=True).run() return traj
[docs] def save_traj(universe, traj_output_path, jobname, max_seq, extra_seq, traj_format, ordered): """ Save the trajectory of a Universe to a file. Parameters: universe (MDAnalysis.Universe): The Universe containing the trajectory to save. traj_output_path (str): The directory where the trajectory file will be saved. jobname (str): The job name associated with the trajectory. max_seq (str): The maximum sequence associated with the trajectory. extra_seq (str): The extra sequence associated with the trajectory. traj_format (str): The format to save the trajectory in (e.g., 'pdb'). ordered (str): A description of the ordering of the frames (default is 'raw'). Returns: None """ ag = universe.select_atoms('all') if not ordered: ordered = 'raw' ag.write(f"{traj_output_path}/{jobname}_{max_seq}_{extra_seq}_ordered_by_{ordered}.{traj_format}", frames='all') shutil.rmtree('tmp_frames')
[docs] def auto_select_2d_references(references_dataset_path, analysis_type): """ Automatically select two 2D references based on the most distant mode representatives. Parameters: references_dataset_path (str): Path to the CSV file containing reference data. analysis_type (str): The type of analysis (e.g., 'tmscore', 'rmsd') to use for mode selection. Returns: tuple: Paths to the two selected reference structures. """ df_references = pd.read_csv(references_dataset_path) unique_trials = df_references['trial'].unique() all_modes_diff = {} for trial in unique_trials: subset = df_references[df_references['trial'] == trial] modes_subset = list(subset[analysis_type]) modes_diff_partial = {} if len(modes_subset) > 1: max_difference = 0 index1 = 0 index2 = 0 for i in range(len(modes_subset)): for j in range(i + 1, len(modes_subset)): difference = abs(modes_subset[i] - modes_subset[j]) if difference > max_difference: max_difference = difference index1 = i index2 = j modes_diff_partial = {"trial": trial, "max_difference": max_difference, "index1": index1, "index2": index2} else: modes_diff_partial = {"trial": trial, "max_difference": None, "index1": None, "index2": None} all_modes_diff[trial] = modes_diff_partial max_diff_key = None max_diff_value = float('-inf') for key, value in all_modes_diff.items(): if value['max_difference'] is not None and value['max_difference'] > max_diff_value: max_diff_value = value['max_difference'] max_diff_key = key ref1_path = df_references.loc[df_references['trial'] == max_diff_key, 'pdb_filename'].values[all_modes_diff[max_diff_key]['index1']] ref2_path = df_references.loc[df_references['trial'] == max_diff_key, 'pdb_filename'].values[all_modes_diff[max_diff_key]['index2']] return ref1_path, ref2_path