Source code for fast_conformation.ensemble_analysis.rmsd

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from scipy.stats import gaussian_kde
from scipy.signal import find_peaks

import MDAnalysis as mda
from MDAnalysis.analysis import rms

from tqdm import tqdm
import pyqtgraph as pg
from pyqtgraph.Qt import QtCore

TQDM_BAR_FORMAT = '{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]'


[docs]
def calculate_rmsd(u: mda.Universe,
                   ref: mda.Universe = None,
                   align_range: str = "backbone",
                   analysis_range: str = "backbone") -> dict:
    """
    Calculate the Root Mean Square Deviation (RMSD) of a given molecular dynamics trajectory.

    Parameters:
    u (mda.Universe): The MDAnalysis Universe containing the trajectory to be analyzed.
    ref (mda.Universe): The reference Universe for RMSD calculation. If None, the first frame of `u` is used.
    align_range (str): The atom selection string for alignment of the trajectory (default is "backbone").
    analysis_range (str): The atom selection string for RMSD calculation (default is "backbone").

    Returns:
    dict: A dictionary containing the RMSD values for each frame with keys 'frame', `align_range`, and `analysis_range`.
    """
    # if reference is not supplied, use first frame
    if ref is None:
        ref = u.select_atoms('protein')
    else:
        ref_u = mda.Universe(ref)
        ref = ref_u.select_atoms('protein')

    r = mda.analysis.rms.RMSD(u, ref, select=align_range, groupselections=[analysis_range])
    r.run()

    rmsd = r.results.rmsd.T
    rmsd_dict = {'frame': rmsd[1], f'{align_range}': rmsd[2], f"{analysis_range}": rmsd[3]}

    return rmsd_dict




[docs]
def rmsd_kde(rmsd_data: list, input_dict: dict, widget) -> dict:
    """
    Perform Kernel Density Estimation (KDE) on RMSD data and identify the most distant modes.

    Parameters:
    rmsd_data (list): A list of RMSD values.
    input_dict (dict): A dictionary containing job-related metadata (jobname, max_seq, extra_seq, analysis range, etc.).
    widget (object): A widget object for interactive plotting.

    Returns:
    dict: A dictionary containing information about the identified modes, including their indices, values, and densities.
    """
    jobname = input_dict['jobname']
    max_seq = input_dict['max_seq']
    extra_seq = input_dict['extra_seq']
    rmsd_range = input_dict['analysis_range']
    align_range = input_dict['align_range']
    output_path = input_dict['output_path']

    # Calculate KDE
    kde = gaussian_kde(rmsd_data, bw_method='silverman')
    x_vals = np.linspace(min(rmsd_data), max(rmsd_data), 1000)
    kde_vals = kde(x_vals)

    # Find modes (peaks)
    peaks, _ = find_peaks(kde_vals)
    modes = x_vals[peaks]

    # Find the two most distant modes
    if len(modes) > 1:
        distances = np.abs(np.subtract.outer(modes, modes))
        np.fill_diagonal(distances, 0)
        max_dist_indices = np.unravel_index(np.argmax(distances), distances.shape)
        most_distant_modes = modes[list(max_dist_indices)]
    else:
        most_distant_modes = modes

    # Mark the most distant modes
    modes_dict = {}
    mode_n = 0
    for mode_density, mode in zip(kde_vals[peaks], modes):
        array = np.asarray(rmsd_data).flatten()  # Flatten the array to ensure it's 1-dimensional
        target_value = np.asarray(mode).item()  # Ensure target_value is a scalar

        mode_index = (np.abs(array - target_value)).argmin()
        mode_results = {'mode_index': mode_index,
                        'mode_value': mode,
                        'mode_density': mode_density}

        mode_n += 1

        modes_dict[f'mode_{mode_n}'] = mode_results
    if widget:
        plot_item = widget.add_plot(x_vals, kde_vals, title=f'{jobname} {max_seq} {extra_seq}', x_label='RMSD (Å)', y_label='Density', label='KDE')
        widget.add_scatter(plot_item, modes, kde_vals[peaks], label='Modes')
        
        for mode in most_distant_modes:
            lines=pg.InfiniteLine(pos=mode, angle=90, pen=pg.mkPen('b', style=QtCore.Qt.DashLine), label='Most Distant Mode')
            plot_item.addItem(lines)
        
    if output_path:
        # Plot KDE and mark modes
        plt.figure(figsize=(6, 3))
        plt.plot(x_vals, kde_vals, label='KDE')
        plt.scatter(modes, kde_vals[peaks], color='red', zorder=5, label='Modes')

        for mode in most_distant_modes:
            plt.axvline(mode, color='blue', linestyle='--', label='Most Distant Mode')

        plt.title(f'{jobname} {max_seq} {extra_seq}', fontsize=16)
        #plt.suptitle(f'{rmsd_range} after alignment to {align_range}', fontsize=10)
        plt.xlabel('RMSD (Å)', fontsize=14)
        plt.ylabel('Density', fontsize=14)
        plt.tick_params(axis='both', which='major', labelsize=12)  # Major ticks
        plt.tick_params(axis='both', which='minor', labelsize=12)  # Minor ticks (if any)
        plt.grid(False)

        # Show the plot
        plt.tight_layout()

        full_output_path = (f"{output_path}/"
                            f"{jobname}/"
                            f"analysis/"
                            f"rmsd_1d/"
                            f"{jobname}_"
                            f"{max_seq}_"
                            f"{extra_seq}_"
                            f"rmsd_1d_kde.png")

        plt.savefig(full_output_path, dpi=300)
        plt.close()

    return modes_dict




[docs]
def rmsd_mode_analysis(prediction_dicts, input_dict, ref1d, widget):
    """
    Perform 1D RMSD mode analysis for each prediction in the provided dictionary.

    Parameters:
    prediction_dicts (dict): A dictionary containing prediction data with associated MDAnalysis Universes.
    input_dict (dict): A dictionary containing job-related metadata (jobname, analysis range, alignment range, etc.).
    ref1d (str): Path to the reference PDB file for RMSD calculation.
    widget (object): A widget object for interactive plotting.

    Returns:
    dict: The updated prediction_dicts with calculated RMSD data and identified modes.
    """
    jobname = input_dict['jobname']
    rmsd_range = input_dict['analysis_range']
    align_range = input_dict['align_range']
    output_path = input_dict['output_path']

    print('')

    with tqdm(total=len(prediction_dicts), bar_format=TQDM_BAR_FORMAT) as pbar:
        for prediction in prediction_dicts:
            pbar.set_description(f'Running 1D RMSD analysis for {prediction}')
            universe = prediction_dicts[prediction]['mda_universe']
            max_seq = prediction_dicts[prediction]['max_seq']
            extra_seq = prediction_dicts[prediction]['extra_seq']
            input_dict['max_seq'] = max_seq
            input_dict['extra_seq'] = extra_seq

            prediction_dicts[prediction]['rmsd_data'] = calculate_rmsd(universe,
                                                                       ref1d,
                                                                       align_range=align_range,
                                                                       analysis_range=rmsd_range)

            rmsd_df = pd.DataFrame.from_dict(prediction_dicts[prediction]['rmsd_data'], orient='columns')

            full_output_path = (f"{output_path}/"
                                f"{jobname}/"
                                f"analysis/"
                                f"rmsd_1d/"
                                f"{jobname}_"
                                f"{max_seq}_"
                                f"{extra_seq}_"
                                f"rmsd_1d_df.csv")

            rmsd_df.to_csv(full_output_path, index=False)

            all_rmsd_modes = {}
            rmsd_modes = rmsd_kde(prediction_dicts[prediction]['rmsd_data'][rmsd_range], input_dict, widget)

            for mode in rmsd_modes:
                mode_index = rmsd_modes[mode]['mode_index']
                mode_value = int(rmsd_modes[mode]['mode_value'])
                frame_to_save = prediction_dicts[prediction]['rmsd_data']['frame'][mode_index]
                universe.trajectory[int(frame_to_save)]

                full_pdb_path = (f"{output_path}/"
                                 f"{jobname}/"
                                 f"analysis/"
                                 f"representative_structures/"
                                 f"rmsd_1d/"
                                 f"{jobname}_"
                                 f'{max_seq}_'
                                 f'{extra_seq}_'
                                 f'frame_'
                                 f'{mode_index}_'
                                 f'rmsd_vs_ref_'
                                 f'{mode_value}A.pdb')

                with mda.Writer(full_pdb_path, universe.atoms.n_atoms) as W:
                    W.write(universe.atoms)

                rmsd_modes[mode]['pdb_filename'] = full_pdb_path

                all_rmsd_modes[rmsd_range] = rmsd_modes

            prediction_dicts[prediction]['rmsd_data']['rmsd_modes'] = all_rmsd_modes
            pbar.update(n=1)

    return prediction_dicts




[docs]
def build_dataset_rmsd_modes(results_dict, input_dict):
    """
    Build a dataset from the RMSD mode analysis results and save it as a CSV file.

    Parameters:
    results_dict (dict): A dictionary containing the results of the RMSD mode analysis.
    input_dict (dict): A dictionary containing job-related metadata (jobname, analysis range, output path, etc.).

    Returns:
    None: The function saves the dataset as a CSV file in the specified output directory.
    """
    jobname = input_dict['jobname']
    rmsd_range = input_dict['analysis_range']
    output_path = input_dict['output_path']

    trials = []
    rmsd_range_labels = []
    mode_labels = []
    mode_indexes = []
    mode_densities = []
    mode_values = []
    pdb_filenames = []

    for result in results_dict:
        mode_data = results_dict[result]['rmsd_data']['rmsd_modes'][rmsd_range]
        for mode in mode_data:
            trials.append(result)
            rmsd_range_labels.append(rmsd_range)
            mode_labels.append(mode)
            mode_indexes.append(results_dict[result]['rmsd_data']['rmsd_modes'][rmsd_range][mode]['mode_index'])
            mode_densities.append(results_dict[result]['rmsd_data']['rmsd_modes'][rmsd_range][mode]['mode_density'])
            mode_values.append(results_dict[result]['rmsd_data']['rmsd_modes'][rmsd_range][mode]['mode_value'])
            pdb_filenames.append(results_dict[result]['rmsd_data']['rmsd_modes'][rmsd_range][mode]['pdb_filename'])

    df = pd.DataFrame({'mode_label': mode_labels,
                       'representative_Frame': mode_indexes,
                       'RMSD': mode_values,
                       'mode_peak_density': mode_densities,
                       'trial': trials,
                       'rmsd_range_label': rmsd_range_labels,
                       'pdb_filename': pdb_filenames
                       })

    full_output_path = (f"{output_path}/"
                        f"{jobname}/"
                        f"analysis/"
                        f"rmsd_1d/"
                        f"{jobname}_"
                        f"rmsd_1d_analysis_results.csv")

    print(f"\nSaving {jobname} RMSD_1D analysis results to {full_output_path}\n")

    print(f"Representative Structures for {jobname} RMSD 1D analysis saved at {output_path}/{jobname}/analysis/"
          f"representative_structures/rmsd_1d/\n")

    df.to_csv(full_output_path, index=False)