From 56bbfac91dfc389cdd23714a622e2dab6bfe653f Mon Sep 17 00:00:00 2001
From: finlayclark <s1629668@ed.ac.uk>
Date: Fri, 26 Jul 2024 16:11:36 +0100
Subject: [PATCH] Add reanalysis of Alibay results

---
 Makefile                       |   2 +-
 README.md                      |   5 +-
 analysis/alibay/analysis.ipynb | 269 +++++++++++++++++++++++++++++++++
 3 files changed, 273 insertions(+), 3 deletions(-)
 create mode 100644 analysis/alibay/analysis.ipynb

diff --git a/Makefile b/Makefile
index 26e7bc3..b2602b8 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 PACKAGE_NAME  := a3fe_reproduce
 CONDA_ENV_RUN := conda run --no-capture-output --name $(PACKAGE_NAME)
 
-ANALYSIS_DIRS := $(wildcard analysis/*)
+ANALYSIS_DIRS := $(filter-out analysis/alibay, $(wildcard analysis/*))
 ANALYSIS_NBS := $(shell find $(ANALYSIS_DIRS) -name '*analysis.ipynb')
 ANALYSIS_OUTPUT_DIRS := $(ANALYSIS_DIRS:%=%/final_analysis)
 
diff --git a/README.md b/README.md
index ccf8088..47ac036 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,8 @@ Inputs and code to reproduce the results and analysis from "Automated Adaptive A
 │   ├── adaptive
 │   ├── cyclod
 │   ├── lambda_window_spacing
-│   └── non_adaptive
+│   ├── non_adaptive
+|   └── alibay
 └── simulations
     ├── cyclod
     ├── initial_systems
@@ -55,7 +56,7 @@ to an [issue with box vectors](https://github.com/OpenBioSim/sire/issues/49). He
 
 ## To Reproduce Analyses
 
-The `analysis` directory contains the code and inputs required to perform analyses not already performed by `a3fe` by default (when `calc.analyse()` or `calc.analyse_convergence()` is run). Each sub-directory contains two notebooks - "analysis" notebooks use the pre-provided data in the `final_analysis` directories to generate the figures from the paper. These can be run without changes. The "preprocessing" notebooks show examples of the computationally-intensive analyses run on the simulation outputs (not included due to size) to generate the data which which is used in the "analysis" notebooks. These will not work without changes - they are intended to be adapted to run on outputs generated by the user.
+The `analysis` directory contains the code and inputs required to perform analyses not already performed by `a3fe` by default (when `calc.analyse()` or `calc.analyse_convergence()` is run). Each sub-directory contains two notebooks - "analysis" notebooks use the pre-provided data in the `final_analysis` directories to generate the figures from the paper. These can be run without changes. The "preprocessing" notebooks show examples of the computationally-intensive analyses run on the simulation outputs (not included due to size) to generate the data which which is used in the "analysis" notebooks. These will not work without changes - they are intended to be adapted to run on outputs generated by the user. Note that `make analysis` will skip the `alibay` directory to avoid downloading large files from Zenodo.
 
 To rerun the "analysis" notebooks and regenerate the plots:
 
diff --git a/analysis/alibay/analysis.ipynb b/analysis/alibay/analysis.ipynb
new file mode 100644
index 0000000..6485484
--- /dev/null
+++ b/analysis/alibay/analysis.ipynb
@@ -0,0 +1,269 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analysis of Alibay ABFE Results\n",
+    "\n",
+    "In [our manuscript](https://doi.org/10.26434/chemrxiv-2024-3ft7f), we point out that most component simulations of the ABFE calculations are not strictly converged, because the distributions of perturbed energies sampled by replicate runs are significantly different. To illustrate that this issue is wide-spread, we analyse gradients from ABFE calculations from [Alibay et al.'s work](https://www.nature.com/articles/s42004-022-00721-4). We arbitrarily choose to analyse the Cyclphilin-D data. We are grateful to Alibay et al. for providing the raw gradient data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import a3fe as a3\n",
+    "import os\n",
+    "from pathlib import Path\n",
+    "import matplotlib.pyplot as plt\n",
+    "from scipy.stats import kruskal\n",
+    "from pymbar.timeseries import statisticalInefficiency\n",
+    "from tqdm import tqdm\n",
+    "import numpy as np\n",
+    "plt.style.use(\"seaborn-v0_8-colorblind\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cyclophilin D: Data Extraction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get the data from zenodo\n",
+    "\n",
+    "! wget https://zenodo.org/records/5906019/files/complex.zip\\?download\\=1\n",
+    "! wget https://zenodo.org/records/5906019/files/ligand.zip\\?download\\=1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Unzip the data\n",
+    "\n",
+    "! mv complex.zip\\?download\\=1.1 complex.zip && unzip complex.zip\n",
+    "! mv ligand.zip\\?download\\=1 ligand.zip && unzip ligand.zip"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Base directory for 'complex' and 'ligand'\n",
+    "base_dir = Path('.')\n",
+    "\n",
+    "xvg_paths = {}\n",
+    "for leg in ['complex', 'ligand']:\n",
+    "    leg_path = base_dir / leg\n",
+    "    if not leg_path.exists():\n",
+    "        print(f\"Directory {leg_path} does not exist.\")\n",
+    "        continue\n",
+    "\n",
+    "    xvg_paths[leg] = {}\n",
+    "    for lig_path in leg_path.iterdir():\n",
+    "        if not lig_path.is_dir():\n",
+    "            continue  # Skip files, only process directories\n",
+    "\n",
+    "        lig = lig_path.name\n",
+    "        xvg_paths[leg][lig] = {}\n",
+    "        for run in range(1, 6):\n",
+    "            run_path = lig_path / f\"run{run}\"\n",
+    "            xvg_paths[leg][lig][run] = {}\n",
+    "            for stage in [\"restraints\", \"coul\", \"vdw\"]:\n",
+    "                if leg == 'ligand' and stage == 'restraints':\n",
+    "                    continue  # No restraint stage for ligand\n",
+    "                stage_path = run_path / f\"{stage}-xvg\"\n",
+    "                if not stage_path.exists():\n",
+    "                    print(f\"Directory {stage_path} does not exist.\")\n",
+    "                    continue\n",
+    "\n",
+    "                xvg_paths[leg][lig][run][stage] = {}\n",
+    "                for xvg_file in stage_path.iterdir():\n",
+    "                    if xvg_file.is_file() and xvg_file.suffix == '.xvg':\n",
+    "                        lam = xvg_file.stem.split(\".\")[1]\n",
+    "                        xvg_paths[leg][lig][run][stage][lam] = xvg_file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_grads(xvg_path: Path) -> list[float]:\n",
+    "    lines = xvg_path.read_text().splitlines()\n",
+    "    filtered_lines = [line for line in lines if not line.startswith((\"#\", \"@\"))]\n",
+    "    # The gradients are the second column\n",
+    "    return [float(line.split()[1]) for line in filtered_lines]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# For one example, check that we have the expected number of gradient data points\n",
+    "example_xvg_path = xvg_paths['complex']['ligand-27'][1]['coul']['0']\n",
+    "example_xvgs = read_grads(example_xvg_path)\n",
+    "\n",
+    "NRG_FREQ = 100\n",
+    "TIMESTEP = 4E-6 # ns\n",
+    "\n",
+    "print(f\"Total time: {(len(example_xvgs) - 1) * NRG_FREQ * TIMESTEP} ns\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Now, get the data and subsample with pymbar timeseries\n",
+    "\n",
+    "grads_subsampled = {}\n",
+    "for leg in ['complex', 'ligand']:\n",
+    "    grads_subsampled[leg] = {}\n",
+    "    for lig in tqdm(xvg_paths[leg], desc=leg):\n",
+    "        grads_subsampled[leg][lig] = {}\n",
+    "        for run in xvg_paths[leg][lig]:\n",
+    "            grads_subsampled[leg][lig][run] = {}\n",
+    "            for stage in xvg_paths[leg][lig][run]:\n",
+    "                grads_subsampled[leg][lig][run][stage] = {}\n",
+    "                for lam in xvg_paths[leg][lig][run][stage]:\n",
+    "                    xvg_path = xvg_paths[leg][lig][run][stage][lam]\n",
+    "                    grads = read_grads(xvg_path)\n",
+    "                    g = statisticalInefficiency(grads)\n",
+    "                    grads_subsampled[leg][lig][run][stage][lam] = grads[::round(g)]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cyclophilin D: Data Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_sig_diff_grads(grads_subsampled: dict, leg: str, lig: str, stage: str) -> tuple[float, float]:\n",
+    "    \"\"\"\n",
+    "    Calculate the percentage of lambda windows where the gradient distributions\n",
+    "    are significantly different, using the Kruskal-Wallis test\n",
+    "    \"\"\"\n",
+    "    n_lam = len(grads_subsampled[leg][lig][1][stage])\n",
+    "    n_sig_diff = 0\n",
+    "\n",
+    "    for i in range(n_lam):\n",
+    "        gradients = [grads_subsampled[leg][lig][run][stage][str(i)] for run in range(1, 6)]\n",
+    "        _, p = kruskal(*gradients)\n",
+    "        if p < 0.05:\n",
+    "            n_sig_diff += 1\n",
+    "\n",
+    "    return n_lam, n_sig_diff"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get a dictionary with the percentage of significantly different gradients\n",
+    "sig_diff_grads = {}\n",
+    "for lig in grads_subsampled[\"complex\"]:\n",
+    "    sig_diff_grads[lig] = {}\n",
+    "    for leg in ['complex', 'ligand']:\n",
+    "        sig_diff_grads[lig][leg] = {}\n",
+    "        for stage in grads_subsampled[leg][lig][1]:\n",
+    "            sig_diff_grads[lig][leg][stage] = get_sig_diff_grads(grads_subsampled, leg, lig, stage)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plot the percentage of lambda windows where the gradients are significantly different all on one bar plot\n",
+    "fig, ax = plt.subplots(figsize=(12, 4), dpi=300)\n",
+    "x = np.arange(len(sig_diff_grads))\n",
+    "width = 0.2\n",
+    "stage_name_map = {\"restraints\": \"Restrain\", \"coul\": \"Discharge\", \"vdw\": \"Vanish\"}\n",
+    "ligand_labels = [lig_name.replace(\"-\", \" \") for lig_name in sig_diff_grads]\n",
+    "\n",
+    "# Plot complex and ligand next to each other\n",
+    "for i, stage in enumerate(['restraints', 'coul', 'vdw']):\n",
+    "    color = ax._get_lines.get_next_color()\n",
+    "    stage_name = stage_name_map[stage]\n",
+    "\n",
+    "    # Bound leg\n",
+    "    y_complex = [100 * (sig_diff_grads[lig][\"complex\"][stage][1] / sig_diff_grads[lig][\"complex\"][stage][0]) for lig in sig_diff_grads]\n",
+    "    ax.bar(x + i * width, y_complex, width, label=f\"Bound {stage_name}\", edgecolor=\"k\", alpha=1, color=color)\n",
+    "\n",
+    "    # Free leg\n",
+    "    if stage != 'restraints':\n",
+    "        y_lig = [100 * (sig_diff_grads[lig][\"ligand\"][stage][1] / sig_diff_grads[lig][\"ligand\"][stage][0]) for lig in sig_diff_grads]\n",
+    "        ax.bar(x + i * width, y_lig, width, label=f\"Free {stage_name}\", edgecolor=\"k\", alpha=0.8, color=color, hatch=\"///////\")\n",
+    "\n",
+    "# Set x ticks but rotate them 90\n",
+    "ax.set_xticks(x + width)\n",
+    "ax.set_xticklabels(ligand_labels, rotation=90)\n",
+    "ax.set_ylabel(\"% Windows with Significant\\n Inter-run Differences Between\\n Gradient Distributions\")\n",
+    "# Put label off to right of plot\n",
+    "ax.legend(bbox_to_anchor=(1.03, 0.7))\n",
+    "fig.tight_layout(pad=-2)\n",
+    "fig.savefig(\"final_analysis/gradient_sig_diffs_cyclod_alibay.png\", dpi=300, bbox_inches=\"tight\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove all the downloaded data\n",
+    "\n",
+    "! rm -r complex ligand complex.zip ligand.zip"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "a3fe_reproduce",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}