Class Project

For the final class project, I worked on using transformers and LSTM to generate and predict the effect of mutations on Flu H5 antigenic escape.
RicardoRH96 · Dec 25, 2024 · fe3afd7 · fe3afd7
1 parent 32ed5db
commit fe3afd7
Show file tree

Hide file tree

Showing 6 changed files with 6,796 additions and 0 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/FluEscaping_RNN+Transformers/FluEscaping.pdf b/FluEscaping_RNN+Transformers/FluEscaping.pdf
diff --git a/FluEscaping_RNN+Transformers/Flu_LSTM.ipynb b/FluEscaping_RNN+Transformers/Flu_LSTM.ipynb
diff --git a/FluEscaping_RNN+Transformers/Flu_Transformer+Visualization.ipynb b/FluEscaping_RNN+Transformers/Flu_Transformer+Visualization.ipynb
diff --git a/FluEscaping_RNN+Transformers/ProProcess.ipynb b/FluEscaping_RNN+Transformers/ProProcess.ipynb
@@ -0,0 +1,238 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import subprocess\n",
+    "import sys\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "nthread = 0\n",
+      "nthreadpair = 0\n",
+      "nthreadtb = 0\n",
+      "ppenalty_ex = 0\n",
+      "stacksize: 8176 kb\n",
+      "rescale = 1\n",
+      "Gap Penalty = -1.53, +0.00, +0.00\n",
+      "\n",
+      "\n",
+      "\n",
+      "Making a distance matrix ..\n",
+      "\n",
+      "There are 4298 ambiguous characters.\n",
+      "  501 / 591\n",
+      "done.\n",
+      "\n",
+      "Constructing a UPGMA tree (efffree=0) ... \n",
+      "  580 / 591\n",
+      "done.\n",
+      "\n",
+      "Progressive alignment 1/2... \n",
+      "STEP   386 / 590 \n",
+      "Reallocating..done. *alloclen = 2154\n",
+      "STEP   501 / 590 \n",
+      "done.\n",
+      "\n",
+      "Making a distance matrix from msa.. \n",
+      "  500 / 591\n",
+      "done.\n",
+      "\n",
+      "Constructing a UPGMA tree (efffree=1) ... \n",
+      "  580 / 591\n",
+      "done.\n",
+      "\n",
+      "Progressive alignment 2/2... \n",
+      "STEP   445 / 590 \n",
+      "Reallocating..done. *alloclen = 2160\n",
+      "STEP   501 / 590 \n",
+      "done.\n",
+      "\n",
+      "disttbfast (aa) Version 7.526\n",
+      "alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0\n",
+      "0 thread(s)\n",
+      "\n",
+      "\n",
+      "Strategy:\n",
+      " FFT-NS-2 (Fast but rough)\n",
+      " Progressive method (guide trees were built 2 times.)\n",
+      "\n",
+      "If unsure which option to use, try 'mafft --auto input > output'.\n",
+      "For more information, see 'mafft --help', 'mafft --man' and the mafft page.\n",
+      "\n",
+      "The default gap scoring scheme has been changed in version 7.110 (2013 Oct).\n",
+      "It tends to insert more gaps into gap-rich regions than previous versions.\n",
+      "To disable this change, add the --leavegappyregion option.\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CompletedProcess(args='mafft --auto H5_GisaidData.clustered.fasta > H5_GisaidData.clustered.aligned.fasta', returncode=0)"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fasta_path = 'H5_GisaidData.clustered.fasta'\n",
+    "output_path = 'H5_GisaidData.clustered.aligned.fasta'\n",
+    "\n",
+    "command = f\"mafft --auto {fasta_path} > {output_path}\"\n",
+    "subprocess.run(command, shell=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "591"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from Bio import SeqIO\n",
+    "from Bio.Seq import Seq\n",
+    "from Bio.SeqRecord import SeqRecord\n",
+    "\n",
+    "records = []\n",
+    "for record in SeqIO.parse(output_path, \"fasta\"):\n",
+    "    records.append(record)\n",
+    "\n",
+    "len(records)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "nthread = 0\n",
+      "nthreadpair = 0\n",
+      "nthreadtb = 0\n",
+      "ppenalty_ex = 0\n",
+      "stacksize: 8176 kb\n",
+      "rescale = 1\n",
+      "Gap Penalty = -1.53, +0.00, +0.00\n",
+      "\n",
+      "\n",
+      "\n",
+      "Making a distance matrix ..\n",
+      "\n",
+      "There are 4298 ambiguous characters.\n",
+      "  601 / 611\n",
+      "done.\n",
+      "\n",
+      "Constructing a UPGMA tree (efffree=0) ... \n",
+      "  600 / 611\n",
+      "done.\n",
+      "\n",
+      "Progressive alignment 1/2... \n",
+      "STEP   347 / 610 \n",
+      "Reallocating..done. *alloclen = 2157\n",
+      "STEP   601 / 610  h\n",
+      "done.\n",
+      "\n",
+      "Making a distance matrix from msa.. \n",
+      "  600 / 611\n",
+      "done.\n",
+      "\n",
+      "Constructing a UPGMA tree (efffree=1) ... \n",
+      "  600 / 611\n",
+      "done.\n",
+      "\n",
+      "Progressive alignment 2/2... \n",
+      "STEP   456 / 610 \n",
+      "Reallocating..done. *alloclen = 2164\n",
+      "STEP   601 / 610  h\n",
+      "done.\n",
+      "\n",
+      "disttbfast (aa) Version 7.526\n",
+      "alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0\n",
+      "0 thread(s)\n",
+      "\n",
+      "\n",
+      "Strategy:\n",
+      " FFT-NS-2 (Fast but rough)\n",
+      " Progressive method (guide trees were built 2 times.)\n",
+      "\n",
+      "If unsure which option to use, try 'mafft --auto input > output'.\n",
+      "For more information, see 'mafft --help', 'mafft --man' and the mafft page.\n",
+      "\n",
+      "The default gap scoring scheme has been changed in version 7.110 (2013 Oct).\n",
+      "It tends to insert more gaps into gap-rich regions than previous versions.\n",
+      "To disable this change, add the --leavegappyregion option.\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CompletedProcess(args='mafft --auto Data/H5_clusters_rnn.fasta > Data/H5_clusters_rnn.aligned.fasta', returncode=0)"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#Align the clusters with the RNN generated seqs\n",
+    "\n",
+    "fasta_path = 'Data/H5_clusters_rnn.fasta'\n",
+    "output_path = 'Data/H5_clusters_rnn.aligned.fasta'\n",
+    "\n",
+    "command = f\"mafft --auto {fasta_path} > {output_path}\"\n",
+    "subprocess.run(command, shell=True)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}