Skip to content

Commit

Permalink
Class Project
Browse files Browse the repository at this point in the history
For the final class project, I worked on using transformers and LSTM to generate and predict the effect of mutations on Flu H5 antigenic escape.
  • Loading branch information
RicardoRH96 committed Dec 25, 2024
1 parent 32ed5db commit fe3afd7
Show file tree
Hide file tree
Showing 6 changed files with 6,796 additions and 0 deletions.
Binary file modified .DS_Store
Binary file not shown.
Binary file added FluEscaping_RNN+Transformers/FluEscaping.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions FluEscaping_RNN+Transformers/Flu_LSTM.ipynb

Large diffs are not rendered by default.

1,219 changes: 1,219 additions & 0 deletions FluEscaping_RNN+Transformers/Flu_Transformer+Visualization.ipynb

Large diffs are not rendered by default.

238 changes: 238 additions & 0 deletions FluEscaping_RNN+Transformers/ProProcess.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import subprocess\n",
"import sys\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"nthread = 0\n",
"nthreadpair = 0\n",
"nthreadtb = 0\n",
"ppenalty_ex = 0\n",
"stacksize: 8176 kb\n",
"rescale = 1\n",
"Gap Penalty = -1.53, +0.00, +0.00\n",
"\n",
"\n",
"\n",
"Making a distance matrix ..\n",
"\n",
"There are 4298 ambiguous characters.\n",
" 501 / 591\n",
"done.\n",
"\n",
"Constructing a UPGMA tree (efffree=0) ... \n",
" 580 / 591\n",
"done.\n",
"\n",
"Progressive alignment 1/2... \n",
"STEP 386 / 590 \n",
"Reallocating..done. *alloclen = 2154\n",
"STEP 501 / 590 \n",
"done.\n",
"\n",
"Making a distance matrix from msa.. \n",
" 500 / 591\n",
"done.\n",
"\n",
"Constructing a UPGMA tree (efffree=1) ... \n",
" 580 / 591\n",
"done.\n",
"\n",
"Progressive alignment 2/2... \n",
"STEP 445 / 590 \n",
"Reallocating..done. *alloclen = 2160\n",
"STEP 501 / 590 \n",
"done.\n",
"\n",
"disttbfast (aa) Version 7.526\n",
"alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0\n",
"0 thread(s)\n",
"\n",
"\n",
"Strategy:\n",
" FFT-NS-2 (Fast but rough)\n",
" Progressive method (guide trees were built 2 times.)\n",
"\n",
"If unsure which option to use, try 'mafft --auto input > output'.\n",
"For more information, see 'mafft --help', 'mafft --man' and the mafft page.\n",
"\n",
"The default gap scoring scheme has been changed in version 7.110 (2013 Oct).\n",
"It tends to insert more gaps into gap-rich regions than previous versions.\n",
"To disable this change, add the --leavegappyregion option.\n",
"\n"
]
},
{
"data": {
"text/plain": [
"CompletedProcess(args='mafft --auto H5_GisaidData.clustered.fasta > H5_GisaidData.clustered.aligned.fasta', returncode=0)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fasta_path = 'H5_GisaidData.clustered.fasta'\n",
"output_path = 'H5_GisaidData.clustered.aligned.fasta'\n",
"\n",
"command = f\"mafft --auto {fasta_path} > {output_path}\"\n",
"subprocess.run(command, shell=True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"591"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from Bio import SeqIO\n",
"from Bio.Seq import Seq\n",
"from Bio.SeqRecord import SeqRecord\n",
"\n",
"records = []\n",
"for record in SeqIO.parse(output_path, \"fasta\"):\n",
" records.append(record)\n",
"\n",
"len(records)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"nthread = 0\n",
"nthreadpair = 0\n",
"nthreadtb = 0\n",
"ppenalty_ex = 0\n",
"stacksize: 8176 kb\n",
"rescale = 1\n",
"Gap Penalty = -1.53, +0.00, +0.00\n",
"\n",
"\n",
"\n",
"Making a distance matrix ..\n",
"\n",
"There are 4298 ambiguous characters.\n",
" 601 / 611\n",
"done.\n",
"\n",
"Constructing a UPGMA tree (efffree=0) ... \n",
" 600 / 611\n",
"done.\n",
"\n",
"Progressive alignment 1/2... \n",
"STEP 347 / 610 \n",
"Reallocating..done. *alloclen = 2157\n",
"STEP 601 / 610 h\n",
"done.\n",
"\n",
"Making a distance matrix from msa.. \n",
" 600 / 611\n",
"done.\n",
"\n",
"Constructing a UPGMA tree (efffree=1) ... \n",
" 600 / 611\n",
"done.\n",
"\n",
"Progressive alignment 2/2... \n",
"STEP 456 / 610 \n",
"Reallocating..done. *alloclen = 2164\n",
"STEP 601 / 610 h\n",
"done.\n",
"\n",
"disttbfast (aa) Version 7.526\n",
"alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0\n",
"0 thread(s)\n",
"\n",
"\n",
"Strategy:\n",
" FFT-NS-2 (Fast but rough)\n",
" Progressive method (guide trees were built 2 times.)\n",
"\n",
"If unsure which option to use, try 'mafft --auto input > output'.\n",
"For more information, see 'mafft --help', 'mafft --man' and the mafft page.\n",
"\n",
"The default gap scoring scheme has been changed in version 7.110 (2013 Oct).\n",
"It tends to insert more gaps into gap-rich regions than previous versions.\n",
"To disable this change, add the --leavegappyregion option.\n",
"\n"
]
},
{
"data": {
"text/plain": [
"CompletedProcess(args='mafft --auto Data/H5_clusters_rnn.fasta > Data/H5_clusters_rnn.aligned.fasta', returncode=0)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Align the clusters with the RNN generated seqs\n",
"\n",
"fasta_path = 'Data/H5_clusters_rnn.fasta'\n",
"output_path = 'Data/H5_clusters_rnn.aligned.fasta'\n",
"\n",
"command = f\"mafft --auto {fasta_path} > {output_path}\"\n",
"subprocess.run(command, shell=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit fe3afd7

Please sign in to comment.