diff --git a/notebook/attention.svg b/notebook/attention.svg
new file mode 100644
index 0000000..af2c5b5
--- /dev/null
+++ b/notebook/attention.svg
@@ -0,0 +1,21 @@
+<svg version="1.1" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 99.28954922651064 116.30325601136246" width="198.5790984530213" height="232.60651202272493">
+  <!-- svg-source:excalidraw -->
+  
+  <defs>
+    <style class="style-fonts">
+      @font-face {
+        font-family: "Virgil";
+        src: url("https://excalidraw.com/Virgil.woff2");
+      }
+      @font-face {
+        font-family: "Cascadia";
+        src: url("https://excalidraw.com/Cascadia.woff2");
+      }
+      @font-face {
+        font-family: "Assistant";
+        src: url("https://excalidraw.com/Assistant-Regular.woff2");
+      }
+    </style>
+    
+  </defs>
+  <rect x="0" y="0" width="99.28954922651064" height="116.30325601136246" fill="#ffffff"></rect><g stroke-linecap="round"><g transform="translate(76.69672351183908 51.357834054048) rotate(0 1.0595353932822036 2.757208822508801)"><path d="M0 0 C0.64 1.66, 1.27 3.32, 2.12 5.51 M0 0 C0.68 1.76, 1.35 3.52, 2.12 5.51" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(18.77385220638098 51.30146533579682) rotate(0 1.0595353932822036 2.757208822508801)"><path d="M0 0 C0.53 1.39, 1.07 2.78, 2.12 5.51 M0 0 C0.8 2.08, 1.6 4.16, 2.12 5.51" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(20.915389196448814 56.93466612851989) rotate(0 0.9141717410634556 -2.9088855668724136)"><path d="M0 0 C0.49 -1.57, 0.99 -3.14, 1.83 -5.82 M0 0 C0.41 -1.31, 0.82 -2.62, 1.83 -5.82" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round" transform="translate(20.694718858136184 41.91998573778983) rotate(0 28.977108040527128 11.51894327096852)"><path d="M5.76 0 C23.18 0, 40.6 0, 52.19 0 M5.76 0 C22.39 0, 39.02 0, 52.19 0 M52.19 0 C56.03 0, 57.95 1.92, 57.95 5.76 M52.19 0 C56.03 0, 57.95 1.92, 57.95 5.76 M57.95 5.76 C57.95 8.74, 57.95 11.71, 57.95 17.28 M57.95 5.76 C57.95 9.85, 57.95 13.93, 57.95 17.28 M57.95 17.28 C57.95 21.12, 56.03 23.04, 52.19 23.04 M57.95 17.28 C57.95 21.12, 56.03 23.04, 52.19 23.04 M52.19 23.04 C39.67 23.04, 27.14 23.04, 5.76 23.04 M52.19 23.04 C41.21 23.04, 30.23 23.04, 5.76 23.04 M5.76 23.04 C1.92 23.04, 0 21.12, 0 17.28 M5.76 23.04 C1.92 23.04, 0 21.12, 0 17.28 M0 17.28 C0 13.44, 0 9.61, 0 5.76 M0 17.28 C0 14.14, 0 10.99, 0 5.76 M0 5.76 C0 1.92, 1.92 0, 5.76 0 M0 5.76 C0 1.92, 1.92 0, 5.76 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g stroke-linecap="round"><g transform="translate(50.230074675333526 10) rotate(0 0 23.533204096706413)"><path d="M0 0 C0 9.71, 0 19.42, 
0 47.07 M0 0 C0 18.6, 0 37.2, 0 47.07" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round" transform="translate(10 57.7608077914374) rotate(0 39.64477461325532 22.817662184918845)"><path d="M0 0 L79.29 0 L79.29 45.64 L0 45.64" stroke="none" stroke-width="0" fill="#feddac"></path><path d="M0 0 C27.5 0, 55 0, 79.29 0 M0 0 C30.97 0, 61.94 0, 79.29 0 M79.29 0 C79.29 13.77, 79.29 27.54, 79.29 45.64 M79.29 0 C79.29 15.8, 79.29 31.61, 79.29 45.64 M79.29 45.64 C56.54 45.64, 33.8 45.64, 0 45.64 M79.29 45.64 C49.83 45.64, 20.38 45.64, 0 45.64 M0 45.64 C0 34.51, 0 23.38, 0 0 M0 45.64 C0 35.91, 0 26.19, 0 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(15.919194260142831 57.05841990830595) rotate(0 32.90045335948116 23.47621119539008)"><text x="32.90045335948138" y="12.520911779425331" font-family="Helvetica, Segoe UI Emoji" font-size="13.609397794428979px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">Masked</text><text x="32.90045335948138" y="28.171719243018657" font-family="Helvetica, Segoe UI Emoji" font-size="13.609397794428979px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">Multi-Head</text><text x="32.90045335948138" y="43.82252670661198" font-family="Helvetica, Segoe UI Emoji" font-size="13.609397794428979px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">Attention</text></g><g stroke-linecap="round"><g transform="translate(48.25899154467152 51.458960460415256) rotate(0 1.0595353932822036 2.757208822508801)"><path d="M0 0 C0.82 2.14, 1.65 4.29, 2.12 5.51 M0 0 C0.6 1.56, 1.2 3.11, 2.12 5.51" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(50.58565666258801 103.60057175637485) rotate(0 0 1.3513421274938082)"><path d="M0 0 C0 
0.45, 0 2.25, 0 2.7 M0 0 C0 0.45, 0 2.25, 0 2.7" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(48.38072208917674 51.60078649314164) rotate(0 1.0595353932822036 2.757208822508801)"><path d="M0 0 C0.43 1.11, 0.85 2.22, 2.12 5.51 M0 0 C0.56 1.47, 1.13 2.94, 2.12 5.51" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(50.522259079246396 57.23398728586835) rotate(0 0.9141717410634556 -2.9088855668724136)"><path d="M0 0 C0.47 -1.48, 0.93 -2.97, 1.83 -5.82 M0 0 C0.59 -1.89, 1.19 -3.78, 1.83 -5.82" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(76.76818443649881 51.38989696505814) rotate(0 1.0595353932822036 2.757208822508801)"><path d="M0 0 C0.83 2.17, 1.67 4.34, 2.12 5.51 M0 0 C0.45 1.16, 0.89 2.32, 2.12 5.51" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(78.90972142656847 57.02309775778485) rotate(0 0.9141717410634556 -2.9088855668724136)"><path d="M0 0 C0.37 -1.17, 0.73 -2.34, 1.83 -5.82 M0 0 C0.43 -1.36, 0.86 -2.73, 1.83 -5.82" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask></svg>
\ No newline at end of file
diff --git a/notebook/embedding.svg b/notebook/embedding.svg
new file mode 100644
index 0000000..ee0c0f9
--- /dev/null
+++ b/notebook/embedding.svg
@@ -0,0 +1,21 @@
+<svg version="1.1" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 167.96701278992987 86.01269097874592" width="335.93402557985974" height="172.02538195749185">
+  <!-- svg-source:excalidraw -->
+  
+  <defs>
+    <style class="style-fonts">
+      @font-face {
+        font-family: "Virgil";
+        src: url("https://excalidraw.com/Virgil.woff2");
+      }
+      @font-face {
+        font-family: "Cascadia";
+        src: url("https://excalidraw.com/Cascadia.woff2");
+      }
+      @font-face {
+        font-family: "Assistant";
+        src: url("https://excalidraw.com/Assistant-Regular.woff2");
+      }
+    </style>
+    
+  </defs>
+  <rect x="0" y="0" width="167.96701278992987" height="86.01269097874592" fill="#ffffff"></rect><g stroke-linecap="round" transform="translate(10 10) rotate(0 39.64477461325532 15.548572179997336)"><path d="M0 0 L79.29 0 L79.29 31.1 L0 31.1" stroke="none" stroke-width="0" fill="#fad8dc"></path><path d="M0 0 C16.26 0, 32.51 0, 79.29 0 M0 0 C22.44 0, 44.88 0, 79.29 0 M79.29 0 C79.29 7.5, 79.29 15, 79.29 31.1 M79.29 0 C79.29 7.53, 79.29 15.06, 79.29 31.1 M79.29 31.1 C52.63 31.1, 25.96 31.1, 0 31.1 M79.29 31.1 C56.69 31.1, 34.09 31.1, 0 31.1 M0 31.1 C0 22.93, 0 14.76, 0 0 M0 31.1 C0 19.32, 0 7.54, 0 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(15.775234534627998 19.115778758672604) rotate(0 34.42552991896446 7.825403731796541)"><text x="34.42552991896451" y="12.52091177942533" font-family="Helvetica, Segoe UI Emoji" font-size="13.609397794428975px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">Embedding</text></g><g transform="translate(98.95751454064794 44.71107605155885) rotate(0 29.504749124640966 15.650807463593537)"><text x="29.504749124640945" y="12.52091177942533" font-family="Helvetica, Segoe UI Emoji" font-size="13.609397794428977px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">Positional</text><text x="29.504749124640945" y="28.17171924301865" font-family="Helvetica, Segoe UI Emoji" font-size="13.609397794428977px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">Encoding</text></g><g stroke-linecap="round" transform="translate(66.84708733557636 46.50562777726191) rotate(0 13.14980218221649 12.845972805910606)"><path d="M26.3 12.85 C26.3 13.59, 26.23 14.34, 26.1 15.08 C25.97 15.81, 25.77 16.54, 25.51 17.24 C25.25 17.94, 24.92 18.63, 24.54 19.27 C24.16 19.91, 23.71 20.53, 23.22 21.1 C22.73 21.67, 22.19 22.21, 21.6 22.69 C21.02 23.16, 
20.38 23.6, 19.72 23.97 C19.07 24.34, 18.36 24.66, 17.65 24.92 C16.93 25.17, 16.18 25.37, 15.43 25.5 C14.68 25.63, 13.91 25.69, 13.15 25.69 C12.39 25.69, 11.62 25.63, 10.87 25.5 C10.12 25.37, 9.37 25.17, 8.65 24.92 C7.94 24.66, 7.23 24.34, 6.57 23.97 C5.92 23.6, 5.28 23.16, 4.7 22.69 C4.11 22.21, 3.57 21.67, 3.08 21.1 C2.59 20.53, 2.14 19.91, 1.76 19.27 C1.38 18.63, 1.05 17.94, 0.79 17.24 C0.53 16.54, 0.33 15.81, 0.2 15.08 C0.07 14.34, 0 13.59, 0 12.85 C0 12.1, 0.07 11.35, 0.2 10.62 C0.33 9.88, 0.53 9.15, 0.79 8.45 C1.05 7.75, 1.38 7.07, 1.76 6.42 C2.14 5.78, 2.59 5.16, 3.08 4.59 C3.57 4.02, 4.11 3.48, 4.7 3.01 C5.28 2.53, 5.92 2.09, 6.57 1.72 C7.23 1.35, 7.94 1.03, 8.65 0.77 C9.37 0.52, 10.12 0.32, 10.87 0.2 C11.62 0.07, 12.39 0, 13.15 0 C13.91 0, 14.68 0.07, 15.43 0.2 C16.18 0.32, 16.93 0.52, 17.65 0.77 C18.36 1.03, 19.07 1.35, 19.72 1.72 C20.38 2.09, 21.02 2.53, 21.6 3.01 C22.19 3.48, 22.73 4.02, 23.22 4.59 C23.71 5.16, 24.16 5.78, 24.54 6.42 C24.92 7.07, 25.25 7.75, 25.51 8.45 C25.77 9.15, 25.97 9.88, 26.1 10.62 C26.23 11.35, 26.27 12.47, 26.3 12.85 C26.33 13.22, 26.33 12.47, 26.3 12.85" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g stroke-linecap="round"><g transform="translate(66.92867549300627 60.44080568656318) rotate(0 12.545145071907427 0.32787560659244264)"><path d="M0 0 C1.38 1.01, 5.52 6.93, 8.25 6.03 C10.98 5.14, 13.59 -4.27, 16.39 -5.38 C19.2 -6.49, 23.64 -1.42, 25.09 -0.63 M0 0 C1.38 1.01, 5.52 6.93, 8.25 6.03 C10.98 5.14, 13.59 -4.27, 16.39 -5.38 C19.2 -6.49, 23.64 -1.42, 25.09 -0.63" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round" transform="translate(42.535844498418555 53.25987122353945) rotate(0 6.685627830784142 6.25366536698948)"><path d="M13.37 6.25 C13.37 6.62, 13.34 6.98, 13.27 7.34 C13.2 7.7, 13.1 8.05, 12.97 8.39 C12.84 8.73, 12.67 9.07, 12.48 9.38 C12.28 9.69, 12.06 10, 11.81 10.27 C11.56 10.55, 11.28 10.81, 10.98 11.04 C10.69 11.28, 10.36 11.49, 10.03 11.67 C9.69 11.85, 
9.34 12.01, 8.97 12.13 C8.61 12.25, 8.23 12.35, 7.85 12.41 C7.47 12.48, 7.07 12.51, 6.69 12.51 C6.3 12.51, 5.91 12.48, 5.52 12.41 C5.14 12.35, 4.76 12.25, 4.4 12.13 C4.04 12.01, 3.68 11.85, 3.34 11.67 C3.01 11.49, 2.68 11.28, 2.39 11.04 C2.09 10.81, 1.81 10.55, 1.56 10.27 C1.32 10, 1.09 9.69, 0.9 9.38 C0.7 9.07, 0.54 8.73, 0.4 8.39 C0.27 8.05, 0.17 7.7, 0.1 7.34 C0.03 6.98, 0 6.62, 0 6.25 C0 5.89, 0.03 5.52, 0.1 5.17 C0.17 4.81, 0.27 4.45, 0.4 4.11 C0.54 3.77, 0.7 3.44, 0.9 3.13 C1.09 2.81, 1.32 2.51, 1.56 2.23 C1.81 1.96, 2.09 1.7, 2.39 1.46 C2.68 1.23, 3.01 1.02, 3.34 0.84 C3.68 0.66, 4.04 0.5, 4.4 0.38 C4.76 0.25, 5.14 0.16, 5.52 0.1 C5.91 0.03, 6.3 0, 6.69 0 C7.07 0, 7.47 0.03, 7.85 0.1 C8.23 0.16, 8.61 0.25, 8.97 0.38 C9.34 0.5, 9.69 0.66, 10.03 0.84 C10.36 1.02, 10.69 1.23, 10.98 1.46 C11.28 1.7, 11.56 1.96, 11.81 2.23 C12.06 2.51, 12.28 2.81, 12.48 3.13 C12.67 3.44, 12.84 3.77, 12.97 4.11 C13.1 4.45, 13.2 4.81, 13.27 5.17 C13.34 5.52, 13.35 6.07, 13.37 6.25 C13.39 6.43, 13.39 6.07, 13.37 6.25" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g stroke-linecap="round"><g transform="translate(45.422951477000424 59.0073581943725) rotate(0 3.7731446528471224 -0.06111556836640375)"><path d="M0 0 C1.26 -0.02, 6.29 -0.1, 7.55 -0.12 M0 0 C1.26 -0.02, 6.29 -0.1, 7.55 -0.12" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(49.32488772802208 63.103025876051106) rotate(0 0 -3.6064118886624783)"><path d="M0 0 C0 -1.2, 0 -6.01, 0 -7.21 M0 0 C0 -1.2, 0 -6.01, 0 -7.21" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(49.02655520096687 41.97069837165691) rotate(0 0.08840989987015746 5.245851843973014)"><path d="M0 0 C0.03 1.75, 0.15 8.74, 0.18 10.49 M0 0 C0.03 1.75, 0.15 8.74, 0.18 10.49" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(49.02655520096687 41.97069837165691) rotate(0 
0.08840989987015746 5.245851843973014)"><path d="M-1.7 5.59 C-1.11 7.12, -0.53 8.65, 0.18 10.49 M-1.7 5.59 C-1.02 7.37, -0.34 9.14, 0.18 10.49" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(49.02655520096687 41.97069837165691) rotate(0 0.08840989987015746 5.245851843973014)"><path d="M1.89 5.53 C1.35 7.08, 0.82 8.63, 0.18 10.49 M1.89 5.53 C1.27 7.33, 0.65 9.13, 0.18 10.49" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask></svg>
\ No newline at end of file
diff --git a/notebook/interactive-transformer.ipynb b/notebook/interactive-transformer.ipynb
index 1a6bedf..bfcccdf 100644
--- a/notebook/interactive-transformer.ipynb
+++ b/notebook/interactive-transformer.ipynb
@@ -11,28 +11,117 @@
   {
    "cell_type": "markdown",
    "id": "23bb2ecc-2822-4143-bfb3-a93de5d5b435",
+   "metadata": {},
+   "source": [
+    "# Introduction\n",
+    "\n",
+    "Welcome. This is an implementation of a transformer in pure Go with no third-party libraries. This way, everything from tensor operations to tokenization is done inside this notebook.\n",
+    "\n",
+    "Because the goal of this project is illustrative, there are no optimisations. Everything runs on a single goroutine and doesn't have any parallelism. This makes it easy to follow and good as a reference guide. \n",
+    "\n",
+    "This page is also heavily referenced. The goal is to have everything have a reference to its original paper or reference implementation.\n",
+    "\n",
+    "For more content like this, or if you like croissants, go to [joshcarp.com](https://joshcarp.com)\n",
+    "\n",
+    "Attribution: This notebook is a version of my repo [github.com/joshcarp/llm.go](https://github.com/joshcarp/llm.go) which is itself a fork of [github.com/karpathy/llm.c](https://github.com/karpathy/llm.c).\n",
+    "\n",
+    "Karpathy's effort to democratize AI through education makes him one of the most important people in this field."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f05e8492-82e4-41a6-be04-d8c0265e7361",
+   "metadata": {},
+   "source": [
+    "# Table of contents\n",
+    "- [Math and Tensors](#math-and-tensors)\n",
+    "- [Architecture](#architecture)\n",
+    "    - [Inference](#inference)\n",
+    "    - [Training](#training)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eaa9b73f-cdcf-457f-b02f-24f6b17c6bed",
+   "metadata": {},
+   "source": [
+    "# Architecture\n",
+    "<img src=\"decoder-only.svg\" alt=\"decoder-architecture\" width=\"300\" height=\"200\">"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "70117143-70e8-47bf-b3c5-71e70c6a2eb9",
    "metadata": {
     "jp-MarkdownHeadingCollapsed": true
    },
    "source": [
-    "# Introduction"
+    "## Math\n",
+    "- [Math](#math) - All the mathematical operations used within the transformer.\n",
+    "\n",
+    "## Tensors\n",
+    "- [Tensors](#tensors) - What are tensors, how they work, tensor operations, some of the optimisations that different libraries use.\n",
+    "\n",
+    "## Parameters vs Activations\n",
+    "\n",
+    "### Parameters\n",
+    "\n",
+    "- [Parameters](#parameters) - The bulk of what makes up \"the model\". Most of the bytes you download come from this part.\n",
+    "\n",
+    "### Activations\n",
+    "\n",
+    "- [Activations](#activations) - Output of mathematical operations between the input and the parameters. \n",
+    "\n",
+    "## Forward pass\n",
+    "A forward pass is the \"inference\" stage - this section is what's occurring when you talk with ChatGPT. One single forward pass of a transformer produces a single token as an output, which is then fed back into the input (known as being \"Autoregressive\").\n",
+    "\n",
+    "### Preparing\n",
+    "This section transforms text into a vector representation that can be processed by a neural network.\n",
+    "- [Tokenizer](#tokenizer) - Converts text to numeric ids that can be processed.\n",
+    "- [Data Loading](#data-loading) - This section describes how data is loaded, including batching, tokenization, and offsetting.\n",
+    "- [Embedding](#embedding) - Converts these ids into an n-dimensional vector space\n",
+    "\n",
+    "### N-Layers \n",
+    "This section is repeated for every layer. GPT-2 has 12 layers.\n",
+    "- [Masked Multi-Head Attention](#masked-multi-head-attention) - Allows all tokens in the context window to impact other tokens in the context window\n",
+    "- [Add and Norm](#add-norm) - Adds residual stream and normalises outputs\n",
+    "- [Feed Forward](#feed-forward) - Feed forward is a standard MLP. Allows for more complex connections to be formed than just the attention mechanism alone.\n",
+    "\n",
+    "### Final transformations\n",
+    "This section takes the higher-dimensionality representations of our activations and processes them to give us our final output\n",
+    "- [Linear](#linear) - Transformation that reduces dimensionality into \"logits\" which are correlated to how likely each token is (-inf==never, +inf=100% certainty)\n",
+    "- [Softmax](#softmax) - This takes the logits and creates a probability distribution that adds up to 100%\n",
+    "- [Sampling](#sampling) - This samples the probability distribution and returns the single token that's needed to make the next prediction\n",
+    "\n",
+    "### A complete forward pass\n",
+    "This section puts all of this together.\n",
+    "- [Forward](#forward)\n",
+    "\n",
+    "## Backwards pass\n",
+    "This is \"training\". Companies spend billions of dollars optimizing to make this as fast as possible.\n",
+    "\n",
+    "\n"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "9f1a32da-97bf-4ea4-9c9c-0709a500f382",
+   "cell_type": "markdown",
+   "id": "25fbc61b-a752-46be-beec-e72bdeac427e",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "// Setup\n",
-    "const GPT2_EOT int32 = 50256\n",
-    "const delta = 1e-5"
+    "# <a name=\"math\"></a> Math\n",
+    "\n",
+    "Because we're GPU poor, and because it makes the implementation easier, we use float32 for all parameters and calculations. \n",
+    "\n",
+    "CPUs can do calculations in either 32 or 64 bits, but the Go standard library is opinionated and only supports 64-bit math operations. This wraps all the math functions we need. Whilst all modern architectures have instructions for both float32 and float64 operations, float32 is still faster because it uses 1/2 the registers, so the throughput can be 2x that of float64 (citation needed). This is an obvious optimisation for this implementation.\n",
+    "\n",
+    "\n",
+    "Because graphics applications don't need to be precise, GPUs often use IEEE 754 half precision, which is 16 bits; the training loss from switching from 32 -> 16 bits is negligible. (citation needed)\n",
+    "\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 69,
    "id": "150162b5-1556-4663-9a66-9d4d8a28231a",
    "metadata": {},
    "outputs": [],
@@ -79,17 +168,83 @@
     "}\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "3519a977-45d2-4bec-af4e-10813ad205c0",
+   "metadata": {},
+   "source": [
+    "# Matrix Multiplication\n",
+    "\n",
+    "matmulForward performs matrix multiplication and adds bias.\n",
+    "Parameters:\n",
+    "  - out: output matrix\n",
+    "  - inp: input matrix\n",
+    "  - weight: weight matrix\n",
+    "  - bias: bias vector\n",
+    "  - B: batch size\n",
+    "  - T: sequence length (number of time steps)\n",
+    "  - C: input dimension (number of features)\n",
+    "  - OC: number of output channels\n",
+    "\n",
+    "Most of the time spent in inference is in this function. Because we're only doing this on a CPU this implementation is very, very slow, and this is where different implementations would use a GPU/CUDA/Metal implementation to do parallel computation.\n",
+    "\n",
+    "On CPU, many architectures have an optimisation called Basic Linear Algebra Subprograms ([BLAS](https://netlib.org/blas/)). This allows for tiling (breaking matrices into smaller pieces and processing) or Single Instruction Multiple Data (SIMD).\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "id": "a6a7f1f2-4e30-4f65-a7d5-522157036b60",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "func matmulForward(out, inp, weight, bias []float32, B, T, C, OC int) {\n",
+    "\t// Iterate over each batch\n",
+    "\tvar wg sync.WaitGroup\n",
+    "\tfor b := 0; b < B; b++ {\n",
+    "\t\t// Iterate over each time step in the sequence\n",
+    "\t\tfor t := 0; t < T; t++ {\n",
+    "\t\t\twg.Add(1)\n",
+    "\t\t\tgo func(b, t int) {\n",
+    "\t\t\t\tdefer wg.Done()\n",
+    "\t\t\t\t// Calculate the index in the output slice\n",
+    "\t\t\t\tinp_bt := inp[b*T*C+t*C:]\n",
+    "\t\t\t\tout_bt := out[b*T*OC+t*OC:]\n",
+    "\t\t\t\tfor o := 0; o < OC; o++ {\n",
+    "\t\t\t\t\tvar val float32\n",
+    "\t\t\t\t\tif bias != nil {\n",
+    "\t\t\t\t\t\tval = bias[o]\n",
+    "\t\t\t\t\t}\n",
+    "\t\t\t\t\t// Calculate the index in the weight slice\n",
+    "\t\t\t\t\twrow := weight[o*C:]\n",
+    "\t\t\t\t\t// Perform the dot product between the input and weight row\n",
+    "\t\t\t\t\tfor i := 0; i < C; i++ {\n",
+    "\t\t\t\t\t\tval += inp_bt[i] * wrow[i]\n",
+    "\t\t\t\t\t}\n",
+    "\t\t\t\t\t// Store the output value in the output slice\n",
+    "\t\t\t\t\tout_bt[o] = val\n",
+    "\t\t\t\t}\n",
+    "\t\t\t}(b, t)\n",
+    "\t\t}\n",
+    "\t}\n",
+    "\twg.Wait()\n",
+    "}\n"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "af8bb417-4277-44c2-b58d-b0c782d0af57",
    "metadata": {},
    "source": [
-    "# Data loading"
+    "# Data loading\n",
+    "\n",
+    "TODO: Rewrite this to not need file system."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 74,
    "id": "a3f465c6-a248-443b-a1d8-f1160e35446d",
    "metadata": {},
    "outputs": [],
@@ -179,7 +334,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 85,
    "id": "7ad42eaa-817f-420d-a215-da5546f12879",
    "metadata": {},
    "outputs": [
@@ -188,15 +343,7 @@
      "output_type": "stream",
      "text": [
       "=== RUN   TestDataLoader_NextBatch\n",
-      "=== RUN   TestDataLoader_NextBatch/1char\n",
-      "=== RUN   TestDataLoader_NextBatch/endOfFile\n",
-      "=== RUN   TestDataLoader_NextBatch/seqLen4\n",
-      "=== RUN   TestDataLoader_NextBatch/seqLen!=batchSize\n",
       "--- PASS: TestDataLoader_NextBatch (0.00s)\n",
-      "    --- PASS: TestDataLoader_NextBatch/1char (0.00s)\n",
-      "    --- PASS: TestDataLoader_NextBatch/endOfFile (0.00s)\n",
-      "    --- PASS: TestDataLoader_NextBatch/seqLen4 (0.00s)\n",
-      "    --- PASS: TestDataLoader_NextBatch/seqLen!=batchSize (0.00s)\n",
       "PASS\n"
      ]
     }
@@ -204,152 +351,36 @@
    "source": [
     "%test\n",
     "func TestDataLoader_NextBatch(t *testing.T) {\n",
-    "\tzeroTo100 := []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99}\n",
-    "\ttype want struct {\n",
-    "\t\treset           bool\n",
-    "\t\tinput           []int32\n",
-    "\t\ttarget          []int32\n",
-    "\t\tcurrentPosition int64\n",
-    "\t}\n",
-    "\ttests := []struct {\n",
-    "\t\tname              string\n",
-    "\t\tcontents          []int32\n",
-    "\t\tfilename          string\n",
-    "\t\tbatchSize, seqLen int\n",
-    "\t\twant              []want\n",
-    "\t\twantNumBatches    int\n",
-    "\t}{\n",
-    "\t\t{\n",
-    "\t\t\tname:           \"1char\",\n",
-    "\t\t\tcontents:       []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9},\n",
-    "\t\t\tbatchSize:      1,\n",
-    "\t\t\tseqLen:         1,\n",
-    "\t\t\twantNumBatches: 10,\n",
-    "\t\t\twant: []want{\n",
-    "\t\t\t\t{\n",
-    "\t\t\t\t\tinput:           []int32{0},\n",
-    "\t\t\t\t\ttarget:          []int32{1},\n",
-    "\t\t\t\t\tcurrentPosition: 1,\n",
-    "\t\t\t\t},\n",
-    "\t\t\t\t{\n",
-    "\t\t\t\t\tinput:           []int32{1},\n",
-    "\t\t\t\t\ttarget:          []int32{2},\n",
-    "\t\t\t\t\tcurrentPosition: 2,\n",
-    "\t\t\t\t},\n",
-    "\t\t\t\t{\n",
-    "\t\t\t\t\treset:           true,\n",
-    "\t\t\t\t\tinput:           []int32{0},\n",
-    "\t\t\t\t\ttarget:          []int32{1},\n",
-    "\t\t\t\t\tcurrentPosition: 1,\n",
-    "\t\t\t\t},\n",
-    "\t\t\t},\n",
-    "\t\t},\n",
-    "\t\t{\n",
-    "\t\t\tname:           \"endOfFile\",\n",
-    "\t\t\tcontents:       []int32{0, 1, 2},\n",
-    "\t\t\tbatchSize:      1,\n",
-    "\t\t\tseqLen:         1,\n",
-    "\t\t\twantNumBatches: 3,\n",
-    "\t\t\twant: []want{\n",
-    "\t\t\t\t{\n",
-    "\t\t\t\t\tinput:           []int32{0},\n",
-    "\t\t\t\t\ttarget:          []int32{1},\n",
-    "\t\t\t\t\tcurrentPosition: 1,\n",
-    "\t\t\t\t},\n",
-    "\t\t\t\t{\n",
-    "\t\t\t\t\tinput:           []int32{1},\n",
-    "\t\t\t\t\ttarget:          []int32{2},\n",
-    "\t\t\t\t\tcurrentPosition: 2,\n",
-    "\t\t\t\t},\n",
-    "\t\t\t\t{ // should loop back\n",
-    "\t\t\t\t\tinput:           []int32{0},\n",
-    "\t\t\t\t\ttarget:          []int32{1},\n",
-    "\t\t\t\t\tcurrentPosition: 1,\n",
-    "\t\t\t\t},\n",
-    "\t\t\t\t{\n",
-    "\t\t\t\t\treset:           true,\n",
-    "\t\t\t\t\tinput:           []int32{0},\n",
-    "\t\t\t\t\ttarget:          []int32{1},\n",
-    "\t\t\t\t\tcurrentPosition: 1,\n",
-    "\t\t\t\t},\n",
-    "\t\t\t},\n",
-    "\t\t},\n",
-    "\t\t{\n",
-    "\t\t\tname:           \"seqLen4\",\n",
-    "\t\t\tcontents:       []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9},\n",
-    "\t\t\tbatchSize:      1,\n",
-    "\t\t\tseqLen:         4,\n",
-    "\t\t\twantNumBatches: 2,\n",
-    "\t\t\twant: []want{\n",
-    "\t\t\t\t{\n",
-    "\t\t\t\t\tinput:           []int32{0, 1, 2, 3},\n",
-    "\t\t\t\t\ttarget:          []int32{1, 2, 3, 4},\n",
-    "\t\t\t\t\tcurrentPosition: 4,\n",
-    "\t\t\t\t},\n",
-    "\t\t\t\t{\n",
-    "\t\t\t\t\tinput:           []int32{4, 5, 6, 7},\n",
-    "\t\t\t\t\ttarget:          []int32{5, 6, 7, 8},\n",
-    "\t\t\t\t\tcurrentPosition: 8,\n",
-    "\t\t\t\t},\n",
-    "\t\t\t},\n",
-    "\t\t},\n",
-    "\t\t{\n",
-    "\t\t\tname:           \"seqLen!=batchSize\",\n",
-    "\t\t\tcontents:       zeroTo100,\n",
-    "\t\t\tbatchSize:      2,\n",
-    "\t\t\tseqLen:         4,\n",
-    "\t\t\twantNumBatches: 12,\n",
-    "\t\t\twant: []want{\n",
-    "\t\t\t\t{\n",
-    "\t\t\t\t\tinput:           []int32{0, 1, 2, 3, 4, 5, 6, 7},\n",
-    "\t\t\t\t\ttarget:          []int32{1, 2, 3, 4, 5, 6, 7, 8},\n",
-    "\t\t\t\t\tcurrentPosition: 8,\n",
-    "\t\t\t\t},\n",
-    "\t\t\t\t{\n",
-    "\t\t\t\t\tinput:           []int32{8, 9, 10, 11, 12, 13, 14, 15},\n",
-    "\t\t\t\t\ttarget:          []int32{9, 10, 11, 12, 13, 14, 15, 16},\n",
-    "\t\t\t\t\tcurrentPosition: 16,\n",
-    "\t\t\t\t},\n",
-    "\t\t\t\t{\n",
-    "\t\t\t\t\treset:           true,\n",
-    "\t\t\t\t\tinput:           []int32{0, 1, 2, 3, 4, 5, 6, 7},\n",
-    "\t\t\t\t\ttarget:          []int32{1, 2, 3, 4, 5, 6, 7, 8},\n",
-    "\t\t\t\t\tcurrentPosition: 8,\n",
-    "\t\t\t\t},\n",
-    "\t\t\t},\n",
-    "\t\t},\n",
-    "\t}\n",
-    "\tnewInt32Reader := func(data []int32) (io.Reader, int) {\n",
+    "    newInt32Reader := func (data []int32) (io.Reader, int) {\n",
     "\t\tvar b bytes.Buffer\n",
     "\t\trequire.NoError(t, binary.Write(&b, binary.LittleEndian, data))\n",
     "\t\treturn &b, b.Len()\n",
-    "\t}\n",
-    "\tfor _, tt := range tests {\n",
-    "\t\tt.Run(tt.name, func(t *testing.T) {\n",
-    "\t\t\treader, _ := newInt32Reader(tt.contents)\n",
-    "\t\t\tif tt.filename != \"\" {\n",
-    "\t\t\t\t_, err := os.Stat(tt.filename)\n",
-    "\t\t\t\tassert.NoError(t, err)\n",
-    "\t\t\t\tfile, err := os.Open(tt.filename)\n",
-    "\t\t\t\tassert.NoError(t, err)\n",
-    "\t\t\t\tdefer file.Close()\n",
-    "\t\t\t\treader = file\n",
-    "\t\t\t}\n",
-    "\t\t\tloader, err := newDataLoader(reader, tt.batchSize, tt.seqLen)\n",
-    "\t\t\tassert.NoError(t, err)\n",
-    "\t\t\tassert.Equal(t, tt.wantNumBatches, loader.NumBatches)\n",
-    "\t\t\tfor _, want := range tt.want {\n",
-    "\t\t\t\tif want.reset {\n",
-    "\t\t\t\t\tloader.Reset()\n",
-    "\t\t\t\t}\n",
-    "\t\t\t\tinput, target, err := loader.NextBatch()\n",
-    "\t\t\t\tassert.NoError(t, err)\n",
-    "\t\t\t\tassert.Equal(t, want.input, input)\n",
-    "\t\t\t\tassert.Equal(t, want.target, target)\n",
-    "\t\t\t\tassert.Equal(t, want.currentPosition, loader.currentPosition)\n",
-    "\t\t\t}\n",
-    "\t\t})\n",
-    "\t}\n",
+    "    }\n",
+    "\tzeroTo100 := []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99}\n",
+    "\treader, _ := newInt32Reader(zeroTo100)\n",
+    "\n",
+    "\tloader, err := newDataLoader(reader, 1, 1)\n",
+    "\tassert.NoError(t, err)\n",
+    "\n",
+    "\t// next batch\n",
+    "\tinput, target, err := loader.NextBatch()\n",
+    "\tassert.NoError(t, err)\n",
+    "\tassert.Equal(t, []int32{0}, input)\n",
+    "\tassert.Equal(t, []int32{1}, target)\n",
+    "\tassert.Equal(t, int64(1), loader.currentPosition)\n",
+    "\n",
+    "\tinput, target, err = loader.NextBatch()\n",
+    "\tassert.NoError(t, err)\n",
+    "\tassert.Equal(t, []int32{1}, input)\n",
+    "\tassert.Equal(t, []int32{2}, target)\n",
+    "\tassert.Equal(t, int64(2), loader.currentPosition)\n",
+    "\n",
+    "\tloader.Reset()\n",
+    "\tinput, target, err = loader.NextBatch()\n",
+    "\tassert.NoError(t, err)\n",
+    "\tassert.Equal(t, []int32{0}, input)\n",
+    "\tassert.Equal(t, []int32{1}, target)\n",
+    "\tassert.Equal(t, int64(1), loader.currentPosition)\n",
     "}\n"
    ]
   },
@@ -363,12 +394,18 @@
     "What is a tensor?\n",
     "A tensor is a multi-dimensional array. A regular slice is one-dimensional, holding elements in a sequence. A tensor can have multiple dimensions, like a 2D array (grid) or even a 3D array (cube).\n",
     "\n",
-    "[Computerphile video](https://www.youtube.com/watch?v=DfK83xEtJ_k)"
+    "[Computerphile video](https://www.youtube.com/watch?v=DfK83xEtJ_k)\n",
+    "\n",
+    "\n",
+    "Tensor libraries like PyTorch and TensorFlow exist in Python. The most widely used tensor library for local inference is [ggml](https://ggml.ai/), which powers [llama.cpp](https://github.com/ggerganov/llama.cpp).\n",
+    "\n",
+    "\n",
+    "Tensor libraries can store elements in row-major or column-major order.\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 63,
+   "execution_count": 86,
    "id": "307f09fa-b67f-4e10-a8b0-ebdc401ad45e",
    "metadata": {},
    "outputs": [],
@@ -441,8 +478,85 @@
     "\t\tsubTensorSize *= dim\n",
     "\t}\n",
     "\treturn subTensorSize\n",
-    "}\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "id": "f79b8806-0380-4925-a6b6-54415a449dfa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "=== RUN   TestTokenizer\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "000000000000000000000000000000\n",
+      "0000000000000000, 00000000, 000000, "
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[25645 8269 10535]\n",
+      "--- PASS: TestTokenizer (0.08s)\n",
+      "PASS\n"
+     ]
+    }
+   ],
+   "source": [
+    "%test\n",
+    "func TestTokenizer(t *testing.T) {\n",
+    "\ttext := \"000000000000000000000000000000\"\n",
+    "\tprintln(text)\n",
+    "\ttokenizer, err := NewTokenizer(\"./gpt2_tokenizer.bin\")\n",
+    "\tassert.NoError(t, err)\n",
+    "\tencoded, err := tokenizer.Encode(text)\n",
+    "\tfmt.Println(encoded)\n",
+    "\tfor _, tok := range encoded {\n",
+    "\t\tdecoded, err := tokenizer.Decode([]int32{tok})\n",
+    "\t\tassert.NoError(t, err)\n",
+    "\t\tprint(decoded)\n",
+    "\t\tprint(\", \")\n",
+    "\t}\n",
+    "\tassert.NoError(t, err)\n",
+    "\tdecoded, err := tokenizer.Decode(encoded)\n",
+    "\tassert.NoError(t, err)\n",
+    "\tassert.Equal(t, text, decoded)\n",
+    "\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "84a9c9f8-0b0f-41dd-a5e8-030bbcdabfbf",
+   "metadata": {},
+   "source": [
+    "# Parameters\n",
+    "A Parameter is a numerical value that determines the strength of the connection between neurons. These connections are similar to synapses in the human brain, and the parameters are like the knobs that adjust the strength of those connections.\n",
     "\n",
+    "There are two main types of parameters in neural networks:\n",
+    "\n",
+    "- Weights: These are associated with each connection between neurons. They multiply the signal coming from one neuron before it's passed on to the next neuron. A higher weight means a stronger connection and a greater influence on the receiving neuron.\n",
+    "\n",
+    "- Biases: These are added to the sum of the weighted inputs at each neuron. They act like a baseline shift, allowing the neuron to activate even if the weighted inputs are weak."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 102,
+   "id": "f23c98a5-6a63-4b79-a6a2-8204308bc95b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "// ParameterTensors are the parameters of the model\n",
     "type ParameterTensors struct {\n",
     "\tMemory        []float32\n",
@@ -531,7 +645,25 @@
     "\tif len(memPtr) != 0 {\n",
     "\t\tpanic(\"something went real bad here\")\n",
     "\t}\n",
-    "}\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4d2b527e",
+   "metadata": {},
+   "source": [
+    "# Activations\n",
+    "An activation is the output produced by applying a mathematical operation to an input. If a weight determines the strength of the function, the activation is its output.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 103,
+   "id": "de695050",
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "\n",
     "// ActivationTensors\n",
     "type ActivationTensors struct {\n",
@@ -637,36 +769,27 @@
     "\tif len(memPtr) != 0 {\n",
     "\t\tpanic(\"something went real bad here\")\n",
     "\t}\n",
-    "}\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4d0ed719-c21a-4046-8c63-464818414fc6",
-   "metadata": {},
-   "source": [
-    "# Table of contents\n",
-    "- Tokenization\n",
-    "- Architecture\n",
-    "- Training\n",
-    "  - Forward pass\n",
-    "  - Backward pass"
+    "}"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "d86a9c14-c28d-4dc4-b69e-7d316ecf9b7c",
-   "metadata": {
-    "jp-MarkdownHeadingCollapsed": true
-   },
+   "metadata": {},
    "source": [
-    "# Tokenization"
+    "# Tokenization\n",
+    "\n",
+    "Tokenization is the fundamental process of transforming text data into a format the model can understand. It involves breaking down sentences into smaller units called tokens.\n",
+    "\n",
+    "Byte Pair Encoding (BPE) is the tokenization method commonly used in transformers.\n",
+    "\n",
+    "![tokenization](tokenization.svg)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "id": "f6e52b05-a25c-403f-a870-0bb7363da62e",
+   "execution_count": 104,
+   "id": "971a8763",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -772,102 +895,17 @@
     "}\n"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "f79b8806-0380-4925-a6b6-54415a449dfa",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "=== RUN   TestTokenizer\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "000000000000000000000000000000\n",
-      "0000000000000000, 00000000, 000000, "
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[25645 8269 10535]\n",
-      "--- PASS: TestTokenizer (0.08s)\n",
-      "PASS\n"
-     ]
-    }
-   ],
-   "source": [
-    "%test\n",
-    "func TestTokenizer(t *testing.T) {\n",
-    "\ttext := \"000000000000000000000000000000\"\n",
-    "\tprintln(text)\n",
-    "\ttokenizer, err := NewTokenizer(\"./gpt2_tokenizer.bin\")\n",
-    "\tassert.NoError(t, err)\n",
-    "\tencoded, err := tokenizer.Encode(text)\n",
-    "\tfmt.Println(encoded)\n",
-    "\tfor _, tok := range encoded {\n",
-    "\t\tdecoded, err := tokenizer.Decode([]int32{tok})\n",
-    "\t\tassert.NoError(t, err)\n",
-    "\t\tprint(decoded)\n",
-    "\t\tprint(\", \")\n",
-    "\t}\n",
-    "\tassert.NoError(t, err)\n",
-    "\tdecoded, err := tokenizer.Decode(encoded)\n",
-    "\tassert.NoError(t, err)\n",
-    "\tassert.Equal(t, text, decoded)\n",
-    "\n",
-    "}"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "168a356a-a504-4bbd-95d9-5df1daef5e8b",
-   "metadata": {},
-   "source": [
-    "# Architecture"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "eaa9b73f-cdcf-457f-b02f-24f6b17c6bed",
-   "metadata": {},
-   "source": [
-    "<img src=\"decoder-only.svg\" alt=\"decoder-architecture\" width=\"300\" height=\"200\">"
-   ]
-  },
   {
    "cell_type": "markdown",
-   "id": "70117143-70e8-47bf-b3c5-71e70c6a2eb9",
+   "id": "c7e78b24-85a4-468c-bfe8-0a8652d27202",
    "metadata": {},
    "source": [
-    "- [Tokenizer](#tokenizer)\n",
-    "- [Embedding](#embedding)\n",
-    "- [Masked Multi-Head Attention](#masked-multi-head-attention)\n",
-    "- [Add and Norm](#add-norm)\n",
-    "- [Feed Forward](#feed-forward)\n",
-    "- [Linear](#linear)\n",
-    "- [Softmax](#softmax)\n",
-    "- [Sampling](#sampling)"
+    "# GPT Struct"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "id": "1d9fa878-667c-49d7-b8b6-b7589d53b4bf",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 105,
    "id": "973f678c-3961-44dd-b70f-2eafbbfdc51e",
    "metadata": {},
    "outputs": [],
@@ -884,7 +922,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 53,
+   "execution_count": 106,
    "id": "0c705437-6396-4a96-8004-9894541187c0",
    "metadata": {},
    "outputs": [],
@@ -913,7 +951,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 54,
+   "execution_count": 107,
    "id": "a6135550-0a1a-4d92-94ae-d807fffd9f3f",
    "metadata": {},
    "outputs": [],
@@ -974,7 +1012,7 @@
    "id": "def9dfc8-906a-4785-96f6-b41fd97dc1cd",
    "metadata": {},
    "source": [
-    "# Encoder forward"
+    "## Embedding"
    ]
   },
   {
@@ -983,12 +1021,14 @@
    "metadata": {},
    "source": [
     "encoderForward iterates through the batch/sequence and combines the word token embeddings\n",
-    "with the word position embeddings. This allows out vector to encode tokens and positions in one."
+    "with the word position embeddings. This allows the output vector to encode both tokens and positions in a single vector.\n",
+    "\n",
+    "![embeddings](embedding.svg)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 55,
+   "execution_count": 108,
    "id": "098e1e8c-fd88-469c-aa63-30c7b31377f5",
    "metadata": {},
    "outputs": [],
@@ -1021,7 +1061,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": 109,
    "id": "ce9d4fc8-f565-4e16-a5b7-f4118d6f92b4",
    "metadata": {},
    "outputs": [
@@ -1029,51 +1069,25 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "=== RUN   TestEncoderForward\n",
-      "=== RUN   TestEncoderForward/#00\n",
-      "--- PASS: TestEncoderForward (0.00s)\n",
-      "    --- PASS: TestEncoderForward/#00 (0.00s)\n",
+      "=== RUN   TestEncoderForwardExplicit\n",
+      "--- PASS: TestEncoderForwardExplicit (0.00s)\n",
       "PASS\n"
      ]
     }
    ],
    "source": [
     "%test\n",
-    "func TestEncoderForward(t *testing.T) {\n",
-    "\ttype args struct {\n",
-    "\t\tout []float32\n",
-    "\t\tinp []int32\n",
-    "\t\twte []float32\n",
-    "\t\twpe []float32\n",
-    "\t\tB   int\n",
-    "\t\tT   int\n",
-    "\t\tC   int\n",
-    "\t}\n",
-    "\ttests := []struct {\n",
-    "\t\tname    string\n",
-    "\t\targs    args\n",
-    "\t\twantOut []float32\n",
-    "\t}{\n",
-    "\t\t{\n",
-    "\t\t\tname: \"\",\n",
-    "\t\t\targs: args{\n",
-    "\t\t\t\tinp: []int32{1, 0}, // [1 -> wte (2, 3), wpe(4, 5)] [0 -> wte (0, 1), wpe(6, 7)]\n",
-    "\t\t\t\twte: []float32{0, 1, 2, 3},\n",
-    "\t\t\t\twpe: []float32{4, 5, 6, 7},\n",
-    "\t\t\t\tB:   1, // Batch size\n",
-    "\t\t\t\tT:   1, // Sequence Len\n",
-    "\t\t\t\tC:   2, // Dimensions\n",
-    "\t\t\t},\n",
-    "\t\t\twantOut: []float32{6, 8},\n",
-    "\t\t},\n",
-    "\t}\n",
-    "\tfor _, tt := range tests {\n",
-    "\t\tt.Run(tt.name, func(t *testing.T) {\n",
-    "\t\t\tout := make([]float32, len(tt.args.inp))\n",
-    "\t\t\tencoderForward(out, tt.args.inp, tt.args.wte, tt.args.wpe, tt.args.B, tt.args.T, tt.args.C)\n",
-    "\t\t\tassert.Equal(t, tt.wantOut, out)\n",
-    "\t\t})\n",
-    "\t}\n",
+    "func TestEncoderForwardExplicit(t *testing.T) {\n",
+    "    inp := []int32{1, 0} // [1 -> wte (2, 3), wpe(4, 5)] [0 -> wte (0, 1), wpe(6, 7)]\n",
+    "    wte := []float32{0, 1, 2, 3}\n",
+    "    wpe := []float32{4, 5, 6, 7}\n",
+    "    B := 1 // Batch size\n",
+    "    T := 1 // Sequence Len\n",
+    "    C := 2 // Dimensions\n",
+    "    out := make([]float32, len(inp))\n",
+    "    encoderForward(out, inp, wte, wpe, B, T, C)\n",
+    "    expectedOut := []float32{6, 8}\n",
+    "    assert.Equal(t, expectedOut, out)\n",
     "}"
    ]
   },
@@ -1105,7 +1119,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 110,
    "id": "8b35d061-c965-42b4-b6ce-ea7d42a36593",
    "metadata": {},
    "outputs": [],
@@ -1151,7 +1165,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 111,
    "id": "c2508379-75cd-4642-9144-19e1c18c4ed7",
    "metadata": {},
    "outputs": [
@@ -1160,9 +1174,7 @@
      "output_type": "stream",
      "text": [
       "=== RUN   TestLayernormForward\n",
-      "=== RUN   TestLayernormForward/#00\n",
       "--- PASS: TestLayernormForward (0.00s)\n",
-      "    --- PASS: TestLayernormForward/#00 (0.00s)\n",
       "PASS\n"
      ]
     }
@@ -1170,70 +1182,29 @@
    "source": [
     "%test\n",
     "func TestLayernormForward(t *testing.T) {\n",
-    "\ttype args struct {\n",
-    "\t\tinp    []float32\n",
-    "\t\tweight []float32\n",
-    "\t\tbias   []float32\n",
-    "\t\tB      int\n",
-    "\t\tT      int\n",
-    "\t\tC      int\n",
-    "\t}\n",
-    "\ttests := []struct {\n",
-    "\t\tname     string\n",
-    "\t\targs     args\n",
-    "\t\twantOut  []float32\n",
-    "\t\twantMean []float32\n",
-    "\t\twantRstd []float32\n",
-    "\t}{\n",
-    "\t\t{\n",
-    "\t\t\tname: \"\",\n",
-    "\t\t\targs: args{\n",
-    "\t\t\t\tinp:    []float32{0.2, 0.1, 0.3, 0.5, 0.1, 0.1},\n",
-    "\t\t\t\tweight: []float32{1, 1, 1, 1, 1, 1},\n",
-    "\t\t\t\tbias:   []float32{0, 0, 0, 0, 0, 0},\n",
-    "\t\t\t\tB:      2,\n",
-    "\t\t\t\tT:      1,\n",
-    "\t\t\t\tC:      3,\n",
-    "\t\t\t},\n",
-    "\t\t\twantOut:  []float32{0, -1.2238272, 1.2238274, 1.4140146, -0.70700747, -0.70700747},\n",
-    "\t\t\twantMean: []float32{0.2, 0.23333335},\n",
-    "\t\t\twantRstd: []float32{12.238273, 5.302555},\n",
-    "\t\t},\n",
-    "\t}\n",
-    "\tfor _, tt := range tests {\n",
-    "\t\tt.Run(tt.name, func(t *testing.T) {\n",
-    "\t\t\tout, mean, rstd := make([]float32, len(tt.args.inp)), make([]float32, tt.args.B*tt.args.T), make([]float32, tt.args.B*tt.args.T)\n",
-    "\t\t\tlayernormForward(out, mean, rstd, tt.args.inp, tt.args.weight, tt.args.bias, tt.args.B, tt.args.T, tt.args.C)\n",
-    "\t\t\trequire.InDeltaSlice(t, tt.wantOut, out, delta)\n",
-    "\t\t\trequire.InDeltaSlice(t, tt.wantMean, mean, delta)\n",
-    "\t\t\trequire.InDeltaSlice(t, tt.wantRstd, rstd, delta)\n",
-    "\t\t})\n",
-    "\t}\n",
+    "\t// Test 1\n",
+    "\tinp := []float32{0.2, 0.1, 0.3, 0.5, 0.1, 0.1}\n",
+    "\tweight := []float32{1, 1, 1, 1, 1, 1}\n",
+    "\tbias := []float32{0, 0, 0, 0, 0, 0}\n",
+    "\tB := 2\n",
+    "\tT := 1\n",
+    "\tC := 3\n",
+    "\tout, mean, rstd := make([]float32, len(inp)), make([]float32, B*T), make([]float32, B*T)\n",
+    "\tlayernormForward(out, mean, rstd, inp, weight, bias, B, T, C)\n",
+    "\n",
+    "\twantOut := []float32{0, -1.2238272, 1.2238274, 1.4140146, -0.70700747, -0.70700747}\n",
+    "\twantMean := []float32{0.2, 0.23333335}\n",
+    "\twantRstd := []float32{12.238273, 5.302555}\n",
+    "\n",
+    "\trequire.InDeltaSlice(t, wantOut, out, delta)\n",
+    "\trequire.InDeltaSlice(t, wantMean, mean, delta)\n",
+    "\trequire.InDeltaSlice(t, wantRstd, rstd, delta)\n",
     "}"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "2cf70f39-3cb7-4314-9318-92623c35e773",
-   "metadata": {},
-   "source": [
-    "# Matmul forward\n",
-    "\n",
-    "matmulForward performs matrix multiplication and adds bias.\n",
-    "Parameters:\n",
-    "  - out: output matrix\n",
-    "  - inp: input matrix\n",
-    "  - weight: weight matrix\n",
-    "  - bias: bias vector\n",
-    "  - B: batch size\n",
-    "  - T: sequence length (number of time steps)\n",
-    "  - C: input dimension (number of features)\n",
-    "  - OC: number of output channels"
-   ]
-  },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 112,
    "id": "b95fe909-e5c1-4159-82a8-a2d969978a0b",
    "metadata": {},
    "outputs": [],
@@ -1273,7 +1244,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 113,
    "id": "3bf1229d-cb85-41f3-b425-4db84940a49b",
    "metadata": {},
    "outputs": [
@@ -1363,12 +1334,14 @@
     "  - B: batch size\n",
     "  - T: sequence length (number of time steps)\n",
     "  - C: input dimension (number of features)\n",
-    "  - NH: number of attention heads"
+    "  - NH: number of attention heads\n",
+    "\n",
+    "![attention](attention.svg)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 114,
    "id": "b5b4dc98-1c0c-45c6-bb16-f7167fb97584",
    "metadata": {},
    "outputs": [],
@@ -1470,7 +1443,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 62,
+   "execution_count": 115,
    "id": "5e8663da-22f7-42a6-831e-c2fc954a470d",
    "metadata": {},
    "outputs": [
@@ -1582,12 +1555,14 @@
     "# Residual forward\n",
     "https://arxiv.org/abs/1512.03385\n",
     "\n",
-    "residualForward implements a simple residual connection, a common technique used in deep neural networks to improve training and performance."
+    "residualForward implements a simple residual connection, a common technique used in deep neural networks to improve training and performance.\n",
+    "\n",
+    "![residual](residual.svg)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 116,
    "id": "6b7e0789-1f0b-4d32-a922-029bfb109c3b",
    "metadata": {},
    "outputs": [],
@@ -1616,7 +1591,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 117,
    "id": "dc544479-7bf5-4bdf-8913-f11718c06626",
    "metadata": {},
    "outputs": [],
@@ -1649,7 +1624,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 118,
    "id": "357b6249-95b3-4ee0-a59d-e659636e2b1a",
    "metadata": {},
    "outputs": [],
@@ -1699,7 +1674,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 61,
+   "execution_count": 119,
    "id": "584ffc62-c932-40be-9aa0-0da83d866d4d",
    "metadata": {},
    "outputs": [],
@@ -1734,7 +1709,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": 120,
    "id": "deac87d4-e901-401e-b109-9642ea02bc8d",
    "metadata": {},
    "outputs": [],
@@ -1895,12 +1870,22 @@
    "id": "cb2fe3e3-d8ff-40a0-98e5-a9d8666f380d",
    "metadata": {},
    "source": [
-    "# sampleMult"
+    "# Sampling\n",
+    "\n",
+    "The probabilities are a float array of:\n",
+    "\n",
+    "index/tokenid:probability\n",
+    "\n",
+    "coin is a random value between 0 and 1. \n",
+    "\n",
+    "We start with a cumulative sum, and when it gets above our target coin, we return. \n",
+    "\n",
+    "This means the token most likely to be returned is the one with the highest probability, but we still have the possibility of choosing other tokens, in proportion to how likely they are."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 59,
+   "execution_count": 122,
    "id": "1b21fd77-d192-4d1a-9a9d-14f36ba3d812",
    "metadata": {},
    "outputs": [],
@@ -1919,7 +1904,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 58,
+   "execution_count": 123,
    "id": "b539b57e-025a-4f73-94cf-02ac09860f01",
    "metadata": {},
    "outputs": [],
@@ -1960,7 +1945,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 124,
    "id": "ea5451cb-3790-4935-a7cb-7238d0b739b3",
    "metadata": {},
    "outputs": [],
@@ -1984,7 +1969,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 49,
+   "execution_count": 127,
    "id": "92b53304-1961-48ee-93d3-b679c68526fc",
    "metadata": {},
    "outputs": [
@@ -1996,7 +1981,6 @@
       "=== RUN   TestLoadGPT2Model/#00\n",
       "input is 4 tokens long\n",
       "generating token: 0\n",
-      "inference time took: 31.292µs\n",
       "--- FAIL: TestLoadGPT2Model (0.00s)\n",
       "    --- FAIL: TestLoadGPT2Model/#00 (0.00s)\n"
      ]
@@ -2008,20 +1992,20 @@
       "panic: runtime error: slice bounds out of range [-3:] [recovered]\n",
       "\tpanic: runtime error: slice bounds out of range [-3:]\n",
       "\n",
-      "goroutine 7 [running]:\n",
-      "testing.tRunner.func1.2({0x100e1cfa0, 0x14000184018})\n",
+      "goroutine 20 [running]:\n",
+      "testing.tRunner.func1.2({0x1042fc560, 0x1400012e300})\n",
       "\t/opt/homebrew/Cellar/go/1.22.2/libexec/src/testing/testing.go:1631 +0x1c4\n",
       "testing.tRunner.func1()\n",
       "\t/opt/homebrew/Cellar/go/1.22.2/libexec/src/testing/testing.go:1634 +0x33c\n",
-      "panic({0x100e1cfa0?, 0x14000184018?})\n",
+      "panic({0x1042fc560?, 0x1400012e300?})\n",
       "\t/opt/homebrew/Cellar/go/1.22.2/libexec/src/runtime/panic.go:770 +0x124\n",
-      "gonb_706a570e.(*GPT2).Inference(0x14000172f70, {0x100d7984e, 0x4}, 0x1, 0x2)\n",
-      "\t \u001b[7m[[ Cell [25] Line 28 ]]\u001b[0m /var/folders/b_/lv3cnbp904q9_0ndh5mkmp2h0000gn/T/gonb_706a570e/main_test.go:507 +0x4e4\n",
-      "gonb_706a570e.TestLoadGPT2Model.func1(0x140001169c0)\n",
-      "\t \u001b[7m[[ Cell [49] Line 29 ]]\u001b[0m /var/folders/b_/lv3cnbp904q9_0ndh5mkmp2h0000gn/T/gonb_706a570e/main_test.go:1119 +0x9c\n",
-      "testing.tRunner(0x140001169c0, 0x14000138380)\n",
+      "gonb_706a570e.(*GPT2).Inference(0x1400019ef70, {0x104258d2e?, 0x3?}, 0x1, 0x2)\n",
+      "\t \u001b[7m[[ Cell [123] Line 23 ]]\u001b[0m /var/folders/b_/lv3cnbp904q9_0ndh5mkmp2h0000gn/T/gonb_706a570e/main_test.go:502 +0x440\n",
+      "gonb_706a570e.TestLoadGPT2Model.func1(0x140001329c0)\n",
+      "\t \u001b[7m[[ Cell [127] Line 29 ]]\u001b[0m /var/folders/b_/lv3cnbp904q9_0ndh5mkmp2h0000gn/T/gonb_706a570e/main_test.go:989 +0x9c\n",
+      "testing.tRunner(0x140001329c0, 0x1400015e380)\n",
       "\t/opt/homebrew/Cellar/go/1.22.2/libexec/src/testing/testing.go:1689 +0xec\n",
-      "created by testing.(*T).Run in goroutine 6\n",
+      "created by testing.(*T).Run in goroutine 19\n",
       "\t/opt/homebrew/Cellar/go/1.22.2/libexec/src/testing/testing.go:1742 +0x318\n",
       "exit status 2\n"
      ]
@@ -2066,7 +2050,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 126,
    "id": "639850ea-2c6f-4939-8893-41cbb91e677e",
    "metadata": {},
    "outputs": [
@@ -2077,8 +2061,7 @@
       "=== RUN   TestInference\n",
       "input is 10 tokens long\n",
       "generating token: 0\n",
-      "inference time took: 226.473625ms\n",
-      "--- FAIL: TestInference (0.72s)\n"
+      "--- FAIL: TestInference (0.91s)\n"
      ]
     },
     {
@@ -2089,17 +2072,17 @@
       "\tpanic: runtime error: slice bounds out of range [-50257:]\n",
       "\n",
       "goroutine 6 [running]:\n",
-      "testing.tRunner.func1.2({0x104eccf40, 0x1403c6dc018})\n",
+      "testing.tRunner.func1.2({0x100790560, 0x14000126168})\n",
       "\t/opt/homebrew/Cellar/go/1.22.2/libexec/src/testing/testing.go:1631 +0x1c4\n",
       "testing.tRunner.func1()\n",
       "\t/opt/homebrew/Cellar/go/1.22.2/libexec/src/testing/testing.go:1634 +0x33c\n",
-      "panic({0x104eccf40?, 0x1403c6dc018?})\n",
+      "panic({0x100790560?, 0x14000126168?})\n",
       "\t/opt/homebrew/Cellar/go/1.22.2/libexec/src/runtime/panic.go:770 +0x124\n",
-      "gonb_706a570e.(*GPT2).Inference(0x1400011f008, {0x0, 0x0}, 0x1, 0x9)\n",
-      "\t \u001b[7m[[ Cell [25] Line 28 ]]\u001b[0m /var/folders/b_/lv3cnbp904q9_0ndh5mkmp2h0000gn/T/gonb_706a570e/main_test.go:411 +0x4e4\n",
-      "gonb_706a570e.TestInference(0x14000116b60)\n",
-      "\t \u001b[7m[[ Cell [28] Line 6 ]]\u001b[0m /var/folders/b_/lv3cnbp904q9_0ndh5mkmp2h0000gn/T/gonb_706a570e/main_test.go:849 +0x78\n",
-      "testing.tRunner(0x14000116b60, 0x104ee2138)\n",
+      "gonb_706a570e.(*GPT2).Inference(0x140000a7008, {0x0?, 0x0?}, 0x1, 0x9)\n",
+      "\t \u001b[7m[[ Cell [123] Line 23 ]]\u001b[0m /var/folders/b_/lv3cnbp904q9_0ndh5mkmp2h0000gn/T/gonb_706a570e/main_test.go:502 +0x440\n",
+      "gonb_706a570e.TestInference(0x1400009eea0)\n",
+      "\t \u001b[7m[[ Cell [126] Line 6 ]]\u001b[0m /var/folders/b_/lv3cnbp904q9_0ndh5mkmp2h0000gn/T/gonb_706a570e/main_test.go:937 +0x78\n",
+      "testing.tRunner(0x1400009eea0, 0x1007a5560)\n",
       "\t/opt/homebrew/Cellar/go/1.22.2/libexec/src/testing/testing.go:1689 +0xec\n",
       "created by testing.(*T).Run in goroutine 1\n",
       "\t/opt/homebrew/Cellar/go/1.22.2/libexec/src/testing/testing.go:1742 +0x318\n",
@@ -2121,7 +2104,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": null,
    "id": "b4e65fbf-a631-457c-876d-450c4fe5af70",
    "metadata": {},
    "outputs": [
@@ -2159,7 +2142,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": null,
    "id": "9e966913-1023-4307-b419-51a0642c618e",
    "metadata": {},
    "outputs": [],
@@ -2200,7 +2183,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": null,
    "id": "7529a3c7-6a19-4efe-93c2-41fa750cb3b8",
    "metadata": {},
    "outputs": [],
@@ -2262,7 +2245,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": null,
    "id": "b13930e2-bdfd-4724-8213-af1087ae32a6",
    "metadata": {},
    "outputs": [],
@@ -2319,7 +2302,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": null,
    "id": "c7868ba4-d181-4ee1-80e6-6cedd23944a9",
    "metadata": {},
    "outputs": [],
@@ -2344,7 +2327,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": null,
    "id": "037bb57f-2a32-473e-be3e-986e7aa12853",
    "metadata": {},
    "outputs": [],
@@ -2375,7 +2358,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": null,
    "id": "f833485b-57c3-4f6a-bfab-c1ed0cd6e449",
    "metadata": {},
    "outputs": [],
@@ -2460,7 +2443,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": null,
    "id": "d6ddc99a-0b48-43aa-a3c6-76b824a84dc2",
    "metadata": {},
    "outputs": [],
@@ -2528,7 +2511,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": null,
    "id": "e38b3ac4-ff9f-439b-ac9a-9fbedd9b1bbe",
    "metadata": {},
    "outputs": [],
@@ -2559,7 +2542,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": null,
    "id": "8580eaa8-af05-4f51-b83b-94d9c44c5de1",
    "metadata": {},
    "outputs": [],
@@ -2577,7 +2560,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": null,
    "id": "275503ff-b344-4f1c-9619-b3c9c1dc9956",
    "metadata": {},
    "outputs": [],
@@ -2609,7 +2592,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": null,
    "id": "ada64c1b-6e14-45cd-99c7-b04139e07726",
    "metadata": {},
    "outputs": [],
@@ -2714,7 +2697,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": null,
    "id": "8db1366e-a3de-42ff-88d9-aecc8559f7b6",
    "metadata": {},
    "outputs": [],
@@ -2790,7 +2773,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": null,
    "id": "71cded43-3dbe-4730-a6a5-62c7352c5ddf",
    "metadata": {},
    "outputs": [
diff --git a/notebook/residual.svg b/notebook/residual.svg
new file mode 100644
index 0000000..75d60e9
--- /dev/null
+++ b/notebook/residual.svg
@@ -0,0 +1,21 @@
+<svg version="1.1" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 209.4832038324912 164.88853428006496" width="418.9664076649824" height="329.7770685601299">
+  <!-- svg-source:excalidraw -->
+  
+  <defs>
+    <style class="style-fonts">
+      @font-face {
+        font-family: "Virgil";
+        src: url("https://excalidraw.com/Virgil.woff2");
+      }
+      @font-face {
+        font-family: "Cascadia";
+        src: url("https://excalidraw.com/Cascadia.woff2");
+      }
+      @font-face {
+        font-family: "Assistant";
+        src: url("https://excalidraw.com/Assistant-Regular.woff2");
+      }
+    </style>
+    
+  </defs>
+  <rect x="0" y="0" width="209.4832038324912" height="164.88853428006496" fill="#ffffff"></rect><g stroke-linecap="round" transform="translate(10 27.082169433893796) rotate(0 58.80173489643107 63.90318242308558)"><path d="M0 0 L117.6 0 L117.6 127.81 L0 127.81" stroke="none" stroke-width="0" fill="#e9ecef"></path><path d="M0 0 C32.69 0, 65.38 0, 117.6 0 M0 0 C43.1 0, 86.2 0, 117.6 0 M117.6 0 C117.6 51.07, 117.6 102.13, 117.6 127.81 M117.6 0 C117.6 42.8, 117.6 85.61, 117.6 127.81 M117.6 127.81 C81.28 127.81, 44.95 127.81, 0 127.81 M117.6 127.81 C93.14 127.81, 68.68 127.81, 0 127.81 M0 127.81 C0 85.41, 0 43.01, 0 0 M0 127.81 C0 96.71, 0 65.61, 0 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g stroke-linecap="round" transform="translate(32.115349518160656 35.33952057924398) rotate(0 44.27296203323044 39.83704062934157)"><path d="M19.92 0 C32.1 0, 44.29 0, 68.63 0 M19.92 0 C36.82 0, 53.72 0, 68.63 0 M68.63 0 C81.91 0, 88.55 6.64, 88.55 19.92 M68.63 0 C81.91 0, 88.55 6.64, 88.55 19.92 M88.55 19.92 C88.55 33.13, 88.55 46.35, 88.55 59.76 M88.55 19.92 C88.55 29.61, 88.55 39.31, 88.55 59.76 M88.55 59.76 C88.55 73.03, 81.91 79.67, 68.63 79.67 M88.55 59.76 C88.55 73.03, 81.91 79.67, 68.63 79.67 M68.63 79.67 C52.03 79.67, 35.44 79.67, 19.92 79.67 M68.63 79.67 C55.75 79.67, 42.88 79.67, 19.92 79.67 M19.92 79.67 C6.64 79.67, 0 73.03, 0 59.76 M19.92 79.67 C6.64 79.67, 0 73.03, 0 59.76 M0 59.76 C0 45.88, 0 32.01, 0 19.92 M0 59.76 C0 44.6, 0 29.44, 0 19.92 M0 19.92 C0 6.64, 6.64 0, 19.92 0 M0 19.92 C0 6.64, 6.64 0, 19.92 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g stroke-linecap="round" transform="translate(22.559865213828743 32.02022475751983) rotate(0 19.822898797887774 43.61924530095621)"><path d="M0 0 L39.65 0 L39.65 87.24 L0 87.24" stroke="none" stroke-width="0" fill="#e9ecef"></path><path d="M0 0 C10.17 0, 20.34 0, 39.65 0 M0 0 C15.33 0, 30.65 0, 39.65 0 M39.65 0 C39.65 19.6, 39.65 39.19, 39.65 87.24 M39.65 0 C39.65 22.75, 39.65 45.5, 39.65 
87.24 M39.65 87.24 C26.88 87.24, 14.11 87.24, 0 87.24 M39.65 87.24 C26.1 87.24, 12.56 87.24, 0 87.24 M0 87.24 C0 57.52, 0 27.8, 0 0 M0 87.24 C0 64.18, 0 41.12, 0 0" stroke="transparent" stroke-width="2" fill="none"></path></g><g stroke-linecap="round" transform="translate(21.51409772369516 106.65083589469577) rotate(0 39.65208402070016 7.604828169534812)"><path d="M0 0 L79.3 0 L79.3 15.21 L0 15.21" stroke="none" stroke-width="0" fill="#eef3b5"></path><path d="M0 0 C23.22 0, 46.45 0, 79.3 0 M0 0 C23.25 0, 46.51 0, 79.3 0 M79.3 0 C79.3 4.58, 79.3 9.16, 79.3 15.21 M79.3 0 C79.3 4.6, 79.3 9.2, 79.3 15.21 M79.3 15.21 C56.58 15.21, 33.86 15.21, 0 15.21 M79.3 15.21 C52.97 15.21, 26.64 15.21, 0 15.21 M0 15.21 C0 11, 0 6.8, 0 0 M0 15.21 C0 10.44, 0 5.68, 0 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(25.113203921157037 106.58939106333855) rotate(0 25.23088836669922 7.794799129418607)"><text x="0" y="12.47194337606214" font-family="Helvetica, Segoe UI Emoji" font-size="13.5561723989891px" fill="#1e1e1e" text-anchor="start" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">       Add</text></g><g stroke-linecap="round"><g transform="translate(61.73078133274794 10) rotate(0 0 23.533204096706413)"><path d="M0 0 C0 16.3, 0 32.6, 0 47.07 M0 0 C0 17.57, 0 35.14, 0 47.07" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round" transform="translate(21.52160891211497 57.85385411849893) rotate(0 39.64477461325532 22.817662184918845)"><path d="M0 0 L79.29 0 L79.29 45.64 L0 45.64" stroke="none" stroke-width="0" fill="#feddac"></path><path d="M0 0 C17.81 0, 35.62 0, 79.29 0 M0 0 C20.68 0, 41.35 0, 79.29 0 M79.29 0 C79.29 14.7, 79.29 29.39, 79.29 45.64 M79.29 0 C79.29 15.59, 79.29 31.18, 79.29 45.64 M79.29 45.64 C52.28 45.64, 25.27 45.64, 0 45.64 M79.29 45.64 C58.33 45.64, 37.37 45.64, 0 45.64 M0 45.64 C0 34.97, 0 24.31, 0 0 M0 45.64 C0 35.52, 0 25.4, 0 0" stroke="#1e1e1e" 
stroke-width="2" fill="none"></path></g><g transform="translate(30.478410157225426 57.15146623536748) rotate(0 29.86284637451172 15.650807463593537)"><text x="29.86284637451172" y="12.520911779425331" font-family="Helvetica, Segoe UI Emoji" font-size="13.609397794428979px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic"></text><text x="29.86284637451172" y="28.171719243018657" font-family="Helvetica, Segoe UI Emoji" font-size="13.609397794428979px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">Operation</text></g><g stroke-linecap="round"><g transform="translate(59.81611753856305 51.51537979689056) rotate(0 1.059535393282431 2.757208822508801)"><path d="M0 0 C0.55 1.43, 1.1 2.86, 2.12 5.51 M0 0 C0.75 1.95, 1.5 3.91, 2.12 5.51" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(61.95765452863452 57.14858058961545) rotate(0 0.9141717410632282 -2.9088855668724136)"><path d="M0 0 C0.61 -1.93, 1.21 -3.86, 1.83 -5.82 M0 0 C0.44 -1.42, 0.89 -2.83, 1.83 -5.82" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(62.14278265647954 103.65699109285015) rotate(0 0 1.3513421274938082)"><path d="M0 0 C0 0.45, 0 2.25, 0 2.7 M0 0 C0 0.45, 0 2.25, 0 2.7" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(103.41699095210606 111.00442719646526) rotate(90 1.059535393282431 2.757208822508801)"><path d="M0 0 C0.67 1.76, 1.35 3.51, 2.12 5.51 M0 0 C0.55 1.44, 1.11 2.89, 2.12 5.51" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(103.59524820098159 118.66669492369874) rotate(90 0.9141717410632282 -2.9088855668724136)"><path d="M0 0 C0.69 -2.2, 1.39 -4.41, 1.83 -5.82 M0 0 C0.65 
-2.07, 1.3 -4.14, 1.83 -5.82" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(61.81446444676658 121.85851847784761) rotate(0 0 7.350146489091458)"><path d="M0 0 C0 4.21, 0 8.42, 0 14.7 M0 0 C0 4.88, 0 9.76, 0 14.7" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(61.75091062928641 122.92962724972949) rotate(0 0 11.486520266928437)"><path d="M0 0 C0 8.43, 0 16.86, 0 22.97 M0 0 C0 8.54, 0 17.09, 0 22.97" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(59.58933108343899 140.34438218383093) rotate(0 1.1033065276544676 2.871113613823127)"><path d="M0 0 C0.75 1.96, 1.5 3.91, 2.21 5.74 M0 0 C0.58 1.52, 1.17 3.04, 2.21 5.74" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(61.81933845774165 146.21029968465973) rotate(0 0.9519376659875434 -3.0290563717630903)"><path d="M0 0 C0.53 -1.68, 1.05 -3.35, 1.9 -6.06 M0 0 C0.6 -1.9, 1.19 -3.8, 1.9 -6.06" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(132.41212519750843 60.543641929230944) rotate(270 0 10.767214242026057)"><path d="M0 0 C0 7.64, 0 15.27, 0 21.53 M0 0 C0 7.37, 0 14.73, 0 21.53" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g transform="translate(146.5641364496787 63.33929114535749) rotate(0 26.45953369140625 7.825403731796541)"><text x="26.45953369140625" y="12.52091177942533" font-family="Helvetica, Segoe UI Emoji" font-size="13.609397794428977px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">Residual</text></g></svg>
\ No newline at end of file
diff --git a/notebook/tokenization.svg b/notebook/tokenization.svg
new file mode 100644
index 0000000..b5d8f50
--- /dev/null
+++ b/notebook/tokenization.svg
@@ -0,0 +1,21 @@
+<svg version="1.1" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100.01131621190234 134.85251339265506" width="200.02263242380468" height="269.7050267853101">
+  <!-- svg-source:excalidraw -->
+  
+  <defs>
+    <style class="style-fonts">
+      @font-face {
+        font-family: "Virgil";
+        src: url("https://excalidraw.com/Virgil.woff2");
+      }
+      @font-face {
+        font-family: "Cascadia";
+        src: url("https://excalidraw.com/Cascadia.woff2");
+      }
+      @font-face {
+        font-family: "Assistant";
+        src: url("https://excalidraw.com/Assistant-Regular.woff2");
+      }
+    </style>
+    
+  </defs>
+  <rect x="0" y="0" width="100.01131621190234" height="134.85251339265506" fill="#ffffff"></rect><g transform="translate(10 10) rotate(0 38.937225341796875 7.825403731796541)"><text x="38.937225341796875" y="12.52091177942533" font-family="Helvetica, Segoe UI Emoji" font-size="13.609397794428977px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">Input context</text></g><g stroke-linecap="round"><g transform="translate(50.40001268998185 82.51912740393527) rotate(0 0 10.767214242026057)"><path d="M0 0 C0 5.33, 0 10.65, 0 21.53 M0 0 C0 8.48, 0 16.97, 0 21.53" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(48.48753813739768 98.52181197562822) rotate(0 1.059535393282431 2.757208822508801)"><path d="M0 0 C0.49 1.28, 0.99 2.57, 2.12 5.51 M0 0 C0.7 1.82, 1.4 3.63, 2.12 5.51" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(50.62907512746824 104.15501276834948) rotate(0 0.9141717410632282 -2.9088855668724136)"><path d="M0 0 C0.57 -1.8, 1.13 -3.6, 1.83 -5.82 M0 0 C0.47 -1.49, 0.93 -2.97, 1.83 -5.82" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round" transform="translate(10.721766985391696 52.84193015485471) rotate(0 39.64477461325532 14.143613157191794)"><path d="M0 0 L79.29 0 L79.29 28.29 L0 28.29" stroke="none" stroke-width="0" fill="#a5d8ff"></path><path d="M0 0 C27.47 0, 54.93 0, 79.29 0 M0 0 C21.6 0, 43.19 0, 79.29 0 M79.29 0 C79.29 8.78, 79.29 17.55, 79.29 28.29 M79.29 0 C79.29 5.79, 79.29 11.57, 79.29 28.29 M79.29 28.29 C61.05 28.29, 42.82 28.29, 0 28.29 M79.29 28.29 C54.14 28.29, 29 28.29, 0 28.29 M0 28.29 C0 21.51, 0 14.72, 0 0 M0 28.29 C0 17.45, 0 6.61, 0 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(20.467129523165568 59.13537370042832) rotate(0 29.122649333049367 
7.825403731796541)"><text x="29.12264933304931" y="12.52091177942533" font-family="Helvetica, Segoe UI Emoji" font-size="13.609397794428977px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">Tokenizer</text></g><g stroke-linecap="round"><g transform="translate(47.7805387426306 28.50994072494359) rotate(0 0 10.767214242026057)"><path d="M0 0 C0 8.36, 0 16.72, 0 21.53 M0 0 C0 5.93, 0 11.86, 0 21.53" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(45.78401727084565 44.384548420748615) rotate(0 1.059535393282431 2.757208822508801)"><path d="M0 0 C0.49 1.27, 0.98 2.54, 2.12 5.51 M0 0 C0.57 1.48, 1.14 2.96, 2.12 5.51" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(47.925554260917124 50.01774921347533) rotate(0 0.9141717410632282 -2.9088855668724136)"><path d="M0 0 C0.43 -1.36, 0.85 -2.71, 1.83 -5.82 M0 0 C0.44 -1.41, 0.89 -2.83, 1.83 -5.82" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g transform="translate(30.576532055349617 109.20170592906197) rotate(0 20.03472900390625 7.825403731796541)"><text x="20.03472900390625" y="12.52091177942533" font-family="Helvetica, Segoe UI Emoji" font-size="13.609397794428977px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">tokens</text></g></svg>
\ No newline at end of file