diff --git a/Week 01/Python review session/python-review-demo.py b/Week 01/Python review session/python-review-demo.py new file mode 100644 index 0000000..5c51552 --- /dev/null +++ b/Week 01/Python review session/python-review-demo.py @@ -0,0 +1,28 @@ +import numpy as np + +def power_iteration(A, tolerance=1e-7): + b_old = np.random.rand(A.shape[1]) + b = np.random.rand(A.shape[1]) + num_iterations = 0 + while num_iterations == 0 or np.linalg.norm(b_old - b) > tolerance: + b_old = np.copy(b) + b = np.dot(A, b) + b_norm = np.linalg.norm(b) + b /= b_norm + num_iterations += 1 + return np.dot(A, b), b, num_iterations + +def main(): + A = np.array([[.5, .4], [.2, .8]]) + ab, b, number_iterations = power_iteration(A) + + eig1 = ab[0] / b[0] + eig2 = ab[1] / b[1] + assert(np.abs((eig1 - eig2) / eig2) < 1e-5) + + b /= b[1] + + print(eig1, b, number_iterations) + +if __name__ == '__main__': + main() diff --git a/Week 03/Assignment 3/README.md b/Week 03/Assignment 3/README.md new file mode 100644 index 0000000..335eb7e --- /dev/null +++ b/Week 03/Assignment 3/README.md @@ -0,0 +1,3 @@ +# Assignment 3 + +- [Handout](http://web.stanford.edu/class/cs224n/assignments/a3.pdf) diff --git a/Week 03/Assignment 3/collect_submission.sh b/Week 03/Assignment 3/collect_submission.sh new file mode 100755 index 0000000..02acf7c --- /dev/null +++ b/Week 03/Assignment 3/collect_submission.sh @@ -0,0 +1,2 @@ +rm -f assignment3.zip +zip -r assignment3.zip *.py ./data ./utils diff --git a/Week 03/Assignment 3/local_env.yml b/Week 03/Assignment 3/local_env.yml new file mode 100755 index 0000000..6325364 --- /dev/null +++ b/Week 03/Assignment 3/local_env.yml @@ -0,0 +1,11 @@ +name: a3 +channels: + - pytorch + - defaults +dependencies: + - python=3.7 + - numpy + - tqdm + - docopt + - pytorch + - torchvision diff --git a/Week 03/Assignment 3/parser_model.py b/Week 03/Assignment 3/parser_model.py new file mode 100755 index 0000000..851e131 --- /dev/null +++ b/Week 03/Assignment 3/parser_model.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +CS224N 2019-20: Homework 3 +parser_model.py: Feed-Forward Neural Network for Dependency Parsing +Sahil Chopra +Haoshen Hong +""" +import argparse +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class ParserModel(nn.Module): + """ Feedforward neural network with an embedding layer and two hidden layers. + The ParserModel will predict which transition should be applied to a + given partial parse configuration. + + PyTorch Notes: + - Note that "ParserModel" is a subclass of the "nn.Module" class. In PyTorch all neural networks + are a subclass of this "nn.Module". + - The "__init__" method is where you define all the layers and parameters + (embedding layers, linear layers, dropout layers, etc.). + - "__init__" gets automatically called when you create a new instance of your class, e.g. + when you write "m = ParserModel()". + - Other methods of ParserModel can access variables that have "self." prefix. Thus, + you should add the "self." prefix layers, values, etc. that you want to utilize + in other ParserModel methods. + - For further documentation on "nn.Module" please see https://pytorch.org/docs/stable/nn.html. + """ + def __init__(self, embeddings, n_features=36, + hidden_size=200, n_classes=3, dropout_prob=0.5): + """ Initialize the parser model. 
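As a quick cross-check of the power-iteration demo above, the dominant eigenpair it converges to can be compared against NumPy's dense eigensolver. This snippet is illustrative only and not part of the assignment files; it reuses the same matrix as the demo's main().

import numpy as np

A = np.array([[.5, .4], [.2, .8]])
eigvals, eigvecs = np.linalg.eig(A)
k = np.argmax(np.abs(eigvals))
print(eigvals[k])                     # dominant eigenvalue, ~0.970
print(eigvecs[:, k] / eigvecs[1, k])  # eigenvector scaled so its second entry is 1, as in the demo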
+ + @param embeddings (ndarray): word embeddings (num_words, embedding_size) + @param n_features (int): number of input features + @param hidden_size (int): number of hidden units + @param n_classes (int): number of output classes + @param dropout_prob (float): dropout probability + """ + super(ParserModel, self).__init__() + self.n_features = n_features + self.n_classes = n_classes + self.dropout_prob = dropout_prob + self.embed_size = embeddings.shape[1] + self.hidden_size = hidden_size + self.embeddings = nn.Parameter(torch.tensor(embeddings)) + + ### YOUR CODE HERE (~10 Lines) + ### TODO: + ### 1) Declare `self.embed_to_hidden_weight` and `self.embed_to_hidden_bias` as `nn.Parameter`. + ### Initialize weight with the `nn.init.xavier_uniform_` function and bias with `nn.init.uniform_` + ### with default parameters. + ### 2) Construct `self.dropout` layer. + ### 3) Declare `self.hidden_to_logits_weight` and `self.hidden_to_logits_bias` as `nn.Parameter`. + ### Initialize weight with the `nn.init.xavier_uniform_` function and bias with `nn.init.uniform_` + ### with default parameters. + ### + ### Note: Trainable variables are declared as `nn.Parameter` which is a commonly used API + ### to include a tensor into a computational graph to support updating w.r.t its gradient. + ### Here, we use Xavier Uniform Initialization for our Weight initialization. + ### It has been shown empirically, that this provides better initial weights + ### for training networks than random uniform initialization. + ### For more details checkout this great blogpost: + ### http://andyljones.tumblr.com/post/110998971763/an-explanation-of-xavier-initialization + ### + ### Please see the following docs for support: + ### nn.Parameter: https://pytorch.org/docs/stable/nn.html#parameters + ### Initialization: https://pytorch.org/docs/stable/nn.init.html + ### Dropout: https://pytorch.org/docs/stable/nn.html#dropout-layers + + + + + ### END YOUR CODE + + def embedding_lookup(self, w): + """ Utilize `w` to select embeddings from embedding matrix `self.embeddings` + @param w (Tensor): input tensor of word indices (batch_size, n_features) + + @return x (Tensor): tensor of embeddings for words represented in w + (batch_size, n_features * embed_size) + """ + + ### YOUR CODE HERE (~1-3 Lines) + ### TODO: + ### 1) For each index `i` in `w`, select `i`th vector from self.embeddings + ### 2) Reshape the tensor using `view` function if necessary + ### + ### Note: All embedding vectors are stacked and stored as a matrix. The model receives + ### a list of indices representing a sequence of words, then it calls this lookup + ### function to map indices to sequence of embeddings. + ### + ### This problem aims to test your understanding of embedding lookup, + ### so DO NOT use any high level API like nn.Embedding + ### (we are asking you to implement that!). Pay attention to tensor shapes + ### and reshape if necessary. Make sure you know each tensor's shape before you run the code! + ### + ### Pytorch has some useful APIs for you, and you can use either one + ### in this problem (except nn.Embedding). These docs might be helpful: + ### Index select: https://pytorch.org/docs/stable/torch.html#torch.index_select + ### Gather: https://pytorch.org/docs/stable/torch.html#torch.gather + ### View: https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view + + + + ### END YOUR CODE + return x + + + def forward(self, w): + """ Run the model forward. 
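The three YOUR CODE HERE blocks in this file (the parameters in `__init__`, `embedding_lookup`, and `forward` below) can be completed along the lines of the following sketch. The class name `TinyParserModel` is used only to keep the example self-contained; this is one implementation consistent with the hints, not the official solution.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyParserModel(nn.Module):
    """Self-contained sketch of the blanks in ParserModel (illustrative only)."""
    def __init__(self, embeddings, n_features=36, hidden_size=200,
                 n_classes=3, dropout_prob=0.5):
        super().__init__()
        self.n_features = n_features
        self.embed_size = embeddings.shape[1]
        self.embeddings = nn.Parameter(torch.tensor(embeddings))
        # Trainable affine maps: Xavier-uniform weights, uniform biases (defaults).
        self.embed_to_hidden_weight = nn.Parameter(
            torch.empty(n_features * self.embed_size, hidden_size))
        nn.init.xavier_uniform_(self.embed_to_hidden_weight)
        self.embed_to_hidden_bias = nn.Parameter(
            nn.init.uniform_(torch.empty(hidden_size)))
        self.dropout = nn.Dropout(dropout_prob)
        self.hidden_to_logits_weight = nn.Parameter(
            torch.empty(hidden_size, n_classes))
        nn.init.xavier_uniform_(self.hidden_to_logits_weight)
        self.hidden_to_logits_bias = nn.Parameter(
            nn.init.uniform_(torch.empty(n_classes)))

    def embedding_lookup(self, w):
        # Index rows of the embedding matrix with the word ids, then flatten
        # the n_features embedding vectors into one long vector per example.
        x = self.embeddings[w]          # (batch_size, n_features, embed_size)
        return x.view(w.shape[0], -1)   # (batch_size, n_features * embed_size)

    def forward(self, w):
        x = self.embedding_lookup(w)
        h = F.relu(x @ self.embed_to_hidden_weight + self.embed_to_hidden_bias)
        # Dropout after the ReLU; softmax is left to the loss function.
        return self.dropout(h) @ self.hidden_to_logits_weight + self.hidden_to_logits_bias

# Shape check mirroring the sanity test at the bottom of this file:
#   TinyParserModel(np.zeros((100, 30), dtype=np.float32))(torch.randint(0, 100, (4, 36)))
# returns a (4, 3) tensor of logits.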
+ + Note that we will not apply the softmax function here because it is included in the loss function nn.CrossEntropyLoss + + PyTorch Notes: + - Every nn.Module object (PyTorch model) has a `forward` function. + - When you apply your nn.Module to an input tensor `w` this function is applied to the tensor. + For example, if you created an instance of your ParserModel and applied it to some `w` as follows, + the `forward` function would called on `w` and the result would be stored in the `output` variable: + model = ParserModel() + output = model(w) # this calls the forward function + - For more details checkout: https://pytorch.org/docs/stable/nn.html#torch.nn.Module.forward + + @param w (Tensor): input tensor of tokens (batch_size, n_features) + + @return logits (Tensor): tensor of predictions (output after applying the layers of the network) + without applying softmax (batch_size, n_classes) + """ + ### YOUR CODE HERE (~3-5 lines) + ### TODO: + ### Complete the forward computation as described in write-up. In addition, include a dropout layer + ### as decleared in `__init__` after ReLU function. + ### + ### Note: We do not apply the softmax to the logits here, because + ### the loss function (torch.nn.CrossEntropyLoss) applies it more efficiently. + ### + ### Please see the following docs for support: + ### Matrix product: https://pytorch.org/docs/stable/torch.html#torch.matmul + ### ReLU: https://pytorch.org/docs/stable/nn.html?highlight=relu#torch.nn.functional.relu + + + ### END YOUR CODE + return logits + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description='Simple sanity check for parser_model.py') + parser.add_argument('-e', '--embedding', action='store_true', help='sanity check for embeding_lookup function') + parser.add_argument('-f', '--forward', action='store_true', help='sanity check for forward function') + args = parser.parse_args() + + embeddings = np.zeros((100, 30), dtype=np.float32) + model = ParserModel(embeddings) + + def check_embedding(): + inds = torch.randint(0, 100, (4, 36), dtype=torch.long) + selected = model.embedding_lookup(inds) + assert np.all(selected.data.numpy() == 0), "The result of embedding lookup: " \ + + repr(selected) + " contains non-zero elements." + + def check_forward(): + inputs =torch.randint(0, 100, (4, 36), dtype=torch.long) + out = model(inputs) + expected_out_shape = (4, 3) + assert out.shape == expected_out_shape, "The result shape of forward is: " + repr(out.shape) + \ + " which doesn't match expected " + repr(expected_out_shape) + + if args.embedding: + check_embedding() + print("Embedding_lookup sanity check passes!") + + if args.forward: + check_forward() + print("Forward sanity check passes!") \ No newline at end of file diff --git a/Week 03/Assignment 3/parser_transitions.py b/Week 03/Assignment 3/parser_transitions.py new file mode 100755 index 0000000..2f99ed0 --- /dev/null +++ b/Week 03/Assignment 3/parser_transitions.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +CS224N 2019-20: Homework 3 +parser_transitions.py: Algorithms for completing partial parsess. +Sahil Chopra +Haoshen Hong +""" + +import sys + +class PartialParse(object): + def __init__(self, sentence): + """Initializes this partial parse. + + @param sentence (list of str): The sentence to be parsed as a list of words. + Your code should not modify the sentence. + """ + # The sentence being parsed is kept for bookkeeping purposes. Do not alter it in your code. 
+ self.sentence = sentence + + ### YOUR CODE HERE (3 Lines) + ### Your code should initialize the following fields: + ### self.stack: The current stack represented as a list with the top of the stack as the + ### last element of the list. + ### self.buffer: The current buffer represented as a list with the first item on the + ### buffer as the first item of the list + ### self.dependencies: The list of dependencies produced so far. Represented as a list of + ### tuples where each tuple is of the form (head, dependent). + ### Order for this list doesn't matter. + ### + ### Note: The root token should be represented with the string "ROOT" + ### + + + ### END YOUR CODE + + + def parse_step(self, transition): + """Performs a single parse step by applying the given transition to this partial parse + + @param transition (str): A string that equals "S", "LA", or "RA" representing the shift, + left-arc, and right-arc transitions. You can assume the provided + transition is a legal transition. + """ + ### YOUR CODE HERE (~7-10 Lines) + ### TODO: + ### Implement a single parsing step, i.e. the logic for the following as + ### described in the pdf handout: + ### 1. Shift + ### 2. Left Arc + ### 3. Right Arc + + + ### END YOUR CODE + + def parse(self, transitions): + """Applies the provided transitions to this PartialParse + + @param transitions (list of str): The list of transitions in the order they should be applied + + @return dsependencies (list of string tuples): The list of dependencies produced when + parsing the sentence. Represented as a list of + tuples where each tuple is of the form (head, dependent). + """ + for transition in transitions: + self.parse_step(transition) + return self.dependencies + + +def minibatch_parse(sentences, model, batch_size): + """Parses a list of sentences in minibatches using a model. + + @param sentences (list of list of str): A list of sentences to be parsed + (each sentence is a list of words and each word is of type string) + @param model (ParserModel): The model that makes parsing decisions. It is assumed to have a function + model.predict(partial_parses) that takes in a list of PartialParses as input and + returns a list of transitions predicted for each parse. That is, after calling + transitions = model.predict(partial_parses) + transitions[i] will be the next transition to apply to partial_parses[i]. + @param batch_size (int): The number of PartialParses to include in each minibatch + + + @return dependencies (list of dependency lists): A list where each element is the dependencies + list for a parsed sentence. Ordering should be the + same as in sentences (i.e., dependencies[i] should + contain the parse for sentences[i]). + """ + dependencies = [] + + ### YOUR CODE HERE (~8-10 Lines) + ### TODO: + ### Implement the minibatch parse algorithm as described in the pdf handout + ### + ### Note: A shallow copy (as denoted in the PDF) can be made with the "=" sign in python, e.g. + ### unfinished_parses = partial_parses[:]. + ### Here `unfinished_parses` is a shallow copy of `partial_parses`. + ### In Python, a shallow copied list like `unfinished_parses` does not contain new instances + ### of the object stored in `partial_parses`. Rather both lists refer to the same objects. + ### In our case, `partial_parses` contains a list of partial parses. `unfinished_parses` + ### contains references to the same objects. Thus, you should NOT use the `del` operator + ### to remove objects from the `unfinished_parses` list. 
This will free the underlying memory that + ### is being accessed by `partial_parses` and may cause your code to crash. + + + ### END YOUR CODE + + return dependencies + + +def test_step(name, transition, stack, buf, deps, + ex_stack, ex_buf, ex_deps): + """Tests that a single parse step returns the expected output""" + pp = PartialParse([]) + pp.stack, pp.buffer, pp.dependencies = stack, buf, deps + + pp.parse_step(transition) + stack, buf, deps = (tuple(pp.stack), tuple(pp.buffer), tuple(sorted(pp.dependencies))) + assert stack == ex_stack, \ + "{:} test resulted in stack {:}, expected {:}".format(name, stack, ex_stack) + assert buf == ex_buf, \ + "{:} test resulted in buffer {:}, expected {:}".format(name, buf, ex_buf) + assert deps == ex_deps, \ + "{:} test resulted in dependency list {:}, expected {:}".format(name, deps, ex_deps) + print("{:} test passed!".format(name)) + + +def test_parse_step(): + """Simple tests for the PartialParse.parse_step function + Warning: these are not exhaustive + """ + test_step("SHIFT", "S", ["ROOT", "the"], ["cat", "sat"], [], + ("ROOT", "the", "cat"), ("sat",), ()) + test_step("LEFT-ARC", "LA", ["ROOT", "the", "cat"], ["sat"], [], + ("ROOT", "cat",), ("sat",), (("cat", "the"),)) + test_step("RIGHT-ARC", "RA", ["ROOT", "run", "fast"], [], [], + ("ROOT", "run",), (), (("run", "fast"),)) + + +def test_parse(): + """Simple tests for the PartialParse.parse function + Warning: these are not exhaustive + """ + sentence = ["parse", "this", "sentence"] + dependencies = PartialParse(sentence).parse(["S", "S", "S", "LA", "RA", "RA"]) + dependencies = tuple(sorted(dependencies)) + expected = (('ROOT', 'parse'), ('parse', 'sentence'), ('sentence', 'this')) + assert dependencies == expected, \ + "parse test resulted in dependencies {:}, expected {:}".format(dependencies, expected) + assert tuple(sentence) == ("parse", "this", "sentence"), \ + "parse test failed: the input sentence should not be modified" + print("parse test passed!") + + +class DummyModel(object): + """Dummy model for testing the minibatch_parse function + """ + def __init__(self, mode = "unidirectional"): + self.mode = mode + + def predict(self, partial_parses): + if self.mode == "unidirectional": + return self.unidirectional_predict(partial_parses) + elif self.mode == "interleave": + return self.interleave_predict(partial_parses) + else: + raise NotImplementedError() + + def unidirectional_predict(self, partial_parses): + """First shifts everything onto the stack and then does exclusively right arcs if the first word of + the sentence is "right", "left" if otherwise. + """ + return [("RA" if pp.stack[1] is "right" else "LA") if len(pp.buffer) == 0 else "S" + for pp in partial_parses] + + def interleave_predict(self, partial_parses): + """First shifts everything onto the stack and then interleaves "right" and "left". 
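For reference, the blanks in this file (the fields set up in PartialParse.__init__, parse_step, and minibatch_parse) admit an implementation along the lines of the standalone sketch below. The names carry a Sketch suffix to signal that this is illustrative rather than the graded solution; the transition semantics follow the comments and tests in this file.

class PartialParseSketch:
    def __init__(self, sentence):
        self.sentence = sentence
        self.stack = ["ROOT"]           # top of stack = last list element
        self.buffer = list(sentence)    # first buffer word = first list element
        self.dependencies = []          # (head, dependent) tuples

    def parse_step(self, transition):
        if transition == "S":                                  # shift
            self.stack.append(self.buffer.pop(0))
        elif transition == "LA":                               # left-arc
            dependent = self.stack.pop(-2)
            self.dependencies.append((self.stack[-1], dependent))
        elif transition == "RA":                               # right-arc
            dependent = self.stack.pop()
            self.dependencies.append((self.stack[-1], dependent))


def minibatch_parse_sketch(sentences, model, batch_size):
    partial_parses = [PartialParseSketch(s) for s in sentences]
    unfinished = partial_parses[:]                             # shallow copy, no `del`
    while unfinished:
        batch = unfinished[:batch_size]
        for pp, transition in zip(batch, model.predict(batch)):
            pp.parse_step(transition)
        # A parse is finished once its buffer is empty and only ROOT remains on the stack.
        unfinished = [pp for pp in unfinished
                      if len(pp.buffer) > 0 or len(pp.stack) > 1]
    return [pp.dependencies for pp in partial_parses]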
+ """ + return [("RA" if len(pp.stack) % 2 == 0 else "LA") if len(pp.buffer) == 0 else "S" + for pp in partial_parses] + +def test_dependencies(name, deps, ex_deps): + """Tests the provided dependencies match the expected dependencies""" + deps = tuple(sorted(deps)) + assert deps == ex_deps, \ + "{:} test resulted in dependency list {:}, expected {:}".format(name, deps, ex_deps) + + +def test_minibatch_parse(): + """Simple tests for the minibatch_parse function + Warning: these are not exhaustive + """ + + # Unidirectional arcs test + sentences = [["right", "arcs", "only"], + ["right", "arcs", "only", "again"], + ["left", "arcs", "only"], + ["left", "arcs", "only", "again"]] + deps = minibatch_parse(sentences, DummyModel(), 2) + test_dependencies("minibatch_parse", deps[0], + (('ROOT', 'right'), ('arcs', 'only'), ('right', 'arcs'))) + test_dependencies("minibatch_parse", deps[1], + (('ROOT', 'right'), ('arcs', 'only'), ('only', 'again'), ('right', 'arcs'))) + test_dependencies("minibatch_parse", deps[2], + (('only', 'ROOT'), ('only', 'arcs'), ('only', 'left'))) + test_dependencies("minibatch_parse", deps[3], + (('again', 'ROOT'), ('again', 'arcs'), ('again', 'left'), ('again', 'only'))) + + # Out-of-bound test + sentences = [["right"]] + deps = minibatch_parse(sentences, DummyModel(), 2) + test_dependencies("minibatch_parse", deps[0], (('ROOT', 'right'),)) + + # Mixed arcs test + sentences = [["this", "is", "interleaving", "dependency", "test"]] + deps = minibatch_parse(sentences, DummyModel(mode="interleave"), 1) + test_dependencies("minibatch_parse", deps[0], + (('ROOT', 'is'), ('dependency', 'interleaving'), + ('dependency', 'test'), ('is', 'dependency'), ('is', 'this'))) + print("minibatch_parse test passed!") + + +if __name__ == '__main__': + args = sys.argv + if len(args) != 2: + raise Exception("You did not provide a valid keyword. Either provide 'part_c' or 'part_d', when executing this script") + elif args[1] == "part_c": + test_parse_step() + test_parse() + elif args[1] == "part_d": + test_minibatch_parse() + else: + raise Exception("You did not provide a valid keyword. Either provide 'part_c' or 'part_d', when executing this script") diff --git a/Week 03/Assignment 3/run.py b/Week 03/Assignment 3/run.py new file mode 100755 index 0000000..be7e5f6 --- /dev/null +++ b/Week 03/Assignment 3/run.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +CS224N 2019-20: Homework 3 +run.py: Run the dependency parser. +Sahil Chopra +Haoshen Hong +""" +from datetime import datetime +import os +import pickle +import math +import time +import argparse + +from torch import nn, optim +import torch +from tqdm import tqdm + +from parser_model import ParserModel +from utils.parser_utils import minibatches, load_and_preprocess_data, AverageMeter + +parser = argparse.ArgumentParser(description='Train neural dependency parser in pytorch') +parser.add_argument('-d', '--debug', action='store_true', help='whether to enter debug mode') +args = parser.parse_args() + +# ----------------- +# Primary Functions +# ----------------- +def train(parser, train_data, dev_data, output_path, batch_size=1024, n_epochs=10, lr=0.0005): + """ Train the neural dependency parser. + + @param parser (Parser): Neural Dependency Parser + @param train_data (): + @param dev_data (): + @param output_path (str): Path to which model weights and results are written. 
+ @param batch_size (int): Number of examples in a single batch + @param n_epochs (int): Number of training epochs + @param lr (float): Learning rate + """ + best_dev_UAS = 0 + + + ### YOUR CODE HERE (~2-7 lines) + ### TODO: + ### 1) Construct Adam Optimizer in variable `optimizer` + ### 2) Construct the Cross Entropy Loss Function in variable `loss_func` with `mean` + ### reduction (default) + ### + ### Hint: Use `parser.model.parameters()` to pass optimizer + ### necessary parameters to tune. + ### Please see the following docs for support: + ### Adam Optimizer: https://pytorch.org/docs/stable/optim.html + ### Cross Entropy Loss: https://pytorch.org/docs/stable/nn.html#crossentropyloss + + + + ### END YOUR CODE + + for epoch in range(n_epochs): + print("Epoch {:} out of {:}".format(epoch + 1, n_epochs)) + dev_UAS = train_for_epoch(parser, train_data, dev_data, optimizer, loss_func, batch_size) + if dev_UAS > best_dev_UAS: + best_dev_UAS = dev_UAS + print("New best dev UAS! Saving model.") + torch.save(parser.model.state_dict(), output_path) + print("") + + +def train_for_epoch(parser, train_data, dev_data, optimizer, loss_func, batch_size): + """ Train the neural dependency parser for single epoch. + + Note: In PyTorch we can signify train versus test and automatically have + the Dropout Layer applied and removed, accordingly, by specifying + whether we are training, `model.train()`, or evaluating, `model.eval()` + + @param parser (Parser): Neural Dependency Parser + @param train_data (): + @param dev_data (): + @param optimizer (nn.Optimizer): Adam Optimizer + @param loss_func (nn.CrossEntropyLoss): Cross Entropy Loss Function + @param batch_size (int): batch size + + @return dev_UAS (float): Unlabeled Attachment Score (UAS) for dev data + """ + parser.model.train() # Places model in "train" mode, i.e. apply dropout layer + n_minibatches = math.ceil(len(train_data) / batch_size) + loss_meter = AverageMeter() + + with tqdm(total=(n_minibatches)) as prog: + for i, (train_x, train_y) in enumerate(minibatches(train_data, batch_size)): + optimizer.zero_grad() # remove any baggage in the optimizer + loss = 0. # store loss for this batch here + train_x = torch.from_numpy(train_x).long() + train_y = torch.from_numpy(train_y.nonzero()[1]).long() + + ### YOUR CODE HERE (~5-10 lines) + ### TODO: + ### 1) Run train_x forward through model to produce `logits` + ### 2) Use the `loss_func` parameter to apply the PyTorch CrossEntropyLoss function. + ### This will take `logits` and `train_y` as inputs. It will output the CrossEntropyLoss + ### between softmax(`logits`) and `train_y`. Remember that softmax(`logits`) + ### are the predictions (y^ from the PDF). + ### 3) Backprop losses + ### 4) Take step with the optimizer + ### Please see the following docs for support: + ### Optimizer Step: https://pytorch.org/docs/stable/optim.html#optimizer-step + + + + + ### END YOUR CODE + prog.update(1) + loss_meter.update(loss.item()) + + print ("Average Train Loss: {}".format(loss_meter.avg)) + + print("Evaluating on dev set",) + parser.model.eval() # Places model in "eval" mode, i.e. 
don't apply dropout layer + dev_UAS, _ = parser.parse(dev_data) + print("- dev UAS: {:.2f}".format(dev_UAS * 100.0)) + return dev_UAS + + +if __name__ == "__main__": + debug = args.debug + + assert (torch.__version__.split(".") >= ["1", "0", "0"]), "Please install torch version >= 1.0.0" + + print(80 * "=") + print("INITIALIZING") + print(80 * "=") + parser, embeddings, train_data, dev_data, test_data = load_and_preprocess_data(debug) + + start = time.time() + model = ParserModel(embeddings) + parser.model = model + print("took {:.2f} seconds\n".format(time.time() - start)) + + print(80 * "=") + print("TRAINING") + print(80 * "=") + output_dir = "results/{:%Y%m%d_%H%M%S}/".format(datetime.now()) + output_path = output_dir + "model.weights" + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + train(parser, train_data, dev_data, output_path, batch_size=1024, n_epochs=10, lr=0.0005) + + if not debug: + print(80 * "=") + print("TESTING") + print(80 * "=") + print("Restoring the best model weights found on the dev set") + parser.model.load_state_dict(torch.load(output_path)) + print("Final evaluation on test set",) + parser.model.eval() + UAS, dependencies = parser.parse(test_data) + print("- test UAS: {:.2f}".format(UAS * 100.0)) + print("Done!") diff --git a/Week 03/Assignment 3/utils/__init__.py b/Week 03/Assignment 3/utils/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/Week 03/Assignment 3/utils/general_utils.py b/Week 03/Assignment 3/utils/general_utils.py new file mode 100755 index 0000000..5940ce7 --- /dev/null +++ b/Week 03/Assignment 3/utils/general_utils.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +CS224N 2018-19: Homework 3 +general_utils.py: General purpose utilities. +Sahil Chopra +""" + +import sys +import time +import numpy as np + + +def get_minibatches(data, minibatch_size, shuffle=True): + """ + Iterates through the provided data one minibatch at at time. You can use this function to + iterate through data in minibatches as follows: + + for inputs_minibatch in get_minibatches(inputs, minibatch_size): + ... + + Or with multiple data sources: + + for inputs_minibatch, labels_minibatch in get_minibatches([inputs, labels], minibatch_size): + ... + + Args: + data: there are two possible values: + - a list or numpy array + - a list where each element is either a list or numpy array + minibatch_size: the maximum number of items in a minibatch + shuffle: whether to randomize the order of returned data + Returns: + minibatches: the return value depends on data: + - If data is a list/array it yields the next minibatch of data. + - If data a list of lists/arrays it returns the next minibatch of each element in the + list. This can be used to iterate through multiple data sources + (e.g., features and labels) at the same time. 
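Returning to run.py, the two YOUR CODE HERE blocks in train() and train_for_epoch() boil down to standard PyTorch training boilerplate. One reasonable completion is sketched below; variable names follow the skeleton, and this is illustrative rather than the official solution.

import torch
from torch import nn, optim

def train_step_sketch(model, optimizer, loss_func, train_x, train_y):
    """One minibatch update, shaped like the loop body in train_for_epoch()."""
    optimizer.zero_grad()
    logits = model(train_x)             # (batch_size, n_classes), raw scores
    loss = loss_func(logits, train_y)   # CrossEntropyLoss applies log-softmax itself
    loss.backward()
    optimizer.step()
    return loss

# Setup as described in train():
#   optimizer = optim.Adam(parser.model.parameters(), lr=lr)
#   loss_func = nn.CrossEntropyLoss()   # 'mean' reduction by default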
+ + """ + list_data = type(data) is list and (type(data[0]) is list or type(data[0]) is np.ndarray) + data_size = len(data[0]) if list_data else len(data) + indices = np.arange(data_size) + if shuffle: + np.random.shuffle(indices) + for minibatch_start in np.arange(0, data_size, minibatch_size): + minibatch_indices = indices[minibatch_start:minibatch_start + minibatch_size] + yield [_minibatch(d, minibatch_indices) for d in data] if list_data \ + else _minibatch(data, minibatch_indices) + + +def _minibatch(data, minibatch_idx): + return data[minibatch_idx] if type(data) is np.ndarray else [data[i] for i in minibatch_idx] + + +def test_all_close(name, actual, expected): + if actual.shape != expected.shape: + raise ValueError("{:} failed, expected output to have shape {:} but has shape {:}" + .format(name, expected.shape, actual.shape)) + if np.amax(np.fabs(actual - expected)) > 1e-6: + raise ValueError("{:} failed, expected {:} but value is {:}".format(name, expected, actual)) + else: + print(name, "passed!") diff --git a/Week 03/Assignment 3/utils/parser_utils.py b/Week 03/Assignment 3/utils/parser_utils.py new file mode 100755 index 0000000..c559a58 --- /dev/null +++ b/Week 03/Assignment 3/utils/parser_utils.py @@ -0,0 +1,422 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +CS224N 2018-19: Homework 3 +parser_utils.py: Utilities for training the dependency parser. +Sahil Chopra +""" + +import time +import os +import logging +from collections import Counter +from . general_utils import get_minibatches +from parser_transitions import minibatch_parse + +from tqdm import tqdm +import torch +import numpy as np + +P_PREFIX = '
<p>
:' +L_PREFIX = ':' +UNK = '' +NULL = '' +ROOT = '' + + +class Config(object): + language = 'english' + with_punct = True + unlabeled = True + lowercase = True + use_pos = True + use_dep = True + use_dep = use_dep and (not unlabeled) + data_path = './data' + train_file = 'train.conll' + dev_file = 'dev.conll' + test_file = 'test.conll' + embedding_file = './data/en-cw.txt' + + +class Parser(object): + """Contains everything needed for transition-based dependency parsing except for the model""" + + def __init__(self, dataset): + root_labels = list([l for ex in dataset + for (h, l) in zip(ex['head'], ex['label']) if h == 0]) + counter = Counter(root_labels) + if len(counter) > 1: + logging.info('Warning: more than one root label') + logging.info(counter) + self.root_label = counter.most_common()[0][0] + deprel = [self.root_label] + list(set([w for ex in dataset + for w in ex['label'] + if w != self.root_label])) + tok2id = {L_PREFIX + l: i for (i, l) in enumerate(deprel)} + tok2id[L_PREFIX + NULL] = self.L_NULL = len(tok2id) + + config = Config() + self.unlabeled = config.unlabeled + self.with_punct = config.with_punct + self.use_pos = config.use_pos + self.use_dep = config.use_dep + self.language = config.language + + if self.unlabeled: + trans = ['L', 'R', 'S'] + self.n_deprel = 1 + else: + trans = ['L-' + l for l in deprel] + ['R-' + l for l in deprel] + ['S'] + self.n_deprel = len(deprel) + + self.n_trans = len(trans) + self.tran2id = {t: i for (i, t) in enumerate(trans)} + self.id2tran = {i: t for (i, t) in enumerate(trans)} + + # logging.info('Build dictionary for part-of-speech tags.') + tok2id.update(build_dict([P_PREFIX + w for ex in dataset for w in ex['pos']], + offset=len(tok2id))) + tok2id[P_PREFIX + UNK] = self.P_UNK = len(tok2id) + tok2id[P_PREFIX + NULL] = self.P_NULL = len(tok2id) + tok2id[P_PREFIX + ROOT] = self.P_ROOT = len(tok2id) + + # logging.info('Build dictionary for words.') + tok2id.update(build_dict([w for ex in dataset for w in ex['word']], + offset=len(tok2id))) + tok2id[UNK] = self.UNK = len(tok2id) + tok2id[NULL] = self.NULL = len(tok2id) + tok2id[ROOT] = self.ROOT = len(tok2id) + + self.tok2id = tok2id + self.id2tok = {v: k for (k, v) in tok2id.items()} + + self.n_features = 18 + (18 if config.use_pos else 0) + (12 if config.use_dep else 0) + self.n_tokens = len(tok2id) + + def vectorize(self, examples): + vec_examples = [] + for ex in examples: + word = [self.ROOT] + [self.tok2id[w] if w in self.tok2id + else self.UNK for w in ex['word']] + pos = [self.P_ROOT] + [self.tok2id[P_PREFIX + w] if P_PREFIX + w in self.tok2id + else self.P_UNK for w in ex['pos']] + head = [-1] + ex['head'] + label = [-1] + [self.tok2id[L_PREFIX + w] if L_PREFIX + w in self.tok2id + else -1 for w in ex['label']] + vec_examples.append({'word': word, 'pos': pos, + 'head': head, 'label': label}) + return vec_examples + + def extract_features(self, stack, buf, arcs, ex): + if stack[0] == "ROOT": + stack[0] = 0 + + def get_lc(k): + return sorted([arc[1] for arc in arcs if arc[0] == k and arc[1] < k]) + + def get_rc(k): + return sorted([arc[1] for arc in arcs if arc[0] == k and arc[1] > k], + reverse=True) + + p_features = [] + l_features = [] + features = [self.NULL] * (3 - len(stack)) + [ex['word'][x] for x in stack[-3:]] + features += [ex['word'][x] for x in buf[:3]] + [self.NULL] * (3 - len(buf)) + if self.use_pos: + p_features = [self.P_NULL] * (3 - len(stack)) + [ex['pos'][x] for x in stack[-3:]] + p_features += [ex['pos'][x] for x in buf[:3]] + [self.P_NULL] * (3 - len(buf)) + + for 
i in range(2): + if i < len(stack): + k = stack[-i-1] + lc = get_lc(k) + rc = get_rc(k) + llc = get_lc(lc[0]) if len(lc) > 0 else [] + rrc = get_rc(rc[0]) if len(rc) > 0 else [] + + features.append(ex['word'][lc[0]] if len(lc) > 0 else self.NULL) + features.append(ex['word'][rc[0]] if len(rc) > 0 else self.NULL) + features.append(ex['word'][lc[1]] if len(lc) > 1 else self.NULL) + features.append(ex['word'][rc[1]] if len(rc) > 1 else self.NULL) + features.append(ex['word'][llc[0]] if len(llc) > 0 else self.NULL) + features.append(ex['word'][rrc[0]] if len(rrc) > 0 else self.NULL) + + if self.use_pos: + p_features.append(ex['pos'][lc[0]] if len(lc) > 0 else self.P_NULL) + p_features.append(ex['pos'][rc[0]] if len(rc) > 0 else self.P_NULL) + p_features.append(ex['pos'][lc[1]] if len(lc) > 1 else self.P_NULL) + p_features.append(ex['pos'][rc[1]] if len(rc) > 1 else self.P_NULL) + p_features.append(ex['pos'][llc[0]] if len(llc) > 0 else self.P_NULL) + p_features.append(ex['pos'][rrc[0]] if len(rrc) > 0 else self.P_NULL) + + if self.use_dep: + l_features.append(ex['label'][lc[0]] if len(lc) > 0 else self.L_NULL) + l_features.append(ex['label'][rc[0]] if len(rc) > 0 else self.L_NULL) + l_features.append(ex['label'][lc[1]] if len(lc) > 1 else self.L_NULL) + l_features.append(ex['label'][rc[1]] if len(rc) > 1 else self.L_NULL) + l_features.append(ex['label'][llc[0]] if len(llc) > 0 else self.L_NULL) + l_features.append(ex['label'][rrc[0]] if len(rrc) > 0 else self.L_NULL) + else: + features += [self.NULL] * 6 + if self.use_pos: + p_features += [self.P_NULL] * 6 + if self.use_dep: + l_features += [self.L_NULL] * 6 + + features += p_features + l_features + assert len(features) == self.n_features + return features + + def get_oracle(self, stack, buf, ex): + if len(stack) < 2: + return self.n_trans - 1 + + i0 = stack[-1] + i1 = stack[-2] + h0 = ex['head'][i0] + h1 = ex['head'][i1] + l0 = ex['label'][i0] + l1 = ex['label'][i1] + + if self.unlabeled: + if (i1 > 0) and (h1 == i0): + return 0 + elif (i1 >= 0) and (h0 == i1) and \ + (not any([x for x in buf if ex['head'][x] == i0])): + return 1 + else: + return None if len(buf) == 0 else 2 + else: + if (i1 > 0) and (h1 == i0): + return l1 if (l1 >= 0) and (l1 < self.n_deprel) else None + elif (i1 >= 0) and (h0 == i1) and \ + (not any([x for x in buf if ex['head'][x] == i0])): + return l0 + self.n_deprel if (l0 >= 0) and (l0 < self.n_deprel) else None + else: + return None if len(buf) == 0 else self.n_trans - 1 + + def create_instances(self, examples): + all_instances = [] + succ = 0 + for id, ex in enumerate(examples): + n_words = len(ex['word']) - 1 + + # arcs = {(h, t, label)} + stack = [0] + buf = [i + 1 for i in range(n_words)] + arcs = [] + instances = [] + for i in range(n_words * 2): + gold_t = self.get_oracle(stack, buf, ex) + if gold_t is None: + break + legal_labels = self.legal_labels(stack, buf) + assert legal_labels[gold_t] == 1 + instances.append((self.extract_features(stack, buf, arcs, ex), + legal_labels, gold_t)) + if gold_t == self.n_trans - 1: + stack.append(buf[0]) + buf = buf[1:] + elif gold_t < self.n_deprel: + arcs.append((stack[-1], stack[-2], gold_t)) + stack = stack[:-2] + [stack[-1]] + else: + arcs.append((stack[-2], stack[-1], gold_t - self.n_deprel)) + stack = stack[:-1] + else: + succ += 1 + all_instances += instances + + return all_instances + + def legal_labels(self, stack, buf): + labels = ([1] if len(stack) > 2 else [0]) * self.n_deprel + labels += ([1] if len(stack) >= 2 else [0]) * self.n_deprel + labels += [1] if 
len(buf) > 0 else [0] + return labels + + def parse(self, dataset, eval_batch_size=5000): + sentences = [] + sentence_id_to_idx = {} + for i, example in enumerate(dataset): + n_words = len(example['word']) - 1 + sentence = [j + 1 for j in range(n_words)] + sentences.append(sentence) + sentence_id_to_idx[id(sentence)] = i + + model = ModelWrapper(self, dataset, sentence_id_to_idx) + dependencies = minibatch_parse(sentences, model, eval_batch_size) + + UAS = all_tokens = 0.0 + with tqdm(total=len(dataset)) as prog: + for i, ex in enumerate(dataset): + head = [-1] * len(ex['word']) + for h, t, in dependencies[i]: + head[t] = h + for pred_h, gold_h, gold_l, pos in \ + zip(head[1:], ex['head'][1:], ex['label'][1:], ex['pos'][1:]): + assert self.id2tok[pos].startswith(P_PREFIX) + pos_str = self.id2tok[pos][len(P_PREFIX):] + if (self.with_punct) or (not punct(self.language, pos_str)): + UAS += 1 if pred_h == gold_h else 0 + all_tokens += 1 + prog.update(i + 1) + UAS /= all_tokens + return UAS, dependencies + + +class ModelWrapper(object): + def __init__(self, parser, dataset, sentence_id_to_idx): + self.parser = parser + self.dataset = dataset + self.sentence_id_to_idx = sentence_id_to_idx + + def predict(self, partial_parses): + mb_x = [self.parser.extract_features(p.stack, p.buffer, p.dependencies, + self.dataset[self.sentence_id_to_idx[id(p.sentence)]]) + for p in partial_parses] + mb_x = np.array(mb_x).astype('int32') + mb_x = torch.from_numpy(mb_x).long() + mb_l = [self.parser.legal_labels(p.stack, p.buffer) for p in partial_parses] + + pred = self.parser.model(mb_x) + pred = pred.detach().numpy() + pred = np.argmax(pred + 10000 * np.array(mb_l).astype('float32'), 1) + pred = ["S" if p == 2 else ("LA" if p == 0 else "RA") for p in pred] + return pred + + +def read_conll(in_file, lowercase=False, max_example=None): + examples = [] + with open(in_file) as f: + word, pos, head, label = [], [], [], [] + for line in f.readlines(): + sp = line.strip().split('\t') + if len(sp) == 10: + if '-' not in sp[0]: + word.append(sp[1].lower() if lowercase else sp[1]) + pos.append(sp[4]) + head.append(int(sp[6])) + label.append(sp[7]) + elif len(word) > 0: + examples.append({'word': word, 'pos': pos, 'head': head, 'label': label}) + word, pos, head, label = [], [], [], [] + if (max_example is not None) and (len(examples) == max_example): + break + if len(word) > 0: + examples.append({'word': word, 'pos': pos, 'head': head, 'label': label}) + return examples + + +def build_dict(keys, n_max=None, offset=0): + count = Counter() + for key in keys: + count[key] += 1 + ls = count.most_common() if n_max is None \ + else count.most_common(n_max) + + return {w[0]: index + offset for (index, w) in enumerate(ls)} + + +def punct(language, pos): + if language == 'english': + return pos in ["''", ",", ".", ":", "``", "-LRB-", "-RRB-"] + elif language == 'chinese': + return pos == 'PU' + elif language == 'french': + return pos == 'PUNC' + elif language == 'german': + return pos in ["$.", "$,", "$["] + elif language == 'spanish': + # http://nlp.stanford.edu/software/spanish-faq.shtml + return pos in ["f0", "faa", "fat", "fc", "fd", "fe", "fg", "fh", + "fia", "fit", "fp", "fpa", "fpt", "fs", "ft", + "fx", "fz"] + elif language == 'universal': + return pos == 'PUNCT' + else: + raise ValueError('language: %s is not supported.' 
% language) + + +def minibatches(data, batch_size): + x = np.array([d[0] for d in data]) + y = np.array([d[2] for d in data]) + one_hot = np.zeros((y.size, 3)) + one_hot[np.arange(y.size), y] = 1 + return get_minibatches([x, one_hot], batch_size) + + +def load_and_preprocess_data(reduced=True): + config = Config() + + print("Loading data...",) + start = time.time() + train_set = read_conll(os.path.join(config.data_path, config.train_file), + lowercase=config.lowercase) + dev_set = read_conll(os.path.join(config.data_path, config.dev_file), + lowercase=config.lowercase) + test_set = read_conll(os.path.join(config.data_path, config.test_file), + lowercase=config.lowercase) + if reduced: + train_set = train_set[:1000] + dev_set = dev_set[:500] + test_set = test_set[:500] + print("took {:.2f} seconds".format(time.time() - start)) + + print("Building parser...",) + start = time.time() + parser = Parser(train_set) + print("took {:.2f} seconds".format(time.time() - start)) + + print("Loading pretrained embeddings...",) + start = time.time() + word_vectors = {} + for line in open(config.embedding_file).readlines(): + sp = line.strip().split() + word_vectors[sp[0]] = [float(x) for x in sp[1:]] + embeddings_matrix = np.asarray(np.random.normal(0, 0.9, (parser.n_tokens, 50)), dtype='float32') + + for token in parser.tok2id: + i = parser.tok2id[token] + if token in word_vectors: + embeddings_matrix[i] = word_vectors[token] + elif token.lower() in word_vectors: + embeddings_matrix[i] = word_vectors[token.lower()] + print("took {:.2f} seconds".format(time.time() - start)) + + print("Vectorizing data...",) + start = time.time() + train_set = parser.vectorize(train_set) + dev_set = parser.vectorize(dev_set) + test_set = parser.vectorize(test_set) + print("took {:.2f} seconds".format(time.time() - start)) + + print("Preprocessing training data...",) + start = time.time() + train_examples = parser.create_instances(train_set) + print("took {:.2f} seconds".format(time.time() - start)) + + return parser, embeddings_matrix, train_examples, dev_set, test_set, + + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +if __name__ == '__main__': + pass diff --git a/Week 03/Linguistic Structure: Dependency Parsing/README.md b/Week 03/Linguistic Structure: Dependency Parsing/README.md new file mode 100644 index 0000000..e57f1fc --- /dev/null +++ b/Week 03/Linguistic Structure: Dependency Parsing/README.md @@ -0,0 +1,11 @@ +# Linguistic Structure: Dependency Parsing + +- [Slide](http://web.stanford.edu/class/cs224n/slides/cs224n-2020-lecture05-dep-parsing.pdf) +- [Note](http://web.stanford.edu/class/cs224n/readings/cs224n-2019-notes04-dependencyparsing.pdf) +- Suggested Readings: + 1. [Incrementality in Deterministic Dependency Parsing](https://www.aclweb.org/anthology/W/W04/W04-0308.pdf) + 2. [A Fast and Accurate Dependency Parser using Neural Networks](https://www.emnlp2014.org/papers/pdf/EMNLP2014082.pdf) + 3. [Dependency Parsing](http://www.morganclaypool.com/doi/abs/10.2200/S00169ED1V01Y200901HLT002) + 4. [Globally Normalized Transition-Based Neural Networks](https://arxiv.org/pdf/1603.06042.pdf) + 5. [Universal Stanford Dependencies: A cross-linguistic typology](http://nlp.stanford.edu/~manning/papers/USD_LREC14_UD_revision.pdf) + 6. 
[Universal Dependencies website](http://universaldependencies.org/) diff --git a/Week 03/Recurrent Neural Networks and Language Models/README.md b/Week 03/Recurrent Neural Networks and Language Models/README.md new file mode 100644 index 0000000..93898c3 --- /dev/null +++ b/Week 03/Recurrent Neural Networks and Language Models/README.md @@ -0,0 +1,9 @@ +# Recurrent Neural Networks and Language Models + +- [Slide](http://web.stanford.edu/class/cs224n/slides/cs224n-2020-lecture06-rnnlm.pdf) +- [Note](http://web.stanford.edu/class/cs224n/readings/cs224n-2019-notes05-LM_RNN.pdf) +- Suggested Readings: + 1. [N-gram Language Models](https://web.stanford.edu/~jurafsky/slp3/3.pdf) + 2. [The Unreasonable Effectiveness of Recurrent Neural Networks](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) + 3. [Sequence Modeling: Recurrent and Recursive Neural Nets](http://www.deeplearningbook.org/contents/rnn.html) (Sections 10.1 and 10.2) + 4. [On Chomsky and the Two Cultures of Statistical Learning](http://norvig.com/chomsky.html)