Commit
Co-authored-by: Hồng Hạnh <[email protected]>
Showing 166 changed files with 918,147 additions and 37 deletions.
@@ -127,3 +127,9 @@ dmypy.json

# Pyre type checker
.pyre/

# Visual Code
**.vscode

# MacOS
**.DS_Store
@@ -0,0 +1,28 @@
Welcome to CS224N!

We'll be using Python throughout the course. If you've got a good Python setup already, great! But make sure that it is at least Python version 3.5. If not, the easiest thing to do is to make sure you have at least 3GB free on your computer, then head over to https://www.anaconda.com/download/ and install the Python 3 version of Anaconda. It will work on any operating system.

After you have installed conda, close any open terminals you might have. Then open a new terminal and run the following commands:

# 1. Create an environment with the dependencies specified in env.yml:

conda env create -f env.yml

# 2. Activate the new environment:

conda activate cs224n

# 3. Inside the new environment, install an IPython kernel so we can use this environment in Jupyter Notebook:

python -m ipykernel install --user --name cs224n

# 4. Homework 1 (only) is a Jupyter Notebook. With the above done, you should be able to get underway by typing:

jupyter notebook exploring_word_vectors.ipynb

# 5. To make sure we are using the right environment, go to the toolbar of exploring_word_vectors.ipynb, click Kernel -> Change kernel, and select cs224n from the drop-down menu.

# To deactivate an active environment, use

conda deactivate
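
Once the kernel is registered, a quick way to verify the setup (these checks are an addition here, not part of the course handout):

# The cs224n environment should appear in the environment list:
conda env list

# The cs224n kernel should appear in the Jupyter kernel list:
jupyter kernelspec list

# Inside the activated environment, the Python version should be 3.5 or newer:
python --version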
@@ -0,0 +1,14 @@
name: cs224n
channels:
  - defaults
  - anaconda
dependencies:
  - jupyter
  - matplotlib
  - numpy
  - python=3.7
  - ipykernel
  - scikit-learn
  - nltk
  - gensim
@@ -0,0 +1,3 @@
# Assignment 2

- [Handout](http://web.stanford.edu/class/cs224n/assignments/a2.pdf)
@@ -0,0 +1,2 @@
rm -f assignment2.zip
zip -r assignment2.zip *.py *.png saved_params_40000.npy
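
This two-line script rebuilds the submission archive from scratch. Assuming it is saved as collect_submission.sh (the filename is not shown in this rendering), it would be run from the assignment directory once training has produced saved_params_40000.npy and the plot:

sh collect_submission.sh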
@@ -0,0 +1,10 @@
name: a2
channels:
  - defaults
  - anaconda
dependencies:
  - jupyter
  - matplotlib
  - numpy
  - python=3.7
  - scikit-learn
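
Mirroring the course-wide setup above, and assuming this file is saved as env.yml in the assignment 2 directory, the environment would be built and activated with:

conda env create -f env.yml
conda activate a2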
@@ -0,0 +1,15 @@
#!/bin/bash

DATASETS_DIR="utils/datasets"
mkdir -p $DATASETS_DIR

cd $DATASETS_DIR

# Get Stanford Sentiment Treebank
if hash wget 2>/dev/null; then
  wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
else
  curl -L http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip -o stanfordSentimentTreebank.zip
fi
unzip stanfordSentimentTreebank.zip
rm stanfordSentimentTreebank.zip
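
The script downloads the Stanford Sentiment Treebank into utils/datasets/ and removes the archive afterwards. Assuming it is saved as get_datasets.sh (the filename is not shown in this rendering), it would be run once from the assignment root before training:

sh get_datasets.sh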
@@ -0,0 +1,75 @@
#!/usr/bin/env python

import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from word2vec import *
from sgd import *

# Check Python Version
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime = time.time()
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) /
     dimVectors, np.zeros((nWords, dimVectors))),
    axis=0)
wordVectors = sgd(
    lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
                                     negSamplingLossAndGradient),
    wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10)
# Note that normalization is not called here. This is not a bug,
# normalizing during training loses the notion of length.

print("sanity check: cost at convergence should be around or below 10")
print("training took %d seconds" % (time.time() - startTime))

# concatenate the input and output word vectors
wordVectors = np.concatenate(
    (wordVectors[:nWords, :], wordVectors[nWords:, :]),
    axis=0)

visualizeWords = [
    "great", "cool", "brilliant", "wonderful", "well", "amazing",
    "worth", "sweet", "enjoyable", "boring", "bad", "dumb",
    "annoying", "female", "male", "queen", "king", "man", "woman", "rain", "snow",
    "hail", "coffee", "tea"]

visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U, S, V = np.linalg.svd(covariance)
coord = temp.dot(U[:, 0:2])

for i in range(len(visualizeWords)):
    plt.text(coord[i, 0], coord[i, 1], visualizeWords[i],
             bbox=dict(facecolor='green', alpha=0.1))

plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))

plt.savefig('word_vectors.png')
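
The projection at the end is PCA done by hand: the selected word vectors are centered, their covariance matrix is formed, and the coordinates along its top two singular directions are kept. For intuition only (this is an illustration, not part of the assignment code), the same coordinates, up to the sign of each axis, can be obtained with scikit-learn, which is already in the environment:

# Equivalent 2-D projection using scikit-learn's PCA (it centers the data internally):
from sklearn.decomposition import PCA
coord_pca = PCA(n_components=2).fit_transform(visualizeVecs)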
@@ -0,0 +1,131 @@
#!/usr/bin/env python

# Save parameters every few SGD iterations as a fail-safe
SAVE_PARAMS_EVERY = 5000

import pickle
import glob
import random
import numpy as np
import os.path as op


def load_saved_params():
    """
    A helper function that loads previously saved parameters and resets
    iteration start.
    """
    st = 0
    for f in glob.glob("saved_params_*.npy"):
        iter = int(op.splitext(op.basename(f))[0].split("_")[2])
        if (iter > st):
            st = iter

    if st > 0:
        params_file = "saved_params_%d.npy" % st
        state_file = "saved_state_%d.pickle" % st
        params = np.load(params_file)
        with open(state_file, "rb") as f:
            state = pickle.load(f)
        return st, params, state
    else:
        return st, None, None


def save_params(iter, params):
    params_file = "saved_params_%d.npy" % iter
    np.save(params_file, params)
    with open("saved_state_%d.pickle" % iter, "wb") as f:
        pickle.dump(random.getstate(), f)


def sgd(f, x0, step, iterations, postprocessing=None, useSaved=False,
        PRINT_EVERY=10):
    """ Stochastic Gradient Descent

    Implement the stochastic gradient descent method in this function.

    Arguments:
    f -- the function to optimize, it should take a single
         argument and yield two outputs, a loss and the gradient
         with respect to the arguments
    x0 -- the initial point to start SGD from
    step -- the step size for SGD
    iterations -- total iterations to run SGD for
    postprocessing -- postprocessing function for the parameters
                      if necessary. In the case of word2vec we will need to
                      normalize the word vectors to have unit length.
    PRINT_EVERY -- specifies how many iterations to output loss

    Return:
    x -- the parameter value after SGD finishes
    """

    # Anneal learning rate every several iterations
    ANNEAL_EVERY = 20000

    if useSaved:
        start_iter, oldx, state = load_saved_params()
        if start_iter > 0:
            x0 = oldx
            step *= 0.5 ** (start_iter / ANNEAL_EVERY)

        if state:
            random.setstate(state)
    else:
        start_iter = 0

    x = x0

    if not postprocessing:
        postprocessing = lambda x: x

    exploss = None

    for iter in range(start_iter + 1, iterations + 1):
        # You might want to print the progress every few iterations.

        loss = None
        ### YOUR CODE HERE (~2 lines)

        ### END YOUR CODE

        x = postprocessing(x)
        if iter % PRINT_EVERY == 0:
            if not exploss:
                exploss = loss
            else:
                exploss = .95 * exploss + .05 * loss
            print("iter %d: %f" % (iter, exploss))

        if iter % SAVE_PARAMS_EVERY == 0 and useSaved:
            save_params(iter, x)

        if iter % ANNEAL_EVERY == 0:
            step *= 0.5

    return x


def sanity_check():
    quad = lambda x: (np.sum(x ** 2), x * 2)

    print("Running sanity checks...")
    t1 = sgd(quad, 0.5, 0.01, 1000, PRINT_EVERY=100)
    print("test 1 result:", t1)
    assert abs(t1) <= 1e-6

    t2 = sgd(quad, 0.0, 0.01, 1000, PRINT_EVERY=100)
    print("test 2 result:", t2)
    assert abs(t2) <= 1e-6

    t3 = sgd(quad, -1.5, 0.01, 1000, PRINT_EVERY=100)
    print("test 3 result:", t3)
    assert abs(t3) <= 1e-6

    print("-" * 40)
    print("ALL TESTS PASSED")
    print("-" * 40)


if __name__ == "__main__":
    sanity_check()
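
The body of the training loop is intentionally left blank in this starter code (the YOUR CODE HERE block), so the file does not run as-is. For reference only, a typical solution for those two lines, not necessarily the official one, evaluates the loss and gradient at the current parameters and takes a plain gradient step:

# Inside the loop, in place of the YOUR CODE HERE block:
loss, grad = f(x)   # f returns (loss, gradient) at the current parameters x
x -= step * grad    # vanilla SGD update with the current (annealed) step size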