From e3006ebf064ffac6db2ca1579beeb07372a5b1c1 Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Sat, 3 Oct 2020 17:19:49 +0800 Subject: [PATCH] prioritized-exp-replay --- README.md | 4 +- .../deeplearning/NeuralNetwork.scala | 31 +++- .../scala/algorithm/reinforcement/PER.scala | 158 ++++++++++++++++++ .../algorithm/reinforcement/PERTest.scala | 68 ++++++++ 4 files changed, 250 insertions(+), 11 deletions(-) create mode 100644 src/main/scala/algorithm/reinforcement/PER.scala create mode 100644 src/test/scala/algorithm/reinforcement/PERTest.scala diff --git a/README.md b/README.md index 6965244..a76096d 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,8 @@ A very light weight Scala machine learning library that provide some basic ML al - [x] Asynchronous Advantage Actor-Critic (A3C) [[Code]](src/main/scala/algorithm/reinforcement/A3C.scala) [[Usage]](src/test/scala/algorithm/reinforcement/A3CTest.scala) +- [x] Prioritized Experience Replay (PER-DQN) [[Code]](src/main/scala/algorithm/reinforcement/PER.scala) [[Usage]](src/test/scala/algorithm/reinforcement/PERTest.scala) + ### Feature Analysis : - [x] Student-T Test [[Code]](src/main/scala/algorithm/analysis/StudentT.scala) [[Usage]](src/test/scala/algorithm/analysis/StudentTTest.scala) @@ -125,8 +127,6 @@ A very light weight Scala machine learning library that provide some basic ML al ## TODO -- [ ] Polarize Experience Replay - Deep Reinforcement Learning - - [ ] Rainbow - Deep Reinforcement Learning - [ ] Alpha-go Zero (MCTS-NN) - Deep Reinforcement Learning diff --git a/src/main/scala/algorithm/deeplearning/NeuralNetwork.scala b/src/main/scala/algorithm/deeplearning/NeuralNetwork.scala index db0ee54..a86b7b4 100644 --- a/src/main/scala/algorithm/deeplearning/NeuralNetwork.scala +++ b/src/main/scala/algorithm/deeplearning/NeuralNetwork.scala @@ -28,6 +28,7 @@ class Node( var output: Double = 0.0 /** Error derivative with respect to this node's output. */ var outputDer: Double = 0.0 + var rawOutputDer: Double = 0.0 /** Error derivative with respect to this node's total input. */ var inputDer: Double = 0.0 /** @@ -264,15 +265,17 @@ class NeuralNetwork { */ def backProp( targets: Array[Double], - errorFunc: ErrorFunction = SQUARE + errorFunc: ErrorFunction = SQUARE, + _outputWeights: Array[Double] = Array.fill[Double](networkShape.last)(1.0) ): Unit = { - val outputNodes = network.last // The output node is a special case. We use the user-defined error // function for the derivative. - for((node, target) <- outputNodes.zip(targets)) { - node.outputDer = errorFunc.der(node.output, target) + for((node, target) <- getOutputNodes.zip(targets)) { + node.rawOutputDer = errorFunc.der(node.output, target) + } + for((node, weight) <- getOutputNodes.zip(_outputWeights)) { + node.outputDer = node.rawOutputDer * weight } - // Go through the layers backwards. for(layerIdx <- network.length - 1 to 1 by -1) { val currentLayer = network(layerIdx) @@ -374,9 +377,13 @@ class NeuralNetwork { def clear() = reset(false) /** Train one inputs to one targets, moved and Modified from Playground. 
*/
-    def trainOne(inputs: Array[Double], targets: Array[Double], errorFunc: ErrorFunction = SQUARE): Unit = {
+    def trainOne(
+        inputs: Array[Double], targets: Array[Double],
+        errorFunc: ErrorFunction = SQUARE,
+        _outputWeights: Array[Double] = Array.fill[Double](networkShape.last)(1.0)
+    ): Unit = {
         forwardProp(inputs)
-        backProp(targets, errorFunc)
+        backProp(targets, errorFunc, _outputWeights)
         if((index - updateIndex + 1) % batchSize == 0) {
             updateIndex = index
             updateWeights()
@@ -388,10 +395,16 @@ class NeuralNetwork {
     def predictOne = forwardProp _
 
     /** Train all data */
-    def train(x: Array[Array[Double]], y: Array[Array[Double]], errorFunc: ErrorFunction = SQUARE, iter: Int = 1, _learningRate: Double = learningRate): Boolean = {
+    def train(
+        x: Array[Array[Double]], y: Array[Array[Double]],
+        errorFunc: ErrorFunction = SQUARE,
+        iter: Int = 1,
+        _learningRate: Double = learningRate,
+        _outputWeights: Array[Double] = Array.fill[Double](networkShape.last)(1.0)
+    ): Boolean = {
         learningRate = _learningRate
         val data = x.zip(y)
-        for(i <- 0 until iter) data.foreach { case (inputs, targets) => trainOne(inputs, targets, errorFunc) }
+        for(i <- 0 until iter) data.foreach { case (inputs, targets) => trainOne(inputs, targets, errorFunc, _outputWeights) }
         true
     }
 
diff --git a/src/main/scala/algorithm/reinforcement/PER.scala b/src/main/scala/algorithm/reinforcement/PER.scala
new file mode 100644
index 0000000..cea7532
--- /dev/null
+++ b/src/main/scala/algorithm/reinforcement/PER.scala
@@ -0,0 +1,158 @@
+// Wei Chen - Prioritized Experience Replay (PER)
+// 2017-08-31
+
+package com.scalaml.algorithm
+
+// nextstate, reward, end = simulator(state, action)
+class PER(
+    val layer_neurons: Array[Int],
+    val initparas: Array[Double],
+    val actnumber: Int,
+    val simulator: (Array[Double], Int) => (Array[Double], Double, Boolean),
+    val batchsize_number: Int = 100,
+    val epsilon_saturation_number: Int = 10000,
+    val train_number: Int = 10,
+    val nn_learning_rate: Double = 0.01,
+    val prior_eps: Double = 1e-6,
+    val alpha: Double = 0.6,
+    var beta: Double = 0.6
+) {
+
+    val nn = new NeuralNetwork()
+    nn.config(initparas.size +: layer_neurons :+ actnumber,
+        _batchSize = batchsize_number, _gradientClipping = true)
+    val ex = new Exp
+
+    class Exp {
+        var c = 0
+        var x = Array[Array[Double]]()
+        var y = Array[Array[Double]]()
+        var max_priority = 1.0
+        var fin_priority = Array[Double]()
+
+        def consume = {
+            val indices = _sample_proportional()
+            var nx = Array[Array[Double]]()
+            var ny = Array[Array[Double]]()
+            var nw = Array[Double]()
+            for (i <- indices) {
+                nx :+= x(i)
+                ny :+= y(i)
+                nw :+= _calculate_weight(i, beta)
+            }
+
+            nn.train(
+                nx, ny,
+                iter = train_number,
+                _learningRate = nn_learning_rate,
+                _outputWeights = nw
+            )
+            // Priorities are error magnitudes, so take the absolute derivative
+            for (node <- nn.getOutputNodes) {
+                max_priority = math.max(max_priority, math.abs(node.rawOutputDer) + prior_eps)
+            }
+
+            x = Array[Array[Double]]()
+            y = Array[Array[Double]]()
+            fin_priority = Array[Double]()
+            c = 0
+        }
+        def add(paras: Array[Double], target: Array[Double]): Unit = {
+            x :+= paras
+            y :+= target
+            fin_priority :+= math.pow(max_priority, alpha)
+            c += 1
+            if (c >= batchsize_number) consume
+        }
+        def end = if (c > 0) consume
+        // Functions for PER
+        def _sample_proportional(): Array[Int] = {
+            // Sample indices based on proportions
+            val indices = new Array[Int](batchsize_number)
+            val p_sum = fin_priority.sum
+            val segment = p_sum / batchsize_number
+            for (i <- 0 until batchsize_number) {
+                val a = segment * i
+                val b = segment * (i + 1)
+                val upperbound = scala.util.Random.nextDouble * (b - a) + a
+                val idx = _retrieve(upperbound)
+                indices(i) = idx
+            }
+            indices
+        }
+        def _retrieve(upperbound: Double): Int = {
+            var a = 0.0
+            var i = 0
+            while (a < upperbound) {
+                a += fin_priority(i)
+                i += 1
+            }
+            i - 1
+        }
+        def _calculate_weight(idx: Int, beta: Double): Double = {
+            // Calculate the weight of the experience at idx
+            // get max weight
+            val p_sum = fin_priority.sum
+            val p_min = fin_priority.min / p_sum
+            val max_weight = math.pow(p_min * c, -beta)
+            // calculate weights
+            val p_sample = fin_priority(idx) / p_sum
+            val weight = math.pow(p_sample * c, -beta)
+            weight / max_weight
+        }
+        def _update_priorities(indices: Array[Int], priorities: Array[Double]): Unit = {
+            // Update priorities of sampled transitions
+            for ((idx, priority) <- indices.zip(priorities)) {
+                fin_priority(idx) = math.pow(priority, alpha)
+                max_priority = math.max(max_priority, priority)
+            }
+        }
+    }
+
+    class DQState (val paras: Array[Double]) {
+        def learn(lr: Double, df: Double, epoch: Int): Double = {
+            val q_s = nn.predictOne(paras)
+            val act = (if (scala.util.Random.nextDouble > epsilon) q_s.zipWithIndex.maxBy(_._1)._2
+                else scala.util.Random.nextInt(actnumber))
+            if (epsilon > 0.1) epsilon -= depsilon
+            val (newparas, newreward, newfinish) = simulator(paras, act)
+            if (epoch > 0 && !newfinish) {
+                val newstate = new DQState(newparas)
+                val gradient = newreward + df * newstate.learn(lr, df, epoch - 1) // max -> a: Q(s+1, a)
+                q_s(act) = (1 - lr) * q_s(act) + lr * gradient
+            } else {
+                q_s(act) = newreward
+            }
+            ex.add(paras, q_s) // nn.train(Array(paras), Array(q_s), batchsize_number, lr)
+            q_s.max
+        }
+        val bestAct: Int = nn.predictOne(paras).zipWithIndex.maxBy(_._1)._2
+    }
+
+    var epsilon = 1.0
+    var depsilon = 0.9 / epsilon_saturation_number
+    var state = new DQState(initparas)
+    def train(number: Int = 1, lr: Double = 0.1, df: Double = 0.6, epoch: Int = 100): Unit = {
+        for (n <- 0 until number) {
+            state.learn(lr, df, epoch)
+            val fraction = math.min(n.toDouble / number, 1.0) // anneal beta towards 1.0
+            beta += fraction * (1.0 - beta)
+        }
+        ex.end
+    }
+    def result(epoch: Int = 100): Array[DQState] = {
+        var paras = initparas
+        var curstate = new DQState(initparas)
+        var arr: Array[DQState] = Array(curstate)
+        var i = 0
+        while (i < epoch) {
+            i += 1
+            val act = curstate.bestAct
+            val (newparas, newreward, newfinish) = simulator(paras, act)
+            if (newfinish) i = epoch
+            paras = newparas
+            curstate = new DQState(newparas)
+            arr :+= curstate
+        }
+        arr
+    }
+}
diff --git a/src/test/scala/algorithm/reinforcement/PERTest.scala b/src/test/scala/algorithm/reinforcement/PERTest.scala
new file mode 100644
index 0000000..f239b92
--- /dev/null
+++ b/src/test/scala/algorithm/reinforcement/PERTest.scala
@@ -0,0 +1,68 @@
+// Wei Chen - Prioritized Experience Replay (PER)
+// 2017-09-01
+
+import com.scalaml.TestData._
+import com.scalaml.algorithm.PER
+import org.scalatest.funsuite.AnyFunSuite
+
+class PERSuite extends AnyFunSuite {
+
+    val learning_rate = 0.1
+    val scale = 1
+    val limit = 10000
+    val epoch = 100
+
+    test("PER Test : Result 1") { // Case 1
+        def simulator(paras: Array[Double], act: Int): (Array[Double], Double, Boolean) = {
+            val links = Map(0 -> Array(1, 2),
+                1 -> Array(3, 4))
+            val scores = Map(2 -> 10.0, 3 -> 0.0, 4 -> 100.0)
+            val atloc = paras.zipWithIndex.maxBy(_._1)._2
+            val moves = links.getOrElse(atloc, Array[Int]())
+            if (moves.size == 0) {
+                null
+            } else {
+                val endloc = moves(act)
+                val result = Array(0.0, 0.0, 0.0, 0.0, 0.0)
+                result(endloc) = 1.0
+                val nextmoves = links.getOrElse(endloc, Array[Int]())
+                (result, scores.getOrElse(endloc, 0.0), nextmoves.size == 0)
+            }
+        }
+
+        val ql = new PER(Array(5, 4), Array(1.0, 0.0, 0.0, 0.0, 0.0), 2, simulator, 10)
+        ql.train(limit, learning_rate, scale, epoch)
+        val result = ql.result(epoch)
+        assert(result.size == 3)
+        assert(result.head.bestAct == 0)
+        assert(result(1).bestAct == 1)
+        assert(result.last.paras.zipWithIndex.maxBy(_._1)._2 == 4)
+    }
+
+    test("PER Test : Result 2") { // Case 2
+        def simulator(paras: Array[Double], act: Int): (Array[Double], Double, Boolean) = {
+            val links = Map(0 -> Array(1, 2),
+                1 -> Array(3, 4))
+            val scores = Map(2 -> 10.0, 3 -> 0.0, 4 -> 12.0)
+            val atloc = paras.zipWithIndex.maxBy(_._1)._2
+            val moves = links.getOrElse(atloc, Array[Int]())
+            if (moves.size == 0) {
+                null
+            } else {
+                val endloc = moves(act)
+                val result = Array(0.0, 0.0, 0.0, 0.0, 0.0)
+                result(endloc) = 1.0
+                val nextmoves = links.getOrElse(endloc, Array[Int]())
+                (result, scores.getOrElse(endloc, 0.0), nextmoves.size == 0)
+            }
+        }
+
+        val ql = new PER(Array(5, 4), Array(1.0, 0.0, 0.0, 0.0, 0.0), 2, simulator, 10)
+        ql.train(limit, learning_rate, scale, epoch)
+        val result = ql.result(epoch)
+        assert(result.size == 3)
+        assert(result.head.bestAct == 0)
+        assert(result(1).bestAct == 1)
+        assert(result.last.paras.zipWithIndex.maxBy(_._1)._2 == 4)
+    }
+}
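
Note on the sampling math (not part of the patch): Exp implements proportional prioritization as in Schaul et al.'s Prioritized Experience Replay. Each stored transition keeps priority p_i^alpha (fin_priority), is drawn with probability P(i) = p_i^alpha / sum_k p_k^alpha (_sample_proportional), and its update is scaled by the importance-sampling weight w_i = (N * P(i))^(-beta), normalized by the largest weight (_calculate_weight), which is what reaches NeuralNetwork.train as _outputWeights. A minimal standalone Scala sketch of that weight computation, using made-up priority values, illustrates the effect:

// Standalone sketch only: recomputes the importance-sampling weights that
// Exp._calculate_weight produces, for a hand-picked set of priorities.
object PERWeightSketch extends App {
    val alpha = 0.6                              // priority exponent (PER default in this patch)
    val beta = 0.6                               // importance-sampling exponent
    val priorities = Array(1.0, 0.5, 2.0, 0.1)   // hypothetical |TD error| + prior_eps values
    val scaled = priorities.map(math.pow(_, alpha))   // p_i^alpha, what add() stores in fin_priority
    val pSum = scaled.sum
    val n = scaled.length
    // w_i = (N * P(i))^(-beta), then divide by the largest weight so weights stay in (0, 1]
    val raw = scaled.map(p => math.pow(p / pSum * n, -beta))
    val weights = raw.map(_ / raw.max)
    // The lowest-priority (rarely sampled) transition keeps weight 1.0, while
    // high-priority transitions are down-weighted to correct the sampling bias.
    println(weights.mkString(", "))
}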