From dfe2eda43d3a1907306cb107d0e01ca8f08f45e4 Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Sat, 5 Mar 2022 15:17:38 +0800 Subject: [PATCH] 2022-03-05-abnormal --- README.md | 6 ++ src/main/scala/algorithm/Algorithm.scala | 2 +- .../scala/algorithm/abnormal/Abnormal.scala | 10 ++++ .../algorithm/abnormal/IsolationForest.scala | 57 +++++++++++++++++++ .../algorithm/abnormal/IsolationTree.scala | 53 +++++++++++++++++ src/test/scala/algorithm/AlgorithmTest.scala | 2 +- .../algorithm/abnormal/AbnormalTest.scala | 32 +++++++++++ .../abnormal/IsolationForestTest.scala | 31 ++++++++++ .../abnormal/IsolationTreeTest.scala | 30 ++++++++++ .../classification/ClassificationTest.scala | 2 +- .../algorithm/clustering/ClusteringTest.scala | 2 +- .../algorithm/regression/RegressionTest.scala | 2 +- 12 files changed, 224 insertions(+), 5 deletions(-) create mode 100644 src/main/scala/algorithm/abnormal/Abnormal.scala create mode 100644 src/main/scala/algorithm/abnormal/IsolationForest.scala create mode 100644 src/main/scala/algorithm/abnormal/IsolationTree.scala create mode 100644 src/test/scala/algorithm/abnormal/AbnormalTest.scala create mode 100644 src/test/scala/algorithm/abnormal/IsolationForestTest.scala create mode 100644 src/test/scala/algorithm/abnormal/IsolationTreeTest.scala diff --git a/README.md b/README.md index ebb57ad..86bde37 100644 --- a/README.md +++ b/README.md @@ -124,6 +124,12 @@ A very light weight Scala machine learning library that provide some basic ML al - [x] One Hot Encoding [[Code]](src/main/scala/algorithm/transformation/OneHot.scala) [[Usage]](src/test/scala/algorithm/transformation/OneHotTest.scala) +### Abnormal Detection : + +- [x] Isolation Tree [[Code]](src/main/scala/algorithm/abnormal/IsolationTree.scala) [[Usage]](src/test/scala/algorithm/abnormal/IsolationTreeTest.scala) + +- [x] Isolation Forest [[Code]](src/main/scala/algorithm/abnormal/IsolationForest.scala) [[Usage]](src/test/scala/algorithm/abnormal/IsolationForestTest.scala) + ## TODO 
diff --git a/src/main/scala/algorithm/Algorithm.scala b/src/main/scala/algorithm/Algorithm.scala
index 394b2ff..f29981e 100644
--- a/src/main/scala/algorithm/Algorithm.scala
+++ b/src/main/scala/algorithm/Algorithm.scala
@@ -7,6 +7,6 @@ trait Algorithm {
     val algotype: String
     val algoname: String
     val version: String
-    def clear: Boolean
+    def clear(): Boolean
     def config(paras: Map[String, Any]): Boolean
 }
\ No newline at end of file
diff --git a/src/main/scala/algorithm/abnormal/Abnormal.scala b/src/main/scala/algorithm/abnormal/Abnormal.scala
new file mode 100644
index 0000000..e33fd87
--- /dev/null
+++ b/src/main/scala/algorithm/abnormal/Abnormal.scala
@@ -0,0 +1,10 @@
+// Wei Chen - Abnormal Detection
+// 2022-03-04
+
+package com.scalaml.algorithm
+
+trait Abnormal extends Algorithm {
+    val algotype: String = "Abnormal"
+    def train(data: Array[Array[Double]]): Boolean
+    def predict(data: Array[Array[Double]]): Array[Double]
+}
\ No newline at end of file
diff --git a/src/main/scala/algorithm/abnormal/IsolationForest.scala b/src/main/scala/algorithm/abnormal/IsolationForest.scala
new file mode 100644
index 0000000..f060d8d
--- /dev/null
+++ b/src/main/scala/algorithm/abnormal/IsolationForest.scala
@@ -0,0 +1,76 @@
+// Wei Chen - Isolation Forest
+// 2022-03-04
+
+package com.scalaml.algorithm
+import com.scalaml.general.MatrixFunc._
+
+// Ensemble of IsolationTrees: the score of a row is the mean of the per-tree predictions.
+class IsolationForest() extends Abnormal {
+    val algoname: String = "IsolationForest"
+    val version: String = "0.1"
+
+    var trees = Array[IsolationTree]()
+    var tree_n = 10 // Number of Trees
+    var sample_n = 10 // Number of Sample Data in a Tree
+    var maxLayer = 5
+
+    // Drop every trained tree and restore the default settings
+    override def clear(): Boolean = {
+        trees = Array[IsolationTree]()
+        tree_n = 10 // Number of Trees
+        sample_n = 10 // Number of Sample Data in a Tree
+        maxLayer = 5
+        true
+    }
+
+    // Read "tree_n" / "sample_n" (with their upper/lower case long aliases) and "maxLayer".
+    // Values must be Doubles; keys that are absent fall back to the defaults.
+    // Returns false, leaving the current settings untouched, when a value has another type.
+    override def config(paras: Map[String, Any]): Boolean = try {
+        val newTreeN = paras.getOrElse("TREE_NUMBER", paras.getOrElse("tree_number", paras.getOrElse("tree_n", 10.0))).asInstanceOf[Double].toInt
+        val newSampleN = paras.getOrElse("SAMPLE_NUMBER", paras.getOrElse("sample_number", paras.getOrElse("sample_n", 10.0))).asInstanceOf[Double].toInt
+        val newMaxLayer = paras.getOrElse("maxLayer", 5.0).asInstanceOf[Double].toInt
+        // Assign only after every value parsed, so a bad entry cannot half-apply the config
+        tree_n = newTreeN
+        sample_n = newSampleN
+        maxLayer = newMaxLayer
+        true
+    } catch { case e: Exception =>
+        Console.err.println(e)
+        false
+    }
+
+    // Random subset of at most sample_n rows, drawn without replacement
+    private def randomSelect(data: Array[Array[Double]], sample_n: Int) =
+        scala.util.Random.shuffle(data.toList).take(sample_n).toArray
+
+    // Train one tree on data and keep it only when both its config and training succeed
+    private def addTree(data: Array[Array[Double]]): Boolean = {
+        val itree = new IsolationTree()
+        val paras = Map("maxLayer" -> maxLayer.toDouble): Map[String, Any]
+        val check = itree.config(paras) && itree.train(data)
+        if(check) trees :+= itree
+        check
+    }
+
+    // Fit tree_n trees on random sample_n-row subsets, or a single tree on all rows when
+    // there are at most sample_n of them. Stops at the first tree that fails and returns false.
+    override def train(data: Array[Array[Double]]): Boolean = {
+        // Start from an empty forest so that retraining replaces the previous trees
+        trees = Array[IsolationTree]()
+        val data_n = data.size
+        if (data_n > sample_n) {
+            (0 until tree_n).forall(i => addTree(randomSelect(data, sample_n)))
+        } else addTree(data)
+    }
+
+    // Mean of the per-tree predictions, one value per row of data
+    override def predict(data: Array[Array[Double]]): Array[Double] = {
+        // Divide by the number of trees actually trained: it differs from tree_n when
+        // the training data was small enough to be covered by a single tree
+        val trained_n = trees.size
+        matrixaccumulate(trees.map { tree =>
+            tree.predict(data)
+        }).map(_ / trained_n)
+    }
+}
diff --git a/src/main/scala/algorithm/abnormal/IsolationTree.scala b/src/main/scala/algorithm/abnormal/IsolationTree.scala
new file mode 100644
index 0000000..83b0e77
--- /dev/null
+++ b/src/main/scala/algorithm/abnormal/IsolationTree.scala
@@ -0,0 +1,61 @@
+// Wei Chen - Isolation Tree
+// 2022-03-04
+
+package com.scalaml.algorithm
+
+// Random partition tree: leaves are created with the layer at which splitting stopped.
+class IsolationTree() extends Abnormal {
+    val algoname: String = "IsolationTree"
+    val version: String = "0.1"
+
+    var maxLayer = 5
+    var tree: DecisionNode = null
+
+    // Drop the trained tree and restore the default depth limit
+    override def clear(): Boolean = {
+        tree = null
+        maxLayer = 5
+        true
+    }
+
+    // Read "maxLayer" (a Double, default 5.0); returns false when the value has another type
+    override def config(paras: Map[String, Any]): Boolean = try {
+        maxLayer = paras.getOrElse("maxLayer", 5.0).asInstanceOf[Double].toInt
+        true
+    } catch { case e: Exception =>
+        Console.err.println(e)
+        false
+    }
+
+    // Split data on a random column at a random value between that column's min and max,
+    // recursing until a split leaves one side empty or maxLayer is reached.
+    private def buildtree(data: Array[Array[Double]], layer: Int = 0): DecisionNode = {
+        val columnSize: Int = data.head.size
+        val col = scala.util.Random.nextInt(columnSize)
+        val colData = data.map(d => d(col))
+        val minV = colData.min
+        val maxV = colData.max
+        val value = (maxV - minV) * scala.util.Random.nextDouble() + minV
+        val (tData, fData) = data.partition { d =>
+            d(col) >= value
+        }
+        if (tData.size > 0 && fData.size > 0 && layer < maxLayer) {
+            val tnode = buildtree(tData, layer + 1)
+            val fnode = buildtree(fData, layer + 1)
+            new DecisionNode(col, value, tnode, fnode)
+        } else new DecisionNode(0, 0, null, null, layer) // Leaf: keeps the layer it was created at
+    }
+
+    // Build the tree from data; returns false (after printing the error) when building throws,
+    // e.g. on empty data or on a row that lacks the randomly chosen column
+    override def train(data: Array[Array[Double]]): Boolean = try {
+        tree = buildtree(data)
+        true
+    } catch { case e: Exception =>
+        Console.err.println(e)
+        false
+    }
+
+    // One prediction per row; train must have succeeded first, otherwise tree is still null
+    override def predict(x: Array[Array[Double]]): Array[Double] = x.map(xi => tree.predict(xi))
+}
\ No newline at end of file
diff --git a/src/test/scala/algorithm/AlgorithmTest.scala b/src/test/scala/algorithm/AlgorithmTest.scala
index 33cfe4f..0d2fee1 100644
--- a/src/test/scala/algorithm/AlgorithmTest.scala
+++ b/src/test/scala/algorithm/AlgorithmTest.scala
@@ -16,7 +16,7 @@ class AlgorithmSuite extends AnyFunSuite {
         class TestAlgo() extends TestType {
             val algoname: String = "TestAlgo"
             val version: String = "TestVersion"
-            override def clear: Boolean = true
+            override def clear(): Boolean = true
             override def config(paras: Map[String, Any]): Boolean = true
             override def testfunc(testinput: Int): Boolean = true
         }
diff --git a/src/test/scala/algorithm/abnormal/AbnormalTest.scala b/src/test/scala/algorithm/abnormal/AbnormalTest.scala
new file mode 100644
index 0000000..9b3aebf
--- /dev/null
+++ b/src/test/scala/algorithm/abnormal/AbnormalTest.scala
@@ -0,0 +1,32 @@
+// Wei Chen - Abnormal Trait Test
+// 2022-03-05
+
+import com.scalaml.algorithm.Abnormal
+import org.scalatest.funsuite.AnyFunSuite
+
+class AbnormalSuite extends AnyFunSuite {
+
+    test("Abnormal Test : Create Sample Algo") {
+
+        class TestAlgo() extends Abnormal {
+            val algoname: String = "TestAlgo"
+            val version: String = "TestVersion"
+            override def clear(): Boolean = true
+            override def config(paras: Map[String, Any]): Boolean = true
+            override def train(data: Array[Array[Double]]): Boolean = true
+            override def predict(data: Array[Array[Double]]): Array[Double] = data.map(_ => 0)
+        }
+
+        val ta = new TestAlgo
+
+        assert(ta.algotype == "Abnormal")
+        assert(ta.algoname == "TestAlgo")
+        assert(ta.version == "TestVersion")
+        assert(ta.clear())
+        assert(ta.config(Map()))
+        assert(ta.train(Array()))
+        assert(ta.predict(Array()).size == 0)
+        assert(ta.predict(Array(Array(1))).head == 0)
+    }
+
+}
diff --git a/src/test/scala/algorithm/abnormal/IsolationForestTest.scala b/src/test/scala/algorithm/abnormal/IsolationForestTest.scala
new file mode 100644
index 0000000..377c561
--- /dev/null
+++ b/src/test/scala/algorithm/abnormal/IsolationForestTest.scala
@@ -0,0 +1,39 @@
+// Wei Chen - Isolation Forest Test
+// 2022-03-05
+
+import com.scalaml.TestData._
+import com.scalaml.general.MatrixFunc._
+import com.scalaml.algorithm.IsolationForest
+import org.scalatest.funsuite.AnyFunSuite
+
+class IsolationForestSuite extends AnyFunSuite {
+
+    val iforest = new IsolationForest()
+
+    test("IsolationForest Test : Clear") {
+        assert(iforest.clear())
+    }
+
+    test("IsolationForest Test : Abnormal Large Data") {
+        assert(iforest.clear())
+        assert(iforest.config(Map("tree_n" -> 100.0)))
+        assert(iforest.train(UNLABELED_LARGE_DATA))
+        val result = iforest.predict(UNLABELED_LARGE_DATA)
+        assert(arraysimilar(result, UNLABELED_LARGE_DATA.map(_ => 1.0), UNLABELED_NONLINEAR_DATA.size))
+        assert(result.last < result.sum / result.size)
+    }
+
+    test("IsolationForest Test : Retrain Replaces Trees") {
+        assert(iforest.clear())
+        val data = Array(Array(1.0, 2.0), Array(3.0, 4.0))
+        assert(iforest.train(data))
+        assert(iforest.train(data))
+        assert(iforest.trees.size == 1)
+    }
+
+    test("IsolationForest Test : Invalid Data") {
+        assert(iforest.clear())
+        assert(!iforest.config(Map("maxLayer" -> "test")))
+        assert(!iforest.train(Array(Array(1, 2), Array())))
+    }
+}
diff --git a/src/test/scala/algorithm/abnormal/IsolationTreeTest.scala b/src/test/scala/algorithm/abnormal/IsolationTreeTest.scala
new file mode 100644
index 0000000..91d78a0
--- /dev/null
+++ b/src/test/scala/algorithm/abnormal/IsolationTreeTest.scala
@@ -0,0 +1,37 @@
+// Wei Chen - Isolation Tree Test
+// 2022-03-05
+
+import com.scalaml.TestData._
+import com.scalaml.general.MatrixFunc._
+import com.scalaml.algorithm.IsolationTree
+import org.scalatest.funsuite.AnyFunSuite
+
+class IsolationTreeSuite extends AnyFunSuite {
+
+    val itree = new IsolationTree()
+
+    test("IsolationTree Test : Clear") {
+        assert(itree.clear())
+    }
+
+    test("IsolationTree Test : Abnormal Large Data") {
+        assert(itree.clear())
+        assert(itree.config(Map[String, Double]()))
+        assert(itree.train(UNLABELED_LARGE_DATA))
+        val result = itree.predict(UNLABELED_LARGE_DATA)
+        assert(arraysimilar(result, UNLABELED_LARGE_DATA.map(_ => 1.0), UNLABELED_NONLINEAR_DATA.size))
+    }
+
+    test("IsolationTree Test : Clear Drops Trained Tree") {
+        assert(itree.train(Array(Array(1.0, 2.0), Array(3.0, 4.0))))
+        assert(itree.tree != null)
+        assert(itree.clear())
+        assert(itree.tree == null)
+    }
+
+    test("IsolationTree Test : Invalid Data") {
+        assert(itree.clear())
+        assert(!itree.config(Map("maxLayer" -> "test")))
+        assert(!itree.train(Array(Array(1, 2), Array())))
+    }
+}
diff --git a/src/test/scala/algorithm/classification/ClassificationTest.scala b/src/test/scala/algorithm/classification/ClassificationTest.scala
index 83385ac..c21b0df 100644
--- a/src/test/scala/algorithm/classification/ClassificationTest.scala
+++ b/src/test/scala/algorithm/classification/ClassificationTest.scala
@@ -11,7 +11,7 @@ class ClassificationSuite extends AnyFunSuite {
         class TestAlgo() extends Classification {
             val algoname: String = "TestAlgo"
             val version: String = "TestVersion"
-            override def clear: Boolean = true
+            override def clear(): Boolean = true
             override def config(paras: Map[String, Any]): Boolean = true
             override def train(data: Array[(Int, Array[Double])]): Boolean = true
             override def predict(data: Array[Array[Double]]): Array[Int] = data.map(_ => 0)
diff --git a/src/test/scala/algorithm/clustering/ClusteringTest.scala b/src/test/scala/algorithm/clustering/ClusteringTest.scala
index 21de11b..fd2e7b2 100644
--- a/src/test/scala/algorithm/clustering/ClusteringTest.scala
+++ b/src/test/scala/algorithm/clustering/ClusteringTest.scala
@@ -11,7 +11,7 @@ class ClusteringSuite extends AnyFunSuite {
         class TestAlgo() extends Clustering {
             val algoname: String = "TestAlgo"
             val version: String = "TestVersion"
-            override def clear: Boolean = true
+            override def clear(): Boolean = true
             override def config(paras: Map[String, Any]): Boolean = true
             override def cluster(data: Array[Array[Double]]): Array[Int] = data.map(_ => 0)
         }
diff --git a/src/test/scala/algorithm/regression/RegressionTest.scala b/src/test/scala/algorithm/regression/RegressionTest.scala
index 59e8b33..a090db8 100644
--- a/src/test/scala/algorithm/regression/RegressionTest.scala
+++ b/src/test/scala/algorithm/regression/RegressionTest.scala
@@ -11,7 +11,7 @@ class RegressionSuite extends AnyFunSuite {
         class TestAlgo() extends Regression {
             val algoname: String = "TestAlgo"
             val version: String = "TestVersion"
-            override def clear: Boolean = true
+            override def clear(): Boolean = true
             override def config(paras: Map[String, Any]): Boolean = true
             override def train(data: Array[(Double, Array[Double])]): Boolean = true
             override def predict(data: Array[Array[Double]]): Array[Double] = data.map(_ => 0.0)