Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve docs of linfa-hierarchical #146

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ optional = true
default-features = false
features = ["cblas"]

# Optional in-tree dependency on linfa-kernel, resolved through `path`.
# NOTE(review): linfa-kernel itself imports from `linfa`; confirm that Cargo
# accepts this dependency cycle before merging.
[dependencies.linfa-kernel]
version = "0.4.0"
path = "algorithms/linfa-kernel"
optional = true

[dev-dependencies]
ndarray-rand = "0.13"
linfa-datasets = { path = "datasets", features = ["winequality", "iris", "diabetes"] }
Expand Down
78 changes: 54 additions & 24 deletions algorithms/linfa-hierarchical/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@
//! [kodama](https://docs.rs/kodama/0.2.3/kodama/) crate.

use std::collections::HashMap;
use ndarray::Array1;

use kodama::linkage;
pub use kodama::Method;

use linfa::dataset::DatasetBase;
use linfa::traits::Transformer;
use linfa::traits::PredictRef;
use linfa::Float;
use linfa_kernel::Kernel;

Expand All @@ -40,9 +40,50 @@ enum Criterion<T> {

/// Agglomerative hierarchical clustering
///
/// In this clustering algorithm, each point is first considered as a separate cluster. During each
/// step, two points are merged into new clusters, until a stopping criterion is reached. The distance
/// between the points is computed as the negative-log transform of the similarity kernel.
/// Hierarchical clustering is a method of cluster analysis which seeks to build a hierarchy of
/// clusters. First, each point is considered as a separate cluster. During each step, two clusters
/// are merged into a new cluster, until a stopping criterion is reached. The distance between the
/// points is computed as the negative-log transform of the similarity kernel.
///
/// # Example
///
/// This example loads the iris flower dataset and performs hierarchical clustering into three
/// separate clusters.
/// ```rust
/// use std::error::Error;
///
/// use linfa::traits::Transformer;
/// use linfa_hierarchical::HierarchicalCluster;
/// use linfa_kernel::{Kernel, KernelMethod};
///
/// fn main() -> Result<(), Box<dyn Error>> {
/// // load Iris plant dataset
/// let dataset = linfa_datasets::iris();
///
/// let kernel = Kernel::params()
/// .method(KernelMethod::Gaussian(1.0))
/// .transform(dataset.records().view());
///
/// let kernel = HierarchicalCluster::default()
/// .num_clusters(3)
/// .transform(kernel);
///
/// for (id, target) in kernel.targets().iter().zip(dataset.targets().into_iter()) {
/// let name = match *target as usize {
/// 0 => "setosa",
/// 1 => "versicolor",
/// 2 => "virginica",
/// _ => unreachable!(),
/// };
///
/// print!("({} {}) ", id, name);
/// }
/// println!();
///
/// Ok(())
/// }
/// ```

pub struct HierarchicalCluster<T> {
method: Method,
stopping: Criterion<T>,
Expand Down Expand Up @@ -77,13 +118,14 @@ impl<F: Float> HierarchicalCluster<F> {
}
}

impl<F: Float> Transformer<Kernel<F>, DatasetBase<Kernel<F>, Vec<usize>>>
/// Predict cluster assignments with a kernel operator
impl<F: Float> PredictRef<Kernel<F>, Array1<usize>>
for HierarchicalCluster<F>
{
/// Perform hierarchical clustering of a similarity matrix
///
/// Returns the class id for each data point
fn transform(&self, kernel: Kernel<F>) -> DatasetBase<Kernel<F>, Vec<usize>> {
fn predict_ref(&self, kernel: &Kernel<F>) -> Array1<usize> {
// ignore all similarities below this value
let threshold = F::cast(1e-6);

Expand Down Expand Up @@ -145,19 +187,7 @@ impl<F: Float> Transformer<Kernel<F>, DatasetBase<Kernel<F>, Vec<usize>>>
}

// return node_index -> cluster_index map
DatasetBase::new(kernel, tmp)
}
}

impl<F: Float, T> Transformer<DatasetBase<Kernel<F>, T>, DatasetBase<Kernel<F>, Vec<usize>>>
for HierarchicalCluster<F>
{
/// Perform hierarchical clustering of a similarity matrix
///
/// Returns the class id for each data point
fn transform(&self, dataset: DatasetBase<Kernel<F>, T>) -> DatasetBase<Kernel<F>, Vec<usize>> {
//let Dataset { records, .. } = dataset;
self.transform(dataset.records)
Array1::from(tmp)
}
}

Expand All @@ -174,7 +204,8 @@ impl<T> Default for HierarchicalCluster<T> {

#[cfg(test)]
mod tests {
use linfa::traits::Transformer;
use linfa::traits::{Transformer, Predict};
use linfa::Dataset;
use linfa_kernel::{Kernel, KernelMethod};
use ndarray::{Array, Axis};
use ndarray_rand::{rand_distr::Normal, RandomExt};
Expand All @@ -199,12 +230,11 @@ mod tests {
.method(KernelMethod::Gaussian(5.0))
.transform(entries.view());

let kernel = HierarchicalCluster::default()
let ids = HierarchicalCluster::default()
.max_distance(0.1)
.transform(kernel);
.predict_ref(&kernel);

// check that all assigned ids are equal for the first cluster
let ids = kernel.targets();
let first_cluster_id = &ids[0];
assert!(ids
.iter()
Expand Down
29 changes: 16 additions & 13 deletions algorithms/linfa-kernel/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,14 @@ use serde_crate::{Deserialize, Serialize};
use sprs::{CsMat, CsMatView};
use std::ops::Mul;

pub use linfa::Float;

use linfa::{
dataset::AsTargets, dataset::DatasetBase, dataset::FromTargetArray, dataset::Records,
traits::Transformer, Float,
traits::Transformer,
};


/// Kernel representation, can be either dense or sparse
#[derive(Clone)]
pub enum KernelType {
Expand Down Expand Up @@ -234,18 +237,6 @@ impl<'a, F: Float> KernelView<'a, F> {
}
}

/// Exposes a kernel as `Records`.
///
/// Both the sample count and the feature count are reported as `self.size()`.
// NOTE(review): presumably because the kernel matrix is square (one row and one column per
// sample) — confirm against `KernelBase::size`.
impl<F: Float, K1: Inner<Elem = F>, K2: Inner<Elem = F>> Records for KernelBase<K1, K2> {
type Elem = F;

fn nsamples(&self) -> usize {
self.size()
}

fn nfeatures(&self) -> usize {
self.size()
}
}

/// The inner product definition used by a kernel.
///
/// There are three methods available:
Expand Down Expand Up @@ -549,6 +540,18 @@ fn sparse_from_fn<F: Float, D: Data<Elem = F>>(
data
}

/// Exposes a kernel as `Records`.
///
/// Both the sample count and the feature count are reported as `self.size()`.
// NOTE(review): presumably because the kernel matrix is square (one row and one column per
// sample) — confirm against `KernelBase::size`.
impl<F: Float, K1: Inner<Elem = F>, K2: Inner<Elem = F>> Records for KernelBase<K1, K2> {
type Elem = F;

fn nsamples(&self) -> usize {
self.size()
}

fn nfeatures(&self) -> usize {
self.size()
}
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
27 changes: 27 additions & 0 deletions src/dataset/impl_dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1091,6 +1091,33 @@ where
}
}

// Blanket `Predict` implementations for kernel inputs. The module is compiled only when the
// optional `linfa-kernel` feature is enabled.
#[cfg(feature = "linfa-kernel")]
mod predict_kernels_impl {
// NOTE(review): `KernelBase` and `Inner` are not referenced anywhere in this module.
use linfa_kernel::{Kernel, KernelBase, Inner};
use crate::traits::{Predict, PredictRef};
// NOTE(review): linfa-kernel brings `DatasetBase` and `Records` into scope with a private
// `use linfa::{...}`; verify that these paths resolve from outside that crate, or import the
// items from `crate::` instead. `Records` is also not referenced in this module.
use linfa_kernel::{DatasetBase, dataset::Records};
use linfa_kernel::Float;

/// Predict from an owned kernel.
///
/// Available for every `O` that implements `PredictRef<Kernel<F>, T>`. Returns a `DatasetBase`
/// holding the kernel as records and the predicted values as targets.
impl<F: Float, T, O> Predict<Kernel<F>, DatasetBase<Kernel<F>, T>> for O
where
O: PredictRef<Kernel<F>, T>,
{
fn predict(&self, records: Kernel<F>) -> DatasetBase<Kernel<F>, T> {
// compute the targets from a borrow first, so that `records` can be moved into the dataset
let new_targets = self.predict_ref(&records);
DatasetBase::new(records, new_targets)
}
}

/// Predict from a borrowed kernel.
///
/// Forwards directly to `predict_ref` and returns only the predicted targets.
impl<'a, F: Float, T, O> Predict<&'a Kernel<F>, T> for O
where
O: PredictRef<Kernel<F>, T>,
{
fn predict(&self, records: &'a Kernel<F>) -> T {
self.predict_ref(records)
}
}
}

impl<L: Label, S: Labels<Elem = L>> CountedTargets<L, S> {
pub fn new(targets: S) -> Self {
let labels = targets.label_count();
Expand Down