diff --git a/src/TulipaClustering.jl b/src/TulipaClustering.jl index 9826b52..ac4d0f8 100644 --- a/src/TulipaClustering.jl +++ b/src/TulipaClustering.jl @@ -10,6 +10,7 @@ using SparseArrays include("input-tables.jl") include("structures.jl") include("io.jl") +include("weight_fitting.jl") include("cluster.jl") end diff --git a/src/cluster.jl b/src/cluster.jl index a8784d8..47ddb22 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -538,11 +538,6 @@ function find_representative_periods( throw(ArgumentError("Clustering method is not supported")) end - # If demands was rescaled, scale it back - if rescale_demand_data - rp_matrix[1:n_demand_rows, :] .*= demand_scaling_factor - end - # Fill in the weight matrix using the assignments for (p, rp) ∈ enumerate(assignments) weight_matrix[p, rp] = complete_period_weight @@ -552,6 +547,11 @@ function find_representative_periods( # First, convert the matrix data back to dataframes using the previously saved key columns demand_rp_df = matrix_and_keys_to_df(rp_matrix[1:n_demand_rows, :], demand_keys) + # If demands was rescaled, scale it back + if rescale_demand_data + demand_rp_df.value .*= demand_scaling_factor + end + generation_availability_rp_df = matrix_and_keys_to_df(rp_matrix[(n_demand_rows + 1):end, :], generation_availability_keys) @@ -574,5 +574,11 @@ function find_representative_periods( ) end - return ClusteringResult(demand_rp_df, generation_availability_rp_df, weight_matrix) + return ClusteringResult( + demand_rp_df, + generation_availability_rp_df, + weight_matrix, + clustering_matrix, + rp_matrix, + ) end diff --git a/src/structures.jl b/src/structures.jl index 629673d..1fc60de 100644 --- a/src/structures.jl +++ b/src/structures.jl @@ -46,9 +46,17 @@ Structure to hold the clustering result. mutable struct ClusteringResult demand::AbstractDataFrame generation_availability::AbstractDataFrame - weight_matrix::Matrix{Float64} + weight_matrix::Union{SparseMatrixCSC{Float64, Int64}, Matrix{Float64}} + clustering_matrix::Matrix{Float64} + rp_matrix::Matrix{Float64} - function ClusteringResult(demand, generation_availability, weight_matrix) - return new(demand, generation_availability, weight_matrix) + function ClusteringResult( + demand, + generation_availability, + weight_matrix, + clustering_matrix, + rp_matrix, + ) + return new(demand, generation_availability, weight_matrix, clustering_matrix, rp_matrix) end end diff --git a/src/weight_fitting.jl b/src/weight_fitting.jl new file mode 100644 index 0000000..9dac773 --- /dev/null +++ b/src/weight_fitting.jl @@ -0,0 +1,251 @@ +export fit_rep_period_weights! + +""" + project_onto_simplex(vector) + +Projects `vector` onto a unit simplex using Michelot's algorithm in +Condat's accelerated implementation (2017). See Figure 2 of +[Condat, L. _Fast projection onto the simplex and the ball._ Math. Program. 158, +575–585 (2016).](https://doi.org/10.1007/s10107-015-0946-6). For the details on +the meanings of v, ṽ, ρ and other variables, see the original paper. +""" +function project_onto_simplex(vector::Vector{Float64}) + # There is a trivial solution when it's a one-element vector + if length(vector) == 1 + return [1.0] + end + # step 1 + v = [vector[1]] + ṽ = Vector{Float64}() + ρ = vector[1] - 1.0 + # step 2 + for y ∈ vector[2:end] + if y > ρ + ρ += (y - ρ) / (length(v) + 1) + if ρ > y - 1.0 + push!(v, y) + else + append!(ṽ, v) + v = [y] + ρ = y - 1.0 + end + end + end + # step 3 + for y ∈ ṽ + if y > ρ + push!(v, y) + ρ += (y - ρ) / length(v) + end + end + # step 4 + while true + length_v = length(v) + to_be_removed = Vector{Int}() + for (i, y) ∈ enumerate(v) + if y ≤ ρ + push!(to_be_removed, i) + length_v -= 1 + ρ += (ρ - y) / length_v + end + end + if length(to_be_removed) == 0 + break + end + deleteat!(v, to_be_removed) + end + # step 5 is skipped because it computes the data that we do not need + # step 6 + return max.(vector .- ρ, 0.0) +end + +""" + project_onto_nonnegative_orthant(vector) + +Projects `vector` onto the nonnegative_orthant. This projection is trivial: +replace negative components of the vector with zeros. +""" +function project_onto_nonnegative_orthant(vector::Vector{Float64}) + return max.(vector, 0.0) +end + +""" + projected_subgradient_descent!(x; gradient, projection, niters, rtol, learning_rate, adaptive_grad) + +Fits `x` using the projected gradient descent scheme. + +The arguments: + + - `x`: the value to fit + - `subgradient`: the subgradient operator, that is, a function that takes + vectors of the same shape as `x` as inputs and returns a subgradient of the + loss at that point; the fitting is done to minimize the corresponding + implicit loss + - `projection`: the projection operator, that is, a function that, given a + vector `x`, finds a point within some subspace that is closest to `x` + - `niters`: maximum number of projected gradient descent iterations + - `tol`: tolerance; when no components of `x` improve by more than `tol`, the + algorithm stops + - `learning_rate`: learning rate of the algorithm + - `adaptive_grad`: if true, the learning rate is adjusted using the adaptive + gradient method, see [John Duchi, Elad Hazan, and Yoram Singer. 2011. + _Adaptive Subgradient Methods for Online Learning and Stochastic + Optimization._ J. Mach. Learn. Res. 12, null (2/1/2011), 2121–2159.] + (https://dl.acm.org/doi/10.5555/1953048.2021068) +""" +function projected_subgradient_descent!( + x::Vector{Float64}; + subgradient::Function, + projection::Function, + niters::Int = 100, + tol::Float64 = 1e-3, + learning_rate::Float64 = 0.001, + adaptive_grad = true, +) + # It is possible that the initial guess is not in the required subspace; + # project it first. + x = projection(x) + + if adaptive_grad + G = zeros(length(x)) + end + + for _ ∈ 1:niters + g = subgradient(x) # find the subgradient + if adaptive_grad # find the learning rate + G += g .^ 2 + α = learning_rate ./ (1e-6 .+ .√(G)) + else + α = learning_rate + end + y = x .- α .* g # gradent step, may leave the domain + x_new = projection(y) # projection step, return to the domain + diff = maximum(x_new - x) # how much did the vector change + x = x_new + if diff ≤ tol + break + end + end + return x +end + +""" + fit_rep_period_weights!(weight_matrix, clustering_matrix, rp_matrix; weight_type, tol, args...) + +Given the initial weight guesses, finds better weights for convex or conical +combinations of representative periods. For conical weights, it is possible to +bound the total weight by one. + +The arguments: + + - `weight_matrix`: the initial guess for weights; the weights are adjusted + using a projected subgradient descent method + - `clustering_matrix`: the matrix of raw clustering data + - `rp_matrix`: the matrix of raw representative period data + - `weight_type`: the type of weights to find; possible values are: + - `:convex`: each period is represented as a convex sum of the + representative periods (a sum with nonnegative weights adding into one) + - `:conical`: each period is represented as a conical sum of the + representative periods (a sum with nonnegative weights) + - `:conical_bounded`: each period is represented as a conical sum of the + representative periods (a sum with nonnegative weights) with the total + weight bounded from above by one. + - `tol`: algorithm's tolerance; when the weights are adjusted by a value less + then or equal to `tol`, they stop being fitted further. + - other arguments control the projected subgradient method; they are passed + through to `TulipaClustering.projected_subgradient_descent!`. +""" +function fit_rep_period_weights!( + weight_matrix::Union{SparseMatrixCSC{Float64, Int64}, Matrix{Float64}}, + clustering_matrix::Matrix{Float64}, + rp_matrix::Matrix{Float64}; + weight_type::Symbol = :dirac, + tol::Float64 = 10e-3, + args..., +) + # Determine the appropriate projection method + if weight_type == :convex + projection = project_onto_simplex + elseif weight_type == :conical + projection = project_onto_nonnegative_orthant + elseif weight_type == :conical_bounded + # Conic bounded method does convex fitting, but adds a zero component. + # The weight of a zero vector is then discarded without affecting the + # total, and the resulting weights will always have sums between zero and + # one. + projection = project_onto_simplex + n_data_points = size(rp_matrix, 1) + rp_matrix = hcat(rp_matrix, repeat([0.0], n_data_points)) + else + throw(ArgumentError("Unsupported weight type.")) + end + + n_periods = size(clustering_matrix, 2) + n_rp = size(rp_matrix, 2) + + for period ∈ 1:n_periods # TODO: this can be parallelized; investigate + target_vector = clustering_matrix[:, period] + x = Vector(weight_matrix[period, 1:n_rp]) + if weight_type == :conical_bounded + x[n_rp] = 0.0 + end + x = projected_subgradient_descent!( + x; + subgradient = (x) -> rp_matrix' * (rp_matrix * x - target_vector), + projection, + tol = tol, + args..., + ) + x[x .< tol] .= 0.0 # replace insifnificant small values with zeros + if weight_type == :convex + # Because some values might have been removed, convexity can be lost. + # To account for these, the weights are re-normalized. + x = x ./ sum(x) + end + if weight_type == :conical_bounded + pop!(x) + end + weight_matrix[period, 1:length(x)] = x + end + return weight_matrix +end + +""" + fit_rep_period_weights!(weight_matrix, clustering_matrix, rp_matrix; weight_type, tol, args...) + + Given the initial weight guesses, finds better weights for convex or conical +combinations of representative periods. For conical weights, it is possible to +bound the total weight by one. + +The arguments: + + - `clustering_result`: the result of running + `TulipaClustering.find_representative_periods` + - `weight_type`: the type of weights to find; possible values are: + - `:convex`: each period is represented as a convex sum of the + representative periods (a sum with nonnegative weights adding into one) + - `:conical`: each period is represented as a conical sum of the + representative periods (a sum with nonnegative weights) + - `:conical_bounded`: each period is represented as a conical sum of the + representative periods (a sum with nonnegative weights) with the total + weight bounded from above by one. + - `tol`: algorithm's tolerance; when the weights are adjusted by a value less + then or equal to `tol`, they stop being fitted further. + - other arguments control the projected subgradient method; they are passed + through to `TulipaClustering.projected_subgradient_descent!`. +""" +function fit_rep_period_weights!( + clustering_result = ClusteringResult; + weight_type::Symbol = :dirac, + tol::Float64 = 10e-3, + args..., +) + fit_rep_period_weights!( + clustering_result.weight_matrix, + clustering_result.clustering_matrix, + clustering_result.rp_matrix; + weight_type, + tol, + args..., + ) +end diff --git a/test.jl b/test.jl deleted file mode 100644 index 0d7433a..0000000 --- a/test.jl +++ /dev/null @@ -1,139 +0,0 @@ -using DataFrames -using TulipaClustering -using SparseArrays -using Distances - -dir = INPUT_FOLDER = joinpath(@__DIR__, "test/inputs/EU") -clustering_data = TulipaClustering.read_clustering_data_from_csv_folder(dir) -TulipaClustering.reshape_clustering_data!(clustering_data; period_duration = 24 * 7) -rescale_demand_data = true -drop_incomplete_periods = true -distance = SqEuclidean() - -n_rp = 10 -method = :k_means - -res = find_representative_periods( - clustering_data; - n_rp, - rescale_demand_data, - drop_incomplete_periods, - method, - distance, -) - -# Check that the time series use the same periods and time steps -n_periods = clustering_data.demand.n_periods -period_duration = clustering_data.demand.period_duration -total_time_steps = clustering_data.demand.total_time_steps -if clustering_data.generation_availability.n_periods != n_periods || - clustering_data.generation_availability.period_duration != period_duration || - clustering_data.generation_availability.total_time_steps != total_time_steps - throw(ArgumentError("Time series for demand and generation must have the same time steps")) -end - -# Check if there are any incomplete periods that are not the last period -demand_df = TulipaClustering.get_df_without_key_columns(clustering_data.demand) -generation_df = TulipaClustering.get_df_without_key_columns(clustering_data.generation_availability) - -incomplete_periods = - TulipaClustering.find_incomplete_columns(demand_df) ∪ - TulipaClustering.find_incomplete_columns(generation_df) - -n_incomplete_periods = length(incomplete_periods) -has_incomplete_periods = n_incomplete_periods > 0 -if has_incomplete_periods && n_incomplete_periods > 1 - throw(DomainError(incomplete_periods, "Multiple periods have missing data")) -end - -n_complete_periods = n_periods - n_incomplete_periods - -# Find the period weights -complete_period_weight = - has_incomplete_periods && discard_incomplete_periods ? - total_time_steps / (n_complete_periods * period_duration) : 1.0 -incomplete_period_weight = - (total_time_steps - n_complete_periods * period_duration) / period_duration - -# df = select(clustering_data.demand.data, clustering_data.demand.key_columns, incomplete_periods) -# #df = clustering_data.demand.data -# df = stack(df, variable_name = :period) |> dropmissing -# df = combine(groupby(df, :period), :time_step => maximum => :weight) -# df.weight .= df.weight ./ period_duration - -#incomplete_period_mapping = [incomplete_periods .=> string.((n_rp - n_incomplete_periods + 1):n_rp)] -# Deal with the incomplete periods -#weights = spzeros(n_periods, n_rp) - -# Compute the clustering matrix by concatenating the data matrices -demand_matrix = select(demand_df, Not(incomplete_periods)) |> dropmissing |> Matrix{Float64} - -if rescale_demand_data - # Generation availability is on a scale from 0 to 1, but demand is not; - # rescale the demand profiles by dividing them by the largest possible value, - # and remember this value so that the demands can be computed back from the - # normalized values later on. - demand_scaling_factor = maximum(demand_matrix[map(!ismissing, demand_matrix)]) - demand_matrix ./= demand_scaling_factor -end - -generation_matrix = select(generation_df, Not(incomplete_periods)) |> dropmissing |> Matrix{Float64} -clustering_matrix = vcat(demand_matrix, generation_matrix) - -needs_discarding = n_incomplete_periods > 0 && !discard_incomplete_periods -if needs_discarding - rp_mapping = [incomplete_periods .=> string.((n_rp - n_incomplete_periods + 1):n_rp)] - n_rp -= n_incomplete_periods # incomplete periods become their own representatives -end - -# Do the clustering -kmeans_result = kmeans(clustering_matrix, n_rp) - -# Reinterpret the results -centroids = kmeans_result.centers -centroids = DataFrame(centroids, string.(1:n_rp)) -if needs_discarding - n_rp += n_incomplete_periods -end -n_demand = size(demand_matrix)[1] - -demand_columns = select(clustering_data.demand.data, clustering_data.demand.key_columns) -demand_centroids = centroids[1:n_demand, :] -if rescale_demand_data - demand_centroids .*= demand_scaling_factor -end -if needs_discarding - demand_centroids = hcat( - demand_centroids, - select(demand_df, incomplete_periods .=> string.((n_rp - n_incomplete_periods + 1):n_rp)), - ) -end -demand = _matrix_to_df(demand_columns, demand_centroids) - -generation_columns = select( - clustering_data.generation_availability.data, - clustering_data.generation_availability.key_columns, -) -generation_centroids = centroids[(n_demand + 1):end, :] -if needs_discarding - generation_centroids = hcat( - generation_centroids, - select(generation_df, incomplete_periods .=> string.((n_rp - n_incomplete_periods + 1):n_rp)), - ) -end -generation = _matrix_to_df(generation_columns, generation_centroids) - -if needs_discarding - w = 1.0 - weights = - [ - t < n_periods && kmeans_result.assignments[t] == r ? w : 0.0 for t = 1:n_periods, r = 1:n_rp - ] |> Matrix{Float64} - # TODO: add a zero column and row to weights - weights[n_periods, n_rp] = - (total_time_steps - n_complete_periods * period_duration) / period_duration -else - w = total_time_steps / (n_complete_periods * period_duration) - weights = - [kmeans_result.assignments[t] == r ? w : 0.0 for t = 1:n_complete_periods, r = 1:n_rp] |> Matrix{Float64} -end diff --git a/test/Project.toml b/test/Project.toml index 498e648..b0a1d97 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,5 @@ [deps] -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl index cce2593..c3112d8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,7 @@ using CSV using DataFrames using TulipaClustering +using Distances using Test # Folders names diff --git a/test/test-weight-fitting.jl b/test/test-weight-fitting.jl new file mode 100644 index 0000000..8c4c60f --- /dev/null +++ b/test/test-weight-fitting.jl @@ -0,0 +1,115 @@ +@testset "Projections" begin + @testset "Make sure that projection onto simplex works correctly" begin + @test begin + x = [1.0, 1.0] + TulipaClustering.project_onto_simplex(x) ≈ [0.5, 0.5] + end + + @test begin + x = [3.0] + TulipaClustering.project_onto_simplex(x) == [1.0] + end + + @test begin + x = [-2.0, 1.0] + TulipaClustering.project_onto_simplex(x) ≈ [0.0, 1.0] + end + end +end + +@testset "Subgradient descent" begin + @testset "Make sure that subgradient descent can hit the maximum number of iterations" begin + x = [100.0, 10.0] + subgradient = (x) -> x + projection = TulipaClustering.project_onto_simplex + TulipaClustering.projected_subgradient_descent!( + x; + subgradient, + projection, + niters = 2, + tol = 1e-3, + learning_rate = 100.0, + adaptive_grad = false, + ) ≈ [1.0, 0.0] + end +end + +@testset "Weight fitting" begin + @testset "Make sure that weight fitting works correctly for convex weights" begin + @test begin + dir = joinpath(INPUT_FOLDER, "EU") + clustering_data = TulipaClustering.read_clustering_data_from_csv_folder(dir) + split_into_periods!(clustering_data; period_duration = 24 * 7) + clustering_result = find_representative_periods( + clustering_data, + 10; + rescale_demand_data = true, + drop_incomplete_last_period = false, + method = :k_means, + distance = SqEuclidean(), + init = :kmcen, + ) + TulipaClustering.fit_rep_period_weights!(clustering_result; weight_type = :convex, niters = 5) + sum(clustering_result.weight_matrix) ≈ 365 / 7 && + all(sum(clustering_result.weight_matrix[1:(end - 1), :], dims = 2) .≈ 1.0) + end + end + + @testset "Make sure that weight fitting works correctly for bounded conical weights" begin + @test begin + dir = joinpath(INPUT_FOLDER, "EU") + clustering_data = TulipaClustering.read_clustering_data_from_csv_folder(dir) + split_into_periods!(clustering_data; period_duration = 24 * 7) + clustering_result = find_representative_periods( + clustering_data, + 10; + rescale_demand_data = true, + drop_incomplete_last_period = false, + method = :k_means, + distance = SqEuclidean(), + init = :kmcen, + ) + TulipaClustering.fit_rep_period_weights!( + clustering_result; + weight_type = :conical_bounded, + niters = 5, + ) + all(sum(clustering_result.weight_matrix[1:(end - 1), :], dims = 2) .≤ 1.0) + end + end + + @testset "Make sure that weight fitting works correctly for conical weights" begin + @test begin + dir = joinpath(INPUT_FOLDER, "EU") + clustering_data = TulipaClustering.read_clustering_data_from_csv_folder(dir) + split_into_periods!(clustering_data; period_duration = 24 * 7) + clustering_result = find_representative_periods( + clustering_data, + 10; + rescale_demand_data = true, + drop_incomplete_last_period = false, + method = :k_means, + distance = SqEuclidean(), + init = :kmcen, + ) + TulipaClustering.fit_rep_period_weights!( + clustering_result; + weight_type = :conical, + niters = 5, + ) + all(sum(clustering_result.weight_matrix[1:(end - 1), :], dims = 2) .≥ 0.0) + end + end + + @testset "Make sure that weight fitting works correctly for conical weights" begin + @test_throws ArgumentError begin + dummy_matrix = [1.0 1.0; 1.0 1.0] + TulipaClustering.fit_rep_period_weights!( + dummy_matrix, + dummy_matrix, + dummy_matrix; + weight_type = :bad, + ) + end + end +end