-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #107 from invenia/mz/standardscaling
`StandardScaling` with separate constructor and fit methods to replace `MeanStdScaling`
- Loading branch information
Showing
13 changed files
with
321 additions
and
78 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
""" | ||
MeanStdScaling(μ, σ) <: AbstractScaling | ||
Linearly scale the data by the statistical mean `μ` and standard deviation `σ`. | ||
This is also known as standardization, or the Z score transform. | ||
# Keyword arguments to `apply` | ||
* `inverse=true`: inverts the scaling (e.g. to reconstruct the unscaled data). | ||
* `eps=1e-3`: used in place of all 0 values in `σ` before scaling (if `inverse=false`). | ||
""" | ||
struct MeanStdScaling <: AbstractScaling
    μ::Real  # value subtracted from the data when the scaling is applied
    σ::Real  # value the centred data is divided by when the scaling is applied

    """
        MeanStdScaling(A::AbstractArray; dims=:, inds=:) -> MeanStdScaling
        MeanStdScaling(table; cols=_get_cols(table)) -> MeanStdScaling

    Construct a [`MeanStdScaling`](@ref) transform from the statistics of the given data.
    By default _all the data_ is considered when computing the mean and standard deviation.
    This can be restricted to certain slices via the keyword arguments (see below).

    Since `MeanStdScaling` is a stateful transform, i.e. the parameters depend on the data
    it's given, you should define it independently before applying it so you can keep the
    information for later use. For instance, if you want to invert the transform or apply it
    to a test set.

    Both constructors are deprecated and emit a deprecation warning when called.

    # `AbstractArray` keyword arguments
    * `dims=:`: the dimension along which to take the `inds` slices. Default uses all dims.
      When `dims` is `:` the whole array is used and `inds` is not consulted.
    * `inds=:`: the indices to use in computing the statistics. Default uses all indices.

    # `Table` keyword arguments
    * `cols`: the columns to use in computing the statistics. Default uses all columns.

    # Throws
    * `MethodError`: if `table` does not satisfy `Tables.istable`.

    !!! note
        If you want the `MeanStdScaling` to transform your data consistently you should use
        the same `inds`, `dims`, or `cols` keywords when calling `apply`. Otherwise, `apply`
        might rescale the wrong data or throw an error.
    """
    function MeanStdScaling(A::AbstractArray; dims=:, inds=:)
        _depwarn()
        # No dimension selected: compute the statistics over the entire array.
        dims == Colon() && return new(compute_stats(A)...)
        # Otherwise restrict the statistics to the `inds` slice(s) along `dims`.
        return new(compute_stats(selectdim(A, dims, inds))...)
    end

    function MeanStdScaling(table; cols=_get_cols(table))
        _depwarn()
        # `MethodError` stores the call arguments as a tuple; passing the bare value makes
        # the error un-printable, so wrap the single argument explicitly.
        Tables.istable(table) || throw(MethodError(MeanStdScaling, (table,)))
        columntable = Tables.columns(table)
        # Pool the selected columns into one vector so a single pair of statistics is
        # computed across all of them.
        data = reduce(vcat, [getproperty(columntable, c) for c in _to_vec(cols)])
        return new(compute_stats(data)...)
    end
end
|
||
# Emit the deprecation warning shared by the `MeanStdScaling` constructors, pointing users
# at the `StandardScaling` + `fit!` replacement. Always returns `nothing`.
function _depwarn()
    Base.depwarn(
        # The suggested snippet binds and uses the same variable name so that it can be
        # copied verbatim.
        "`MeanStdScaling(args...; kwargs...)` is deprecated. Use " *
        "`scaling = StandardScaling(); fit!(scaling, args...; kwargs...)` instead",
        :MeanStdScaling
    )
    return nothing
end
|
||
# Scale `A` with the stored mean and standard deviation, or undo that scaling when
# `inverse=true`. `eps` is the smallest divisor used in the forward direction; any extra
# keyword arguments are accepted and ignored.
function _apply(
    A::AbstractArray, scaling::MeanStdScaling; inverse=false, eps=1e-3, kwargs...
)
    if inverse
        # Reconstruct the unscaled data from the stored parameters as-is.
        return scaling.μ .+ scaling.σ .* A
    end

    # Guard against division by zero: a zero std means the data was uniform, so the
    # centred values are ≈ 0 anyway and the exact `eps` has little influence.
    divisor = max(scaling.σ, eps)
    return (A .- scaling.μ) ./ divisor
end
|
||
# Deprecated alias: `is_transformable` now lives in `TestUtils`; keep the old binding
# working (with a deprecation warning) until it can be dropped.
# TODO: remove in v0.4 https://github.com/invenia/FeatureTransforms.jl/issues/82
Base.@deprecate_binding(is_transformable, TestUtils.is_transformable)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
""" | ||
fit!(transform::Transform, data::AbstractArray; dims=:, inds=:) | ||
fit!(transform::Transform, table, [cols]) | ||
Fit the transform to the given data. By default _all the data_ is considered. | ||
This can be restricted to certain slices via the keyword arguments (see below). | ||
# `AbstractArray` keyword arguments | ||
* `dims=:`: the dimension along which to take the `inds` slices. Default uses all dims. | ||
* `inds=:`: the indices to use in computing the statistics. Default uses all indices. | ||
# `Table` keyword arguments | ||
* `cols`: the columns to use in computing the statistics. Default uses all columns. | ||
!!! note | ||
If you want to transform your data consistently you should use the same `inds`, `dims`, | ||
or `cols` keywords when calling `apply`. Otherwise, `apply` might rescale the wrong | ||
data or throw an error. | ||
""" | ||
function fit!(t::Transform, args...; kwargs...)
    # Generic fallback: the data and keyword arguments are ignored and the transform is
    # handed back unchanged.
    return t
end
Oops, something went wrong.
4a636c8
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@JuliaRegistrator register
4a636c8
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Registration pull request created: JuliaRegistries/General/60404
After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.
This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via: