Merge #873

873: Make bias optional r=MikeInnes a=dhairyagandhi96 Addresses #868 Co-authored-by: Dhairya Gandhi <dhairya@juliacopmuting.com>
FluxML · May 1, 2020 · 5d9acc7 · 5d9acc7
2 parents 9237cda + 8f877f2
commit 5d9acc7
Show file tree

Hide file tree

Showing 4 changed files with 319 additions and 37 deletions.
diff --git a/src/Flux.jl b/src/Flux.jl
@@ -27,6 +27,7 @@ using CuArrays
 const use_cuda = Ref(false)
 
 include("utils.jl")
+include("zeros.jl")
 include("onehot.jl")
 include("functor.jl")
 

diff --git a/src/layers/conv.jl b/src/layers/conv.jl
@@ -30,27 +30,36 @@ function calc_padding(::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T}
 end
 
 """
-    Conv(size, in => out, σ = identity; init = glorot_uniform,
+    Conv(filter, in => out, σ = identity; init = glorot_uniform,
          stride = 1, pad = 0, dilation = 1)
 
-Standard convolutional layer. `size` should be a tuple like `(2, 2)`.
+    filter = (2,2)
+    in = 1
+    out = 16
+    Conv((2, 2), 1=>16, relu)
+
+Standard convolutional layer. `filter` should be a tuple like `(2, 2)`.
 `in` and `out` specify the number of input and output channels respectively.
 
 Data should be stored in WHCN order (width, height, # channels, batch size).
 In other words, a 100×100 RGB image would be a `100×100×3×1` array,
 and a batch of 50 would be a `100×100×3×50` array.
 
+Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
+Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
 Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
 
 # Examples
 
-Apply a `Conv` layer to a 1-channel input using a 2×2 window size, giving us a
+Apply a `Conv` layer to a 1-channel input using a 2×2 window filter size, giving us a
 16-channel output. Output is activated with ReLU.
 ```julia
-size = (2,2)
+filter = (2,2)
 in = 1
 out = 16
-Conv(size, in => out, relu)
+Conv(filter, in => out, relu)
 ```
 """
 struct Conv{N,M,F,A,V}
@@ -62,18 +71,61 @@ struct Conv{N,M,F,A,V}
   dilation::NTuple{N,Int}
 end
 
-function Conv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
+"""
+    Conv(weight::AbstractArray, bias::AbstractArray)
+    Conv(weight::AbstractArray, bias::AbstractArray, activation)
+
+Constructs the convolutional layer with user defined weight and bias arrays.
+
+Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
+
+There is also a keyword-only constuctor available for all convoultional
+layers.
+
+```julia
+weight = rand(Float32, 3, 3, 5)
+bias = zeros(Float32, 5)
+Conv(weight = weight,
+    bias = bias,
+    σ = sigmoid)
+```
+"""
+function Conv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
               stride = 1, pad = 0, dilation = 1) where {T,N}
   stride = expand(Val(N-2), stride)
   dilation = expand(Val(N-2), dilation)
   pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
   return Conv(σ, w, b, stride, pad, dilation)
 end
 
-Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
-     init = glorot_uniform,  stride = 1, pad = 0, dilation = 1) where N =
-  Conv(init(k..., ch...), zeros(ch[2]), σ,
-       stride = stride, pad = pad, dilation = dilation)
+function Conv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
+              activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
+  Conv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
+end
+
+"""
+    convfilter(filter::Tuple, in=>out)
+
+Constructs a standard convolutional weight matrix with given `filter` and
+channels from `in` to `out`.
+
+Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling
+distribution.
+
+See also: [`depthwiseconvfilter`](@ref)
+"""
+convfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
+          init = glorot_uniform) where N = init(filter..., ch...)
+
+function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
+            init = glorot_uniform,  stride = 1, pad = 0, dilation = 1,
+            weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
+
+  Conv(weight, bias, σ,
+      stride = stride, pad = pad, dilation = dilation)
+end
 
 @functor Conv
 
@@ -114,16 +166,22 @@ outdims(l::Conv, isize) =
   output_size(DenseConvDims(_paddims(isize, size(l.weight)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
 
 """
-    ConvTranspose(size, in => out, σ = identity; init = glorot_uniform,
+    ConvTranspose(filter, in=>out)
+    ConvTranspose(filter, in=>out, activation)
+    ConvTranspose(filter, in => out, σ = identity; init = glorot_uniform,
                   stride = 1, pad = 0, dilation = 1)
 
-Standard convolutional transpose layer. `size` should be a tuple like `(2, 2)`.
+Standard convolutional transpose layer. `filter` should be a tuple like `(2, 2)`.
 `in` and `out` specify the number of input and output channels respectively.
 
 Data should be stored in WHCN order (width, height, # channels, batch size).
 In other words, a 100×100 RGB image would be a `100×100×3×1` array,
 and a batch of 50 would be a `100×100×3×50` array.
 
+Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
+Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
 Use `pad=SamePad()` to apply padding so that outputsize == stride * inputsize - stride + 1.
 """
 struct ConvTranspose{N,M,F,A,V}
@@ -135,18 +193,39 @@ struct ConvTranspose{N,M,F,A,V}
   dilation::NTuple{N,Int}
 end
 
-function ConvTranspose(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
-              stride = 1, pad = 0, dilation = 1) where {T,N}
+"""
+    ConvTranspose(weight::AbstractArray, bias::AbstractArray)
+    ConvTranspose(weight::AbstractArray, bias::AbstractArray, activation)
+
+Constructs the convolutional transpose layer with user defined weight and bias arrays.
+forward pass.
+
+Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
+
+For keyword-only constuctor, see also [`Conv`](@ref)
+"""
+function ConvTranspose(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
+                      stride = 1, pad = 0, dilation = 1) where {T,N}
   stride = expand(Val(N-2), stride)
   dilation = expand(Val(N-2), dilation)
   pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
   return ConvTranspose(σ, w, b, stride, pad, dilation)
 end
 
-ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
-              init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
-ConvTranspose(init(k..., reverse(ch)...), zeros(ch[2]), σ,
+function ConvTranspose(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
+                        activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
+  ConvTranspose(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
+end
+
+function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
+                      init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
+                      weight = convfilter(k, reverse(ch), init = init), bias = zeros(ch[2])) where N
+
+  ConvTranspose(weight, bias, σ,
               stride = stride, pad = pad, dilation = dilation)
+end
 
 @functor ConvTranspose
 
@@ -158,9 +237,9 @@ function conv_transpose_dims(c::ConvTranspose, x::AbstractArray)
     batch_size = size(x)[end]
     # Create DenseConvDims() that looks like the corresponding conv()
     return DenseConvDims((I..., C_in, batch_size), size(c.weight);
-        stride=c.stride,
-        padding=c.pad,
-        dilation=c.dilation,
+                        stride=c.stride,
+                        padding=c.pad,
+                        dilation=c.dilation,
     )
 end
 
@@ -171,7 +250,7 @@ function (c::ConvTranspose)(x::AbstractArray)
   # ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
   σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
   cdims = conv_transpose_dims(c, x)
-  return σ.(∇conv_data(x, c.weight, cdims) .+ b)
+  σ.(∇conv_data(x, c.weight, cdims) .+ b)
 end
 
 function Base.show(io::IO, l::ConvTranspose)
@@ -190,17 +269,23 @@ end
 outdims(l::ConvTranspose{N}, isize) where N = _convtransoutdims(isize[1:2], size(l.weight)[1:N], l.stride, l.dilation, l.pad)
 
 """
-    DepthwiseConv(size, in => out, σ = identity; init = glorot_uniform,
+    DepthwiseConv(filter::Tuple, in=>out)
+    DepthwiseConv(filter::Tuple, in=>out, activation)
+    DepthwiseConv(filter, in => out, σ = identity; init = glorot_uniform,
                   stride = 1, pad = 0, dilation = 1)
 
-Depthwise convolutional layer. `size` should be a tuple like `(2, 2)`.
+Depthwise convolutional layer. `filter` should be a tuple like `(2, 2)`.
 `in` and `out` specify the number of input and output channels respectively.
 Note that `out` must be an integer multiple of `in`.
 
 Data should be stored in WHCN order (width, height, # channels, batch size).
 In other words, a 100×100 RGB image would be a `100×100×3×1` array,
 and a batch of 50 would be a `100×100×3×50` array.
 
+Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
+Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
 Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
 """
 struct DepthwiseConv{N,M,F,A,V}
@@ -212,20 +297,54 @@ struct DepthwiseConv{N,M,F,A,V}
   dilation::NTuple{N,Int}
 end
 
-function DepthwiseConv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
-                       stride = 1, pad = 0, dilation = 1) where {T,N}
+"""
+    DepthwiseConv(weight::AbstractArray, bias::AbstractArray)
+    DepthwiseConv(weight::AbstractArray, bias::AbstractArray, activation)
+
+Constructs the `DepthwiseConv` layer with user defined weight and bias arrays.
+forward pass.
+
+Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
+
+For keyword-only constuctor, see also [`Conv`](@ref)
+"""
+function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
+                      stride = 1, pad = 0, dilation = 1) where {T,N}
   stride = expand(Val(N-2), stride)
   dilation = expand(Val(N-2), dilation)
   pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
   return DepthwiseConv(σ, w, b, stride, pad, dilation)
 end
 
+function DepthwiseConv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
+                      activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
+  DepthwiseConv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
+end
+
+"""
+    depthwiseconvfilter(filter::Tuple, in=>out)
+
+Constructs a depthwise convolutional weight array defined by `filter` and channels
+from `in` to `out`.
+
+Accepts the keyword `init` (default: `glorot_uniform`) to control the sampling
+distribution.
+
+See also: [`convfilter`](@ref)
+"""
+depthwiseconvfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
+                    init = glorot_uniform) where N = init(filter..., div(ch[2], ch[1]), ch[1])
+
 function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
-     init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N
+                      init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
+                      weight = depthwiseconvfilter(k, ch, init = init), bias = zeros(ch[2])) where N
   @assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels"
+
   return DepthwiseConv(
-    init(k..., div(ch[2], ch[1]), ch[1]),
-    zeros(ch[2]),
+    weight,
+    bias,
     σ;
     stride = stride,
     pad = pad,
@@ -258,24 +377,30 @@ outdims(l::DepthwiseConv, isize) =
   output_size(DepthwiseConvDims(_paddims(isize, (1, 1, size(l.weight)[end], 1)), size(l.weight); stride = l.stride, padding = l.pad, dilation = l.dilation))
 
 """
-    CrossCor(size, in => out, σ = identity; init = glorot_uniform,
+    CrossCor(filter, in=>out)
+    CrossCor(filter, in=>out, activation)
+    CrossCor(filter, in => out, σ = identity; init = glorot_uniform,
              stride = 1, pad = 0, dilation = 1)
 
-Standard cross convolutional layer. `size` should be a tuple like `(2, 2)`.
+Standard cross convolutional layer. `filter` should be a tuple like `(2, 2)`.
 `in` and `out` specify the number of input and output channels respectively.
 
 Data should be stored in WHCN order (width, height, # channels, batch size).
 In other words, a 100×100 RGB image would be a `100×100×3×1` array,
 and a batch of 50 would be a `100×100×3×50` array.
 
+Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
+Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
 Use `pad=SamePad()` to apply padding so that outputsize == inputsize / stride.
 
 # Examples
 
-Apply a `CrossCor` layer to a 1-channel input using a 2×2 window size, giving us a
+Apply a `CrossCor` layer to a 1-channel input using a 2×2 window filter size, giving us a
 16-channel output. Output is activated with ReLU.
 ```julia
-size = (2,2)
+filter = (2,2)
 in = 1
 out = 16
 CrossCor((2, 2), 1=>16, relu)
@@ -290,18 +415,39 @@ struct CrossCor{N,M,F,A,V}
   dilation::NTuple{N,Int}
 end
 
-function CrossCor(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
-              stride = 1, pad = 0, dilation = 1) where {T,N}
+"""
+    CrossCor(weight::AbstractArray, bias::AbstractArray)
+    CrossCor(weight::AbstractArray, bias::AbstractArray, activation)
+
+Constructs the standard cross convolutional layer with user defined weight and bias
+arrays.
+
+Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
+
+Takes the keyword arguments `pad`, `stride` and `dilation`.
+
+For keyword-only constuctor, see also [`Conv`](@ref)
+"""
+function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
+                  stride = 1, pad = 0, dilation = 1) where {T,N}
   stride = expand(Val(N-2), stride)
   dilation = expand(Val(N-2), dilation)
   pad = calc_padding(pad, size(w)[1:N-2], dilation, stride)
   return CrossCor(σ, w, b, stride, pad, dilation)
 end
 
-CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
-     init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
-  CrossCor(init(k..., ch...), zeros(ch[2]), σ,
+function CrossCor(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
+                      activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
+  CrossCor(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
+end
+
+function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
+                  init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
+                  weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
+
+  CrossCor(weight, bias, σ,
        stride = stride, pad = pad, dilation = dilation)
+end
 
 @functor CrossCor