// focal_loss_layer.hpp
#ifndef CAFFE_FOCAL_LOSS_LAYER_HPP_
#define CAFFE_FOCAL_LOSS_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/loss_layer.hpp"
#include "caffe/layers/sigmoid_layer.hpp"

namespace caffe {
/**
 * @brief Computes the sigmoid focal loss @f$
 *          E = \frac{-1}{N} \sum\limits_{n=1}^N \left[
 *                  \alpha (1 - \hat{p}_n)^\gamma \, p_n \log \hat{p}_n +
 *                  (1 - \alpha) \, \hat{p}_n^\gamma \, (1 - p_n) \log(1 - \hat{p}_n)
 *          \right]
 *        @f$, i.e. the cross-entropy (logistic) loss in which each term is
 *        down-weighted for well-classified examples by a modulating factor
 *        with focusing parameter @f$ \gamma @f$ and re-balanced by the weight
 *        @f$ \alpha @f$ (Lin et al., "Focal Loss for Dense Object Detection").
 *        For @f$ \gamma = 0 @f$ and @f$ \alpha = 0.5 @f$ this reduces, up to a
 *        constant factor, to the ordinary sigmoid cross-entropy loss.
 *
 * This layer is implemented as a single fused layer rather than a separate
 * SigmoidLayer followed by a loss layer because the fused gradient
 * computation is more numerically stable. At test time, this layer can
 * simply be replaced by a SigmoidLayer. An illustrative per-element sketch
 * is given after the class body.
 *
 * @param bottom input Blob vector (length 2)
 *   -# @f$ (N \times C \times H \times W) @f$
 *      the scores @f$ x \in [-\infty, +\infty] @f$,
 *      which this layer maps to probability predictions
 *      @f$ \hat{p}_n = \sigma(x_n) \in [0, 1] @f$
 *      using the sigmoid function @f$ \sigma(.) @f$ (see SigmoidLayer).
 *   -# @f$ (N \times C \times H \times W) @f$
 *      the targets @f$ p_n \in [0, 1] @f$
 * @param top output Blob vector (length 1)
 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
 *      the computed focal loss @f$ E @f$ defined above.
 */
template <typename Dtype>
class FocalLossLayer : public LossLayer<Dtype> {
 public:
  explicit FocalLossLayer(const LayerParameter& param)
      : LossLayer<Dtype>(param),
        sigmoid_output_(new Blob<Dtype>()) {
    // Construct the internal SigmoidLayer from a copy of this layer's
    // parameter with loss_weight cleared, so the internal layer does not
    // carry a loss weight of its own.
    LayerParameter param_t = param;
    param_t.clear_loss_weight();
    sigmoid_layer_.reset(new SigmoidLayer<Dtype>(param_t));
  }
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "FocalLoss"; }
  virtual inline int ExactNumTopBlobs() const { return -1; }
 protected:
  /// @copydoc FocalLossLayer
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  /// Read the normalization mode parameter and compute the normalizer based
  /// on the blob size. If normalization_mode is VALID, the count of valid
  /// outputs will be read from valid_count, unless it is -1 in which case
  /// all outputs are assumed to be valid.
  virtual Dtype get_normalizer(
      LossParameter_NormalizationMode normalization_mode, int valid_count);
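
  // For reference: Caffe's built-in loss layers usually resolve the
  // normalizer roughly as sketched below. This is the common convention,
  // not necessarily this fork's exact implementation.
  //
  //   Dtype normalizer;
  //   switch (normalization_mode) {
  //     case LossParameter_NormalizationMode_FULL:
  //       normalizer = Dtype(outer_num_ * inner_num_); break;
  //     case LossParameter_NormalizationMode_VALID:
  //       normalizer = (valid_count == -1) ?
  //           Dtype(outer_num_ * inner_num_) : Dtype(valid_count); break;
  //     case LossParameter_NormalizationMode_BATCH_SIZE:
  //       normalizer = Dtype(outer_num_); break;
  //     case LossParameter_NormalizationMode_NONE:
  //       normalizer = Dtype(1); break;
  //   }
  //   // Guard against division by zero when no labels are valid.
  //   return std::max(Dtype(1.0), normalizer);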
  /**
   * @brief Computes the focal loss error gradient w.r.t. the predictions.
   *
   * Gradients cannot be computed with respect to the target inputs
   * (bottom[1]), so this method ignores bottom[1] and requires
   * !propagate_down[1], crashing if propagate_down[1] is set.
   * (A reference per-element gradient is sketched after the declarations
   * below.)
   *
   * @param top output Blob vector (length 1), providing the error gradient
   *      with respect to the outputs
   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
   *      This Blob's diff will simply contain the loss_weight
   *      @f$ \lambda_i @f$, as @f$ \lambda_i @f$ is the coefficient of this
   *      layer's output @f$ \ell_i @f$ in the overall Net loss
   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms} @f$; hence
   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
   *      (*Assuming that this top Blob is not used as a bottom (input) by any
   *      other layer of the Net.)
   * @param propagate_down see Layer::Backward.
   *      propagate_down[1] must be false as gradient computation with respect
   *      to the targets is not implemented.
   * @param bottom input Blob vector (length 2)
   *   -# @f$ (N \times C \times H \times W) @f$
   *      the scores @f$ x @f$; Backward fills their diff with the gradient of
   *      the focal loss @f$ \frac{\partial E}{\partial x} @f$, scaled by the
   *      loss weight and the chosen normalizer
   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
   *      the labels -- ignored as we can't compute their error gradients
   */
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
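
  // For reference: assuming the standard sigmoid focal loss of Lin et al.
  // as written in the class comment above, the per-element gradient that
  // Backward would compute (before loss-weight scaling and normalization)
  // is, with p_hat = sigmoid(x):
  //
  //   target 1:  dE/dx = alpha * pow(1 - p_hat, gamma)
  //                        * (gamma * p_hat * log(p_hat) + p_hat - 1)
  //   target 0:  dE/dx = (1 - alpha) * pow(p_hat, gamma)
  //                        * (p_hat - gamma * (1 - p_hat) * log(1 - p_hat))
  //
  // Both expressions reduce to the familiar sigmoid cross-entropy gradient
  // (p_hat - target) when gamma = 0 and the alpha weighting is dropped.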
  /// The internal SigmoidLayer used to map predictions to probabilities.
  shared_ptr<SigmoidLayer<Dtype> > sigmoid_layer_;
  /// sigmoid_output stores the output of the SigmoidLayer.
  shared_ptr<Blob<Dtype> > sigmoid_output_;
  /// bottom vector holder to call the underlying SigmoidLayer::Forward
  vector<Blob<Dtype>*> sigmoid_bottom_vec_;
  /// top vector holder to call the underlying SigmoidLayer::Forward
  vector<Blob<Dtype>*> sigmoid_top_vec_;

  /// Focal-loss parameters: balancing weight alpha, focusing parameter gamma,
  /// and the number of valid (non-ignored) elements in the current batch.
  Dtype alpha_, gamma_, valid_num_;
  /// Helper blob used as an intermediate buffer in the loss and gradient
  /// computation.
  Blob<Dtype> scaler_;
  /// Whether instances with a certain label should be ignored.
  bool has_ignore_label_;
  /// The label indicating that an instance should be ignored.
  int ignore_label_;
  /// How to normalize the loss.
  LossParameter_NormalizationMode normalization_;
  /// The normalizer actually used for the current batch.
  Dtype normalizer_;
  /// outer_num_ = batch size; inner_num_ = spatial size (H * W).
  int outer_num_, inner_num_;
};
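
// Illustrative sketch only (not part of the layer above): the per-element
// focal loss term for a single score/target pair, assuming the standard
// formulation of Lin et al. with the alpha/gamma parameters this layer
// stores. The function name and signature are hypothetical; the real
// Forward_cpu/Forward_gpu additionally handle ignore_label_ and the
// normalization modes declared above.
//
//   template <typename Dtype>
//   Dtype FocalLossElement(Dtype score, Dtype target,
//                          Dtype alpha, Dtype gamma) {
//     const Dtype p = Dtype(1) / (Dtype(1) + std::exp(-score));  // sigmoid
//     if (target == Dtype(1)) {
//       return -alpha * std::pow(Dtype(1) - p, gamma) * std::log(p);
//     }
//     return -(Dtype(1) - alpha) * std::pow(p, gamma) * std::log(Dtype(1) - p);
//   }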
} // namespace caffe
#endif // CAFFE_FOCAL_LOSS_LAYER_HPP_