-
Notifications
You must be signed in to change notification settings - Fork 0
/
cgroup.c
4841 lines (4278 loc) · 126 KB
/
cgroup.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Generic process-grouping system.
*
* Based originally on the cpuset system, extracted by Paul Menage
* Copyright (C) 2006 Google, Inc
*
* Notifications support
* Copyright (C) 2009 Nokia Corporation
* Author: Kirill A. Shutemov
*
* Copyright notices from the original cpuset code:
* --------------------------------------------------
* Copyright (C) 2003 BULL SA.
* Copyright (C) 2004-2006 Silicon Graphics, Inc.
*
* Portions derived from Patrick Mochel's sysfs code.
* sysfs is Copyright (c) 2001-3 Patrick Mochel
*
* 2003-10-10 Written by Simon Derr.
* 2003-10-22 Updates by Stephen Hemminger.
* 2004 May-July Rework by Paul Jackson.
* ---------------------------------------------------
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of the Linux
* distribution for more details.
*/
#include <linux/cgroup.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/backing-dev.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <linux/hash.h>
#include <linux/namei.h>
#include <linux/smp_lock.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <asm/atomic.h>
static DEFINE_MUTEX(cgroup_mutex);
/*
* Generate an array of cgroup subsystem pointers. At boot time, this is
* populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
* registered after that. The mutable section of this array is protected by
* cgroup_mutex.
*/
#define SUBSYS(_x) &_x ## _subsys,
static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
#include <linux/cgroup_subsys.h>
};
#define MAX_CGROUP_ROOT_NAMELEN 64
/*
* A cgroupfs_root represents the root of a cgroup hierarchy,
* and may be associated with a superblock to form an active
* hierarchy
*/
struct cgroupfs_root {
struct super_block *sb;
/*
* The bitmask of subsystems intended to be attached to this
* hierarchy
*/
unsigned long subsys_bits;
/* Unique id for this hierarchy. */
int hierarchy_id;
/* The bitmask of subsystems currently attached to this hierarchy */
unsigned long actual_subsys_bits;
/* A list running through the attached subsystems */
struct list_head subsys_list;
/* The root cgroup for this hierarchy */
struct cgroup top_cgroup;
/* Tracks how many cgroups are currently defined in hierarchy.*/
int number_of_cgroups;
/* A list running through the active hierarchies */
struct list_head root_list;
/* Hierarchy-specific flags */
unsigned long flags;
/* The path to use for release notifications. */
char release_agent_path[PATH_MAX];
/* The name for this hierarchy - may be empty */
char name[MAX_CGROUP_ROOT_NAMELEN];
};
/*
* The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
* subsystems that are otherwise unattached - it never has more than a
* single cgroup, and all tasks are part of that cgroup.
*/
static struct cgroupfs_root rootnode;
/*
* CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
* cgroup_subsys->use_id != 0.
*/
#define CSS_ID_MAX (65535)
struct css_id {
/*
* The css to which this ID points. This pointer is set to valid value
* after cgroup is populated. If cgroup is removed, this will be NULL.
* This pointer is expected to be RCU-safe because destroy()
* is called after synchronize_rcu(). But for safe use, css_is_removed()
* css_tryget() should be used for avoiding race.
*/
struct cgroup_subsys_state *css;
/*
* ID of this css.
*/
unsigned short id;
/*
* Depth in hierarchy which this ID belongs to.
*/
unsigned short depth;
/*
* ID is freed by RCU. (and lookup routine is RCU safe.)
*/
struct rcu_head rcu_head;
/*
* Hierarchy of CSS ID belongs to.
*/
unsigned short stack[0]; /* Array of Length (depth+1) */
};
/*
* cgroup_event represents events which userspace want to recieve.
*/
struct cgroup_event {
/*
* Cgroup which the event belongs to.
*/
struct cgroup *cgrp;
/*
* Control file which the event associated.
*/
struct cftype *cft;
/*
* eventfd to signal userspace about the event.
*/
struct eventfd_ctx *eventfd;
/*
* Each of these stored in a list by the cgroup.
*/
struct list_head list;
/*
* All fields below needed to unregister event when
* userspace closes eventfd.
*/
poll_table pt;
wait_queue_head_t *wqh;
wait_queue_t wait;
struct work_struct remove;
};
/* The list of hierarchy roots */
static LIST_HEAD(roots);
static int root_count;
static DEFINE_IDA(hierarchy_ida);
static int next_hierarchy_id;
static DEFINE_SPINLOCK(hierarchy_id_lock);
/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
#define dummytop (&rootnode.top_cgroup)
/* This flag indicates whether tasks in the fork and exit paths should
* check for fork/exit handlers to call. This avoids us having to do
* extra work in the fork/exit path if none of the subsystems need to
* be called.
*/
static int need_forkexit_callback __read_mostly;
#ifdef CONFIG_PROVE_LOCKING
int cgroup_lock_is_held(void)
{
return lockdep_is_held(&cgroup_mutex);
}
#else /* #ifdef CONFIG_PROVE_LOCKING */
int cgroup_lock_is_held(void)
{
return mutex_is_locked(&cgroup_mutex);
}
#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
/* convenient tests for these bits */
inline int cgroup_is_removed(const struct cgroup *cgrp)
{
return test_bit(CGRP_REMOVED, &cgrp->flags);
}
/* bits in struct cgroupfs_root flags field */
enum {
ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
};
static int cgroup_is_releasable(const struct cgroup *cgrp)
{
const int bits =
(1 << CGRP_RELEASABLE) |
(1 << CGRP_NOTIFY_ON_RELEASE);
return (cgrp->flags & bits) == bits;
}
static int notify_on_release(const struct cgroup *cgrp)
{
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
/*
* for_each_subsys() allows you to iterate on each subsystem attached to
* an active hierarchy
*/
#define for_each_subsys(_root, _ss) \
list_for_each_entry(_ss, &_root->subsys_list, sibling)
/* for_each_active_root() allows you to iterate across the active hierarchies */
#define for_each_active_root(_root) \
list_for_each_entry(_root, &roots, root_list)
/* the list of cgroups eligible for automatic release. Protected by
* release_list_lock */
static LIST_HEAD(release_list);
static DEFINE_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
/* Link structure for associating css_set objects with cgroups */
struct cg_cgroup_link {
/*
* List running through cg_cgroup_links associated with a
* cgroup, anchored on cgroup->css_sets
*/
struct list_head cgrp_link_list;
struct cgroup *cgrp;
/*
* List running through cg_cgroup_links pointing at a
* single css_set object, anchored on css_set->cg_links
*/
struct list_head cg_link_list;
struct css_set *cg;
};
/* The default css_set - used by init and its children prior to any
* hierarchies being mounted. It contains a pointer to the root state
* for each subsystem. Also used to anchor the list of css_sets. Not
* reference-counted, to improve performance when child cgroups
* haven't been created.
*/
static struct css_set init_css_set;
static struct cg_cgroup_link init_css_set_link;
static int cgroup_init_idr(struct cgroup_subsys *ss,
struct cgroup_subsys_state *css);
/* css_set_lock protects the list of css_set objects, and the
* chain of tasks off each css_set. Nests outside task->alloc_lock
* due to cgroup_iter_start() */
static DEFINE_RWLOCK(css_set_lock);
static int css_set_count;
/*
* hash table for cgroup groups. This improves the performance to find
* an existing css_set. This hash doesn't (currently) take into
* account cgroups in empty hierarchies.
*/
#define CSS_SET_HASH_BITS 7
#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
{
int i;
int index;
unsigned long tmp = 0UL;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
tmp += (unsigned long)css[i];
tmp = (tmp >> 16) ^ tmp;
index = hash_long(tmp, CSS_SET_HASH_BITS);
return &css_set_table[index];
}
static void free_css_set_rcu(struct rcu_head *obj)
{
struct css_set *cg = container_of(obj, struct css_set, rcu_head);
kfree(cg);
}
/* We don't maintain the lists running through each css_set to its
* task until after the first call to cgroup_iter_start(). This
* reduces the fork()/exit() overhead for people who have cgroups
* compiled into their kernel but not actually in use */
static int use_task_css_set_links __read_mostly;
static void __put_css_set(struct css_set *cg, int taskexit)
{
struct cg_cgroup_link *link;
struct cg_cgroup_link *saved_link;
/*
* Ensure that the refcount doesn't hit zero while any readers
* can see it. Similar to atomic_dec_and_lock(), but for an
* rwlock
*/
if (atomic_add_unless(&cg->refcount, -1, 1))
return;
write_lock(&css_set_lock);
if (!atomic_dec_and_test(&cg->refcount)) {
write_unlock(&css_set_lock);
return;
}
/* This css_set is dead. unlink it and release cgroup refcounts */
hlist_del(&cg->hlist);
css_set_count--;
list_for_each_entry_safe(link, saved_link, &cg->cg_links,
cg_link_list) {
struct cgroup *cgrp = link->cgrp;
list_del(&link->cg_link_list);
list_del(&link->cgrp_link_list);
if (atomic_dec_and_test(&cgrp->count) &&
notify_on_release(cgrp)) {
if (taskexit)
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
kfree(link);
}
write_unlock(&css_set_lock);
call_rcu(&cg->rcu_head, free_css_set_rcu);
}
/*
* refcounted get/put for css_set objects
*/
static inline void get_css_set(struct css_set *cg)
{
atomic_inc(&cg->refcount);
}
static inline void put_css_set(struct css_set *cg)
{
__put_css_set(cg, 0);
}
static inline void put_css_set_taskexit(struct css_set *cg)
{
__put_css_set(cg, 1);
}
/*
* compare_css_sets - helper function for find_existing_css_set().
* @cg: candidate css_set being tested
* @old_cg: existing css_set for a task
* @new_cgrp: cgroup that's being entered by the task
* @template: desired set of css pointers in css_set (pre-calculated)
*
* Returns true if "cg" matches "old_cg" except for the hierarchy
* which "new_cgrp" belongs to, for which it should match "new_cgrp".
*/
static bool compare_css_sets(struct css_set *cg,
struct css_set *old_cg,
struct cgroup *new_cgrp,
struct cgroup_subsys_state *template[])
{
struct list_head *l1, *l2;
if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
/* Not all subsystems matched */
return false;
}
/*
* Compare cgroup pointers in order to distinguish between
* different cgroups in heirarchies with no subsystems. We
* could get by with just this check alone (and skip the
* memcmp above) but on most setups the memcmp check will
* avoid the need for this more expensive check on almost all
* candidates.
*/
l1 = &cg->cg_links;
l2 = &old_cg->cg_links;
while (1) {
struct cg_cgroup_link *cgl1, *cgl2;
struct cgroup *cg1, *cg2;
l1 = l1->next;
l2 = l2->next;
/* See if we reached the end - both lists are equal length. */
if (l1 == &cg->cg_links) {
BUG_ON(l2 != &old_cg->cg_links);
break;
} else {
BUG_ON(l2 == &old_cg->cg_links);
}
/* Locate the cgroups associated with these links. */
cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
cg1 = cgl1->cgrp;
cg2 = cgl2->cgrp;
/* Hierarchies should be linked in the same order. */
BUG_ON(cg1->root != cg2->root);
/*
* If this hierarchy is the hierarchy of the cgroup
* that's changing, then we need to check that this
* css_set points to the new cgroup; if it's any other
* hierarchy, then this css_set should point to the
* same cgroup as the old css_set.
*/
if (cg1->root == new_cgrp->root) {
if (cg1 != new_cgrp)
return false;
} else {
if (cg1 != cg2)
return false;
}
}
return true;
}
/*
* find_existing_css_set() is a helper for
* find_css_set(), and checks to see whether an existing
* css_set is suitable.
*
* oldcg: the cgroup group that we're using before the cgroup
* transition
*
* cgrp: the cgroup that we're moving into
*
* template: location in which to build the desired set of subsystem
* state objects for the new cgroup group
*/
static struct css_set *find_existing_css_set(
struct css_set *oldcg,
struct cgroup *cgrp,
struct cgroup_subsys_state *template[])
{
int i;
struct cgroupfs_root *root = cgrp->root;
struct hlist_head *hhead;
struct hlist_node *node;
struct css_set *cg;
/*
* Build the set of subsystem state objects that we want to see in the
* new css_set. while subsystems can change globally, the entries here
* won't change, so no need for locking.
*/
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
if (root->subsys_bits & (1UL << i)) {
/* Subsystem is in this hierarchy. So we want
* the subsystem state from the new
* cgroup */
template[i] = cgrp->subsys[i];
} else {
/* Subsystem is not in this hierarchy, so we
* don't want to change the subsystem state */
template[i] = oldcg->subsys[i];
}
}
hhead = css_set_hash(template);
hlist_for_each_entry(cg, node, hhead, hlist) {
if (!compare_css_sets(cg, oldcg, cgrp, template))
continue;
/* This css_set matches what we need */
return cg;
}
/* No existing cgroup group matched */
return NULL;
}
static void free_cg_links(struct list_head *tmp)
{
struct cg_cgroup_link *link;
struct cg_cgroup_link *saved_link;
list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
list_del(&link->cgrp_link_list);
kfree(link);
}
}
/*
* allocate_cg_links() allocates "count" cg_cgroup_link structures
* and chains them on tmp through their cgrp_link_list fields. Returns 0 on
* success or a negative error
*/
static int allocate_cg_links(int count, struct list_head *tmp)
{
struct cg_cgroup_link *link;
int i;
INIT_LIST_HEAD(tmp);
for (i = 0; i < count; i++) {
link = kmalloc(sizeof(*link), GFP_KERNEL);
if (!link) {
free_cg_links(tmp);
return -ENOMEM;
}
list_add(&link->cgrp_link_list, tmp);
}
return 0;
}
/**
* link_css_set - a helper function to link a css_set to a cgroup
* @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
* @cg: the css_set to be linked
* @cgrp: the destination cgroup
*/
static void link_css_set(struct list_head *tmp_cg_links,
struct css_set *cg, struct cgroup *cgrp)
{
struct cg_cgroup_link *link;
BUG_ON(list_empty(tmp_cg_links));
link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
cgrp_link_list);
link->cg = cg;
link->cgrp = cgrp;
atomic_inc(&cgrp->count);
list_move(&link->cgrp_link_list, &cgrp->css_sets);
/*
* Always add links to the tail of the list so that the list
* is sorted by order of hierarchy creation
*/
list_add_tail(&link->cg_link_list, &cg->cg_links);
}
/*
* find_css_set() takes an existing cgroup group and a
* cgroup object, and returns a css_set object that's
* equivalent to the old group, but with the given cgroup
* substituted into the appropriate hierarchy. Must be called with
* cgroup_mutex held
*/
static struct css_set *find_css_set(
struct css_set *oldcg, struct cgroup *cgrp)
{
struct css_set *res;
struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
struct list_head tmp_cg_links;
struct hlist_head *hhead;
struct cg_cgroup_link *link;
/* First see if we already have a cgroup group that matches
* the desired set */
read_lock(&css_set_lock);
res = find_existing_css_set(oldcg, cgrp, template);
if (res)
get_css_set(res);
read_unlock(&css_set_lock);
if (res)
return res;
res = kmalloc(sizeof(*res), GFP_KERNEL);
if (!res)
return NULL;
/* Allocate all the cg_cgroup_link objects that we'll need */
if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
kfree(res);
return NULL;
}
atomic_set(&res->refcount, 1);
INIT_LIST_HEAD(&res->cg_links);
INIT_LIST_HEAD(&res->tasks);
INIT_HLIST_NODE(&res->hlist);
/* Copy the set of subsystem state objects generated in
* find_existing_css_set() */
memcpy(res->subsys, template, sizeof(res->subsys));
write_lock(&css_set_lock);
/* Add reference counts and links from the new css_set. */
list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
struct cgroup *c = link->cgrp;
if (c->root == cgrp->root)
c = cgrp;
link_css_set(&tmp_cg_links, res, c);
}
BUG_ON(!list_empty(&tmp_cg_links));
css_set_count++;
/* Add this cgroup group to the hash table */
hhead = css_set_hash(res->subsys);
hlist_add_head(&res->hlist, hhead);
write_unlock(&css_set_lock);
return res;
}
/*
* Return the cgroup for "task" from the given hierarchy. Must be
* called with cgroup_mutex held.
*/
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct cgroupfs_root *root)
{
struct css_set *css;
struct cgroup *res = NULL;
BUG_ON(!mutex_is_locked(&cgroup_mutex));
read_lock(&css_set_lock);
/*
* No need to lock the task - since we hold cgroup_mutex the
* task can't change groups, so the only thing that can happen
* is that it exits and its css is set back to init_css_set.
*/
css = task->cgroups;
if (css == &init_css_set) {
res = &root->top_cgroup;
} else {
struct cg_cgroup_link *link;
list_for_each_entry(link, &css->cg_links, cg_link_list) {
struct cgroup *c = link->cgrp;
if (c->root == root) {
res = c;
break;
}
}
}
read_unlock(&css_set_lock);
BUG_ON(!res);
return res;
}
/*
* There is one global cgroup mutex. We also require taking
* task_lock() when dereferencing a task's cgroup subsys pointers.
* See "The task_lock() exception", at the end of this comment.
*
* A task must hold cgroup_mutex to modify cgroups.
*
* Any task can increment and decrement the count field without lock.
* So in general, code holding cgroup_mutex can't rely on the count
* field not changing. However, if the count goes to zero, then only
* cgroup_attach_task() can increment it again. Because a count of zero
* means that no tasks are currently attached, therefore there is no
* way a task attached to that cgroup can fork (the other way to
* increment the count). So code holding cgroup_mutex can safely
* assume that if the count is zero, it will stay zero. Similarly, if
* a task holds cgroup_mutex on a cgroup with zero count, it
* knows that the cgroup won't be removed, as cgroup_rmdir()
* needs that mutex.
*
* The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
* (usually) take cgroup_mutex. These are the two most performance
* critical pieces of code here. The exception occurs on cgroup_exit(),
* when a task in a notify_on_release cgroup exits. Then cgroup_mutex
* is taken, and if the cgroup count is zero, a usermode call made
* to the release agent with the name of the cgroup (path relative to
* the root of cgroup file system) as the argument.
*
* A cgroup can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cgroups is empty. Since all
* tasks in the system use _some_ cgroup, and since there is always at
* least one task in the system (init, pid == 1), therefore, top_cgroup
* always has either children cgroups and/or using tasks. So we don't
* need a special hack to ensure that top_cgroup cannot be deleted.
*
* The task_lock() exception
*
* The need for this exception arises from the action of
* cgroup_attach_task(), which overwrites one tasks cgroup pointer with
* another. It does so using cgroup_mutex, however there are
* several performance critical places that need to reference
* task->cgroup without the expense of grabbing a system global
* mutex. Therefore except as noted below, when dereferencing or, as
* in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
* task_lock(), which acts on a spinlock (task->alloc_lock) already in
* the task_struct routinely used for such matters.
*
* P.S. One more locking exception. RCU is used to guard the
* update of a tasks cgroup pointer by cgroup_attach_task()
*/
/**
* cgroup_lock - lock out any changes to cgroup structures
*
*/
void cgroup_lock(void)
{
mutex_lock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_lock);
/**
* cgroup_unlock - release lock on cgroup changes
*
* Undo the lock taken in a previous cgroup_lock() call.
*/
void cgroup_unlock(void)
{
mutex_unlock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_unlock);
/*
* A couple of forward declarations required, due to cyclic reference loop:
* cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
* cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
* -> cgroup_mkdir.
*/
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
static int cgroup_populate_dir(struct cgroup *cgrp);
static const struct inode_operations cgroup_dir_inode_operations;
static const struct file_operations proc_cgroupstats_operations;
static struct backing_dev_info cgroup_backing_dev_info = {
.name = "cgroup",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
static int alloc_css_id(struct cgroup_subsys *ss,
struct cgroup *parent, struct cgroup *child);
static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
{
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
}
return inode;
}
/*
* Call subsys's pre_destroy handler.
* This is called before css refcnt check.
*/
static int cgroup_call_pre_destroy(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
int ret = 0;
for_each_subsys(cgrp->root, ss)
if (ss->pre_destroy) {
ret = ss->pre_destroy(ss, cgrp);
if (ret)
break;
}
return ret;
}
static void free_cgroup_rcu(struct rcu_head *obj)
{
struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
kfree(cgrp);
}
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
if (S_ISDIR(inode->i_mode)) {
struct cgroup *cgrp = dentry->d_fsdata;
struct cgroup_subsys *ss;
BUG_ON(!(cgroup_is_removed(cgrp)));
/* It's possible for external users to be holding css
* reference counts on a cgroup; css_put() needs to
* be able to access the cgroup after decrementing
* the reference count in order to know if it needs to
* queue the cgroup to be handled by the release
* agent */
synchronize_rcu();
mutex_lock(&cgroup_mutex);
/*
* Release the subsystem state objects.
*/
for_each_subsys(cgrp->root, ss)
ss->destroy(ss, cgrp);
cgrp->root->number_of_cgroups--;
mutex_unlock(&cgroup_mutex);
/*
* Drop the active superblock reference that we took when we
* created the cgroup
*/
deactivate_super(cgrp->root->sb);
/*
* if we're getting rid of the cgroup, refcount should ensure
* that there are no pidlists left.
*/
BUG_ON(!list_empty(&cgrp->pidlists));
call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
}
iput(inode);
}
static void remove_dir(struct dentry *d)
{
struct dentry *parent = dget(d->d_parent);
d_delete(d);
simple_rmdir(parent->d_inode, d);
dput(parent);
}
static void cgroup_clear_directory(struct dentry *dentry)
{
struct list_head *node;
BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
spin_lock(&dcache_lock);
node = dentry->d_subdirs.next;
while (node != &dentry->d_subdirs) {
struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
list_del_init(node);
if (d->d_inode) {
/* This should never be called on a cgroup
* directory with child cgroups */
BUG_ON(d->d_inode->i_mode & S_IFDIR);
d = dget_locked(d);
spin_unlock(&dcache_lock);
d_delete(d);
simple_unlink(dentry->d_inode, d);
dput(d);
spin_lock(&dcache_lock);
}
node = dentry->d_subdirs.next;
}
spin_unlock(&dcache_lock);
}
/*
* NOTE : the dentry must have been dget()'ed
*/
static void cgroup_d_remove_dir(struct dentry *dentry)
{
cgroup_clear_directory(dentry);
spin_lock(&dcache_lock);
list_del_init(&dentry->d_u.d_child);
spin_unlock(&dcache_lock);
remove_dir(dentry);
}
/*
* A queue for waiters to do rmdir() cgroup. A tasks will sleep when
* cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
* reference to css->refcnt. In general, this refcnt is expected to goes down
* to zero, soon.
*
* CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
*/
DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
{
if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
wake_up_all(&cgroup_rmdir_waitq);
}
void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
{
css_get(css);
}
void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
{
cgroup_wakeup_rmdir_waiter(css->cgroup);
css_put(css);
}
/*
* Call with cgroup_mutex held. Drops reference counts on modules, including
* any duplicate ones that parse_cgroupfs_options took. If this function
* returns an error, no reference counts are touched.
*/
static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long final_bits)
{
unsigned long added_bits, removed_bits;
struct cgroup *cgrp = &root->top_cgroup;
int i;
BUG_ON(!mutex_is_locked(&cgroup_mutex));
removed_bits = root->actual_subsys_bits & ~final_bits;
added_bits = final_bits & ~root->actual_subsys_bits;
/* Check that any added subsystems are currently free */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
unsigned long bit = 1UL << i;
struct cgroup_subsys *ss = subsys[i];
if (!(bit & added_bits))
continue;
/*
* Nobody should tell us to do a subsys that doesn't exist:
* parse_cgroupfs_options should catch that case and refcounts
* ensure that subsystems won't disappear once selected.
*/
BUG_ON(ss == NULL);
if (ss->root != &rootnode) {
/* Subsystem isn't free */
return -EBUSY;
}
}
/* Currently we don't handle adding/removing subsystems when
* any child cgroups exist. This is theoretically supportable
* but involves complex error handling, so it's being left until
* later */
if (root->number_of_cgroups > 1)
return -EBUSY;
/* Process each subsystem */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
unsigned long bit = 1UL << i;
if (bit & added_bits) {
/* We're binding this subsystem to this hierarchy */
BUG_ON(ss == NULL);
BUG_ON(cgrp->subsys[i]);
BUG_ON(!dummytop->subsys[i]);
BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
mutex_lock(&ss->hierarchy_mutex);
cgrp->subsys[i] = dummytop->subsys[i];
cgrp->subsys[i]->cgroup = cgrp;
list_move(&ss->sibling, &root->subsys_list);
ss->root = root;
if (ss->bind)
ss->bind(ss, cgrp);
mutex_unlock(&ss->hierarchy_mutex);
/* refcount was already taken, and we're keeping it */
} else if (bit & removed_bits) {
/* We're removing this subsystem */
BUG_ON(ss == NULL);
BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
mutex_lock(&ss->hierarchy_mutex);
if (ss->bind)
ss->bind(ss, dummytop);