1 /*
2  *
3  * Copyright IBM Corporation, 2012
4  * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms of version 2.1 of the GNU Lesser General Public License
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it would be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13  *
14  */
15 
16 #include <linux/cgroup.h>
17 #include <linux/page_counter.h>
18 #include <linux/slab.h>
19 #include <linux/hugetlb.h>
20 #include <linux/hugetlb_cgroup.h>
21 
22 struct hugetlb_cgroup {
23 	struct cgroup_subsys_state css;
24 	/*
25 	 * the counter to account for hugepages from hugetlb.
26 	 */
27 	struct page_counter hugepage[HUGE_MAX_HSTATE];
28 };
29 
/*
 * cftype->private packs two 16-bit fields: the hstate index in the upper
 * half and the RES_* attribute in the lower half.
 */
#define MEMFILE_PRIVATE(idx, attr)	(((idx) << 16) | (attr))
#define MEMFILE_IDX(priv)		(((priv) >> 16) & 0xffff)
#define MEMFILE_ATTR(priv)		((priv) & 0xffff)
33 
34 static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
35 
36 static inline
hugetlb_cgroup_from_css(struct cgroup_subsys_state * s)37 struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
38 {
39 	return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
40 }
41 
42 static inline
hugetlb_cgroup_from_task(struct task_struct * task)43 struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
44 {
45 	return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
46 }
47 
hugetlb_cgroup_is_root(struct hugetlb_cgroup * h_cg)48 static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
49 {
50 	return (h_cg == root_h_cgroup);
51 }
52 
53 static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup * h_cg)54 parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
55 {
56 	return hugetlb_cgroup_from_css(h_cg->css.parent);
57 }
58 
hugetlb_cgroup_have_usage(struct hugetlb_cgroup * h_cg)59 static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
60 {
61 	int idx;
62 
63 	for (idx = 0; idx < hugetlb_max_hstate; idx++) {
64 		if (page_counter_read(&h_cg->hugepage[idx]))
65 			return true;
66 	}
67 	return false;
68 }
69 
hugetlb_cgroup_init(struct hugetlb_cgroup * h_cgroup,struct hugetlb_cgroup * parent_h_cgroup)70 static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
71 				struct hugetlb_cgroup *parent_h_cgroup)
72 {
73 	int idx;
74 
75 	for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
76 		struct page_counter *counter = &h_cgroup->hugepage[idx];
77 		struct page_counter *parent = NULL;
78 		unsigned long limit;
79 		int ret;
80 
81 		if (parent_h_cgroup)
82 			parent = &parent_h_cgroup->hugepage[idx];
83 		page_counter_init(counter, parent);
84 
85 		limit = round_down(PAGE_COUNTER_MAX,
86 				   1 << huge_page_order(&hstates[idx]));
87 		ret = page_counter_set_max(counter, limit);
88 		VM_BUG_ON(ret);
89 	}
90 }
91 
92 static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state * parent_css)93 hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
94 {
95 	struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
96 	struct hugetlb_cgroup *h_cgroup;
97 
98 	h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
99 	if (!h_cgroup)
100 		return ERR_PTR(-ENOMEM);
101 
102 	if (!parent_h_cgroup)
103 		root_h_cgroup = h_cgroup;
104 
105 	hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
106 	return &h_cgroup->css;
107 }
108 
/* css_free callback: release the hugetlb_cgroup that embeds @css. */
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
	kfree(hugetlb_cgroup_from_css(css));
}
116 
117 
118 /*
119  * Should be called with hugetlb_lock held.
120  * Since we are holding hugetlb_lock, pages cannot get moved from
121  * active list or uncharged from the cgroup, So no need to get
122  * page reference and test for page active here. This function
123  * cannot fail.
124  */
hugetlb_cgroup_move_parent(int idx,struct hugetlb_cgroup * h_cg,struct page * page)125 static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
126 				       struct page *page)
127 {
128 	unsigned int nr_pages;
129 	struct page_counter *counter;
130 	struct hugetlb_cgroup *page_hcg;
131 	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
132 
133 	page_hcg = hugetlb_cgroup_from_page(page);
134 	/*
135 	 * We can have pages in active list without any cgroup
136 	 * ie, hugepage with less than 3 pages. We can safely
137 	 * ignore those pages.
138 	 */
139 	if (!page_hcg || page_hcg != h_cg)
140 		goto out;
141 
142 	nr_pages = 1 << compound_order(page);
143 	if (!parent) {
144 		parent = root_h_cgroup;
145 		/* root has no limit */
146 		page_counter_charge(&parent->hugepage[idx], nr_pages);
147 	}
148 	counter = &h_cg->hugepage[idx];
149 	/* Take the pages off the local counter */
150 	page_counter_cancel(counter, nr_pages);
151 
152 	set_hugetlb_cgroup(page, parent);
153 out:
154 	return;
155 }
156 
157 /*
158  * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
159  * the parent cgroup.
160  */
hugetlb_cgroup_css_offline(struct cgroup_subsys_state * css)161 static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
162 {
163 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
164 	struct hstate *h;
165 	struct page *page;
166 	int idx = 0;
167 
168 	do {
169 		for_each_hstate(h) {
170 			spin_lock(&hugetlb_lock);
171 			list_for_each_entry(page, &h->hugepage_activelist, lru)
172 				hugetlb_cgroup_move_parent(idx, h_cg, page);
173 
174 			spin_unlock(&hugetlb_lock);
175 			idx++;
176 		}
177 		cond_resched();
178 	} while (hugetlb_cgroup_have_usage(h_cg));
179 }
180 
hugetlb_cgroup_charge_cgroup(int idx,unsigned long nr_pages,struct hugetlb_cgroup ** ptr)181 int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
182 				 struct hugetlb_cgroup **ptr)
183 {
184 	int ret = 0;
185 	struct page_counter *counter;
186 	struct hugetlb_cgroup *h_cg = NULL;
187 
188 	if (hugetlb_cgroup_disabled())
189 		goto done;
190 	/*
191 	 * We don't charge any cgroup if the compound page have less
192 	 * than 3 pages.
193 	 */
194 	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
195 		goto done;
196 again:
197 	rcu_read_lock();
198 	h_cg = hugetlb_cgroup_from_task(current);
199 	if (!css_tryget_online(&h_cg->css)) {
200 		rcu_read_unlock();
201 		goto again;
202 	}
203 	rcu_read_unlock();
204 
205 	if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter))
206 		ret = -ENOMEM;
207 	css_put(&h_cg->css);
208 done:
209 	*ptr = h_cg;
210 	return ret;
211 }
212 
/*
 * Record @h_cg as the owner of @page. Does nothing when the controller is
 * disabled or @h_cg is NULL.
 *
 * Should be called with hugetlb_lock held.
 */
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
				  struct hugetlb_cgroup *h_cg,
				  struct page *page)
{
	if (!hugetlb_cgroup_disabled() && h_cg)
		set_hugetlb_cgroup(page, h_cg);
}
224 
225 /*
226  * Should be called with hugetlb_lock held
227  */
hugetlb_cgroup_uncharge_page(int idx,unsigned long nr_pages,struct page * page)228 void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
229 				  struct page *page)
230 {
231 	struct hugetlb_cgroup *h_cg;
232 
233 	if (hugetlb_cgroup_disabled())
234 		return;
235 	lockdep_assert_held(&hugetlb_lock);
236 	h_cg = hugetlb_cgroup_from_page(page);
237 	if (unlikely(!h_cg))
238 		return;
239 	set_hugetlb_cgroup(page, NULL);
240 	page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
241 	return;
242 }
243 
hugetlb_cgroup_uncharge_cgroup(int idx,unsigned long nr_pages,struct hugetlb_cgroup * h_cg)244 void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
245 				    struct hugetlb_cgroup *h_cg)
246 {
247 	if (hugetlb_cgroup_disabled() || !h_cg)
248 		return;
249 
250 	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
251 		return;
252 
253 	page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
254 	return;
255 }
256 
/* Attribute selector stored in the low half of cftype->private. */
enum {
	RES_USAGE,	/* current counter reading */
	RES_LIMIT,	/* counter maximum */
	RES_MAX_USAGE,	/* counter watermark */
	RES_FAILCNT,	/* counter failcnt */
};
263 
hugetlb_cgroup_read_u64(struct cgroup_subsys_state * css,struct cftype * cft)264 static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
265 				   struct cftype *cft)
266 {
267 	struct page_counter *counter;
268 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
269 
270 	counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
271 
272 	switch (MEMFILE_ATTR(cft->private)) {
273 	case RES_USAGE:
274 		return (u64)page_counter_read(counter) * PAGE_SIZE;
275 	case RES_LIMIT:
276 		return (u64)counter->max * PAGE_SIZE;
277 	case RES_MAX_USAGE:
278 		return (u64)counter->watermark * PAGE_SIZE;
279 	case RES_FAILCNT:
280 		return counter->failcnt;
281 	default:
282 		BUG();
283 	}
284 }
285 
/* Held around page_counter_set_max() in hugetlb_cgroup_write(). */
static DEFINE_MUTEX(hugetlb_limit_mutex);
287 
hugetlb_cgroup_write(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off)288 static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
289 				    char *buf, size_t nbytes, loff_t off)
290 {
291 	int ret, idx;
292 	unsigned long nr_pages;
293 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
294 
295 	if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
296 		return -EINVAL;
297 
298 	buf = strstrip(buf);
299 	ret = page_counter_memparse(buf, "-1", &nr_pages);
300 	if (ret)
301 		return ret;
302 
303 	idx = MEMFILE_IDX(of_cft(of)->private);
304 	nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx]));
305 
306 	switch (MEMFILE_ATTR(of_cft(of)->private)) {
307 	case RES_LIMIT:
308 		mutex_lock(&hugetlb_limit_mutex);
309 		ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages);
310 		mutex_unlock(&hugetlb_limit_mutex);
311 		break;
312 	default:
313 		ret = -EINVAL;
314 		break;
315 	}
316 	return ret ?: nbytes;
317 }
318 
hugetlb_cgroup_reset(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off)319 static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
320 				    char *buf, size_t nbytes, loff_t off)
321 {
322 	int ret = 0;
323 	struct page_counter *counter;
324 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
325 
326 	counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
327 
328 	switch (MEMFILE_ATTR(of_cft(of)->private)) {
329 	case RES_MAX_USAGE:
330 		page_counter_reset_watermark(counter);
331 		break;
332 	case RES_FAILCNT:
333 		counter->failcnt = 0;
334 		break;
335 	default:
336 		ret = -EINVAL;
337 		break;
338 	}
339 	return ret ?: nbytes;
340 }
341 
/*
 * Format the byte count @hsize into @buf (at most @size bytes) as a whole
 * number of GB, MB or KB, using the largest unit that @hsize reaches.
 * Values below 1 MB are always printed in KB. Returns @buf.
 */
static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
	if (hsize >= (1UL << 30)) {
		snprintf(buf, size, "%luGB", hsize >> 30);
		return buf;
	}

	if (hsize >= (1UL << 20)) {
		snprintf(buf, size, "%luMB", hsize >> 20);
		return buf;
	}

	snprintf(buf, size, "%luKB", hsize >> 10);
	return buf;
}
352 
__hugetlb_cgroup_file_init(int idx)353 static void __init __hugetlb_cgroup_file_init(int idx)
354 {
355 	char buf[32];
356 	struct cftype *cft;
357 	struct hstate *h = &hstates[idx];
358 
359 	/* format the size */
360 	mem_fmt(buf, 32, huge_page_size(h));
361 
362 	/* Add the limit file */
363 	cft = &h->cgroup_files[0];
364 	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
365 	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
366 	cft->read_u64 = hugetlb_cgroup_read_u64;
367 	cft->write = hugetlb_cgroup_write;
368 
369 	/* Add the usage file */
370 	cft = &h->cgroup_files[1];
371 	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
372 	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
373 	cft->read_u64 = hugetlb_cgroup_read_u64;
374 
375 	/* Add the MAX usage file */
376 	cft = &h->cgroup_files[2];
377 	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
378 	cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
379 	cft->write = hugetlb_cgroup_reset;
380 	cft->read_u64 = hugetlb_cgroup_read_u64;
381 
382 	/* Add the failcntfile */
383 	cft = &h->cgroup_files[3];
384 	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
385 	cft->private  = MEMFILE_PRIVATE(idx, RES_FAILCNT);
386 	cft->write = hugetlb_cgroup_reset;
387 	cft->read_u64 = hugetlb_cgroup_read_u64;
388 
389 	/* NULL terminate the last cft */
390 	cft = &h->cgroup_files[4];
391 	memset(cft, 0, sizeof(*cft));
392 
393 	WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
394 					  h->cgroup_files));
395 }
396 
hugetlb_cgroup_file_init(void)397 void __init hugetlb_cgroup_file_init(void)
398 {
399 	struct hstate *h;
400 
401 	for_each_hstate(h) {
402 		/*
403 		 * Add cgroup control files only if the huge page consists
404 		 * of more than two normal pages. This is because we use
405 		 * page[2].private for storing cgroup details.
406 		 */
407 		if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
408 			__hugetlb_cgroup_file_init(hstate_index(h));
409 	}
410 }
411 
412 /*
413  * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
414  * when we migrate hugepages
415  */
hugetlb_cgroup_migrate(struct page * oldhpage,struct page * newhpage)416 void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
417 {
418 	struct hugetlb_cgroup *h_cg;
419 	struct hstate *h = page_hstate(oldhpage);
420 
421 	if (hugetlb_cgroup_disabled())
422 		return;
423 
424 	VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
425 	spin_lock(&hugetlb_lock);
426 	h_cg = hugetlb_cgroup_from_page(oldhpage);
427 	set_hugetlb_cgroup(oldhpage, NULL);
428 
429 	/* move the h_cg details to new cgroup */
430 	set_hugetlb_cgroup(newhpage, h_cg);
431 	list_move(&newhpage->lru, &h->hugepage_activelist);
432 	spin_unlock(&hugetlb_lock);
433 	return;
434 }
435 
436 struct cgroup_subsys hugetlb_cgrp_subsys = {
437 	.css_alloc	= hugetlb_cgroup_css_alloc,
438 	.css_offline	= hugetlb_cgroup_css_offline,
439 	.css_free	= hugetlb_cgroup_css_free,
440 };
441