xref: /aosp_15_r20/system/core/libprocessgroup/setup/cgroup_map_write.cpp (revision 00c7fec1bb09f3284aad6a6f96d2f63dfc3650ad)
1 /*
2  * Copyright (C) 2019 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 //#define LOG_NDEBUG 0
18 #define LOG_TAG "libprocessgroup"
19 
20 #include <dirent.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <grp.h>
24 #include <pwd.h>
25 #include <sys/mount.h>
26 #include <sys/stat.h>
27 #include <sys/types.h>
28 #include <unistd.h>
29 
30 #include <optional>
31 
32 #include <android-base/file.h>
33 #include <android-base/logging.h>
34 #include <processgroup/cgroup_descriptor.h>
35 #include <processgroup/processgroup.h>
36 #include <processgroup/setup.h>
37 #include <processgroup/util.h>
38 
39 #include "../build_flags.h"
40 #include "../internal.h"
41 
// Locations of the cgroup description files.
// NOTE(review): none of these constants is referenced in the visible part of this file;
// presumably they are (or were) consumed by the descriptor loader - confirm before removing.
static constexpr const char* CGROUPS_DESC_FILE = "/etc/cgroups.json";
static constexpr const char* CGROUPS_DESC_VENDOR_FILE = "/vendor/etc/cgroups.json";

// printf-style template containing a single %u placeholder.
// NOTE(review): the placeholder is presumably an API level - confirm against the user.
static constexpr const char* TEMPLATE_CGROUPS_DESC_API_FILE =
        "/etc/task_profiles/cgroups_%u.json";
46 
ChangeDirModeAndOwner(const std::string & path,mode_t mode,const std::string & uid,const std::string & gid,bool permissive_mode=false)47 static bool ChangeDirModeAndOwner(const std::string& path, mode_t mode, const std::string& uid,
48                                   const std::string& gid, bool permissive_mode = false) {
49     uid_t pw_uid = -1;
50     gid_t gr_gid = -1;
51 
52     if (!uid.empty()) {
53         passwd* uid_pwd = getpwnam(uid.c_str());
54         if (!uid_pwd) {
55             PLOG(ERROR) << "Unable to decode UID for '" << uid << "'";
56             return false;
57         }
58 
59         pw_uid = uid_pwd->pw_uid;
60         gr_gid = -1;
61 
62         if (!gid.empty()) {
63             group* gid_pwd = getgrnam(gid.c_str());
64             if (!gid_pwd) {
65                 PLOG(ERROR) << "Unable to decode GID for '" << gid << "'";
66                 return false;
67             }
68             gr_gid = gid_pwd->gr_gid;
69         }
70     }
71 
72     auto dir = std::unique_ptr<DIR, decltype(&closedir)>(opendir(path.c_str()), closedir);
73 
74     if (dir == NULL) {
75         PLOG(ERROR) << "opendir failed for " << path;
76         return false;
77     }
78 
79     struct dirent* dir_entry;
80     while ((dir_entry = readdir(dir.get()))) {
81         if (!strcmp("..", dir_entry->d_name)) {
82             continue;
83         }
84 
85         std::string file_path = path + "/" + dir_entry->d_name;
86 
87         if (pw_uid != -1 && lchown(file_path.c_str(), pw_uid, gr_gid) < 0) {
88             PLOG(ERROR) << "lchown() failed for " << file_path;
89             return false;
90         }
91 
92         if (fchmodat(AT_FDCWD, file_path.c_str(), mode, AT_SYMLINK_NOFOLLOW) != 0 &&
93             (errno != EROFS || !permissive_mode)) {
94             PLOG(ERROR) << "fchmodat() failed for " << path;
95             return false;
96         }
97     }
98 
99     return true;
100 }
101 
Mkdir(const std::string & path,mode_t mode,const std::string & uid,const std::string & gid)102 static bool Mkdir(const std::string& path, mode_t mode, const std::string& uid,
103                   const std::string& gid) {
104     bool permissive_mode = false;
105 
106     if (mode == 0) {
107         /* Allow chmod to fail */
108         permissive_mode = true;
109         mode = 0755;
110     }
111 
112     if (mkdir(path.c_str(), mode) != 0) {
113         // /acct is a special case when the directory already exists
114         if (errno != EEXIST) {
115             PLOG(ERROR) << "mkdir() failed for " << path;
116             return false;
117         } else {
118             permissive_mode = true;
119         }
120     }
121 
122     if (uid.empty() && permissive_mode) {
123         return true;
124     }
125 
126     if (!ChangeDirModeAndOwner(path, mode, uid, gid, permissive_mode)) {
127         PLOG(ERROR) << "change of ownership or mode failed for " << path;
128         return false;
129     }
130 
131     return true;
132 }
133 
134 // To avoid issues in sdk_mac build
135 #if defined(__ANDROID__)
136 
IsOptionalController(const CgroupController * controller)137 static bool IsOptionalController(const CgroupController* controller) {
138     return controller->flags() & CGROUPRC_CONTROLLER_FLAG_OPTIONAL;
139 }
140 
MountV2CgroupController(const CgroupDescriptor & descriptor)141 static bool MountV2CgroupController(const CgroupDescriptor& descriptor) {
142     const CgroupController* controller = descriptor.controller();
143 
144     // /sys/fs/cgroup is created by cgroup2 with specific selinux permissions,
145     // try to create again in case the mount point is changed
146     if (!Mkdir(controller->path(), 0, "", "")) {
147         LOG(ERROR) << "Failed to create directory for " << controller->name() << " cgroup";
148         return false;
149     }
150 
151     // The memory_recursiveprot mount option has been introduced by kernel commit
152     // 8a931f801340 ("mm: memcontrol: recursive memory.low protection"; v5.7). Try first to
153     // mount with that option enabled. If mounting fails because the kernel is too old,
154     // retry without that mount option.
155     if (mount("none", controller->path(), "cgroup2", MS_NODEV | MS_NOEXEC | MS_NOSUID,
156               "memory_recursiveprot") < 0) {
157         LOG(INFO) << "Mounting memcg with memory_recursiveprot failed. Retrying without.";
158         if (mount("none", controller->path(), "cgroup2", MS_NODEV | MS_NOEXEC | MS_NOSUID,
159                   nullptr) < 0) {
160             PLOG(ERROR) << "Failed to mount cgroup v2";
161             return IsOptionalController(controller);
162         }
163     }
164 
165     // selinux permissions change after mounting, so it's ok to change mode and owner now
166     if (!ChangeDirModeAndOwner(controller->path(), descriptor.mode(), descriptor.uid(),
167                                descriptor.gid())) {
168         PLOG(ERROR) << "Change of ownership or mode failed for controller " << controller->name();
169         return IsOptionalController(controller);
170     }
171 
172     return true;
173 }
174 
ActivateV2CgroupController(const CgroupDescriptor & descriptor)175 static bool ActivateV2CgroupController(const CgroupDescriptor& descriptor) {
176     const CgroupController* controller = descriptor.controller();
177 
178     if (!Mkdir(controller->path(), descriptor.mode(), descriptor.uid(), descriptor.gid())) {
179         LOG(ERROR) << "Failed to create directory for " << controller->name() << " cgroup";
180         return false;
181     }
182 
183     return ::ActivateControllers(controller->path(), {{controller->name(), descriptor}});
184 }
185 
MountV1CgroupController(const CgroupDescriptor & descriptor)186 static bool MountV1CgroupController(const CgroupDescriptor& descriptor) {
187     const CgroupController* controller = descriptor.controller();
188 
189     // mkdir <path> [mode] [owner] [group]
190     if (!Mkdir(controller->path(), descriptor.mode(), descriptor.uid(), descriptor.gid())) {
191         LOG(ERROR) << "Failed to create directory for " << controller->name() << " cgroup";
192         return false;
193     }
194 
195     // Unfortunately historically cpuset controller was mounted using a mount command
196     // different from all other controllers. This results in controller attributes not
197     // to be prepended with controller name. For example this way instead of
198     // /dev/cpuset/cpuset.cpus the attribute becomes /dev/cpuset/cpus which is what
199     // the system currently expects.
200     int res;
201     if (!strcmp(controller->name(), "cpuset")) {
202         // mount cpuset none /dev/cpuset nodev noexec nosuid
203         res = mount("none", controller->path(), controller->name(),
204                     MS_NODEV | MS_NOEXEC | MS_NOSUID, nullptr);
205     } else {
206         // mount cgroup none <path> nodev noexec nosuid <controller>
207         res = mount("none", controller->path(), "cgroup", MS_NODEV | MS_NOEXEC | MS_NOSUID,
208                     controller->name());
209     }
210     if (res != 0) {
211         if (IsOptionalController(controller)) {
212             PLOG(INFO) << "Failed to mount optional controller " << controller->name();
213             return true;
214         }
215         PLOG(ERROR) << "Failed to mount controller " << controller->name();
216         return false;
217     }
218     return true;
219 }
220 
SetupCgroup(const CgroupDescriptor & descriptor)221 static bool SetupCgroup(const CgroupDescriptor& descriptor) {
222     const CgroupController* controller = descriptor.controller();
223 
224     if (controller->version() == 2) {
225         if (controller->name() == CGROUPV2_HIERARCHY_NAME) {
226             return MountV2CgroupController(descriptor);
227         } else {
228             return ActivateV2CgroupController(descriptor);
229         }
230     } else {
231         return MountV1CgroupController(descriptor);
232     }
233 }
234 
235 #else
236 
237 // Stubs for non-Android targets.
// Stand-in used when __ANDROID__ is not defined: no cgroup is set up and failure is always
// reported to the caller.
static bool SetupCgroup([[maybe_unused]] const CgroupDescriptor& descriptor) {
    return false;
}
241 
242 #endif
243 
CgroupDescriptor(uint32_t version,const std::string & name,const std::string & path,mode_t mode,const std::string & uid,const std::string & gid,uint32_t flags,uint32_t max_activation_depth)244 CgroupDescriptor::CgroupDescriptor(uint32_t version, const std::string& name,
245                                    const std::string& path, mode_t mode, const std::string& uid,
246                                    const std::string& gid, uint32_t flags,
247                                    uint32_t max_activation_depth)
248     : controller_(version, flags, name, path, max_activation_depth),
249       mode_(mode),
250       uid_(uid),
251       gid_(gid) {}
252 
set_mounted(bool mounted)253 void CgroupDescriptor::set_mounted(bool mounted) {
254     uint32_t flags = controller_.flags();
255     if (mounted) {
256         flags |= CGROUPRC_CONTROLLER_FLAG_MOUNTED;
257     } else {
258         flags &= ~CGROUPRC_CONTROLLER_FLAG_MOUNTED;
259     }
260     controller_.set_flags(flags);
261 }
262 
MGLRUDisabled()263 static std::optional<bool> MGLRUDisabled() {
264     const std::string file_name = "/sys/kernel/mm/lru_gen/enabled";
265     std::string content;
266     if (!android::base::ReadFileToString(file_name, &content)) {
267         PLOG(ERROR) << "Failed to read MGLRU state from " << file_name;
268         return {};
269     }
270 
271     return content == "0x0000";
272 }
273 
MEMCGDisabled(const CgroupDescriptorMap & descriptors)274 static std::optional<bool> MEMCGDisabled(const CgroupDescriptorMap& descriptors) {
275     std::string cgroup_v2_root = CGROUP_V2_ROOT_DEFAULT;
276     const auto it = descriptors.find(CGROUPV2_HIERARCHY_NAME);
277     if (it == descriptors.end()) {
278         LOG(WARNING) << "No Cgroups2 path found in cgroups.json. Vendor has modified Android, and "
279                      << "kernel memory use will be higher than intended.";
280     } else if (it->second.controller()->path() != cgroup_v2_root) {
281         cgroup_v2_root = it->second.controller()->path();
282     }
283 
284     const std::string file_name = cgroup_v2_root + "/cgroup.controllers";
285     std::string content;
286     if (!android::base::ReadFileToString(file_name, &content)) {
287         PLOG(ERROR) << "Failed to read cgroup controllers from " << file_name;
288         return {};
289     }
290 
291     // If we've forced memcg to v2 and it's not available, then it could only have been disabled
292     // on the kernel command line (GKI sets CONFIG_MEMCG).
293     return content.find("memory") == std::string::npos;
294 }
295 
CreateV2SubHierarchy(const std::string & path,const CgroupDescriptorMap & descriptors)296 static bool CreateV2SubHierarchy(const std::string& path, const CgroupDescriptorMap& descriptors) {
297     const auto cgv2_iter = descriptors.find(CGROUPV2_HIERARCHY_NAME);
298     if (cgv2_iter == descriptors.end()) return false;
299     const CgroupDescriptor cgv2_descriptor = cgv2_iter->second;
300 
301     if (!Mkdir(path, cgv2_descriptor.mode(), cgv2_descriptor.uid(), cgv2_descriptor.gid())) {
302         PLOG(ERROR) << "Failed to create directory for " << path;
303         return false;
304     }
305 
306     // Activate all v2 controllers in path so they can be activated in
307     // children as they are created.
308     return ::ActivateControllers(path, descriptors);
309 }
310 
CgroupSetup()311 bool CgroupSetup() {
312     CgroupDescriptorMap descriptors;
313 
314     if (getpid() != 1) {
315         LOG(ERROR) << "Cgroup setup can be done only by init process";
316         return false;
317     }
318 
319     // load cgroups.json file
320     if (!ReadDescriptors(&descriptors)) {
321         LOG(ERROR) << "Failed to load cgroup description file";
322         return false;
323     }
324 
325     // setup cgroups
326     for (auto& [name, descriptor] : descriptors) {
327         if (descriptor.controller()->flags() & CGROUPRC_CONTROLLER_FLAG_MOUNTED) {
328             LOG(WARNING) << "Attempt to call CgroupSetup() more than once";
329             return true;
330         }
331 
332         if (!SetupCgroup(descriptor)) {
333             // issue a warning and proceed with the next cgroup
334             LOG(WARNING) << "Failed to setup " << name << " cgroup";
335         }
336     }
337 
338     if (android::libprocessgroup_flags::force_memcg_v2()) {
339         if (MGLRUDisabled().value_or(false)) {
340             LOG(WARNING) << "Memcg forced to v2 hierarchy with MGLRU disabled! "
341                          << "Global reclaim performance will suffer.";
342         }
343         if (MEMCGDisabled(descriptors).value_or(false)) {
344             LOG(WARNING) << "Memcg forced to v2 hierarchy while memcg is disabled by kernel "
345                          << "command line!";
346         }
347     }
348 
349     // System / app isolation.
350     // This really belongs in early-init in init.rc, but we cannot use the flag there.
351     if (android::libprocessgroup_flags::cgroup_v2_sys_app_isolation()) {
352         const auto it = descriptors.find(CGROUPV2_HIERARCHY_NAME);
353         const std::string cgroup_v2_root = (it == descriptors.end())
354                                                    ? CGROUP_V2_ROOT_DEFAULT
355                                                    : it->second.controller()->path();
356 
357         LOG(INFO) << "Using system/app isolation under: " << cgroup_v2_root;
358         if (!CreateV2SubHierarchy(cgroup_v2_root + "/apps", descriptors) ||
359             !CreateV2SubHierarchy(cgroup_v2_root + "/system", descriptors)) {
360             return false;
361         }
362     }
363 
364     return true;
365 }
366