1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <[email protected]>
5 */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_bit.h"
14 #include "xfs_log_format.h"
15 #include "xfs_trans.h"
16 #include "xfs_sb.h"
17 #include "xfs_inode.h"
18 #include "xfs_icache.h"
19 #include "xfs_da_format.h"
20 #include "xfs_da_btree.h"
21 #include "xfs_dir2.h"
22 #include "xfs_dir2_priv.h"
23 #include "xfs_bmap.h"
24 #include "xfs_quota.h"
25 #include "xfs_bmap_btree.h"
26 #include "xfs_trans_space.h"
27 #include "xfs_bmap_util.h"
28 #include "xfs_exchmaps.h"
29 #include "xfs_exchrange.h"
30 #include "xfs_ag.h"
31 #include "xfs_parent.h"
32 #include "scrub/xfs_scrub.h"
33 #include "scrub/scrub.h"
34 #include "scrub/common.h"
35 #include "scrub/trace.h"
36 #include "scrub/repair.h"
37 #include "scrub/tempfile.h"
38 #include "scrub/tempexch.h"
39 #include "scrub/xfile.h"
40 #include "scrub/xfarray.h"
41 #include "scrub/xfblob.h"
42 #include "scrub/iscan.h"
43 #include "scrub/readdir.h"
44 #include "scrub/reap.h"
45 #include "scrub/findparent.h"
46 #include "scrub/orphanage.h"
47 #include "scrub/listxattr.h"
48
49 /*
50 * Directory Repair
51 * ================
52 *
53 * We repair directories by reading the directory data blocks looking for
54 * directory entries that look salvageable (name passes verifiers, entry points
55 * to a valid allocated inode, etc). Each entry worth salvaging is stashed in
56 * memory, and the stashed entries are periodically replayed into a temporary
57 * directory to constrain memory use. Batching the construction of the
58 * temporary directory in this fashion reduces lock cycling of the directory
59 * being repaired and the temporary directory, and will later become important
60 * for parent pointer scanning.
61 *
62 * If parent pointers are enabled on this filesystem, we instead reconstruct
63 * the directory by visiting each parent pointer of each file in the filesystem
64 * and translating the relevant parent pointer records into dirents. In this
65 * case, it is advantageous to stash all directory entries created from parent
66 * pointers for a single child file before replaying them into the temporary
67 * directory. To save memory, the live filesystem scan reuses the findparent
68 * fields. Directory repair chooses either parent pointer scanning or
69 * directory entry salvaging, but not both.
70 *
71 * Directory entries added to the temporary directory do not elevate the link
72 * counts of the inodes found. When salvaging completes, the remaining stashed
73 * entries are replayed to the temporary directory. An atomic mapping exchange
74 * is used to commit the new directory blocks to the directory being repaired.
75 * This will disrupt readdir cursors.
76 *
77 * Locking Issues
78 * --------------
79 *
80 * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on
81 * /a/b for a "mv /a/b /c/" operation. This means that only b's ILOCK protects
82 * b's dotdot update. This is in contrast to every other dotdot update (link,
83 * remove, mkdir). If the repair code drops the ILOCK, it must either
84 * revalidate the dotdot entry or use dirent hooks to capture updates from
85 * other threads.
86 */
87
88 /* Create a dirent in the tempdir. */
89 #define XREP_DIRENT_ADD (1)
90
91 /* Remove a dirent from the tempdir. */
92 #define XREP_DIRENT_REMOVE (2)
93
94 /* Directory entry to be restored in the new directory. */
95 struct xrep_dirent {
96 /* Cookie for retrieval of the dirent name. */
97 xfblob_cookie name_cookie;
98
99 /* Target inode number. */
100 xfs_ino_t ino;
101
102 /* Length of the dirent name. */
103 uint8_t namelen;
104
105 /* File type of the dirent. */
106 uint8_t ftype;
107
108 /* XREP_DIRENT_{ADD,REMOVE} */
109 uint8_t action;
110 };
111
/*
 * Upper bound (8 pages) on the recovered dirent data held in dir_entries and
 * dir_names before it gets written out to the temp dir.
 */
#define XREP_DIR_MAX_STASH_BYTES	(8 * PAGE_SIZE)
117
118 struct xrep_dir {
119 struct xfs_scrub *sc;
120
121 /* Fixed-size array of xrep_dirent structures. */
122 struct xfarray *dir_entries;
123
124 /* Blobs containing directory entry names. */
125 struct xfblob *dir_names;
126
127 /* Information for exchanging data forks at the end. */
128 struct xrep_tempexch tx;
129
130 /* Preallocated args struct for performing dir operations */
131 struct xfs_da_args args;
132
133 /*
134 * Information used to scan the filesystem to find the inumber of the
135 * dotdot entry for this directory. For directory salvaging when
136 * parent pointers are not enabled, we use the findparent_* functions
137 * on this object and access only the parent_ino field directly.
138 *
139 * When parent pointers are enabled, however, the pptr scanner uses the
140 * iscan, hooks, lock, and parent_ino fields of this object directly.
141 * @pscan.lock coordinates access to dir_entries, dir_names,
142 * parent_ino, subdirs, dirents, and args. This reduces the memory
143 * requirements of this structure.
144 */
145 struct xrep_parent_scan_info pscan;
146
147 /*
148 * Context information for attaching this directory to the lost+found
149 * if this directory does not have a parent.
150 */
151 struct xrep_adoption adoption;
152
153 /* How many subdirectories did we find? */
154 uint64_t subdirs;
155
156 /* How many dirents did we find? */
157 unsigned int dirents;
158
159 /* Should we move this directory to the orphanage? */
160 bool needs_adoption;
161
162 /* Directory entry name, plus the trailing null. */
163 struct xfs_name xname;
164 unsigned char namebuf[MAXNAMELEN];
165 };
166
167 /* Tear down all the incore stuff we created. */
168 static void
xrep_dir_teardown(struct xfs_scrub * sc)169 xrep_dir_teardown(
170 struct xfs_scrub *sc)
171 {
172 struct xrep_dir *rd = sc->buf;
173
174 xrep_findparent_scan_teardown(&rd->pscan);
175 xfblob_destroy(rd->dir_names);
176 xfarray_destroy(rd->dir_entries);
177 }
178
179 /* Set up for a directory repair. */
180 int
xrep_setup_directory(struct xfs_scrub * sc)181 xrep_setup_directory(
182 struct xfs_scrub *sc)
183 {
184 struct xrep_dir *rd;
185 int error;
186
187 xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
188
189 error = xrep_orphanage_try_create(sc);
190 if (error)
191 return error;
192
193 error = xrep_tempfile_create(sc, S_IFDIR);
194 if (error)
195 return error;
196
197 rd = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS);
198 if (!rd)
199 return -ENOMEM;
200 rd->sc = sc;
201 rd->xname.name = rd->namebuf;
202 sc->buf = rd;
203
204 return 0;
205 }
206
207 /*
208 * Look up the dotdot entry and confirm that it's really the parent.
209 * Returns NULLFSINO if we don't know what to do.
210 */
211 static inline xfs_ino_t
xrep_dir_lookup_parent(struct xrep_dir * rd)212 xrep_dir_lookup_parent(
213 struct xrep_dir *rd)
214 {
215 struct xfs_scrub *sc = rd->sc;
216 xfs_ino_t ino;
217 int error;
218
219 error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL);
220 if (error)
221 return NULLFSINO;
222 if (!xfs_verify_dir_ino(sc->mp, ino))
223 return NULLFSINO;
224
225 error = xrep_findparent_confirm(sc, &ino);
226 if (error)
227 return NULLFSINO;
228
229 return ino;
230 }
231
232 /*
233 * Look up '..' in the dentry cache and confirm that it's really the parent.
234 * Returns NULLFSINO if the dcache misses or if the hit is implausible.
235 */
236 static inline xfs_ino_t
xrep_dir_dcache_parent(struct xrep_dir * rd)237 xrep_dir_dcache_parent(
238 struct xrep_dir *rd)
239 {
240 struct xfs_scrub *sc = rd->sc;
241 xfs_ino_t parent_ino;
242 int error;
243
244 parent_ino = xrep_findparent_from_dcache(sc);
245 if (parent_ino == NULLFSINO)
246 return parent_ino;
247
248 error = xrep_findparent_confirm(sc, &parent_ino);
249 if (error)
250 return NULLFSINO;
251
252 return parent_ino;
253 }
254
255 /* Try to find the parent of the directory being repaired. */
256 STATIC int
xrep_dir_find_parent(struct xrep_dir * rd)257 xrep_dir_find_parent(
258 struct xrep_dir *rd)
259 {
260 xfs_ino_t ino;
261
262 ino = xrep_findparent_self_reference(rd->sc);
263 if (ino != NULLFSINO) {
264 xrep_findparent_scan_finish_early(&rd->pscan, ino);
265 return 0;
266 }
267
268 ino = xrep_dir_dcache_parent(rd);
269 if (ino != NULLFSINO) {
270 xrep_findparent_scan_finish_early(&rd->pscan, ino);
271 return 0;
272 }
273
274 ino = xrep_dir_lookup_parent(rd);
275 if (ino != NULLFSINO) {
276 xrep_findparent_scan_finish_early(&rd->pscan, ino);
277 return 0;
278 }
279
280 /*
281 * A full filesystem scan is the last resort. On a busy filesystem,
282 * the scan can fail with -EBUSY if we cannot grab IOLOCKs. That means
283 * that we don't know what who the parent is, so we should return to
284 * userspace.
285 */
286 return xrep_findparent_scan(&rd->pscan);
287 }
288
289 /*
290 * Decide if we want to salvage this entry. We don't bother with oversized
291 * names or the dot entry.
292 */
293 STATIC int
xrep_dir_want_salvage(struct xrep_dir * rd,const char * name,int namelen,xfs_ino_t ino)294 xrep_dir_want_salvage(
295 struct xrep_dir *rd,
296 const char *name,
297 int namelen,
298 xfs_ino_t ino)
299 {
300 struct xfs_mount *mp = rd->sc->mp;
301
302 /* No pointers to ourselves or to garbage. */
303 if (ino == rd->sc->ip->i_ino)
304 return false;
305 if (!xfs_verify_dir_ino(mp, ino))
306 return false;
307
308 /* No weird looking names or dot entries. */
309 if (namelen >= MAXNAMELEN || namelen <= 0)
310 return false;
311 if (namelen == 1 && name[0] == '.')
312 return false;
313 if (!xfs_dir2_namecheck(name, namelen))
314 return false;
315
316 return true;
317 }
318
319 /*
320 * Remember that we want to create a dirent in the tempdir. These stashed
321 * actions will be replayed later.
322 */
323 STATIC int
xrep_dir_stash_createname(struct xrep_dir * rd,const struct xfs_name * name,xfs_ino_t ino)324 xrep_dir_stash_createname(
325 struct xrep_dir *rd,
326 const struct xfs_name *name,
327 xfs_ino_t ino)
328 {
329 struct xrep_dirent dirent = {
330 .action = XREP_DIRENT_ADD,
331 .ino = ino,
332 .namelen = name->len,
333 .ftype = name->type,
334 };
335 int error;
336
337 trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino);
338
339 error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
340 if (error)
341 return error;
342
343 return xfarray_append(rd->dir_entries, &dirent);
344 }
345
346 /*
347 * Remember that we want to remove a dirent from the tempdir. These stashed
348 * actions will be replayed later.
349 */
350 STATIC int
xrep_dir_stash_removename(struct xrep_dir * rd,const struct xfs_name * name,xfs_ino_t ino)351 xrep_dir_stash_removename(
352 struct xrep_dir *rd,
353 const struct xfs_name *name,
354 xfs_ino_t ino)
355 {
356 struct xrep_dirent dirent = {
357 .action = XREP_DIRENT_REMOVE,
358 .ino = ino,
359 .namelen = name->len,
360 .ftype = name->type,
361 };
362 int error;
363
364 trace_xrep_dir_stash_removename(rd->sc->tempip, name, ino);
365
366 error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
367 if (error)
368 return error;
369
370 return xfarray_append(rd->dir_entries, &dirent);
371 }
372
373 /* Allocate an in-core record to hold entries while we rebuild the dir data. */
374 STATIC int
xrep_dir_salvage_entry(struct xrep_dir * rd,unsigned char * name,unsigned int namelen,xfs_ino_t ino)375 xrep_dir_salvage_entry(
376 struct xrep_dir *rd,
377 unsigned char *name,
378 unsigned int namelen,
379 xfs_ino_t ino)
380 {
381 struct xfs_name xname = {
382 .name = name,
383 };
384 struct xfs_scrub *sc = rd->sc;
385 struct xfs_inode *ip;
386 unsigned int i = 0;
387 int error = 0;
388
389 if (xchk_should_terminate(sc, &error))
390 return error;
391
392 /*
393 * Truncate the name to the first character that would trip namecheck.
394 * If we no longer have a name after that, ignore this entry.
395 */
396 while (i < namelen && name[i] != 0 && name[i] != '/')
397 i++;
398 if (i == 0)
399 return 0;
400 xname.len = i;
401
402 /* Ignore '..' entries; we already picked the new parent. */
403 if (xname.len == 2 && name[0] == '.' && name[1] == '.') {
404 trace_xrep_dir_salvaged_parent(sc->ip, ino);
405 return 0;
406 }
407
408 trace_xrep_dir_salvage_entry(sc->ip, &xname, ino);
409
410 /*
411 * Compute the ftype or dump the entry if we can't. We don't lock the
412 * inode because inodes can't change type while we have a reference.
413 */
414 error = xchk_iget(sc, ino, &ip);
415 if (error)
416 return 0;
417
418 /* Don't mix metadata and regular directory trees. */
419 if (xfs_is_metadir_inode(ip) != xfs_is_metadir_inode(rd->sc->ip)) {
420 xchk_irele(sc, ip);
421 return 0;
422 }
423
424 xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
425 xchk_irele(sc, ip);
426
427 return xrep_dir_stash_createname(rd, &xname, ino);
428 }
429
430 /* Record a shortform directory entry for later reinsertion. */
431 STATIC int
xrep_dir_salvage_sf_entry(struct xrep_dir * rd,struct xfs_dir2_sf_hdr * sfp,struct xfs_dir2_sf_entry * sfep)432 xrep_dir_salvage_sf_entry(
433 struct xrep_dir *rd,
434 struct xfs_dir2_sf_hdr *sfp,
435 struct xfs_dir2_sf_entry *sfep)
436 {
437 xfs_ino_t ino;
438
439 ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep);
440 if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino))
441 return 0;
442
443 return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino);
444 }
445
446 /* Record a regular directory entry for later reinsertion. */
447 STATIC int
xrep_dir_salvage_data_entry(struct xrep_dir * rd,struct xfs_dir2_data_entry * dep)448 xrep_dir_salvage_data_entry(
449 struct xrep_dir *rd,
450 struct xfs_dir2_data_entry *dep)
451 {
452 xfs_ino_t ino;
453
454 ino = be64_to_cpu(dep->inumber);
455 if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino))
456 return 0;
457
458 return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino);
459 }
460
461 /* Try to recover block/data format directory entries. */
462 STATIC int
xrep_dir_recover_data(struct xrep_dir * rd,struct xfs_buf * bp)463 xrep_dir_recover_data(
464 struct xrep_dir *rd,
465 struct xfs_buf *bp)
466 {
467 struct xfs_da_geometry *geo = rd->sc->mp->m_dir_geo;
468 unsigned int offset;
469 unsigned int end;
470 int error = 0;
471
472 /*
473 * Loop over the data portion of the block.
474 * Each object is a real entry (dep) or an unused one (dup).
475 */
476 offset = geo->data_entry_offset;
477 end = min_t(unsigned int, BBTOB(bp->b_length),
478 xfs_dir3_data_end_offset(geo, bp->b_addr));
479
480 while (offset < end) {
481 struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
482 struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
483
484 if (xchk_should_terminate(rd->sc, &error))
485 return error;
486
487 /* Skip unused entries. */
488 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
489 offset += be16_to_cpu(dup->length);
490 continue;
491 }
492
493 /* Don't walk off the end of the block. */
494 offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen);
495 if (offset > end)
496 break;
497
498 /* Ok, let's save this entry. */
499 error = xrep_dir_salvage_data_entry(rd, dep);
500 if (error)
501 return error;
502
503 }
504
505 return 0;
506 }
507
508 /* Try to recover shortform directory entries. */
509 STATIC int
xrep_dir_recover_sf(struct xrep_dir * rd)510 xrep_dir_recover_sf(
511 struct xrep_dir *rd)
512 {
513 struct xfs_dir2_sf_hdr *hdr;
514 struct xfs_dir2_sf_entry *sfep;
515 struct xfs_dir2_sf_entry *next;
516 struct xfs_ifork *ifp;
517 xfs_ino_t ino;
518 unsigned char *end;
519 int error = 0;
520
521 ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK);
522 hdr = ifp->if_data;
523 end = (unsigned char *)ifp->if_data + ifp->if_bytes;
524
525 ino = xfs_dir2_sf_get_parent_ino(hdr);
526 trace_xrep_dir_salvaged_parent(rd->sc->ip, ino);
527
528 sfep = xfs_dir2_sf_firstentry(hdr);
529 while ((unsigned char *)sfep < end) {
530 if (xchk_should_terminate(rd->sc, &error))
531 return error;
532
533 next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep);
534 if ((unsigned char *)next > end)
535 break;
536
537 /* Ok, let's save this entry. */
538 error = xrep_dir_salvage_sf_entry(rd, hdr, sfep);
539 if (error)
540 return error;
541
542 sfep = next;
543 }
544
545 return 0;
546 }
547
548 /*
549 * Try to figure out the format of this directory from the data fork mappings
550 * and the directory size. If we can be reasonably sure of format, we can be
551 * more aggressive in salvaging directory entries. On return, @magic_guess
552 * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format"
553 * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory,
554 * and 0 if we can't tell.
555 */
556 STATIC void
xrep_dir_guess_format(struct xrep_dir * rd,__be32 * magic_guess)557 xrep_dir_guess_format(
558 struct xrep_dir *rd,
559 __be32 *magic_guess)
560 {
561 struct xfs_inode *dp = rd->sc->ip;
562 struct xfs_mount *mp = rd->sc->mp;
563 struct xfs_da_geometry *geo = mp->m_dir_geo;
564 xfs_fileoff_t last;
565 int error;
566
567 ASSERT(xfs_has_crc(mp));
568
569 *magic_guess = 0;
570
571 /*
572 * If there's a single directory block and the directory size is
573 * exactly one block, this has to be a single block format directory.
574 */
575 error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK);
576 if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize &&
577 dp->i_disk_size == geo->blksize) {
578 *magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
579 return;
580 }
581
582 /*
583 * If the last extent before the leaf offset matches the directory
584 * size and the directory size is larger than 1 block, this is a
585 * data format directory.
586 */
587 last = geo->leafblk;
588 error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK);
589 if (!error &&
590 XFS_FSB_TO_B(mp, last) > geo->blksize &&
591 XFS_FSB_TO_B(mp, last) == dp->i_disk_size) {
592 *magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
593 return;
594 }
595 }
596
597 /* Recover directory entries from a specific directory block. */
598 STATIC int
xrep_dir_recover_dirblock(struct xrep_dir * rd,__be32 magic_guess,xfs_dablk_t dabno)599 xrep_dir_recover_dirblock(
600 struct xrep_dir *rd,
601 __be32 magic_guess,
602 xfs_dablk_t dabno)
603 {
604 struct xfs_dir2_data_hdr *hdr;
605 struct xfs_buf *bp;
606 __be32 oldmagic;
607 int error;
608
609 /*
610 * Try to read buffer. We invalidate them in the next step so we don't
611 * bother to set a buffer type or ops.
612 */
613 error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno,
614 XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL);
615 if (error || !bp)
616 return error;
617
618 hdr = bp->b_addr;
619 oldmagic = hdr->magic;
620
621 trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno,
622 be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess));
623
624 /*
625 * If we're sure of the block's format, proceed with the salvage
626 * operation using the specified magic number.
627 */
628 if (magic_guess) {
629 hdr->magic = magic_guess;
630 goto recover;
631 }
632
633 /*
634 * If we couldn't guess what type of directory this is, then we will
635 * only salvage entries from directory blocks that match the magic
636 * number and pass verifiers.
637 */
638 switch (hdr->magic) {
639 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
640 case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
641 if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops))
642 goto out;
643 if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL)
644 goto out;
645 break;
646 case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
647 case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
648 if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops))
649 goto out;
650 if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL)
651 goto out;
652 break;
653 default:
654 goto out;
655 }
656
657 recover:
658 error = xrep_dir_recover_data(rd, bp);
659
660 out:
661 hdr->magic = oldmagic;
662 xfs_trans_brelse(rd->sc->tp, bp);
663 return error;
664 }
665
666 static inline void
xrep_dir_init_args(struct xrep_dir * rd,struct xfs_inode * dp,const struct xfs_name * name)667 xrep_dir_init_args(
668 struct xrep_dir *rd,
669 struct xfs_inode *dp,
670 const struct xfs_name *name)
671 {
672 memset(&rd->args, 0, sizeof(struct xfs_da_args));
673 rd->args.geo = rd->sc->mp->m_dir_geo;
674 rd->args.whichfork = XFS_DATA_FORK;
675 rd->args.owner = rd->sc->ip->i_ino;
676 rd->args.trans = rd->sc->tp;
677 rd->args.dp = dp;
678 if (!name)
679 return;
680 rd->args.name = name->name;
681 rd->args.namelen = name->len;
682 rd->args.filetype = name->type;
683 rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name);
684 }
685
686 /* Replay a stashed createname into the temporary directory. */
687 STATIC int
xrep_dir_replay_createname(struct xrep_dir * rd,const struct xfs_name * name,xfs_ino_t inum,xfs_extlen_t total)688 xrep_dir_replay_createname(
689 struct xrep_dir *rd,
690 const struct xfs_name *name,
691 xfs_ino_t inum,
692 xfs_extlen_t total)
693 {
694 struct xfs_scrub *sc = rd->sc;
695 struct xfs_inode *dp = rd->sc->tempip;
696 int error;
697
698 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
699
700 error = xfs_dir_ino_validate(sc->mp, inum);
701 if (error)
702 return error;
703
704 trace_xrep_dir_replay_createname(dp, name, inum);
705
706 xrep_dir_init_args(rd, dp, name);
707 rd->args.inumber = inum;
708 rd->args.total = total;
709 rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
710 return xfs_dir_createname_args(&rd->args);
711 }
712
713 /* Replay a stashed removename onto the temporary directory. */
714 STATIC int
xrep_dir_replay_removename(struct xrep_dir * rd,const struct xfs_name * name,xfs_extlen_t total)715 xrep_dir_replay_removename(
716 struct xrep_dir *rd,
717 const struct xfs_name *name,
718 xfs_extlen_t total)
719 {
720 struct xfs_inode *dp = rd->args.dp;
721
722 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
723
724 xrep_dir_init_args(rd, dp, name);
725 rd->args.op_flags = 0;
726 rd->args.total = total;
727
728 trace_xrep_dir_replay_removename(dp, name, 0);
729 return xfs_dir_removename_args(&rd->args);
730 }
731
732 /*
733 * Add this stashed incore directory entry to the temporary directory.
734 * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and
735 * must not be in transaction context.
736 */
737 STATIC int
xrep_dir_replay_update(struct xrep_dir * rd,const struct xfs_name * xname,const struct xrep_dirent * dirent)738 xrep_dir_replay_update(
739 struct xrep_dir *rd,
740 const struct xfs_name *xname,
741 const struct xrep_dirent *dirent)
742 {
743 struct xfs_mount *mp = rd->sc->mp;
744 #ifdef DEBUG
745 xfs_ino_t ino;
746 #endif
747 uint resblks;
748 int error;
749
750 resblks = xfs_link_space_res(mp, xname->len);
751 error = xchk_trans_alloc(rd->sc, resblks);
752 if (error)
753 return error;
754
755 /* Lock the temporary directory and join it to the transaction */
756 xrep_tempfile_ilock(rd->sc);
757 xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0);
758
759 switch (dirent->action) {
760 case XREP_DIRENT_ADD:
761 /*
762 * Create a replacement dirent in the temporary directory.
763 * Note that _createname doesn't check for existing entries.
764 * There shouldn't be any in the temporary dir, but we'll
765 * verify this in debug mode.
766 */
767 #ifdef DEBUG
768 error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
769 if (error != -ENOENT) {
770 ASSERT(error != -ENOENT);
771 goto out_cancel;
772 }
773 #endif
774
775 error = xrep_dir_replay_createname(rd, xname, dirent->ino,
776 resblks);
777 if (error)
778 goto out_cancel;
779
780 if (xname->type == XFS_DIR3_FT_DIR)
781 rd->subdirs++;
782 rd->dirents++;
783 break;
784 case XREP_DIRENT_REMOVE:
785 /*
786 * Remove a dirent from the temporary directory. Note that
787 * _removename doesn't check the inode target of the exist
788 * entry. There should be a perfect match in the temporary
789 * dir, but we'll verify this in debug mode.
790 */
791 #ifdef DEBUG
792 error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
793 if (error) {
794 ASSERT(error != 0);
795 goto out_cancel;
796 }
797 if (ino != dirent->ino) {
798 ASSERT(ino == dirent->ino);
799 error = -EIO;
800 goto out_cancel;
801 }
802 #endif
803
804 error = xrep_dir_replay_removename(rd, xname, resblks);
805 if (error)
806 goto out_cancel;
807
808 if (xname->type == XFS_DIR3_FT_DIR)
809 rd->subdirs--;
810 rd->dirents--;
811 break;
812 default:
813 ASSERT(0);
814 error = -EIO;
815 goto out_cancel;
816 }
817
818 /* Commit and unlock. */
819 error = xrep_trans_commit(rd->sc);
820 if (error)
821 return error;
822
823 xrep_tempfile_iunlock(rd->sc);
824 return 0;
825 out_cancel:
826 xchk_trans_cancel(rd->sc);
827 xrep_tempfile_iunlock(rd->sc);
828 return error;
829 }
830
831 /*
832 * Flush stashed incore dirent updates that have been recorded by the scanner.
833 * This is done to reduce the memory requirements of the directory rebuild,
834 * since directories can contain up to 32GB of directory data.
835 *
836 * Caller must not hold transactions or ILOCKs. Caller must hold the tempdir
837 * IOLOCK.
838 */
839 STATIC int
xrep_dir_replay_updates(struct xrep_dir * rd)840 xrep_dir_replay_updates(
841 struct xrep_dir *rd)
842 {
843 xfarray_idx_t array_cur;
844 int error;
845
846 /* Add all the salvaged dirents to the temporary directory. */
847 mutex_lock(&rd->pscan.lock);
848 foreach_xfarray_idx(rd->dir_entries, array_cur) {
849 struct xrep_dirent dirent;
850
851 error = xfarray_load(rd->dir_entries, array_cur, &dirent);
852 if (error)
853 goto out_unlock;
854
855 error = xfblob_loadname(rd->dir_names, dirent.name_cookie,
856 &rd->xname, dirent.namelen);
857 if (error)
858 goto out_unlock;
859 rd->xname.type = dirent.ftype;
860 mutex_unlock(&rd->pscan.lock);
861
862 error = xrep_dir_replay_update(rd, &rd->xname, &dirent);
863 if (error)
864 return error;
865 mutex_lock(&rd->pscan.lock);
866 }
867
868 /* Empty out both arrays now that we've added the entries. */
869 xfarray_truncate(rd->dir_entries);
870 xfblob_truncate(rd->dir_names);
871 mutex_unlock(&rd->pscan.lock);
872 return 0;
873 out_unlock:
874 mutex_unlock(&rd->pscan.lock);
875 return error;
876 }
877
878 /*
879 * Periodically flush stashed directory entries to the temporary dir. This
880 * is done to reduce the memory requirements of the directory rebuild, since
881 * directories can contain up to 32GB of directory data.
882 */
883 STATIC int
xrep_dir_flush_stashed(struct xrep_dir * rd)884 xrep_dir_flush_stashed(
885 struct xrep_dir *rd)
886 {
887 int error;
888
889 /*
890 * Entering this function, the scrub context has a reference to the
891 * inode being repaired, the temporary file, and a scrub transaction
892 * that we use during dirent salvaging to avoid livelocking if there
893 * are cycles in the directory structures. We hold ILOCK_EXCL on both
894 * the inode being repaired and the temporary file, though they are
895 * not ijoined to the scrub transaction.
896 *
897 * To constrain kernel memory use, we occasionally write salvaged
898 * dirents from the xfarray and xfblob structures into the temporary
899 * directory in preparation for exchanging the directory structures at
900 * the end. Updating the temporary file requires a transaction, so we
901 * commit the scrub transaction and drop the two ILOCKs so that
902 * we can allocate whatever transaction we want.
903 *
904 * We still hold IOLOCK_EXCL on the inode being repaired, which
905 * prevents anyone from accessing the damaged directory data while we
906 * repair it.
907 */
908 error = xrep_trans_commit(rd->sc);
909 if (error)
910 return error;
911 xchk_iunlock(rd->sc, XFS_ILOCK_EXCL);
912
913 /*
914 * Take the IOLOCK of the temporary file while we modify dirents. This
915 * isn't strictly required because the temporary file is never revealed
916 * to userspace, but we follow the same locking rules. We still hold
917 * sc->ip's IOLOCK.
918 */
919 error = xrep_tempfile_iolock_polled(rd->sc);
920 if (error)
921 return error;
922
923 /* Write to the tempdir all the updates that we've stashed. */
924 error = xrep_dir_replay_updates(rd);
925 xrep_tempfile_iounlock(rd->sc);
926 if (error)
927 return error;
928
929 /*
930 * Recreate the salvage transaction and relock the dir we're salvaging.
931 */
932 error = xchk_trans_alloc(rd->sc, 0);
933 if (error)
934 return error;
935 xchk_ilock(rd->sc, XFS_ILOCK_EXCL);
936 return 0;
937 }
938
939 /* Decide if we've stashed too much dirent data in memory. */
940 static inline bool
xrep_dir_want_flush_stashed(struct xrep_dir * rd)941 xrep_dir_want_flush_stashed(
942 struct xrep_dir *rd)
943 {
944 unsigned long long bytes;
945
946 bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names);
947 return bytes > XREP_DIR_MAX_STASH_BYTES;
948 }
949
950 /* Extract as many directory entries as we can. */
951 STATIC int
xrep_dir_recover(struct xrep_dir * rd)952 xrep_dir_recover(
953 struct xrep_dir *rd)
954 {
955 struct xfs_bmbt_irec got;
956 struct xfs_scrub *sc = rd->sc;
957 struct xfs_da_geometry *geo = sc->mp->m_dir_geo;
958 xfs_fileoff_t offset;
959 xfs_dablk_t dabno;
960 __be32 magic_guess;
961 int nmap;
962 int error;
963
964 xrep_dir_guess_format(rd, &magic_guess);
965
966 /* Iterate each directory data block in the data fork. */
967 for (offset = 0;
968 offset < geo->leafblk;
969 offset = got.br_startoff + got.br_blockcount) {
970 nmap = 1;
971 error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset,
972 &got, &nmap, 0);
973 if (error)
974 return error;
975 if (nmap != 1)
976 return -EFSCORRUPTED;
977 if (!xfs_bmap_is_written_extent(&got))
978 continue;
979
980 for (dabno = round_up(got.br_startoff, geo->fsbcount);
981 dabno < got.br_startoff + got.br_blockcount;
982 dabno += geo->fsbcount) {
983 if (xchk_should_terminate(rd->sc, &error))
984 return error;
985
986 error = xrep_dir_recover_dirblock(rd,
987 magic_guess, dabno);
988 if (error)
989 return error;
990
991 /* Flush dirents to constrain memory usage. */
992 if (xrep_dir_want_flush_stashed(rd)) {
993 error = xrep_dir_flush_stashed(rd);
994 if (error)
995 return error;
996 }
997 }
998 }
999
1000 return 0;
1001 }
1002
1003 /*
1004 * Find all the directory entries for this inode by scraping them out of the
1005 * directory leaf blocks by hand, and flushing them into the temp dir.
1006 */
1007 STATIC int
xrep_dir_find_entries(struct xrep_dir * rd)1008 xrep_dir_find_entries(
1009 struct xrep_dir *rd)
1010 {
1011 struct xfs_inode *dp = rd->sc->ip;
1012 int error;
1013
1014 /*
1015 * Salvage directory entries from the old directory, and write them to
1016 * the temporary directory.
1017 */
1018 if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
1019 error = xrep_dir_recover_sf(rd);
1020 } else {
1021 error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK);
1022 if (error)
1023 return error;
1024
1025 error = xrep_dir_recover(rd);
1026 }
1027 if (error)
1028 return error;
1029
1030 return xrep_dir_flush_stashed(rd);
1031 }
1032
1033 /* Scan all files in the filesystem for dirents. */
1034 STATIC int
xrep_dir_salvage_entries(struct xrep_dir * rd)1035 xrep_dir_salvage_entries(
1036 struct xrep_dir *rd)
1037 {
1038 struct xfs_scrub *sc = rd->sc;
1039 int error;
1040
1041 /*
1042 * Drop the ILOCK on this directory so that we can scan for this
1043 * directory's parent. Figure out who is going to be the parent of
1044 * this directory, then retake the ILOCK so that we can salvage
1045 * directory entries.
1046 */
1047 xchk_iunlock(sc, XFS_ILOCK_EXCL);
1048 error = xrep_dir_find_parent(rd);
1049 xchk_ilock(sc, XFS_ILOCK_EXCL);
1050 if (error)
1051 return error;
1052
1053 /*
1054 * Collect directory entries by parsing raw leaf blocks to salvage
1055 * whatever we can. When we're done, free the staging memory before
1056 * exchanging the directories to reduce memory usage.
1057 */
1058 error = xrep_dir_find_entries(rd);
1059 if (error)
1060 return error;
1061
1062 /*
1063 * Cancel the repair transaction and drop the ILOCK so that we can
1064 * (later) use the atomic mapping exchange functions to compute the
1065 * correct block reservations and re-lock the inodes.
1066 *
1067 * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory
1068 * modifications, but there's nothing to prevent userspace from reading
1069 * the directory until we're ready for the exchange operation. Reads
1070 * will return -EIO without shutting down the fs, so we're ok with
1071 * that.
1072 *
1073 * The VFS can change dotdot on us, but the findparent scan will keep
1074 * our incore parent inode up to date. See the note on locking issues
1075 * for more details.
1076 */
1077 error = xrep_trans_commit(sc);
1078 if (error)
1079 return error;
1080
1081 xchk_iunlock(sc, XFS_ILOCK_EXCL);
1082 return 0;
1083 }
1084
1085
1086 /*
1087 * Examine a parent pointer of a file. If it leads us back to the directory
1088 * that we're rebuilding, create an incore dirent from the parent pointer and
1089 * stash it.
1090 */
1091 STATIC int
xrep_dir_scan_pptr(struct xfs_scrub * sc,struct xfs_inode * ip,unsigned int attr_flags,const unsigned char * name,unsigned int namelen,const void * value,unsigned int valuelen,void * priv)1092 xrep_dir_scan_pptr(
1093 struct xfs_scrub *sc,
1094 struct xfs_inode *ip,
1095 unsigned int attr_flags,
1096 const unsigned char *name,
1097 unsigned int namelen,
1098 const void *value,
1099 unsigned int valuelen,
1100 void *priv)
1101 {
1102 struct xfs_name xname = {
1103 .name = name,
1104 .len = namelen,
1105 .type = xfs_mode_to_ftype(VFS_I(ip)->i_mode),
1106 };
1107 xfs_ino_t parent_ino;
1108 uint32_t parent_gen;
1109 struct xrep_dir *rd = priv;
1110 int error;
1111
1112 if (!(attr_flags & XFS_ATTR_PARENT))
1113 return 0;
1114
1115 /*
1116 * Ignore parent pointers that point back to a different dir, list the
1117 * wrong generation number, or are invalid.
1118 */
1119 error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
1120 valuelen, &parent_ino, &parent_gen);
1121 if (error)
1122 return error;
1123
1124 if (parent_ino != sc->ip->i_ino ||
1125 parent_gen != VFS_I(sc->ip)->i_generation)
1126 return 0;
1127
1128 mutex_lock(&rd->pscan.lock);
1129 error = xrep_dir_stash_createname(rd, &xname, ip->i_ino);
1130 mutex_unlock(&rd->pscan.lock);
1131 return error;
1132 }
1133
1134 /*
1135 * If this child dirent points to the directory being repaired, remember that
1136 * fact so that we can reset the dotdot entry if necessary.
1137 */
1138 STATIC int
xrep_dir_scan_dirent(struct xfs_scrub * sc,struct xfs_inode * dp,xfs_dir2_dataptr_t dapos,const struct xfs_name * name,xfs_ino_t ino,void * priv)1139 xrep_dir_scan_dirent(
1140 struct xfs_scrub *sc,
1141 struct xfs_inode *dp,
1142 xfs_dir2_dataptr_t dapos,
1143 const struct xfs_name *name,
1144 xfs_ino_t ino,
1145 void *priv)
1146 {
1147 struct xrep_dir *rd = priv;
1148
1149 /* Dirent doesn't point to this directory. */
1150 if (ino != rd->sc->ip->i_ino)
1151 return 0;
1152
1153 /* Ignore garbage inum. */
1154 if (!xfs_verify_dir_ino(rd->sc->mp, ino))
1155 return 0;
1156
1157 /* No weird looking names. */
1158 if (name->len >= MAXNAMELEN || name->len <= 0)
1159 return 0;
1160
1161 /* Don't pick up dot or dotdot entries; we only want child dirents. */
1162 if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
1163 xfs_dir2_samename(name, &xfs_name_dot))
1164 return 0;
1165
1166 trace_xrep_dir_stash_createname(sc->tempip, &xfs_name_dotdot,
1167 dp->i_ino);
1168
1169 xrep_findparent_scan_found(&rd->pscan, dp->i_ino);
1170 return 0;
1171 }
1172
1173 /*
1174 * Decide if we want to look for child dirents or parent pointers in this file.
1175 * Skip the dir being repaired and any files being used to stage repairs.
1176 */
1177 static inline bool
xrep_dir_want_scan(struct xrep_dir * rd,const struct xfs_inode * ip)1178 xrep_dir_want_scan(
1179 struct xrep_dir *rd,
1180 const struct xfs_inode *ip)
1181 {
1182 return ip != rd->sc->ip && !xrep_is_tempfile(ip);
1183 }
1184
1185 /*
1186 * Take ILOCK on a file that we want to scan.
1187 *
1188 * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt or
1189 * has an unloaded attr bmbt. Otherwise, take ILOCK_SHARED.
1190 */
1191 static inline unsigned int
xrep_dir_scan_ilock(struct xrep_dir * rd,struct xfs_inode * ip)1192 xrep_dir_scan_ilock(
1193 struct xrep_dir *rd,
1194 struct xfs_inode *ip)
1195 {
1196 uint lock_mode = XFS_ILOCK_SHARED;
1197
1198 /* Need to take the shared ILOCK to advance the iscan cursor. */
1199 if (!xrep_dir_want_scan(rd, ip))
1200 goto lock;
1201
1202 if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) {
1203 lock_mode = XFS_ILOCK_EXCL;
1204 goto lock;
1205 }
1206
1207 if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
1208 lock_mode = XFS_ILOCK_EXCL;
1209
1210 lock:
1211 xfs_ilock(ip, lock_mode);
1212 return lock_mode;
1213 }
1214
1215 /*
1216 * Scan this file for relevant child dirents or parent pointers that point to
1217 * the directory we're rebuilding.
1218 */
1219 STATIC int
xrep_dir_scan_file(struct xrep_dir * rd,struct xfs_inode * ip)1220 xrep_dir_scan_file(
1221 struct xrep_dir *rd,
1222 struct xfs_inode *ip)
1223 {
1224 unsigned int lock_mode;
1225 int error = 0;
1226
1227 lock_mode = xrep_dir_scan_ilock(rd, ip);
1228
1229 if (!xrep_dir_want_scan(rd, ip))
1230 goto scan_done;
1231
1232 /*
1233 * If the extended attributes look as though they has been zapped by
1234 * the inode record repair code, we cannot scan for parent pointers.
1235 */
1236 if (xchk_pptr_looks_zapped(ip)) {
1237 error = -EBUSY;
1238 goto scan_done;
1239 }
1240
1241 error = xchk_xattr_walk(rd->sc, ip, xrep_dir_scan_pptr, NULL, rd);
1242 if (error)
1243 goto scan_done;
1244
1245 if (S_ISDIR(VFS_I(ip)->i_mode)) {
1246 /*
1247 * If the directory looks as though it has been zapped by the
1248 * inode record repair code, we cannot scan for child dirents.
1249 */
1250 if (xchk_dir_looks_zapped(ip)) {
1251 error = -EBUSY;
1252 goto scan_done;
1253 }
1254
1255 error = xchk_dir_walk(rd->sc, ip, xrep_dir_scan_dirent, rd);
1256 if (error)
1257 goto scan_done;
1258 }
1259
1260 scan_done:
1261 xchk_iscan_mark_visited(&rd->pscan.iscan, ip);
1262 xfs_iunlock(ip, lock_mode);
1263 return error;
1264 }
1265
1266 /*
1267 * Scan all files in the filesystem for parent pointers that we can turn into
1268 * replacement dirents, and a dirent that we can use to set the dotdot pointer.
1269 */
1270 STATIC int
xrep_dir_scan_dirtree(struct xrep_dir * rd)1271 xrep_dir_scan_dirtree(
1272 struct xrep_dir *rd)
1273 {
1274 struct xfs_scrub *sc = rd->sc;
1275 struct xfs_inode *ip;
1276 int error;
1277
1278 /* Roots of directory trees are their own parents. */
1279 if (xchk_inode_is_dirtree_root(sc->ip))
1280 xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino);
1281
1282 /*
1283 * Filesystem scans are time consuming. Drop the directory ILOCK and
1284 * all other resources for the duration of the scan and hope for the
1285 * best. The live update hooks will keep our scan information up to
1286 * date even though we've dropped the locks.
1287 */
1288 xchk_trans_cancel(sc);
1289 if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
1290 xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
1291 XFS_ILOCK_EXCL));
1292 error = xchk_trans_alloc_empty(sc);
1293 if (error)
1294 return error;
1295
1296 while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) {
1297 bool flush;
1298
1299 error = xrep_dir_scan_file(rd, ip);
1300 xchk_irele(sc, ip);
1301 if (error)
1302 break;
1303
1304 /* Flush stashed dirent updates to constrain memory usage. */
1305 mutex_lock(&rd->pscan.lock);
1306 flush = xrep_dir_want_flush_stashed(rd);
1307 mutex_unlock(&rd->pscan.lock);
1308 if (flush) {
1309 xchk_trans_cancel(sc);
1310
1311 error = xrep_tempfile_iolock_polled(sc);
1312 if (error)
1313 break;
1314
1315 error = xrep_dir_replay_updates(rd);
1316 xrep_tempfile_iounlock(sc);
1317 if (error)
1318 break;
1319
1320 error = xchk_trans_alloc_empty(sc);
1321 if (error)
1322 break;
1323 }
1324
1325 if (xchk_should_terminate(sc, &error))
1326 break;
1327 }
1328 xchk_iscan_iter_finish(&rd->pscan.iscan);
1329 if (error) {
1330 /*
1331 * If we couldn't grab an inode that was busy with a state
1332 * change, change the error code so that we exit to userspace
1333 * as quickly as possible.
1334 */
1335 if (error == -EBUSY)
1336 return -ECANCELED;
1337 return error;
1338 }
1339
1340 /*
1341 * Cancel the empty transaction so that we can (later) use the atomic
1342 * file mapping exchange functions to lock files and commit the new
1343 * directory.
1344 */
1345 xchk_trans_cancel(rd->sc);
1346 return 0;
1347 }
1348
1349 /*
1350 * Capture dirent updates being made by other threads which are relevant to the
1351 * directory being repaired.
1352 */
1353 STATIC int
xrep_dir_live_update(struct notifier_block * nb,unsigned long action,void * data)1354 xrep_dir_live_update(
1355 struct notifier_block *nb,
1356 unsigned long action,
1357 void *data)
1358 {
1359 struct xfs_dir_update_params *p = data;
1360 struct xrep_dir *rd;
1361 struct xfs_scrub *sc;
1362 int error = 0;
1363
1364 rd = container_of(nb, struct xrep_dir, pscan.dhook.dirent_hook.nb);
1365 sc = rd->sc;
1366
1367 /*
1368 * This thread updated a child dirent in the directory that we're
1369 * rebuilding. Stash the update for replay against the temporary
1370 * directory.
1371 */
1372 if (p->dp->i_ino == sc->ip->i_ino &&
1373 xchk_iscan_want_live_update(&rd->pscan.iscan, p->ip->i_ino)) {
1374 mutex_lock(&rd->pscan.lock);
1375 if (p->delta > 0)
1376 error = xrep_dir_stash_createname(rd, p->name,
1377 p->ip->i_ino);
1378 else
1379 error = xrep_dir_stash_removename(rd, p->name,
1380 p->ip->i_ino);
1381 mutex_unlock(&rd->pscan.lock);
1382 if (error)
1383 goto out_abort;
1384 }
1385
1386 /*
1387 * This thread updated another directory's child dirent that points to
1388 * the directory that we're rebuilding, so remember the new dotdot
1389 * target.
1390 */
1391 if (p->ip->i_ino == sc->ip->i_ino &&
1392 xchk_iscan_want_live_update(&rd->pscan.iscan, p->dp->i_ino)) {
1393 if (p->delta > 0) {
1394 trace_xrep_dir_stash_createname(sc->tempip,
1395 &xfs_name_dotdot,
1396 p->dp->i_ino);
1397
1398 xrep_findparent_scan_found(&rd->pscan, p->dp->i_ino);
1399 } else {
1400 trace_xrep_dir_stash_removename(sc->tempip,
1401 &xfs_name_dotdot,
1402 rd->pscan.parent_ino);
1403
1404 xrep_findparent_scan_found(&rd->pscan, NULLFSINO);
1405 }
1406 }
1407
1408 return NOTIFY_DONE;
1409 out_abort:
1410 xchk_iscan_abort(&rd->pscan.iscan);
1411 return NOTIFY_DONE;
1412 }
1413
1414 /*
1415 * Free all the directory blocks and reset the data fork. The caller must
1416 * join the inode to the transaction. This function returns with the inode
1417 * joined to a clean scrub transaction.
1418 */
1419 STATIC int
xrep_dir_reset_fork(struct xrep_dir * rd,xfs_ino_t parent_ino)1420 xrep_dir_reset_fork(
1421 struct xrep_dir *rd,
1422 xfs_ino_t parent_ino)
1423 {
1424 struct xfs_scrub *sc = rd->sc;
1425 struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK);
1426 int error;
1427
1428 /* Unmap all the directory buffers. */
1429 if (xfs_ifork_has_extents(ifp)) {
1430 error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
1431 if (error)
1432 return error;
1433 }
1434
1435 trace_xrep_dir_reset_fork(sc->tempip, parent_ino);
1436
1437 /* Reset the data fork to an empty data fork. */
1438 xfs_idestroy_fork(ifp);
1439 ifp->if_bytes = 0;
1440 sc->tempip->i_disk_size = 0;
1441
1442 /* Reinitialize the short form directory. */
1443 xrep_dir_init_args(rd, sc->tempip, NULL);
1444 return xfs_dir2_sf_create(&rd->args, parent_ino);
1445 }
1446
1447 /*
1448 * Prepare both inodes' directory forks for exchanging mappings. Promote the
1449 * tempfile from short format to leaf format, and if the file being repaired
1450 * has a short format data fork, turn it into an empty extent list.
1451 */
1452 STATIC int
xrep_dir_swap_prep(struct xfs_scrub * sc,bool temp_local,bool ip_local)1453 xrep_dir_swap_prep(
1454 struct xfs_scrub *sc,
1455 bool temp_local,
1456 bool ip_local)
1457 {
1458 int error;
1459
1460 /*
1461 * If the tempfile's directory is in shortform format, convert that to
1462 * a single leaf extent so that we can use the atomic mapping exchange.
1463 */
1464 if (temp_local) {
1465 struct xfs_da_args args = {
1466 .dp = sc->tempip,
1467 .geo = sc->mp->m_dir_geo,
1468 .whichfork = XFS_DATA_FORK,
1469 .trans = sc->tp,
1470 .total = 1,
1471 .owner = sc->ip->i_ino,
1472 };
1473
1474 error = xfs_dir2_sf_to_block(&args);
1475 if (error)
1476 return error;
1477
1478 /*
1479 * Roll the deferred log items to get us back to a clean
1480 * transaction.
1481 */
1482 error = xfs_defer_finish(&sc->tp);
1483 if (error)
1484 return error;
1485 }
1486
1487 /*
1488 * If the file being repaired had a shortform data fork, convert that
1489 * to an empty extent list in preparation for the atomic mapping
1490 * exchange.
1491 */
1492 if (ip_local) {
1493 struct xfs_ifork *ifp;
1494
1495 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1496 xfs_idestroy_fork(ifp);
1497 ifp->if_format = XFS_DINODE_FMT_EXTENTS;
1498 ifp->if_nextents = 0;
1499 ifp->if_bytes = 0;
1500 ifp->if_data = NULL;
1501 ifp->if_height = 0;
1502
1503 xfs_trans_log_inode(sc->tp, sc->ip,
1504 XFS_ILOG_CORE | XFS_ILOG_DDATA);
1505 }
1506
1507 return 0;
1508 }
1509
1510 /*
1511 * Replace the inode number of a directory entry.
1512 */
1513 static int
xrep_dir_replace(struct xrep_dir * rd,struct xfs_inode * dp,const struct xfs_name * name,xfs_ino_t inum,xfs_extlen_t total)1514 xrep_dir_replace(
1515 struct xrep_dir *rd,
1516 struct xfs_inode *dp,
1517 const struct xfs_name *name,
1518 xfs_ino_t inum,
1519 xfs_extlen_t total)
1520 {
1521 struct xfs_scrub *sc = rd->sc;
1522 int error;
1523
1524 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
1525
1526 error = xfs_dir_ino_validate(sc->mp, inum);
1527 if (error)
1528 return error;
1529
1530 xrep_dir_init_args(rd, dp, name);
1531 rd->args.inumber = inum;
1532 rd->args.total = total;
1533 return xfs_dir_replace_args(&rd->args);
1534 }
1535
1536 /*
1537 * Reset the link count of this directory and adjust the unlinked list pointers
1538 * as needed.
1539 */
1540 STATIC int
xrep_dir_set_nlink(struct xrep_dir * rd)1541 xrep_dir_set_nlink(
1542 struct xrep_dir *rd)
1543 {
1544 struct xfs_scrub *sc = rd->sc;
1545 struct xfs_inode *dp = sc->ip;
1546 struct xfs_perag *pag;
1547 unsigned int new_nlink = min_t(unsigned long long,
1548 rd->subdirs + 2,
1549 XFS_NLINK_PINNED);
1550 int error;
1551
1552 /*
1553 * The directory is not on the incore unlinked list, which means that
1554 * it needs to be reachable via the directory tree. Update the nlink
1555 * with our observed link count. If the directory has no parent, it
1556 * will be moved to the orphanage.
1557 */
1558 if (!xfs_inode_on_unlinked_list(dp))
1559 goto reset_nlink;
1560
1561 /*
1562 * The directory is on the unlinked list and we did not find any
1563 * dirents. Set the link count to zero and let the directory
1564 * inactivate when the last reference drops.
1565 */
1566 if (rd->dirents == 0) {
1567 rd->needs_adoption = false;
1568 new_nlink = 0;
1569 goto reset_nlink;
1570 }
1571
1572 /*
1573 * The directory is on the unlinked list and we found dirents. This
1574 * directory needs to be reachable via the directory tree. Remove the
1575 * dir from the unlinked list and update nlink with the observed link
1576 * count. If the directory has no parent, it will be moved to the
1577 * orphanage.
1578 */
1579 pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino));
1580 if (!pag) {
1581 ASSERT(0);
1582 return -EFSCORRUPTED;
1583 }
1584
1585 error = xfs_iunlink_remove(sc->tp, pag, dp);
1586 xfs_perag_put(pag);
1587 if (error)
1588 return error;
1589
1590 reset_nlink:
1591 if (VFS_I(dp)->i_nlink != new_nlink)
1592 set_nlink(VFS_I(dp), new_nlink);
1593 return 0;
1594 }
1595
1596 /*
1597 * Finish replaying stashed dirent updates, allocate a transaction for
1598 * exchanging data fork mappings, and take the ILOCKs of both directories
1599 * before we commit the new directory structure.
1600 */
1601 STATIC int
xrep_dir_finalize_tempdir(struct xrep_dir * rd)1602 xrep_dir_finalize_tempdir(
1603 struct xrep_dir *rd)
1604 {
1605 struct xfs_scrub *sc = rd->sc;
1606 int error;
1607
1608 if (!xfs_has_parent(sc->mp))
1609 return xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
1610
1611 /*
1612 * Repair relies on the ILOCK to quiesce all possible dirent updates.
1613 * Replay all queued dirent updates into the tempdir before exchanging
1614 * the contents, even if that means dropping the ILOCKs and the
1615 * transaction.
1616 */
1617 do {
1618 error = xrep_dir_replay_updates(rd);
1619 if (error)
1620 return error;
1621
1622 error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
1623 if (error)
1624 return error;
1625
1626 if (xfarray_length(rd->dir_entries) == 0)
1627 break;
1628
1629 xchk_trans_cancel(sc);
1630 xrep_tempfile_iunlock_both(sc);
1631 } while (!xchk_should_terminate(sc, &error));
1632 return error;
1633 }
1634
1635 /* Exchange the temporary directory's data fork with the one being repaired. */
1636 STATIC int
xrep_dir_swap(struct xrep_dir * rd)1637 xrep_dir_swap(
1638 struct xrep_dir *rd)
1639 {
1640 struct xfs_scrub *sc = rd->sc;
1641 xfs_ino_t ino;
1642 bool ip_local, temp_local;
1643 int error = 0;
1644
1645 /*
1646 * If we never found the parent for this directory, temporarily assign
1647 * the root dir as the parent; we'll move this to the orphanage after
1648 * exchanging the dir contents. We hold the ILOCK of the dir being
1649 * repaired, so we're not worried about racy updates of dotdot.
1650 */
1651 ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
1652 if (rd->pscan.parent_ino == NULLFSINO) {
1653 rd->needs_adoption = true;
1654 rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino;
1655 }
1656
1657 /*
1658 * Reset the temporary directory's '..' entry to point to the parent
1659 * that we found. The dirent replace code asserts if the dirent
1660 * already points at the new inumber, so we look it up here.
1661 *
1662 * It's also possible that this replacement could also expand a sf
1663 * tempdir into block format.
1664 */
1665 error = xchk_dir_lookup(sc, rd->sc->tempip, &xfs_name_dotdot, &ino);
1666 if (error)
1667 return error;
1668
1669 if (rd->pscan.parent_ino != ino) {
1670 error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot,
1671 rd->pscan.parent_ino, rd->tx.req.resblks);
1672 if (error)
1673 return error;
1674 }
1675
1676 /*
1677 * Changing the dot and dotdot entries could have changed the shape of
1678 * the directory, so we recompute these.
1679 */
1680 ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
1681 temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
1682
1683 /*
1684 * If the both files have a local format data fork and the rebuilt
1685 * directory data would fit in the repaired file's data fork, copy
1686 * the contents from the tempfile and update the directory link count.
1687 * We're done now.
1688 */
1689 if (ip_local && temp_local &&
1690 sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) {
1691 xrep_tempfile_copyout_local(sc, XFS_DATA_FORK);
1692 return xrep_dir_set_nlink(rd);
1693 }
1694
1695 /*
1696 * Clean the transaction before we start working on exchanging
1697 * directory contents.
1698 */
1699 error = xrep_tempfile_roll_trans(rd->sc);
1700 if (error)
1701 return error;
1702
1703 /* Otherwise, make sure both data forks are in block-mapping mode. */
1704 error = xrep_dir_swap_prep(sc, temp_local, ip_local);
1705 if (error)
1706 return error;
1707
1708 /*
1709 * Set nlink of the directory in the same transaction sequence that
1710 * (atomically) commits the new directory data.
1711 */
1712 error = xrep_dir_set_nlink(rd);
1713 if (error)
1714 return error;
1715
1716 return xrep_tempexch_contents(sc, &rd->tx);
1717 }
1718
1719 /*
1720 * Exchange the new directory contents (which we created in the tempfile) with
1721 * the directory being repaired.
1722 */
1723 STATIC int
xrep_dir_rebuild_tree(struct xrep_dir * rd)1724 xrep_dir_rebuild_tree(
1725 struct xrep_dir *rd)
1726 {
1727 struct xfs_scrub *sc = rd->sc;
1728 int error;
1729
1730 trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino);
1731
1732 /*
1733 * Take the IOLOCK on the temporary file so that we can run dir
1734 * operations with the same locks held as we would for a normal file.
1735 * We still hold sc->ip's IOLOCK.
1736 */
1737 error = xrep_tempfile_iolock_polled(rd->sc);
1738 if (error)
1739 return error;
1740
1741 /*
1742 * Allocate transaction, lock inodes, and make sure that we've replayed
1743 * all the stashed dirent updates to the tempdir. After this point,
1744 * we're ready to exchange data fork mappings.
1745 */
1746 error = xrep_dir_finalize_tempdir(rd);
1747 if (error)
1748 return error;
1749
1750 if (xchk_iscan_aborted(&rd->pscan.iscan))
1751 return -ECANCELED;
1752
1753 /*
1754 * Exchange the tempdir's data fork with the file being repaired. This
1755 * recreates the transaction and re-takes the ILOCK in the scrub
1756 * context.
1757 */
1758 error = xrep_dir_swap(rd);
1759 if (error)
1760 return error;
1761
1762 /*
1763 * Release the old directory blocks and reset the data fork of the temp
1764 * directory to an empty shortform directory because inactivation does
1765 * nothing for directories.
1766 */
1767 error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino);
1768 if (error)
1769 return error;
1770
1771 /*
1772 * Roll to get a transaction without any inodes joined to it. Then we
1773 * can drop the tempfile's ILOCK and IOLOCK before doing more work on
1774 * the scrub target directory.
1775 */
1776 error = xfs_trans_roll(&sc->tp);
1777 if (error)
1778 return error;
1779
1780 xrep_tempfile_iunlock(sc);
1781 xrep_tempfile_iounlock(sc);
1782 return 0;
1783 }
1784
1785 /* Set up the filesystem scan so we can regenerate directory entries. */
1786 STATIC int
xrep_dir_setup_scan(struct xrep_dir * rd)1787 xrep_dir_setup_scan(
1788 struct xrep_dir *rd)
1789 {
1790 struct xfs_scrub *sc = rd->sc;
1791 char *descr;
1792 int error;
1793
1794 /* Set up some staging memory for salvaging dirents. */
1795 descr = xchk_xfile_ino_descr(sc, "directory entries");
1796 error = xfarray_create(descr, 0, sizeof(struct xrep_dirent),
1797 &rd->dir_entries);
1798 kfree(descr);
1799 if (error)
1800 return error;
1801
1802 descr = xchk_xfile_ino_descr(sc, "directory entry names");
1803 error = xfblob_create(descr, &rd->dir_names);
1804 kfree(descr);
1805 if (error)
1806 goto out_xfarray;
1807
1808 if (xfs_has_parent(sc->mp))
1809 error = __xrep_findparent_scan_start(sc, &rd->pscan,
1810 xrep_dir_live_update);
1811 else
1812 error = xrep_findparent_scan_start(sc, &rd->pscan);
1813 if (error)
1814 goto out_xfblob;
1815
1816 return 0;
1817
1818 out_xfblob:
1819 xfblob_destroy(rd->dir_names);
1820 rd->dir_names = NULL;
1821 out_xfarray:
1822 xfarray_destroy(rd->dir_entries);
1823 rd->dir_entries = NULL;
1824 return error;
1825 }
1826
1827 /*
1828 * Move the current file to the orphanage.
1829 *
1830 * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon
1831 * successful return, the scrub transaction will have enough extra reservation
1832 * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the
1833 * orphanage; and both inodes will be ijoined.
1834 */
1835 STATIC int
xrep_dir_move_to_orphanage(struct xrep_dir * rd)1836 xrep_dir_move_to_orphanage(
1837 struct xrep_dir *rd)
1838 {
1839 struct xfs_scrub *sc = rd->sc;
1840 xfs_ino_t orig_parent, new_parent;
1841 int error;
1842
1843 /*
1844 * We are about to drop the ILOCK on sc->ip to lock the orphanage and
1845 * prepare for the adoption. Therefore, look up the old dotdot entry
1846 * for sc->ip so that we can compare it after we re-lock sc->ip.
1847 */
1848 error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent);
1849 if (error)
1850 return error;
1851
1852 /*
1853 * Drop the ILOCK on the scrub target and commit the transaction.
1854 * Adoption computes its own resource requirements and gathers the
1855 * necessary components.
1856 */
1857 error = xrep_trans_commit(sc);
1858 if (error)
1859 return error;
1860 xchk_iunlock(sc, XFS_ILOCK_EXCL);
1861
1862 /* If we can take the orphanage's iolock then we're ready to move. */
1863 if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
1864 xchk_iunlock(sc, sc->ilock_flags);
1865 error = xrep_orphanage_iolock_two(sc);
1866 if (error)
1867 return error;
1868 }
1869
1870 /* Grab transaction and ILOCK the two files. */
1871 error = xrep_adoption_trans_alloc(sc, &rd->adoption);
1872 if (error)
1873 return error;
1874
1875 error = xrep_adoption_compute_name(&rd->adoption, &rd->xname);
1876 if (error)
1877 return error;
1878
1879 /*
1880 * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot
1881 * entry again. If the parent changed or the child was unlinked while
1882 * the child directory was unlocked, we don't need to move the child to
1883 * the orphanage after all.
1884 */
1885 error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent);
1886 if (error)
1887 return error;
1888
1889 /*
1890 * Attach to the orphanage if we still have a linked directory and it
1891 * hasn't been moved.
1892 */
1893 if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) {
1894 error = xrep_adoption_move(&rd->adoption);
1895 if (error)
1896 return error;
1897 }
1898
1899 /*
1900 * Launder the scrub transaction so we can drop the orphanage ILOCK
1901 * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK.
1902 */
1903 error = xrep_adoption_trans_roll(&rd->adoption);
1904 if (error)
1905 return error;
1906
1907 xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
1908 xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
1909 return 0;
1910 }
1911
1912 /*
1913 * Repair the directory metadata.
1914 *
1915 * XXX: Directory entry buffers can be multiple fsblocks in size. The buffer
1916 * cache in XFS can't handle aliased multiblock buffers, so this might
1917 * misbehave if the directory blocks are crosslinked with other filesystem
1918 * metadata.
1919 *
1920 * XXX: Is it necessary to check the dcache for this directory to make sure
1921 * that we always recreate every cached entry?
1922 */
1923 int
xrep_directory(struct xfs_scrub * sc)1924 xrep_directory(
1925 struct xfs_scrub *sc)
1926 {
1927 struct xrep_dir *rd = sc->buf;
1928 int error;
1929
1930 /* The rmapbt is required to reap the old data fork. */
1931 if (!xfs_has_rmapbt(sc->mp))
1932 return -EOPNOTSUPP;
1933 /* We require atomic file exchange range to rebuild anything. */
1934 if (!xfs_has_exchange_range(sc->mp))
1935 return -EOPNOTSUPP;
1936
1937 error = xrep_dir_setup_scan(rd);
1938 if (error)
1939 return error;
1940
1941 if (xfs_has_parent(sc->mp))
1942 error = xrep_dir_scan_dirtree(rd);
1943 else
1944 error = xrep_dir_salvage_entries(rd);
1945 if (error)
1946 goto out_teardown;
1947
1948 /* Last chance to abort before we start committing fixes. */
1949 if (xchk_should_terminate(sc, &error))
1950 goto out_teardown;
1951
1952 error = xrep_dir_rebuild_tree(rd);
1953 if (error)
1954 goto out_teardown;
1955
1956 if (rd->needs_adoption) {
1957 if (!xrep_orphanage_can_adopt(rd->sc))
1958 error = -EFSCORRUPTED;
1959 else
1960 error = xrep_dir_move_to_orphanage(rd);
1961 if (error)
1962 goto out_teardown;
1963 }
1964
1965 out_teardown:
1966 xrep_dir_teardown(sc);
1967 return error;
1968 }
1969