xref: /aosp_15_r20/external/erofs-utils/lib/dedupe.c (revision 33b1fccf6a0fada2c2875d400ed01119b7676ee5)
1*33b1fccfSAndroid Build Coastguard Worker // SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0
2*33b1fccfSAndroid Build Coastguard Worker /*
3*33b1fccfSAndroid Build Coastguard Worker  * Copyright (C) 2022 Alibaba Cloud
4*33b1fccfSAndroid Build Coastguard Worker  */
5*33b1fccfSAndroid Build Coastguard Worker #include <stdlib.h>
6*33b1fccfSAndroid Build Coastguard Worker #include "erofs/dedupe.h"
7*33b1fccfSAndroid Build Coastguard Worker #include "erofs/print.h"
8*33b1fccfSAndroid Build Coastguard Worker #include "rolling_hash.h"
9*33b1fccfSAndroid Build Coastguard Worker #include "xxhash.h"
10*33b1fccfSAndroid Build Coastguard Worker #include "sha256.h"
11*33b1fccfSAndroid Build Coastguard Worker 
erofs_memcmp2(const u8 * s1,const u8 * s2,unsigned long sz)12*33b1fccfSAndroid Build Coastguard Worker unsigned long erofs_memcmp2(const u8 *s1, const u8 *s2,
13*33b1fccfSAndroid Build Coastguard Worker 			    unsigned long sz)
14*33b1fccfSAndroid Build Coastguard Worker {
15*33b1fccfSAndroid Build Coastguard Worker 	const unsigned long *a1, *a2;
16*33b1fccfSAndroid Build Coastguard Worker 	unsigned long n = sz;
17*33b1fccfSAndroid Build Coastguard Worker 
18*33b1fccfSAndroid Build Coastguard Worker 	if (sz < sizeof(long))
19*33b1fccfSAndroid Build Coastguard Worker 		goto out_bytes;
20*33b1fccfSAndroid Build Coastguard Worker 
21*33b1fccfSAndroid Build Coastguard Worker 	if (((long)s1 & (sizeof(long) - 1)) ==
22*33b1fccfSAndroid Build Coastguard Worker 			((long)s2 & (sizeof(long) - 1))) {
23*33b1fccfSAndroid Build Coastguard Worker 		while ((long)s1 & (sizeof(long) - 1)) {
24*33b1fccfSAndroid Build Coastguard Worker 			if (*s1 != *s2)
25*33b1fccfSAndroid Build Coastguard Worker 				break;
26*33b1fccfSAndroid Build Coastguard Worker 			++s1;
27*33b1fccfSAndroid Build Coastguard Worker 			++s2;
28*33b1fccfSAndroid Build Coastguard Worker 			--sz;
29*33b1fccfSAndroid Build Coastguard Worker 		}
30*33b1fccfSAndroid Build Coastguard Worker 
31*33b1fccfSAndroid Build Coastguard Worker 		a1 = (const unsigned long *)s1;
32*33b1fccfSAndroid Build Coastguard Worker 		a2 = (const unsigned long *)s2;
33*33b1fccfSAndroid Build Coastguard Worker 		while (sz >= sizeof(long)) {
34*33b1fccfSAndroid Build Coastguard Worker 			if (*a1 != *a2)
35*33b1fccfSAndroid Build Coastguard Worker 				break;
36*33b1fccfSAndroid Build Coastguard Worker 			++a1;
37*33b1fccfSAndroid Build Coastguard Worker 			++a2;
38*33b1fccfSAndroid Build Coastguard Worker 			sz -= sizeof(long);
39*33b1fccfSAndroid Build Coastguard Worker 		}
40*33b1fccfSAndroid Build Coastguard Worker 	} else {
41*33b1fccfSAndroid Build Coastguard Worker 		a1 = (const unsigned long *)s1;
42*33b1fccfSAndroid Build Coastguard Worker 		a2 = (const unsigned long *)s2;
43*33b1fccfSAndroid Build Coastguard Worker 		do {
44*33b1fccfSAndroid Build Coastguard Worker 			if (get_unaligned(a1) != get_unaligned(a2))
45*33b1fccfSAndroid Build Coastguard Worker 				break;
46*33b1fccfSAndroid Build Coastguard Worker 			++a1;
47*33b1fccfSAndroid Build Coastguard Worker 			++a2;
48*33b1fccfSAndroid Build Coastguard Worker 			sz -= sizeof(long);
49*33b1fccfSAndroid Build Coastguard Worker 		} while (sz >= sizeof(long));
50*33b1fccfSAndroid Build Coastguard Worker 	}
51*33b1fccfSAndroid Build Coastguard Worker 	s1 = (const u8 *)a1;
52*33b1fccfSAndroid Build Coastguard Worker 	s2 = (const u8 *)a2;
53*33b1fccfSAndroid Build Coastguard Worker out_bytes:
54*33b1fccfSAndroid Build Coastguard Worker 	while (sz) {
55*33b1fccfSAndroid Build Coastguard Worker 		if (*s1 != *s2)
56*33b1fccfSAndroid Build Coastguard Worker 			break;
57*33b1fccfSAndroid Build Coastguard Worker 		++s1;
58*33b1fccfSAndroid Build Coastguard Worker 		++s2;
59*33b1fccfSAndroid Build Coastguard Worker 		--sz;
60*33b1fccfSAndroid Build Coastguard Worker 	}
61*33b1fccfSAndroid Build Coastguard Worker 	return n - sz;
62*33b1fccfSAndroid Build Coastguard Worker }
63*33b1fccfSAndroid Build Coastguard Worker 
64*33b1fccfSAndroid Build Coastguard Worker static unsigned int window_size, rollinghash_rm;
65*33b1fccfSAndroid Build Coastguard Worker static struct list_head dedupe_tree[65536];
66*33b1fccfSAndroid Build Coastguard Worker struct z_erofs_dedupe_item *dedupe_subtree;
67*33b1fccfSAndroid Build Coastguard Worker 
68*33b1fccfSAndroid Build Coastguard Worker struct z_erofs_dedupe_item {
69*33b1fccfSAndroid Build Coastguard Worker 	struct list_head list;
70*33b1fccfSAndroid Build Coastguard Worker 	struct z_erofs_dedupe_item *chain;
71*33b1fccfSAndroid Build Coastguard Worker 	long long	hash;
72*33b1fccfSAndroid Build Coastguard Worker 	u8		prefix_sha256[32];
73*33b1fccfSAndroid Build Coastguard Worker 	u64		prefix_xxh64;
74*33b1fccfSAndroid Build Coastguard Worker 
75*33b1fccfSAndroid Build Coastguard Worker 	erofs_blk_t	compressed_blkaddr;
76*33b1fccfSAndroid Build Coastguard Worker 	unsigned int	compressed_blks;
77*33b1fccfSAndroid Build Coastguard Worker 
78*33b1fccfSAndroid Build Coastguard Worker 	int		original_length;
79*33b1fccfSAndroid Build Coastguard Worker 	bool		partial, raw;
80*33b1fccfSAndroid Build Coastguard Worker 	u8		extra_data[];
81*33b1fccfSAndroid Build Coastguard Worker };
82*33b1fccfSAndroid Build Coastguard Worker 
z_erofs_dedupe_match(struct z_erofs_dedupe_ctx * ctx)83*33b1fccfSAndroid Build Coastguard Worker int z_erofs_dedupe_match(struct z_erofs_dedupe_ctx *ctx)
84*33b1fccfSAndroid Build Coastguard Worker {
85*33b1fccfSAndroid Build Coastguard Worker 	struct z_erofs_dedupe_item e_find;
86*33b1fccfSAndroid Build Coastguard Worker 	u8 *cur;
87*33b1fccfSAndroid Build Coastguard Worker 	bool initial = true;
88*33b1fccfSAndroid Build Coastguard Worker 
89*33b1fccfSAndroid Build Coastguard Worker 	if (!window_size)
90*33b1fccfSAndroid Build Coastguard Worker 		return -ENOENT;
91*33b1fccfSAndroid Build Coastguard Worker 
92*33b1fccfSAndroid Build Coastguard Worker 	if (ctx->cur > ctx->end - window_size)
93*33b1fccfSAndroid Build Coastguard Worker 		cur = ctx->end - window_size;
94*33b1fccfSAndroid Build Coastguard Worker 	else
95*33b1fccfSAndroid Build Coastguard Worker 		cur = ctx->cur;
96*33b1fccfSAndroid Build Coastguard Worker 
97*33b1fccfSAndroid Build Coastguard Worker 	/* move backward byte-by-byte */
98*33b1fccfSAndroid Build Coastguard Worker 	for (; cur >= ctx->start; --cur) {
99*33b1fccfSAndroid Build Coastguard Worker 		struct list_head *p;
100*33b1fccfSAndroid Build Coastguard Worker 		struct z_erofs_dedupe_item *e;
101*33b1fccfSAndroid Build Coastguard Worker 
102*33b1fccfSAndroid Build Coastguard Worker 		unsigned int extra = 0;
103*33b1fccfSAndroid Build Coastguard Worker 		u64 xxh64_csum = 0;
104*33b1fccfSAndroid Build Coastguard Worker 		u8 sha256[32];
105*33b1fccfSAndroid Build Coastguard Worker 
106*33b1fccfSAndroid Build Coastguard Worker 		if (initial) {
107*33b1fccfSAndroid Build Coastguard Worker 			/* initial try */
108*33b1fccfSAndroid Build Coastguard Worker 			e_find.hash = erofs_rolling_hash_init(cur, window_size, true);
109*33b1fccfSAndroid Build Coastguard Worker 			initial = false;
110*33b1fccfSAndroid Build Coastguard Worker 		} else {
111*33b1fccfSAndroid Build Coastguard Worker 			e_find.hash = erofs_rolling_hash_advance(e_find.hash,
112*33b1fccfSAndroid Build Coastguard Worker 				rollinghash_rm, cur[window_size], cur[0]);
113*33b1fccfSAndroid Build Coastguard Worker 		}
114*33b1fccfSAndroid Build Coastguard Worker 
115*33b1fccfSAndroid Build Coastguard Worker 		p = &dedupe_tree[e_find.hash & (ARRAY_SIZE(dedupe_tree) - 1)];
116*33b1fccfSAndroid Build Coastguard Worker 		list_for_each_entry(e, p, list) {
117*33b1fccfSAndroid Build Coastguard Worker 			if (e->hash != e_find.hash)
118*33b1fccfSAndroid Build Coastguard Worker 				continue;
119*33b1fccfSAndroid Build Coastguard Worker 			if (!extra) {
120*33b1fccfSAndroid Build Coastguard Worker 				xxh64_csum = xxh64(cur, window_size, 0);
121*33b1fccfSAndroid Build Coastguard Worker 				extra = 1;
122*33b1fccfSAndroid Build Coastguard Worker 			}
123*33b1fccfSAndroid Build Coastguard Worker 			if (e->prefix_xxh64 == xxh64_csum)
124*33b1fccfSAndroid Build Coastguard Worker 				break;
125*33b1fccfSAndroid Build Coastguard Worker 		}
126*33b1fccfSAndroid Build Coastguard Worker 
127*33b1fccfSAndroid Build Coastguard Worker 		if (&e->list == p)
128*33b1fccfSAndroid Build Coastguard Worker 			continue;
129*33b1fccfSAndroid Build Coastguard Worker 
130*33b1fccfSAndroid Build Coastguard Worker 		erofs_sha256(cur, window_size, sha256);
131*33b1fccfSAndroid Build Coastguard Worker 		if (memcmp(sha256, e->prefix_sha256, sizeof(sha256)))
132*33b1fccfSAndroid Build Coastguard Worker 			continue;
133*33b1fccfSAndroid Build Coastguard Worker 
134*33b1fccfSAndroid Build Coastguard Worker 		extra = min_t(unsigned int, ctx->end - cur - window_size,
135*33b1fccfSAndroid Build Coastguard Worker 			      e->original_length - window_size);
136*33b1fccfSAndroid Build Coastguard Worker 		extra = erofs_memcmp2(cur + window_size, e->extra_data, extra);
137*33b1fccfSAndroid Build Coastguard Worker 		if (window_size + extra <= ctx->cur - cur)
138*33b1fccfSAndroid Build Coastguard Worker 			continue;
139*33b1fccfSAndroid Build Coastguard Worker 		ctx->cur = cur;
140*33b1fccfSAndroid Build Coastguard Worker 		ctx->e.length = window_size + extra;
141*33b1fccfSAndroid Build Coastguard Worker 		ctx->e.partial = e->partial ||
142*33b1fccfSAndroid Build Coastguard Worker 			(window_size + extra < e->original_length);
143*33b1fccfSAndroid Build Coastguard Worker 		ctx->e.raw = e->raw;
144*33b1fccfSAndroid Build Coastguard Worker 		ctx->e.inlined = false;
145*33b1fccfSAndroid Build Coastguard Worker 		ctx->e.blkaddr = e->compressed_blkaddr;
146*33b1fccfSAndroid Build Coastguard Worker 		ctx->e.compressedblks = e->compressed_blks;
147*33b1fccfSAndroid Build Coastguard Worker 		return 0;
148*33b1fccfSAndroid Build Coastguard Worker 	}
149*33b1fccfSAndroid Build Coastguard Worker 	return -ENOENT;
150*33b1fccfSAndroid Build Coastguard Worker }
151*33b1fccfSAndroid Build Coastguard Worker 
z_erofs_dedupe_insert(struct z_erofs_inmem_extent * e,void * original_data)152*33b1fccfSAndroid Build Coastguard Worker int z_erofs_dedupe_insert(struct z_erofs_inmem_extent *e,
153*33b1fccfSAndroid Build Coastguard Worker 			  void *original_data)
154*33b1fccfSAndroid Build Coastguard Worker {
155*33b1fccfSAndroid Build Coastguard Worker 	struct list_head *p;
156*33b1fccfSAndroid Build Coastguard Worker 	struct z_erofs_dedupe_item *di, *k;
157*33b1fccfSAndroid Build Coastguard Worker 
158*33b1fccfSAndroid Build Coastguard Worker 	if (!window_size || e->length < window_size)
159*33b1fccfSAndroid Build Coastguard Worker 		return 0;
160*33b1fccfSAndroid Build Coastguard Worker 
161*33b1fccfSAndroid Build Coastguard Worker 	di = malloc(sizeof(*di) + e->length - window_size);
162*33b1fccfSAndroid Build Coastguard Worker 	if (!di)
163*33b1fccfSAndroid Build Coastguard Worker 		return -ENOMEM;
164*33b1fccfSAndroid Build Coastguard Worker 
165*33b1fccfSAndroid Build Coastguard Worker 	di->original_length = e->length;
166*33b1fccfSAndroid Build Coastguard Worker 	erofs_sha256(original_data, window_size, di->prefix_sha256);
167*33b1fccfSAndroid Build Coastguard Worker 
168*33b1fccfSAndroid Build Coastguard Worker 	di->prefix_xxh64 = xxh64(original_data, window_size, 0);
169*33b1fccfSAndroid Build Coastguard Worker 	di->hash = erofs_rolling_hash_init(original_data,
170*33b1fccfSAndroid Build Coastguard Worker 			window_size, true);
171*33b1fccfSAndroid Build Coastguard Worker 	memcpy(di->extra_data, original_data + window_size,
172*33b1fccfSAndroid Build Coastguard Worker 	       e->length - window_size);
173*33b1fccfSAndroid Build Coastguard Worker 	di->compressed_blkaddr = e->blkaddr;
174*33b1fccfSAndroid Build Coastguard Worker 	di->compressed_blks = e->compressedblks;
175*33b1fccfSAndroid Build Coastguard Worker 	di->partial = e->partial;
176*33b1fccfSAndroid Build Coastguard Worker 	di->raw = e->raw;
177*33b1fccfSAndroid Build Coastguard Worker 
178*33b1fccfSAndroid Build Coastguard Worker 	/* skip the same xxh64 hash */
179*33b1fccfSAndroid Build Coastguard Worker 	p = &dedupe_tree[di->hash & (ARRAY_SIZE(dedupe_tree) - 1)];
180*33b1fccfSAndroid Build Coastguard Worker 	list_for_each_entry(k, p, list) {
181*33b1fccfSAndroid Build Coastguard Worker 		if (k->prefix_xxh64 == di->prefix_xxh64) {
182*33b1fccfSAndroid Build Coastguard Worker 			free(di);
183*33b1fccfSAndroid Build Coastguard Worker 			return 0;
184*33b1fccfSAndroid Build Coastguard Worker 		}
185*33b1fccfSAndroid Build Coastguard Worker 	}
186*33b1fccfSAndroid Build Coastguard Worker 	di->chain = dedupe_subtree;
187*33b1fccfSAndroid Build Coastguard Worker 	dedupe_subtree = di;
188*33b1fccfSAndroid Build Coastguard Worker 	list_add_tail(&di->list, p);
189*33b1fccfSAndroid Build Coastguard Worker 	return 0;
190*33b1fccfSAndroid Build Coastguard Worker }
191*33b1fccfSAndroid Build Coastguard Worker 
z_erofs_dedupe_commit(bool drop)192*33b1fccfSAndroid Build Coastguard Worker void z_erofs_dedupe_commit(bool drop)
193*33b1fccfSAndroid Build Coastguard Worker {
194*33b1fccfSAndroid Build Coastguard Worker 	if (!dedupe_subtree)
195*33b1fccfSAndroid Build Coastguard Worker 		return;
196*33b1fccfSAndroid Build Coastguard Worker 	if (drop) {
197*33b1fccfSAndroid Build Coastguard Worker 		struct z_erofs_dedupe_item *di, *n;
198*33b1fccfSAndroid Build Coastguard Worker 
199*33b1fccfSAndroid Build Coastguard Worker 		for (di = dedupe_subtree; di; di = n) {
200*33b1fccfSAndroid Build Coastguard Worker 			n = di->chain;
201*33b1fccfSAndroid Build Coastguard Worker 			list_del(&di->list);
202*33b1fccfSAndroid Build Coastguard Worker 			free(di);
203*33b1fccfSAndroid Build Coastguard Worker 		}
204*33b1fccfSAndroid Build Coastguard Worker 	}
205*33b1fccfSAndroid Build Coastguard Worker 	dedupe_subtree = NULL;
206*33b1fccfSAndroid Build Coastguard Worker }
207*33b1fccfSAndroid Build Coastguard Worker 
z_erofs_dedupe_init(unsigned int wsiz)208*33b1fccfSAndroid Build Coastguard Worker int z_erofs_dedupe_init(unsigned int wsiz)
209*33b1fccfSAndroid Build Coastguard Worker {
210*33b1fccfSAndroid Build Coastguard Worker 	struct list_head *p;
211*33b1fccfSAndroid Build Coastguard Worker 
212*33b1fccfSAndroid Build Coastguard Worker 	for (p = dedupe_tree;
213*33b1fccfSAndroid Build Coastguard Worker 		p < dedupe_tree + ARRAY_SIZE(dedupe_tree); ++p)
214*33b1fccfSAndroid Build Coastguard Worker 		init_list_head(p);
215*33b1fccfSAndroid Build Coastguard Worker 
216*33b1fccfSAndroid Build Coastguard Worker 	window_size = wsiz;
217*33b1fccfSAndroid Build Coastguard Worker 	rollinghash_rm = erofs_rollinghash_calc_rm(window_size);
218*33b1fccfSAndroid Build Coastguard Worker 	return 0;
219*33b1fccfSAndroid Build Coastguard Worker }
220*33b1fccfSAndroid Build Coastguard Worker 
z_erofs_dedupe_exit(void)221*33b1fccfSAndroid Build Coastguard Worker void z_erofs_dedupe_exit(void)
222*33b1fccfSAndroid Build Coastguard Worker {
223*33b1fccfSAndroid Build Coastguard Worker 	struct z_erofs_dedupe_item *di, *n;
224*33b1fccfSAndroid Build Coastguard Worker 	struct list_head *p;
225*33b1fccfSAndroid Build Coastguard Worker 
226*33b1fccfSAndroid Build Coastguard Worker 	if (!window_size)
227*33b1fccfSAndroid Build Coastguard Worker 		return;
228*33b1fccfSAndroid Build Coastguard Worker 
229*33b1fccfSAndroid Build Coastguard Worker 	z_erofs_dedupe_commit(true);
230*33b1fccfSAndroid Build Coastguard Worker 
231*33b1fccfSAndroid Build Coastguard Worker 	for (p = dedupe_tree;
232*33b1fccfSAndroid Build Coastguard Worker 		p < dedupe_tree + ARRAY_SIZE(dedupe_tree); ++p) {
233*33b1fccfSAndroid Build Coastguard Worker 		list_for_each_entry_safe(di, n, p, list) {
234*33b1fccfSAndroid Build Coastguard Worker 			list_del(&di->list);
235*33b1fccfSAndroid Build Coastguard Worker 			free(di);
236*33b1fccfSAndroid Build Coastguard Worker 		}
237*33b1fccfSAndroid Build Coastguard Worker 	}
238*33b1fccfSAndroid Build Coastguard Worker 	dedupe_subtree = NULL;
239*33b1fccfSAndroid Build Coastguard Worker }
240