1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright 2024 Rivos Inc.
4  */
5 
6 #include <linux/cpu.h>
7 #include <linux/cpumask.h>
8 #include <linux/jump_label.h>
9 #include <linux/kthread.h>
10 #include <linux/mm.h>
11 #include <linux/smp.h>
12 #include <linux/types.h>
13 #include <asm/cpufeature.h>
14 #include <asm/hwprobe.h>
15 #include <asm/vector.h>
16 
17 #include "copy-unaligned.h"
18 
/*
 * log2 of the length, in jiffies, of each timed measurement loop: the loops
 * below run until (1 << MISALIGNED_ACCESS_JIFFIES_LG2) jiffies have elapsed.
 */
#define MISALIGNED_ACCESS_JIFFIES_LG2 1
/* Size in bytes of the scratch buffer used for one measurement. */
#define MISALIGNED_BUFFER_SIZE 0x4000
/* Page order handed to alloc_pages()/__free_pages() for that buffer. */
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
/*
 * Number of bytes moved by each copy call. The source pointer is placed half
 * a buffer (plus a couple of bytes of misalignment) after the destination, so
 * the copy length is half the buffer minus 0x80 bytes of slack.
 */
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)

/* Per-CPU scalar misaligned access result; starts out as "unknown". */
DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
/* Per-CPU vector misaligned access result; starts out as "unsupported". */
DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
26 
27 #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
28 static cpumask_t fast_misaligned_access;
check_unaligned_access(void * param)29 static int check_unaligned_access(void *param)
30 {
31 	int cpu = smp_processor_id();
32 	u64 start_cycles, end_cycles;
33 	u64 word_cycles;
34 	u64 byte_cycles;
35 	int ratio;
36 	unsigned long start_jiffies, now;
37 	struct page *page = param;
38 	void *dst;
39 	void *src;
40 	long speed = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;
41 
42 	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
43 		return 0;
44 
45 	/* Make an unaligned destination buffer. */
46 	dst = (void *)((unsigned long)page_address(page) | 0x1);
47 	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
48 	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
49 	src += 2;
50 	word_cycles = -1ULL;
51 	/* Do a warmup. */
52 	__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
53 	preempt_disable();
54 	start_jiffies = jiffies;
55 	while ((now = jiffies) == start_jiffies)
56 		cpu_relax();
57 
58 	/*
59 	 * For a fixed amount of time, repeatedly try the function, and take
60 	 * the best time in cycles as the measurement.
61 	 */
62 	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
63 		start_cycles = get_cycles64();
64 		/* Ensure the CSR read can't reorder WRT to the copy. */
65 		mb();
66 		__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
67 		/* Ensure the copy ends before the end time is snapped. */
68 		mb();
69 		end_cycles = get_cycles64();
70 		if ((end_cycles - start_cycles) < word_cycles)
71 			word_cycles = end_cycles - start_cycles;
72 	}
73 
74 	byte_cycles = -1ULL;
75 	__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
76 	start_jiffies = jiffies;
77 	while ((now = jiffies) == start_jiffies)
78 		cpu_relax();
79 
80 	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
81 		start_cycles = get_cycles64();
82 		mb();
83 		__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
84 		mb();
85 		end_cycles = get_cycles64();
86 		if ((end_cycles - start_cycles) < byte_cycles)
87 			byte_cycles = end_cycles - start_cycles;
88 	}
89 
90 	preempt_enable();
91 
92 	/* Don't divide by zero. */
93 	if (!word_cycles || !byte_cycles) {
94 		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
95 			cpu);
96 
97 		return 0;
98 	}
99 
100 	if (word_cycles < byte_cycles)
101 		speed = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;
102 
103 	ratio = div_u64((byte_cycles * 100), word_cycles);
104 	pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
105 		cpu,
106 		ratio / 100,
107 		ratio % 100,
108 		(speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) ? "fast" : "slow");
109 
110 	per_cpu(misaligned_access_speed, cpu) = speed;
111 
112 	/*
113 	 * Set the value of fast_misaligned_access of a CPU. These operations
114 	 * are atomic to avoid race conditions.
115 	 */
116 	if (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST)
117 		cpumask_set_cpu(cpu, &fast_misaligned_access);
118 	else
119 		cpumask_clear_cpu(cpu, &fast_misaligned_access);
120 
121 	return 0;
122 }
123 
check_unaligned_access_nonboot_cpu(void * param)124 static void __init check_unaligned_access_nonboot_cpu(void *param)
125 {
126 	unsigned int cpu = smp_processor_id();
127 	struct page **pages = param;
128 
129 	if (smp_processor_id() != 0)
130 		check_unaligned_access(pages[cpu]);
131 }
132 
133 DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
134 
modify_unaligned_access_branches(cpumask_t * mask,int weight)135 static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
136 {
137 	if (cpumask_weight(mask) == weight)
138 		static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
139 	else
140 		static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
141 }
142 
set_unaligned_access_static_branches_except_cpu(int cpu)143 static void set_unaligned_access_static_branches_except_cpu(int cpu)
144 {
145 	/*
146 	 * Same as set_unaligned_access_static_branches, except excludes the
147 	 * given CPU from the result. When a CPU is hotplugged into an offline
148 	 * state, this function is called before the CPU is set to offline in
149 	 * the cpumask, and thus the CPU needs to be explicitly excluded.
150 	 */
151 
152 	cpumask_t fast_except_me;
153 
154 	cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
155 	cpumask_clear_cpu(cpu, &fast_except_me);
156 
157 	modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
158 }
159 
set_unaligned_access_static_branches(void)160 static void set_unaligned_access_static_branches(void)
161 {
162 	/*
163 	 * This will be called after check_unaligned_access_all_cpus so the
164 	 * result of unaligned access speed for all CPUs will be available.
165 	 *
166 	 * To avoid the number of online cpus changing between reading
167 	 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
168 	 * held before calling this function.
169 	 */
170 
171 	cpumask_t fast_and_online;
172 
173 	cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
174 
175 	modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
176 }
177 
lock_and_set_unaligned_access_static_branch(void)178 static int __init lock_and_set_unaligned_access_static_branch(void)
179 {
180 	cpus_read_lock();
181 	set_unaligned_access_static_branches();
182 	cpus_read_unlock();
183 
184 	return 0;
185 }
186 
187 arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
188 
riscv_online_cpu(unsigned int cpu)189 static int riscv_online_cpu(unsigned int cpu)
190 {
191 	static struct page *buf;
192 
193 	/* We are already set since the last check */
194 	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
195 		goto exit;
196 
197 	check_unaligned_access_emulated(NULL);
198 	buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
199 	if (!buf) {
200 		pr_warn("Allocation failure, not measuring misaligned performance\n");
201 		return -ENOMEM;
202 	}
203 
204 	check_unaligned_access(buf);
205 	__free_pages(buf, MISALIGNED_BUFFER_ORDER);
206 
207 exit:
208 	set_unaligned_access_static_branches();
209 
210 	return 0;
211 }
212 
/*
 * CPU hotplug "offline" callback: re-evaluate the static key as though @cpu
 * had already left the online set.
 *
 * Return: always 0.
 */
static int riscv_offline_cpu(unsigned int cpu)
{
	/* @cpu is still in cpu_online_mask here, hence the "except" variant. */
	set_unaligned_access_static_branches_except_cpu(cpu);

	return 0;
}
219 
220 /* Measure unaligned access speed on all CPUs present at boot in parallel. */
check_unaligned_access_speed_all_cpus(void)221 static void __init check_unaligned_access_speed_all_cpus(void)
222 {
223 	unsigned int cpu;
224 	unsigned int cpu_count = num_possible_cpus();
225 	struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);
226 
227 	if (!bufs) {
228 		pr_warn("Allocation failure, not measuring misaligned performance\n");
229 		return;
230 	}
231 
232 	/*
233 	 * Allocate separate buffers for each CPU so there's no fighting over
234 	 * cache lines.
235 	 */
236 	for_each_cpu(cpu, cpu_online_mask) {
237 		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
238 		if (!bufs[cpu]) {
239 			pr_warn("Allocation failure, not measuring misaligned performance\n");
240 			goto out;
241 		}
242 	}
243 
244 	/* Check everybody except 0, who stays behind to tend jiffies. */
245 	on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
246 
247 	/* Check core 0. */
248 	smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
249 
250 out:
251 	for_each_cpu(cpu, cpu_online_mask) {
252 		if (bufs[cpu])
253 			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
254 	}
255 
256 	kfree(bufs);
257 }
258 #else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
check_unaligned_access_speed_all_cpus(void)259 static void __init check_unaligned_access_speed_all_cpus(void)
260 {
261 }
262 #endif
263 
264 #ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
/*
 * Time misaligned vector word copies against vector byte copies on the
 * calling CPU.
 *
 * @work: unused.
 *
 * Nothing is done unless this CPU's vector_misaligned_access is
 * RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN. A scratch buffer is allocated
 * locally, both copy flavours are timed inside a kernel_vector_begin() /
 * kernel_vector_end() section, and the CPU is recorded as FAST when the word
 * copy wins and SLOW otherwise. If either best time is zero the per-CPU value
 * is left untouched. The scratch buffer is always released.
 */
static void check_vector_unaligned_access(struct work_struct *work __always_unused)
{
	int cpu = smp_processor_id();
	unsigned long first_tick, tick, deadline;
	u64 begin, elapsed;
	u64 best_word = -1ULL;
	u64 best_byte = -1ULL;
	struct page *scratch;
	void *dst, *src;
	bool fast;
	long speed;
	int ratio;

	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
		return;

	scratch = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
	if (!scratch) {
		pr_warn("Allocation failure, not measuring vector misaligned performance\n");
		return;
	}

	/* Destination: one byte past the start of the buffer. */
	dst = (void *)((unsigned long)page_address(scratch) | 0x1);
	/* Source: half a buffer later and misaligned differently (1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2) + 2;

	/* Enter the vector section, then do an untimed warm-up pass. */
	kernel_vector_begin();
	__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);

	/* Spin until jiffies advances so the window starts on a fresh tick. */
	first_tick = jiffies;
	for (tick = jiffies; tick == first_tick; tick = jiffies)
		cpu_relax();

	/*
	 * Repeat the word copy until the window closes, remembering the
	 * shortest run in cycles.
	 */
	deadline = tick + (1 << MISALIGNED_ACCESS_JIFFIES_LG2);
	while (time_before(jiffies, deadline)) {
		begin = get_cycles64();
		/* Keep the counter read from reordering with the copy. */
		mb();
		__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* The copy must be complete before the counter is read again. */
		mb();
		elapsed = get_cycles64() - begin;
		if (elapsed < best_word)
			best_word = elapsed;
	}

	/* Same procedure for the byte copy: warm up, align to a tick, time. */
	__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	first_tick = jiffies;
	for (tick = jiffies; tick == first_tick; tick = jiffies)
		cpu_relax();

	deadline = tick + (1 << MISALIGNED_ACCESS_JIFFIES_LG2);
	while (time_before(jiffies, deadline)) {
		begin = get_cycles64();
		/* Keep the counter read from reordering with the copy. */
		mb();
		__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* The copy must be complete before the counter is read again. */
		mb();
		elapsed = get_cycles64() - begin;
		if (elapsed < best_byte)
			best_byte = elapsed;
	}

	kernel_vector_end();

	if (best_word == 0 || best_byte == 0) {
		/* A zero best time would make the ratio divide by zero. */
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned vector access speed\n",
			cpu);
	} else {
		fast = best_word < best_byte;
		speed = fast ? RISCV_HWPROBE_MISALIGNED_VECTOR_FAST :
			       RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;

		/* Byte time over word time, scaled by 100 for two decimals. */
		ratio = div_u64((best_byte * 100), best_word);
		pr_info("cpu%d: Ratio of vector byte access time to vector unaligned word access is %d.%02d, unaligned accesses are %s\n",
			cpu,
			ratio / 100,
			ratio % 100,
			fast ? "fast" : "slow");

		per_cpu(vector_misaligned_access, cpu) = speed;
	}

	__free_pages(scratch, MISALIGNED_BUFFER_ORDER);
}
361 
362 /* Measure unaligned access speed on all CPUs present at boot in parallel. */
vec_check_unaligned_access_speed_all_cpus(void * unused __always_unused)363 static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
364 {
365 	schedule_on_each_cpu(check_vector_unaligned_access);
366 
367 	return 0;
368 }
369 #else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */
vec_check_unaligned_access_speed_all_cpus(void * unused __always_unused)370 static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
371 {
372 	return 0;
373 }
374 #endif
375 
riscv_online_cpu_vec(unsigned int cpu)376 static int riscv_online_cpu_vec(unsigned int cpu)
377 {
378 	if (!has_vector()) {
379 		per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
380 		return 0;
381 	}
382 
383 #ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
384 	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
385 		return 0;
386 
387 	check_vector_unaligned_access_emulated(NULL);
388 	check_vector_unaligned_access(NULL);
389 #endif
390 
391 	return 0;
392 }
393 
check_unaligned_access_all_cpus(void)394 static int __init check_unaligned_access_all_cpus(void)
395 {
396 	int cpu;
397 
398 	if (!check_unaligned_access_emulated_all_cpus())
399 		check_unaligned_access_speed_all_cpus();
400 
401 	if (!has_vector()) {
402 		for_each_online_cpu(cpu)
403 			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
404 	} else if (!check_vector_unaligned_access_emulated_all_cpus() &&
405 		   IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
406 		kthread_run(vec_check_unaligned_access_speed_all_cpus,
407 			    NULL, "vec_check_unaligned_access_speed_all_cpus");
408 	}
409 
410 	/*
411 	 * Setup hotplug callbacks for any new CPUs that come online or go
412 	 * offline.
413 	 */
414 #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
415 	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
416 				  riscv_online_cpu, riscv_offline_cpu);
417 #endif
418 	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
419 				  riscv_online_cpu_vec, NULL);
420 
421 	return 0;
422 }
423 
424 arch_initcall(check_unaligned_access_all_cpus);
425