/***********************license start***********************************
 * Copyright (c) 2003-2017 Cavium Inc. (support@cavium.com). All rights
 * reserved.
 *
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above
 *     copyright notice, this list of conditions and the following
 *     disclaimer in the documentation and/or other materials provided
 *     with the distribution.
 *
 *   * Neither the name of Cavium Inc. nor the names of
 *     its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written
 *     permission.
 *
 * This Software, including technical data, may be subject to U.S. export
 * control laws, including the U.S. Export Administration Act and its
 * associated regulations, and may be subject to export or import
 * regulations in other countries.
 *
 * TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
 * AND WITH ALL FAULTS AND CAVIUM INC. MAKES NO PROMISES, REPRESENTATIONS OR
 * WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH RESPECT
 * TO THE SOFTWARE, INCLUDING ITS CONDITION, ITS CONFORMITY TO ANY
 * REPRESENTATION OR DESCRIPTION, OR THE EXISTENCE OF ANY LATENT OR PATENT
 * DEFECTS, AND CAVIUM SPECIFICALLY DISCLAIMS ALL IMPLIED (IF ANY) WARRANTIES
 * OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR A PARTICULAR
 * PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS, QUIET ENJOYMENT,
 * QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. THE ENTIRE RISK
 * ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE LIES WITH YOU.
 ***********************license end**************************************/
#include <bdk.h>
#include "dram-internal.h"

#include <string.h>
#include <lame_string.h> /* for strtoul */
#include <libbdk-hal/bdk-atomic.h>
#include <libbdk-hal/bdk-clock.h>
#include <libbdk-hal/bdk-rng.h>
#include <libbdk-os/bdk-init.h>
// if enhanced verbosity levels are defined, use them
#if defined(VB_PRT)
#define ddr_print2(format, ...) VB_PRT(VBL_FAE, format, ##__VA_ARGS__)
#define ddr_print3(format, ...) VB_PRT(VBL_TME, format, ##__VA_ARGS__)
#define ddr_print4(format, ...) VB_PRT(VBL_DEV, format, ##__VA_ARGS__)
#define ddr_print5(format, ...) VB_PRT(VBL_DEV3, format, ##__VA_ARGS__)
#else
#define ddr_print2 ddr_print
#define ddr_print3 ddr_print
#define ddr_print4 ddr_print
#define ddr_print5 ddr_print
#endif

static int64_t test_dram_byte_threads_done;
static uint64_t test_dram_byte_threads_errs;
static uint64_t test_dram_byte_lmc_errs[4];

#if 0
/*
 * Suggested testing patterns.
 */
static const uint64_t test_pattern_2[] = {
    0xFFFFFFFFFFFFFFFFULL,
    0xAAAAAAAAAAAAAAAAULL,
    0xFFFFFFFFFFFFFFFFULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0xFFFFFFFFFFFFFFFFULL,
    0xAAAAAAAAAAAAAAAAULL,
    0xFFFFFFFFFFFFFFFFULL,
    0x5555555555555555ULL,
    0xFFFFFFFFFFFFFFFFULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xFFFFFFFFFFFFFFFFULL,
    0x5555555555555555ULL,
};
/*
 * or possibly
 */
static const uint64_t test_pattern_3[] = {
    0xFDFDFDFDFDFDFDFDULL,
    0x8787878787878787ULL,
    0xFEFEFEFEFEFEFEFEULL,
    0xC3C3C3C3C3C3C3C3ULL,
    0x7F7F7F7F7F7F7F7FULL,
    0xE1E1E1E1E1E1E1E1ULL,
    0xBFBFBFBFBFBFBFBFULL,
    0xF0F0F0F0F0F0F0F0ULL,
    0xDFDFDFDFDFDFDFDFULL,
    0x7878787878787878ULL,
    0xEFEFEFEFEFEFEFEFULL,
    0x3C3C3C3C3C3C3C3CULL,
    0xF7F7F7F7F7F7F7F7ULL,
    0x1E1E1E1E1E1E1E1EULL,
    0xFBFBFBFBFBFBFBFBULL,
    0x0F0F0F0F0F0F0F0FULL,
};

static const uint64_t test_pattern_1[] = {
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
#if 0 // only need a cacheline size
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
#endif
};

// setup default for test pattern array
static const uint64_t *dram_tune_test_pattern = test_pattern_1;
#endif

// set this to 1 to shorten the testing so it exits when all byte lanes have errors;
// leaving it at 0 forces the testing to take place over the entire range every iteration,
// hopefully ensuring an even load on the memory subsystem
#define EXIT_WHEN_ALL_LANES_HAVE_ERRORS 0

#define DEFAULT_TEST_BURSTS 5 // FIXME: this is what works so far... (was 7)
int dram_tune_use_bursts = DEFAULT_TEST_BURSTS;

// dram_tune_rank_offset is used to offset the second area used in test_dram_mem_xor.
//
// If the DIMM is single-rank, the offset will be 256MB from the start of the first area,
// which is more than enough for the restricted looping/address range actually tested...
//
// If the DIMM is 2-rank, the offset will be the size of a rank's address space, so the effect
// will be to have the first and second areas in different ranks on the same DIMM.
//
// So, we default this to single-rank, and it will be overridden when 2 ranks are detected.
//

// FIXME: ASSUME that we have DIMMs no less than 4GB in size

// offset to first area that avoids any boot stuff in low range (below 256MB)
#define AREA_BASE_OFFSET (1ULL << 28) // bit 28 always ON

// offset to duplicate area; may coincide with rank 1 base address for a 2-rank 4GB DIMM
#define AREA_DUPE_OFFSET (1ULL << 31) // bit 31 always ON

// defaults to DUPE, but will be set elsewhere to the offset to the next RANK if multi-rank DIMM
static uint64_t dram_tune_rank_offset = AREA_DUPE_OFFSET; // default

// defaults to 0, but will be set elsewhere to the address offset to the next DIMM if multi-slot
static uint64_t dram_tune_dimm_offset = 0; // default

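// A worked layout sketch for the defaults above (illustrative only; it assumes
// the node-local physical base is 0 and ignores the LMC-select bits OR'd in
// later). Note that dram_tuning_mem_xor() below currently hardcodes a 256MB
// second-area offset (its p2offset local) instead of using dram_tune_rank_offset:
//
//   0x0000000000000000  boot stuff in low memory (avoided)
//   0x0000000010000000  area 1 = base + AREA_BASE_OFFSET
//   0x0000000090000000  area 2 = area 1 + AREA_DUPE_OFFSET (single-rank default)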

static int speed_bin_offset[3] = {25, 20, 15};
static int speed_bin_winlen[3] = {70, 60, 60};

static int
get_speed_bin(bdk_node_t node, int lmc)
{
    uint32_t mts_speed = (libdram_get_freq_from_pll(node, lmc) / 1000000) * 2;
    int ret = 0;

    // FIXME: is this reasonable speed "binning"?
    if (mts_speed >= 1700) {
        if (mts_speed >= 2000)
            ret = 2;
        else
            ret = 1;
    }

    debug_print("N%d.LMC%d: %s: returning bin %d for MTS %d\n",
                node, lmc, __func__, ret, mts_speed);

    return ret;
}

static int is_low_risk_offset(int speed_bin, int offset)
{
    return (_abs(offset) <= speed_bin_offset[speed_bin]);
}
static int is_low_risk_winlen(int speed_bin, int winlen)
{
    return (winlen >= speed_bin_winlen[speed_bin]);
}
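
// Quick reference, derived from get_speed_bin() and the tables above:
//
//   bin   MT/s range    low-risk if |offset| <=   low-risk if window >=
//    0     < 1700                  25                       70
//    1     1700..1999              20                       60
//    2     >= 2000                 15                       60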

#define ENABLE_PREFETCH 0
#define ENABLE_WBIL2    1
#define ENABLE_SBLKDTY  0

#define BDK_SYS_CVMCACHE_INV_L2 "#0,c11,c1,#1" // L2 Cache Invalidate
#define BDK_CACHE_INV_L2(address) { asm volatile ("sys " BDK_SYS_CVMCACHE_INV_L2 ", %0" : : "r" (address)); }

int dram_tuning_mem_xor(bdk_node_t node, int lmc, uint64_t p, uint64_t bitmask, uint64_t *xor_data)
{
    uint64_t p1, p2, d1, d2;
    uint64_t v, v1;
    uint64_t p2offset = 0x10000000; /* was: dram_tune_rank_offset; FIXME? */
    uint64_t datamask;
    uint64_t xor;
    uint64_t i, j, k;
    uint64_t ii;
    int errors = 0;
    //uint64_t index;
    uint64_t pattern1 = bdk_rng_get_random64();
    uint64_t pattern2 = 0;
    uint64_t bad_bits[2] = {0,0};

#if ENABLE_SBLKDTY
    BDK_CSR_MODIFY(c, node, BDK_L2C_CTL, c.s.dissblkdty = 0);
#endif

    // Byte lanes may be cleared in the mask to indicate no testing on that lane.
    datamask = bitmask;

    // final address must include LMC and node
    p |= (lmc << 7); /* Map address into proper interface */
    p = bdk_numa_get_address(node, p); /* Map to node */

    /* Add offset to both test regions to not clobber boot stuff
     * when running from L2 for NAND boot.
     */
    p += AREA_BASE_OFFSET; // make sure base is out of the way of boot

#define II_INC (1ULL << 29)
#define II_MAX (1ULL << 31)
#define K_INC  (1ULL << 14)
#define K_MAX  (1ULL << 20)
#define J_INC  (1ULL << 9)
#define J_MAX  (1ULL << 12)
#define I_INC  (1ULL << 3)
#define I_MAX  (1ULL << 7)
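
    // A worked count of the coverage these strides give (derived from the defines
    // above): ii selects one of 4 512MB regions, k one of 64 16KB blocks per
    // region, j one of 8 512-byte lines per block, and i one of 16 64-bit slots
    // in a 128-byte cacheline. Each area thus touches 4 * 64 * 8 * 16 = 32768
    // slots, i.e. 256KB of data spread across a 2GB span.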

    debug_print("N%d.LMC%d: dram_tuning_mem_xor: phys_addr=0x%lx\n",
                node, lmc, p);

#if 0
    int ix;
    // add this loop to fill memory with the test pattern first
    // loops are ordered so that only entire cachelines are written
    for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
        for (k = 0; k < K_MAX; k += K_INC) {
            for (j = 0; j < J_MAX; j += J_INC) {
                p1 = p + ii + k + j;
                p2 = p1 + p2offset;
                for (i = 0, ix = 0; i < I_MAX; i += I_INC, ix++) {

                    v = dram_tune_test_pattern[ix];
                    v1 = v; // write the same thing to both areas

                    __bdk_dram_write64(p1 + i, v);
                    __bdk_dram_write64(p2 + i, v1);

                }
#if ENABLE_WBIL2
                BDK_CACHE_WBI_L2(p1);
                BDK_CACHE_WBI_L2(p2);
#endif
            }
        }
    } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */
#endif

#if ENABLE_PREFETCH
    BDK_PREFETCH(p           , BDK_CACHE_LINE_SIZE);
    BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE);
#endif

    // loops are ordered so that only a single 64-bit slot is written to each cacheline at one time,
    // then the cachelines are forced out; this should maximize read/write traffic
    for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
        for (k = 0; k < K_MAX; k += K_INC) {
            for (i = 0; i < I_MAX; i += I_INC) {
                for (j = 0; j < J_MAX; j += J_INC) {

                    p1 = p + ii + k + j;
                    p2 = p1 + p2offset;

#if ENABLE_PREFETCH
                    if (j < (J_MAX - J_INC)) {
                        BDK_PREFETCH(p1 + J_INC, BDK_CACHE_LINE_SIZE);
                        BDK_PREFETCH(p2 + J_INC, BDK_CACHE_LINE_SIZE);
                    }
#endif

                    v = pattern1 * (p1 + i);
                    v1 = v; // write the same thing to both areas

                    __bdk_dram_write64(p1 + i, v);
                    __bdk_dram_write64(p2 + i, v1);

#if ENABLE_WBIL2
                    BDK_CACHE_WBI_L2(p1);
                    BDK_CACHE_WBI_L2(p2);
#endif
                }
            }
        }
    } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */

    BDK_DCACHE_INVALIDATE;

    debug_print("N%d.LMC%d: dram_tuning_mem_xor: done INIT loop\n",
                node, lmc);

    /* Make a series of passes over the memory areas. */

    for (int burst = 0; burst < 1 /* was: dram_tune_use_bursts */; burst++)
    {
        uint64_t this_pattern = bdk_rng_get_random64();
        pattern2 ^= this_pattern;

        /* XOR the data with a random value, applying the change to both
         * memory areas.
         */
#if ENABLE_PREFETCH
        BDK_PREFETCH(p           , BDK_CACHE_LINE_SIZE);
        BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE);
#endif

        for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
            for (k = 0; k < K_MAX; k += K_INC) {
                for (i = 0; i < I_MAX; i += I_INC) { // FIXME: rearranged, did not make much difference?
                    for (j = 0; j < J_MAX; j += J_INC) {

                        p1 = p + ii + k + j;
                        p2 = p1 + p2offset;

#if ENABLE_PREFETCH
                        if (j < (J_MAX - J_INC)) {
                            BDK_PREFETCH(p1 + J_INC, BDK_CACHE_LINE_SIZE);
                            BDK_PREFETCH(p2 + J_INC, BDK_CACHE_LINE_SIZE);
                        }
#endif

                        v  = __bdk_dram_read64(p1 + i) ^ this_pattern;
                        v1 = __bdk_dram_read64(p2 + i) ^ this_pattern;

#if ENABLE_WBIL2
                        BDK_CACHE_INV_L2(p1);
                        BDK_CACHE_INV_L2(p2);
#endif

                        __bdk_dram_write64(p1 + i, v);
                        __bdk_dram_write64(p2 + i, v1);

#if ENABLE_WBIL2
                        BDK_CACHE_WBI_L2(p1);
                        BDK_CACHE_WBI_L2(p2);
#endif
                    }
                }
            }
        } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */

        BDK_DCACHE_INVALIDATE;

        debug_print("N%d.LMC%d: dram_tuning_mem_xor: done MODIFY loop\n",
                    node, lmc);

#if ENABLE_PREFETCH
        BDK_PREFETCH(p           , BDK_CACHE_LINE_SIZE);
        BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE);
#endif

        /* Look for differences in the areas. If there is a mismatch, reset
         * both memory locations with the same pattern. Failing to do so
         * means that on all subsequent passes the pair of locations remain
         * out of sync, giving spurious errors.
         */
        // FIXME: change the loop order so that an entire cacheline is compared at one time;
        // that way, a read error that occurs *anywhere* on the cacheline will be caught,
        // rather than comparing only 1 cacheline slot at a time, where an error on a
        // different slot would be missed that time around.
        // Does the above make sense?

        for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
            for (k = 0; k < K_MAX; k += K_INC) {
                for (j = 0; j < J_MAX; j += J_INC) {

                    p1 = p + ii + k + j;
                    p2 = p1 + p2offset;

#if ENABLE_PREFETCH
                    if (j < (J_MAX - J_INC)) {
                        BDK_PREFETCH(p1 + J_INC, BDK_CACHE_LINE_SIZE);
                        BDK_PREFETCH(p2 + J_INC, BDK_CACHE_LINE_SIZE);
                    }
#endif

                    // process entire cachelines in the innermost loop
                    for (i = 0; i < I_MAX; i += I_INC) {

                        // predicted value: the INIT loop wrote pattern1 * (p1 + i), and each
                        // burst XORed both copies with a this_pattern; pattern2 is the XOR
                        // accumulation of all the this_patterns applied so far
                        v = ((p1 + i) * pattern1) ^ pattern2;
                        d1 = __bdk_dram_read64(p1 + i);
                        d2 = __bdk_dram_read64(p2 + i);

                        xor = ((d1 ^ v) | (d2 ^ v)) & datamask; // union of error bits, only in active byte lanes

                        if (!xor)
                            continue;

                        // accumulate bad bits
                        bad_bits[0] |= xor;
                        //bad_bits[1] |= ~mpr_data1 & 0xffUL; // cannot do ECC here

                        int bybit = 1;
                        uint64_t bymsk = 0xffULL; // start in byte lane 0
                        while (xor != 0) {
                            debug_print("ERROR(%03d): [0x%016lX] [0x%016lX] expected 0x%016lX d1 %016lX d2 %016lX\n",
                                        burst, p1, p2, v, d1, d2);
                            if (xor & bymsk) { // error(s) in this lane
                                errors |= bybit;    // set the byte error bit
                                xor &= ~bymsk;      // clear byte lane in error bits
                                datamask &= ~bymsk; // clear the byte lane in the mask
#if EXIT_WHEN_ALL_LANES_HAVE_ERRORS
                                if (datamask == 0) { // nothing left to do
                                    return errors; // completely done when errors found in all byte lanes in datamask
                                }
#endif /* EXIT_WHEN_ALL_LANES_HAVE_ERRORS */
                            }
                            bymsk <<= 8; // move mask into next byte lane
                            bybit <<= 1; // move bit into next byte position
                        }
                    }
#if ENABLE_WBIL2
                    BDK_CACHE_WBI_L2(p1);
                    BDK_CACHE_WBI_L2(p2);
#endif
                }
            }
        } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */

        debug_print("N%d.LMC%d: dram_tuning_mem_xor: done TEST loop\n",
                    node, lmc);

    } /* for (int burst = 0; burst < dram_tune_use_bursts; burst++) */

    if (xor_data != NULL) { // send the bad bits back...
        xor_data[0] = bad_bits[0];
        xor_data[1] = bad_bits[1]; // let it be zeroed
    }

#if ENABLE_SBLKDTY
    BDK_CSR_MODIFY(c, node, BDK_L2C_CTL, c.s.dissblkdty = 1);
#endif

    return errors;
}
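
#if 0
// Example caller (a sketch, not part of the driver): run the XOR test on LMC0
// across all byte lanes and decode the result. The return value is a bitmask
// with bit b set when byte lane b miscompared; xor_data[0] collects the
// individual failing data bits.
static void example_xor_test(bdk_node_t node)
{
    uint64_t bad_bits[2] = {0, 0};
    int errs = dram_tuning_mem_xor(node, /*lmc*/ 0, /*base*/ 0,
                                   /*bitmask*/ 0xffffffffffffffffULL, bad_bits);
    for (int lane = 0; lane < 8; lane++) {
        if (errs & (1 << lane))
            printf("byte lane %d failed, bad bits 0x%02llx\n", lane,
                   (unsigned long long)(bad_bits[0] >> (8 * lane)) & 0xff);
    }
}
#endif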

#undef II_INC
#undef II_MAX

#define EXTRACT(v, lsb, width) (((v) >> (lsb)) & ((1ull << (width)) - 1))
#define LMCNO(address, xbits) (EXTRACT(address, 7, xbits) ^ EXTRACT(address, 20, xbits) ^ EXTRACT(address, 12, xbits))
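
// Worked example (derived from the macros above): with xbits = 2 (4 LMCs),
// LMCNO(address, 2) XOR-folds address bits <8:7>, <21:20> and <13:12>.
// E.g. address 0x100080 has fields 1, 1 and 0, so it maps to LMC 1^1^0 = 0.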

// cores to use
#define DEFAULT_USE_CORES 44 // FIXME: was (1 << CORE_BITS)
int dram_tune_use_cores = DEFAULT_USE_CORES; // max cores to use, override available
int dram_tune_max_cores; // max cores available on a node
#define CORE_SHIFT 22 // FIXME: offset into rank_address passed to test_dram_byte

typedef void (*__dram_tuning_thread_t)(int arg, void *arg1);

typedef struct
{
    bdk_node_t node;
    int64_t num_lmcs;
    uint64_t byte_mask;
} test_dram_byte_info_t;

static int dram_tune_use_xor2 = 1; // FIXME: do NOT default to original mem_xor (LMC-based) code

static int
run_dram_tuning_threads(bdk_node_t node, int num_lmcs, uint64_t bytemask)
{
    test_dram_byte_info_t test_dram_byte_info;
    test_dram_byte_info_t *test_info = &test_dram_byte_info;
    int total_count = 0;

    test_info->node = node;
    test_info->num_lmcs = num_lmcs;
    test_info->byte_mask = bytemask;

    // init some global data
    bdk_atomic_set64(&test_dram_byte_threads_done, 0);
    bdk_atomic_set64((int64_t *)&test_dram_byte_threads_errs, 0);
    bdk_atomic_set64((int64_t *)&test_dram_byte_lmc_errs[0], 0);
    bdk_atomic_set64((int64_t *)&test_dram_byte_lmc_errs[1], 0);
    bdk_atomic_set64((int64_t *)&test_dram_byte_lmc_errs[2], 0);
    bdk_atomic_set64((int64_t *)&test_dram_byte_lmc_errs[3], 0);

    /* Start threads for cores on the node */
    if (bdk_numa_exists(node)) {
        /* FIXME(dhendrix): We shouldn't hit this. */
        die("bdk_numa_exists() is non-zero\n");
    }

#if 0
    /* Wait for threads to finish */
    while (bdk_atomic_get64(&test_dram_byte_threads_done) < total_count)
        bdk_thread_yield();
#else
#define TIMEOUT_SECS 5 // FIXME: long enough so a pass for a given setting will not print
    /* Wait for threads to finish, with progress */
    int cur_count;
    uint64_t cur_time;
    uint64_t period = bdk_clock_get_rate(bdk_numa_local(), BDK_CLOCK_TIME) * TIMEOUT_SECS; // FIXME?
    uint64_t timeout = bdk_clock_get_count(BDK_CLOCK_TIME) + period;
    do {
        // bdk_thread_yield(); /* FIXME(dhendrix): don't yield... */
        cur_count = bdk_atomic_get64(&test_dram_byte_threads_done);
        cur_time = bdk_clock_get_count(BDK_CLOCK_TIME);
        if (cur_time >= timeout) {
            printf("Waiting for %d cores\n", total_count - cur_count);
            timeout = cur_time + period;
        }
    } while (cur_count < total_count);
#endif

    // NOTE: this is the summary of errors across all LMCs
    return (int)bdk_atomic_get64((int64_t *)&test_dram_byte_threads_errs);
}

/* These variables count the number of ECC errors. They should only be accessed atomically. */
/* FIXME(dhendrix): redundant declaration in original BDK sources */
//extern int64_t __bdk_dram_ecc_single_bit_errors[];
extern int64_t __bdk_dram_ecc_double_bit_errors[];

#define DEFAULT_SAMPLE_GRAN 3 // sample for errors every N offset values
#define MIN_BYTE_OFFSET -63
#define MAX_BYTE_OFFSET +63
int dram_tune_use_gran = DEFAULT_SAMPLE_GRAN;
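
// Worked example of the "always sample offset 0" adjustment done below: C's %
// truncates toward zero, so with a granularity of 4 the sweep runs from
// -63 - (-63 % 4) = -60 up to 63 - (63 % 4) = 60 in steps of 4, hitting
// ..., -4, 0, 4, ...; with the default granularity of 3 it runs -63..63.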

static int
auto_set_dll_offset(bdk_node_t node, int dll_offset_mode,
                    int num_lmcs, int ddr_interface_64b,
                    int do_tune)
{
    int byte_offset;
    //unsigned short result[9];
    int byte;
    int byte_delay_start[4][9];
    int byte_delay_count[4][9];
    uint64_t byte_delay_windows[4][9];
    int byte_delay_best_start[4][9];
    int byte_delay_best_count[4][9];
    //int this_rodt;
    uint64_t ops_sum[4], dclk_sum[4];
    uint64_t start_dram_dclk[4], stop_dram_dclk[4];
    uint64_t start_dram_ops[4], stop_dram_ops[4];
    int errors, tot_errors;
    int lmc;
    const char *mode_str = (dll_offset_mode == 2) ? "Read" : "Write"; /* FIXME(dhendrix): const */
    int mode_is_read = (dll_offset_mode == 2);
    const char *mode_blk = (dll_offset_mode == 2) ? " " : ""; /* FIXME(dhendrix): const */
    int start_offset, end_offset, incr_offset;

    int speed_bin = get_speed_bin(node, 0); // FIXME: just get from LMC0?
    int needs_review_count = 0;

    if (dram_tune_use_gran != DEFAULT_SAMPLE_GRAN) {
        ddr_print2("N%d: Changing sample granularity from %d to %d\n",
                   node, DEFAULT_SAMPLE_GRAN, dram_tune_use_gran);
    }
    // ensure a sample is taken at 0
    start_offset = MIN_BYTE_OFFSET - (MIN_BYTE_OFFSET % dram_tune_use_gran);
    end_offset   = MAX_BYTE_OFFSET - (MAX_BYTE_OFFSET % dram_tune_use_gran);
    incr_offset  = dram_tune_use_gran;

    memset(ops_sum,               0, sizeof(ops_sum));
    memset(dclk_sum,              0, sizeof(dclk_sum));
    memset(byte_delay_start,      0, sizeof(byte_delay_start));
    memset(byte_delay_count,      0, sizeof(byte_delay_count));
    memset(byte_delay_windows,    0, sizeof(byte_delay_windows));
    memset(byte_delay_best_start, 0, sizeof(byte_delay_best_start));
    memset(byte_delay_best_count, 0, sizeof(byte_delay_best_count));

    // FIXME? consult LMC0 only
    BDK_CSR_INIT(lmcx_config, node, BDK_LMCX_CONFIG(0));
    if (lmcx_config.s.rank_ena) { // replace the default offset when there is more than 1 rank...
        dram_tune_rank_offset = 1ull << (28 + lmcx_config.s.pbank_lsb - lmcx_config.s.rank_ena + (num_lmcs/2));
        /* FIXME(dhendrix): %lx --> %llx */
        ddr_print2("N%d: Tuning multiple ranks per DIMM (rank offset 0x%llx).\n", node, dram_tune_rank_offset);
    }
    if (lmcx_config.s.init_status & 0x0c) { // bit 2 or 3 set indicates 2 DIMMs
        dram_tune_dimm_offset = 1ull << (28 + lmcx_config.s.pbank_lsb + (num_lmcs/2));
        /* FIXME(dhendrix): %lx --> %llx */
        ddr_print2("N%d: Tuning multiple DIMMs per channel (DIMM offset 0x%llx)\n", node, dram_tune_dimm_offset);
    }

    // FIXME? do this for LMC0 only
    //BDK_CSR_INIT(comp_ctl2, node, BDK_LMCX_COMP_CTL2(0));
    //this_rodt = comp_ctl2.s.rodt_ctl;

    // construct the bytemask
    int bytes_todo = (ddr_interface_64b) ? 0xff : 0x0f;
    uint64_t bytemask = 0;
    for (byte = 0; byte < 8; ++byte) {
        if (bytes_todo & (1 << byte)) {
            bytemask |= 0xfful << (8*byte); // set this byte's bits in the bytemask
        }
    } /* for (byte = 0; byte < 8; ++byte) */

    // now loop through selected legal values for the DLL byte offset...

    for (byte_offset = start_offset; byte_offset <= end_offset; byte_offset += incr_offset) {

        // do the setup on active LMCs
        for (lmc = 0; lmc < num_lmcs; lmc++) {
            change_dll_offset_enable(node, lmc, 0);

            // set all byte lanes at once
            load_dll_offset(node, lmc, dll_offset_mode, byte_offset, 10 /* All bytes at once */);
            // but then clear the ECC byte lane so it should be neutral for the test...
            load_dll_offset(node, lmc, dll_offset_mode, 0, 8);

            change_dll_offset_enable(node, lmc, 1);

            // record start cycle CSRs here for utilization measure
            start_dram_dclk[lmc] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(lmc));
            start_dram_ops[lmc]  = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(lmc));
        } /* for (lmc = 0; lmc < num_lmcs; lmc++) */

        bdk_watchdog_poke();

        // run the test(s)
        // only 1 call should be enough; let the bursts, etc, control the load...
        run_dram_tuning_threads(node, num_lmcs, bytemask);

        for (lmc = 0; lmc < num_lmcs; lmc++) {
            // record stop cycle CSRs here for utilization measure
            stop_dram_dclk[lmc] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(lmc));
            stop_dram_ops[lmc]  = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(lmc));

            // accumulate...
            ops_sum[lmc]  += stop_dram_ops[lmc]  - start_dram_ops[lmc];
            dclk_sum[lmc] += stop_dram_dclk[lmc] - start_dram_dclk[lmc];

            errors = test_dram_byte_lmc_errs[lmc];

            // check errors by byte, but not ECC
            for (byte = 0; byte < 8; ++byte) {
                if (!(bytes_todo & (1 << byte))) // is this byte lane to be done?
                    continue; // no

                byte_delay_windows[lmc][byte] <<= 1; // always put in a zero
                if (errors & (1 << byte)) { // yes, an error in this byte lane
                    byte_delay_count[lmc][byte] = 0; // stop now always
                } else { // no error in this byte lane
                    if (byte_delay_count[lmc][byte] == 0) { // first success, set run start
                        byte_delay_start[lmc][byte] = byte_offset;
                    }
                    byte_delay_count[lmc][byte] += incr_offset; // bump run length

                    if (byte_delay_count[lmc][byte] > byte_delay_best_count[lmc][byte]) {
                        byte_delay_best_count[lmc][byte] = byte_delay_count[lmc][byte];
                        byte_delay_best_start[lmc][byte] = byte_delay_start[lmc][byte];
                    }
                    byte_delay_windows[lmc][byte] |= 1ULL; // for a pass, put in a 1
                }
            } /* for (byte = 0; byte < 8; ++byte) */

            // only print when there are errors and verbose...
            if (errors) {
                debug_print("DLL %s Offset Test %3d: errors 0x%x\n",
                            mode_str, byte_offset, errors);
            }
        } /* for (lmc = 0; lmc < num_lmcs; lmc++) */

    } /* for (byte_offset = start_offset; byte_offset <= end_offset; byte_offset += incr_offset) */

    // done with testing; load up and/or print out the offsets we found...

    // only when margining...
    if (!do_tune) {
        printf(" \n");
        printf("-------------------------------------\n");
#if 0
        uint32_t mts_speed = (libdram_get_freq_from_pll(node, 0) * 2) / 1000000; // FIXME: sample LMC0
        printf("N%d: Starting %s Timing Margining for %d MT/s.\n", node, mode_str, mts_speed);
#else
        printf("N%d: Starting %s Timing Margining.\n", node, mode_str);
#endif
        printf(" \n");
    } /* if (!do_tune) */

    for (lmc = 0; lmc < num_lmcs; lmc++) {
#if 1
        // FIXME FIXME
        // FIXME: this just makes ECC always show 0
        byte_delay_best_start[lmc][8] = start_offset;
        byte_delay_best_count[lmc][8] = end_offset - start_offset + incr_offset;
#endif

        // disable offsets while we load...
        change_dll_offset_enable(node, lmc, 0);

        // only when margining...
        if (!do_tune) {
            // print the heading
            printf(" \n");
            printf("N%d.LMC%d: %s Timing Margin %s : ", node, lmc, mode_str, mode_blk);
            printf(" ECC/8 ");
            for (byte = 7; byte >= 0; byte--) {
                printf(" Byte %d ", byte);
            }
            printf("\n");
        } /* if (!do_tune) */

        // print and load the offset values
        // print the windows bit arrays
        // only when margining...
        if (!do_tune) {
            printf("N%d.LMC%d: DLL %s Offset Amount %s : ", node, lmc, mode_str, mode_blk);
        } else {
            ddr_print("N%d.LMC%d: SW DLL %s Offset Amount %s : ", node, lmc, mode_str, mode_blk);
        }
        for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order

            int count = byte_delay_best_count[lmc][byte];
            if (count == 0)
                count = incr_offset; // should make the non-tested ECC byte come out 0

            byte_offset = byte_delay_best_start[lmc][byte] +
                          ((count - incr_offset) / 2); // adjust by incr

            if (!do_tune) { // do counting and special flag if margining
                int will_need_review = !is_low_risk_winlen(speed_bin, (count - incr_offset)) &&
                                       !is_low_risk_offset(speed_bin, byte_offset);

                printf("%10d%c", byte_offset, (will_need_review) ? '<' : ' ');

                if (will_need_review)
                    needs_review_count++;
            } else { // if just tuning, make the printout less lengthy
                ddr_print("%5d ", byte_offset);
            }

            // FIXME? should we be able to override this?
            if (mode_is_read) // for READ offsets, always store what we found
                load_dll_offset(node, lmc, dll_offset_mode, byte_offset, byte);
            else // for WRITE offsets, always store 0
                load_dll_offset(node, lmc, dll_offset_mode, 0, byte);

        }
        if (!do_tune) {
            printf("\n");
        } else {
            ddr_print("\n");
        }


        // re-enable the offsets now that we are done loading
        change_dll_offset_enable(node, lmc, 1);

        // only when margining...
        if (!do_tune) {
            // print the window sizes
            printf("N%d.LMC%d: DLL %s Window Length %s : ", node, lmc, mode_str, mode_blk);
            for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order
                int count = byte_delay_best_count[lmc][byte];
                if (count == 0)
                    count = incr_offset; // should make the non-tested ECC byte come out 0

                // do this again since the "needs review" test is an AND...
                byte_offset = byte_delay_best_start[lmc][byte] +
                              ((count - incr_offset) / 2); // adjust by incr

                int will_need_review = !is_low_risk_winlen(speed_bin, (count - incr_offset)) &&
                                       !is_low_risk_offset(speed_bin, byte_offset);

                printf("%10d%c", count - incr_offset, (will_need_review) ? '<' : ' ');
            }
            printf("\n");

            // print the window extents
            printf("N%d.LMC%d: DLL %s Window Bounds %s : ", node, lmc, mode_str, mode_blk);
            for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order
                int start = byte_delay_best_start[lmc][byte];
                int count = byte_delay_best_count[lmc][byte];
                if (count == 0)
                    count = incr_offset; // should make the non-tested ECC byte come out 0
                printf(" %3d to%3d ", start,
                       start + count - incr_offset);
            }
            printf("\n");
#if 0
            // FIXME: should have a way to force these out...
            // print the windows bit arrays
            printf("N%d.LMC%d: DLL %s Window Bitmap%s : ", node, lmc, mode_str, mode_blk);
            for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order
                printf("%010lx ", byte_delay_windows[lmc][byte]);
            }
            printf("\n");
#endif
        } /* if (!do_tune) */
    } /* for (lmc = 0; lmc < num_lmcs; lmc++) */

    // only when margining...
    if (!do_tune) {
        // print the Summary line(s) here
        printf(" \n");
        printf("N%d: %s Timing Margining Summary : %s ", node, mode_str,
               (needs_review_count > 0) ? "Needs Review" : "Low Risk");
        if (needs_review_count > 0)
            printf("(%d)", needs_review_count);
        printf("\n");

        // FIXME??? want to print here: "N0: %s Offsets have been applied already"

        printf("-------------------------------------\n");
        printf(" \n");
    } /* if (!do_tune) */

    // FIXME: we probably want this only when doing verbose...
    // finally, print the utilizations all together
    for (lmc = 0; lmc < num_lmcs; lmc++) {
        uint64_t percent_x10 = ops_sum[lmc] * 1000 / dclk_sum[lmc];
        /* FIXME(dhendrix): %lu --> %llu */
        ddr_print2("N%d.LMC%d: ops %llu, cycles %llu, used %llu.%llu%%\n",
                   node, lmc, ops_sum[lmc], dclk_sum[lmc], percent_x10 / 10, percent_x10 % 10);
    } /* for (lmc = 0; lmc < num_lmcs; lmc++) */

    // FIXME: only when verbose, or only when there are errors?
    // run the test one last time
    // print whether there are errors or not, but only when verbose...
    bdk_watchdog_poke();
    debug_print("N%d: %s: Start running test one last time\n", node, __func__);
    tot_errors = run_dram_tuning_threads(node, num_lmcs, bytemask);
    debug_print("N%d: %s: Finished running test one last time\n", node, __func__);
    if (tot_errors)
        ddr_print2("%s Timing Final Test: errors 0x%x\n", mode_str, tot_errors);

    return (do_tune) ? tot_errors : !!(needs_review_count > 0);
}

#define USE_L2_WAYS_LIMIT 0 // non-zero to enable L2 ways limiting

/*
 * Automatically adjust the DLL offset for the data bytes
 */
int perform_dll_offset_tuning(bdk_node_t node, int dll_offset_mode, int do_tune)
{
    int ddr_interface_64b;
    int save_ecc_ena[4];
    bdk_lmcx_config_t lmc_config;
    int lmc, num_lmcs = __bdk_dram_get_num_lmc(node);
    const char *s;
#if USE_L2_WAYS_LIMIT
    int ways, ways_print = 0;
#endif
#if 0
    int dram_tune_use_rodt = -1, save_rodt[4];
    bdk_lmcx_comp_ctl2_t comp_ctl2;
#endif
    int loops = 1, loop;
    uint64_t orig_coremask;
    int errs = 0;

    // enable any non-running cores on this node
    orig_coremask = bdk_get_running_coremask(node);
    /* FIXME(dhendrix): %lx --> %llx */
    ddr_print4("N%d: %s: Starting cores (mask was 0x%llx)\n",
               node, __func__, orig_coremask);
    /* FIXME(dhendrix): don't call bdk_init_cores(). */
    // bdk_init_cores(node, ~0ULL & ~orig_coremask);
    dram_tune_max_cores = bdk_get_num_running_cores(node);

    // but use only a certain number of cores, at most what is available
    if ((s = getenv("ddr_tune_use_cores")) != NULL) {
        dram_tune_use_cores = strtoul(s, NULL, 0);
        if (dram_tune_use_cores <= 0) // allow 0 or negative to mean all
            dram_tune_use_cores = dram_tune_max_cores;
    }
    if (dram_tune_use_cores > dram_tune_max_cores)
        dram_tune_use_cores = dram_tune_max_cores;

    // see if we want to do the tuning more than once per LMC...
    if ((s = getenv("ddr_tune_use_loops"))) {
        loops = strtoul(s, NULL, 0);
    }

    // see if we want to change the granularity of the byte_offset sampling
    if ((s = getenv("ddr_tune_use_gran"))) {
        dram_tune_use_gran = strtoul(s, NULL, 0);
    }

    // allow override of the test repeats (bursts) per thread create
    if ((s = getenv("ddr_tune_use_bursts")) != NULL) {
        dram_tune_use_bursts = strtoul(s, NULL, 10);
    }

#if 0
    // allow override of the Read ODT setting just during the tuning run(s)
    if ((s = getenv("ddr_tune_use_rodt")) != NULL) {
        int temp = strtoul(s, NULL, 10);
        // validity check
        if (temp >= 0 && temp <= 7)
            dram_tune_use_rodt = temp;
    }
#endif

#if 0
    // allow override of the test pattern
    // FIXME: a bit simplistic...
    if ((s = getenv("ddr_tune_use_pattern")) != NULL) {
        int patno = strtoul(s, NULL, 10);
        if (patno == 2)
            dram_tune_test_pattern = test_pattern_2;
        else if (patno == 3)
            dram_tune_test_pattern = test_pattern_3;
        else // all other values use the default
            dram_tune_test_pattern = test_pattern_1;
    }
#endif

    // allow override of the test mem_xor algorithm
    if ((s = getenv("ddr_tune_use_xor2")) != NULL) {
        dram_tune_use_xor2 = !!strtoul(s, NULL, 10);
    }

    // print current working values
    ddr_print2("N%d: Tuning will use %d cores of max %d cores, and use %d repeats.\n",
               node, dram_tune_use_cores, dram_tune_max_cores,
               dram_tune_use_bursts);

#if USE_L2_WAYS_LIMIT
    // see if L2 ways are limited
    if ((s = lookup_env_parameter("limit_l2_ways")) != NULL) {
        ways = strtoul(s, NULL, 10);
        ways_print = 1;
    } else {
        ways = bdk_l2c_get_num_assoc(node);
    }
#endif

#if 0
    // if RODT is to be overridden during tuning, note the change
    if (dram_tune_use_rodt >= 0) {
        ddr_print("N%d: using RODT %d for tuning.\n",
                  node, dram_tune_use_rodt);
    }
#endif

    // FIXME? get flag from LMC0 only
    lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(0));
    ddr_interface_64b = !lmc_config.s.mode32b;

    // do setup for each active LMC
    debug_print("N%d: %s: starting LMCs setup.\n", node, __func__);
    for (lmc = 0; lmc < num_lmcs; lmc++) {

#if 0
        // if RODT change, save the old and set the new here...
        if (dram_tune_use_rodt >= 0) {
            comp_ctl2.u = BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc));
            save_rodt[lmc] = comp_ctl2.s.rodt_ctl;
            comp_ctl2.s.rodt_ctl = dram_tune_use_rodt;
            DRAM_CSR_WRITE(node, BDK_LMCX_COMP_CTL2(lmc), comp_ctl2.u);
            BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc));
        }
#endif
        /* Disable ECC for DRAM tests */
        lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
        save_ecc_ena[lmc] = lmc_config.s.ecc_ena;
        lmc_config.s.ecc_ena = 0;
        DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u);
        lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));

    } /* for (lmc = 0; lmc < num_lmcs; lmc++) */

#if USE_L2_WAYS_LIMIT
    /* Disable L2 sets for DRAM testing */
    limit_l2_ways(node, 0, ways_print);
#endif

    // testing is done on all LMCs simultaneously
    // FIXME: for now, loop here to show what happens multiple times
    for (loop = 0; loop < loops; loop++) {
        /* Perform DLL offset tuning */
        errs = auto_set_dll_offset(node, dll_offset_mode, num_lmcs, ddr_interface_64b, do_tune);
    }

#if USE_L2_WAYS_LIMIT
    /* Restore the L2 set configuration */
    limit_l2_ways(node, ways, ways_print);
#endif

    // perform cleanup on all active LMCs
    debug_print("N%d: %s: starting LMCs cleanup.\n", node, __func__);
    for (lmc = 0; lmc < num_lmcs; lmc++) {

        /* Restore ECC for DRAM tests */
        lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
        lmc_config.s.ecc_ena = save_ecc_ena[lmc];
        DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u);
        lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
#if 0
        // if RODT change, restore the old here...
        if (dram_tune_use_rodt >= 0) {
            comp_ctl2.u = BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc));
            comp_ctl2.s.rodt_ctl = save_rodt[lmc];
            DRAM_CSR_WRITE(node, BDK_LMCX_COMP_CTL2(lmc), comp_ctl2.u);
            BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc));
        }
#endif
        // finally, see if there are any read offset overrides after tuning
        // FIXME: provide a way to do write offsets also??
        if (dll_offset_mode == 2) {
            for (int by = 0; by < 9; by++) {
                if ((s = lookup_env_parameter("ddr%d_tune_byte%d", lmc, by)) != NULL) {
                    int dllro = strtoul(s, NULL, 10);
                    change_dll_offset_enable(node, lmc, 0);
                    load_dll_offset(node, lmc, /* read */ 2, dllro, by);
                    change_dll_offset_enable(node, lmc, 1);
                }
            }
        }
    } /* for (lmc = 0; lmc < num_lmcs; lmc++) */

    // finish up...

#if 0
    // if RODT was overridden during tuning, note the restore
    if (dram_tune_use_rodt >= 0) {
        ddr_print("N%d: restoring RODT %d after tuning.\n",
                  node, save_rodt[0]); // FIXME? use LMC0
    }
#endif

    // put any cores on this node that were not running at the start back into reset
    /* FIXME(dhendrix): don't reset cores... */
    // uint64_t reset_coremask = bdk_get_running_coremask(node) & ~orig_coremask;
    uint64_t reset_coremask = 0;
    if (reset_coremask) {
        /* FIXME(dhendrix): %lx --> %llx */
        ddr_print4("N%d: %s: Stopping cores 0x%llx\n", node, __func__,
                   reset_coremask);
        bdk_reset_cores(node, reset_coremask);
    } else {
        /* FIXME(dhendrix): %lx --> %llx */
        ddr_print4("N%d: %s: leaving cores set to 0x%llx\n", node, __func__,
                   orig_coremask);
    }

    return errs;

} /* perform_dll_offset_tuning */

/////////////////////////////////////////////////////////////////////////////////////////////

///// HW-assist byte DLL offset tuning //////

#if 1
// setup defaults for the byte test pattern array
// take these first two from the HRM section 6.9.13
static const uint64_t byte_pattern_0[] = {
    0xFFAAFFFFFF55FFFFULL, // GP0
    0x55555555AAAAAAAAULL, // GP1
    0xAA55AAAAULL,         // GP2
};
static const uint64_t byte_pattern_1[] = {
    0xFBF7EFDFBF7FFEFDULL, // GP0
    0x0F1E3C78F0E1C387ULL, // GP1
    0xF0E1BF7FULL,         // GP2
};
// this is from Andrew via LFSR with PRBS=0xFFFFAAAA
static const uint64_t byte_pattern_2[] = {
    0xEE55AADDEE55AADDULL, // GP0
    0x55AADDEE55AADDEEULL, // GP1
    0x55EEULL,             // GP2
};
// this is from Mike via LFSR with PRBS=0x4A519909
static const uint64_t byte_pattern_3[] = {
    0x0088CCEE0088CCEEULL, // GP0
    0xBB552211BB552211ULL, // GP1
    0xBB00ULL,             // GP2
};

static const uint64_t *byte_patterns[] = {
    byte_pattern_0, byte_pattern_1, byte_pattern_2, byte_pattern_3 // FIXME: use all we have
};
#define NUM_BYTE_PATTERNS ((int)(sizeof(byte_patterns)/sizeof(uint64_t *)))

#define DEFAULT_BYTE_BURSTS 32 // FIXME: this is what the longest test usually has
int dram_tune_byte_bursts = DEFAULT_BYTE_BURSTS;
#endif

static void
setup_hw_pattern(bdk_node_t node, int lmc, const uint64_t *pattern_p)
{
    /*
      3) Setup GENERAL_PURPOSE[0-2] registers with the data pattern of choice.
         a. GENERAL_PURPOSE0[DATA<63:0>] – sets the initial lower (rising edge) 64 bits of data.
         b. GENERAL_PURPOSE1[DATA<63:0>] – sets the initial upper (falling edge) 64 bits of data.
         c. GENERAL_PURPOSE2[DATA<15:0>] – sets the initial lower (rising edge <7:0>) and upper
            (falling edge <15:8>) ECC data.
    */
    DRAM_CSR_WRITE(node, BDK_LMCX_GENERAL_PURPOSE0(lmc), pattern_p[0]);
    DRAM_CSR_WRITE(node, BDK_LMCX_GENERAL_PURPOSE1(lmc), pattern_p[1]);
    DRAM_CSR_WRITE(node, BDK_LMCX_GENERAL_PURPOSE2(lmc), pattern_p[2]);
}
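
// Usage sketch: each byte_patterns[] entry above is a {GP0, GP1, GP2} triple,
// so e.g. setup_hw_pattern(node, lmc, byte_pattern_0) programs all three
// GENERAL_PURPOSE CSRs in one call; per the HRM excerpt above, the hardware
// then drives GP0/GP1 as the rising/falling-edge data with GP2 supplying the
// ECC bits.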

#define DEFAULT_PRBS 0xFFFFAAAAUL /* FIXME: maybe try 0x4A519909UL */

static void
setup_lfsr_pattern(bdk_node_t node, int lmc, uint64_t data)
{
    uint32_t prbs;
    const char *s;

    if ((s = getenv("ddr_lfsr_prbs"))) {
        prbs = strtoul(s, NULL, 0);
    } else
        prbs = DEFAULT_PRBS; // FIXME: from the data arg?

    /*
      2) DBTRAIN_CTL[LFSR_PATTERN_SEL] = 1
         here data comes from the LFSR generating a PRBS pattern
         CHAR_CTL.EN = 0
         CHAR_CTL.SEL = 0; // for PRBS
         CHAR_CTL.DR = 1;
         CHAR_CTL.PRBS = setup for whatever type of PRBS to send
         CHAR_CTL.SKEW_ON = 1;
    */
    BDK_CSR_INIT(char_ctl, node, BDK_LMCX_CHAR_CTL(lmc));
    char_ctl.s.en = 0;
    char_ctl.s.sel = 0;
    char_ctl.s.dr = 1;
    char_ctl.s.prbs = prbs;
    char_ctl.s.skew_on = 1;
    DRAM_CSR_WRITE(node, BDK_LMCX_CHAR_CTL(lmc), char_ctl.u);
}
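
// Usage note: the PRBS polynomial can be overridden at runtime by setting
// ddr_lfsr_prbs in the environment (e.g. the 0x4A519909 value mentioned
// above); otherwise DEFAULT_PRBS is used, and the data argument is currently
// ignored.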

/* FIXME(dhendrix): made static to avoid need for prototype */
static int
choose_best_hw_patterns(bdk_node_t node, int lmc, int mode)
{
    int new_mode = mode;
    const char *s;

    switch (mode) {
    case DBTRAIN_TEST: // always choose LFSR if the chip supports it
        if (!CAVIUM_IS_MODEL(CAVIUM_CN88XX)) {
            int lfsr_enable = 1;
            if ((s = getenv("ddr_allow_lfsr"))) { // override?
                lfsr_enable = !!strtoul(s, NULL, 0);
            }
            if (lfsr_enable)
                new_mode = DBTRAIN_LFSR;
        }
        break;
    case DBTRAIN_DBI: // possibly can allow LFSR use?
        break;
    case DBTRAIN_LFSR: // forced already
        if (CAVIUM_IS_MODEL(CAVIUM_CN88XX)) {
            ddr_print("ERROR: illegal HW assist mode %d\n", mode);
            new_mode = DBTRAIN_TEST;
        }
        break;
    default:
        ddr_print("ERROR: unknown HW assist mode %d\n", mode);
    }

    if (new_mode != mode)
        VB_PRT(VBL_DEV2, "choose_best_hw_patterns: changing mode %d to %d\n", mode, new_mode);

    return new_mode;
}

int
run_best_hw_patterns(bdk_node_t node, int lmc, uint64_t phys_addr,
                     int mode, uint64_t *xor_data)
{
    int pattern;
    const uint64_t *pattern_p;
    int errs, errors = 0;

    // FIXME? always choose LFSR if the chip supports it???
    mode = choose_best_hw_patterns(node, lmc, mode);

    if (mode == DBTRAIN_LFSR) {
        setup_lfsr_pattern(node, lmc, 0);
        errors = test_dram_byte_hw(node, lmc, phys_addr, mode, xor_data);
        VB_PRT(VBL_DEV2, "%s: LFSR at A:0x%012llx errors 0x%x\n",
               __func__, phys_addr, errors);
    } else {
        for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) {
            pattern_p = byte_patterns[pattern];
            setup_hw_pattern(node, lmc, pattern_p);

            errs = test_dram_byte_hw(node, lmc, phys_addr, mode, xor_data);

            VB_PRT(VBL_DEV2, "%s: PATTERN %d at A:0x%012llx errors 0x%x\n",
                   __func__, pattern, phys_addr, errs);

            errors |= errs;
        } /* for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) */
    }
    return errors;
}

static void
hw_assist_test_dll_offset(bdk_node_t node, int dll_offset_mode,
                          int lmc, int bytelane)
{
    int byte_offset, new_best_offset[9];
    int rank_delay_start[4][9];
    int rank_delay_count[4][9];
    int rank_delay_best_start[4][9];
    int rank_delay_best_count[4][9];
    int errors[4];
    int num_lmcs = __bdk_dram_get_num_lmc(node);
    int rank_mask, rankx, active_ranks;
    int pattern;
    const uint64_t *pattern_p;
    int byte;
    const char *mode_str = (dll_offset_mode == 2) ? "Read" : "Write";
    int pat_best_offset[9];
    uint64_t phys_addr;
    int pat_beg, pat_end;
    int rank_beg, rank_end;
    int byte_lo, byte_hi;
    uint64_t hw_rank_offset;
    // FIXME? always choose LFSR if the chip supports it???
    int mode = choose_best_hw_patterns(node, lmc, DBTRAIN_TEST);

    if (bytelane == 0x0A) { // all bytelanes
        byte_lo = 0;
        byte_hi = 8;
    } else { // just 1
        byte_lo = byte_hi = bytelane;
    }

    BDK_CSR_INIT(lmcx_config, node, BDK_LMCX_CONFIG(lmc));
    rank_mask = lmcx_config.s.init_status;
    // this should be correct for 1 or 2 ranks, 1 or 2 DIMMs
    hw_rank_offset = 1ull << (28 + lmcx_config.s.pbank_lsb - lmcx_config.s.rank_ena + (num_lmcs/2));

    debug_print("N%d: %s: starting LMC%d with rank offset 0x%lx\n",
                node, __func__, lmc, hw_rank_offset);

    // start of pattern loop
    // we do the set of tests for each pattern supplied...

    memset(new_best_offset, 0, sizeof(new_best_offset));
    for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) {

        memset(pat_best_offset, 0, sizeof(pat_best_offset));

        if (mode == DBTRAIN_TEST) {
            pattern_p = byte_patterns[pattern];
            setup_hw_pattern(node, lmc, pattern_p);
        } else {
            setup_lfsr_pattern(node, lmc, 0);
        }

        // now loop through all legal values for the DLL byte offset...

#define BYTE_OFFSET_INCR 3 // FIXME: make this tunable?

        memset(rank_delay_count,      0, sizeof(rank_delay_count));
        memset(rank_delay_start,      0, sizeof(rank_delay_start));
        memset(rank_delay_best_count, 0, sizeof(rank_delay_best_count));
        memset(rank_delay_best_start, 0, sizeof(rank_delay_best_start));

        for (byte_offset = -63; byte_offset < 64; byte_offset += BYTE_OFFSET_INCR) {

            // do the setup on the active LMC
            // set the bytelane's DLL offsets
            change_dll_offset_enable(node, lmc, 0);
            load_dll_offset(node, lmc, dll_offset_mode, byte_offset, bytelane); // FIXME? bytelane?
            change_dll_offset_enable(node, lmc, 1);

            bdk_watchdog_poke();

            // run the test on each rank
            // only 1 call per rank should be enough; let the bursts, loops, etc, control the load...

            active_ranks = 0;

            for (rankx = 0; rankx < 4; rankx++) {
                if (!(rank_mask & (1 << rankx)))
                    continue;

                phys_addr = hw_rank_offset * active_ranks;
                // FIXME: now done by test_dram_byte_hw()
                //phys_addr |= (lmc << 7);
                //phys_addr = bdk_numa_get_address(node, phys_addr); // map to node

                active_ranks++;

                // NOTE: the return is now a bitmask of the erroring bytelanes...
                errors[rankx] = test_dram_byte_hw(node, lmc, phys_addr, mode, NULL);

                for (byte = byte_lo; byte <= byte_hi; byte++) { // do bytelane(s)

                    // check errors
                    if (errors[rankx] & (1 << byte)) { // yes, an error in the byte lane in this rank

                        ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: Address 0x%012llx errors 0x%x\n",
                                   node, lmc, rankx, bytelane, mode_str,
                                   byte_offset, phys_addr, errors[rankx]);

                        if (rank_delay_count[rankx][byte] > 0) { // had started a run
                            ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: stopping a run here\n",
                                       node, lmc, rankx, bytelane, mode_str, byte_offset);
                            rank_delay_count[rankx][byte] = 0; // stop now
                        }
                        // FIXME: else had not started a run - nothing else to do?
                    } else { // no error in the byte lane
                        if (rank_delay_count[rankx][byte] == 0) { // first success, set run start
                            ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: starting a run here\n",
                                       node, lmc, rankx, bytelane, mode_str, byte_offset);
                            rank_delay_start[rankx][byte] = byte_offset;
                        }
                        rank_delay_count[rankx][byte] += BYTE_OFFSET_INCR; // bump run length

                        // is this now the biggest window?
                        if (rank_delay_count[rankx][byte] > rank_delay_best_count[rankx][byte]) {
                            rank_delay_best_count[rankx][byte] = rank_delay_count[rankx][byte];
                            rank_delay_best_start[rankx][byte] = rank_delay_start[rankx][byte];
                            debug_print("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: updating best to %d/%d\n",
                                        node, lmc, rankx, bytelane, mode_str, byte_offset,
                                        rank_delay_best_start[rankx][byte], rank_delay_best_count[rankx][byte]);
                        }
                    }
                } /* for (byte = byte_lo; byte <= byte_hi; byte++) */
            } /* for (rankx = 0; rankx < 4; rankx++) */

        } /* for (byte_offset = -63; byte_offset < 64; byte_offset += BYTE_OFFSET_INCR) */

        // now choose the best byte_offsets for this pattern according to the best windows of the tested ranks
        // calculate the offset by constructing an average window from the rank windows
        for (byte = byte_lo; byte <= byte_hi; byte++) {

            pat_beg = -999;
            pat_end = 999;

            for (rankx = 0; rankx < 4; rankx++) {
                if (!(rank_mask & (1 << rankx)))
                    continue;

                rank_beg = rank_delay_best_start[rankx][byte];
                pat_beg = max(pat_beg, rank_beg);
                rank_end = rank_beg + rank_delay_best_count[rankx][byte] - BYTE_OFFSET_INCR;
                pat_end = min(pat_end, rank_end);

                ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test: Rank Window %3d:%3d\n",
                           node, lmc, rankx, bytelane, mode_str, rank_beg, rank_end);

            } /* for (rankx = 0; rankx < 4; rankx++) */

            pat_best_offset[byte] = (pat_end + pat_beg) / 2;
            ddr_print4("N%d.LMC%d: Bytelane %d DLL %s Offset Test: Pattern %d Average %3d\n",
                       node, lmc, byte, mode_str, pattern, pat_best_offset[byte]);

#if 0
            // FIXME: next print the window counts
            sprintf(sbuffer, "N%d.LMC%d Pattern %d: DLL %s Offset Count ",
                    node, lmc, pattern, mode_str);
            printf("%-45s : ", sbuffer);
            printf(" %3d", byte_delay_best_count);
            printf("\n");
#endif

            new_best_offset[byte] += pat_best_offset[byte]; // sum the pattern averages
        } /* for (byte = byte_lo; byte <= byte_hi; byte++) */
    } /* for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) */
    // end of pattern loop

    ddr_print("N%d.LMC%d: HW DLL %s Offset Amount : ",
              node, lmc, mode_str);

    for (byte = byte_hi; byte >= byte_lo; --byte) { // print in descending byte index order
        new_best_offset[byte] = divide_nint(new_best_offset[byte], NUM_BYTE_PATTERNS); // create the new average NINT

        // print the best offsets from all patterns

        if (bytelane == 0x0A) // print just the offset of all the bytes
            ddr_print("%5d ", new_best_offset[byte]);
        else
            ddr_print("(byte %d) %5d ", byte, new_best_offset[byte]);


#if 1
        // done with testing, load up the best offsets we found...
        change_dll_offset_enable(node, lmc, 0); // disable offsets while we load...
        load_dll_offset(node, lmc, dll_offset_mode, new_best_offset[byte], byte);
        change_dll_offset_enable(node, lmc, 1); // re-enable the offsets now that we are done loading
#endif
    } /* for (byte = byte_hi; byte >= byte_lo; --byte) */

    ddr_print("\n");

#if 0
    // run the test one last time
    // print whether there are errors or not, but only when verbose...
    tot_errors = run_test_dram_byte_threads(node, num_lmcs, bytemask);
    printf("N%d.LMC%d: Bytelane %d DLL %s Offset Final Test: errors 0x%x\n",
           node, lmc, bytelane, mode_str, tot_errors);
#endif
}

/*
 * Automatically adjust the DLL offset for the selected bytelane using hardware assist
 */
int perform_HW_dll_offset_tuning(bdk_node_t node, int dll_offset_mode, int bytelane)
{
    int save_ecc_ena[4];
    bdk_lmcx_config_t lmc_config;
    int lmc, num_lmcs = __bdk_dram_get_num_lmc(node);
    const char *s;
    //bdk_lmcx_comp_ctl2_t comp_ctl2;
    int loops = 1, loop;

    // see if we want to do the tuning more than once per LMC...
    if ((s = getenv("ddr_tune_ecc_loops"))) {
        loops = strtoul(s, NULL, 0);
    }

    // allow override of the test repeats (bursts)
    if ((s = getenv("ddr_tune_byte_bursts")) != NULL) {
        dram_tune_byte_bursts = strtoul(s, NULL, 10);
    }

    // print current working values
    ddr_print2("N%d: H/W Tuning for bytelane %d will use %d loops, %d bursts, and %d patterns.\n",
               node, bytelane, loops, dram_tune_byte_bursts,
               NUM_BYTE_PATTERNS);

    // FIXME? get flag from LMC0 only
    lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(0));

    // do once for each active LMC

    for (lmc = 0; lmc < num_lmcs; lmc++) {

        ddr_print4("N%d: H/W Tuning: starting LMC%d bytelane %d tune.\n", node, lmc, bytelane);

        /* Enable ECC for the HW tests */
        // NOTE: we do enable ECC, but the HW tests used will not generate "visible" errors
        lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
        save_ecc_ena[lmc] = lmc_config.s.ecc_ena;
        lmc_config.s.ecc_ena = 1;
        DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u);
        lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));

        // testing is done on a single LMC at a time
        // FIXME: for now, loop here to show what happens multiple times
        for (loop = 0; loop < loops; loop++) {
            /* Perform DLL offset tuning */
            //auto_set_dll_offset(node, 1 /* 1=write */, lmc, bytelane);
            hw_assist_test_dll_offset(node, 2 /* 2=read */, lmc, bytelane);
        }

        // perform cleanup on the active LMC
        ddr_print4("N%d: H/W Tuning: finishing LMC%d bytelane %d tune.\n", node, lmc, bytelane);

        /* Restore ECC for DRAM tests */
        lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
        lmc_config.s.ecc_ena = save_ecc_ena[lmc];
        DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u);
        lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));

        // finally, see if there are any read offset overrides after tuning
        for (int by = 0; by < 9; by++) {
            if ((s = lookup_env_parameter("ddr%d_tune_byte%d", lmc, by)) != NULL) {
                int dllro = strtoul(s, NULL, 10);
                change_dll_offset_enable(node, lmc, 0);
                load_dll_offset(node, lmc, 2 /* 2=read */, dllro, by);
                change_dll_offset_enable(node, lmc, 1);
            }
        }

    } /* for (lmc = 0; lmc < num_lmcs; lmc++) */

    // finish up...

    return 0;

} /* perform_HW_dll_offset_tuning */